1From: Linux Kernel Mailing List <linux-kernel@vger.kernel.org>
2Subject: [PATCH] Linux: Update to 2.6.27
3Patch-mainline: 2.6.27
4
5 This patch contains the differences between Linux 2.6.26 and 2.6.27.
6
7Acked-by: Jeff Mahoney <jeffm@suse.com>
8Automatically created from "patches.kernel.org/patch-2.6.27" by xen-port-patches.py
9
10--- sle11-2009-06-04.orig/arch/x86/Kconfig 2009-03-16 16:38:05.000000000 +0100
11+++ sle11-2009-06-04/arch/x86/Kconfig 2009-06-04 10:21:39.000000000 +0200
12@@ -594,7 +594,7 @@ config CALGARY_IOMMU_ENABLED_BY_DEFAULT
13 config AMD_IOMMU
14 bool "AMD IOMMU support"
15 select SWIOTLB
16- depends on X86_64 && PCI && ACPI
17+ depends on X86_64 && PCI && ACPI && !X86_64_XEN
18 help
19 With this option you can enable support for AMD IOMMU hardware in
20 your system. An IOMMU is a hardware component which provides
21@@ -629,8 +629,10 @@ config MAXSMP
22
23 config NR_CPUS
24 	int "Maximum number of CPUs (2-4096)"
25+	range 2 32 if XEN
26 	range 2 4096
27 depends on SMP
28+ default "32" if MAXSMP && XEN
29 default "4096" if MAXSMP
30 default "32" if X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000
31 default "16" if X86_64_XEN
32@@ -1227,7 +1229,7 @@ config MTRR
33 config MTRR_SANITIZER
34 bool
35 prompt "MTRR cleanup support"
36- depends on MTRR
37+ depends on MTRR && !XEN
38 help
39 Convert MTRR layout from continuous to discrete, so X drivers can
40 add writeback entries.
41--- sle11-2009-06-04.orig/arch/x86/Kconfig.debug 2009-03-16 16:33:40.000000000 +0100
42+++ sle11-2009-06-04/arch/x86/Kconfig.debug 2009-06-04 10:21:39.000000000 +0200
43@@ -25,6 +25,7 @@ config STRICT_DEVMEM
44 config X86_VERBOSE_BOOTUP
45 bool "Enable verbose x86 bootup info messages"
46 default y
47+ depends on !XEN
48 help
49 Enables the informational output from the decompression stage
50 (e.g. bzImage) of the boot. If you disable this you will still
51@@ -179,7 +180,7 @@ config MMIOTRACE_HOOKS
52
53 config MMIOTRACE
54 bool "Memory mapped IO tracing"
55- depends on DEBUG_KERNEL && PCI
56+ depends on DEBUG_KERNEL && PCI && !XEN
57 select TRACING
58 select MMIOTRACE_HOOKS
59 help
60--- sle11-2009-06-04.orig/arch/x86/Makefile 2009-02-16 16:18:36.000000000 +0100
61+++ sle11-2009-06-04/arch/x86/Makefile 2009-06-04 10:21:39.000000000 +0200
62@@ -116,8 +116,8 @@ mflags-$(CONFIG_X86_VOYAGER) := -Iinclud
63 mcore-$(CONFIG_X86_VOYAGER) := arch/x86/mach-voyager/
64
65 # Xen subarch support
66-mflags-$(CONFIG_X86_XEN) := -Iinclude/asm-x86/mach-xen
67-mcore-$(CONFIG_X86_XEN) := arch/x86/mach-xen/
68+mflags-$(CONFIG_XEN) := -Iinclude/asm-x86/mach-xen
69+mcore-$(CONFIG_XEN) := arch/x86/mach-xen/
70
71 # generic subarchitecture
72 mflags-$(CONFIG_X86_GENERICARCH):= -Iinclude/asm-x86/mach-generic
73@@ -128,8 +128,10 @@ mcore-$(CONFIG_X86_GENERICARCH) := arch/
74 mflags-y += -Iinclude/asm-x86/mach-default
75
76 # 64 bit does not support subarch support - clear sub arch variables
77+ifneq ($(CONFIG_XEN),y)
78 fcore-$(CONFIG_X86_64) :=
79 mcore-$(CONFIG_X86_64) :=
80+endif
81
82 KBUILD_CFLAGS += $(mflags-y)
83 KBUILD_AFLAGS += $(mflags-y)
84--- sle11-2009-06-04.orig/arch/x86/ia32/ia32entry-xen.S 2009-03-16 16:38:05.000000000 +0100
85+++ sle11-2009-06-04/arch/x86/ia32/ia32entry-xen.S 2009-06-04 10:21:39.000000000 +0200
86@@ -15,6 +15,16 @@
87 #include <asm/irqflags.h>
88 #include <linux/linkage.h>
89
90+/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
91+#include <linux/elf-em.h>
92+#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE)
93+#define __AUDIT_ARCH_LE 0x40000000
94+
95+#ifndef CONFIG_AUDITSYSCALL
96+#define sysexit_audit int_ret_from_sys_call
97+#define sysretl_audit int_ret_from_sys_call
98+#endif
99+
100 #define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8)
101
102 .macro IA32_ARG_FIXUP noebp=0
103@@ -37,6 +47,11 @@
104 movq %rax,R8(%rsp)
105 .endm
106
107+ /*
108+ * Reload arg registers from stack in case ptrace changed them.
109+ * We don't reload %eax because syscall_trace_enter() returned
110+ * the value it wants us to use in the table lookup.
111+ */
112 .macro LOAD_ARGS32 offset
113 movl \offset(%rsp),%r11d
114 movl \offset+8(%rsp),%r10d
115@@ -46,7 +61,6 @@
116 movl \offset+48(%rsp),%edx
117 movl \offset+56(%rsp),%esi
118 movl \offset+64(%rsp),%edi
119- movl \offset+72(%rsp),%eax
120 .endm
121
122 .macro CFI_STARTPROC32 simple
123@@ -61,6 +75,19 @@
124 CFI_UNDEFINED r15
125 .endm
126
127+#ifdef CONFIG_PARAVIRT
128+ENTRY(native_usergs_sysret32)
129+ swapgs
130+ sysretl
131+ENDPROC(native_usergs_sysret32)
132+
133+ENTRY(native_irq_enable_sysexit)
134+ swapgs
135+ sti
136+ sysexit
137+ENDPROC(native_irq_enable_sysexit)
138+#endif
139+
140 /*
141 * 32bit SYSENTER instruction entry.
142 *
143@@ -98,7 +125,7 @@ ENTRY(ia32_sysenter_target)
144 CFI_RESTORE rcx
145 movl %ebp,%ebp /* zero extension */
146 movl %eax,%eax
147- movl 48-THREAD_SIZE+threadinfo_sysenter_return(%rsp),%r10d
148+ movl 48-THREAD_SIZE+TI_sysenter_return(%rsp),%r10d
149 movl $__USER32_DS,40(%rsp)
150 movq %rbp,32(%rsp)
151 movl $__USER32_CS,16(%rsp)
152@@ -113,19 +140,79 @@ ENTRY(ia32_sysenter_target)
153 .quad 1b,ia32_badarg
154 .previous
155 GET_THREAD_INFO(%r10)
156- orl $TS_COMPAT,threadinfo_status(%r10)
157- testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
158+ orl $TS_COMPAT,TI_status(%r10)
159+ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
160 jnz sysenter_tracesys
161-sysenter_do_call:
162 cmpl $(IA32_NR_syscalls-1),%eax
163 ja ia32_badsys
164+sysenter_do_call:
165 IA32_ARG_FIXUP 1
166+sysenter_dispatch:
167 call *ia32_sys_call_table(,%rax,8)
168 movq %rax,RAX-ARGOFFSET(%rsp)
169+ GET_THREAD_INFO(%r10)
170+ DISABLE_INTERRUPTS(CLBR_NONE)
171+ TRACE_IRQS_OFF
172+ testl $_TIF_ALLWORK_MASK,TI_flags(%r10)
173+ jnz sysexit_audit
174+ jmp int_ret_from_sys_call
175+
176+#ifdef CONFIG_AUDITSYSCALL
177+ .macro auditsys_entry_common
178+ movl %esi,%r9d /* 6th arg: 4th syscall arg */
179+ movl %edx,%r8d /* 5th arg: 3rd syscall arg */
180+ /* (already in %ecx) 4th arg: 2nd syscall arg */
181+ movl %ebx,%edx /* 3rd arg: 1st syscall arg */
182+ movl %eax,%esi /* 2nd arg: syscall number */
183+ movl $AUDIT_ARCH_I386,%edi /* 1st arg: audit arch */
184+ call audit_syscall_entry
185+ movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall number */
186+ cmpl $(IA32_NR_syscalls-1),%eax
187+ ja ia32_badsys
188+ movl %ebx,%edi /* reload 1st syscall arg */
189+ movl RCX-ARGOFFSET(%rsp),%esi /* reload 2nd syscall arg */
190+ movl RDX-ARGOFFSET(%rsp),%edx /* reload 3rd syscall arg */
191+ movl RSI-ARGOFFSET(%rsp),%ecx /* reload 4th syscall arg */
192+ movl RDI-ARGOFFSET(%rsp),%r8d /* reload 5th syscall arg */
193+ .endm
194+
195+ .macro auditsys_exit exit,ebpsave=RBP
196+ testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
197+ jnz int_ret_from_sys_call
198+ TRACE_IRQS_ON
199+ ENABLE_INTERRUPTS(CLBR_NONE)
200+ movl %eax,%esi /* second arg, syscall return value */
201+ cmpl $0,%eax /* is it < 0? */
202+ setl %al /* 1 if so, 0 if not */
203+ movzbl %al,%edi /* zero-extend that into %edi */
204+ inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
205+ call audit_syscall_exit
206+ GET_THREAD_INFO(%r10)
207+ movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall return value */
208+ movl \ebpsave-ARGOFFSET(%rsp),%ebp /* reload user register value */
209+ movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
210+ DISABLE_INTERRUPTS(CLBR_NONE)
211+ TRACE_IRQS_OFF
212+ testl %edi,TI_flags(%r10)
213+ jnz int_with_check
214 jmp int_ret_from_sys_call
215+ .endm
216+
217+sysenter_auditsys:
218+ auditsys_entry_common
219+ movl %ebp,%r9d /* reload 6th syscall arg */
220+ jmp sysenter_dispatch
221+
222+sysexit_audit:
223+ auditsys_exit sysexit_from_sys_call
224+#endif
225
226 sysenter_tracesys:
227 xchgl %r9d,%ebp
228+#ifdef CONFIG_AUDITSYSCALL
229+ testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
230+ jz sysenter_auditsys
231+#endif
232 SAVE_REST
233 CLEAR_RREGS
234 movq %r9,R9(%rsp)
235@@ -186,18 +273,38 @@ ENTRY(ia32_cstar_target)
236 .quad 1b,ia32_badarg
237 .previous
238 GET_THREAD_INFO(%r10)
239- orl $TS_COMPAT,threadinfo_status(%r10)
240- testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
241+ orl $TS_COMPAT,TI_status(%r10)
242+ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
243 jnz cstar_tracesys
244 cstar_do_call:
245 cmpl $IA32_NR_syscalls-1,%eax
246 ja ia32_badsys
247 IA32_ARG_FIXUP 1
248+cstar_dispatch:
249 call *ia32_sys_call_table(,%rax,8)
250 movq %rax,RAX-ARGOFFSET(%rsp)
251+ GET_THREAD_INFO(%r10)
252+ DISABLE_INTERRUPTS(CLBR_NONE)
253+ testl $_TIF_ALLWORK_MASK,TI_flags(%r10)
254+ jnz sysretl_audit
255 jmp int_ret_from_sys_call
256
257-cstar_tracesys:
258+#ifdef CONFIG_AUDITSYSCALL
259+cstar_auditsys:
260+ movl %r9d,R9-ARGOFFSET(%rsp) /* register to be clobbered by call */
261+ auditsys_entry_common
262+ movl R9-ARGOFFSET(%rsp),%r9d /* reload 6th syscall arg */
263+ jmp cstar_dispatch
264+
265+sysretl_audit:
266+ auditsys_exit sysretl_from_sys_call, RCX /* user %ebp in RCX slot */
267+#endif
268+
269+cstar_tracesys:
270+#ifdef CONFIG_AUDITSYSCALL
271+ testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
272+ jz cstar_auditsys
273+#endif
274 xchgl %r9d,%ebp
275 SAVE_REST
276 CLEAR_RREGS
277@@ -263,8 +370,8 @@ ENTRY(ia32_syscall)
278 this could be a problem. */
279 SAVE_ARGS 0,0,1
280 GET_THREAD_INFO(%r10)
281- orl $TS_COMPAT,threadinfo_status(%r10)
282- testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
283+ orl $TS_COMPAT,TI_status(%r10)
284+ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
285 jnz ia32_tracesys
286 ia32_do_syscall:
287 cmpl $(IA32_NR_syscalls-1),%eax
288@@ -309,13 +416,11 @@ quiet_ni_syscall:
289 PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn, %rdi
290 PTREGSCALL stub32_sigreturn, sys32_sigreturn, %rdi
291 PTREGSCALL stub32_sigaltstack, sys32_sigaltstack, %rdx
292- PTREGSCALL stub32_sigsuspend, sys32_sigsuspend, %rcx
293 PTREGSCALL stub32_execve, sys32_execve, %rcx
294 PTREGSCALL stub32_fork, sys_fork, %rdi
295 PTREGSCALL stub32_clone, sys32_clone, %rdx
296 PTREGSCALL stub32_vfork, sys_vfork, %rdi
297 PTREGSCALL stub32_iopl, sys_iopl, %rsi
298- PTREGSCALL stub32_rt_sigsuspend, sys_rt_sigsuspend, %rdx
299
300 ENTRY(ia32_ptregs_common)
301 popq %r11
302@@ -415,7 +520,7 @@ ia32_sys_call_table:
303 .quad sys_ssetmask
304 .quad sys_setreuid16 /* 70 */
305 .quad sys_setregid16
306- .quad stub32_sigsuspend
307+ .quad sys32_sigsuspend
308 .quad compat_sys_sigpending
309 .quad sys_sethostname
310 .quad compat_sys_setrlimit /* 75 */
311@@ -522,7 +627,7 @@ ia32_sys_call_table:
312 .quad sys32_rt_sigpending
313 .quad compat_sys_rt_sigtimedwait
314 .quad sys32_rt_sigqueueinfo
315- .quad stub32_rt_sigsuspend
316+ .quad sys_rt_sigsuspend
317 .quad sys32_pread /* 180 */
318 .quad sys32_pwrite
319 .quad sys_chown16
320@@ -670,4 +775,10 @@ ia32_sys_call_table:
321 .quad sys32_fallocate
322 .quad compat_sys_timerfd_settime /* 325 */
323 .quad compat_sys_timerfd_gettime
324+ .quad compat_sys_signalfd4
325+ .quad sys_eventfd2
326+ .quad sys_epoll_create1
327+ .quad sys_dup3 /* 330 */
328+ .quad sys_pipe2
329+ .quad sys_inotify_init1
330 ia32_syscall_end:
331--- sle11-2009-06-04.orig/arch/x86/kernel/Makefile 2009-03-16 16:38:05.000000000 +0100
332+++ sle11-2009-06-04/arch/x86/kernel/Makefile 2009-06-04 10:21:39.000000000 +0200
333@@ -120,9 +120,10 @@ ifeq ($(CONFIG_X86_64),y)
334
335 obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o
336
337- obj-$(CONFIG_XEN) += nmi_64.o
338+ obj-$(CONFIG_XEN) += nmi.o
339 time_64-$(CONFIG_XEN) += time_32.o
340 endif
341
342-disabled-obj-$(CONFIG_XEN) := crash.o early-quirks.o hpet.o i8253.o i8259_$(BITS).o \
343-	pci-swiotlb_64.o reboot.o smpboot.o tlb_$(BITS).o tsc_$(BITS).o tsc_sync.o vsmp_64.o
344+disabled-obj-$(CONFIG_XEN) := %_uv.o crash.o early-quirks.o hpet.o i8253.o \
345+ i8259.o irqinit_$(BITS).o pci-swiotlb_64.o reboot.o smpboot.o \
346+ tlb_$(BITS).o tsc.o tsc_sync.o vsmp_64.o
347--- sle11-2009-06-04.orig/arch/x86/kernel/acpi/boot.c 2009-03-16 16:38:05.000000000 +0100
348+++ sle11-2009-06-04/arch/x86/kernel/acpi/boot.c 2009-06-04 10:21:39.000000000 +0200
349@@ -951,7 +951,9 @@ void __init mp_register_ioapic(int id, u
350 mp_ioapics[idx].mp_flags = MPC_APIC_USABLE;
351 mp_ioapics[idx].mp_apicaddr = address;
352
353+#ifndef CONFIG_XEN
354 set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
355+#endif
356 mp_ioapics[idx].mp_apicid = uniq_ioapic_id(id);
357 #ifdef CONFIG_X86_32
358 mp_ioapics[idx].mp_apicver = io_apic_get_version(idx);
359@@ -1108,7 +1110,7 @@ int mp_register_gsi(u32 gsi, int trigger
360 {
361 int ioapic;
362 int ioapic_pin;
363-#ifdef CONFIG_X86_32
364+#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
365 #define MAX_GSI_NUM 4096
366 #define IRQ_COMPRESSION_START 64
367
368@@ -1156,7 +1158,7 @@ int mp_register_gsi(u32 gsi, int trigger
369 if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) {
370 pr_debug(KERN_DEBUG "Pin %d-%d already programmed\n",
371 mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
372-#ifdef CONFIG_X86_32
373+#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
374 return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
375 #else
376 return gsi;
377@@ -1164,7 +1166,7 @@ int mp_register_gsi(u32 gsi, int trigger
378 }
379
380 set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed);
381-#ifdef CONFIG_X86_32
382+#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
383 /*
384 * For GSI >= 64, use IRQ compression
385 */
386--- sle11-2009-06-04.orig/arch/x86/kernel/acpi/sleep-xen.c 2009-03-16 16:38:05.000000000 +0100
387+++ sle11-2009-06-04/arch/x86/kernel/acpi/sleep-xen.c 2009-06-04 10:21:39.000000000 +0200
388@@ -9,6 +9,7 @@
389 #include <linux/bootmem.h>
390 #include <linux/dmi.h>
391 #include <linux/cpumask.h>
392+#include <asm/segment.h>
393
394 #include "realmode/wakeup.h"
395 #include "sleep.h"
396@@ -20,7 +21,7 @@ unsigned long acpi_realmode_flags;
397 /* address in low memory of the wakeup routine. */
398 static unsigned long acpi_realmode;
399
400-#ifdef CONFIG_64BIT
401+#if defined(CONFIG_SMP) && defined(CONFIG_64BIT)
402 static char temp_stack[10240];
403 #endif
404 #endif
405@@ -54,18 +55,27 @@ int acpi_save_state_mem(void)
406 header->video_mode = saved_video_mode;
407
408 header->wakeup_jmp_seg = acpi_wakeup_address >> 4;
409+
410+ /*
411+ * Set up the wakeup GDT. We set these up as Big Real Mode,
412+ * that is, with limits set to 4 GB. At least the Lenovo
413+ * Thinkpad X61 is known to need this for the video BIOS
414+ * initialization quirk to work; this is likely to also
415+ * be the case for other laptops or integrated video devices.
416+ */
417+
418 /* GDT[0]: GDT self-pointer */
419 header->wakeup_gdt[0] =
420 (u64)(sizeof(header->wakeup_gdt) - 1) +
421 ((u64)(acpi_wakeup_address +
422 ((char *)&header->wakeup_gdt - (char *)acpi_realmode))
423 << 16);
424- /* GDT[1]: real-mode-like code segment */
425- header->wakeup_gdt[1] = (0x009bULL << 40) +
426- ((u64)acpi_wakeup_address << 16) + 0xffff;
427- /* GDT[2]: real-mode-like data segment */
428- header->wakeup_gdt[2] = (0x0093ULL << 40) +
429- ((u64)acpi_wakeup_address << 16) + 0xffff;
430+ /* GDT[1]: big real mode-like code segment */
431+ header->wakeup_gdt[1] =
432+ GDT_ENTRY(0x809b, acpi_wakeup_address, 0xfffff);
433+ /* GDT[2]: big real mode-like data segment */
434+ header->wakeup_gdt[2] =
435+ GDT_ENTRY(0x8093, acpi_wakeup_address, 0xfffff);
436
437 #ifndef CONFIG_64BIT
438 store_gdt((struct desc_ptr *)&header->pmode_gdt);
439@@ -79,7 +89,7 @@ int acpi_save_state_mem(void)
440 #endif /* !CONFIG_64BIT */
441
442 header->pmode_cr0 = read_cr0();
443- header->pmode_cr4 = read_cr4();
444+ header->pmode_cr4 = read_cr4_safe();
445 header->realmode_flags = acpi_realmode_flags;
446 header->real_magic = 0x12345678;
447
448@@ -89,7 +99,9 @@ int acpi_save_state_mem(void)
449 saved_magic = 0x12345678;
450 #else /* CONFIG_64BIT */
451 header->trampoline_segment = setup_trampoline() >> 4;
452- init_rsp = (unsigned long)temp_stack + 4096;
453+#ifdef CONFIG_SMP
454+ stack_start.sp = temp_stack + 4096;
455+#endif
456 initial_code = (unsigned long)wakeup_long64;
457 saved_magic = 0x123456789abcdef0;
458 #endif /* CONFIG_64BIT */
459@@ -145,6 +157,12 @@ static int __init acpi_sleep_setup(char
460 acpi_realmode_flags |= 2;
461 if (strncmp(str, "s3_beep", 7) == 0)
462 acpi_realmode_flags |= 4;
463+#ifdef CONFIG_HIBERNATION
464+ if (strncmp(str, "s4_nohwsig", 10) == 0)
465+ acpi_no_s4_hw_signature();
466+#endif
467+ if (strncmp(str, "old_ordering", 12) == 0)
468+ acpi_old_suspend_ordering();
469 str = strchr(str, ',');
470 if (str != NULL)
471 str += strspn(str, ", \t");
472--- sle11-2009-06-04.orig/arch/x86/kernel/apic_32-xen.c 2009-03-16 16:33:40.000000000 +0100
473+++ sle11-2009-06-04/arch/x86/kernel/apic_32-xen.c 2009-06-04 10:21:39.000000000 +0200
474@@ -59,7 +59,10 @@ static cpumask_t timer_bcast_ipi;
475 /*
476 * Debug level, exported for io_apic.c
477 */
478-int apic_verbosity;
479+unsigned int apic_verbosity;
480+
481+/* Have we found an MP table */
482+int smp_found_config;
483
484 #ifndef CONFIG_XEN
485 static int modern_apic(void)
486--- sle11-2009-06-04.orig/arch/x86/kernel/apic_64-xen.c 2009-03-16 16:33:40.000000000 +0100
487+++ sle11-2009-06-04/arch/x86/kernel/apic_64-xen.c 2009-06-04 10:21:39.000000000 +0200
488@@ -39,7 +39,10 @@ int disable_apic;
489 /*
490 * Debug level, exported for io_apic.c
491 */
492-int apic_verbosity;
493+unsigned int apic_verbosity;
494+
495+/* Have we found an MP table */
496+int smp_found_config;
497
498 /*
499 * The guts of the apic timer interrupt
500--- sle11-2009-06-04.orig/arch/x86/kernel/asm-offsets_64.c 2008-11-25 12:35:54.000000000 +0100
501+++ sle11-2009-06-04/arch/x86/kernel/asm-offsets_64.c 2009-06-04 10:21:39.000000000 +0200
502@@ -138,7 +138,7 @@ int main(void)
503
504 BLANK();
505 DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
506-#ifdef CONFIG_XEN
507+#ifdef CONFIG_PARAVIRT_XEN
508 BLANK();
509 OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
510 OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
511--- sle11-2009-06-04.orig/arch/x86/kernel/cpu/amd_64.c 2009-06-04 11:08:07.000000000 +0200
512+++ sle11-2009-06-04/arch/x86/kernel/cpu/amd_64.c 2009-06-04 10:21:39.000000000 +0200
513@@ -193,6 +193,7 @@ static void __cpuinit init_amd(struct cp
514 fam10h_check_enable_mmcfg();
515 }
516
517+#ifndef CONFIG_XEN
518 if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
519 unsigned long long tseg;
520
521@@ -211,6 +212,7 @@ static void __cpuinit init_amd(struct cp
522 set_memory_4k((unsigned long)__va(tseg), 1);
523 }
524 }
525+#endif
526 }
527
528 static struct cpu_dev amd_cpu_dev __cpuinitdata = {
529--- sle11-2009-06-04.orig/arch/x86/kernel/cpu/bugs_64.c 2009-06-04 11:08:07.000000000 +0200
530+++ sle11-2009-06-04/arch/x86/kernel/cpu/bugs_64.c 2009-06-04 10:21:39.000000000 +0200
531@@ -20,6 +20,7 @@ void __init check_bugs(void)
532 #endif
533 alternative_instructions();
534
535+#ifndef CONFIG_XEN
536 /*
537 * Make sure the first 2MB area is not mapped by huge pages
538 * There are typically fixed size MTRRs in there and overlapping
539@@ -30,4 +31,5 @@ void __init check_bugs(void)
540 */
541 if (!direct_gbpages)
542 set_memory_4k((unsigned long)__va(0), 1);
543+#endif
544 }
545--- sle11-2009-06-04.orig/arch/x86/kernel/cpu/common-xen.c 2009-03-16 16:38:05.000000000 +0100
546+++ sle11-2009-06-04/arch/x86/kernel/cpu/common-xen.c 2009-06-04 10:21:39.000000000 +0200
547@@ -13,6 +13,7 @@
548 #include <asm/mtrr.h>
549 #include <asm/mce.h>
550 #include <asm/pat.h>
551+#include <asm/asm.h>
552 #ifdef CONFIG_X86_LOCAL_APIC
553 #include <asm/mpspec.h>
554 #include <asm/apic.h>
555@@ -341,11 +342,24 @@ static void __init early_cpu_detect(void
556
557 get_cpu_vendor(c, 1);
558
559+ early_get_cap(c);
560+
561 if (c->x86_vendor != X86_VENDOR_UNKNOWN &&
562 cpu_devs[c->x86_vendor]->c_early_init)
563 cpu_devs[c->x86_vendor]->c_early_init(c);
564+}
565
566- early_get_cap(c);
567+/*
568+ * The NOPL instruction is supposed to exist on all CPUs with
569+ * family >= 6; unfortunately, that's not true in practice because
570+ * of early VIA chips and (more importantly) broken virtualizers that
571+ * are not easy to detect. In the latter case it doesn't even *fail*
572+ * reliably, so probing for it doesn't even work. Disable it completely
573+ * unless we can find a reliable way to detect all the broken cases.
574+ */
575+static void __cpuinit detect_nopl(struct cpuinfo_x86 *c)
576+{
577+ clear_cpu_cap(c, X86_FEATURE_NOPL);
578 }
579
580 static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
581@@ -402,8 +416,8 @@ static void __cpuinit generic_identify(s
582 }
583
584 init_scattered_cpuid_features(c);
585+ detect_nopl(c);
586 }
587-
588 }
589
590 static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
591@@ -434,7 +448,7 @@ __setup("serialnumber", x86_serial_nr_se
592 /*
593 * This does the hard work of actually picking apart the CPU stuff...
594 */
595-void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
596+static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
597 {
598 int i;
599
600@@ -448,6 +462,8 @@ void __cpuinit identify_cpu(struct cpuin
601 c->x86_max_cores = 1;
602 c->x86_clflush_size = 32;
603 memset(&c->x86_capability, 0, sizeof c->x86_capability);
604+ if (boot_cpu_has(X86_FEATURE_SYSCALL32))
605+ set_cpu_cap(c, X86_FEATURE_SYSCALL32);
606
607 if (!have_cpuid_p()) {
608 /*
609--- /dev/null	1970-01-01 00:00:00.000000000 +0000
610+++ sle11-2009-06-04/arch/x86/kernel/cpu/common_64-xen.c	2009-06-04 10:21:39.000000000 +0200
611@@ -0,0 +1,771 @@
612+#include <linux/init.h>
613+#include <linux/kernel.h>
614+#include <linux/sched.h>
615+#include <linux/string.h>
616+#include <linux/bootmem.h>
617+#include <linux/bitops.h>
618+#include <linux/module.h>
619+#include <linux/kgdb.h>
620+#include <linux/topology.h>
621+#include <linux/delay.h>
622+#include <linux/smp.h>
623+#include <linux/percpu.h>
624+#include <asm/i387.h>
625+#include <asm/msr.h>
626+#include <asm/io.h>
627+#include <asm/linkage.h>
628+#include <asm/mmu_context.h>
629+#include <asm/mtrr.h>
630+#include <asm/mce.h>
631+#include <asm/pat.h>
632+#include <asm/asm.h>
633+#include <asm/numa.h>
634+#ifdef CONFIG_X86_LOCAL_APIC
635+#include <asm/mpspec.h>
636+#include <asm/apic.h>
637+#include <mach_apic.h>
638+#elif defined(CONFIG_XEN)
639+#include <mach_apic.h>
640+#endif
641+#include <asm/pda.h>
642+#include <asm/pgtable.h>
643+#include <asm/processor.h>
644+#include <asm/desc.h>
645+#include <asm/atomic.h>
646+#include <asm/proto.h>
647+#include <asm/sections.h>
648+#include <asm/setup.h>
649+#include <asm/genapic.h>
650+
651+#include "cpu.h"
652+
653+/* We need valid kernel segments for data and code in long mode too
654+ * IRET will check the segment types kkeil 2000/10/28
655+ * Also sysret mandates a special GDT layout
656+ */
657+/* The TLS descriptors are currently at a different place compared to i386.
658+ Hopefully nobody expects them at a fixed place (Wine?) */
659+DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
660+ [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
661+ [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },
662+ [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },
663+ [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },
664+ [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },
665+ [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } },
666+} };
667+EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
668+
669+__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
670+
671+/* Current gdt points %fs at the "master" per-cpu area: after this,
672+ * it's on the real one. */
673+void switch_to_new_gdt(void)
674+{
675+#ifndef CONFIG_XEN
676+ struct desc_ptr gdt_descr;
677+
678+ gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
679+ gdt_descr.size = GDT_SIZE - 1;
680+ load_gdt(&gdt_descr);
681+#else
682+ void *va, *gdt_addr = get_cpu_gdt_table(smp_processor_id());
683+ unsigned long frames[16];
684+ unsigned int f = 0;
685+
686+ for (va = gdt_addr; va < gdt_addr + GDT_SIZE; va += PAGE_SIZE) {
687+ frames[f++] = virt_to_mfn(va);
688+ make_page_readonly(va, XENFEAT_writable_descriptor_tables);
689+ }
690+ if (HYPERVISOR_set_gdt(frames, GDT_SIZE / sizeof(struct desc_struct)))
691+ BUG();
692+#endif
693+}
694+
695+struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
696+
697+static void __cpuinit default_init(struct cpuinfo_x86 *c)
698+{
699+ display_cacheinfo(c);
700+}
701+
702+static struct cpu_dev __cpuinitdata default_cpu = {
703+ .c_init = default_init,
704+ .c_vendor = "Unknown",
705+};
706+static struct cpu_dev *this_cpu __cpuinitdata = &default_cpu;
707+
708+int __cpuinit get_model_name(struct cpuinfo_x86 *c)
709+{
710+ unsigned int *v;
711+
712+ if (c->extended_cpuid_level < 0x80000004)
713+ return 0;
714+
715+ v = (unsigned int *) c->x86_model_id;
716+ cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
717+ cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
718+ cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
719+ c->x86_model_id[48] = 0;
720+ return 1;
721+}
722+
723+
724+void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
725+{
726+ unsigned int n, dummy, ebx, ecx, edx;
727+
728+ n = c->extended_cpuid_level;
729+
730+ if (n >= 0x80000005) {
731+ cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
732+ printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), "
733+ "D cache %dK (%d bytes/line)\n",
734+ edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
735+ c->x86_cache_size = (ecx>>24) + (edx>>24);
736+ /* On K8 L1 TLB is inclusive, so don't count it */
737+ c->x86_tlbsize = 0;
738+ }
739+
740+ if (n >= 0x80000006) {
741+ cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
742+ ecx = cpuid_ecx(0x80000006);
743+ c->x86_cache_size = ecx >> 16;
744+ c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
745+
746+ printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
747+ c->x86_cache_size, ecx & 0xFF);
748+ }
749+}
750+
751+void __cpuinit detect_ht(struct cpuinfo_x86 *c)
752+{
753+#ifdef CONFIG_SMP
754+ u32 eax, ebx, ecx, edx;
755+ int index_msb, core_bits;
756+
757+ cpuid(1, &eax, &ebx, &ecx, &edx);
758+
759+
760+ if (!cpu_has(c, X86_FEATURE_HT))
761+ return;
762+ if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
763+ goto out;
764+
765+ smp_num_siblings = (ebx & 0xff0000) >> 16;
766+
767+ if (smp_num_siblings == 1) {
768+ printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
769+ } else if (smp_num_siblings > 1) {
770+
771+ if (smp_num_siblings > NR_CPUS) {
772+ printk(KERN_WARNING "CPU: Unsupported number of "
773+ "siblings %d", smp_num_siblings);
774+ smp_num_siblings = 1;
775+ return;
776+ }
777+
778+ index_msb = get_count_order(smp_num_siblings);
779+ c->phys_proc_id = phys_pkg_id(index_msb);
780+
781+ smp_num_siblings = smp_num_siblings / c->x86_max_cores;
782+
783+ index_msb = get_count_order(smp_num_siblings);
784+
785+ core_bits = get_count_order(c->x86_max_cores);
786+
787+ c->cpu_core_id = phys_pkg_id(index_msb) &
788+ ((1 << core_bits) - 1);
789+ }
790+out:
791+ if ((c->x86_max_cores * smp_num_siblings) > 1) {
792+ printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
793+ c->phys_proc_id);
794+ printk(KERN_INFO "CPU: Processor Core ID: %d\n",
795+ c->cpu_core_id);
796+ }
797+
798+#endif
799+}
800+
801+static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
802+{
803+ char *v = c->x86_vendor_id;
804+ int i;
805+ static int printed;
806+
807+ for (i = 0; i < X86_VENDOR_NUM; i++) {
808+ if (cpu_devs[i]) {
809+ if (!strcmp(v, cpu_devs[i]->c_ident[0]) ||
810+ (cpu_devs[i]->c_ident[1] &&
811+ !strcmp(v, cpu_devs[i]->c_ident[1]))) {
812+ c->x86_vendor = i;
813+ this_cpu = cpu_devs[i];
814+ return;
815+ }
816+ }
817+ }
818+ if (!printed) {
819+ printed++;
820+ printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n");
821+ printk(KERN_ERR "CPU: Your system may be unstable.\n");
822+ }
823+ c->x86_vendor = X86_VENDOR_UNKNOWN;
824+}
825+
826+static void __init early_cpu_support_print(void)
827+{
828+ int i,j;
829+ struct cpu_dev *cpu_devx;
830+
831+ printk("KERNEL supported cpus:\n");
832+ for (i = 0; i < X86_VENDOR_NUM; i++) {
833+ cpu_devx = cpu_devs[i];
834+ if (!cpu_devx)
835+ continue;
836+ for (j = 0; j < 2; j++) {
837+ if (!cpu_devx->c_ident[j])
838+ continue;
839+ printk(" %s %s\n", cpu_devx->c_vendor,
840+ cpu_devx->c_ident[j]);
841+ }
842+ }
843+}
844+
845+/*
846+ * The NOPL instruction is supposed to exist on all CPUs with
847+ * family >= 6, unfortunately, that's not true in practice because
848+ * of early VIA chips and (more importantly) broken virtualizers that
849+ * are not easy to detect. Hence, probe for it based on first
850+ * principles.
851+ *
852+ * Note: no 64-bit chip is known to lack these, but put the code here
853+ * for consistency with 32 bits, and to make it utterly trivial to
854+ * diagnose the problem should it ever surface.
855+ */
856+static void __cpuinit detect_nopl(struct cpuinfo_x86 *c)
857+{
858+ const u32 nopl_signature = 0x888c53b1; /* Random number */
859+ u32 has_nopl = nopl_signature;
860+
861+ clear_cpu_cap(c, X86_FEATURE_NOPL);
862+ if (c->x86 >= 6) {
863+ asm volatile("\n"
864+ "1: .byte 0x0f,0x1f,0xc0\n" /* nopl %eax */
865+ "2:\n"
866+ " .section .fixup,\"ax\"\n"
867+ "3: xor %0,%0\n"
868+ " jmp 2b\n"
869+ " .previous\n"
870+ _ASM_EXTABLE(1b,3b)
871+ : "+a" (has_nopl));
872+
873+ if (has_nopl == nopl_signature)
874+ set_cpu_cap(c, X86_FEATURE_NOPL);
875+ }
876+}
877+
878+static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c);
879+
880+void __init early_cpu_init(void)
881+{
882+ struct cpu_vendor_dev *cvdev;
883+
884+ for (cvdev = __x86cpuvendor_start ;
885+ cvdev < __x86cpuvendor_end ;
886+ cvdev++)
887+ cpu_devs[cvdev->vendor] = cvdev->cpu_dev;
888+ early_cpu_support_print();
889+ early_identify_cpu(&boot_cpu_data);
890+}
891+
892+/* Do some early cpuid on the boot CPU to get some parameter that are
893+ needed before check_bugs. Everything advanced is in identify_cpu
894+ below. */
895+static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
896+{
897+ u32 tfms, xlvl;
898+
899+ c->loops_per_jiffy = loops_per_jiffy;
900+ c->x86_cache_size = -1;
901+ c->x86_vendor = X86_VENDOR_UNKNOWN;
902+ c->x86_model = c->x86_mask = 0; /* So far unknown... */
903+ c->x86_vendor_id[0] = '\0'; /* Unset */
904+ c->x86_model_id[0] = '\0'; /* Unset */
905+ c->x86_clflush_size = 64;
906+ c->x86_cache_alignment = c->x86_clflush_size;
907+ c->x86_max_cores = 1;
908+ c->x86_coreid_bits = 0;
909+ c->extended_cpuid_level = 0;
910+ memset(&c->x86_capability, 0, sizeof c->x86_capability);
911+
912+ /* Get vendor name */
913+ cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
914+ (unsigned int *)&c->x86_vendor_id[0],
915+ (unsigned int *)&c->x86_vendor_id[8],
916+ (unsigned int *)&c->x86_vendor_id[4]);
917+
918+ get_cpu_vendor(c);
919+
920+ /* Initialize the standard set of capabilities */
921+ /* Note that the vendor-specific code below might override */
922+
923+ /* Intel-defined flags: level 0x00000001 */
924+ if (c->cpuid_level >= 0x00000001) {
925+ __u32 misc;
926+ cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4],
927+ &c->x86_capability[0]);
928+ c->x86 = (tfms >> 8) & 0xf;
929+ c->x86_model = (tfms >> 4) & 0xf;
930+ c->x86_mask = tfms & 0xf;
931+ if (c->x86 == 0xf)
932+ c->x86 += (tfms >> 20) & 0xff;
933+ if (c->x86 >= 0x6)
934+ c->x86_model += ((tfms >> 16) & 0xF) << 4;
935+ if (test_cpu_cap(c, X86_FEATURE_CLFLSH))
936+ c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
937+ } else {
938+ /* Have CPUID level 0 only - unheard of */
939+ c->x86 = 4;
940+ }
941+
942+ c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xff;
943+#ifdef CONFIG_SMP
944+ c->phys_proc_id = c->initial_apicid;
945+#endif
946+ /* AMD-defined flags: level 0x80000001 */
947+ xlvl = cpuid_eax(0x80000000);
948+ c->extended_cpuid_level = xlvl;
949+ if ((xlvl & 0xffff0000) == 0x80000000) {
950+ if (xlvl >= 0x80000001) {
951+ c->x86_capability[1] = cpuid_edx(0x80000001);
952+ c->x86_capability[6] = cpuid_ecx(0x80000001);
953+ }
954+ if (xlvl >= 0x80000004)
955+ get_model_name(c); /* Default name */
956+ }
957+
958+ /* Transmeta-defined flags: level 0x80860001 */
959+ xlvl = cpuid_eax(0x80860000);
960+ if ((xlvl & 0xffff0000) == 0x80860000) {
961+ /* Don't set x86_cpuid_level here for now to not confuse. */
962+ if (xlvl >= 0x80860001)
963+ c->x86_capability[2] = cpuid_edx(0x80860001);
964+ }
965+
966+ if (c->extended_cpuid_level >= 0x80000007)
967+ c->x86_power = cpuid_edx(0x80000007);
968+
969+ if (c->extended_cpuid_level >= 0x80000008) {
970+ u32 eax = cpuid_eax(0x80000008);
971+
972+ c->x86_virt_bits = (eax >> 8) & 0xff;
973+ c->x86_phys_bits = eax & 0xff;
974+ }
975+
976+ detect_nopl(c);
977+
978+ if (c->x86_vendor != X86_VENDOR_UNKNOWN &&
979+ cpu_devs[c->x86_vendor]->c_early_init)
980+ cpu_devs[c->x86_vendor]->c_early_init(c);
981+
982+ validate_pat_support(c);
983+}
984+
985+/*
986+ * This does the hard work of actually picking apart the CPU stuff...
987+ */
988+static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
989+{
990+ int i;
991+
992+ early_identify_cpu(c);
993+
994+ init_scattered_cpuid_features(c);
995+
996+ c->apicid = phys_pkg_id(0);
997+
998+ /*
999+ * Vendor-specific initialization. In this section we
1000+ * canonicalize the feature flags, meaning if there are
1001+ * features a certain CPU supports which CPUID doesn't
1002+ * tell us, CPUID claiming incorrect flags, or other bugs,
1003+ * we handle them here.
1004+ *
1005+ * At the end of this section, c->x86_capability better
1006+ * indicate the features this CPU genuinely supports!
1007+ */
1008+ if (this_cpu->c_init)
1009+ this_cpu->c_init(c);
1010+
1011+ detect_ht(c);
1012+
1013+ /*
1014+ * On SMP, boot_cpu_data holds the common feature set between
1015+ * all CPUs; so make sure that we indicate which features are
1016+ * common between the CPUs. The first time this routine gets
1017+ * executed, c == &boot_cpu_data.
1018+ */
1019+ if (c != &boot_cpu_data) {
1020+ /* AND the already accumulated flags with these */
1021+ for (i = 0; i < NCAPINTS; i++)
1022+ boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
1023+ }
1024+
1025+ /* Clear all flags overriden by options */
1026+ for (i = 0; i < NCAPINTS; i++)
1027+ c->x86_capability[i] &= ~cleared_cpu_caps[i];
1028+
1029+#ifdef CONFIG_X86_MCE
1030+ mcheck_init(c);
1031+#endif
1032+ select_idle_routine(c);
1033+
1034+#ifdef CONFIG_NUMA
1035+ numa_add_cpu(smp_processor_id());
1036+#endif
1037+
1038+}
1039+
1040+void __cpuinit identify_boot_cpu(void)
1041+{
1042+ identify_cpu(&boot_cpu_data);
1043+}
1044+
1045+void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
1046+{
1047+ BUG_ON(c == &boot_cpu_data);
1048+ identify_cpu(c);
1049+ mtrr_ap_init();
1050+}
1051+
1052+static __init int setup_noclflush(char *arg)
1053+{
1054+ setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
1055+ return 1;
1056+}
1057+__setup("noclflush", setup_noclflush);
1058+
1059+void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
1060+{
1061+ if (c->x86_model_id[0])
1062+ printk(KERN_CONT "%s", c->x86_model_id);
1063+
1064+ if (c->x86_mask || c->cpuid_level >= 0)
1065+ printk(KERN_CONT " stepping %02x\n", c->x86_mask);
1066+ else
1067+ printk(KERN_CONT "\n");
1068+}
1069+
1070+static __init int setup_disablecpuid(char *arg)
1071+{
1072+ int bit;
1073+ if (get_option(&arg, &bit) && bit < NCAPINTS*32)
1074+ setup_clear_cpu_cap(bit);
1075+ else
1076+ return 0;
1077+ return 1;
1078+}
1079+__setup("clearcpuid=", setup_disablecpuid);
1080+
1081+cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
1082+
1083+struct x8664_pda **_cpu_pda __read_mostly;
1084+EXPORT_SYMBOL(_cpu_pda);
1085+
1086+#ifndef CONFIG_X86_NO_IDT
1087+struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
1088+#endif
1089+
1090+char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss;
1091+
1092+unsigned long __supported_pte_mask __read_mostly = ~0UL;
1093+EXPORT_SYMBOL_GPL(__supported_pte_mask);
1094+
1095+static int do_not_nx __cpuinitdata;
1096+
1097+/* noexec=on|off
1098+Control non executable mappings for 64bit processes.
1099+
1100+on Enable(default)
1101+off Disable
1102+*/
1103+static int __init nonx_setup(char *str)
1104+{
1105+ if (!str)
1106+ return -EINVAL;
1107+ if (!strncmp(str, "on", 2)) {
1108+ __supported_pte_mask |= _PAGE_NX;
1109+ do_not_nx = 0;
1110+ } else if (!strncmp(str, "off", 3)) {
1111+ do_not_nx = 1;
1112+ __supported_pte_mask &= ~_PAGE_NX;
1113+ }
1114+ return 0;
1115+}
1116+early_param("noexec", nonx_setup);
1117+
1118+int force_personality32;
1119+
1120+/* noexec32=on|off
1121+Control non executable heap for 32bit processes.
1122+To control the stack too use noexec=off
1123+
1124+on PROT_READ does not imply PROT_EXEC for 32bit processes (default)
1125+off PROT_READ implies PROT_EXEC
1126+*/
1127+static int __init nonx32_setup(char *str)
1128+{
1129+ if (!strcmp(str, "on"))
1130+ force_personality32 &= ~READ_IMPLIES_EXEC;
1131+ else if (!strcmp(str, "off"))
1132+ force_personality32 |= READ_IMPLIES_EXEC;
1133+ return 1;
1134+}
1135+__setup("noexec32=", nonx32_setup);
1136+
1137+static void __init_refok switch_pt(int cpu)
1138+{
1139+#ifdef CONFIG_XEN
1140+ if (cpu == 0)
1141+ xen_init_pt();
1142+ xen_pt_switch(__pa_symbol(init_level4_pgt));
1143+ xen_new_user_pt(__pa_symbol(__user_pgd(init_level4_pgt)));
1144+#endif
1145+}
1146+
1147+void pda_init(int cpu)
1148+{
1149+ struct x8664_pda *pda = cpu_pda(cpu);
1150+
1151+ /* Setup up data that may be needed in __get_free_pages early */
1152+ loadsegment(fs, 0);
1153+ loadsegment(gs, 0);
1154+#ifndef CONFIG_XEN
1155+ /* Memory clobbers used to order PDA accessed */
1156+ mb();
1157+ wrmsrl(MSR_GS_BASE, pda);
1158+ mb();
1159+#else
1160+ if (HYPERVISOR_set_segment_base(SEGBASE_GS_KERNEL,
1161+ (unsigned long)pda))
1162+ BUG();
1163+#endif
1164+
1165+ pda->cpunumber = cpu;
1166+ pda->irqcount = -1;
1167+ pda->kernelstack = (unsigned long)stack_thread_info() -
1168+ PDA_STACKOFFSET + THREAD_SIZE;
1169+ pda->active_mm = &init_mm;
1170+ pda->mmu_state = 0;
1171+
1172+ if (cpu == 0) {
1173+ /* others are initialized in smpboot.c */
1174+ pda->pcurrent = &init_task;
1175+ pda->irqstackptr = boot_cpu_stack;
1176+ pda->irqstackptr += IRQSTACKSIZE - 64;
1177+ } else {
1178+ if (!pda->irqstackptr) {
1179+ pda->irqstackptr = (char *)
1180+ __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
1181+ if (!pda->irqstackptr)
1182+ panic("cannot allocate irqstack for cpu %d",
1183+ cpu);
1184+ pda->irqstackptr += IRQSTACKSIZE - 64;
1185+ }
1186+
1187+ if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE)
1188+ pda->nodenumber = cpu_to_node(cpu);
1189+ }
1190+
1191+ switch_pt(cpu);
1192+}
1193+
1194+#ifndef CONFIG_X86_NO_TSS
1195+char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
1196+ DEBUG_STKSZ] __page_aligned_bss;
1197+#endif
1198+
1199+extern asmlinkage void ignore_sysret(void);
1200+
1201+void __cpuinit syscall_init(void)
1202+{
1203+#ifndef CONFIG_XEN
1204+ /*
1205+ * LSTAR and STAR live in a bit strange symbiosis.
1206+ * They both write to the same internal register. STAR allows to
1207+ * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip.
1208+ */
1209+ wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
1210+ wrmsrl(MSR_LSTAR, system_call);
1211+ wrmsrl(MSR_CSTAR, ignore_sysret);
1212+
1213+ /* Flags to clear on syscall */
1214+ wrmsrl(MSR_SYSCALL_MASK,
1215+ X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
1216+#endif
1217+#ifdef CONFIG_IA32_EMULATION
1218+ syscall32_cpu_init();
1219+#else
1220+	static const struct callback_register __cpuinitconst cstar = {
1221+ .type = CALLBACKTYPE_syscall32,
1222+ .address = (unsigned long)ignore_sysret
1223+ };
1224+
1225+ if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar))
1226+		printk(KERN_WARNING "Unable to register CSTAR callback\n");
1227+#endif
1228+}
1229+
1230+void __cpuinit check_efer(void)
1231+{
1232+ unsigned long efer;
1233+
1234+ rdmsrl(MSR_EFER, efer);
1235+ if (!(efer & EFER_NX) || do_not_nx)
1236+ __supported_pte_mask &= ~_PAGE_NX;
1237+}
1238+
1239+unsigned long kernel_eflags;
1240+
1241+#ifndef CONFIG_X86_NO_TSS
1242+/*
1243+ * Copies of the original ist values from the tss are only accessed during
1244+ * debugging, no special alignment required.
1245+ */
1246+DEFINE_PER_CPU(struct orig_ist, orig_ist);
1247+#endif
1248+
1249+/*
1250+ * cpu_init() initializes state that is per-CPU. Some data is already
1251+ * initialized (naturally) in the bootstrap process, such as the GDT
1252+ * and IDT. We reload them nevertheless, this function acts as a
1253+ * 'CPU state barrier', nothing should get across.
1254+ * A lot of state is already set up in PDA init.
1255+ */
1256+void __cpuinit cpu_init(void)
1257+{
1258+ int cpu = stack_smp_processor_id();
1259+#ifndef CONFIG_X86_NO_TSS
1260+ struct tss_struct *t = &per_cpu(init_tss, cpu);
1261+ struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
1262+ unsigned long v;
1263+ char *estacks = NULL;
1264+ int i;
1265+#endif
1266+ struct task_struct *me;
1267+
1268+ /* CPU 0 is initialised in head64.c */
1269+ if (cpu != 0)
1270+ pda_init(cpu);
1271+#ifndef CONFIG_X86_NO_TSS
1272+ else
1273+ estacks = boot_exception_stacks;
1274+#endif
1275+
1276+ me = current;
1277+
1278+ if (cpu_test_and_set(cpu, cpu_initialized))
1279+ panic("CPU#%d already initialized!\n", cpu);
1280+
1281+ printk(KERN_INFO "Initializing CPU#%d\n", cpu);
1282+
1283+ clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
1284+
1285+ /*
1286+ * Initialize the per-CPU GDT with the boot GDT,
1287+ * and set up the GDT descriptor:
1288+ */
1289+
1290+ switch_to_new_gdt();
1291+#ifndef CONFIG_X86_NO_IDT
1292+ load_idt((const struct desc_ptr *)&idt_descr);
1293+#endif
1294+
1295+ memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
1296+ syscall_init();
1297+
1298+ wrmsrl(MSR_FS_BASE, 0);
1299+ wrmsrl(MSR_KERNEL_GS_BASE, 0);
1300+ barrier();
1301+
1302+ check_efer();
1303+
1304+#ifndef CONFIG_X86_NO_TSS
1305+ /*
1306+ * set up and load the per-CPU TSS
1307+ */
1308+ if (!orig_ist->ist[0]) {
1309+ static const unsigned int order[N_EXCEPTION_STACKS] = {
1310+ [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
1311+ [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
1312+ };
1313+ for (v = 0; v < N_EXCEPTION_STACKS; v++) {
1314+ if (cpu) {
1315+ estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
1316+ if (!estacks)
1317+ panic("Cannot allocate exception "
1318+ "stack %ld %d\n", v, cpu);
1319+ }
1320+ estacks += PAGE_SIZE << order[v];
1321+ orig_ist->ist[v] = t->x86_tss.ist[v] =
1322+ (unsigned long)estacks;
1323+ }
1324+ }
1325+
1326+ t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
1327+ /*
1328+ * <= is required because the CPU will access up to
1329+ * 8 bits beyond the end of the IO permission bitmap.
1330+ */
1331+ for (i = 0; i <= IO_BITMAP_LONGS; i++)
1332+ t->io_bitmap[i] = ~0UL;
1333+#endif
1334+
1335+ atomic_inc(&init_mm.mm_count);
1336+ me->active_mm = &init_mm;
1337+ if (me->mm)
1338+ BUG();
1339+ enter_lazy_tlb(&init_mm, me);
1340+
1341+ load_sp0(t, &current->thread);
1342+#ifndef CONFIG_X86_NO_TSS
1343+ set_tss_desc(cpu, t);
1344+ load_TR_desc();
1345+#endif
1346+ load_LDT(&init_mm.context);
1347+
1348+#ifdef CONFIG_KGDB
1349+ /*
1350+ * If the kgdb is connected no debug regs should be altered. This
1351+ * is only applicable when KGDB and a KGDB I/O module are built
1352+ * into the kernel and you are using early debugging with
1353+ * kgdbwait. KGDB will control the kernel HW breakpoint registers.
1354+ */
1355+ if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
1356+ arch_kgdb_ops.correct_hw_break();
1357+ else {
1358+#endif
1359+ /*
1360+ * Clear all 6 debug registers:
1361+ */
1362+
1363+ set_debugreg(0UL, 0);
1364+ set_debugreg(0UL, 1);
1365+ set_debugreg(0UL, 2);
1366+ set_debugreg(0UL, 3);
1367+ set_debugreg(0UL, 6);
1368+ set_debugreg(0UL, 7);
1369+#ifdef CONFIG_KGDB
1370+ /* If the kgdb is connected no debug regs should be altered. */
1371+ }
1372+#endif
1373+
1374+ fpu_init();
1375+
1376+ asm ("pushfq; popq %0" : "=rm" (kernel_eflags));
1377+ if (raw_irqs_disabled())
1378+ kernel_eflags &= ~X86_EFLAGS_IF;
1379+
1380+ if (is_uv_system())
1381+ uv_cpu_init();
1382+}
1383--- /dev/null	1970-01-01 00:00:00.000000000 +0000
1384+++ sle11-2009-06-04/arch/x86/kernel/e820-xen.c 2009-06-04 10:21:39.000000000 +0200
1385@@ -0,0 +1,1545 @@
1386+/*
1387+ * Handle the memory map.
1388+ * The functions here do the job until bootmem takes over.
1389+ *
1390+ * Getting sanitize_e820_map() in sync with i386 version by applying change:
1391+ * - Provisions for empty E820 memory regions (reported by certain BIOSes).
1392+ * Alex Achenbach <xela@slit.de>, December 2002.
1393+ * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
1394+ *
1395+ */
1396+#include <linux/kernel.h>
1397+#include <linux/types.h>
1398+#include <linux/init.h>
1399+#include <linux/bootmem.h>
1400+#include <linux/ioport.h>
1401+#include <linux/string.h>
1402+#include <linux/kexec.h>
1403+#include <linux/module.h>
1404+#include <linux/mm.h>
1405+#include <linux/pfn.h>
1406+#include <linux/suspend.h>
1407+#include <linux/firmware-map.h>
1408+
1409+#include <asm/pgtable.h>
1410+#include <asm/page.h>
1411+#include <asm/e820.h>
1412+#include <asm/proto.h>
1413+#include <asm/setup.h>
1414+#include <xen/interface/memory.h>
1415+
1416+/*
1417+ * The e820 map is the map that gets modified e.g. with command line parameters
1418+ * and that is also registered with modifications in the kernel resource tree
1419+ * with the iomem_resource as parent.
1420+ *
1421+ * The e820_saved is directly saved after the BIOS-provided memory map is
1422+ * copied. It doesn't get modified afterwards. It's registered for the
1423+ * /sys/firmware/memmap interface.
1424+ *
1425+ * That memory map is not modified and is used as base for kexec. The kexec'd
1426+ * kernel should get the same memory map as the firmware provides. Then the
1427+ * user can e.g. boot the original kernel with mem=1G while still booting the
1428+ * next kernel with full memory.
1429+ */
1430+struct e820map e820;
1431+#ifndef CONFIG_XEN
1432+struct e820map e820_saved;
1433+#else
1434+static struct e820map machine_e820;
1435+#define e820_saved machine_e820
1436+#endif
1437+
1438+/* For PCI or other memory-mapped resources */
1439+unsigned long pci_mem_start = 0xaeedbabe;
1440+#ifdef CONFIG_PCI
1441+EXPORT_SYMBOL(pci_mem_start);
1442+#endif
1443+
1444+/*
1445+ * This function checks if any part of the range <start,end> is mapped
1446+ * with type.
1447+ */
1448+int
1449+e820_any_mapped(u64 start, u64 end, unsigned type)
1450+{
1451+ int i;
1452+
1453+#ifndef CONFIG_XEN
1454+ for (i = 0; i < e820.nr_map; i++) {
1455+ struct e820entry *ei = &e820.map[i];
1456+#else
1457+ if (!is_initial_xendomain())
1458+ return 0;
1459+ for (i = 0; i < machine_e820.nr_map; ++i) {
1460+ const struct e820entry *ei = &machine_e820.map[i];
1461+#endif
1462+
1463+ if (type && ei->type != type)
1464+ continue;
1465+ if (ei->addr >= end || ei->addr + ei->size <= start)
1466+ continue;
1467+ return 1;
1468+ }
1469+ return 0;
1470+}
1471+EXPORT_SYMBOL_GPL(e820_any_mapped);
1472+
1473+/*
1474+ * This function checks if the entire range <start,end> is mapped with type.
1475+ *
1476+ * Note: this function only works correct if the e820 table is sorted and
1477+ * not-overlapping, which is the case
1478+ */
1479+int __init e820_all_mapped(u64 start, u64 end, unsigned type)
1480+{
1481+ int i;
1482+
1483+#ifndef CONFIG_XEN
1484+ for (i = 0; i < e820.nr_map; i++) {
1485+ struct e820entry *ei = &e820.map[i];
1486+#else
1487+ if (!is_initial_xendomain())
1488+ return 0;
1489+ for (i = 0; i < machine_e820.nr_map; ++i) {
1490+ const struct e820entry *ei = &machine_e820.map[i];
1491+#endif
1492+
1493+ if (type && ei->type != type)
1494+ continue;
1495+ /* is the region (part) in overlap with the current region ?*/
1496+ if (ei->addr >= end || ei->addr + ei->size <= start)
1497+ continue;
1498+
1499+ /* if the region is at the beginning of <start,end> we move
1500+ * start to the end of the region since it's ok until there
1501+ */
1502+ if (ei->addr <= start)
1503+ start = ei->addr + ei->size;
1504+ /*
1505+ * if start is now at or beyond end, we're done, full
1506+ * coverage
1507+ */
1508+ if (start >= end)
1509+ return 1;
1510+ }
1511+ return 0;
1512+}
1513+
1514+/*
1515+ * Add a memory region to the kernel e820 map.
1516+ */
1517+void __init e820_add_region(u64 start, u64 size, int type)
1518+{
1519+ int x = e820.nr_map;
1520+
1521+ if (x == ARRAY_SIZE(e820.map)) {
1522+ printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
1523+ return;
1524+ }
1525+
1526+ e820.map[x].addr = start;
1527+ e820.map[x].size = size;
1528+ e820.map[x].type = type;
1529+ e820.nr_map++;
1530+}
1531+
1532+void __init e820_print_map(char *who)
1533+{
1534+ int i;
1535+
1536+ for (i = 0; i < e820.nr_map; i++) {
1537+ printk(KERN_INFO " %s: %016Lx - %016Lx ", who,
1538+ (unsigned long long) e820.map[i].addr,
1539+ (unsigned long long)
1540+ (e820.map[i].addr + e820.map[i].size));
1541+ switch (e820.map[i].type) {
1542+ case E820_RAM:
1543+ case E820_RESERVED_KERN:
1544+ printk(KERN_CONT "(usable)\n");
1545+ break;
1546+ case E820_RESERVED:
1547+ printk(KERN_CONT "(reserved)\n");
1548+ break;
1549+ case E820_ACPI:
1550+ printk(KERN_CONT "(ACPI data)\n");
1551+ break;
1552+ case E820_NVS:
1553+ printk(KERN_CONT "(ACPI NVS)\n");
1554+ break;
1555+ default:
1556+ printk(KERN_CONT "type %u\n", e820.map[i].type);
1557+ break;
1558+ }
1559+ }
1560+}
1561+
1562+/*
1563+ * Sanitize the BIOS e820 map.
1564+ *
1565+ * Some e820 responses include overlapping entries. The following
1566+ * replaces the original e820 map with a new one, removing overlaps,
1567+ * and resolving conflicting memory types in favor of highest
1568+ * numbered type.
1569+ *
1570+ * The input parameter biosmap points to an array of 'struct
1571+ * e820entry' which on entry has elements in the range [0, *pnr_map)
1572+ * valid, and which has space for up to max_nr_map entries.
1573+ * On return, the resulting sanitized e820 map entries will be in
1574+ * overwritten in the same location, starting at biosmap.
1575+ *
1576+ * The integer pointed to by pnr_map must be valid on entry (the
1577+ * current number of valid entries located at biosmap) and will
1578+ * be updated on return, with the new number of valid entries
1579+ * (something no more than max_nr_map.)
1580+ *
1581+ * The return value from sanitize_e820_map() is zero if it
1582+ * successfully 'sanitized' the map entries passed in, and is -1
1583+ * if it did nothing, which can happen if either of (1) it was
1584+ * only passed one map entry, or (2) any of the input map entries
1585+ * were invalid (start + size < start, meaning that the size was
1586+ * so big the described memory range wrapped around through zero.)
1587+ *
1588+ * Visually we're performing the following
1589+ * (1,2,3,4 = memory types)...
1590+ *
1591+ * Sample memory map (w/overlaps):
1592+ * ____22__________________
1593+ * ______________________4_
1594+ * ____1111________________
1595+ * _44_____________________
1596+ * 11111111________________
1597+ * ____________________33__
1598+ * ___________44___________
1599+ * __________33333_________
1600+ * ______________22________
1601+ * ___________________2222_
1602+ * _________111111111______
1603+ * _____________________11_
1604+ * _________________4______
1605+ *
1606+ * Sanitized equivalent (no overlap):
1607+ * 1_______________________
1608+ * _44_____________________
1609+ * ___1____________________
1610+ * ____22__________________
1611+ * ______11________________
1612+ * _________1______________
1613+ * __________3_____________
1614+ * ___________44___________
1615+ * _____________33_________
1616+ * _______________2________
1617+ * ________________1_______
1618+ * _________________4______
1619+ * ___________________2____
1620+ * ____________________33__
1621+ * ______________________4_
1622+ */
1623+
1624+int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
1625+ int *pnr_map)
1626+{
1627+ struct change_member {
1628+ struct e820entry *pbios; /* pointer to original bios entry */
1629+ unsigned long long addr; /* address for this change point */
1630+ };
1631+ static struct change_member change_point_list[2*E820_X_MAX] __initdata;
1632+ static struct change_member *change_point[2*E820_X_MAX] __initdata;
1633+ static struct e820entry *overlap_list[E820_X_MAX] __initdata;
1634+ static struct e820entry new_bios[E820_X_MAX] __initdata;
1635+ struct change_member *change_tmp;
1636+ unsigned long current_type, last_type;
1637+ unsigned long long last_addr;
1638+ int chgidx, still_changing;
1639+ int overlap_entries;
1640+ int new_bios_entry;
1641+ int old_nr, new_nr, chg_nr;
1642+ int i;
1643+
1644+ /* if there's only one memory region, don't bother */
1645+#ifdef CONFIG_XEN
1646+ if (*pnr_map == 1)
1647+ return 0;
1648+#endif
1649+ if (*pnr_map < 2)
1650+ return -1;
1651+
1652+ old_nr = *pnr_map;
1653+ BUG_ON(old_nr > max_nr_map);
1654+
1655+ /* bail out if we find any unreasonable addresses in bios map */
1656+ for (i = 0; i < old_nr; i++)
1657+ if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
1658+ return -1;
1659+
1660+ /* create pointers for initial change-point information (for sorting) */
1661+ for (i = 0; i < 2 * old_nr; i++)
1662+ change_point[i] = &change_point_list[i];
1663+
1664+ /* record all known change-points (starting and ending addresses),
1665+ omitting those that are for empty memory regions */
1666+ chgidx = 0;
1667+ for (i = 0; i < old_nr; i++) {
1668+ if (biosmap[i].size != 0) {
1669+ change_point[chgidx]->addr = biosmap[i].addr;
1670+ change_point[chgidx++]->pbios = &biosmap[i];
1671+ change_point[chgidx]->addr = biosmap[i].addr +
1672+ biosmap[i].size;
1673+ change_point[chgidx++]->pbios = &biosmap[i];
1674+ }
1675+ }
1676+ chg_nr = chgidx;
1677+
1678+ /* sort change-point list by memory addresses (low -> high) */
1679+ still_changing = 1;
1680+ while (still_changing) {
1681+ still_changing = 0;
1682+ for (i = 1; i < chg_nr; i++) {
1683+ unsigned long long curaddr, lastaddr;
1684+ unsigned long long curpbaddr, lastpbaddr;
1685+
1686+ curaddr = change_point[i]->addr;
1687+ lastaddr = change_point[i - 1]->addr;
1688+ curpbaddr = change_point[i]->pbios->addr;
1689+ lastpbaddr = change_point[i - 1]->pbios->addr;
1690+
1691+ /*
1692+ * swap entries, when:
1693+ *
1694+ * curaddr > lastaddr or
1695+ * curaddr == lastaddr and curaddr == curpbaddr and
1696+ * lastaddr != lastpbaddr
1697+ */
1698+ if (curaddr < lastaddr ||
1699+ (curaddr == lastaddr && curaddr == curpbaddr &&
1700+ lastaddr != lastpbaddr)) {
1701+ change_tmp = change_point[i];
1702+ change_point[i] = change_point[i-1];
1703+ change_point[i-1] = change_tmp;
1704+ still_changing = 1;
1705+ }
1706+ }
1707+ }
1708+
1709+ /* create a new bios memory map, removing overlaps */
1710+ overlap_entries = 0; /* number of entries in the overlap table */
1711+ new_bios_entry = 0; /* index for creating new bios map entries */
1712+ last_type = 0; /* start with undefined memory type */
1713+ last_addr = 0; /* start with 0 as last starting address */
1714+
1715+ /* loop through change-points, determining affect on the new bios map */
1716+ for (chgidx = 0; chgidx < chg_nr; chgidx++) {
1717+ /* keep track of all overlapping bios entries */
1718+ if (change_point[chgidx]->addr ==
1719+ change_point[chgidx]->pbios->addr) {
1720+ /*
1721+ * add map entry to overlap list (> 1 entry
1722+ * implies an overlap)
1723+ */
1724+ overlap_list[overlap_entries++] =
1725+ change_point[chgidx]->pbios;
1726+ } else {
1727+ /*
1728+ * remove entry from list (order independent,
1729+ * so swap with last)
1730+ */
1731+ for (i = 0; i < overlap_entries; i++) {
1732+ if (overlap_list[i] ==
1733+ change_point[chgidx]->pbios)
1734+ overlap_list[i] =
1735+ overlap_list[overlap_entries-1];
1736+ }
1737+ overlap_entries--;
1738+ }
1739+ /*
1740+ * if there are overlapping entries, decide which
1741+ * "type" to use (larger value takes precedence --
1742+ * 1=usable, 2,3,4,4+=unusable)
1743+ */
1744+ current_type = 0;
1745+ for (i = 0; i < overlap_entries; i++)
1746+ if (overlap_list[i]->type > current_type)
1747+ current_type = overlap_list[i]->type;
1748+ /*
1749+ * continue building up new bios map based on this
1750+ * information
1751+ */
1752+ if (current_type != last_type) {
1753+ if (last_type != 0) {
1754+ new_bios[new_bios_entry].size =
1755+ change_point[chgidx]->addr - last_addr;
1756+ /*
1757+ * move forward only if the new size
1758+ * was non-zero
1759+ */
1760+ if (new_bios[new_bios_entry].size != 0)
1761+ /*
1762+ * no more space left for new
1763+ * bios entries ?
1764+ */
1765+ if (++new_bios_entry >= max_nr_map)
1766+ break;
1767+ }
1768+ if (current_type != 0) {
1769+ new_bios[new_bios_entry].addr =
1770+ change_point[chgidx]->addr;
1771+ new_bios[new_bios_entry].type = current_type;
1772+ last_addr = change_point[chgidx]->addr;
1773+ }
1774+ last_type = current_type;
1775+ }
1776+ }
1777+ /* retain count for new bios entries */
1778+ new_nr = new_bios_entry;
1779+
1780+ /* copy new bios mapping into original location */
1781+ memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry));
1782+ *pnr_map = new_nr;
1783+
1784+ return 0;
1785+}
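To make the precedence rule above concrete, here is a small hypothetical before/after pair for sanitize_e820_map() (addresses and types invented purely for illustration):

/*
 * Input map (overlapping):
 *   0x00000000 - 0x000a0000  E820_RAM       (type 1)
 *   0x00080000 - 0x00100000  E820_RESERVED  (type 2)
 *
 * Sanitized output (the higher type wins where the two overlap,
 * so the tail of the RAM entry is trimmed at 0x80000):
 *   0x00000000 - 0x00080000  E820_RAM
 *   0x00080000 - 0x00100000  E820_RESERVED
 */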
1786+
1787+static int __init __append_e820_map(struct e820entry *biosmap, int nr_map)
1788+{
1789+ while (nr_map) {
1790+ u64 start = biosmap->addr;
1791+ u64 size = biosmap->size;
1792+ u64 end = start + size;
1793+ u32 type = biosmap->type;
1794+
1795+ /* Overflow in 64 bits? Ignore the memory map. */
1796+ if (start > end)
1797+ return -1;
1798+
1799+ e820_add_region(start, size, type);
1800+
1801+ biosmap++;
1802+ nr_map--;
1803+ }
1804+ return 0;
1805+}
1806+
1807+/*
1808+ * Copy the BIOS e820 map into a safe place.
1809+ *
1810+ * Sanity-check it while we're at it..
1811+ *
1812+ * If we're lucky and live on a modern system, the setup code
1813+ * will have given us a memory map that we can use to properly
1814+ * set up memory. If we aren't, we'll fake a memory map.
1815+ */
1816+static int __init append_e820_map(struct e820entry *biosmap, int nr_map)
1817+{
1818+#ifndef CONFIG_XEN
1819+ /* Only one memory region (or negative)? Ignore it */
1820+ if (nr_map < 2)
1821+ return -1;
1822+#else
1823+ BUG_ON(nr_map < 1);
1824+#endif
1825+
1826+ return __append_e820_map(biosmap, nr_map);
1827+}
1828+
1829+static u64 __init e820_update_range_map(struct e820map *e820x, u64 start,
1830+ u64 size, unsigned old_type,
1831+ unsigned new_type)
1832+{
1833+ unsigned int i, x;
1834+ u64 real_updated_size = 0;
1835+
1836+ BUG_ON(old_type == new_type);
1837+
1838+ if (size > (ULLONG_MAX - start))
1839+ size = ULLONG_MAX - start;
1840+
1841+ for (i = 0; i < e820x->nr_map; i++) {
1842+ struct e820entry *ei = &e820x->map[i];
1843+ u64 final_start, final_end;
1844+ if (ei->type != old_type)
1845+ continue;
1846+ /* totally covered? */
1847+ if (ei->addr >= start &&
1848+ (ei->addr + ei->size) <= (start + size)) {
1849+ ei->type = new_type;
1850+ real_updated_size += ei->size;
1851+ continue;
1852+ }
1853+ /* partially covered */
1854+ final_start = max(start, ei->addr);
1855+ final_end = min(start + size, ei->addr + ei->size);
1856+ if (final_start >= final_end)
1857+ continue;
1858+
1859+ x = e820x->nr_map;
1860+ if (x == ARRAY_SIZE(e820x->map)) {
1861+ printk(KERN_ERR "Too many memory map entries!\n");
1862+ break;
1863+ }
1864+ e820x->map[x].addr = final_start;
1865+ e820x->map[x].size = final_end - final_start;
1866+ e820x->map[x].type = new_type;
1867+ e820x->nr_map++;
1868+
1869+ real_updated_size += final_end - final_start;
1870+
1871+ if (ei->addr < final_start)
1872+ continue;
1873+ ei->addr = final_end;
1874+ ei->size -= final_end - final_start;
1875+ }
1876+ return real_updated_size;
1877+}
1878+
1879+u64 __init e820_update_range(u64 start, u64 size, unsigned old_type,
1880+ unsigned new_type)
1881+{
1882+ return e820_update_range_map(&e820, start, size, old_type, new_type);
1883+}
1884+
1885+static u64 __init e820_update_range_saved(u64 start, u64 size,
1886+ unsigned old_type, unsigned new_type)
1887+{
1888+#ifdef CONFIG_XEN
1889+ if (is_initial_xendomain())
1890+ return e820_update_range_map(&machine_e820,
1891+ phys_to_machine(start), size,
1892+ old_type, new_type);
1893+#endif
1894+ return e820_update_range_map(&e820_saved, start, size, old_type,
1895+ new_type);
1896+}
1897+
1898+/* make e820 not cover the range */
1899+u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type,
1900+ int checktype)
1901+{
1902+ int i;
1903+ u64 real_removed_size = 0;
1904+
1905+ if (size > (ULLONG_MAX - start))
1906+ size = ULLONG_MAX - start;
1907+
1908+ for (i = 0; i < e820.nr_map; i++) {
1909+ struct e820entry *ei = &e820.map[i];
1910+ u64 final_start, final_end;
1911+
1912+ if (checktype && ei->type != old_type)
1913+ continue;
1914+ /* totally covered? */
1915+ if (ei->addr >= start &&
1916+ (ei->addr + ei->size) <= (start + size)) {
1917+ real_removed_size += ei->size;
1918+ memset(ei, 0, sizeof(struct e820entry));
1919+ continue;
1920+ }
1921+ /* partially covered */
1922+ final_start = max(start, ei->addr);
1923+ final_end = min(start + size, ei->addr + ei->size);
1924+ if (final_start >= final_end)
1925+ continue;
1926+ real_removed_size += final_end - final_start;
1927+
1928+ ei->size -= final_end - final_start;
1929+ if (ei->addr < final_start)
1930+ continue;
1931+ ei->addr = final_end;
1932+ }
1933+ return real_removed_size;
1934+}
1935+
1936+void __init update_e820(void)
1937+{
1938+ int nr_map;
1939+
1940+ nr_map = e820.nr_map;
1941+ if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr_map))
1942+ return;
1943+ e820.nr_map = nr_map;
1944+ printk(KERN_INFO "modified physical RAM map:\n");
1945+ e820_print_map("modified");
1946+}
1947+static void __init update_e820_saved(void)
1948+{
1949+ int nr_map;
1950+
1951+ nr_map = e820_saved.nr_map;
1952+ if (sanitize_e820_map(e820_saved.map, ARRAY_SIZE(e820_saved.map), &nr_map))
1953+ return;
1954+ e820_saved.nr_map = nr_map;
1955+}
1956+
1957+#ifdef CONFIG_XEN
1958+#define e820 machine_e820
1959+#endif
1960+
1961+#define MAX_GAP_END 0x100000000ull
1962+/*
1963+ * Search for a gap in the e820 memory space from start_addr to end_addr.
1964+ */
1965+__init int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize,
1966+ unsigned long start_addr, unsigned long long end_addr)
1967+{
1968+ unsigned long long last;
1969+ int i = e820.nr_map;
1970+ int found = 0;
1971+
1972+ last = (end_addr && end_addr < MAX_GAP_END) ? end_addr : MAX_GAP_END;
1973+#ifdef CONFIG_X86_64
1974+ if (start_addr >= MAX_GAP_END)
1975+ last = end_addr ?: (1UL << boot_cpu_data.x86_phys_bits);
1976+#endif
1977+
1978+ while (--i >= 0) {
1979+ unsigned long long start = e820.map[i].addr;
1980+ unsigned long long end = start + e820.map[i].size;
1981+
1982+ if (end < start_addr)
1983+ continue;
1984+
1985+ /*
1986+ * Since "last" is at most 4GB, we know we'll
1987+ * fit in 32 bits if this condition is true
1988+ */
1989+ if (last > end) {
1990+ unsigned long gap = last - end;
1991+
1992+ if (gap >= *gapsize) {
1993+ *gapsize = gap;
1994+ *gapstart = end;
1995+ found = 1;
1996+ }
1997+ }
1998+ if (start < last)
1999+ last = start;
2000+ }
2001+ return found;
2002+}
2003+
2004+/*
2005+ * Search for the biggest gap in the low 32 bits of the e820
2006+ * memory space. We pass this space to PCI to assign MMIO resources
2007+ * for hotplug or unconfigured devices in.
2008+ * Hopefully the BIOS left enough space.
2009+ */
2010+__init void e820_setup_gap(void)
2011+{
2012+ unsigned long gapstart, gapsize, round;
2013+ int found;
2014+
2015+ gapstart = 0x10000000;
2016+ gapsize = 0x400000;
2017+ found = e820_search_gap(&gapstart, &gapsize, 0, MAX_GAP_END);
2018+
2019+#ifdef CONFIG_X86_64
2020+ if (!found) {
2021+ printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit "
2022+ "address range\n"
2023+ KERN_ERR "PCI: Unassigned devices with 32bit resource "
2024+ "registers may break!\n");
2025+ found = e820_search_gap(&gapstart, &gapsize, MAX_GAP_END, 0);
2026+ BUG_ON(!found);
2027+ }
2028+#endif
2029+
2030+ /*
2031+ * See how much we want to round up: start off with
2032+ * rounding to the next 1MB area.
2033+ */
2034+ round = 0x100000;
2035+ while ((gapsize >> 4) > round)
2036+ round += round;
2037+ /* Fun with two's complement */
2038+ pci_mem_start = (gapstart + round) & -round;
2039+
2040+ printk(KERN_INFO
2041+ "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
2042+ pci_mem_start, gapstart, gapsize);
2043+}
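The rounding at the end of e820_setup_gap() can be exercised on its own; the following stand-alone sketch uses invented gap values purely to show the arithmetic:

#include <stdio.h>

int main(void)
{
	unsigned long gapstart = 0xd2345000UL;	/* hypothetical gap start */
	unsigned long gapsize  = 0x1e000000UL;	/* hypothetical gap size */
	unsigned long round    = 0x100000UL;	/* start rounding at 1MB */

	/* grow the rounding unit until it exceeds 1/16th of the gap */
	while ((gapsize >> 4) > round)
		round += round;

	/* -round is the two's-complement mask of the rounding unit:
	 * adding round and masking aligns gapstart up to that boundary */
	printf("pci_mem_start = %#lx\n", (gapstart + round) & -round);
	return 0;
}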
2044+
2045+#undef e820
2046+
2047+#ifndef CONFIG_XEN
2048+/**
2049+ * Because of the size limitation of struct boot_params, only first
2050+ * 128 E820 memory entries are passed to kernel via
2051+ * boot_params.e820_map, others are passed via SETUP_E820_EXT node of
2052+ * linked list of struct setup_data, which is parsed here.
2053+ */
2054+void __init parse_e820_ext(struct setup_data *sdata, unsigned long pa_data)
2055+{
2056+ u32 map_len;
2057+ int entries;
2058+ struct e820entry *extmap;
2059+
2060+ entries = sdata->len / sizeof(struct e820entry);
2061+ map_len = sdata->len + sizeof(struct setup_data);
2062+ if (map_len > PAGE_SIZE)
2063+ sdata = early_ioremap(pa_data, map_len);
2064+ extmap = (struct e820entry *)(sdata->data);
2065+ __append_e820_map(extmap, entries);
2066+ sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
2067+ if (map_len > PAGE_SIZE)
2068+ early_iounmap(sdata, map_len);
2069+ printk(KERN_INFO "extended physical RAM map:\n");
2070+ e820_print_map("extended");
2071+}
2072+
2073+#if defined(CONFIG_X86_64) || \
2074+ (defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION))
2075+/**
2076+ * Find the ranges of physical addresses that do not correspond to
2077+ * e820 RAM areas and mark the corresponding pages as nosave for
2078+ * hibernation (32 bit) or software suspend and suspend to RAM (64 bit).
2079+ *
2080+ * This function requires the e820 map to be sorted and without any
2081+ * overlapping entries and assumes the first e820 area to be RAM.
2082+ */
2083+void __init e820_mark_nosave_regions(unsigned long limit_pfn)
2084+{
2085+ int i;
2086+ unsigned long pfn;
2087+
2088+ pfn = PFN_DOWN(e820.map[0].addr + e820.map[0].size);
2089+ for (i = 1; i < e820.nr_map; i++) {
2090+ struct e820entry *ei = &e820.map[i];
2091+
2092+ if (pfn < PFN_UP(ei->addr))
2093+ register_nosave_region(pfn, PFN_UP(ei->addr));
2094+
2095+ pfn = PFN_DOWN(ei->addr + ei->size);
2096+ if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN)
2097+ register_nosave_region(PFN_UP(ei->addr), pfn);
2098+
2099+ if (pfn >= limit_pfn)
2100+ break;
2101+ }
2102+}
2103+#endif
2104+#endif
2105+
2106+/*
2107+ * Early reserved memory areas.
2108+ */
2109+#define MAX_EARLY_RES 20
2110+
2111+struct early_res {
2112+ u64 start, end;
2113+ char name[16];
2114+ char overlap_ok;
2115+};
2116+static struct early_res early_res[MAX_EARLY_RES] __initdata = {
2117+#ifndef CONFIG_XEN
2118+ { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */
2119+#if defined(CONFIG_X86_64) && defined(CONFIG_X86_TRAMPOLINE)
2120+ { TRAMPOLINE_BASE, TRAMPOLINE_BASE + 2 * PAGE_SIZE, "TRAMPOLINE" },
2121+#endif
2122+#if defined(CONFIG_X86_32) && defined(CONFIG_SMP)
2123+ /*
2124+ * But first pinch a few for the stack/trampoline stuff
2125+ * FIXME: Don't need the extra page at 4K, but need to fix
2126+ * trampoline before removing it. (see the GDT stuff)
2127+ */
2128+ { PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE" },
2129+ /*
2130+ * Has to be in very low memory so we can execute
2131+ * real-mode AP code.
2132+ */
2133+ { TRAMPOLINE_BASE, TRAMPOLINE_BASE + PAGE_SIZE, "TRAMPOLINE" },
2134+#endif
2135+#endif
2136+ {}
2137+};
2138+
2139+static int __init find_overlapped_early(u64 start, u64 end)
2140+{
2141+ int i;
2142+ struct early_res *r;
2143+
2144+ for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
2145+ r = &early_res[i];
2146+ if (end > r->start && start < r->end)
2147+ break;
2148+ }
2149+
2150+ return i;
2151+}
2152+
2153+/*
2154+ * Drop the i-th range from the early reservation map,
2155+ * by copying any higher ranges down one over it, and
2156+ * clearing what had been the last slot.
2157+ */
2158+static void __init drop_range(int i)
2159+{
2160+ int j;
2161+
2162+ for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++)
2163+ ;
2164+
2165+ memmove(&early_res[i], &early_res[i + 1],
2166+ (j - 1 - i) * sizeof(struct early_res));
2167+
2168+ early_res[j - 1].end = 0;
2169+}
2170+
2171+/*
2172+ * Split any existing ranges that:
2173+ * 1) are marked 'overlap_ok', and
2174+ * 2) overlap with the stated range [start, end)
2175+ * into whatever portion (if any) of the existing range is entirely
2176+ * below or entirely above the stated range. Drop the portion
2177+ * of the existing range that overlaps with the stated range,
2178+ * which will allow the caller of this routine to then add that
2179+ * stated range without conflicting with any existing range.
2180+ */
2181+static void __init drop_overlaps_that_are_ok(u64 start, u64 end)
2182+{
2183+ int i;
2184+ struct early_res *r;
2185+ u64 lower_start, lower_end;
2186+ u64 upper_start, upper_end;
2187+ char name[16];
2188+
2189+ for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
2190+ r = &early_res[i];
2191+
2192+ /* Continue past non-overlapping ranges */
2193+ if (end <= r->start || start >= r->end)
2194+ continue;
2195+
2196+ /*
2197+ * Leave non-ok overlaps as is; let caller
2198+ * panic "Overlapping early reservations"
2199+ * when it hits this overlap.
2200+ */
2201+ if (!r->overlap_ok)
2202+ return;
2203+
2204+ /*
2205+ * We have an ok overlap. We will drop it from the early
2206+ * reservation map, and add back in any non-overlapping
2207+ * portions (lower or upper) as separate, overlap_ok,
2208+ * non-overlapping ranges.
2209+ */
2210+
2211+ /* 1. Note any non-overlapping (lower or upper) ranges. */
2212+ strncpy(name, r->name, sizeof(name) - 1);
2213+
2214+ lower_start = lower_end = 0;
2215+ upper_start = upper_end = 0;
2216+ if (r->start < start) {
2217+ lower_start = r->start;
2218+ lower_end = start;
2219+ }
2220+ if (r->end > end) {
2221+ upper_start = end;
2222+ upper_end = r->end;
2223+ }
2224+
2225+ /* 2. Drop the original ok overlapping range */
2226+ drop_range(i);
2227+
2228+ i--; /* resume for-loop on copied down entry */
2229+
2230+ /* 3. Add back in any non-overlapping ranges. */
2231+ if (lower_end)
2232+ reserve_early_overlap_ok(lower_start, lower_end, name);
2233+ if (upper_end)
2234+ reserve_early_overlap_ok(upper_start, upper_end, name);
2235+ }
2236+}
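A small hypothetical case of the splitting described above (addresses invented): an existing overlap_ok range is cut into the pieces outside the new reservation:

/*
 * existing (overlap_ok):  [0x1000, 0x5000)  "early map"
 * new reservation:        [0x2000, 0x3000)
 *
 * after drop_overlaps_that_are_ok(0x2000, 0x3000):
 *   [0x1000, 0x2000)  "early map"   (lower remainder, still overlap_ok)
 *   [0x3000, 0x5000)  "early map"   (upper remainder, still overlap_ok)
 *
 * The caller can then reserve [0x2000, 0x3000) without triggering the
 * "Overlapping early reservations" panic.
 */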
2237+
2238+static void __init __reserve_early(u64 start, u64 end, char *name,
2239+ int overlap_ok)
2240+{
2241+ int i;
2242+ struct early_res *r;
2243+
2244+ i = find_overlapped_early(start, end);
2245+ if (i >= MAX_EARLY_RES)
2246+ panic("Too many early reservations");
2247+ r = &early_res[i];
2248+ if (r->end)
2249+ panic("Overlapping early reservations "
2250+ "%llx-%llx %s to %llx-%llx %s\n",
2251+ start, end - 1, name?name:"", r->start,
2252+ r->end - 1, r->name);
2253+ r->start = start;
2254+ r->end = end;
2255+ r->overlap_ok = overlap_ok;
2256+ if (name)
2257+ strncpy(r->name, name, sizeof(r->name) - 1);
2258+}
2259+
2260+/*
2261+ * A few early reservations come here.
2262+ *
2263+ * The 'overlap_ok' in the name of this routine does -not- mean it
2264+ * is ok for these reservations to overlap an earlier reservation.
2265+ * Rather it means that it is ok for subsequent reservations to
2266+ * overlap this one.
2267+ *
2268+ * Use this entry point to reserve early ranges when you are doing
2269+ * so out of "Paranoia", reserving perhaps more memory than you need,
2270+ * just in case, and don't mind a subsequent overlapping reservation
2271+ * that is known to be needed.
2272+ *
2273+ * The drop_overlaps_that_are_ok() call here isn't really needed.
2274+ * It would be needed if we had two colliding 'overlap_ok'
2275+ * reservations, so that the second such would not panic on the
2276+ * overlap with the first. We don't have any such as of this
2277+ * writing, but might as well tolerate such if it happens in
2278+ * the future.
2279+ */
2280+void __init reserve_early_overlap_ok(u64 start, u64 end, char *name)
2281+{
2282+ drop_overlaps_that_are_ok(start, end);
2283+ __reserve_early(start, end, name, 1);
2284+}
2285+
2286+/*
2287+ * Most early reservations come here.
2288+ *
2289+ * We first have drop_overlaps_that_are_ok() drop any pre-existing
2290+ * 'overlap_ok' ranges, so that we can then reserve this memory
2291+ * range without risk of panic'ing on an overlapping overlap_ok
2292+ * early reservation.
2293+ */
2294+void __init reserve_early(u64 start, u64 end, char *name)
2295+{
2296+ drop_overlaps_that_are_ok(start, end);
2297+ __reserve_early(start, end, name, 0);
2298+}
2299+
2300+void __init free_early(u64 start, u64 end)
2301+{
2302+ struct early_res *r;
2303+ int i;
2304+
2305+ i = find_overlapped_early(start, end);
2306+ r = &early_res[i];
2307+ if (i >= MAX_EARLY_RES || r->end != end || r->start != start)
2308+ panic("free_early on not reserved area: %llx-%llx!",
2309+ start, end - 1);
2310+
2311+ drop_range(i);
2312+}
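For orientation, a caller of this reservation API might look roughly like the sketch below; the range, size and names are invented for illustration:

static void __init example_early_reservations(void)
{
	u64 base = 0x9f000;	/* hypothetical firmware area */

	/* "paranoid" reservation: later reservations may overlap it */
	reserve_early_overlap_ok(base, base + 0x1000, "FW AREA");

	/* an exact reservation inside that range does not panic, because
	 * the overlap_ok range is split around it first */
	reserve_early(base, base + 0x400, "FW AREA low");

	/* give the exact reservation back once it is no longer needed */
	free_early(base, base + 0x400);
}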
2313+
2314+void __init early_res_to_bootmem(u64 start, u64 end)
2315+{
2316+ int i, count;
2317+ u64 final_start, final_end;
2318+
2319+ count = 0;
2320+ for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++)
2321+ count++;
2322+
2323+ printk(KERN_INFO "(%d early reservations) ==> bootmem [%010llx - %010llx]\n",
2324+ count, start, end);
2325+ for (i = 0; i < count; i++) {
2326+ struct early_res *r = &early_res[i];
2327+ printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i,
2328+ r->start, r->end, r->name);
2329+ final_start = max(start, r->start);
2330+ final_end = min(end, r->end);
2331+ if (final_start >= final_end) {
2332+ printk(KERN_CONT "\n");
2333+ continue;
2334+ }
2335+ printk(KERN_CONT " ==> [%010llx - %010llx]\n",
2336+ final_start, final_end);
2337+ reserve_bootmem_generic(final_start, final_end - final_start,
2338+ BOOTMEM_DEFAULT);
2339+ }
2340+}
2341+
2342+/* Check for already reserved areas */
2343+static inline int __init bad_addr(u64 *addrp, u64 size, u64 align)
2344+{
2345+ int i;
2346+ u64 addr = *addrp;
2347+ int changed = 0;
2348+ struct early_res *r;
2349+again:
2350+ i = find_overlapped_early(addr, addr + size);
2351+ r = &early_res[i];
2352+ if (i < MAX_EARLY_RES && r->end) {
2353+ *addrp = addr = round_up(r->end, align);
2354+ changed = 1;
2355+ goto again;
2356+ }
2357+ return changed;
2358+}
2359+
2360+/* Check for already reserved areas */
2361+static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align)
2362+{
2363+ int i;
2364+ u64 addr = *addrp, last;
2365+ u64 size = *sizep;
2366+ int changed = 0;
2367+again:
2368+ last = addr + size;
2369+ for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
2370+ struct early_res *r = &early_res[i];
2371+ if (last > r->start && addr < r->start) {
2372+ size = r->start - addr;
2373+ changed = 1;
2374+ goto again;
2375+ }
2376+ if (last > r->end && addr < r->end) {
2377+ addr = round_up(r->end, align);
2378+ size = last - addr;
2379+ changed = 1;
2380+ goto again;
2381+ }
2382+ if (last <= r->end && addr >= r->start) {
2383+ (*sizep)++;
2384+ return 0;
2385+ }
2386+ }
2387+ if (changed) {
2388+ *addrp = addr;
2389+ *sizep = size;
2390+ }
2391+ return changed;
2392+}
2393+
2394+/*
2395+ * Find a free area with specified alignment in a specific range.
2396+ */
2397+u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align)
2398+{
2399+ int i;
2400+
2401+ for (i = 0; i < e820.nr_map; i++) {
2402+ struct e820entry *ei = &e820.map[i];
2403+ u64 addr, last;
2404+ u64 ei_last;
2405+
2406+ if (ei->type != E820_RAM)
2407+ continue;
2408+ addr = round_up(ei->addr, align);
2409+ ei_last = ei->addr + ei->size;
2410+ if (addr < start)
2411+ addr = round_up(start, align);
2412+ if (addr >= ei_last)
2413+ continue;
2414+ while (bad_addr(&addr, size, align) && addr+size <= ei_last)
2415+ ;
2416+ last = addr + size;
2417+ if (last > ei_last)
2418+ continue;
2419+ if (last > end)
2420+ continue;
2421+ return addr;
2422+ }
2423+ return -1ULL;
2424+}
2425+
2426+/*
2427+ * Find next free range after *start
2428+ */
2429+u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align)
2430+{
2431+ int i;
2432+
2433+ for (i = 0; i < e820.nr_map; i++) {
2434+ struct e820entry *ei = &e820.map[i];
2435+ u64 addr, last;
2436+ u64 ei_last;
2437+
2438+ if (ei->type != E820_RAM)
2439+ continue;
2440+ addr = round_up(ei->addr, align);
2441+ ei_last = ei->addr + ei->size;
2442+ if (addr < start)
2443+ addr = round_up(start, align);
2444+ if (addr >= ei_last)
2445+ continue;
2446+ *sizep = ei_last - addr;
2447+ while (bad_addr_size(&addr, sizep, align) &&
2448+ addr + *sizep <= ei_last)
2449+ ;
2450+ last = addr + *sizep;
2451+ if (last > ei_last)
2452+ continue;
2453+ return addr;
2454+ }
2455+
2456+ return -1ULL;
2457+}
2458+
2459+/*
2460+ * Pre-allocate 4K and reserve it in the e820 map.
2461+ */
2462+u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align)
2463+{
2464+ u64 size = 0;
2465+ u64 addr;
2466+ u64 start;
2467+#ifdef CONFIG_XEN
2468+ unsigned int order = get_order(sizet);
2469+
2470+ if (is_initial_xendomain()) {
2471+ sizet = PAGE_SIZE << order;
2472+ if (align < PAGE_SIZE)
2473+ align = PAGE_SIZE;
2474+ }
2475+#endif
2476+ for (start = startt; ; start += size) {
2477+ start = find_e820_area_size(start, &size, align);
2478+ if (!(start + 1))
2479+ return 0;
2480+ if (size >= sizet)
2481+ break;
2482+ }
2483+
2484+#ifdef CONFIG_X86_32
2485+ if (start >= MAXMEM)
2486+ return 0;
2487+ if (start + size > MAXMEM)
2488+ size = MAXMEM - start;
2489+#endif
2490+#ifdef CONFIG_XEN
2491+ if ((start >> PAGE_SHIFT) >= xen_start_info->nr_pages)
2492+ return 0;
2493+ if (PFN_UP(start + size) > xen_start_info->nr_pages)
2494+ size = ((u64)xen_start_info->nr_pages << PAGE_SHIFT) - start;
2495+#endif
2496+
2497+ addr = round_down(start + size - sizet, align);
2498+ if (addr < start)
2499+ return 0;
2500+#ifdef CONFIG_XEN
2501+ if (is_initial_xendomain()) {
2502+ int rc;
2503+ unsigned long max_initmap_pfn;
2504+
2505+ max_initmap_pfn = ALIGN(PFN_UP(__pa(xen_start_info->pt_base))
2506+ + xen_start_info->nr_pt_frames
2507+ + 1 + (1 << (19 - PAGE_SHIFT)),
2508+ 1UL << (22 - PAGE_SHIFT));
2509+#ifdef CONFIG_X86_32
2510+ if ((addr >> PAGE_SHIFT)
2511+ < max(max_initmap_pfn, max_pfn_mapped))
2512+ rc = xen_create_contiguous_region((unsigned long)
2513+ __va(addr),
2514+ order, 32);
2515+#else
2516+ if ((addr >> PAGE_SHIFT) < max_pfn_mapped)
2517+ rc = xen_create_contiguous_region((unsigned long)
2518+ __va(addr),
2519+ order, 32);
2520+ else if ((addr >> PAGE_SHIFT) < max_initmap_pfn)
2521+ rc = xen_create_contiguous_region(__START_KERNEL_map
2522+ + addr,
2523+ order, 32);
2524+#endif
2525+ else
2526+ rc = early_create_contiguous_region(addr >> PAGE_SHIFT,
2527+ order, 32);
2528+ if (rc)
2529+ return 0;
2530+ }
2531+#endif
2532+ e820_update_range(addr, sizet, E820_RAM, E820_RESERVED);
2533+ e820_update_range_saved(addr, sizet, E820_RAM, E820_RESERVED);
2534+ printk(KERN_INFO "update e820 for early_reserve_e820\n");
2535+ update_e820();
2536+ update_e820_saved();
2537+
2538+ return addr;
2539+}
2540+
2541+#ifdef CONFIG_X86_32
2542+# ifdef CONFIG_X86_PAE
2543+# define MAX_ARCH_PFN (1ULL<<(40-PAGE_SHIFT))
2544+# else
2545+# define MAX_ARCH_PFN (1ULL<<(32-PAGE_SHIFT))
2546+# endif
2547+#else /* CONFIG_X86_32 */
2548+# define MAX_ARCH_PFN MAXMEM>>PAGE_SHIFT
2549+#endif
2550+
2551+/*
2552+ * Find the highest page frame number we have available
2553+ */
2554+static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
2555+{
2556+ int i;
2557+ unsigned long last_pfn = 0;
2558+ unsigned long max_arch_pfn = MAX_ARCH_PFN;
2559+
2560+ for (i = 0; i < e820.nr_map; i++) {
2561+ struct e820entry *ei = &e820.map[i];
2562+ unsigned long start_pfn;
2563+ unsigned long end_pfn;
2564+
2565+ if (ei->type != type)
2566+ continue;
2567+
2568+ start_pfn = ei->addr >> PAGE_SHIFT;
2569+ end_pfn = (ei->addr + ei->size) >> PAGE_SHIFT;
2570+
2571+ if (start_pfn >= limit_pfn)
2572+ continue;
2573+ if (end_pfn > limit_pfn) {
2574+ last_pfn = limit_pfn;
2575+ break;
2576+ }
2577+ if (end_pfn > last_pfn)
2578+ last_pfn = end_pfn;
2579+ }
2580+
2581+ if (last_pfn > max_arch_pfn)
2582+ last_pfn = max_arch_pfn;
2583+
2584+ printk(KERN_INFO "last_pfn = %#lx max_arch_pfn = %#lx\n",
2585+ last_pfn, max_arch_pfn);
2586+ return last_pfn;
2587+}
2588+unsigned long __init e820_end_of_ram_pfn(void)
2589+{
2590+ return e820_end_pfn(MAX_ARCH_PFN, E820_RAM);
2591+}
2592+
2593+unsigned long __init e820_end_of_low_ram_pfn(void)
2594+{
2595+ return e820_end_pfn(1UL<<(32 - PAGE_SHIFT), E820_RAM);
2596+}
2597+/*
2598+ * Finds an active region in the address range from start_pfn to last_pfn and
2599+ * returns its range in ei_startpfn and ei_endpfn for the e820 entry.
2600+ */
2601+int __init e820_find_active_region(const struct e820entry *ei,
2602+ unsigned long start_pfn,
2603+ unsigned long last_pfn,
2604+ unsigned long *ei_startpfn,
2605+ unsigned long *ei_endpfn)
2606+{
2607+ u64 align = PAGE_SIZE;
2608+
2609+ *ei_startpfn = round_up(ei->addr, align) >> PAGE_SHIFT;
2610+ *ei_endpfn = round_down(ei->addr + ei->size, align) >> PAGE_SHIFT;
2611+
2612+ /* Skip map entries smaller than a page */
2613+ if (*ei_startpfn >= *ei_endpfn)
2614+ return 0;
2615+
2616+ /* Skip if map is outside the node */
2617+ if (ei->type != E820_RAM || *ei_endpfn <= start_pfn ||
2618+ *ei_startpfn >= last_pfn)
2619+ return 0;
2620+
2621+ /* Check for overlaps */
2622+ if (*ei_startpfn < start_pfn)
2623+ *ei_startpfn = start_pfn;
2624+ if (*ei_endpfn > last_pfn)
2625+ *ei_endpfn = last_pfn;
2626+
2627+ return 1;
2628+}
2629+
2630+/* Walk the e820 map and register active regions within a node */
2631+void __init e820_register_active_regions(int nid, unsigned long start_pfn,
2632+ unsigned long last_pfn)
2633+{
2634+ unsigned long ei_startpfn;
2635+ unsigned long ei_endpfn;
2636+ int i;
2637+
2638+ for (i = 0; i < e820.nr_map; i++)
2639+ if (e820_find_active_region(&e820.map[i],
2640+ start_pfn, last_pfn,
2641+ &ei_startpfn, &ei_endpfn))
2642+ add_active_range(nid, ei_startpfn, ei_endpfn);
2643+}
2644+
2645+/*
2646+ * Find the hole size (in bytes) in the memory range.
2647+ * @start: starting address of the memory range to scan
2648+ * @end: ending address of the memory range to scan
2649+ */
2650+u64 __init e820_hole_size(u64 start, u64 end)
2651+{
2652+ unsigned long start_pfn = start >> PAGE_SHIFT;
2653+ unsigned long last_pfn = end >> PAGE_SHIFT;
2654+ unsigned long ei_startpfn, ei_endpfn, ram = 0;
2655+ int i;
2656+
2657+ for (i = 0; i < e820.nr_map; i++) {
2658+ if (e820_find_active_region(&e820.map[i],
2659+ start_pfn, last_pfn,
2660+ &ei_startpfn, &ei_endpfn))
2661+ ram += ei_endpfn - ei_startpfn;
2662+ }
2663+ return end - start - ((u64)ram << PAGE_SHIFT);
2664+}
2665+
2666+static void early_panic(char *msg)
2667+{
2668+ early_printk(msg);
2669+ panic(msg);
2670+}
2671+
2672+static int userdef __initdata;
2673+
2674+/* "mem=nopentium" disables the 4MB page tables. */
2675+static int __init parse_memopt(char *p)
2676+{
2677+ u64 mem_size, current_end;
2678+ unsigned int i;
2679+
2680+ if (!p)
2681+ return -EINVAL;
2682+
2683+#ifdef CONFIG_X86_32
2684+ if (!strcmp(p, "nopentium")) {
2685+ setup_clear_cpu_cap(X86_FEATURE_PSE);
2686+ return 0;
2687+ }
2688+#endif
2689+
2690+ userdef = 1;
2691+ mem_size = memparse(p, &p);
2692+ e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
2693+
2694+ i = e820.nr_map - 1;
2695+ current_end = e820.map[i].addr + e820.map[i].size;
2696+ if (current_end < mem_size) {
2697+ /*
2698+ * The e820 map ends before our requested size so
2699+ * extend the final entry to the requested address.
2700+ */
2701+ if (e820.map[i].type == E820_RAM)
2702+ e820.map[i].size = mem_size - e820.map[i].addr;
2703+ else
2704+ e820_add_region(current_end, mem_size - current_end, E820_RAM);
2705+ }
2706+
2707+ return 0;
2708+}
2709+early_param("mem", parse_memopt);
2710+
2711+#ifndef CONFIG_XEN
2712+static int __init parse_memmap_opt(char *p)
2713+{
2714+ char *oldp;
2715+ u64 start_at, mem_size;
2716+
2717+ if (!p)
2718+ return -EINVAL;
2719+
2720+ if (!strncmp(p, "exactmap", 8)) {
2721+#ifdef CONFIG_CRASH_DUMP
2722+ /*
2723+ * If we are doing a crash dump, we still need to know
2724+ * the real mem size before original memory map is
2725+ * reset.
2726+ */
2727+ saved_max_pfn = e820_end_of_ram_pfn();
2728+#endif
2729+ e820.nr_map = 0;
2730+ userdef = 1;
2731+ return 0;
2732+ }
2733+
2734+ oldp = p;
2735+ mem_size = memparse(p, &p);
2736+ if (p == oldp)
2737+ return -EINVAL;
2738+
2739+ userdef = 1;
2740+ if (*p == '@') {
2741+ start_at = memparse(p+1, &p);
2742+ e820_add_region(start_at, mem_size, E820_RAM);
2743+ } else if (*p == '#') {
2744+ start_at = memparse(p+1, &p);
2745+ e820_add_region(start_at, mem_size, E820_ACPI);
2746+ } else if (*p == '$') {
2747+ start_at = memparse(p+1, &p);
2748+ e820_add_region(start_at, mem_size, E820_RESERVED);
2749+ } else
2750+ e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
2751+
2752+ return *p == '\0' ? 0 : -EINVAL;
2753+}
2754+early_param("memmap", parse_memmap_opt);
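For reference, the options handled by parse_memopt() and parse_memmap_opt() above appear on the kernel command line in forms like the following (sizes and addresses are only examples):

	mem=512M                    cap usable RAM at 512MB
	memmap=exactmap memmap=640K@0 memmap=511M@1M
	                            discard the firmware map and supply one
	memmap=16M#0x20000000       mark a 16MB range as ACPI data
	memmap=64M$0x30000000       mark a 64MB range as reserved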
2755+
2756+void __init finish_e820_parsing(void)
2757+{
2758+ if (userdef) {
2759+ int nr = e820.nr_map;
2760+
2761+ if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr) < 0)
2762+ early_panic("Invalid user supplied memory map");
2763+ e820.nr_map = nr;
2764+
2765+ printk(KERN_INFO "user-defined physical RAM map:\n");
2766+ e820_print_map("user");
2767+ }
2768+}
2769+#endif
2770+
2771+static inline const char *e820_type_to_string(int e820_type)
2772+{
2773+ switch (e820_type) {
2774+ case E820_RESERVED_KERN:
2775+ case E820_RAM: return "System RAM";
2776+ case E820_ACPI: return "ACPI Tables";
2777+ case E820_NVS: return "ACPI Non-volatile Storage";
2778+ default: return "reserved";
2779+ }
2780+}
2781+
2782+#ifdef CONFIG_XEN
2783+#define e820 machine_e820
2784+#endif
2785+
2786+/*
2787+ * Mark e820 reserved areas as busy for the resource manager.
2788+ */
2789+void __init e820_reserve_resources(void)
2790+{
2791+ int i;
2792+ struct resource *res;
2793+ u64 end;
2794+
2795+ res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map);
2796+ for (i = 0; i < e820.nr_map; i++) {
2797+ end = e820.map[i].addr + e820.map[i].size - 1;
2798+#ifndef CONFIG_RESOURCES_64BIT
2799+ if (end > 0x100000000ULL) {
2800+ res++;
2801+ continue;
2802+ }
2803+#endif
2804+ res->name = e820_type_to_string(e820.map[i].type);
2805+ res->start = e820.map[i].addr;
2806+ res->end = end;
2807+
2808+ res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
2809+ insert_resource(&iomem_resource, res);
2810+ res++;
2811+ }
2812+
2813+ for (i = 0; i < e820_saved.nr_map; i++) {
2814+ struct e820entry *entry = &e820_saved.map[i];
2815+ firmware_map_add_early(entry->addr,
2816+ entry->addr + entry->size - 1,
2817+ e820_type_to_string(entry->type));
2818+ }
2819+}
2820+
2821+#undef e820
2822+
2823+#ifndef CONFIG_XEN
2824+char *__init default_machine_specific_memory_setup(void)
2825+{
2826+ char *who = "BIOS-e820";
2827+ int new_nr;
2828+ /*
2829+ * Try to copy the BIOS-supplied E820-map.
2830+ *
2831+ * Otherwise fake a memory map; one section from 0k->640k,
2832+ * the next section from 1mb->appropriate_mem_k
2833+ */
2834+ new_nr = boot_params.e820_entries;
2835+ sanitize_e820_map(boot_params.e820_map,
2836+ ARRAY_SIZE(boot_params.e820_map),
2837+ &new_nr);
2838+ boot_params.e820_entries = new_nr;
2839+ if (append_e820_map(boot_params.e820_map, boot_params.e820_entries)
2840+ < 0) {
2841+ u64 mem_size;
2842+
2843+ /* compare results from other methods and take the greater */
2844+ if (boot_params.alt_mem_k
2845+ < boot_params.screen_info.ext_mem_k) {
2846+ mem_size = boot_params.screen_info.ext_mem_k;
2847+ who = "BIOS-88";
2848+ } else {
2849+ mem_size = boot_params.alt_mem_k;
2850+ who = "BIOS-e801";
2851+ }
2852+
2853+ e820.nr_map = 0;
2854+ e820_add_region(0, LOWMEMSIZE(), E820_RAM);
2855+ e820_add_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
2856+ }
2857+
2858+ /* In case someone cares... */
2859+ return who;
2860+}
2861+
2862+char *__init __attribute__((weak)) machine_specific_memory_setup(void)
2863+{
2864+ if (x86_quirks->arch_memory_setup) {
2865+ char *who = x86_quirks->arch_memory_setup();
2866+
2867+ if (who)
2868+ return who;
2869+ }
2870+ return default_machine_specific_memory_setup();
2871+}
2872+#endif
2873+
2874+char * __init memory_setup(void)
2875+{
2876+ int rc, nr_map;
2877+ struct xen_memory_map memmap;
2878+ /*
2879+ * This is rather large for a stack variable but this early in
2880+ * the boot process we know we have plenty of slack space.
2881+ */
2882+ struct e820entry map[E820MAX];
2883+
2884+ memmap.nr_entries = E820MAX;
2885+ set_xen_guest_handle(memmap.buffer, map);
2886+
2887+ rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
2888+ if (rc == -ENOSYS) {
2889+ memmap.nr_entries = 1;
2890+ map[0].addr = 0ULL;
2891+ map[0].size = PFN_PHYS((unsigned long long)xen_start_info->nr_pages);
2892+ /* 8MB slack (to balance backend allocations). */
2893+ map[0].size += 8ULL << 20;
2894+ map[0].type = E820_RAM;
2895+ rc = 0;
2896+ }
2897+ BUG_ON(rc);
2898+
2899+ nr_map = memmap.nr_entries;
2900+ sanitize_e820_map(map, ARRAY_SIZE(map), &nr_map);
2901+
2902+ if (append_e820_map(map, nr_map) < 0)
2903+ BUG();
2904+
2905+#ifdef CONFIG_XEN
2906+ if (is_initial_xendomain()) {
2907+ memmap.nr_entries = E820MAX;
2908+ set_xen_guest_handle(memmap.buffer, machine_e820.map);
2909+
2910+ if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap))
2911+ BUG();
2912+ machine_e820.nr_map = memmap.nr_entries;
2913+ }
2914+#endif
2915+
2916+ return "Xen";
2917+}
2918+
2919+void __init setup_memory_map(void)
2920+{
2921+ char *who;
2922+
2923+ who = memory_setup();
2924+#ifdef CONFIG_XEN
2925+ if (!is_initial_xendomain())
2926+#endif
2927+ memcpy(&e820_saved, &e820, sizeof(struct e820map));
2928+ printk(KERN_INFO "Xen-provided physical RAM map:\n");
2929+ e820_print_map(who);
2930+}
2931--- sle11-2009-06-04.orig/arch/x86/kernel/e820_32-xen.c 2009-03-16 16:38:05.000000000 +0100
2932+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
2933@@ -1,873 +0,0 @@
2934-#include <linux/kernel.h>
2935-#include <linux/types.h>
2936-#include <linux/init.h>
2937-#include <linux/bootmem.h>
2938-#include <linux/ioport.h>
2939-#include <linux/string.h>
2940-#include <linux/kexec.h>
2941-#include <linux/module.h>
2942-#include <linux/mm.h>
2943-#include <linux/pfn.h>
2944-#include <linux/uaccess.h>
2945-#include <linux/suspend.h>
2946-
2947-#include <asm/pgtable.h>
2948-#include <asm/page.h>
2949-#include <asm/e820.h>
2950-#include <asm/setup.h>
2951-#include <xen/interface/memory.h>
2952-
2953-struct e820map e820;
2954-struct change_member {
2955- struct e820entry *pbios; /* pointer to original bios entry */
2956- unsigned long long addr; /* address for this change point */
2957-};
2958-static struct change_member change_point_list[2*E820MAX] __initdata;
2959-static struct change_member *change_point[2*E820MAX] __initdata;
2960-static struct e820entry *overlap_list[E820MAX] __initdata;
2961-static struct e820entry new_bios[E820MAX] __initdata;
2962-/* For PCI or other memory-mapped resources */
2963-unsigned long pci_mem_start = 0x10000000;
2964-#ifdef CONFIG_PCI
2965-EXPORT_SYMBOL(pci_mem_start);
2966-#endif
2967-extern int user_defined_memmap;
2968-
2969-static struct resource system_rom_resource = {
2970- .name = "System ROM",
2971- .start = 0xf0000,
2972- .end = 0xfffff,
2973- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
2974-};
2975-
2976-static struct resource extension_rom_resource = {
2977- .name = "Extension ROM",
2978- .start = 0xe0000,
2979- .end = 0xeffff,
2980- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
2981-};
2982-
2983-static struct resource adapter_rom_resources[] = { {
2984- .name = "Adapter ROM",
2985- .start = 0xc8000,
2986- .end = 0,
2987- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
2988-}, {
2989- .name = "Adapter ROM",
2990- .start = 0,
2991- .end = 0,
2992- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
2993-}, {
2994- .name = "Adapter ROM",
2995- .start = 0,
2996- .end = 0,
2997- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
2998-}, {
2999- .name = "Adapter ROM",
3000- .start = 0,
3001- .end = 0,
3002- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
3003-}, {
3004- .name = "Adapter ROM",
3005- .start = 0,
3006- .end = 0,
3007- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
3008-}, {
3009- .name = "Adapter ROM",
3010- .start = 0,
3011- .end = 0,
3012- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
3013-} };
3014-
3015-static struct resource video_rom_resource = {
3016- .name = "Video ROM",
3017- .start = 0xc0000,
3018- .end = 0xc7fff,
3019- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
3020-};
3021-
3022-#define ROMSIGNATURE 0xaa55
3023-
3024-static int __init romsignature(const unsigned char *rom)
3025-{
3026- const unsigned short * const ptr = (const unsigned short *)rom;
3027- unsigned short sig;
3028-
3029- return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE;
3030-}
3031-
3032-static int __init romchecksum(const unsigned char *rom, unsigned long length)
3033-{
3034- unsigned char sum, c;
3035-
3036- for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--)
3037- sum += c;
3038- return !length && !sum;
3039-}
3040-
3041-static void __init probe_roms(void)
3042-{
3043- const unsigned char *rom;
3044- unsigned long start, length, upper;
3045- unsigned char c;
3046- int i;
3047-
3048-#ifdef CONFIG_XEN
3049- /* Nothing to do if not running in dom0. */
3050- if (!is_initial_xendomain())
3051- return;
3052-#endif
3053-
3054- /* video rom */
3055- upper = adapter_rom_resources[0].start;
3056- for (start = video_rom_resource.start; start < upper; start += 2048) {
3057- rom = isa_bus_to_virt(start);
3058- if (!romsignature(rom))
3059- continue;
3060-
3061- video_rom_resource.start = start;
3062-
3063- if (probe_kernel_address(rom + 2, c) != 0)
3064- continue;
3065-
3066- /* 0 < length <= 0x7f * 512, historically */
3067- length = c * 512;
3068-
3069- /* if checksum okay, trust length byte */
3070- if (length && romchecksum(rom, length))
3071- video_rom_resource.end = start + length - 1;
3072-
3073- request_resource(&iomem_resource, &video_rom_resource);
3074- break;
3075- }
3076-
3077- start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
3078- if (start < upper)
3079- start = upper;
3080-
3081- /* system rom */
3082- request_resource(&iomem_resource, &system_rom_resource);
3083- upper = system_rom_resource.start;
3084-
3085- /* check for extension rom (ignore length byte!) */
3086- rom = isa_bus_to_virt((unsigned long)extension_rom_resource.start);
3087- if (romsignature(rom)) {
3088- length = extension_rom_resource.end - extension_rom_resource.start + 1;
3089- if (romchecksum(rom, length)) {
3090- request_resource(&iomem_resource, &extension_rom_resource);
3091- upper = extension_rom_resource.start;
3092- }
3093- }
3094-
3095- /* check for adapter roms on 2k boundaries */
3096- for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) {
3097- rom = isa_bus_to_virt(start);
3098- if (!romsignature(rom))
3099- continue;
3100-
3101- if (probe_kernel_address(rom + 2, c) != 0)
3102- continue;
3103-
3104- /* 0 < length <= 0x7f * 512, historically */
3105- length = c * 512;
3106-
3107- /* but accept any length that fits if checksum okay */
3108- if (!length || start + length > upper || !romchecksum(rom, length))
3109- continue;
3110-
3111- adapter_rom_resources[i].start = start;
3112- adapter_rom_resources[i].end = start + length - 1;
3113- request_resource(&iomem_resource, &adapter_rom_resources[i]);
3114-
3115- start = adapter_rom_resources[i++].end & ~2047UL;
3116- }
3117-}
3118-
3119-#ifdef CONFIG_XEN
3120-static struct e820map machine_e820;
3121-#define e820 machine_e820
3122-#endif
3123-
3124-/*
3125- * Request address space for all standard RAM and ROM resources
3126- * and also for regions reported as reserved by the e820.
3127- */
3128-void __init init_iomem_resources(struct resource *code_resource,
3129- struct resource *data_resource,
3130- struct resource *bss_resource)
3131-{
3132- int i;
3133-
3134- probe_roms();
3135- for (i = 0; i < e820.nr_map; i++) {
3136- struct resource *res;
3137-#ifndef CONFIG_RESOURCES_64BIT
3138- if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL)
3139- continue;
3140-#endif
3141- res = kzalloc(sizeof(struct resource), GFP_ATOMIC);
3142- switch (e820.map[i].type) {
3143- case E820_RAM: res->name = "System RAM"; break;
3144- case E820_ACPI: res->name = "ACPI Tables"; break;
3145- case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
3146- default: res->name = "reserved";
3147- }
3148- res->start = e820.map[i].addr;
3149- res->end = res->start + e820.map[i].size - 1;
3150- res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
3151- if (request_resource(&iomem_resource, res)) {
3152- kfree(res);
3153- continue;
3154- }
3155- if (e820.map[i].type == E820_RAM) {
3156- /*
3157- * We don't know which RAM region contains kernel data,
3158- * so we try it repeatedly and let the resource manager
3159- * test it.
3160- */
3161-#ifndef CONFIG_XEN
3162- request_resource(res, code_resource);
3163- request_resource(res, data_resource);
3164- request_resource(res, bss_resource);
3165-#endif
3166-#ifdef CONFIG_KEXEC
3167- if (crashk_res.start != crashk_res.end)
3168- request_resource(res, &crashk_res);
3169-#ifdef CONFIG_XEN
3170- xen_machine_kexec_register_resources(res);
3171-#endif
3172-#endif
3173- }
3174- }
3175-}
3176-
3177-#undef e820
3178-
3179-#if defined(CONFIG_PM) && defined(CONFIG_HIBERNATION)
3180-/**
3181- * e820_mark_nosave_regions - Find the ranges of physical addresses that do not
3182- * correspond to e820 RAM areas and mark the corresponding pages as nosave for
3183- * hibernation.
3184- *
3185- * This function requires the e820 map to be sorted and without any
3186- * overlapping entries and assumes the first e820 area to be RAM.
3187- */
3188-void __init e820_mark_nosave_regions(void)
3189-{
3190- int i;
3191- unsigned long pfn;
3192-
3193- pfn = PFN_DOWN(e820.map[0].addr + e820.map[0].size);
3194- for (i = 1; i < e820.nr_map; i++) {
3195- struct e820entry *ei = &e820.map[i];
3196-
3197- if (pfn < PFN_UP(ei->addr))
3198- register_nosave_region(pfn, PFN_UP(ei->addr));
3199-
3200- pfn = PFN_DOWN(ei->addr + ei->size);
3201- if (ei->type != E820_RAM)
3202- register_nosave_region(PFN_UP(ei->addr), pfn);
3203-
3204- if (pfn >= max_low_pfn)
3205- break;
3206- }
3207-}
3208-#endif
3209-
3210-void __init add_memory_region(unsigned long long start,
3211- unsigned long long size, int type)
3212-{
3213- int x;
3214-
3215- x = e820.nr_map;
3216-
3217- if (x == E820MAX) {
3218- printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
3219- return;
3220- }
3221-
3222- e820.map[x].addr = start;
3223- e820.map[x].size = size;
3224- e820.map[x].type = type;
3225- e820.nr_map++;
3226-} /* add_memory_region */
3227-
3228-/*
3229- * Sanitize the BIOS e820 map.
3230- *
3231- * Some e820 responses include overlapping entries. The following
3232- * replaces the original e820 map with a new one, removing overlaps.
3233- *
3234- */
3235-int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
3236-{
3237- struct change_member *change_tmp;
3238- unsigned long current_type, last_type;
3239- unsigned long long last_addr;
3240- int chgidx, still_changing;
3241- int overlap_entries;
3242- int new_bios_entry;
3243- int old_nr, new_nr, chg_nr;
3244- int i;
3245-
3246- /*
3247- Visually we're performing the following (1,2,3,4 = memory types)...
3248-
3249- Sample memory map (w/overlaps):
3250- ____22__________________
3251- ______________________4_
3252- ____1111________________
3253- _44_____________________
3254- 11111111________________
3255- ____________________33__
3256- ___________44___________
3257- __________33333_________
3258- ______________22________
3259- ___________________2222_
3260- _________111111111______
3261- _____________________11_
3262- _________________4______
3263-
3264- Sanitized equivalent (no overlap):
3265- 1_______________________
3266- _44_____________________
3267- ___1____________________
3268- ____22__________________
3269- ______11________________
3270- _________1______________
3271- __________3_____________
3272- ___________44___________
3273- _____________33_________
3274- _______________2________
3275- ________________1_______
3276- _________________4______
3277- ___________________2____
3278- ____________________33__
3279- ______________________4_
3280- */
3281- /* if there's only one memory region, don't bother */
3282- if (*pnr_map < 2) {
3283- return -1;
3284- }
3285-
3286- old_nr = *pnr_map;
3287-
3288- /* bail out if we find any unreasonable addresses in bios map */
3289- for (i=0; i<old_nr; i++)
3290- if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) {
3291- return -1;
3292- }
3293-
3294- /* create pointers for initial change-point information (for sorting) */
3295- for (i=0; i < 2*old_nr; i++)
3296- change_point[i] = &change_point_list[i];
3297-
3298- /* record all known change-points (starting and ending addresses),
3299- omitting those that are for empty memory regions */
3300- chgidx = 0;
3301- for (i=0; i < old_nr; i++) {
3302- if (biosmap[i].size != 0) {
3303- change_point[chgidx]->addr = biosmap[i].addr;
3304- change_point[chgidx++]->pbios = &biosmap[i];
3305- change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
3306- change_point[chgidx++]->pbios = &biosmap[i];
3307- }
3308- }
3309- chg_nr = chgidx; /* true number of change-points */
3310-
3311- /* sort change-point list by memory addresses (low -> high) */
3312- still_changing = 1;
3313- while (still_changing) {
3314- still_changing = 0;
3315- for (i=1; i < chg_nr; i++) {
3316- /* if <current_addr> > <last_addr>, swap */
3317- /* or, if current=<start_addr> & last=<end_addr>, swap */
3318- if ((change_point[i]->addr < change_point[i-1]->addr) ||
3319- ((change_point[i]->addr == change_point[i-1]->addr) &&
3320- (change_point[i]->addr == change_point[i]->pbios->addr) &&
3321- (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
3322- )
3323- {
3324- change_tmp = change_point[i];
3325- change_point[i] = change_point[i-1];
3326- change_point[i-1] = change_tmp;
3327- still_changing=1;
3328- }
3329- }
3330- }
3331-
3332- /* create a new bios memory map, removing overlaps */
3333- overlap_entries=0; /* number of entries in the overlap table */
3334- new_bios_entry=0; /* index for creating new bios map entries */
3335- last_type = 0; /* start with undefined memory type */
3336- last_addr = 0; /* start with 0 as last starting address */
3337- /* loop through change-points, determining affect on the new bios map */
3338- for (chgidx=0; chgidx < chg_nr; chgidx++)
3339- {
3340- /* keep track of all overlapping bios entries */
3341- if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
3342- {
3343- /* add map entry to overlap list (> 1 entry implies an overlap) */
3344- overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
3345- }
3346- else
3347- {
3348- /* remove entry from list (order independent, so swap with last) */
3349- for (i=0; i<overlap_entries; i++)
3350- {
3351- if (overlap_list[i] == change_point[chgidx]->pbios)
3352- overlap_list[i] = overlap_list[overlap_entries-1];
3353- }
3354- overlap_entries--;
3355- }
3356- /* if there are overlapping entries, decide which "type" to use */
3357- /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
3358- current_type = 0;
3359- for (i=0; i<overlap_entries; i++)
3360- if (overlap_list[i]->type > current_type)
3361- current_type = overlap_list[i]->type;
3362- /* continue building up new bios map based on this information */
3363- if (current_type != last_type) {
3364- if (last_type != 0) {
3365- new_bios[new_bios_entry].size =
3366- change_point[chgidx]->addr - last_addr;
3367- /* move forward only if the new size was non-zero */
3368- if (new_bios[new_bios_entry].size != 0)
3369- if (++new_bios_entry >= E820MAX)
3370- break; /* no more space left for new bios entries */
3371- }
3372- if (current_type != 0) {
3373- new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
3374- new_bios[new_bios_entry].type = current_type;
3375- last_addr=change_point[chgidx]->addr;
3376- }
3377- last_type = current_type;
3378- }
3379- }
3380- new_nr = new_bios_entry; /* retain count for new bios entries */
3381-
3382- /* copy new bios mapping into original location */
3383- memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
3384- *pnr_map = new_nr;
3385-
3386- return 0;
3387-}
3388-
3389-/*
3390- * Copy the BIOS e820 map into a safe place.
3391- *
3392- * Sanity-check it while we're at it..
3393- *
3394- * If we're lucky and live on a modern system, the setup code
3395- * will have given us a memory map that we can use to properly
3396- * set up memory. If we aren't, we'll fake a memory map.
3397- *
3398- * We check to see that the memory map contains at least 2 elements
3399- * before we'll use it, because the detection code in setup.S may
3400- * not be perfect and most every PC known to man has two memory
3401- * regions: one from 0 to 640k, and one from 1mb up. (The IBM
3402- * thinkpad 560x, for example, does not cooperate with the memory
3403- * detection code.)
3404- */
3405-int __init copy_e820_map(struct e820entry *biosmap, int nr_map)
3406-{
3407-#ifndef CONFIG_XEN
3408- /* Only one memory region (or negative)? Ignore it */
3409- if (nr_map < 2)
3410- return -1;
3411-#else
3412- BUG_ON(nr_map < 1);
3413-#endif
3414-
3415- do {
3416- u64 start = biosmap->addr;
3417- u64 size = biosmap->size;
3418- u64 end = start + size;
3419- u32 type = biosmap->type;
3420-
3421- /* Overflow in 64 bits? Ignore the memory map. */
3422- if (start > end)
3423- return -1;
3424-
3425- add_memory_region(start, size, type);
3426- } while (biosmap++, --nr_map);
3427-
3428-#ifdef CONFIG_XEN
3429- if (is_initial_xendomain()) {
3430- struct xen_memory_map memmap;
3431-
3432- memmap.nr_entries = E820MAX;
3433- set_xen_guest_handle(memmap.buffer, machine_e820.map);
3434-
3435- if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap))
3436- BUG();
3437- machine_e820.nr_map = memmap.nr_entries;
3438- } else
3439- machine_e820 = e820;
3440-#endif
3441-
3442- return 0;
3443-}
3444-
3445-/*
3446- * Find the highest page frame number we have available
3447- */
3448-void __init propagate_e820_map(void)
3449-{
3450- int i;
3451-
3452- max_pfn = 0;
3453-
3454- for (i = 0; i < e820.nr_map; i++) {
3455- unsigned long start, end;
3456- /* RAM? */
3457- if (e820.map[i].type != E820_RAM)
3458- continue;
3459- start = PFN_UP(e820.map[i].addr);
3460- end = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
3461- if (start >= end)
3462- continue;
3463- if (end > max_pfn)
3464- max_pfn = end;
3465- memory_present(0, start, end);
3466- }
3467-}
3468-
3469-/*
3470- * Register fully available low RAM pages with the bootmem allocator.
3471- */
3472-void __init register_bootmem_low_pages(unsigned long max_low_pfn)
3473-{
3474- int i;
3475-
3476- for (i = 0; i < e820.nr_map; i++) {
3477- unsigned long curr_pfn, last_pfn, size;
3478- /*
3479- * Reserve usable low memory
3480- */
3481- if (e820.map[i].type != E820_RAM)
3482- continue;
3483- /*
3484- * We are rounding up the start address of usable memory:
3485- */
3486- curr_pfn = PFN_UP(e820.map[i].addr);
3487- if (curr_pfn >= max_low_pfn)
3488- continue;
3489- /*
3490- * ... and at the end of the usable range downwards:
3491- */
3492- last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
3493-
3494-#ifdef CONFIG_XEN
3495- /*
3496- * Truncate to the number of actual pages currently
3497- * present.
3498- */
3499- if (last_pfn > xen_start_info->nr_pages)
3500- last_pfn = xen_start_info->nr_pages;
3501-#endif
3502-
3503- if (last_pfn > max_low_pfn)
3504- last_pfn = max_low_pfn;
3505-
3506- /*
3507- * .. finally, did all the rounding and playing
3508- * around just make the area go away?
3509- */
3510- if (last_pfn <= curr_pfn)
3511- continue;
3512-
3513- size = last_pfn - curr_pfn;
3514- free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size));
3515- }
3516-}
3517-
3518-void __init e820_register_memory(void)
3519-{
3520- unsigned long gapstart, gapsize, round;
3521- unsigned long long last;
3522- int i;
3523-
3524-#ifdef CONFIG_XEN
3525- if (is_initial_xendomain()) {
3526- struct xen_memory_map memmap;
3527-
3528- memmap.nr_entries = E820MAX;
3529- set_xen_guest_handle(memmap.buffer, machine_e820.map);
3530-
3531- if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap))
3532- BUG();
3533- machine_e820.nr_map = memmap.nr_entries;
3534- }
3535- else
3536- machine_e820 = e820;
3537-#define e820 machine_e820
3538-#endif
3539-
3540- /*
3541- * Search for the biggest gap in the low 32 bits of the e820
3542- * memory space.
3543- */
3544- last = 0x100000000ull;
3545- gapstart = 0x10000000;
3546- gapsize = 0x400000;
3547- i = e820.nr_map;
3548- while (--i >= 0) {
3549- unsigned long long start = e820.map[i].addr;
3550- unsigned long long end = start + e820.map[i].size;
3551-
3552- /*
3553- * Since "last" is at most 4GB, we know we'll
3554- * fit in 32 bits if this condition is true
3555- */
3556- if (last > end) {
3557- unsigned long gap = last - end;
3558-
3559- if (gap > gapsize) {
3560- gapsize = gap;
3561- gapstart = end;
3562- }
3563- }
3564- if (start < last)
3565- last = start;
3566- }
3567-#undef e820
3568-
3569- /*
3570- * See how much we want to round up: start off with
3571- * rounding to the next 1MB area.
3572- */
3573- round = 0x100000;
3574- while ((gapsize >> 4) > round)
3575- round += round;
3576- /* Fun with two's complement */
3577- pci_mem_start = (gapstart + round) & -round;
3578-
3579- printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n",
3580- pci_mem_start, gapstart, gapsize);
3581-}
3582-
3583-void __init print_memory_map(char *who)
3584-{
3585- int i;
3586-
3587- for (i = 0; i < e820.nr_map; i++) {
3588- printk(" %s: %016Lx - %016Lx ", who,
3589- e820.map[i].addr,
3590- e820.map[i].addr + e820.map[i].size);
3591- switch (e820.map[i].type) {
3592- case E820_RAM: printk("(usable)\n");
3593- break;
3594- case E820_RESERVED:
3595- printk("(reserved)\n");
3596- break;
3597- case E820_ACPI:
3598- printk("(ACPI data)\n");
3599- break;
3600- case E820_NVS:
3601- printk("(ACPI NVS)\n");
3602- break;
3603- default: printk("type %u\n", e820.map[i].type);
3604- break;
3605- }
3606- }
3607-}
3608-
3609-void __init limit_regions(unsigned long long size)
3610-{
3611- unsigned long long current_addr = 0;
3612- int i;
3613-
3614- print_memory_map("limit_regions start");
3615- for (i = 0; i < e820.nr_map; i++) {
3616- current_addr = e820.map[i].addr + e820.map[i].size;
3617- if (current_addr < size)
3618- continue;
3619-
3620- if (e820.map[i].type != E820_RAM)
3621- continue;
3622-
3623- if (e820.map[i].addr >= size) {
3624- /*
3625- * This region starts past the end of the
3626- * requested size, skip it completely.
3627- */
3628- e820.nr_map = i;
3629- } else {
3630- e820.nr_map = i + 1;
3631- e820.map[i].size -= current_addr - size;
3632- }
3633- print_memory_map("limit_regions endfor");
3634- return;
3635- }
3636-#ifdef CONFIG_XEN
3637- if (current_addr < size) {
3638- /*
3639- * The e820 map finished before our requested size so
3640- * extend the final entry to the requested address.
3641- */
3642- --i;
3643- if (e820.map[i].type == E820_RAM)
3644- e820.map[i].size -= current_addr - size;
3645- else
3646- add_memory_region(current_addr, size - current_addr, E820_RAM);
3647- }
3648-#endif
3649- print_memory_map("limit_regions endfunc");
3650-}
3651-
3652-/*
3653- * This function checks if any part of the range <start,end> is mapped
3654- * with type.
3655- */
3656-int
3657-e820_any_mapped(u64 start, u64 end, unsigned type)
3658-{
3659- int i;
3660-
3661-#ifndef CONFIG_XEN
3662- for (i = 0; i < e820.nr_map; i++) {
3663- const struct e820entry *ei = &e820.map[i];
3664-#else
3665- if (!is_initial_xendomain())
3666- return 0;
3667- for (i = 0; i < machine_e820.nr_map; ++i) {
3668- const struct e820entry *ei = &machine_e820.map[i];
3669-#endif
3670-
3671- if (type && ei->type != type)
3672- continue;
3673- if (ei->addr >= end || ei->addr + ei->size <= start)
3674- continue;
3675- return 1;
3676- }
3677- return 0;
3678-}
3679-EXPORT_SYMBOL_GPL(e820_any_mapped);
3680-
3681- /*
3682- * This function checks if the entire range <start,end> is mapped with type.
3683- *
3684- * Note: this function only works correct if the e820 table is sorted and
3685- * not-overlapping, which is the case
3686- */
3687-int __init
3688-e820_all_mapped(unsigned long s, unsigned long e, unsigned type)
3689-{
3690- u64 start = s;
3691- u64 end = e;
3692- int i;
3693-
3694-#ifndef CONFIG_XEN
3695- for (i = 0; i < e820.nr_map; i++) {
3696- struct e820entry *ei = &e820.map[i];
3697-#else
3698- if (!is_initial_xendomain())
3699- return 0;
3700- for (i = 0; i < machine_e820.nr_map; ++i) {
3701- const struct e820entry *ei = &machine_e820.map[i];
3702-#endif
3703-
3704- if (type && ei->type != type)
3705- continue;
3706- /* is the region (part) in overlap with the current region ?*/
3707- if (ei->addr >= end || ei->addr + ei->size <= start)
3708- continue;
3709- /* if the region is at the beginning of <start,end> we move
3710- * start to the end of the region since it's ok until there
3711- */
3712- if (ei->addr <= start)
3713- start = ei->addr + ei->size;
3714- /* if start is now at or beyond end, we're done, full
3715- * coverage */
3716- if (start >= end)
3717- return 1; /* we're done */
3718- }
3719- return 0;
3720-}
3721-
3722-static int __init parse_memmap(char *arg)
3723-{
3724- if (!arg)
3725- return -EINVAL;
3726-
3727- if (strcmp(arg, "exactmap") == 0) {
3728-#ifdef CONFIG_CRASH_DUMP
3729- /* If we are doing a crash dump, we
3730- * still need to know the real mem
3731- * size before original memory map is
3732- * reset.
3733- */
3734- propagate_e820_map();
3735- saved_max_pfn = max_pfn;
3736-#endif
3737- e820.nr_map = 0;
3738- user_defined_memmap = 1;
3739- } else {
3740- /* If the user specifies memory size, we
3741- * limit the BIOS-provided memory map to
3742- * that size. exactmap can be used to specify
3743- * the exact map. mem=number can be used to
3744- * trim the existing memory map.
3745- */
3746- unsigned long long start_at, mem_size;
3747-
3748- mem_size = memparse(arg, &arg);
3749- if (*arg == '@') {
3750- start_at = memparse(arg+1, &arg);
3751- add_memory_region(start_at, mem_size, E820_RAM);
3752- } else if (*arg == '#') {
3753- start_at = memparse(arg+1, &arg);
3754- add_memory_region(start_at, mem_size, E820_ACPI);
3755- } else if (*arg == '$') {
3756- start_at = memparse(arg+1, &arg);
3757- add_memory_region(start_at, mem_size, E820_RESERVED);
3758- } else {
3759- limit_regions(mem_size);
3760- user_defined_memmap = 1;
3761- }
3762- }
3763- return 0;
3764-}
3765-early_param("memmap", parse_memmap);
3766-
3767-#ifndef CONFIG_XEN
3768-void __init update_memory_range(u64 start, u64 size, unsigned old_type,
3769- unsigned new_type)
3770-{
3771- int i;
3772-
3773- BUG_ON(old_type == new_type);
3774-
3775- for (i = 0; i < e820.nr_map; i++) {
3776- struct e820entry *ei = &e820.map[i];
3777- u64 final_start, final_end;
3778- if (ei->type != old_type)
3779- continue;
3780- /* totally covered? */
3781- if (ei->addr >= start && ei->size <= size) {
3782- ei->type = new_type;
3783- continue;
3784- }
3785- /* partially covered */
3786- final_start = max(start, ei->addr);
3787- final_end = min(start + size, ei->addr + ei->size);
3788- if (final_start >= final_end)
3789- continue;
3790- add_memory_region(final_start, final_end - final_start,
3791- new_type);
3792- }
3793-}
3794-
3795-void __init update_e820(void)
3796-{
3797- u8 nr_map;
3798-
3799- nr_map = e820.nr_map;
3800- if (sanitize_e820_map(e820.map, &nr_map))
3801- return;
3802- e820.nr_map = nr_map;
3803- printk(KERN_INFO "modified physical RAM map:\n");
3804- print_memory_map("modified");
3805-}
3806-#endif
3807--- sle11-2009-06-04.orig/arch/x86/kernel/e820_64-xen.c 2009-03-16 16:38:05.000000000 +0100
3808+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
3809@@ -1,1045 +0,0 @@
3810-/*
3811- * Handle the memory map.
3812- * The functions here do the job until bootmem takes over.
3813- *
3814- * Getting sanitize_e820_map() in sync with i386 version by applying change:
3815- * - Provisions for empty E820 memory regions (reported by certain BIOSes).
3816- * Alex Achenbach <xela@slit.de>, December 2002.
3817- * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
3818- *
3819- */
3820-#include <linux/kernel.h>
3821-#include <linux/types.h>
3822-#include <linux/init.h>
3823-#include <linux/bootmem.h>
3824-#include <linux/ioport.h>
3825-#include <linux/string.h>
3826-#include <linux/kexec.h>
3827-#include <linux/module.h>
3828-#include <linux/mm.h>
3829-#include <linux/suspend.h>
3830-#include <linux/pfn.h>
3831-
3832-#include <asm/pgtable.h>
3833-#include <asm/page.h>
3834-#include <asm/e820.h>
3835-#include <asm/proto.h>
3836-#include <asm/setup.h>
3837-#include <asm/sections.h>
3838-#include <asm/kdebug.h>
3839-#include <xen/interface/memory.h>
3840-
3841-struct e820map e820 __initdata;
3842-#ifdef CONFIG_XEN
3843-struct e820map machine_e820;
3844-#endif
3845-
3846-/*
3847- * PFN of last memory page.
3848- */
3849-unsigned long end_pfn;
3850-
3851-/*
3852- * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
3853- * The direct mapping extends to max_pfn_mapped, so that we can directly access
3854- * apertures, ACPI and other tables without having to play with fixmaps.
3855- */
3856-unsigned long max_pfn_mapped;
3857-
3858-/*
3859- * Last pfn which the user wants to use.
3860- */
3861-static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT;
3862-
3863-/*
3864- * Early reserved memory areas.
3865- */
3866-#define MAX_EARLY_RES 20
3867-
3868-struct early_res {
3869- unsigned long start, end;
3870- char name[16];
3871-};
3872-static struct early_res early_res[MAX_EARLY_RES] __initdata = {
3873-#ifndef CONFIG_XEN
3874- { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */
3875-#ifdef CONFIG_X86_TRAMPOLINE
3876- { TRAMPOLINE_BASE, TRAMPOLINE_BASE + 2 * PAGE_SIZE, "TRAMPOLINE" },
3877-#endif
3878-#endif
3879- {}
3880-};
3881-
3882-void __init reserve_early(unsigned long start, unsigned long end, char *name)
3883-{
3884- int i;
3885- struct early_res *r;
3886- for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
3887- r = &early_res[i];
3888- if (end > r->start && start < r->end)
3889- panic("Overlapping early reservations %lx-%lx %s to %lx-%lx %s\n",
3890- start, end - 1, name?name:"", r->start, r->end - 1, r->name);
3891- }
3892- if (i >= MAX_EARLY_RES)
3893- panic("Too many early reservations");
3894- r = &early_res[i];
3895- r->start = start;
3896- r->end = end;
3897- if (name)
3898- strncpy(r->name, name, sizeof(r->name) - 1);
3899-}
3900-
3901-void __init free_early(unsigned long start, unsigned long end)
3902-{
3903- struct early_res *r;
3904- int i, j;
3905-
3906- for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
3907- r = &early_res[i];
3908- if (start == r->start && end == r->end)
3909- break;
3910- }
3911- if (i >= MAX_EARLY_RES || !early_res[i].end)
3912- panic("free_early on not reserved area: %lx-%lx!", start, end);
3913-
3914- for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++)
3915- ;
3916-
3917- memmove(&early_res[i], &early_res[i + 1],
3918- (j - 1 - i) * sizeof(struct early_res));
3919-
3920- early_res[j - 1].end = 0;
3921-}
3922-
3923-void __init early_res_to_bootmem(unsigned long start, unsigned long end)
3924-{
3925- int i;
3926- unsigned long final_start, final_end;
3927- for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
3928- struct early_res *r = &early_res[i];
3929- final_start = max(start, r->start);
3930- final_end = min(end, r->end);
3931- if (final_start >= final_end)
3932- continue;
3933- printk(KERN_INFO " early res: %d [%lx-%lx] %s\n", i,
3934- final_start, final_end - 1, r->name);
3935- reserve_bootmem_generic(final_start, final_end - final_start);
3936- }
3937-}
3938-
3939-/* Check for already reserved areas */
3940-static inline int __init
3941-bad_addr(unsigned long *addrp, unsigned long size, unsigned long align)
3942-{
3943- int i;
3944- unsigned long addr = *addrp, last;
3945- int changed = 0;
3946-again:
3947- last = addr + size;
3948- for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
3949- struct early_res *r = &early_res[i];
3950- if (last >= r->start && addr < r->end) {
3951- *addrp = addr = round_up(r->end, align);
3952- changed = 1;
3953- goto again;
3954- }
3955- }
3956- return changed;
3957-}
3958-
3959-/* Check for already reserved areas */
3960-static inline int __init
3961-bad_addr_size(unsigned long *addrp, unsigned long *sizep, unsigned long align)
3962-{
3963- int i;
3964- unsigned long addr = *addrp, last;
3965- unsigned long size = *sizep;
3966- int changed = 0;
3967-again:
3968- last = addr + size;
3969- for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
3970- struct early_res *r = &early_res[i];
3971- if (last > r->start && addr < r->start) {
3972- size = r->start - addr;
3973- changed = 1;
3974- goto again;
3975- }
3976- if (last > r->end && addr < r->end) {
3977- addr = round_up(r->end, align);
3978- size = last - addr;
3979- changed = 1;
3980- goto again;
3981- }
3982- if (last <= r->end && addr >= r->start) {
3983- (*sizep)++;
3984- return 0;
3985- }
3986- }
3987- if (changed) {
3988- *addrp = addr;
3989- *sizep = size;
3990- }
3991- return changed;
3992-}
3993-/*
3994- * This function checks if any part of the range <start,end> is mapped
3995- * with type.
3996- */
3997-int
3998-e820_any_mapped(unsigned long start, unsigned long end, unsigned type)
3999-{
4000- int i;
4001-
4002-#ifndef CONFIG_XEN
4003- for (i = 0; i < e820.nr_map; i++) {
4004- struct e820entry *ei = &e820.map[i];
4005-#else
4006- if (!is_initial_xendomain())
4007- return 0;
4008- for (i = 0; i < machine_e820.nr_map; i++) {
4009- const struct e820entry *ei = &machine_e820.map[i];
4010-#endif
4011-
4012- if (type && ei->type != type)
4013- continue;
4014- if (ei->addr >= end || ei->addr + ei->size <= start)
4015- continue;
4016- return 1;
4017- }
4018- return 0;
4019-}
4020-EXPORT_SYMBOL_GPL(e820_any_mapped);
4021-
4022-/*
4023- * This function checks if the entire range <start,end> is mapped with type.
4024- *
4025- * Note: this function only works correct if the e820 table is sorted and
4026- * not-overlapping, which is the case
4027- */
4028-int __init e820_all_mapped(unsigned long start, unsigned long end,
4029- unsigned type)
4030-{
4031- int i;
4032-
4033-#ifndef CONFIG_XEN
4034- for (i = 0; i < e820.nr_map; i++) {
4035- struct e820entry *ei = &e820.map[i];
4036-#else
4037- if (!is_initial_xendomain())
4038- return 0;
4039- for (i = 0; i < machine_e820.nr_map; i++) {
4040- const struct e820entry *ei = &machine_e820.map[i];
4041-#endif
4042-
4043- if (type && ei->type != type)
4044- continue;
4045- /* is the region (part) in overlap with the current region ?*/
4046- if (ei->addr >= end || ei->addr + ei->size <= start)
4047- continue;
4048-
4049- /* if the region is at the beginning of <start,end> we move
4050- * start to the end of the region since it's ok until there
4051- */
4052- if (ei->addr <= start)
4053- start = ei->addr + ei->size;
4054- /*
4055- * if start is now at or beyond end, we're done, full
4056- * coverage
4057- */
4058- if (start >= end)
4059- return 1;
4060- }
4061- return 0;
4062-}
4063-
4064-/*
4065- * Find a free area with specified alignment in a specific range.
4066- */
4067-unsigned long __init find_e820_area(unsigned long start, unsigned long end,
4068- unsigned long size, unsigned long align)
4069-{
4070- int i;
4071-
4072- for (i = 0; i < e820.nr_map; i++) {
4073- struct e820entry *ei = &e820.map[i];
4074- unsigned long addr, last;
4075- unsigned long ei_last;
4076-
4077- if (ei->type != E820_RAM)
4078- continue;
4079- addr = round_up(ei->addr, align);
4080- ei_last = ei->addr + ei->size;
4081- if (addr < start)
4082- addr = round_up(start, align);
4083- if (addr >= ei_last)
4084- continue;
4085- while (bad_addr(&addr, size, align) && addr+size <= ei_last)
4086- ;
4087- last = addr + size;
4088- if (last > ei_last)
4089- continue;
4090- if (last > end)
4091- continue;
4092- return addr;
4093- }
4094- return -1UL;
4095-}
4096-
4097-/*
4098- * Find next free range after *start
4099- */
4100-unsigned long __init find_e820_area_size(unsigned long start,
4101- unsigned long *sizep,
4102- unsigned long align)
4103-{
4104- int i;
4105-
4106- for (i = 0; i < e820.nr_map; i++) {
4107- struct e820entry *ei = &e820.map[i];
4108- unsigned long addr, last;
4109- unsigned long ei_last;
4110-
4111- if (ei->type != E820_RAM)
4112- continue;
4113- addr = round_up(ei->addr, align);
4114- ei_last = ei->addr + ei->size;
4115- if (addr < start)
4116- addr = round_up(start, align);
4117- if (addr >= ei_last)
4118- continue;
4119- *sizep = ei_last - addr;
4120- while (bad_addr_size(&addr, sizep, align) &&
4121- addr + *sizep <= ei_last)
4122- ;
4123- last = addr + *sizep;
4124- if (last > ei_last)
4125- continue;
4126- return addr;
4127- }
4128- return -1UL;
4129-
4130-}
4131-/*
4132- * Find the highest page frame number we have available
4133- */
4134-unsigned long __init e820_end_of_ram(void)
4135-{
4136- unsigned long end_pfn;
4137-
4138- end_pfn = find_max_pfn_with_active_regions();
4139-
4140- if (end_pfn > max_pfn_mapped)
4141- max_pfn_mapped = end_pfn;
4142- if (max_pfn_mapped > MAXMEM>>PAGE_SHIFT)
4143- max_pfn_mapped = MAXMEM>>PAGE_SHIFT;
4144- if (end_pfn > end_user_pfn)
4145- end_pfn = end_user_pfn;
4146- if (end_pfn > max_pfn_mapped)
4147- end_pfn = max_pfn_mapped;
4148-
4149- printk(KERN_INFO "max_pfn_mapped = %lu\n", max_pfn_mapped);
4150- return end_pfn;
4151-}
4152-
4153-/*
4154- * Mark e820 reserved areas as busy for the resource manager.
4155- */
4156-void __init e820_reserve_resources(struct e820entry *e820, int nr_map)
4157-{
4158- int i;
4159- struct resource *res;
4160-
4161- res = alloc_bootmem_low(sizeof(struct resource) * nr_map);
4162- for (i = 0; i < nr_map; i++) {
4163- switch (e820[i].type) {
4164- case E820_RAM: res->name = "System RAM"; break;
4165- case E820_ACPI: res->name = "ACPI Tables"; break;
4166- case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
4167- default: res->name = "reserved";
4168- }
4169- res->start = e820[i].addr;
4170- res->end = res->start + e820[i].size - 1;
4171- res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
4172- insert_resource(&iomem_resource, res);
4173- res++;
4174- }
4175-}
4176-
4177-#ifndef CONFIG_XEN
4178-/*
4179- * Find the ranges of physical addresses that do not correspond to
4180- * e820 RAM areas and mark the corresponding pages as nosave for software
4181- * suspend and suspend to RAM.
4182- *
4183- * This function requires the e820 map to be sorted and without any
4184- * overlapping entries and assumes the first e820 area to be RAM.
4185- */
4186-void __init e820_mark_nosave_regions(void)
4187-{
4188- int i;
4189- unsigned long paddr;
4190-
4191- paddr = round_down(e820.map[0].addr + e820.map[0].size, PAGE_SIZE);
4192- for (i = 1; i < e820.nr_map; i++) {
4193- struct e820entry *ei = &e820.map[i];
4194-
4195- if (paddr < ei->addr)
4196- register_nosave_region(PFN_DOWN(paddr),
4197- PFN_UP(ei->addr));
4198-
4199- paddr = round_down(ei->addr + ei->size, PAGE_SIZE);
4200- if (ei->type != E820_RAM)
4201- register_nosave_region(PFN_UP(ei->addr),
4202- PFN_DOWN(paddr));
4203-
4204- if (paddr >= (end_pfn << PAGE_SHIFT))
4205- break;
4206- }
4207-}
4208-#endif
4209-
4210-/*
4211- * Finds an active region in the address range from start_pfn to end_pfn and
4212- * returns its range in ei_startpfn and ei_endpfn for the e820 entry.
4213- */
4214-static int __init e820_find_active_region(const struct e820entry *ei,
4215- unsigned long start_pfn,
4216- unsigned long end_pfn,
4217- unsigned long *ei_startpfn,
4218- unsigned long *ei_endpfn)
4219-{
4220- *ei_startpfn = round_up(ei->addr, PAGE_SIZE) >> PAGE_SHIFT;
4221- *ei_endpfn = round_down(ei->addr + ei->size, PAGE_SIZE) >> PAGE_SHIFT;
4222-
4223- /* Skip map entries smaller than a page */
4224- if (*ei_startpfn >= *ei_endpfn)
4225- return 0;
4226-
4227- /* Check if max_pfn_mapped should be updated */
4228- if (ei->type != E820_RAM && *ei_endpfn > max_pfn_mapped)
4229- max_pfn_mapped = *ei_endpfn;
4230-
4231- /* Skip if map is outside the node */
4232- if (ei->type != E820_RAM || *ei_endpfn <= start_pfn ||
4233- *ei_startpfn >= end_pfn)
4234- return 0;
4235-
4236- /* Check for overlaps */
4237- if (*ei_startpfn < start_pfn)
4238- *ei_startpfn = start_pfn;
4239- if (*ei_endpfn > end_pfn)
4240- *ei_endpfn = end_pfn;
4241-
4242- /* Obey end_user_pfn to save on memmap */
4243- if (*ei_startpfn >= end_user_pfn)
4244- return 0;
4245- if (*ei_endpfn > end_user_pfn)
4246- *ei_endpfn = end_user_pfn;
4247-
4248- return 1;
4249-}
4250-
4251-/* Walk the e820 map and register active regions within a node */
4252-void __init
4253-e820_register_active_regions(int nid, unsigned long start_pfn,
4254- unsigned long end_pfn)
4255-{
4256- unsigned long ei_startpfn;
4257- unsigned long ei_endpfn;
4258- int i;
4259-
4260- for (i = 0; i < e820.nr_map; i++)
4261- if (e820_find_active_region(&e820.map[i],
4262- start_pfn, end_pfn,
4263- &ei_startpfn, &ei_endpfn))
4264- add_active_range(nid, ei_startpfn, ei_endpfn);
4265-}
4266-
4267-/*
4268- * Add a memory region to the kernel e820 map.
4269- */
4270-void __init add_memory_region(unsigned long start, unsigned long size, int type)
4271-{
4272- int x = e820.nr_map;
4273-
4274- if (x == E820MAX) {
4275- printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
4276- return;
4277- }
4278-
4279- e820.map[x].addr = start;
4280- e820.map[x].size = size;
4281- e820.map[x].type = type;
4282- e820.nr_map++;
4283-}
4284-
4285-/*
4286- * Find the hole size (in bytes) in the memory range.
4287- * @start: starting address of the memory range to scan
4288- * @end: ending address of the memory range to scan
4289- */
4290-unsigned long __init e820_hole_size(unsigned long start, unsigned long end)
4291-{
4292- unsigned long start_pfn = start >> PAGE_SHIFT;
4293- unsigned long end_pfn = end >> PAGE_SHIFT;
4294- unsigned long ei_startpfn, ei_endpfn, ram = 0;
4295- int i;
4296-
4297- for (i = 0; i < e820.nr_map; i++) {
4298- if (e820_find_active_region(&e820.map[i],
4299- start_pfn, end_pfn,
4300- &ei_startpfn, &ei_endpfn))
4301- ram += ei_endpfn - ei_startpfn;
4302- }
4303- return end - start - (ram << PAGE_SHIFT);
4304-}
4305-
4306-static void __init e820_print_map(char *who)
4307-{
4308- int i;
4309-
4310- for (i = 0; i < e820.nr_map; i++) {
4311- printk(KERN_INFO " %s: %016Lx - %016Lx ", who,
4312- (unsigned long long) e820.map[i].addr,
4313- (unsigned long long)
4314- (e820.map[i].addr + e820.map[i].size));
4315- switch (e820.map[i].type) {
4316- case E820_RAM:
4317- printk(KERN_CONT "(usable)\n");
4318- break;
4319- case E820_RESERVED:
4320- printk(KERN_CONT "(reserved)\n");
4321- break;
4322- case E820_ACPI:
4323- printk(KERN_CONT "(ACPI data)\n");
4324- break;
4325- case E820_NVS:
4326- printk(KERN_CONT "(ACPI NVS)\n");
4327- break;
4328- default:
4329- printk(KERN_CONT "type %u\n", e820.map[i].type);
4330- break;
4331- }
4332- }
4333-}
4334-
4335-/*
4336- * Sanitize the BIOS e820 map.
4337- *
4338- * Some e820 responses include overlapping entries. The following
4339- * replaces the original e820 map with a new one, removing overlaps.
4340- *
4341- */
4342-static int __init sanitize_e820_map(struct e820entry *biosmap, char *pnr_map)
4343-{
4344- struct change_member {
4345- struct e820entry *pbios; /* pointer to original bios entry */
4346- unsigned long long addr; /* address for this change point */
4347- };
4348- static struct change_member change_point_list[2*E820MAX] __initdata;
4349- static struct change_member *change_point[2*E820MAX] __initdata;
4350- static struct e820entry *overlap_list[E820MAX] __initdata;
4351- static struct e820entry new_bios[E820MAX] __initdata;
4352- struct change_member *change_tmp;
4353- unsigned long current_type, last_type;
4354- unsigned long long last_addr;
4355- int chgidx, still_changing;
4356- int overlap_entries;
4357- int new_bios_entry;
4358- int old_nr, new_nr, chg_nr;
4359- int i;
4360-
4361- /*
4362- Visually we're performing the following
4363- (1,2,3,4 = memory types)...
4364-
4365- Sample memory map (w/overlaps):
4366- ____22__________________
4367- ______________________4_
4368- ____1111________________
4369- _44_____________________
4370- 11111111________________
4371- ____________________33__
4372- ___________44___________
4373- __________33333_________
4374- ______________22________
4375- ___________________2222_
4376- _________111111111______
4377- _____________________11_
4378- _________________4______
4379-
4380- Sanitized equivalent (no overlap):
4381- 1_______________________
4382- _44_____________________
4383- ___1____________________
4384- ____22__________________
4385- ______11________________
4386- _________1______________
4387- __________3_____________
4388- ___________44___________
4389- _____________33_________
4390- _______________2________
4391- ________________1_______
4392- _________________4______
4393- ___________________2____
4394- ____________________33__
4395- ______________________4_
4396- */
4397-
4398- /* if there's only one memory region, don't bother */
4399- if (*pnr_map < 2)
4400- return -1;
4401-
4402- old_nr = *pnr_map;
4403-
4404- /* bail out if we find any unreasonable addresses in bios map */
4405- for (i = 0; i < old_nr; i++)
4406- if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
4407- return -1;
4408-
4409- /* create pointers for initial change-point information (for sorting) */
4410- for (i = 0; i < 2 * old_nr; i++)
4411- change_point[i] = &change_point_list[i];
4412-
4413- /* record all known change-points (starting and ending addresses),
4414- omitting those that are for empty memory regions */
4415- chgidx = 0;
4416- for (i = 0; i < old_nr; i++) {
4417- if (biosmap[i].size != 0) {
4418- change_point[chgidx]->addr = biosmap[i].addr;
4419- change_point[chgidx++]->pbios = &biosmap[i];
4420- change_point[chgidx]->addr = biosmap[i].addr +
4421- biosmap[i].size;
4422- change_point[chgidx++]->pbios = &biosmap[i];
4423- }
4424- }
4425- chg_nr = chgidx;
4426-
4427- /* sort change-point list by memory addresses (low -> high) */
4428- still_changing = 1;
4429- while (still_changing) {
4430- still_changing = 0;
4431- for (i = 1; i < chg_nr; i++) {
4432- unsigned long long curaddr, lastaddr;
4433- unsigned long long curpbaddr, lastpbaddr;
4434-
4435- curaddr = change_point[i]->addr;
4436- lastaddr = change_point[i - 1]->addr;
4437- curpbaddr = change_point[i]->pbios->addr;
4438- lastpbaddr = change_point[i - 1]->pbios->addr;
4439-
4440- /*
4441- * swap entries, when:
4442- *
4443- * curaddr > lastaddr or
4444- * curaddr == lastaddr and curaddr == curpbaddr and
4445- * lastaddr != lastpbaddr
4446- */
4447- if (curaddr < lastaddr ||
4448- (curaddr == lastaddr && curaddr == curpbaddr &&
4449- lastaddr != lastpbaddr)) {
4450- change_tmp = change_point[i];
4451- change_point[i] = change_point[i-1];
4452- change_point[i-1] = change_tmp;
4453- still_changing = 1;
4454- }
4455- }
4456- }
4457-
4458- /* create a new bios memory map, removing overlaps */
4459- overlap_entries = 0; /* number of entries in the overlap table */
4460- new_bios_entry = 0; /* index for creating new bios map entries */
4461- last_type = 0; /* start with undefined memory type */
4462- last_addr = 0; /* start with 0 as last starting address */
4463-
4464- /* loop through change-points, determining affect on the new bios map */
4465- for (chgidx = 0; chgidx < chg_nr; chgidx++) {
4466- /* keep track of all overlapping bios entries */
4467- if (change_point[chgidx]->addr ==
4468- change_point[chgidx]->pbios->addr) {
4469- /*
4470- * add map entry to overlap list (> 1 entry
4471- * implies an overlap)
4472- */
4473- overlap_list[overlap_entries++] =
4474- change_point[chgidx]->pbios;
4475- } else {
4476- /*
4477- * remove entry from list (order independent,
4478- * so swap with last)
4479- */
4480- for (i = 0; i < overlap_entries; i++) {
4481- if (overlap_list[i] ==
4482- change_point[chgidx]->pbios)
4483- overlap_list[i] =
4484- overlap_list[overlap_entries-1];
4485- }
4486- overlap_entries--;
4487- }
4488- /*
4489- * if there are overlapping entries, decide which
4490- * "type" to use (larger value takes precedence --
4491- * 1=usable, 2,3,4,4+=unusable)
4492- */
4493- current_type = 0;
4494- for (i = 0; i < overlap_entries; i++)
4495- if (overlap_list[i]->type > current_type)
4496- current_type = overlap_list[i]->type;
4497- /*
4498- * continue building up new bios map based on this
4499- * information
4500- */
4501- if (current_type != last_type) {
4502- if (last_type != 0) {
4503- new_bios[new_bios_entry].size =
4504- change_point[chgidx]->addr - last_addr;
4505- /*
4506- * move forward only if the new size
4507- * was non-zero
4508- */
4509- if (new_bios[new_bios_entry].size != 0)
4510- /*
4511- * no more space left for new
4512- * bios entries ?
4513- */
4514- if (++new_bios_entry >= E820MAX)
4515- break;
4516- }
4517- if (current_type != 0) {
4518- new_bios[new_bios_entry].addr =
4519- change_point[chgidx]->addr;
4520- new_bios[new_bios_entry].type = current_type;
4521- last_addr = change_point[chgidx]->addr;
4522- }
4523- last_type = current_type;
4524- }
4525- }
4526- /* retain count for new bios entries */
4527- new_nr = new_bios_entry;
4528-
4529- /* copy new bios mapping into original location */
4530- memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry));
4531- *pnr_map = new_nr;
4532-
4533- return 0;
4534-}
4535-
4536-/*
4537- * Copy the BIOS e820 map into a safe place.
4538- *
4539- * Sanity-check it while we're at it..
4540- *
4541- * If we're lucky and live on a modern system, the setup code
4542- * will have given us a memory map that we can use to properly
4543- * set up memory. If we aren't, we'll fake a memory map.
4544- */
4545-static int __init copy_e820_map(struct e820entry *biosmap, int nr_map)
4546-{
4547-#ifndef CONFIG_XEN
4548- /* Only one memory region (or negative)? Ignore it */
4549- if (nr_map < 2)
4550- return -1;
4551-#else
4552- BUG_ON(nr_map < 1);
4553-#endif
4554-
4555- do {
4556- u64 start = biosmap->addr;
4557- u64 size = biosmap->size;
4558- u64 end = start + size;
4559- u32 type = biosmap->type;
4560-
4561- /* Overflow in 64 bits? Ignore the memory map. */
4562- if (start > end)
4563- return -1;
4564-
4565- add_memory_region(start, size, type);
4566- } while (biosmap++, --nr_map);
4567-
4568-#ifdef CONFIG_XEN
4569- if (is_initial_xendomain()) {
4570- struct xen_memory_map memmap;
4571-
4572- memmap.nr_entries = E820MAX;
4573- set_xen_guest_handle(memmap.buffer, machine_e820.map);
4574-
4575- if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap))
4576- BUG();
4577- machine_e820.nr_map = memmap.nr_entries;
4578- } else
4579- machine_e820 = e820;
4580-#endif
4581-
4582- return 0;
4583-}
4584-
4585-static void early_panic(char *msg)
4586-{
4587- early_printk(msg);
4588- panic(msg);
4589-}
4590-
4591-/* We're not void only for x86 32-bit compat */
4592-char * __init machine_specific_memory_setup(void)
4593-{
4594-#ifndef CONFIG_XEN
4595- char *who = "BIOS-e820";
4596- /*
4597- * Try to copy the BIOS-supplied E820-map.
4598- *
4599- * Otherwise fake a memory map; one section from 0k->640k,
4600- * the next section from 1mb->appropriate_mem_k
4601- */
4602- sanitize_e820_map(boot_params.e820_map, &boot_params.e820_entries);
4603- if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0)
4604- early_panic("Cannot find a valid memory map");
4605-#else /* CONFIG_XEN */
4606- char *who = "Xen";
4607- int rc;
4608- struct xen_memory_map memmap;
4609- /*
4610- * This is rather large for a stack variable but this early in
4611- * the boot process we know we have plenty slack space.
4612- */
4613- struct e820entry map[E820MAX];
4614-
4615- memmap.nr_entries = E820MAX;
4616- set_xen_guest_handle(memmap.buffer, map);
4617-
4618- rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
4619- if ( rc == -ENOSYS ) {
4620- memmap.nr_entries = 1;
4621- map[0].addr = 0ULL;
4622- map[0].size = xen_start_info->nr_pages << PAGE_SHIFT;
4623- /* 8MB slack (to balance backend allocations). */
4624- map[0].size += 8 << 20;
4625- map[0].type = E820_RAM;
4626- rc = 0;
4627- }
4628- BUG_ON(rc);
4629-
4630- sanitize_e820_map(map, (char *)&memmap.nr_entries);
4631-
4632- if (copy_e820_map(map, (char)memmap.nr_entries) < 0)
4633- early_panic("Cannot find a valid memory map");
4634-#endif
4635- printk(KERN_INFO "BIOS-provided physical RAM map:\n");
4636- e820_print_map(who);
4637-
4638- /* In case someone cares... */
4639- return who;
4640-}
4641-
4642-static int __init parse_memopt(char *p)
4643-{
4644- int i;
4645- unsigned long current_end;
4646- unsigned long end;
4647-
4648- if (!p)
4649- return -EINVAL;
4650- end_user_pfn = memparse(p, &p);
4651- end_user_pfn >>= PAGE_SHIFT;
4652-
4653- end = end_user_pfn<<PAGE_SHIFT;
4654- i = e820.nr_map-1;
4655- current_end = e820.map[i].addr + e820.map[i].size;
4656-
4657- if (current_end < end) {
4658- /*
4659- * The e820 map ends before our requested size so
4660- * extend the final entry to the requested address.
4661- */
4662- if (e820.map[i].type == E820_RAM)
4663- e820.map[i].size = end - e820.map[i].addr;
4664- else
4665- add_memory_region(current_end, end - current_end, E820_RAM);
4666- }
4667-
4668- return 0;
4669-}
4670-early_param("mem", parse_memopt);
4671-
4672-static int userdef __initdata;
4673-
4674-static int __init parse_memmap_opt(char *p)
4675-{
4676- char *oldp;
4677- unsigned long long start_at, mem_size;
4678-
4679- if (!strcmp(p, "exactmap")) {
4680-#ifdef CONFIG_CRASH_DUMP
4681- /*
4682- * If we are doing a crash dump, we still need to know
4683- * the real mem size before original memory map is
4684- * reset.
4685- */
4686- e820_register_active_regions(0, 0, -1UL);
4687- saved_max_pfn = e820_end_of_ram();
4688- remove_all_active_ranges();
4689-#endif
4690- max_pfn_mapped = 0;
4691- e820.nr_map = 0;
4692- userdef = 1;
4693- return 0;
4694- }
4695-
4696- oldp = p;
4697- mem_size = memparse(p, &p);
4698- if (p == oldp)
4699- return -EINVAL;
4700-
4701- userdef = 1;
4702- if (*p == '@') {
4703- start_at = memparse(p+1, &p);
4704- add_memory_region(start_at, mem_size, E820_RAM);
4705- } else if (*p == '#') {
4706- start_at = memparse(p+1, &p);
4707- add_memory_region(start_at, mem_size, E820_ACPI);
4708- } else if (*p == '$') {
4709- start_at = memparse(p+1, &p);
4710- add_memory_region(start_at, mem_size, E820_RESERVED);
4711- } else {
4712- end_user_pfn = (mem_size >> PAGE_SHIFT);
4713- }
4714- return *p == '\0' ? 0 : -EINVAL;
4715-}
4716-early_param("memmap", parse_memmap_opt);
4717-
4718-void __init finish_e820_parsing(void)
4719-{
4720- if (userdef) {
4721- char nr = e820.nr_map;
4722-
4723- if (sanitize_e820_map(e820.map, &nr) < 0)
4724- early_panic("Invalid user supplied memory map");
4725- e820.nr_map = nr;
4726-
4727- printk(KERN_INFO "user-defined physical RAM map:\n");
4728- e820_print_map("user");
4729- }
4730-}
4731-
4732-#ifndef CONFIG_XEN
4733-void __init update_memory_range(u64 start, u64 size, unsigned old_type,
4734- unsigned new_type)
4735-{
4736- int i;
4737-
4738- BUG_ON(old_type == new_type);
4739-
4740- for (i = 0; i < e820.nr_map; i++) {
4741- struct e820entry *ei = &e820.map[i];
4742- u64 final_start, final_end;
4743- if (ei->type != old_type)
4744- continue;
4745- /* totally covered? */
4746- if (ei->addr >= start && ei->size <= size) {
4747- ei->type = new_type;
4748- continue;
4749- }
4750- /* partially covered */
4751- final_start = max(start, ei->addr);
4752- final_end = min(start + size, ei->addr + ei->size);
4753- if (final_start >= final_end)
4754- continue;
4755- add_memory_region(final_start, final_end - final_start,
4756- new_type);
4757- }
4758-}
4759-
4760-void __init update_e820(void)
4761-{
4762- u8 nr_map;
4763-
4764- nr_map = e820.nr_map;
4765- if (sanitize_e820_map(e820.map, &nr_map))
4766- return;
4767- e820.nr_map = nr_map;
4768- printk(KERN_INFO "modified physical RAM map:\n");
4769- e820_print_map("modified");
4770-}
4771-#endif
4772-
4773-unsigned long pci_mem_start = 0xaeedbabe;
4774-EXPORT_SYMBOL(pci_mem_start);
4775-
4776-/*
4777- * Search for the biggest gap in the low 32 bits of the e820
4778- * memory space. We pass this space to PCI to assign MMIO resources
4779- * for hotplug or unconfigured devices in.
4780- * Hopefully the BIOS let enough space left.
4781- */
4782-__init void e820_setup_gap(struct e820entry *e820, int nr_map)
4783-{
4784- unsigned long gapstart, gapsize, round;
4785- unsigned long last;
4786- int i;
4787- int found = 0;
4788-
4789- last = 0x100000000ull;
4790- gapstart = 0x10000000;
4791- gapsize = 0x400000;
4792- i = nr_map;
4793- while (--i >= 0) {
4794- unsigned long long start = e820[i].addr;
4795- unsigned long long end = start + e820[i].size;
4796-
4797- /*
4798- * Since "last" is at most 4GB, we know we'll
4799- * fit in 32 bits if this condition is true
4800- */
4801- if (last > end) {
4802- unsigned long gap = last - end;
4803-
4804- if (gap > gapsize) {
4805- gapsize = gap;
4806- gapstart = end;
4807- found = 1;
4808- }
4809- }
4810- if (start < last)
4811- last = start;
4812- }
4813-
4814- if (!found) {
4815- gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024;
4816- printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit "
4817- "address range\n"
4818- KERN_ERR "PCI: Unassigned devices with 32bit resource "
4819- "registers may break!\n");
4820- }
4821-
4822- /*
4823- * See how much we want to round up: start off with
4824- * rounding to the next 1MB area.
4825- */
4826- round = 0x100000;
4827- while ((gapsize >> 4) > round)
4828- round += round;
4829- /* Fun with two's complement */
4830- pci_mem_start = (gapstart + round) & -round;
4831-
4832- printk(KERN_INFO
4833- "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
4834- pci_mem_start, gapstart, gapsize);
4835-}
4836-
4837-int __init arch_get_ram_range(int slot, u64 *addr, u64 *size)
4838-{
4839- int i;
4840-
4841- if (slot < 0 || slot >= e820.nr_map)
4842- return -1;
4843- for (i = slot; i < e820.nr_map; i++) {
4844- if (e820.map[i].type != E820_RAM)
4845- continue;
4846- break;
4847- }
4848- if (i == e820.nr_map || e820.map[i].addr > (max_pfn << PAGE_SHIFT))
4849- return -1;
4850- *addr = e820.map[i].addr;
4851- *size = min_t(u64, e820.map[i].size + e820.map[i].addr,
4852- max_pfn << PAGE_SHIFT) - *addr;
4853- return i + 1;
4854-}
4855--- sle11-2009-06-04.orig/arch/x86/kernel/early_printk-xen.c 2009-03-16 16:38:05.000000000 +0100
4856+++ sle11-2009-06-04/arch/x86/kernel/early_printk-xen.c 2009-06-04 10:21:39.000000000 +0200
4857@@ -225,7 +225,7 @@ static struct console simnow_console = {
4858 static struct console *early_console = &early_vga_console;
4859 static int early_console_initialized;
4860
4861-void early_printk(const char *fmt, ...)
4862+asmlinkage void early_printk(const char *fmt, ...)
4863 {
4864 char buf[512];
4865 int n;
4866--- sle11-2009-06-04.orig/arch/x86/kernel/entry_32-xen.S 2009-03-16 16:38:05.000000000 +0100
4867+++ sle11-2009-06-04/arch/x86/kernel/entry_32-xen.S 2009-06-04 10:21:39.000000000 +0200
4868@@ -51,15 +51,26 @@
4869 #include <asm/percpu.h>
4870 #include <asm/dwarf2.h>
4871 #include <asm/processor-flags.h>
4872-#include "irq_vectors.h"
4873+#include <asm/ftrace.h>
4874+#include <asm/irq_vectors.h>
4875 #include <xen/interface/xen.h>
4876
4877+/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
4878+#include <linux/elf-em.h>
4879+#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE)
4880+#define __AUDIT_ARCH_LE 0x40000000
4881+
4882+#ifndef CONFIG_AUDITSYSCALL
4883+#define sysenter_audit syscall_trace_entry
4884+#define sysexit_audit syscall_exit_work
4885+#endif
4886+
4887 /*
4888 * We use macros for low-level operations which need to be overridden
4889 * for paravirtualization. The following will never clobber any registers:
4890 * INTERRUPT_RETURN (aka. "iret")
4891 * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
4892- * ENABLE_INTERRUPTS_SYSCALL_RET (aka "sti; sysexit").
4893+ * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
4894 *
4895 * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
4896 * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
4897@@ -277,11 +288,6 @@ END(resume_kernel)
4898 #endif
4899 CFI_ENDPROC
4900
4901- .macro test_tif ti_reg # system call tracing in operation / emulation
4902- /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
4903- testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(\ti_reg)
4904- .endm
4905-
4906 /* SYSENTER_RETURN points to after the "sysenter" instruction in
4907 the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */
4908
4909@@ -338,8 +344,9 @@ sysenter_past_esp:
4910 .previous
4911
4912 GET_THREAD_INFO(%ebp)
4913- test_tif %ebp
4914- jnz syscall_trace_entry
4915+ testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
4916+ jnz sysenter_audit
4917+sysenter_do_call:
4918 cmpl $(nr_syscalls), %eax
4919 jae syscall_badsys
4920 call *sys_call_table(,%eax,4)
4921@@ -349,14 +356,54 @@ sysenter_past_esp:
4922 TRACE_IRQS_OFF
4923 movl TI_flags(%ebp), %ecx
4924 testw $_TIF_ALLWORK_MASK, %cx
4925- jne syscall_exit_work
4926+ jne sysexit_audit
4927+sysenter_exit:
4928 /* if something modifies registers it must also disable sysexit */
4929 movl PT_EIP(%esp), %edx
4930 movl PT_OLDESP(%esp), %ecx
4931 xorl %ebp,%ebp
4932 TRACE_IRQS_ON
4933 1: mov PT_FS(%esp), %fs
4934- ENABLE_INTERRUPTS_SYSCALL_RET
4935+ ENABLE_INTERRUPTS_SYSEXIT
4936+
4937+#ifdef CONFIG_AUDITSYSCALL
4938+sysenter_audit:
4939+ testw $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
4940+ jnz syscall_trace_entry
4941+ addl $4,%esp
4942+ CFI_ADJUST_CFA_OFFSET -4
4943+ /* %esi already in 8(%esp) 6th arg: 4th syscall arg */
4944+ /* %edx already in 4(%esp) 5th arg: 3rd syscall arg */
4945+ /* %ecx already in 0(%esp) 4th arg: 2nd syscall arg */
4946+ movl %ebx,%ecx /* 3rd arg: 1st syscall arg */
4947+ movl %eax,%edx /* 2nd arg: syscall number */
4948+ movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */
4949+ call audit_syscall_entry
4950+ pushl %ebx
4951+ CFI_ADJUST_CFA_OFFSET 4
4952+ movl PT_EAX(%esp),%eax /* reload syscall number */
4953+ jmp sysenter_do_call
4954+
4955+sysexit_audit:
4956+ testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx
4957+ jne syscall_exit_work
4958+ TRACE_IRQS_ON
4959+ ENABLE_INTERRUPTS(CLBR_ANY)
4960+ movl %eax,%edx /* second arg, syscall return value */
4961+ cmpl $0,%eax /* is it < 0? */
4962+ setl %al /* 1 if so, 0 if not */
4963+ movzbl %al,%eax /* zero-extend that */
4964+ inc %eax /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
4965+ call audit_syscall_exit
4966+ DISABLE_INTERRUPTS(CLBR_ANY)
4967+ TRACE_IRQS_OFF
4968+ movl TI_flags(%ebp), %ecx
4969+ testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx
4970+ jne syscall_exit_work
4971+ movl PT_EAX(%esp),%eax /* reload syscall return value */
4972+ jmp sysenter_exit
4973+#endif
4974+
4975 CFI_ENDPROC
4976 .pushsection .fixup,"ax"
4977 2: movl $0,PT_FS(%esp)
4978@@ -400,7 +447,7 @@ ENTRY(system_call)
4979 CFI_ADJUST_CFA_OFFSET 4
4980 SAVE_ALL
4981 GET_THREAD_INFO(%ebp)
4982- test_tif %ebp
4983+ testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
4984 jnz syscall_trace_entry
4985 cmpl $(nr_syscalls), %eax
4986 jae syscall_badsys
4987@@ -413,10 +460,6 @@ syscall_exit:
4988 # setting need_resched or sigpending
4989 # between sampling and the iret
4990 TRACE_IRQS_OFF
4991- testl $X86_EFLAGS_TF,PT_EFLAGS(%esp) # If tracing set singlestep flag on exit
4992- jz no_singlestep
4993- orl $_TIF_SINGLESTEP,TI_flags(%ebp)
4994-no_singlestep:
4995 movl TI_flags(%ebp), %ecx
4996 testw $_TIF_ALLWORK_MASK, %cx # current->work
4997 jne syscall_exit_work
4998@@ -588,12 +631,8 @@ END(work_pending)
4999 syscall_trace_entry:
5000 movl $-ENOSYS,PT_EAX(%esp)
5001 movl %esp, %eax
5002- xorl %edx,%edx
5003- call do_syscall_trace
5004- cmpl $0, %eax
5005- jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU,
5006- # so must skip actual syscall
5007- movl PT_ORIG_EAX(%esp), %eax
5008+ call syscall_trace_enter
5009+ /* What it returned is what we'll actually use. */
5010 cmpl $(nr_syscalls), %eax
5011 jnae syscall_call
5012 jmp syscall_exit
5013@@ -602,14 +641,13 @@ END(syscall_trace_entry)
5014 # perform syscall exit tracing
5015 ALIGN
5016 syscall_exit_work:
5017- testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl
5018+ testb $_TIF_WORK_SYSCALL_EXIT, %cl
5019 jz work_pending
5020 TRACE_IRQS_ON
5021- ENABLE_INTERRUPTS(CLBR_ANY) # could let do_syscall_trace() call
5022+ ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call
5023 # schedule() instead
5024 movl %esp, %eax
5025- movl $1, %edx
5026- call do_syscall_trace
5027+ call syscall_trace_leave
5028 jmp resume_userspace
5029 END(syscall_exit_work)
5030 CFI_ENDPROC
5031@@ -1113,10 +1151,10 @@ ENTRY(native_iret)
5032 .previous
5033 END(native_iret)
5034
5035-ENTRY(native_irq_enable_syscall_ret)
5036+ENTRY(native_irq_enable_sysexit)
5037 sti
5038 sysexit
5039-END(native_irq_enable_syscall_ret)
5040+END(native_irq_enable_sysexit)
5041 #endif
5042
5043 KPROBE_ENTRY(int3)
5044@@ -1265,6 +1303,77 @@ ENTRY(kernel_thread_helper)
5045 CFI_ENDPROC
5046 ENDPROC(kernel_thread_helper)
5047
5048+#ifdef CONFIG_FTRACE
5049+#ifdef CONFIG_DYNAMIC_FTRACE
5050+
5051+ENTRY(mcount)
5052+ pushl %eax
5053+ pushl %ecx
5054+ pushl %edx
5055+ movl 0xc(%esp), %eax
5056+ subl $MCOUNT_INSN_SIZE, %eax
5057+
5058+.globl mcount_call
5059+mcount_call:
5060+ call ftrace_stub
5061+
5062+ popl %edx
5063+ popl %ecx
5064+ popl %eax
5065+
5066+ ret
5067+END(mcount)
5068+
5069+ENTRY(ftrace_caller)
5070+ pushl %eax
5071+ pushl %ecx
5072+ pushl %edx
5073+ movl 0xc(%esp), %eax
5074+ movl 0x4(%ebp), %edx
5075+ subl $MCOUNT_INSN_SIZE, %eax
5076+
5077+.globl ftrace_call
5078+ftrace_call:
5079+ call ftrace_stub
5080+
5081+ popl %edx
5082+ popl %ecx
5083+ popl %eax
5084+
5085+.globl ftrace_stub
5086+ftrace_stub:
5087+ ret
5088+END(ftrace_caller)
5089+
5090+#else /* ! CONFIG_DYNAMIC_FTRACE */
5091+
5092+ENTRY(mcount)
5093+ cmpl $ftrace_stub, ftrace_trace_function
5094+ jnz trace
5095+.globl ftrace_stub
5096+ftrace_stub:
5097+ ret
5098+
5099+ /* taken from glibc */
5100+trace:
5101+ pushl %eax
5102+ pushl %ecx
5103+ pushl %edx
5104+ movl 0xc(%esp), %eax
5105+ movl 0x4(%ebp), %edx
5106+ subl $MCOUNT_INSN_SIZE, %eax
5107+
5108+ call *ftrace_trace_function
5109+
5110+ popl %edx
5111+ popl %ecx
5112+ popl %eax
5113+
5114+ jmp ftrace_stub
5115+END(mcount)
5116+#endif /* CONFIG_DYNAMIC_FTRACE */
5117+#endif /* CONFIG_FTRACE */
5118+
5119 #include <asm/alternative-asm.h>
5120
5121 # pv syscall call handler stub
5122@@ -1290,7 +1399,7 @@ ENTRY(ia32pv_cstar_target)
5123 .previous
5124 SAVE_ALL
5125 GET_THREAD_INFO(%ebp)
5126- test_tif %ebp
5127+ testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
5128 jnz cstar_trace_entry
5129 cmpl $nr_syscalls,%eax
5130 jae cstar_badsys
5131@@ -1324,29 +1433,21 @@ cstar_trace_entry:
5132 btl %eax,cstar_special
5133 jc .Lcstar_trace_special
5134 1: movl %esp,%eax
5135- xorl %edx,%edx
5136 LOCK_PREFIX
5137 orl $_TIF_CSTAR,TI_flags(%ebp)
5138- call do_syscall_trace
5139+ call syscall_trace_enter
5140 LOCK_PREFIX
5141 andl $~_TIF_CSTAR,TI_flags(%ebp)
5142- testl %eax,%eax
5143- jne .Lcstar_resume # ret != 0 -> running under PTRACE_SYSEMU,
5144- # so must skip actual syscall
5145- movl PT_ORIG_EAX(%esp),%eax
5146+ /* What it returned is what we'll actually use. */
5147 cmpl $nr_syscalls,%eax
5148 jb .Lcstar_call
5149 jmp .Lcstar_exit
5150 .Lcstar_trace_special:
5151 movl PT_ECX(%esp),%ecx
5152 movl %esp,%eax
5153- xorl %edx,%edx
5154 movl %ecx,PT_EBP(%esp) # put user EBP back in place
5155- call do_syscall_trace
5156- testl %eax,%eax
5157- jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU,
5158- # so must skip actual syscall
5159- movl PT_ORIG_EAX(%esp),%eax
5160+ call syscall_trace_enter
5161+ /* What it returned is what we'll actually use. */
5162 cmpl $nr_syscalls,%eax
5163 jb syscall_call
5164 jmp syscall_exit
5165--- sle11-2009-06-04.orig/arch/x86/kernel/entry_64.S 2009-06-04 00:00:00.000000000 +0200
5166+++ sle11-2009-06-04/arch/x86/kernel/entry_64.S 2009-06-04 10:21:39.000000000 +0200
5167@@ -1409,7 +1409,7 @@ ENTRY(arch_unwind_init_running)
5168 ENDPROC(arch_unwind_init_running)
5169 #endif
5170
5171-#ifdef CONFIG_XEN
5172+#ifdef CONFIG_PARAVIRT_XEN
5173 ENTRY(xen_hypervisor_callback)
5174 zeroentry xen_do_hypervisor_callback
5175 END(xen_hypervisor_callback)
5176@@ -1507,7 +1507,7 @@ ENTRY(xen_failsafe_callback)
5177 CFI_ENDPROC
5178 END(xen_failsafe_callback)
5179
5180-#endif /* CONFIG_XEN */
5181+#endif /* CONFIG_PARAVIRT_XEN */
5182
5183 #ifdef CONFIG_KDB
5184
5185--- sle11-2009-06-04.orig/arch/x86/kernel/entry_64-xen.S 2009-03-16 16:38:05.000000000 +0100
5186+++ sle11-2009-06-04/arch/x86/kernel/entry_64-xen.S 2009-06-04 10:21:39.000000000 +0200
5187@@ -53,19 +53,130 @@
5188 #include <asm/hw_irq.h>
5189 #include <asm/page.h>
5190 #include <asm/irqflags.h>
5191+#include <asm/ftrace.h>
5192 #include <asm/errno.h>
5193 #include <xen/interface/xen.h>
5194 #include <xen/interface/features.h>
5195
5196+/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
5197+#include <linux/elf-em.h>
5198+#define AUDIT_ARCH_X86_64 (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
5199+#define __AUDIT_ARCH_64BIT 0x80000000
5200+#define __AUDIT_ARCH_LE 0x40000000
5201+
5202 .code64
5203
5204+#ifdef CONFIG_FTRACE
5205+#ifdef CONFIG_DYNAMIC_FTRACE
5206+ENTRY(mcount)
5207+
5208+ subq $0x38, %rsp
5209+ movq %rax, (%rsp)
5210+ movq %rcx, 8(%rsp)
5211+ movq %rdx, 16(%rsp)
5212+ movq %rsi, 24(%rsp)
5213+ movq %rdi, 32(%rsp)
5214+ movq %r8, 40(%rsp)
5215+ movq %r9, 48(%rsp)
5216+
5217+ movq 0x38(%rsp), %rdi
5218+ subq $MCOUNT_INSN_SIZE, %rdi
5219+
5220+.globl mcount_call
5221+mcount_call:
5222+ call ftrace_stub
5223+
5224+ movq 48(%rsp), %r9
5225+ movq 40(%rsp), %r8
5226+ movq 32(%rsp), %rdi
5227+ movq 24(%rsp), %rsi
5228+ movq 16(%rsp), %rdx
5229+ movq 8(%rsp), %rcx
5230+ movq (%rsp), %rax
5231+ addq $0x38, %rsp
5232+
5233+ retq
5234+END(mcount)
5235+
5236+ENTRY(ftrace_caller)
5237+
5238+ /* taken from glibc */
5239+ subq $0x38, %rsp
5240+ movq %rax, (%rsp)
5241+ movq %rcx, 8(%rsp)
5242+ movq %rdx, 16(%rsp)
5243+ movq %rsi, 24(%rsp)
5244+ movq %rdi, 32(%rsp)
5245+ movq %r8, 40(%rsp)
5246+ movq %r9, 48(%rsp)
5247+
5248+ movq 0x38(%rsp), %rdi
5249+ movq 8(%rbp), %rsi
5250+ subq $MCOUNT_INSN_SIZE, %rdi
5251+
5252+.globl ftrace_call
5253+ftrace_call:
5254+ call ftrace_stub
5255+
5256+ movq 48(%rsp), %r9
5257+ movq 40(%rsp), %r8
5258+ movq 32(%rsp), %rdi
5259+ movq 24(%rsp), %rsi
5260+ movq 16(%rsp), %rdx
5261+ movq 8(%rsp), %rcx
5262+ movq (%rsp), %rax
5263+ addq $0x38, %rsp
5264+
5265+.globl ftrace_stub
5266+ftrace_stub:
5267+ retq
5268+END(ftrace_caller)
5269+
5270+#else /* ! CONFIG_DYNAMIC_FTRACE */
5271+ENTRY(mcount)
5272+ cmpq $ftrace_stub, ftrace_trace_function
5273+ jnz trace
5274+.globl ftrace_stub
5275+ftrace_stub:
5276+ retq
5277+
5278+trace:
5279+ /* taken from glibc */
5280+ subq $0x38, %rsp
5281+ movq %rax, (%rsp)
5282+ movq %rcx, 8(%rsp)
5283+ movq %rdx, 16(%rsp)
5284+ movq %rsi, 24(%rsp)
5285+ movq %rdi, 32(%rsp)
5286+ movq %r8, 40(%rsp)
5287+ movq %r9, 48(%rsp)
5288+
5289+ movq 0x38(%rsp), %rdi
5290+ movq 8(%rbp), %rsi
5291+ subq $MCOUNT_INSN_SIZE, %rdi
5292+
5293+ call *ftrace_trace_function
5294+
5295+ movq 48(%rsp), %r9
5296+ movq 40(%rsp), %r8
5297+ movq 32(%rsp), %rdi
5298+ movq 24(%rsp), %rsi
5299+ movq 16(%rsp), %rdx
5300+ movq 8(%rsp), %rcx
5301+ movq (%rsp), %rax
5302+ addq $0x38, %rsp
5303+
5304+ jmp ftrace_stub
5305+END(mcount)
5306+#endif /* CONFIG_DYNAMIC_FTRACE */
5307+#endif /* CONFIG_FTRACE */
5308+
5309 #ifndef CONFIG_PREEMPT
5310 #define retint_kernel retint_restore_args
5311 #endif
5312
5313 #ifdef CONFIG_PARAVIRT
5314-ENTRY(native_irq_enable_syscall_ret)
5315- movq %gs:pda_oldrsp,%rsp
5316+ENTRY(native_usergs_sysret64)
5317 swapgs
5318 sysretq
5319 #endif /* CONFIG_PARAVIRT */
5320@@ -102,7 +213,7 @@ NMI_MASK = 0x80000000
5321 .macro FAKE_STACK_FRAME child_rip
5322 /* push in order ss, rsp, eflags, cs, rip */
5323 xorl %eax, %eax
5324- pushq %rax /* ss */
5325+ pushq $__KERNEL_DS /* ss */
5326 CFI_ADJUST_CFA_OFFSET 8
5327 /*CFI_REL_OFFSET ss,0*/
5328 pushq %rax /* rsp */
5329@@ -197,13 +308,13 @@ ENTRY(ret_from_fork)
5330 CFI_ADJUST_CFA_OFFSET -4
5331 call schedule_tail
5332 GET_THREAD_INFO(%rcx)
5333- testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx)
5334+ testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
5335 jnz rff_trace
5336 rff_action:
5337 RESTORE_REST
5338 testl $3,CS-ARGOFFSET(%rsp) # from kernel_thread?
5339 je int_ret_from_sys_call
5340- testl $_TIF_IA32,threadinfo_flags(%rcx)
5341+ testl $_TIF_IA32,TI_flags(%rcx)
5342 jnz int_ret_from_sys_call
5343 RESTORE_TOP_OF_STACK %rdi,ARGOFFSET
5344 jmp ret_from_sys_call
5345@@ -265,8 +376,9 @@ ENTRY(system_call)
5346 SAVE_ARGS -8,0
5347 movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
5348 GET_THREAD_INFO(%rcx)
5349- testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx)
5350+ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx)
5351 jnz tracesys
5352+system_call_fastpath:
5353 cmpq $__NR_syscall_max,%rax
5354 ja badsys
5355 movq %r10,%rcx
5356@@ -284,7 +396,7 @@ sysret_check:
5357 GET_THREAD_INFO(%rcx)
5358 DISABLE_INTERRUPTS(CLBR_NONE)
5359 TRACE_IRQS_OFF
5360- movl threadinfo_flags(%rcx),%edx
5361+ movl TI_flags(%rcx),%edx
5362 andl %edi,%edx
5363 jnz sysret_careful
5364 CFI_REMEMBER_STATE
5365@@ -315,16 +427,16 @@ sysret_careful:
5366 sysret_signal:
5367 TRACE_IRQS_ON
5368 ENABLE_INTERRUPTS(CLBR_NONE)
5369- testl $_TIF_DO_NOTIFY_MASK,%edx
5370- jz 1f
5371-
5372- /* Really a signal */
5373+#ifdef CONFIG_AUDITSYSCALL
5374+ bt $TIF_SYSCALL_AUDIT,%edx
5375+ jc sysret_audit
5376+#endif
5377 /* edx: work flags (arg3) */
5378 leaq do_notify_resume(%rip),%rax
5379 leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1
5380 xorl %esi,%esi # oldset -> arg2
5381 call ptregscall_common
5382-1: movl $_TIF_NEED_RESCHED,%edi
5383+ movl $_TIF_WORK_MASK,%edi
5384 /* Use IRET because user could have changed frame. This
5385 works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
5386 DISABLE_INTERRUPTS(CLBR_NONE)
5387@@ -335,14 +447,56 @@ badsys:
5388 movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
5389 jmp ret_from_sys_call
5390
5391+#ifdef CONFIG_AUDITSYSCALL
5392+ /*
5393+ * Fast path for syscall audit without full syscall trace.
5394+ * We just call audit_syscall_entry() directly, and then
5395+ * jump back to the normal fast path.
5396+ */
5397+auditsys:
5398+ movq %r10,%r9 /* 6th arg: 4th syscall arg */
5399+ movq %rdx,%r8 /* 5th arg: 3rd syscall arg */
5400+ movq %rsi,%rcx /* 4th arg: 2nd syscall arg */
5401+ movq %rdi,%rdx /* 3rd arg: 1st syscall arg */
5402+ movq %rax,%rsi /* 2nd arg: syscall number */
5403+ movl $AUDIT_ARCH_X86_64,%edi /* 1st arg: audit arch */
5404+ call audit_syscall_entry
5405+ LOAD_ARGS 0 /* reload call-clobbered registers */
5406+ jmp system_call_fastpath
5407+
5408+ /*
5409+ * Return fast path for syscall audit. Call audit_syscall_exit()
5410+ * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
5411+ * masked off.
5412+ */
5413+sysret_audit:
5414+ movq %rax,%rsi /* second arg, syscall return value */
5415+ cmpq $0,%rax /* is it < 0? */
5416+ setl %al /* 1 if so, 0 if not */
5417+ movzbl %al,%edi /* zero-extend that into %edi */
5418+ inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
5419+ call audit_syscall_exit
5420+ movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
5421+ jmp sysret_check
5422+#endif /* CONFIG_AUDITSYSCALL */
5423+
5424 /* Do syscall tracing */
5425 tracesys:
5426+#ifdef CONFIG_AUDITSYSCALL
5427+ testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
5428+ jz auditsys
5429+#endif
5430 SAVE_REST
5431 movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
5432 FIXUP_TOP_OF_STACK %rdi
5433 movq %rsp,%rdi
5434 call syscall_trace_enter
5435- LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */
5436+ /*
5437+ * Reload arg registers from stack in case ptrace changed them.
5438+ * We don't reload %rax because syscall_trace_enter() returned
5439+ * the value it wants us to use in the table lookup.
5440+ */
5441+ LOAD_ARGS ARGOFFSET, 1
5442 RESTORE_REST
5443 cmpq $__NR_syscall_max,%rax
5444 ja int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */
5445@@ -356,6 +510,7 @@ tracesys:
5446 * Has correct top of stack, but partial stack frame.
5447 */
5448 .globl int_ret_from_sys_call
5449+ .globl int_with_check
5450 int_ret_from_sys_call:
5451 DISABLE_INTERRUPTS(CLBR_NONE)
5452 TRACE_IRQS_OFF
5453@@ -370,10 +525,10 @@ int_ret_from_sys_call:
5454 int_with_check:
5455 LOCKDEP_SYS_EXIT_IRQ
5456 GET_THREAD_INFO(%rcx)
5457- movl threadinfo_flags(%rcx),%edx
5458+ movl TI_flags(%rcx),%edx
5459 andl %edi,%edx
5460 jnz int_careful
5461- andl $~TS_COMPAT,threadinfo_status(%rcx)
5462+ andl $~TS_COMPAT,TI_status(%rcx)
5463 jmp retint_restore_args
5464
5465 /* Either reschedule or signal or syscall exit tracking needed. */
5466@@ -399,7 +554,7 @@ int_very_careful:
5467 ENABLE_INTERRUPTS(CLBR_NONE)
5468 SAVE_REST
5469 /* Check for syscall exit trace */
5470- testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx
5471+ testl $_TIF_WORK_SYSCALL_EXIT,%edx
5472 jz int_signal
5473 pushq %rdi
5474 CFI_ADJUST_CFA_OFFSET 8
5475@@ -407,7 +562,7 @@ int_very_careful:
5476 call syscall_trace_leave
5477 popq %rdi
5478 CFI_ADJUST_CFA_OFFSET -8
5479- andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi
5480+ andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
5481 jmp int_restore_rest
5482
5483 int_signal:
5484@@ -416,7 +571,7 @@ int_signal:
5485 movq %rsp,%rdi # &ptregs -> arg1
5486 xorl %esi,%esi # oldset -> arg2
5487 call do_notify_resume
5488-1: movl $_TIF_NEED_RESCHED,%edi
5489+1: movl $_TIF_WORK_MASK,%edi
5490 int_restore_rest:
5491 RESTORE_REST
5492 DISABLE_INTERRUPTS(CLBR_NONE)
5493@@ -443,7 +598,6 @@ END(\label)
5494 PTREGSCALL stub_clone, sys_clone, %r8
5495 PTREGSCALL stub_fork, sys_fork, %rdi
5496 PTREGSCALL stub_vfork, sys_vfork, %rdi
5497- PTREGSCALL stub_rt_sigsuspend, sys_rt_sigsuspend, %rdx
5498 PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx
5499 PTREGSCALL stub_iopl, sys_iopl, %rsi
5500
5501@@ -517,10 +671,12 @@ END(stub_rt_sigreturn)
5502 *
5503 */
5504
5505-retint_check:
5506+retint_with_reschedule:
5507 CFI_DEFAULT_STACK adj=1
5508+ movl $_TIF_WORK_MASK,%edi
5509+retint_check:
5510 LOCKDEP_SYS_EXIT_IRQ
5511- movl threadinfo_flags(%rcx),%edx
5512+ movl TI_flags(%rcx),%edx
5513 andl %edi,%edx
5514 CFI_REMEMBER_STATE
5515 jnz retint_careful
5516@@ -565,17 +721,16 @@ retint_signal:
5517 RESTORE_REST
5518 DISABLE_INTERRUPTS(CLBR_NONE)
5519 TRACE_IRQS_OFF
5520- movl $_TIF_NEED_RESCHED,%edi
5521 GET_THREAD_INFO(%rcx)
5522- jmp retint_check
5523+ jmp retint_with_reschedule
5524
5525 #ifdef CONFIG_PREEMPT
5526 /* Returning to kernel space. Check if we need preemption */
5527 /* rcx: threadinfo. interrupts off. */
5528 ENTRY(retint_kernel)
5529- cmpl $0,threadinfo_preempt_count(%rcx)
5530+ cmpl $0,TI_preempt_count(%rcx)
5531 jnz retint_restore_args
5532- bt $TIF_NEED_RESCHED,threadinfo_flags(%rcx)
5533+ bt $TIF_NEED_RESCHED,TI_flags(%rcx)
5534 jnc retint_restore_args
5535 bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */
5536 jnc retint_restore_args
5537@@ -630,6 +785,9 @@ END(invalidate_interrupt\num)
5538 ENTRY(call_function_interrupt)
5539 apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt
5540 END(call_function_interrupt)
5541+ENTRY(call_function_single_interrupt)
5542+ apicinterrupt CALL_FUNCTION_SINGLE_VECTOR,smp_call_function_single_interrupt
5543+END(call_function_single_interrupt)
5544 ENTRY(irq_move_cleanup_interrupt)
5545 apicinterrupt IRQ_MOVE_CLEANUP_VECTOR,smp_irq_move_cleanup_interrupt
5546 END(irq_move_cleanup_interrupt)
5547@@ -639,6 +797,10 @@ ENTRY(apic_timer_interrupt)
5548 apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt
5549 END(apic_timer_interrupt)
5550
5551+ENTRY(uv_bau_message_intr1)
5552+ apicinterrupt 220,uv_bau_message_interrupt
5553+END(uv_bau_message_intr1)
5554+
5555 ENTRY(error_interrupt)
5556 apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt
5557 END(error_interrupt)
5558@@ -752,7 +914,7 @@ paranoid_restore\trace:
5559 jmp irq_return
5560 paranoid_userspace\trace:
5561 GET_THREAD_INFO(%rcx)
5562- movl threadinfo_flags(%rcx),%ebx
5563+ movl TI_flags(%rcx),%ebx
5564 andl $_TIF_WORK_MASK,%ebx
5565 jz paranoid_swapgs\trace
5566 movq %rsp,%rdi /* &pt_regs */
5567@@ -849,7 +1011,7 @@ error_exit:
5568 testb $3,CS-ARGOFFSET(%rsp)
5569 jz retint_kernel
5570 LOCKDEP_SYS_EXIT_IRQ
5571- movl threadinfo_flags(%rcx),%edx
5572+ movl TI_flags(%rcx),%edx
5573 movl $_TIF_WORK_MASK,%edi
5574 andl %edi,%edx
5575 jnz retint_careful
5576@@ -871,11 +1033,11 @@ error_kernelspace:
5577 iret run with kernel gs again, so don't set the user space flag.
5578 B stepping K8s sometimes report an truncated RIP for IRET
5579 exceptions returning to compat mode. Check for these here too. */
5580- leaq irq_return(%rip),%rbp
5581- cmpq %rbp,RIP(%rsp)
5582+ leaq irq_return(%rip),%rcx
5583+ cmpq %rcx,RIP(%rsp)
5584 je error_swapgs
5585- movl %ebp,%ebp /* zero extend */
5586- cmpq %rbp,RIP(%rsp)
5587+ movl %ecx,%ecx /* zero extend */
5588+ cmpq %rcx,RIP(%rsp)
5589 je error_swapgs
5590 cmpq $gs_change,RIP(%rsp)
5591 je error_swapgs
5592@@ -1121,6 +1283,7 @@ END(device_not_available)
5593 /* runs on exception stack */
5594 KPROBE_ENTRY(debug)
5595 /* INTR_FRAME
5596+ PARAVIRT_ADJUST_EXCEPTION_FRAME
5597 pushq $0
5598 CFI_ADJUST_CFA_OFFSET 8 */
5599 zeroentry do_debug
5600@@ -1148,6 +1311,7 @@ END(do_nmi_callback)
5601
5602 KPROBE_ENTRY(int3)
5603 /* INTR_FRAME
5604+ PARAVIRT_ADJUST_EXCEPTION_FRAME
5605 pushq $0
5606 CFI_ADJUST_CFA_OFFSET 8 */
5607 zeroentry do_int3
5608@@ -1171,14 +1335,11 @@ ENTRY(coprocessor_segment_overrun)
5609 zeroentry do_coprocessor_segment_overrun
5610 END(coprocessor_segment_overrun)
5611
5612-ENTRY(reserved)
5613- zeroentry do_reserved
5614-END(reserved)
5615-
5616 #if 0
5617 /* runs on exception stack */
5618 ENTRY(double_fault)
5619 XCPT_FRAME
5620+ PARAVIRT_ADJUST_EXCEPTION_FRAME
5621 paranoidentry do_double_fault
5622 jmp paranoid_exit1
5623 CFI_ENDPROC
5624@@ -1196,6 +1357,7 @@ END(segment_not_present)
5625 /* runs on exception stack */
5626 ENTRY(stack_segment)
5627 /* XCPT_FRAME
5628+ PARAVIRT_ADJUST_EXCEPTION_FRAME
5629 paranoidentry do_stack_segment */
5630 errorentry do_stack_segment
5631 /* jmp paranoid_exit1
5632@@ -1222,6 +1384,7 @@ END(spurious_interrupt_bug)
5633 /* runs on exception stack */
5634 ENTRY(machine_check)
5635 INTR_FRAME
5636+ PARAVIRT_ADJUST_EXCEPTION_FRAME
5637 pushq $0
5638 CFI_ADJUST_CFA_OFFSET 8
5639 paranoidentry do_machine_check
5640--- sle11-2009-06-04.orig/arch/x86/kernel/fixup.c 2009-03-16 16:33:40.000000000 +0100
5641+++ sle11-2009-06-04/arch/x86/kernel/fixup.c 2009-06-04 10:21:39.000000000 +0200
5642@@ -33,6 +33,7 @@
5643 #include <linux/kernel.h>
5644 #include <linux/delay.h>
5645 #include <linux/version.h>
5646+#include <asm/traps.h>
5647
5648 #define DP(_f, _args...) printk(KERN_ALERT " " _f "\n" , ## _args )
5649
5650--- sle11-2009-06-04.orig/arch/x86/kernel/genapic_64-xen.c 2009-03-16 16:38:05.000000000 +0100
5651+++ sle11-2009-06-04/arch/x86/kernel/genapic_64-xen.c 2009-06-04 10:21:39.000000000 +0200
5652@@ -58,7 +58,7 @@ void __init setup_apic_routing(void)
5653 else
5654 #endif
5655
5656- if (num_possible_cpus() <= 8)
5657+ if (max_physical_apicid < 8)
5658 genapic = &apic_flat;
5659 else
5660 genapic = &apic_physflat;
5661@@ -121,4 +121,5 @@ int is_uv_system(void)
5662 {
5663 return uv_system_type != UV_NONE;
5664 }
5665+EXPORT_SYMBOL_GPL(is_uv_system);
5666 #endif
5667--- sle11-2009-06-04.orig/arch/x86/kernel/genapic_xen_64.c 2009-03-16 16:38:05.000000000 +0100
5668+++ sle11-2009-06-04/arch/x86/kernel/genapic_xen_64.c 2009-06-04 10:21:39.000000000 +0200
5669@@ -43,7 +43,7 @@ void xen_send_IPI_shortcut(unsigned int
5670 __send_IPI_one(smp_processor_id(), vector);
5671 break;
5672 case APIC_DEST_ALLBUT:
5673- for (cpu = 0; cpu < NR_CPUS; ++cpu) {
5674+ for_each_possible_cpu(cpu) {
5675 if (cpu == smp_processor_id())
5676 continue;
5677 if (cpu_isset(cpu, cpu_online_map)) {
5678@@ -52,7 +52,7 @@ void xen_send_IPI_shortcut(unsigned int
5679 }
5680 break;
5681 case APIC_DEST_ALLINC:
5682- for (cpu = 0; cpu < NR_CPUS; ++cpu) {
5683+ for_each_possible_cpu(cpu) {
5684 if (cpu_isset(cpu, cpu_online_map)) {
5685 __send_IPI_one(cpu, vector);
5686 }
5687@@ -81,8 +81,6 @@ static cpumask_t xen_vector_allocation_d
5688 */
5689 static void xen_init_apic_ldr(void)
5690 {
5691- Dprintk("%s\n", __FUNCTION__);
5692- return;
5693 }
5694
5695 static void xen_send_IPI_allbutself(int vector)
5696@@ -92,14 +90,12 @@ static void xen_send_IPI_allbutself(int
5697 * we get an APIC send error if we try to broadcast.
5698 * thus we have to avoid sending IPIs in this case.
5699 */
5700- Dprintk("%s\n", __FUNCTION__);
5701 if (num_online_cpus() > 1)
5702 xen_send_IPI_shortcut(APIC_DEST_ALLBUT, vector, APIC_DEST_LOGICAL);
5703 }
5704
5705 static void xen_send_IPI_all(int vector)
5706 {
5707- Dprintk("%s\n", __FUNCTION__);
5708 xen_send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL);
5709 }
5710
5711@@ -109,11 +105,10 @@ static void xen_send_IPI_mask(cpumask_t
5712 unsigned int cpu;
5713 unsigned long flags;
5714
5715- Dprintk("%s\n", __FUNCTION__);
5716 local_irq_save(flags);
5717 WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]);
5718
5719- for (cpu = 0; cpu < NR_CPUS; ++cpu) {
5720+ for_each_possible_cpu(cpu) {
5721 if (cpu_isset(cpu, cpumask)) {
5722 __send_IPI_one(cpu, vector);
5723 }
5724@@ -125,14 +120,12 @@ static void xen_send_IPI_mask(cpumask_t
5725 static int xen_apic_id_registered(void)
5726 {
5727 /* better be set */
5728- Dprintk("%s\n", __FUNCTION__);
5729 return physid_isset(smp_processor_id(), phys_cpu_present_map);
5730 }
5731 #endif
5732
5733 static unsigned int xen_cpu_mask_to_apicid(cpumask_t cpumask)
5734 {
5735- Dprintk("%s\n", __FUNCTION__);
5736 return cpus_addr(cpumask)[0] & APIC_ALL_CPUS;
5737 }
5738
5739@@ -140,7 +133,6 @@ static unsigned int phys_pkg_id(int inde
5740 {
5741 u32 ebx;
5742
5743- Dprintk("%s\n", __FUNCTION__);
5744 ebx = cpuid_ebx(1);
5745 return ((ebx >> 24) & 0xFF) >> index_msb;
5746 }
5747--- /dev/null 1970-01-01 00:00:00.000000000 +0000
5748+++ sle11-2009-06-04/arch/x86/kernel/head-xen.c 2009-06-04 10:21:39.000000000 +0200
5749@@ -0,0 +1,57 @@
5750+#include <linux/kernel.h>
5751+#include <linux/init.h>
5752+
5753+#include <asm/setup.h>
5754+#include <asm/bios_ebda.h>
5755+
5756+#define BIOS_LOWMEM_KILOBYTES 0x413
5757+
5758+/*
5759+ * The BIOS places the EBDA/XBDA at the top of conventional
5760+ * memory, and usually decreases the reported amount of
5761+ * conventional memory (int 0x12) too. This also contains a
5762+ * workaround for Dell systems that neglect to reserve EBDA.
5763+ * The same workaround also avoids a problem with the AMD768MPX
5764+ * chipset: reserve a page before VGA to prevent PCI prefetch
5765+ * into it (errata #56). Usually the page is reserved anyways,
5766+ * unless you have no PS/2 mouse plugged in.
5767+ */
5768+void __init reserve_ebda_region(void)
5769+{
5770+#ifndef CONFIG_XEN
5771+ unsigned int lowmem, ebda_addr;
5772+
5773+ /* To determine the position of the EBDA and the */
5774+ /* end of conventional memory, we need to look at */
5775+ /* the BIOS data area. In a paravirtual environment */
5776+ /* that area is absent. We'll just have to assume */
5777+ /* that the paravirt case can handle memory setup */
5778+ /* correctly, without our help. */
5779+ if (paravirt_enabled())
5780+ return;
5781+
5782+ /* end of low (conventional) memory */
5783+ lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
5784+ lowmem <<= 10;
5785+
5786+ /* start of EBDA area */
5787+ ebda_addr = get_bios_ebda();
5788+
5789+ /* Fixup: bios puts an EBDA in the top 64K segment */
5790+ /* of conventional memory, but does not adjust lowmem. */
5791+ if ((lowmem - ebda_addr) <= 0x10000)
5792+ lowmem = ebda_addr;
5793+
5794+ /* Fixup: bios does not report an EBDA at all. */
5795+ /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
5796+ if ((ebda_addr == 0) && (lowmem >= 0x9f000))
5797+ lowmem = 0x9f000;
5798+
5799+ /* Paranoia: should never happen, but... */
5800+ if ((lowmem == 0) || (lowmem >= 0x100000))
5801+ lowmem = 0x9f000;
5802+
5803+ /* reserve all memory between lowmem and the 1MB mark */
5804+ reserve_early_overlap_ok(lowmem, 0x100000, "BIOS reserved");
5805+#endif
5806+}
5807--- /dev/null 1970-01-01 00:00:00.000000000 +0000
5808+++ sle11-2009-06-04/arch/x86/kernel/head32-xen.c 2009-06-04 10:21:39.000000000 +0200
5809@@ -0,0 +1,57 @@
5810+/*
5811+ * linux/arch/i386/kernel/head32.c -- prepare to run common code
5812+ *
5813+ * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
5814+ * Copyright (C) 2007 Eric Biederman <ebiederm@xmission.com>
5815+ */
5816+
5817+#include <linux/init.h>
5818+#include <linux/start_kernel.h>
5819+
5820+#include <asm/setup.h>
5821+#include <asm/sections.h>
5822+#include <asm/e820.h>
5823+#include <asm/bios_ebda.h>
5824+
5825+void __init i386_start_kernel(void)
5826+{
5827+ reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
5828+
5829+#ifndef CONFIG_XEN
5830+#ifdef CONFIG_BLK_DEV_INITRD
5831+ /* Reserve INITRD */
5832+ if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
5833+ u64 ramdisk_image = boot_params.hdr.ramdisk_image;
5834+ u64 ramdisk_size = boot_params.hdr.ramdisk_size;
5835+ u64 ramdisk_end = ramdisk_image + ramdisk_size;
5836+ reserve_early(ramdisk_image, ramdisk_end, "RAMDISK");
5837+ }
5838+#endif
5839+ reserve_early(init_pg_tables_start, init_pg_tables_end,
5840+ "INIT_PG_TABLE");
5841+#else
5842+ reserve_early(ALIGN(__pa_symbol(&_end), PAGE_SIZE),
5843+ __pa(xen_start_info->pt_base)
5844+ + (xen_start_info->nr_pt_frames << PAGE_SHIFT),
5845+ "Xen provided");
5846+
5847+ {
5848+ int max_cmdline;
5849+
5850+ if ((max_cmdline = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE)
5851+ max_cmdline = COMMAND_LINE_SIZE;
5852+ memcpy(boot_command_line, xen_start_info->cmd_line, max_cmdline);
5853+ boot_command_line[max_cmdline-1] = '\0';
5854+ }
5855+#endif
5856+
5857+ reserve_ebda_region();
5858+
5859+ /*
5860+ * At this point everything still needed from the boot loader
5861+ * or BIOS or kernel text should be early reserved or marked not
5862+ * RAM in e820. All other memory is free game.
5863+ */
5864+
5865+ start_kernel();
5866+}
5867--- sle11-2009-06-04.orig/arch/x86/kernel/head64-xen.c 2009-03-16 16:38:05.000000000 +0100
5868+++ sle11-2009-06-04/arch/x86/kernel/head64-xen.c 2009-06-04 10:21:39.000000000 +0200
5869@@ -32,7 +32,26 @@
5870 #include <asm/e820.h>
5871 #include <asm/bios_ebda.h>
5872
5873-unsigned long start_pfn;
5874+/* boot cpu pda */
5875+static struct x8664_pda _boot_cpu_pda __read_mostly;
5876+
5877+#ifdef CONFIG_SMP
5878+/*
5879+ * We install an empty cpu_pda pointer table to indicate to early users
5880+ * (numa_set_node) that the cpu_pda pointer table for cpus other than
5881+ * the boot cpu is not yet setup.
5882+ */
5883+static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata;
5884+#else
5885+static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly;
5886+#endif
5887+
5888+void __init x86_64_init_pda(void)
5889+{
5890+ _cpu_pda = __cpu_pda;
5891+ cpu_pda(0) = &_boot_cpu_pda;
5892+ pda_init(0);
5893+}
5894
5895 #ifndef CONFIG_XEN
5896 static void __init zap_identity_mappings(void)
5897@@ -77,83 +96,10 @@ EXPORT_SYMBOL(machine_to_phys_mapping);
5898 unsigned int machine_to_phys_order;
5899 EXPORT_SYMBOL(machine_to_phys_order);
5900
5901-#define BIOS_LOWMEM_KILOBYTES 0x413
5902-
5903-/*
5904- * The BIOS places the EBDA/XBDA at the top of conventional
5905- * memory, and usually decreases the reported amount of
5906- * conventional memory (int 0x12) too. This also contains a
5907- * workaround for Dell systems that neglect to reserve EBDA.
5908- * The same workaround also avoids a problem with the AMD768MPX
5909- * chipset: reserve a page before VGA to prevent PCI prefetch
5910- * into it (errata #56). Usually the page is reserved anyways,
5911- * unless you have no PS/2 mouse plugged in.
5912- */
5913-static void __init reserve_ebda_region(void)
5914-{
5915-#ifndef CONFIG_XEN
5916- unsigned int lowmem, ebda_addr;
5917-
5918- /* To determine the position of the EBDA and the */
5919- /* end of conventional memory, we need to look at */
5920- /* the BIOS data area. In a paravirtual environment */
5921- /* that area is absent. We'll just have to assume */
5922- /* that the paravirt case can handle memory setup */
5923- /* correctly, without our help. */
5924- if (paravirt_enabled())
5925- return;
5926-
5927- /* end of low (conventional) memory */
5928- lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
5929- lowmem <<= 10;
5930-
5931- /* start of EBDA area */
5932- ebda_addr = get_bios_ebda();
5933-
5934- /* Fixup: bios puts an EBDA in the top 64K segment */
5935- /* of conventional memory, but does not adjust lowmem. */
5936- if ((lowmem - ebda_addr) <= 0x10000)
5937- lowmem = ebda_addr;
5938-
5939- /* Fixup: bios does not report an EBDA at all. */
5940- /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
5941- if ((ebda_addr == 0) && (lowmem >= 0x9f000))
5942- lowmem = 0x9f000;
5943-
5944- /* Paranoia: should never happen, but... */
5945- if ((lowmem == 0) || (lowmem >= 0x100000))
5946- lowmem = 0x9f000;
5947-
5948- /* reserve all memory between lowmem and the 1MB mark */
5949- reserve_early(lowmem, 0x100000, "BIOS reserved");
5950-#endif
5951-}
5952-
5953-static void __init reserve_setup_data(void)
5954-{
5955-#ifndef CONFIG_XEN
5956- struct setup_data *data;
5957- unsigned long pa_data;
5958- char buf[32];
5959-
5960- if (boot_params.hdr.version < 0x0209)
5961- return;
5962- pa_data = boot_params.hdr.setup_data;
5963- while (pa_data) {
5964- data = early_ioremap(pa_data, sizeof(*data));
5965- sprintf(buf, "setup data %x", data->type);
5966- reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf);
5967- pa_data = data->next;
5968- early_iounmap(data, sizeof(*data));
5969- }
5970-#endif
5971-}
5972-
5973 void __init x86_64_start_kernel(char * real_mode_data)
5974 {
5975 struct xen_machphys_mapping mapping;
5976 unsigned long machine_to_phys_nr_ents;
5977- int i;
5978
5979 /*
5980 * Build-time sanity checks on the kernel image and module
5981@@ -167,6 +113,7 @@ void __init x86_64_start_kernel(char * r
5982 BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
5983 BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
5984 (__START_KERNEL & PGDIR_MASK)));
5985+ BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);
5986
5987 xen_setup_features();
5988
5989@@ -174,8 +121,6 @@ void __init x86_64_start_kernel(char * r
5990 if (!xen_feature(XENFEAT_auto_translated_physmap))
5991 phys_to_machine_mapping =
5992 (unsigned long *)xen_start_info->mfn_list;
5993- start_pfn = (__pa(xen_start_info->pt_base) >> PAGE_SHIFT) +
5994- xen_start_info->nr_pt_frames;
5995
5996 machine_to_phys_mapping = (unsigned long *)MACH2PHYS_VIRT_START;
5997 machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES;
5998@@ -208,19 +153,23 @@ void __init x86_64_start_kernel(char * r
5999
6000 early_printk("Kernel alive\n");
6001
6002- for (i = 0; i < NR_CPUS; i++)
6003- cpu_pda(i) = &boot_cpu_pda[i];
6004+ x86_64_init_pda();
6005
6006- pda_init(0);
6007+ early_printk("Kernel really alive\n");
6008+
6009+ x86_64_start_reservations(real_mode_data);
6010+}
6011+
6012+void __init x86_64_start_reservations(char *real_mode_data)
6013+{
6014 copy_bootdata(__va(real_mode_data));
6015
6016 reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
6017
6018 reserve_early(round_up(__pa_symbol(&_end), PAGE_SIZE),
6019- start_pfn << PAGE_SHIFT, "Xen provided");
6020-
6021- reserve_ebda_region();
6022- reserve_setup_data();
6023+ __pa(xen_start_info->pt_base)
6024+ + (xen_start_info->nr_pt_frames << PAGE_SHIFT),
6025+ "Xen provided");
6026
6027 /*
6028 * At this point everything still needed from the boot loader
6029--- sle11-2009-06-04.orig/arch/x86/kernel/head_64-xen.S 2009-02-16 16:17:21.000000000 +0100
6030+++ sle11-2009-06-04/arch/x86/kernel/head_64-xen.S 2009-06-04 10:21:39.000000000 +0200
6031@@ -95,53 +95,6 @@ NEXT_PAGE(hypercall_page)
6032
6033 #undef NEXT_PAGE
6034
6035- .data
6036-
6037- .align 16
6038- .globl cpu_gdt_descr
6039-cpu_gdt_descr:
6040- .word gdt_end-cpu_gdt_table-1
6041-gdt:
6042- .quad cpu_gdt_table
6043-#ifdef CONFIG_SMP
6044- .rept NR_CPUS-1
6045- .word 0
6046- .quad 0
6047- .endr
6048-#endif
6049-
6050-/* We need valid kernel segments for data and code in long mode too
6051- * IRET will check the segment types kkeil 2000/10/28
6052- * Also sysret mandates a special GDT layout
6053- */
6054-
6055- .section .data.page_aligned, "aw"
6056- .align PAGE_SIZE
6057-
6058-/* The TLS descriptors are currently at a different place compared to i386.
6059- Hopefully nobody expects them at a fixed place (Wine?) */
6060-
6061-ENTRY(cpu_gdt_table)
6062- .quad 0x0000000000000000 /* NULL descriptor */
6063- .quad 0x00cf9b000000ffff /* __KERNEL32_CS */
6064- .quad 0x00af9b000000ffff /* __KERNEL_CS */
6065- .quad 0x00cf93000000ffff /* __KERNEL_DS */
6066- .quad 0x00cffb000000ffff /* __USER32_CS */
6067- .quad 0x00cff3000000ffff /* __USER_DS, __USER32_DS */
6068- .quad 0x00affb000000ffff /* __USER_CS */
6069- .quad 0x0 /* unused */
6070- .quad 0,0 /* TSS */
6071- .quad 0,0 /* LDT */
6072- .quad 0,0,0 /* three TLS descriptors */
6073- .quad 0x0000f40000000000 /* node/CPU stored in limit */
6074-gdt_end:
6075- /* asm/segment.h:GDT_ENTRIES must match this */
6076- /* This should be a multiple of the cache line size */
6077- /* GDTs of other CPUs are now dynamically allocated */
6078-
6079- /* zero the remaining page */
6080- .fill PAGE_SIZE / 8 - GDT_ENTRIES,8,0
6081-
6082 .section .bss.page_aligned, "aw", @nobits
6083 .align PAGE_SIZE
6084 ENTRY(empty_zero_page)
6085--- sle11-2009-06-04.orig/arch/x86/kernel/io_apic_32-xen.c 2009-03-16 16:38:05.000000000 +0100
6086+++ sle11-2009-06-04/arch/x86/kernel/io_apic_32-xen.c 2009-06-04 10:21:39.000000000 +0200
6087@@ -25,6 +25,7 @@
6088 #include <linux/init.h>
6089 #include <linux/delay.h>
6090 #include <linux/sched.h>
6091+#include <linux/bootmem.h>
6092 #include <linux/mc146818rtc.h>
6093 #include <linux/compiler.h>
6094 #include <linux/acpi.h>
6095@@ -75,7 +76,7 @@ static struct { int pin, apic; } ioapic_
6096 static DEFINE_SPINLOCK(ioapic_lock);
6097 static DEFINE_SPINLOCK(vector_lock);
6098
6099-int timer_over_8254 __initdata = 1;
6100+int timer_through_8259 __initdata;
6101
6102 /*
6103 * Is the SiS APIC rmw bug present ?
6104@@ -89,15 +90,21 @@ int sis_apic_bug = -1;
6105 int nr_ioapic_registers[MAX_IO_APICS];
6106
6107 /* I/O APIC entries */
6108-struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
6109+struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
6110 int nr_ioapics;
6111
6112 /* MP IRQ source entries */
6113-struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
6114+struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
6115
6116 /* # of MP IRQ source entries */
6117 int mp_irq_entries;
6118
6119+#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
6120+int mp_bus_id_to_type[MAX_MP_BUSSES];
6121+#endif
6122+
6123+DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
6124+
6125 static int disable_timer_pin_1 __initdata;
6126
6127 /*
6128@@ -128,7 +135,7 @@ struct io_apic {
6129 static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
6130 {
6131 return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
6132- + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK);
6133+ + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK);
6134 }
6135 #endif
6136
6137@@ -142,7 +149,7 @@ static inline unsigned int io_apic_read(
6138 struct physdev_apic apic_op;
6139 int ret;
6140
6141- apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
6142+ apic_op.apic_physbase = mp_ioapics[apic].mp_apicaddr;
6143 apic_op.reg = reg;
6144 ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op);
6145 if (ret)
6146@@ -160,7 +167,7 @@ static inline void io_apic_write(unsigne
6147 #else
6148 struct physdev_apic apic_op;
6149
6150- apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
6151+ apic_op.apic_physbase = mp_ioapics[apic].mp_apicaddr;
6152 apic_op.reg = reg;
6153 apic_op.value = value;
6154 WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op));
6155@@ -288,7 +295,7 @@ static void __init replace_pin_at_irq(un
6156 }
6157 }
6158
6159-static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsigned long disable)
6160+static void __modify_IO_APIC_irq(unsigned int irq, unsigned long enable, unsigned long disable)
6161 {
6162 struct irq_pin_list *entry = irq_2_pin + irq;
6163 unsigned int pin, reg;
6164@@ -308,30 +315,32 @@ static void __modify_IO_APIC_irq (unsign
6165 }
6166
6167 /* mask = 1 */
6168-static void __mask_IO_APIC_irq (unsigned int irq)
6169+static void __mask_IO_APIC_irq(unsigned int irq)
6170 {
6171- __modify_IO_APIC_irq(irq, 0x00010000, 0);
6172+ __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED, 0);
6173 }
6174
6175 /* mask = 0 */
6176-static void __unmask_IO_APIC_irq (unsigned int irq)
6177+static void __unmask_IO_APIC_irq(unsigned int irq)
6178 {
6179- __modify_IO_APIC_irq(irq, 0, 0x00010000);
6180+ __modify_IO_APIC_irq(irq, 0, IO_APIC_REDIR_MASKED);
6181 }
6182
6183 /* mask = 1, trigger = 0 */
6184-static void __mask_and_edge_IO_APIC_irq (unsigned int irq)
6185+static void __mask_and_edge_IO_APIC_irq(unsigned int irq)
6186 {
6187- __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000);
6188+ __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED,
6189+ IO_APIC_REDIR_LEVEL_TRIGGER);
6190 }
6191
6192 /* mask = 0, trigger = 1 */
6193-static void __unmask_and_level_IO_APIC_irq (unsigned int irq)
6194+static void __unmask_and_level_IO_APIC_irq(unsigned int irq)
6195 {
6196- __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000);
6197+ __modify_IO_APIC_irq(irq, IO_APIC_REDIR_LEVEL_TRIGGER,
6198+ IO_APIC_REDIR_MASKED);
6199 }
6200
6201-static void mask_IO_APIC_irq (unsigned int irq)
6202+static void mask_IO_APIC_irq(unsigned int irq)
6203 {
6204 unsigned long flags;
6205
6206@@ -340,7 +349,7 @@ static void mask_IO_APIC_irq (unsigned i
6207 spin_unlock_irqrestore(&ioapic_lock, flags);
6208 }
6209
6210-static void unmask_IO_APIC_irq (unsigned int irq)
6211+static void unmask_IO_APIC_irq(unsigned int irq)
6212 {
6213 unsigned long flags;
6214
6215@@ -352,7 +361,7 @@ static void unmask_IO_APIC_irq (unsigned
6216 static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
6217 {
6218 struct IO_APIC_route_entry entry;
6219-
6220+
6221 /* Check delivery_mode to be sure we're not clearing an SMI pin */
6222 entry = ioapic_read_entry(apic, pin);
6223 if (entry.delivery_mode == dest_SMI)
6224@@ -364,7 +373,7 @@ static void clear_IO_APIC_pin(unsigned i
6225 ioapic_mask_entry(apic, pin);
6226 }
6227
6228-static void clear_IO_APIC (void)
6229+static void clear_IO_APIC(void)
6230 {
6231 int apic, pin;
6232
6233@@ -381,7 +390,7 @@ static void set_ioapic_affinity_irq(unsi
6234 struct irq_pin_list *entry = irq_2_pin + irq;
6235 unsigned int apicid_value;
6236 cpumask_t tmp;
6237-
6238+
6239 cpus_and(tmp, cpumask, cpu_online_map);
6240 if (cpus_empty(tmp))
6241 tmp = TARGET_CPUS;
6242@@ -410,7 +419,7 @@ static void set_ioapic_affinity_irq(unsi
6243 # include <linux/kernel_stat.h> /* kstat */
6244 # include <linux/slab.h> /* kmalloc() */
6245 # include <linux/timer.h>
6246-
6247+
6248 #define IRQBALANCE_CHECK_ARCH -999
6249 #define MAX_BALANCED_IRQ_INTERVAL (5*HZ)
6250 #define MIN_BALANCED_IRQ_INTERVAL (HZ/2)
6251@@ -422,14 +431,14 @@ static int physical_balance __read_mostl
6252 static long balanced_irq_interval __read_mostly = MAX_BALANCED_IRQ_INTERVAL;
6253
6254 static struct irq_cpu_info {
6255- unsigned long * last_irq;
6256- unsigned long * irq_delta;
6257+ unsigned long *last_irq;
6258+ unsigned long *irq_delta;
6259 unsigned long irq;
6260 } irq_cpu_data[NR_CPUS];
6261
6262 #define CPU_IRQ(cpu) (irq_cpu_data[cpu].irq)
6263-#define LAST_CPU_IRQ(cpu,irq) (irq_cpu_data[cpu].last_irq[irq])
6264-#define IRQ_DELTA(cpu,irq) (irq_cpu_data[cpu].irq_delta[irq])
6265+#define LAST_CPU_IRQ(cpu, irq) (irq_cpu_data[cpu].last_irq[irq])
6266+#define IRQ_DELTA(cpu, irq) (irq_cpu_data[cpu].irq_delta[irq])
6267
6268 #define IDLE_ENOUGH(cpu,now) \
6269 (idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1))
6270@@ -468,8 +477,8 @@ inside:
6271 if (cpu == -1)
6272 cpu = NR_CPUS-1;
6273 }
6274- } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu,allowed_mask) ||
6275- (search_idle && !IDLE_ENOUGH(cpu,now)));
6276+ } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu, allowed_mask) ||
6277+ (search_idle && !IDLE_ENOUGH(cpu, now)));
6278
6279 return cpu;
6280 }
6281@@ -479,15 +488,14 @@ static inline void balance_irq(int cpu,
6282 unsigned long now = jiffies;
6283 cpumask_t allowed_mask;
6284 unsigned int new_cpu;
6285-
6286+
6287 if (irqbalance_disabled)
6288- return;
6289+ return;
6290
6291 cpus_and(allowed_mask, cpu_online_map, balance_irq_affinity[irq]);
6292 new_cpu = move(cpu, allowed_mask, now, 1);
6293- if (cpu != new_cpu) {
6294+ if (cpu != new_cpu)
6295 set_pending_irq(irq, cpumask_of_cpu(new_cpu));
6296- }
6297 }
6298
6299 static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold)
6300@@ -499,14 +507,14 @@ static inline void rotate_irqs_among_cpu
6301 if (!irq_desc[j].action)
6302 continue;
6303 /* Is it a significant load ? */
6304- if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i),j) <
6305+ if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i), j) <
6306 useful_load_threshold)
6307 continue;
6308 balance_irq(i, j);
6309 }
6310 }
6311 balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
6312- balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
6313+ balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
6314 return;
6315 }
6316
6317@@ -535,22 +543,22 @@ static void do_irq_balance(void)
6318 /* Is this an active IRQ or balancing disabled ? */
6319 if (!irq_desc[j].action || irq_balancing_disabled(j))
6320 continue;
6321- if ( package_index == i )
6322- IRQ_DELTA(package_index,j) = 0;
6323+ if (package_index == i)
6324+ IRQ_DELTA(package_index, j) = 0;
6325 /* Determine the total count per processor per IRQ */
6326 value_now = (unsigned long) kstat_cpu(i).irqs[j];
6327
6328 /* Determine the activity per processor per IRQ */
6329- delta = value_now - LAST_CPU_IRQ(i,j);
6330+ delta = value_now - LAST_CPU_IRQ(i, j);
6331
6332 /* Update last_cpu_irq[][] for the next time */
6333- LAST_CPU_IRQ(i,j) = value_now;
6334+ LAST_CPU_IRQ(i, j) = value_now;
6335
6336 /* Ignore IRQs whose rate is less than the clock */
6337 if (delta < useful_load_threshold)
6338 continue;
6339 /* update the load for the processor or package total */
6340- IRQ_DELTA(package_index,j) += delta;
6341+ IRQ_DELTA(package_index, j) += delta;
6342
6343 /* Keep track of the higher numbered sibling as well */
6344 if (i != package_index)
6345@@ -576,7 +584,8 @@ static void do_irq_balance(void)
6346 max_cpu_irq = ULONG_MAX;
6347
6348 tryanothercpu:
6349- /* Look for heaviest loaded processor.
6350+ /*
6351+ * Look for heaviest loaded processor.
6352 * We may come back to get the next heaviest loaded processor.
6353 * Skip processors with trivial loads.
6354 */
6355@@ -585,7 +594,7 @@ tryanothercpu:
6356 for_each_online_cpu(i) {
6357 if (i != CPU_TO_PACKAGEINDEX(i))
6358 continue;
6359- if (max_cpu_irq <= CPU_IRQ(i))
6360+ if (max_cpu_irq <= CPU_IRQ(i))
6361 continue;
6362 if (tmp_cpu_irq < CPU_IRQ(i)) {
6363 tmp_cpu_irq = CPU_IRQ(i);
6364@@ -594,8 +603,9 @@ tryanothercpu:
6365 }
6366
6367 if (tmp_loaded == -1) {
6368- /* In the case of small number of heavy interrupt sources,
6369- * loading some of the cpus too much. We use Ingo's original
6370+ /*
6371+ * In the case of small number of heavy interrupt sources,
6372+ * loading some of the cpus too much. We use Ingo's original
6373 * approach to rotate them around.
6374 */
6375 if (!first_attempt && imbalance >= useful_load_threshold) {
6376@@ -604,13 +614,14 @@ tryanothercpu:
6377 }
6378 goto not_worth_the_effort;
6379 }
6380-
6381+
6382 first_attempt = 0; /* heaviest search */
6383 max_cpu_irq = tmp_cpu_irq; /* load */
6384 max_loaded = tmp_loaded; /* processor */
6385 imbalance = (max_cpu_irq - min_cpu_irq) / 2;
6386-
6387- /* if imbalance is less than approx 10% of max load, then
6388+
6389+ /*
6390+ * if imbalance is less than approx 10% of max load, then
6391 * observe diminishing returns action. - quit
6392 */
6393 if (imbalance < (max_cpu_irq >> 3))
6394@@ -626,26 +637,25 @@ tryanotherirq:
6395 /* Is this an active IRQ? */
6396 if (!irq_desc[j].action)
6397 continue;
6398- if (imbalance <= IRQ_DELTA(max_loaded,j))
6399+ if (imbalance <= IRQ_DELTA(max_loaded, j))
6400 continue;
6401 /* Try to find the IRQ that is closest to the imbalance
6402 * without going over.
6403 */
6404- if (move_this_load < IRQ_DELTA(max_loaded,j)) {
6405- move_this_load = IRQ_DELTA(max_loaded,j);
6406+ if (move_this_load < IRQ_DELTA(max_loaded, j)) {
6407+ move_this_load = IRQ_DELTA(max_loaded, j);
6408 selected_irq = j;
6409 }
6410 }
6411- if (selected_irq == -1) {
6412+ if (selected_irq == -1)
6413 goto tryanothercpu;
6414- }
6415
6416 imbalance = move_this_load;
6417-
6418+
6419 /* For physical_balance case, we accumulated both load
6420 * values in the one of the siblings cpu_irq[],
6421 * to use the same code for physical and logical processors
6422- * as much as possible.
6423+ * as much as possible.
6424 *
6425 * NOTE: the cpu_irq[] array holds the sum of the load for
6426 * sibling A and sibling B in the slot for the lowest numbered
6427@@ -674,11 +684,11 @@ tryanotherirq:
6428 /* mark for change destination */
6429 set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded));
6430
6431- /* Since we made a change, come back sooner to
6432+ /* Since we made a change, come back sooner to
6433 * check for more variation.
6434 */
6435 balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
6436- balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
6437+ balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
6438 return;
6439 }
6440 goto tryanotherirq;
6441@@ -689,7 +699,7 @@ not_worth_the_effort:
6442 * upward
6443 */
6444 balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL,
6445- balanced_irq_interval + BALANCED_IRQ_MORE_DELTA);
6446+ balanced_irq_interval + BALANCED_IRQ_MORE_DELTA);
6447 return;
6448 }
6449
6450@@ -728,13 +738,13 @@ static int __init balanced_irq_init(void
6451 cpumask_t tmp;
6452
6453 cpus_shift_right(tmp, cpu_online_map, 2);
6454- c = &boot_cpu_data;
6455+ c = &boot_cpu_data;
6456 /* When not overwritten by the command line ask subarchitecture. */
6457 if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH)
6458 irqbalance_disabled = NO_BALANCE_IRQ;
6459 if (irqbalance_disabled)
6460 return 0;
6461-
6462+
6463 /* disable irqbalance completely if there is only one processor online */
6464 if (num_online_cpus() < 2) {
6465 irqbalance_disabled = 1;
6466@@ -748,16 +758,14 @@ static int __init balanced_irq_init(void
6467 physical_balance = 1;
6468
6469 for_each_online_cpu(i) {
6470- irq_cpu_data[i].irq_delta = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
6471- irq_cpu_data[i].last_irq = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
6472+ irq_cpu_data[i].irq_delta = kzalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
6473+ irq_cpu_data[i].last_irq = kzalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
6474 if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) {
6475 printk(KERN_ERR "balanced_irq_init: out of memory");
6476 goto failed;
6477 }
6478- memset(irq_cpu_data[i].irq_delta,0,sizeof(unsigned long) * NR_IRQS);
6479- memset(irq_cpu_data[i].last_irq,0,sizeof(unsigned long) * NR_IRQS);
6480 }
6481-
6482+
6483 printk(KERN_INFO "Starting balanced_irq\n");
6484 if (!IS_ERR(kthread_run(balanced_irq, NULL, "kirqd")))
6485 return 0;
6486@@ -799,7 +807,7 @@ void send_IPI_self(int vector)
6487 /*
6488 * Send the IPI. The write to APIC_ICR fires this off.
6489 */
6490- apic_write_around(APIC_ICR, cfg);
6491+ apic_write(APIC_ICR, cfg);
6492 #endif
6493 }
6494 #endif /* !CONFIG_SMP */
6495@@ -853,10 +861,10 @@ static int find_irq_entry(int apic, int
6496 int i;
6497
6498 for (i = 0; i < mp_irq_entries; i++)
6499- if (mp_irqs[i].mpc_irqtype == type &&
6500- (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
6501- mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
6502- mp_irqs[i].mpc_dstirq == pin)
6503+ if (mp_irqs[i].mp_irqtype == type &&
6504+ (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid ||
6505+ mp_irqs[i].mp_dstapic == MP_APIC_ALL) &&
6506+ mp_irqs[i].mp_dstirq == pin)
6507 return i;
6508
6509 return -1;
6510@@ -871,13 +879,13 @@ static int __init find_isa_irq_pin(int i
6511 int i;
6512
6513 for (i = 0; i < mp_irq_entries; i++) {
6514- int lbus = mp_irqs[i].mpc_srcbus;
6515+ int lbus = mp_irqs[i].mp_srcbus;
6516
6517 if (test_bit(lbus, mp_bus_not_pci) &&
6518- (mp_irqs[i].mpc_irqtype == type) &&
6519- (mp_irqs[i].mpc_srcbusirq == irq))
6520+ (mp_irqs[i].mp_irqtype == type) &&
6521+ (mp_irqs[i].mp_srcbusirq == irq))
6522
6523- return mp_irqs[i].mpc_dstirq;
6524+ return mp_irqs[i].mp_dstirq;
6525 }
6526 return -1;
6527 }
6528@@ -887,17 +895,17 @@ static int __init find_isa_irq_apic(int
6529 int i;
6530
6531 for (i = 0; i < mp_irq_entries; i++) {
6532- int lbus = mp_irqs[i].mpc_srcbus;
6533+ int lbus = mp_irqs[i].mp_srcbus;
6534
6535 if (test_bit(lbus, mp_bus_not_pci) &&
6536- (mp_irqs[i].mpc_irqtype == type) &&
6537- (mp_irqs[i].mpc_srcbusirq == irq))
6538+ (mp_irqs[i].mp_irqtype == type) &&
6539+ (mp_irqs[i].mp_srcbusirq == irq))
6540 break;
6541 }
6542 if (i < mp_irq_entries) {
6543 int apic;
6544- for(apic = 0; apic < nr_ioapics; apic++) {
6545- if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic)
6546+ for (apic = 0; apic < nr_ioapics; apic++) {
6547+ if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic)
6548 return apic;
6549 }
6550 }
6551@@ -918,28 +926,28 @@ int IO_APIC_get_PCI_irq_vector(int bus,
6552
6553 apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, "
6554 "slot:%d, pin:%d.\n", bus, slot, pin);
6555- if (mp_bus_id_to_pci_bus[bus] == -1) {
6556+ if (test_bit(bus, mp_bus_not_pci)) {
6557 printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
6558 return -1;
6559 }
6560 for (i = 0; i < mp_irq_entries; i++) {
6561- int lbus = mp_irqs[i].mpc_srcbus;
6562+ int lbus = mp_irqs[i].mp_srcbus;
6563
6564 for (apic = 0; apic < nr_ioapics; apic++)
6565- if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
6566- mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
6567+ if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic ||
6568+ mp_irqs[i].mp_dstapic == MP_APIC_ALL)
6569 break;
6570
6571 if (!test_bit(lbus, mp_bus_not_pci) &&
6572- !mp_irqs[i].mpc_irqtype &&
6573+ !mp_irqs[i].mp_irqtype &&
6574 (bus == lbus) &&
6575- (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
6576- int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
6577+ (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) {
6578+ int irq = pin_2_irq(i, apic, mp_irqs[i].mp_dstirq);
6579
6580 if (!(apic || IO_APIC_IRQ(irq)))
6581 continue;
6582
6583- if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
6584+ if (pin == (mp_irqs[i].mp_srcbusirq & 3))
6585 return irq;
6586 /*
6587 * Use the first all-but-pin matching entry as a
6588@@ -954,7 +962,7 @@ int IO_APIC_get_PCI_irq_vector(int bus,
6589 EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
6590
6591 /*
6592- * This function currently is only a helper for the i386 smp boot process where
6593+ * This function currently is only a helper for the i386 smp boot process where
6594 * we need to reprogram the ioredtbls to cater for the cpus which have come online
6595 * so mask in all cases should simply be TARGET_CPUS
6596 */
6597@@ -1008,7 +1016,7 @@ static int EISA_ELCR(unsigned int irq)
6598 * EISA conforming in the MP table, that means its trigger type must
6599 * be read in from the ELCR */
6600
6601-#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq))
6602+#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mp_srcbusirq))
6603 #define default_EISA_polarity(idx) default_ISA_polarity(idx)
6604
6605 /* PCI interrupts are always polarity one level triggered,
6606@@ -1025,118 +1033,115 @@ static int EISA_ELCR(unsigned int irq)
6607
6608 static int MPBIOS_polarity(int idx)
6609 {
6610- int bus = mp_irqs[idx].mpc_srcbus;
6611+ int bus = mp_irqs[idx].mp_srcbus;
6612 int polarity;
6613
6614 /*
6615 * Determine IRQ line polarity (high active or low active):
6616 */
6617- switch (mp_irqs[idx].mpc_irqflag & 3)
6618+ switch (mp_irqs[idx].mp_irqflag & 3) {
6619+ case 0: /* conforms, ie. bus-type dependent polarity */
6620 {
6621- case 0: /* conforms, ie. bus-type dependent polarity */
6622- {
6623- polarity = test_bit(bus, mp_bus_not_pci)?
6624- default_ISA_polarity(idx):
6625- default_PCI_polarity(idx);
6626- break;
6627- }
6628- case 1: /* high active */
6629- {
6630- polarity = 0;
6631- break;
6632- }
6633- case 2: /* reserved */
6634- {
6635- printk(KERN_WARNING "broken BIOS!!\n");
6636- polarity = 1;
6637- break;
6638- }
6639- case 3: /* low active */
6640- {
6641- polarity = 1;
6642- break;
6643- }
6644- default: /* invalid */
6645- {
6646- printk(KERN_WARNING "broken BIOS!!\n");
6647- polarity = 1;
6648- break;
6649- }
6650+ polarity = test_bit(bus, mp_bus_not_pci)?
6651+ default_ISA_polarity(idx):
6652+ default_PCI_polarity(idx);
6653+ break;
6654+ }
6655+ case 1: /* high active */
6656+ {
6657+ polarity = 0;
6658+ break;
6659+ }
6660+ case 2: /* reserved */
6661+ {
6662+ printk(KERN_WARNING "broken BIOS!!\n");
6663+ polarity = 1;
6664+ break;
6665+ }
6666+ case 3: /* low active */
6667+ {
6668+ polarity = 1;
6669+ break;
6670+ }
6671+ default: /* invalid */
6672+ {
6673+ printk(KERN_WARNING "broken BIOS!!\n");
6674+ polarity = 1;
6675+ break;
6676+ }
6677 }
6678 return polarity;
6679 }
6680
6681 static int MPBIOS_trigger(int idx)
6682 {
6683- int bus = mp_irqs[idx].mpc_srcbus;
6684+ int bus = mp_irqs[idx].mp_srcbus;
6685 int trigger;
6686
6687 /*
6688 * Determine IRQ trigger mode (edge or level sensitive):
6689 */
6690- switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
6691+ switch ((mp_irqs[idx].mp_irqflag>>2) & 3) {
6692+ case 0: /* conforms, ie. bus-type dependent */
6693 {
6694- case 0: /* conforms, ie. bus-type dependent */
6695- {
6696- trigger = test_bit(bus, mp_bus_not_pci)?
6697- default_ISA_trigger(idx):
6698- default_PCI_trigger(idx);
6699+ trigger = test_bit(bus, mp_bus_not_pci)?
6700+ default_ISA_trigger(idx):
6701+ default_PCI_trigger(idx);
6702 #if defined(CONFIG_EISA) || defined(CONFIG_MCA)
6703- switch (mp_bus_id_to_type[bus])
6704- {
6705- case MP_BUS_ISA: /* ISA pin */
6706- {
6707- /* set before the switch */
6708- break;
6709- }
6710- case MP_BUS_EISA: /* EISA pin */
6711- {
6712- trigger = default_EISA_trigger(idx);
6713- break;
6714- }
6715- case MP_BUS_PCI: /* PCI pin */
6716- {
6717- /* set before the switch */
6718- break;
6719- }
6720- case MP_BUS_MCA: /* MCA pin */
6721- {
6722- trigger = default_MCA_trigger(idx);
6723- break;
6724- }
6725- default:
6726- {
6727- printk(KERN_WARNING "broken BIOS!!\n");
6728- trigger = 1;
6729- break;
6730- }
6731- }
6732-#endif
6733+ switch (mp_bus_id_to_type[bus]) {
6734+ case MP_BUS_ISA: /* ISA pin */
6735+ {
6736+ /* set before the switch */
6737 break;
6738 }
6739- case 1: /* edge */
6740+ case MP_BUS_EISA: /* EISA pin */
6741 {
6742- trigger = 0;
6743+ trigger = default_EISA_trigger(idx);
6744 break;
6745 }
6746- case 2: /* reserved */
6747+ case MP_BUS_PCI: /* PCI pin */
6748 {
6749- printk(KERN_WARNING "broken BIOS!!\n");
6750- trigger = 1;
6751+ /* set before the switch */
6752 break;
6753 }
6754- case 3: /* level */
6755+ case MP_BUS_MCA: /* MCA pin */
6756 {
6757- trigger = 1;
6758+ trigger = default_MCA_trigger(idx);
6759 break;
6760 }
6761- default: /* invalid */
6762+ default:
6763 {
6764 printk(KERN_WARNING "broken BIOS!!\n");
6765- trigger = 0;
6766+ trigger = 1;
6767 break;
6768 }
6769 }
6770+#endif
6771+ break;
6772+ }
6773+ case 1: /* edge */
6774+ {
6775+ trigger = 0;
6776+ break;
6777+ }
6778+ case 2: /* reserved */
6779+ {
6780+ printk(KERN_WARNING "broken BIOS!!\n");
6781+ trigger = 1;
6782+ break;
6783+ }
6784+ case 3: /* level */
6785+ {
6786+ trigger = 1;
6787+ break;
6788+ }
6789+ default: /* invalid */
6790+ {
6791+ printk(KERN_WARNING "broken BIOS!!\n");
6792+ trigger = 0;
6793+ break;
6794+ }
6795+ }
6796 return trigger;
6797 }
6798
6799@@ -1153,16 +1158,16 @@ static inline int irq_trigger(int idx)
6800 static int pin_2_irq(int idx, int apic, int pin)
6801 {
6802 int irq, i;
6803- int bus = mp_irqs[idx].mpc_srcbus;
6804+ int bus = mp_irqs[idx].mp_srcbus;
6805
6806 /*
6807 * Debugging check, we are in big trouble if this message pops up!
6808 */
6809- if (mp_irqs[idx].mpc_dstirq != pin)
6810+ if (mp_irqs[idx].mp_dstirq != pin)
6811 printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
6812
6813 if (test_bit(bus, mp_bus_not_pci))
6814- irq = mp_irqs[idx].mpc_srcbusirq;
6815+ irq = mp_irqs[idx].mp_srcbusirq;
6816 else {
6817 /*
6818 * PCI IRQs are mapped in order
6819@@ -1204,8 +1209,8 @@ static inline int IO_APIC_irq_trigger(in
6820
6821 for (apic = 0; apic < nr_ioapics; apic++) {
6822 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
6823- idx = find_irq_entry(apic,pin,mp_INT);
6824- if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin)))
6825+ idx = find_irq_entry(apic, pin, mp_INT);
6826+ if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin)))
6827 return irq_trigger(idx);
6828 }
6829 }
6830@@ -1291,25 +1296,25 @@ static void __init setup_IO_APIC_irqs(vo
6831 /*
6832 * add it to the IO-APIC irq-routing table:
6833 */
6834- memset(&entry,0,sizeof(entry));
6835+ memset(&entry, 0, sizeof(entry));
6836
6837 entry.delivery_mode = INT_DELIVERY_MODE;
6838 entry.dest_mode = INT_DEST_MODE;
6839 entry.mask = 0; /* enable IRQ */
6840- entry.dest.logical.logical_dest =
6841+ entry.dest.logical.logical_dest =
6842 cpu_mask_to_apicid(TARGET_CPUS);
6843
6844- idx = find_irq_entry(apic,pin,mp_INT);
6845+ idx = find_irq_entry(apic, pin, mp_INT);
6846 if (idx == -1) {
6847 if (first_notcon) {
6848 apic_printk(APIC_VERBOSE, KERN_DEBUG
6849 " IO-APIC (apicid-pin) %d-%d",
6850- mp_ioapics[apic].mpc_apicid,
6851+ mp_ioapics[apic].mp_apicid,
6852 pin);
6853 first_notcon = 0;
6854 } else
6855 apic_printk(APIC_VERBOSE, ", %d-%d",
6856- mp_ioapics[apic].mpc_apicid, pin);
6857+ mp_ioapics[apic].mp_apicid, pin);
6858 continue;
6859 }
6860
6861@@ -1343,7 +1348,7 @@ static void __init setup_IO_APIC_irqs(vo
6862 vector = assign_irq_vector(irq);
6863 entry.vector = vector;
6864 ioapic_register_intr(irq, vector, IOAPIC_AUTO);
6865-
6866+
6867 if (!apic && (irq < 16))
6868 disable_8259A_irq(irq);
6869 }
6870@@ -1355,27 +1360,23 @@ static void __init setup_IO_APIC_irqs(vo
6871 apic_printk(APIC_VERBOSE, " not connected.\n");
6872 }
6873
6874+#ifndef CONFIG_XEN
6875 /*
6876- * Set up the 8259A-master output pin:
6877+ * Set up the timer pin, possibly with the 8259A-master behind.
6878 */
6879-#ifndef CONFIG_XEN
6880-static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
6881+static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
6882+ int vector)
6883 {
6884 struct IO_APIC_route_entry entry;
6885
6886- memset(&entry,0,sizeof(entry));
6887-
6888- disable_8259A_irq(0);
6889-
6890- /* mask LVT0 */
6891- apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
6892+ memset(&entry, 0, sizeof(entry));
6893
6894 /*
6895 * We use logical delivery to get the timer IRQ
6896 * to the first CPU.
6897 */
6898 entry.dest_mode = INT_DEST_MODE;
6899- entry.mask = 0; /* unmask IRQ now */
6900+ entry.mask = 1; /* mask IRQ now */
6901 entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
6902 entry.delivery_mode = INT_DELIVERY_MODE;
6903 entry.polarity = 0;
6904@@ -1384,17 +1385,14 @@ static void __init setup_ExtINT_IRQ0_pin
6905
6906 /*
6907 * The timer IRQ doesn't have to know that behind the
6908- * scene we have a 8259A-master in AEOI mode ...
6909+ * scene we may have a 8259A-master in AEOI mode ...
6910 */
6911- irq_desc[0].chip = &ioapic_chip;
6912- set_irq_handler(0, handle_edge_irq);
6913+ ioapic_register_intr(0, vector, IOAPIC_EDGE);
6914
6915 /*
6916 * Add it to the IO-APIC irq-routing table:
6917 */
6918 ioapic_write_entry(apic, pin, entry);
6919-
6920- enable_8259A_irq(0);
6921 }
6922
6923 void __init print_IO_APIC(void)
6924@@ -1409,10 +1407,10 @@ void __init print_IO_APIC(void)
6925 if (apic_verbosity == APIC_QUIET)
6926 return;
6927
6928- printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
6929+ printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
6930 for (i = 0; i < nr_ioapics; i++)
6931 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
6932- mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
6933+ mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]);
6934
6935 /*
6936 * We are a bit conservative about what we expect. We have to
6937@@ -1431,7 +1429,7 @@ void __init print_IO_APIC(void)
6938 reg_03.raw = io_apic_read(apic, 3);
6939 spin_unlock_irqrestore(&ioapic_lock, flags);
6940
6941- printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
6942+ printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid);
6943 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
6944 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
6945 printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
6946@@ -1512,7 +1510,7 @@ void __init print_IO_APIC(void)
6947 return;
6948 }
6949
6950-static void print_APIC_bitfield (int base)
6951+static void print_APIC_bitfield(int base)
6952 {
6953 unsigned int v;
6954 int i, j;
6955@@ -1533,7 +1531,7 @@ static void print_APIC_bitfield (int bas
6956 }
6957 }
6958
6959-void /*__init*/ print_local_APIC(void * dummy)
6960+void /*__init*/ print_local_APIC(void *dummy)
6961 {
6962 unsigned int v, ver, maxlvt;
6963
6964@@ -1542,6 +1540,7 @@ void /*__init*/ print_local_APIC(void *
6965
6966 printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
6967 smp_processor_id(), hard_smp_processor_id());
6968+ v = apic_read(APIC_ID);
6969 printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v,
6970 GET_APIC_ID(read_apic_id()));
6971 v = apic_read(APIC_LVR);
6972@@ -1616,9 +1615,9 @@ void /*__init*/ print_local_APIC(void *
6973 printk("\n");
6974 }
6975
6976-void print_all_local_APICs (void)
6977+void print_all_local_APICs(void)
6978 {
6979- on_each_cpu(print_local_APIC, NULL, 1, 1);
6980+ on_each_cpu(print_local_APIC, NULL, 1);
6981 }
6982
6983 void /*__init*/ print_PIC(void)
6984@@ -1639,11 +1638,11 @@ void /*__init*/ print_PIC(void)
6985 v = inb(0xa0) << 8 | inb(0x20);
6986 printk(KERN_DEBUG "... PIC IRR: %04x\n", v);
6987
6988- outb(0x0b,0xa0);
6989- outb(0x0b,0x20);
6990+ outb(0x0b, 0xa0);
6991+ outb(0x0b, 0x20);
6992 v = inb(0xa0) << 8 | inb(0x20);
6993- outb(0x0a,0xa0);
6994- outb(0x0a,0x20);
6995+ outb(0x0a, 0xa0);
6996+ outb(0x0a, 0x20);
6997
6998 spin_unlock_irqrestore(&i8259A_lock, flags);
6999
7000@@ -1652,6 +1651,8 @@ void /*__init*/ print_PIC(void)
7001 v = inb(0x4d1) << 8 | inb(0x4d0);
7002 printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
7003 }
7004+#else
7005+void __init print_IO_APIC(void) {}
7006 #endif /* !CONFIG_XEN */
7007
7008 static void __init enable_IO_APIC(void)
7009@@ -1681,7 +1682,7 @@ static void __init enable_IO_APIC(void)
7010 nr_ioapic_registers[apic] = reg_01.bits.entries+1;
7011 }
7012 #ifndef CONFIG_XEN
7013- for(apic = 0; apic < nr_ioapics; apic++) {
7014+ for (apic = 0; apic < nr_ioapics; apic++) {
7015 int pin;
7016 /* See if any of the pins is in ExtINT mode */
7017 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
7018@@ -1774,7 +1775,7 @@ void disable_IO_APIC(void)
7019 * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
7020 */
7021
7022-#if !defined(CONFIG_XEN) && !defined(CONFIG_X86_NUMAQ)
7023+#ifndef CONFIG_XEN
7024 static void __init setup_ioapic_ids_from_mpc(void)
7025 {
7026 union IO_APIC_reg_00 reg_00;
7027@@ -1784,6 +1785,11 @@ static void __init setup_ioapic_ids_from
7028 unsigned char old_id;
7029 unsigned long flags;
7030
7031+#ifdef CONFIG_X86_NUMAQ
7032+ if (found_numaq)
7033+ return;
7034+#endif
7035+
7036 /*
7037 * Don't check I/O APIC IDs for xAPIC systems. They have
7038 * no meaning without the serial APIC bus.
7039@@ -1806,15 +1812,15 @@ static void __init setup_ioapic_ids_from
7040 spin_lock_irqsave(&ioapic_lock, flags);
7041 reg_00.raw = io_apic_read(apic, 0);
7042 spin_unlock_irqrestore(&ioapic_lock, flags);
7043-
7044- old_id = mp_ioapics[apic].mpc_apicid;
7045
7046- if (mp_ioapics[apic].mpc_apicid >= get_physical_broadcast()) {
7047+ old_id = mp_ioapics[apic].mp_apicid;
7048+
7049+ if (mp_ioapics[apic].mp_apicid >= get_physical_broadcast()) {
7050 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
7051- apic, mp_ioapics[apic].mpc_apicid);
7052+ apic, mp_ioapics[apic].mp_apicid);
7053 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
7054 reg_00.bits.ID);
7055- mp_ioapics[apic].mpc_apicid = reg_00.bits.ID;
7056+ mp_ioapics[apic].mp_apicid = reg_00.bits.ID;
7057 }
7058
7059 /*
7060@@ -1823,9 +1829,9 @@ static void __init setup_ioapic_ids_from
7061 * 'stuck on smp_invalidate_needed IPI wait' messages.
7062 */
7063 if (check_apicid_used(phys_id_present_map,
7064- mp_ioapics[apic].mpc_apicid)) {
7065+ mp_ioapics[apic].mp_apicid)) {
7066 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
7067- apic, mp_ioapics[apic].mpc_apicid);
7068+ apic, mp_ioapics[apic].mp_apicid);
7069 for (i = 0; i < get_physical_broadcast(); i++)
7070 if (!physid_isset(i, phys_id_present_map))
7071 break;
7072@@ -1834,13 +1840,13 @@ static void __init setup_ioapic_ids_from
7073 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
7074 i);
7075 physid_set(i, phys_id_present_map);
7076- mp_ioapics[apic].mpc_apicid = i;
7077+ mp_ioapics[apic].mp_apicid = i;
7078 } else {
7079 physid_mask_t tmp;
7080- tmp = apicid_to_cpu_present(mp_ioapics[apic].mpc_apicid);
7081+ tmp = apicid_to_cpu_present(mp_ioapics[apic].mp_apicid);
7082 apic_printk(APIC_VERBOSE, "Setting %d in the "
7083 "phys_id_present_map\n",
7084- mp_ioapics[apic].mpc_apicid);
7085+ mp_ioapics[apic].mp_apicid);
7086 physids_or(phys_id_present_map, phys_id_present_map, tmp);
7087 }
7088
7089@@ -1849,21 +1855,21 @@ static void __init setup_ioapic_ids_from
7090 * We need to adjust the IRQ routing table
7091 * if the ID changed.
7092 */
7093- if (old_id != mp_ioapics[apic].mpc_apicid)
7094+ if (old_id != mp_ioapics[apic].mp_apicid)
7095 for (i = 0; i < mp_irq_entries; i++)
7096- if (mp_irqs[i].mpc_dstapic == old_id)
7097- mp_irqs[i].mpc_dstapic
7098- = mp_ioapics[apic].mpc_apicid;
7099+ if (mp_irqs[i].mp_dstapic == old_id)
7100+ mp_irqs[i].mp_dstapic
7101+ = mp_ioapics[apic].mp_apicid;
7102
7103 /*
7104 * Read the right value from the MPC table and
7105 * write it into the ID register.
7106- */
7107+ */
7108 apic_printk(APIC_VERBOSE, KERN_INFO
7109 "...changing IO-APIC physical APIC ID to %d ...",
7110- mp_ioapics[apic].mpc_apicid);
7111+ mp_ioapics[apic].mp_apicid);
7112
7113- reg_00.bits.ID = mp_ioapics[apic].mpc_apicid;
7114+ reg_00.bits.ID = mp_ioapics[apic].mp_apicid;
7115 spin_lock_irqsave(&ioapic_lock, flags);
7116 io_apic_write(apic, 0, reg_00.raw);
7117 spin_unlock_irqrestore(&ioapic_lock, flags);
7118@@ -1874,17 +1880,13 @@ static void __init setup_ioapic_ids_from
7119 spin_lock_irqsave(&ioapic_lock, flags);
7120 reg_00.raw = io_apic_read(apic, 0);
7121 spin_unlock_irqrestore(&ioapic_lock, flags);
7122- if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid)
7123+ if (reg_00.bits.ID != mp_ioapics[apic].mp_apicid)
7124 printk("could not set ID!\n");
7125 else
7126 apic_printk(APIC_VERBOSE, " ok.\n");
7127 }
7128 }
7129-#else
7130-static void __init setup_ioapic_ids_from_mpc(void) { }
7131-#endif
7132
7133-#ifndef CONFIG_XEN
7134 int no_timer_check __initdata;
7135
7136 static int __init notimercheck(char *s)
7137@@ -2077,45 +2079,53 @@ static inline void init_IO_APIC_traps(vo
7138 * The local APIC irq-chip implementation:
7139 */
7140
7141-static void ack_apic(unsigned int irq)
7142+static void ack_lapic_irq(unsigned int irq)
7143 {
7144 ack_APIC_irq();
7145 }
7146
7147-static void mask_lapic_irq (unsigned int irq)
7148+static void mask_lapic_irq(unsigned int irq)
7149 {
7150 unsigned long v;
7151
7152 v = apic_read(APIC_LVT0);
7153- apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
7154+ apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
7155 }
7156
7157-static void unmask_lapic_irq (unsigned int irq)
7158+static void unmask_lapic_irq(unsigned int irq)
7159 {
7160 unsigned long v;
7161
7162 v = apic_read(APIC_LVT0);
7163- apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED);
7164+ apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
7165 }
7166
7167 static struct irq_chip lapic_chip __read_mostly = {
7168- .name = "local-APIC-edge",
7169+ .name = "local-APIC",
7170 .mask = mask_lapic_irq,
7171 .unmask = unmask_lapic_irq,
7172- .eoi = ack_apic,
7173+ .ack = ack_lapic_irq,
7174 };
7175
7176+static void lapic_register_intr(int irq, int vector)
7177+{
7178+ irq_desc[irq].status &= ~IRQ_LEVEL;
7179+ set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
7180+ "edge");
7181+ set_intr_gate(vector, interrupt[irq]);
7182+}
7183+
7184 static void __init setup_nmi(void)
7185 {
7186 /*
7187- * Dirty trick to enable the NMI watchdog ...
7188+ * Dirty trick to enable the NMI watchdog ...
7189 * We put the 8259A master into AEOI mode and
7190 * unmask on all local APICs LVT0 as NMI.
7191 *
7192 * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
7193 * is from Maciej W. Rozycki - so we do not have to EOI from
7194 * the NMI handler or the timer interrupt.
7195- */
7196+ */
7197 apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
7198
7199 enable_NMI_through_LVT0();
7200@@ -2191,11 +2201,16 @@ static inline void __init unlock_ExtINT_
7201 static inline void __init check_timer(void)
7202 {
7203 int apic1, pin1, apic2, pin2;
7204+ int no_pin1 = 0;
7205 int vector;
7206+ unsigned int ver;
7207 unsigned long flags;
7208
7209 local_irq_save(flags);
7210
7211+ ver = apic_read(APIC_LVR);
7212+ ver = GET_APIC_VERSION(ver);
7213+
7214 /*
7215 * get/set the timer IRQ vector:
7216 */
7217@@ -2204,34 +2219,54 @@ static inline void __init check_timer(vo
7218 set_intr_gate(vector, interrupt[0]);
7219
7220 /*
7221- * Subtle, code in do_timer_interrupt() expects an AEOI
7222- * mode for the 8259A whenever interrupts are routed
7223- * through I/O APICs. Also IRQ0 has to be enabled in
7224- * the 8259A which implies the virtual wire has to be
7225- * disabled in the local APIC.
7226+ * As IRQ0 is to be enabled in the 8259A, the virtual
7227+ * wire has to be disabled in the local APIC. Also
7228+ * timer interrupts need to be acknowledged manually in
7229+ * the 8259A for the i82489DX when using the NMI
7230+ * watchdog as that APIC treats NMIs as level-triggered.
7231+ * The AEOI mode will finish them in the 8259A
7232+ * automatically.
7233 */
7234- apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
7235+ apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
7236 init_8259A(1);
7237- timer_ack = 1;
7238- if (timer_over_8254 > 0)
7239- enable_8259A_irq(0);
7240+ timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
7241
7242 pin1 = find_isa_irq_pin(0, mp_INT);
7243 apic1 = find_isa_irq_apic(0, mp_INT);
7244 pin2 = ioapic_i8259.pin;
7245 apic2 = ioapic_i8259.apic;
7246
7247- printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
7248- vector, apic1, pin1, apic2, pin2);
7249+ apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X "
7250+ "apic1=%d pin1=%d apic2=%d pin2=%d\n",
7251+ vector, apic1, pin1, apic2, pin2);
7252+
7253+ /*
7254+ * Some BIOS writers are clueless and report the ExtINTA
7255+ * I/O APIC input from the cascaded 8259A as the timer
7256+ * interrupt input. So just in case, if only one pin
7257+ * was found above, try it both directly and through the
7258+ * 8259A.
7259+ */
7260+ if (pin1 == -1) {
7261+ pin1 = pin2;
7262+ apic1 = apic2;
7263+ no_pin1 = 1;
7264+ } else if (pin2 == -1) {
7265+ pin2 = pin1;
7266+ apic2 = apic1;
7267+ }
7268
7269 if (pin1 != -1) {
7270 /*
7271 * Ok, does IRQ0 through the IOAPIC work?
7272 */
7273+ if (no_pin1) {
7274+ add_pin_to_irq(0, apic1, pin1);
7275+ setup_timer_IRQ0_pin(apic1, pin1, vector);
7276+ }
7277 unmask_IO_APIC_irq(0);
7278 if (timer_irq_works()) {
7279 if (nmi_watchdog == NMI_IO_APIC) {
7280- disable_8259A_irq(0);
7281 setup_nmi();
7282 enable_8259A_irq(0);
7283 }
7284@@ -2240,71 +2275,77 @@ static inline void __init check_timer(vo
7285 goto out;
7286 }
7287 clear_IO_APIC_pin(apic1, pin1);
7288- printk(KERN_ERR "..MP-BIOS bug: 8254 timer not connected to "
7289- "IO-APIC\n");
7290- }
7291-
7292- printk(KERN_INFO "...trying to set up timer (IRQ0) through the 8259A ... ");
7293- if (pin2 != -1) {
7294- printk("\n..... (found pin %d) ...", pin2);
7295+ if (!no_pin1)
7296+ apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
7297+ "8254 timer not connected to IO-APIC\n");
7298+
7299+ apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer "
7300+ "(IRQ0) through the 8259A ...\n");
7301+ apic_printk(APIC_QUIET, KERN_INFO
7302+ "..... (found apic %d pin %d) ...\n", apic2, pin2);
7303 /*
7304 * legacy devices should be connected to IO APIC #0
7305 */
7306- setup_ExtINT_IRQ0_pin(apic2, pin2, vector);
7307+ replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
7308+ setup_timer_IRQ0_pin(apic2, pin2, vector);
7309+ unmask_IO_APIC_irq(0);
7310+ enable_8259A_irq(0);
7311 if (timer_irq_works()) {
7312- printk("works.\n");
7313- if (pin1 != -1)
7314- replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
7315- else
7316- add_pin_to_irq(0, apic2, pin2);
7317+ apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
7318+ timer_through_8259 = 1;
7319 if (nmi_watchdog == NMI_IO_APIC) {
7320+ disable_8259A_irq(0);
7321 setup_nmi();
7322+ enable_8259A_irq(0);
7323 }
7324 goto out;
7325 }
7326 /*
7327 * Cleanup, just in case ...
7328 */
7329+ disable_8259A_irq(0);
7330 clear_IO_APIC_pin(apic2, pin2);
7331+ apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
7332 }
7333- printk(" failed.\n");
7334
7335 if (nmi_watchdog == NMI_IO_APIC) {
7336- printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
7337- nmi_watchdog = 0;
7338+ apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work "
7339+ "through the IO-APIC - disabling NMI Watchdog!\n");
7340+ nmi_watchdog = NMI_NONE;
7341 }
7342+ timer_ack = 0;
7343
7344- printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
7345+ apic_printk(APIC_QUIET, KERN_INFO
7346+ "...trying to set up timer as Virtual Wire IRQ...\n");
7347
7348- disable_8259A_irq(0);
7349- set_irq_chip_and_handler_name(0, &lapic_chip, handle_fasteoi_irq,
7350- "fasteoi");
7351- apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
7352+ lapic_register_intr(0, vector);
7353+ apic_write(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
7354 enable_8259A_irq(0);
7355
7356 if (timer_irq_works()) {
7357- printk(" works.\n");
7358+ apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
7359 goto out;
7360 }
7361- apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
7362- printk(" failed.\n");
7363+ disable_8259A_irq(0);
7364+ apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
7365+ apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
7366
7367- printk(KERN_INFO "...trying to set up timer as ExtINT IRQ...");
7368+ apic_printk(APIC_QUIET, KERN_INFO
7369+ "...trying to set up timer as ExtINT IRQ...\n");
7370
7371- timer_ack = 0;
7372 init_8259A(0);
7373 make_8259A_irq(0);
7374- apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
7375+ apic_write(APIC_LVT0, APIC_DM_EXTINT);
7376
7377 unlock_ExtINT_logic();
7378
7379 if (timer_irq_works()) {
7380- printk(" works.\n");
7381+ apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
7382 goto out;
7383 }
7384- printk(" failed :(.\n");
7385+ apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
7386 panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
7387- "report. Then try booting with the 'noapic' option");
7388+ "report. Then try booting with the 'noapic' option.\n");
7389 out:
7390 local_irq_restore(flags);
7391 }
7392@@ -2314,11 +2355,21 @@ int timer_uses_ioapic_pin_0 = 0;
7393 #endif
7394
7395 /*
7396- *
7397- * IRQ's that are handled by the PIC in the MPS IOAPIC case.
7398- * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ.
7399- * Linux doesn't really care, as it's not actually used
7400- * for any interrupt handling anyway.
7401+ * Traditionally ISA IRQ2 is the cascade IRQ, and is not available
7402+ * to devices. However there may be an I/O APIC pin available for
7403+ * this interrupt regardless. The pin may be left unconnected, but
7404+ * typically it will be reused as an ExtINT cascade interrupt for
7405+ * the master 8259A. In the MPS case such a pin will normally be
7406+ * reported as an ExtINT interrupt in the MP table. With ACPI
7407+ * there is no provision for ExtINT interrupts, and in the absence
7408+ * of an override it would be treated as an ordinary ISA I/O APIC
7409+ * interrupt, that is edge-triggered and unmasked by default. We
7410+ * used to do this, but it caused problems on some systems because
7411+ * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using
7412+ * the same ExtINT cascade interrupt to drive the local APIC of the
7413+ * bootstrap processor. Therefore we refrain from routing IRQ2 to
7414+ * the I/O APIC in all cases now. No actual device should request
7415+ * it anyway. --macro
7416 */
7417 #define PIC_IRQS (1 << PIC_CASCADE_IR)
7418
7419@@ -2328,25 +2379,22 @@ void __init setup_IO_APIC(void)
7420 int i;
7421
7422 /* Reserve all the system vectors. */
7423- for (i = FIRST_SYSTEM_VECTOR; i < NR_VECTORS; i++)
7424+ for (i = first_system_vector; i < NR_VECTORS; i++)
7425 set_bit(i, used_vectors);
7426 #endif
7427
7428 enable_IO_APIC();
7429
7430- if (acpi_ioapic)
7431- io_apic_irqs = ~0; /* all IRQs go through IOAPIC */
7432- else
7433- io_apic_irqs = ~PIC_IRQS;
7434+ io_apic_irqs = ~PIC_IRQS;
7435
7436 printk("ENABLING IO-APIC IRQs\n");
7437
7438+#ifndef CONFIG_XEN
7439 /*
7440 * Set up IO-APIC IRQ routing.
7441 */
7442 if (!acpi_ioapic)
7443 setup_ioapic_ids_from_mpc();
7444-#ifndef CONFIG_XEN
7445 sync_Arb_IDs();
7446 #endif
7447 setup_IO_APIC_irqs();
7448@@ -2356,28 +2404,14 @@ void __init setup_IO_APIC(void)
7449 print_IO_APIC();
7450 }
7451
7452-static int __init setup_disable_8254_timer(char *s)
7453-{
7454- timer_over_8254 = -1;
7455- return 1;
7456-}
7457-static int __init setup_enable_8254_timer(char *s)
7458-{
7459- timer_over_8254 = 2;
7460- return 1;
7461-}
7462-
7463-__setup("disable_8254_timer", setup_disable_8254_timer);
7464-__setup("enable_8254_timer", setup_enable_8254_timer);
7465-
7466 /*
7467 * Called after all the initialization is done. If we didnt find any
7468 * APIC bugs then we can allow the modify fast path
7469 */
7470-
7471+
7472 static int __init io_apic_bug_finalize(void)
7473 {
7474- if(sis_apic_bug == -1)
7475+ if (sis_apic_bug == -1)
7476 sis_apic_bug = 0;
7477 if (is_initial_xendomain()) {
7478 struct xen_platform_op op = { .cmd = XENPF_platform_quirk };
7479@@ -2396,17 +2430,17 @@ struct sysfs_ioapic_data {
7480 struct sys_device dev;
7481 struct IO_APIC_route_entry entry[0];
7482 };
7483-static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS];
7484+static struct sysfs_ioapic_data *mp_ioapic_data[MAX_IO_APICS];
7485
7486 static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
7487 {
7488 struct IO_APIC_route_entry *entry;
7489 struct sysfs_ioapic_data *data;
7490 int i;
7491-
7492+
7493 data = container_of(dev, struct sysfs_ioapic_data, dev);
7494 entry = data->entry;
7495- for (i = 0; i < nr_ioapic_registers[dev->id]; i ++)
7496+ for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
7497 entry[i] = ioapic_read_entry(dev->id, i);
7498
7499 return 0;
7500@@ -2419,18 +2453,18 @@ static int ioapic_resume(struct sys_devi
7501 unsigned long flags;
7502 union IO_APIC_reg_00 reg_00;
7503 int i;
7504-
7505+
7506 data = container_of(dev, struct sysfs_ioapic_data, dev);
7507 entry = data->entry;
7508
7509 spin_lock_irqsave(&ioapic_lock, flags);
7510 reg_00.raw = io_apic_read(dev->id, 0);
7511- if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
7512- reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
7513+ if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) {
7514+ reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid;
7515 io_apic_write(dev->id, 0, reg_00.raw);
7516 }
7517 spin_unlock_irqrestore(&ioapic_lock, flags);
7518- for (i = 0; i < nr_ioapic_registers[dev->id]; i ++)
7519+ for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
7520 ioapic_write_entry(dev->id, i, entry[i]);
7521
7522 return 0;
7523@@ -2444,24 +2478,23 @@ static struct sysdev_class ioapic_sysdev
7524
7525 static int __init ioapic_init_sysfs(void)
7526 {
7527- struct sys_device * dev;
7528+ struct sys_device *dev;
7529 int i, size, error = 0;
7530
7531 error = sysdev_class_register(&ioapic_sysdev_class);
7532 if (error)
7533 return error;
7534
7535- for (i = 0; i < nr_ioapics; i++ ) {
7536- size = sizeof(struct sys_device) + nr_ioapic_registers[i]
7537+ for (i = 0; i < nr_ioapics; i++) {
7538+ size = sizeof(struct sys_device) + nr_ioapic_registers[i]
7539 * sizeof(struct IO_APIC_route_entry);
7540- mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL);
7541+ mp_ioapic_data[i] = kzalloc(size, GFP_KERNEL);
7542 if (!mp_ioapic_data[i]) {
7543 printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
7544 continue;
7545 }
7546- memset(mp_ioapic_data[i], 0, size);
7547 dev = &mp_ioapic_data[i]->dev;
7548- dev->id = i;
7549+ dev->id = i;
7550 dev->cls = &ioapic_sysdev_class;
7551 error = sysdev_register(dev);
7552 if (error) {
7553@@ -2538,7 +2571,7 @@ static int msi_compose_msg(struct pci_de
7554 msg->address_lo =
7555 MSI_ADDR_BASE_LO |
7556 ((INT_DEST_MODE == 0) ?
7557- MSI_ADDR_DEST_MODE_PHYSICAL:
7558+MSI_ADDR_DEST_MODE_PHYSICAL:
7559 MSI_ADDR_DEST_MODE_LOGICAL) |
7560 ((INT_DELIVERY_MODE != dest_LowestPrio) ?
7561 MSI_ADDR_REDIRECTION_CPU:
7562@@ -2549,7 +2582,7 @@ static int msi_compose_msg(struct pci_de
7563 MSI_DATA_TRIGGER_EDGE |
7564 MSI_DATA_LEVEL_ASSERT |
7565 ((INT_DELIVERY_MODE != dest_LowestPrio) ?
7566- MSI_DATA_DELIVERY_FIXED:
7567+MSI_DATA_DELIVERY_FIXED:
7568 MSI_DATA_DELIVERY_LOWPRI) |
7569 MSI_DATA_VECTOR(vector);
7570 }
7571@@ -2720,12 +2753,12 @@ int arch_setup_ht_irq(unsigned int irq,
7572 #endif /* CONFIG_HT_IRQ */
7573
7574 /* --------------------------------------------------------------------------
7575- ACPI-based IOAPIC Configuration
7576+ ACPI-based IOAPIC Configuration
7577 -------------------------------------------------------------------------- */
7578
7579 #ifdef CONFIG_ACPI
7580
7581-int __init io_apic_get_unique_id (int ioapic, int apic_id)
7582+int __init io_apic_get_unique_id(int ioapic, int apic_id)
7583 {
7584 #ifndef CONFIG_XEN
7585 union IO_APIC_reg_00 reg_00;
7586@@ -2735,10 +2768,10 @@ int __init io_apic_get_unique_id (int io
7587 int i = 0;
7588
7589 /*
7590- * The P4 platform supports up to 256 APIC IDs on two separate APIC
7591- * buses (one for LAPICs, one for IOAPICs), where predecessors only
7592+ * The P4 platform supports up to 256 APIC IDs on two separate APIC
7593+ * buses (one for LAPICs, one for IOAPICs), where predecessors only
7594 * supports up to 16 on one shared APIC bus.
7595- *
7596+ *
7597 * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
7598 * advantage of new APIC bus architecture.
7599 */
7600@@ -2757,7 +2790,7 @@ int __init io_apic_get_unique_id (int io
7601 }
7602
7603 /*
7604- * Every APIC in a system must have a unique ID or we get lots of nice
7605+ * Every APIC in a system must have a unique ID or we get lots of nice
7606 * 'stuck on smp_invalidate_needed IPI wait' messages.
7607 */
7608 if (check_apicid_used(apic_id_map, apic_id)) {
7609@@ -2774,7 +2807,7 @@ int __init io_apic_get_unique_id (int io
7610 "trying %d\n", ioapic, apic_id, i);
7611
7612 apic_id = i;
7613- }
7614+ }
7615
7616 tmp = apicid_to_cpu_present(apic_id);
7617 physids_or(apic_id_map, apic_id_map, tmp);
7618@@ -2802,7 +2835,7 @@ int __init io_apic_get_unique_id (int io
7619 }
7620
7621
7622-int __init io_apic_get_version (int ioapic)
7623+int __init io_apic_get_version(int ioapic)
7624 {
7625 union IO_APIC_reg_01 reg_01;
7626 unsigned long flags;
7627@@ -2815,7 +2848,7 @@ int __init io_apic_get_version (int ioap
7628 }
7629
7630
7631-int __init io_apic_get_redir_entries (int ioapic)
7632+int __init io_apic_get_redir_entries(int ioapic)
7633 {
7634 union IO_APIC_reg_01 reg_01;
7635 unsigned long flags;
7636@@ -2828,7 +2861,7 @@ int __init io_apic_get_redir_entries (in
7637 }
7638
7639
7640-int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low)
7641+int io_apic_set_pci_routing(int ioapic, int pin, int irq, int edge_level, int active_high_low)
7642 {
7643 struct IO_APIC_route_entry entry;
7644
7645@@ -2844,7 +2877,7 @@ int io_apic_set_pci_routing (int ioapic,
7646 * corresponding device driver registers for this IRQ.
7647 */
7648
7649- memset(&entry,0,sizeof(entry));
7650+ memset(&entry, 0, sizeof(entry));
7651
7652 entry.delivery_mode = INT_DELIVERY_MODE;
7653 entry.dest_mode = INT_DEST_MODE;
7654@@ -2863,7 +2896,7 @@ int io_apic_set_pci_routing (int ioapic,
7655
7656 apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry "
7657 "(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic,
7658- mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq,
7659+ mp_ioapics[ioapic].mp_apicid, pin, entry.vector, irq,
7660 edge_level, active_high_low);
7661
7662 ioapic_register_intr(irq, entry.vector, edge_level);
7663@@ -2884,8 +2917,8 @@ int acpi_get_override_irq(int bus_irq, i
7664 return -1;
7665
7666 for (i = 0; i < mp_irq_entries; i++)
7667- if (mp_irqs[i].mpc_irqtype == mp_INT &&
7668- mp_irqs[i].mpc_srcbusirq == bus_irq)
7669+ if (mp_irqs[i].mp_irqtype == mp_INT &&
7670+ mp_irqs[i].mp_srcbusirq == bus_irq)
7671 break;
7672 if (i >= mp_irq_entries)
7673 return -1;
7674@@ -2918,3 +2951,35 @@ static int __init parse_noapic(char *arg
7675 return 0;
7676 }
7677 early_param("noapic", parse_noapic);
7678+
7679+#ifndef CONFIG_XEN
7680+void __init ioapic_init_mappings(void)
7681+{
7682+ unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
7683+ int i;
7684+
7685+ for (i = 0; i < nr_ioapics; i++) {
7686+ if (smp_found_config) {
7687+ ioapic_phys = mp_ioapics[i].mp_apicaddr;
7688+ if (!ioapic_phys) {
7689+ printk(KERN_ERR
7690+ "WARNING: bogus zero IO-APIC "
7691+ "address found in MPTABLE, "
7692+ "disabling IO/APIC support!\n");
7693+ smp_found_config = 0;
7694+ skip_ioapic_setup = 1;
7695+ goto fake_ioapic_page;
7696+ }
7697+ } else {
7698+fake_ioapic_page:
7699+ ioapic_phys = (unsigned long)
7700+ alloc_bootmem_pages(PAGE_SIZE);
7701+ ioapic_phys = __pa(ioapic_phys);
7702+ }
7703+ set_fixmap_nocache(idx, ioapic_phys);
7704+ printk(KERN_DEBUG "mapped IOAPIC to %08lx (%08lx)\n",
7705+ __fix_to_virt(idx), ioapic_phys);
7706+ idx++;
7707+ }
7708+}
7709+#endif
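The check_timer() rework above boils down to a fixed fallback ladder for getting the 8254 timer interrupt delivered. A minimal sketch of that ladder follows; it is illustrative only and not part of the patch, the try_* helper names are invented stand-ins for the per-step code in check_timer(), and only timer_irq_works() and panic() are the kernel's own.

/* Illustrative sketch (not part of the patch): the fallback order that the
 * reworked check_timer() walks through. Helper names are invented. */
static void __init check_timer_sketch(void)
{
	try_timer_through_ioapic_pin();		/* pin1, or pin2 if pin1 is absent */
	if (timer_irq_works())
		return;

	try_timer_through_8259A_cascade();	/* ExtINTA via the cascaded 8259A  */
	if (timer_irq_works())
		return;

	try_timer_as_local_apic_virtual_wire();	/* LVT0, fixed-mode vector         */
	if (timer_irq_works())
		return;

	try_timer_as_ExtINT();			/* 8259A ExtINT, last resort       */
	if (!timer_irq_works())
		panic("IO-APIC + timer doesn't work!");
}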
7710--- sle11-2009-06-04.orig/arch/x86/kernel/io_apic_64-xen.c 2009-03-16 16:38:05.000000000 +0100
7711+++ sle11-2009-06-04/arch/x86/kernel/io_apic_64-xen.c 2009-06-04 10:21:39.000000000 +0200
7712@@ -45,6 +45,7 @@
7713 #include <asm/proto.h>
7714 #include <asm/acpi.h>
7715 #include <asm/dma.h>
7716+#include <asm/i8259.h>
7717 #include <asm/nmi.h>
7718 #include <asm/msidef.h>
7719 #include <asm/hypertransport.h>
7720@@ -63,10 +64,16 @@ struct irq_cfg {
7721 };
7722
7723 /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
7724-struct irq_cfg irq_cfg[NR_IRQS] __read_mostly;
7725+static struct irq_cfg irq_cfg[NR_IRQS] __read_mostly;
7726
7727 static int assign_irq_vector(int irq, cpumask_t mask);
7728
7729+#ifndef CONFIG_XEN
7730+int first_system_vector = 0xfe;
7731+
7732+char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE};
7733+#endif
7734+
7735 #define __apicdebuginit __init
7736
7737 int sis_apic_bug; /* not actually supported, dummy for compile */
7738@@ -89,14 +96,14 @@ unsigned long io_apic_irqs;
7739
7740 #define clear_IO_APIC() ((void)0)
7741 #else
7742-int timer_over_8254 __initdata = 1;
7743+int timer_through_8259 __initdata;
7744
7745 /* Where if anywhere is the i8259 connect in external int mode */
7746 static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
7747 #endif
7748
7749 static DEFINE_SPINLOCK(ioapic_lock);
7750-DEFINE_SPINLOCK(vector_lock);
7751+static DEFINE_SPINLOCK(vector_lock);
7752
7753 /*
7754 * # of IRQ routing registers
7755@@ -104,15 +111,17 @@ DEFINE_SPINLOCK(vector_lock);
7756 int nr_ioapic_registers[MAX_IO_APICS];
7757
7758 /* I/O APIC entries */
7759-struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
7760+struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
7761 int nr_ioapics;
7762
7763 /* MP IRQ source entries */
7764-struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
7765+struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
7766
7767 /* # of MP IRQ source entries */
7768 int mp_irq_entries;
7769
7770+DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
7771+
7772 /*
7773 * Rough estimation of how many shared IRQs there are, can
7774 * be changed anytime.
7775@@ -141,7 +150,7 @@ struct io_apic {
7776 static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
7777 {
7778 return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
7779- + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK);
7780+ + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK);
7781 }
7782 #endif
7783
7784@@ -155,7 +164,7 @@ static inline unsigned int io_apic_read(
7785 struct physdev_apic apic_op;
7786 int ret;
7787
7788- apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
7789+ apic_op.apic_physbase = mp_ioapics[apic].mp_apicaddr;
7790 apic_op.reg = reg;
7791 ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op);
7792 if (ret)
7793@@ -173,7 +182,7 @@ static inline void io_apic_write(unsigne
7794 #else
7795 struct physdev_apic apic_op;
7796
7797- apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
7798+ apic_op.apic_physbase = mp_ioapics[apic].mp_apicaddr;
7799 apic_op.reg = reg;
7800 apic_op.value = value;
7801 WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op));
7802@@ -209,7 +218,7 @@ static bool io_apic_level_ack_pending(un
7803 break;
7804 reg = io_apic_read(entry->apic, 0x10 + pin*2);
7805 /* Is the remote IRR bit set? */
7806- if ((reg >> 14) & 1) {
7807+ if (reg & IO_APIC_REDIR_REMOTE_IRR) {
7808 spin_unlock_irqrestore(&ioapic_lock, flags);
7809 return true;
7810 }
7811@@ -308,7 +317,7 @@ static void __target_IO_APIC_irq(unsigne
7812 break;
7813 io_apic_write(apic, 0x11 + pin*2, dest);
7814 reg = io_apic_read(apic, 0x10 + pin*2);
7815- reg &= ~0x000000ff;
7816+ reg &= ~IO_APIC_REDIR_VECTOR_MASK;
7817 reg |= vector;
7818 io_apic_modify(apic, reg);
7819 if (!entry->next)
7820@@ -372,6 +381,26 @@ static void add_pin_to_irq(unsigned int
7821 }
7822
7823 #ifndef CONFIG_XEN
7824+/*
7825+ * Reroute an IRQ to a different pin.
7826+ */
7827+static void __init replace_pin_at_irq(unsigned int irq,
7828+ int oldapic, int oldpin,
7829+ int newapic, int newpin)
7830+{
7831+ struct irq_pin_list *entry = irq_2_pin + irq;
7832+
7833+ while (1) {
7834+ if (entry->apic == oldapic && entry->pin == oldpin) {
7835+ entry->apic = newapic;
7836+ entry->pin = newpin;
7837+ }
7838+ if (!entry->next)
7839+ break;
7840+ entry = irq_2_pin + entry->next;
7841+ }
7842+}
7843+
7844 #define __DO_ACTION(R, ACTION, FINAL) \
7845 \
7846 { \
7847@@ -399,10 +428,11 @@ static void add_pin_to_irq(unsigned int
7848 static void name##_IO_APIC_irq (unsigned int irq) \
7849 __DO_ACTION(R, ACTION, FINAL)
7850
7851-DO_ACTION( __mask, 0, |= 0x00010000, io_apic_sync(entry->apic) )
7852- /* mask = 1 */
7853-DO_ACTION( __unmask, 0, &= 0xfffeffff, )
7854- /* mask = 0 */
7855+/* mask = 1 */
7856+DO_ACTION(__mask, 0, |= IO_APIC_REDIR_MASKED, io_apic_sync(entry->apic))
7857+
7858+/* mask = 0 */
7859+DO_ACTION(__unmask, 0, &= ~IO_APIC_REDIR_MASKED, )
7860
7861 static void mask_IO_APIC_irq (unsigned int irq)
7862 {
7863@@ -465,22 +495,6 @@ static int __init disable_timer_pin_setu
7864 }
7865 __setup("disable_timer_pin_1", disable_timer_pin_setup);
7866
7867-#ifndef CONFIG_XEN
7868-static int __init setup_disable_8254_timer(char *s)
7869-{
7870- timer_over_8254 = -1;
7871- return 1;
7872-}
7873-static int __init setup_enable_8254_timer(char *s)
7874-{
7875- timer_over_8254 = 2;
7876- return 1;
7877-}
7878-
7879-__setup("disable_8254_timer", setup_disable_8254_timer);
7880-__setup("enable_8254_timer", setup_enable_8254_timer);
7881-#endif /* !CONFIG_XEN */
7882-
7883
7884 /*
7885 * Find the IRQ entry number of a certain pin.
7886@@ -490,10 +504,10 @@ static int find_irq_entry(int apic, int
7887 int i;
7888
7889 for (i = 0; i < mp_irq_entries; i++)
7890- if (mp_irqs[i].mpc_irqtype == type &&
7891- (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
7892- mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
7893- mp_irqs[i].mpc_dstirq == pin)
7894+ if (mp_irqs[i].mp_irqtype == type &&
7895+ (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid ||
7896+ mp_irqs[i].mp_dstapic == MP_APIC_ALL) &&
7897+ mp_irqs[i].mp_dstirq == pin)
7898 return i;
7899
7900 return -1;
7901@@ -508,13 +522,13 @@ static int __init find_isa_irq_pin(int i
7902 int i;
7903
7904 for (i = 0; i < mp_irq_entries; i++) {
7905- int lbus = mp_irqs[i].mpc_srcbus;
7906+ int lbus = mp_irqs[i].mp_srcbus;
7907
7908 if (test_bit(lbus, mp_bus_not_pci) &&
7909- (mp_irqs[i].mpc_irqtype == type) &&
7910- (mp_irqs[i].mpc_srcbusirq == irq))
7911+ (mp_irqs[i].mp_irqtype == type) &&
7912+ (mp_irqs[i].mp_srcbusirq == irq))
7913
7914- return mp_irqs[i].mpc_dstirq;
7915+ return mp_irqs[i].mp_dstirq;
7916 }
7917 return -1;
7918 }
7919@@ -524,17 +538,17 @@ static int __init find_isa_irq_apic(int
7920 int i;
7921
7922 for (i = 0; i < mp_irq_entries; i++) {
7923- int lbus = mp_irqs[i].mpc_srcbus;
7924+ int lbus = mp_irqs[i].mp_srcbus;
7925
7926 if (test_bit(lbus, mp_bus_not_pci) &&
7927- (mp_irqs[i].mpc_irqtype == type) &&
7928- (mp_irqs[i].mpc_srcbusirq == irq))
7929+ (mp_irqs[i].mp_irqtype == type) &&
7930+ (mp_irqs[i].mp_srcbusirq == irq))
7931 break;
7932 }
7933 if (i < mp_irq_entries) {
7934 int apic;
7935 for(apic = 0; apic < nr_ioapics; apic++) {
7936- if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic)
7937+ if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic)
7938 return apic;
7939 }
7940 }
7941@@ -555,28 +569,28 @@ int IO_APIC_get_PCI_irq_vector(int bus,
7942
7943 apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
7944 bus, slot, pin);
7945- if (mp_bus_id_to_pci_bus[bus] == -1) {
7946+ if (test_bit(bus, mp_bus_not_pci)) {
7947 apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
7948 return -1;
7949 }
7950 for (i = 0; i < mp_irq_entries; i++) {
7951- int lbus = mp_irqs[i].mpc_srcbus;
7952+ int lbus = mp_irqs[i].mp_srcbus;
7953
7954 for (apic = 0; apic < nr_ioapics; apic++)
7955- if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
7956- mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
7957+ if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic ||
7958+ mp_irqs[i].mp_dstapic == MP_APIC_ALL)
7959 break;
7960
7961 if (!test_bit(lbus, mp_bus_not_pci) &&
7962- !mp_irqs[i].mpc_irqtype &&
7963+ !mp_irqs[i].mp_irqtype &&
7964 (bus == lbus) &&
7965- (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
7966- int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
7967+ (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) {
7968+ int irq = pin_2_irq(i,apic,mp_irqs[i].mp_dstirq);
7969
7970 if (!(apic || IO_APIC_IRQ(irq)))
7971 continue;
7972
7973- if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
7974+ if (pin == (mp_irqs[i].mp_srcbusirq & 3))
7975 return irq;
7976 /*
7977 * Use the first all-but-pin matching entry as a
7978@@ -604,13 +618,13 @@ int IO_APIC_get_PCI_irq_vector(int bus,
7979
7980 static int MPBIOS_polarity(int idx)
7981 {
7982- int bus = mp_irqs[idx].mpc_srcbus;
7983+ int bus = mp_irqs[idx].mp_srcbus;
7984 int polarity;
7985
7986 /*
7987 * Determine IRQ line polarity (high active or low active):
7988 */
7989- switch (mp_irqs[idx].mpc_irqflag & 3)
7990+ switch (mp_irqs[idx].mp_irqflag & 3)
7991 {
7992 case 0: /* conforms, ie. bus-type dependent polarity */
7993 if (test_bit(bus, mp_bus_not_pci))
7994@@ -646,13 +660,13 @@ static int MPBIOS_polarity(int idx)
7995
7996 static int MPBIOS_trigger(int idx)
7997 {
7998- int bus = mp_irqs[idx].mpc_srcbus;
7999+ int bus = mp_irqs[idx].mp_srcbus;
8000 int trigger;
8001
8002 /*
8003 * Determine IRQ trigger mode (edge or level sensitive):
8004 */
8005- switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
8006+ switch ((mp_irqs[idx].mp_irqflag>>2) & 3)
8007 {
8008 case 0: /* conforms, ie. bus-type dependent */
8009 if (test_bit(bus, mp_bus_not_pci))
8010@@ -699,16 +713,16 @@ static inline int irq_trigger(int idx)
8011 static int pin_2_irq(int idx, int apic, int pin)
8012 {
8013 int irq, i;
8014- int bus = mp_irqs[idx].mpc_srcbus;
8015+ int bus = mp_irqs[idx].mp_srcbus;
8016
8017 /*
8018 * Debugging check, we are in big trouble if this message pops up!
8019 */
8020- if (mp_irqs[idx].mpc_dstirq != pin)
8021+ if (mp_irqs[idx].mp_dstirq != pin)
8022 printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
8023
8024 if (test_bit(bus, mp_bus_not_pci)) {
8025- irq = mp_irqs[idx].mpc_srcbusirq;
8026+ irq = mp_irqs[idx].mp_srcbusirq;
8027 } else {
8028 /*
8029 * PCI IRQs are mapped in order
8030@@ -722,6 +736,19 @@ static int pin_2_irq(int idx, int apic,
8031 return irq;
8032 }
8033
8034+void lock_vector_lock(void)
8035+{
8036+ /* Used to the online set of cpus does not change
8037+ * during assign_irq_vector.
8038+ */
8039+ spin_lock(&vector_lock);
8040+}
8041+
8042+void unlock_vector_lock(void)
8043+{
8044+ spin_unlock(&vector_lock);
8045+}
8046+
8047 static int __assign_irq_vector(int irq, cpumask_t mask)
8048 {
8049 struct physdev_irq irq_op;
8050@@ -773,7 +800,7 @@ static void __clear_irq_vector(int irq)
8051
8052 vector = cfg->vector;
8053 cpus_and(mask, cfg->domain, cpu_online_map);
8054- for_each_cpu_mask(cpu, mask)
8055+ for_each_cpu_mask_nr(cpu, mask)
8056 per_cpu(vector_irq, cpu)[vector] = -1;
8057
8058 cfg->vector = 0;
8059@@ -842,7 +869,7 @@ static void setup_IO_APIC_irq(int apic,
8060 apic_printk(APIC_VERBOSE,KERN_DEBUG
8061 "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
8062 "IRQ %d Mode:%i Active:%i)\n",
8063- apic, mp_ioapics[apic].mpc_apicid, pin, cfg->vector,
8064+ apic, mp_ioapics[apic].mp_apicid, pin, cfg->vector,
8065 irq, trigger, polarity);
8066
8067 /*
8068@@ -883,10 +910,10 @@ static void __init setup_IO_APIC_irqs(vo
8069 idx = find_irq_entry(apic,pin,mp_INT);
8070 if (idx == -1) {
8071 if (first_notcon) {
8072- apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mpc_apicid, pin);
8073+ apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mp_apicid, pin);
8074 first_notcon = 0;
8075 } else
8076- apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mpc_apicid, pin);
8077+ apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mp_apicid, pin);
8078 continue;
8079 }
8080 if (!first_notcon) {
8081@@ -908,26 +935,21 @@ static void __init setup_IO_APIC_irqs(vo
8082
8083 #ifndef CONFIG_XEN
8084 /*
8085- * Set up the 8259A-master output pin as broadcast to all
8086- * CPUs.
8087+ * Set up the timer pin, possibly with the 8259A-master behind.
8088 */
8089-static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
8090+static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
8091+ int vector)
8092 {
8093 struct IO_APIC_route_entry entry;
8094
8095 memset(&entry, 0, sizeof(entry));
8096
8097- disable_8259A_irq(0);
8098-
8099- /* mask LVT0 */
8100- apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
8101-
8102 /*
8103 * We use logical delivery to get the timer IRQ
8104 * to the first CPU.
8105 */
8106 entry.dest_mode = INT_DEST_MODE;
8107- entry.mask = 0; /* unmask IRQ now */
8108+ entry.mask = 1; /* mask IRQ now */
8109 entry.dest = cpu_mask_to_apicid(TARGET_CPUS);
8110 entry.delivery_mode = INT_DELIVERY_MODE;
8111 entry.polarity = 0;
8112@@ -936,7 +958,7 @@ static void __init setup_ExtINT_IRQ0_pin
8113
8114 /*
8115 * The timer IRQ doesn't have to know that behind the
8116- * scene we have a 8259A-master in AEOI mode ...
8117+ * scene we may have a 8259A-master in AEOI mode ...
8118 */
8119 set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge");
8120
8121@@ -944,8 +966,6 @@ static void __init setup_ExtINT_IRQ0_pin
8122 * Add it to the IO-APIC irq-routing table:
8123 */
8124 ioapic_write_entry(apic, pin, entry);
8125-
8126- enable_8259A_irq(0);
8127 }
8128
8129 void __apicdebuginit print_IO_APIC(void)
8130@@ -962,7 +982,7 @@ void __apicdebuginit print_IO_APIC(void)
8131 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
8132 for (i = 0; i < nr_ioapics; i++)
8133 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
8134- mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
8135+ mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]);
8136
8137 /*
8138 * We are a bit conservative about what we expect. We have to
8139@@ -980,7 +1000,7 @@ void __apicdebuginit print_IO_APIC(void)
8140 spin_unlock_irqrestore(&ioapic_lock, flags);
8141
8142 printk("\n");
8143- printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
8144+ printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid);
8145 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
8146 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
8147
8148@@ -1072,6 +1092,7 @@ void __apicdebuginit print_local_APIC(vo
8149
8150 printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
8151 smp_processor_id(), hard_smp_processor_id());
8152+ v = apic_read(APIC_ID);
8153 printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(read_apic_id()));
8154 v = apic_read(APIC_LVR);
8155 printk(KERN_INFO "... APIC VERSION: %08x\n", v);
8156@@ -1141,7 +1162,7 @@ void __apicdebuginit print_local_APIC(vo
8157
8158 void print_all_local_APICs (void)
8159 {
8160- on_each_cpu(print_local_APIC, NULL, 1, 1);
8161+ on_each_cpu(print_local_APIC, NULL, 1);
8162 }
8163
8164 void __apicdebuginit print_PIC(void)
8165@@ -1175,6 +1196,8 @@ void __apicdebuginit print_PIC(void)
8166 v = inb(0x4d1) << 8 | inb(0x4d0);
8167 printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
8168 }
8169+#else
8170+void __apicdebuginit print_IO_APIC(void) {}
8171 #endif /* !CONFIG_XEN */
8172
8173 void __init enable_IO_APIC(void)
8174@@ -1359,12 +1382,10 @@ static unsigned int startup_ioapic_irq(u
8175 static int ioapic_retrigger_irq(unsigned int irq)
8176 {
8177 struct irq_cfg *cfg = &irq_cfg[irq];
8178- cpumask_t mask;
8179 unsigned long flags;
8180
8181 spin_lock_irqsave(&vector_lock, flags);
8182- mask = cpumask_of_cpu(first_cpu(cfg->domain));
8183- send_IPI_mask(mask, cfg->vector);
8184+ send_IPI_mask(cpumask_of_cpu(first_cpu(cfg->domain)), cfg->vector);
8185 spin_unlock_irqrestore(&vector_lock, flags);
8186
8187 return 1;
8188@@ -1545,7 +1566,7 @@ static inline void init_IO_APIC_traps(vo
8189 }
8190
8191 #ifndef CONFIG_XEN
8192-static void enable_lapic_irq (unsigned int irq)
8193+static void unmask_lapic_irq(unsigned int irq)
8194 {
8195 unsigned long v;
8196
8197@@ -1553,7 +1574,7 @@ static void enable_lapic_irq (unsigned i
8198 apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
8199 }
8200
8201-static void disable_lapic_irq (unsigned int irq)
8202+static void mask_lapic_irq(unsigned int irq)
8203 {
8204 unsigned long v;
8205
8206@@ -1566,19 +1587,20 @@ static void ack_lapic_irq (unsigned int
8207 ack_APIC_irq();
8208 }
8209
8210-static void end_lapic_irq (unsigned int i) { /* nothing */ }
8211-
8212-static struct hw_interrupt_type lapic_irq_type __read_mostly = {
8213- .name = "local-APIC",
8214- .typename = "local-APIC-edge",
8215- .startup = NULL, /* startup_irq() not used for IRQ0 */
8216- .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */
8217- .enable = enable_lapic_irq,
8218- .disable = disable_lapic_irq,
8219- .ack = ack_lapic_irq,
8220- .end = end_lapic_irq,
8221+static struct irq_chip lapic_chip __read_mostly = {
8222+ .name = "local-APIC",
8223+ .mask = mask_lapic_irq,
8224+ .unmask = unmask_lapic_irq,
8225+ .ack = ack_lapic_irq,
8226 };
8227
8228+static void lapic_register_intr(int irq)
8229+{
8230+ irq_desc[irq].status &= ~IRQ_LEVEL;
8231+ set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
8232+ "edge");
8233+}
8234+
8235 static void __init setup_nmi(void)
8236 {
8237 /*
8238@@ -1664,6 +1686,7 @@ static inline void __init check_timer(vo
8239 struct irq_cfg *cfg = irq_cfg + 0;
8240 int apic1, pin1, apic2, pin2;
8241 unsigned long flags;
8242+ int no_pin1 = 0;
8243
8244 local_irq_save(flags);
8245
8246@@ -1674,34 +1697,48 @@ static inline void __init check_timer(vo
8247 assign_irq_vector(0, TARGET_CPUS);
8248
8249 /*
8250- * Subtle, code in do_timer_interrupt() expects an AEOI
8251- * mode for the 8259A whenever interrupts are routed
8252- * through I/O APICs. Also IRQ0 has to be enabled in
8253- * the 8259A which implies the virtual wire has to be
8254- * disabled in the local APIC.
8255+ * As IRQ0 is to be enabled in the 8259A, the virtual
8256+ * wire has to be disabled in the local APIC.
8257 */
8258 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
8259 init_8259A(1);
8260- if (timer_over_8254 > 0)
8261- enable_8259A_irq(0);
8262
8263 pin1 = find_isa_irq_pin(0, mp_INT);
8264 apic1 = find_isa_irq_apic(0, mp_INT);
8265 pin2 = ioapic_i8259.pin;
8266 apic2 = ioapic_i8259.apic;
8267
8268- apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
8269- cfg->vector, apic1, pin1, apic2, pin2);
8270+ apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X "
8271+ "apic1=%d pin1=%d apic2=%d pin2=%d\n",
8272+ cfg->vector, apic1, pin1, apic2, pin2);
8273+
8274+ /*
8275+ * Some BIOS writers are clueless and report the ExtINTA
8276+ * I/O APIC input from the cascaded 8259A as the timer
8277+ * interrupt input. So just in case, if only one pin
8278+ * was found above, try it both directly and through the
8279+ * 8259A.
8280+ */
8281+ if (pin1 == -1) {
8282+ pin1 = pin2;
8283+ apic1 = apic2;
8284+ no_pin1 = 1;
8285+ } else if (pin2 == -1) {
8286+ pin2 = pin1;
8287+ apic2 = apic1;
8288+ }
8289
8290 if (pin1 != -1) {
8291 /*
8292 * Ok, does IRQ0 through the IOAPIC work?
8293 */
8294+ if (no_pin1) {
8295+ add_pin_to_irq(0, apic1, pin1);
8296+ setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
8297+ }
8298 unmask_IO_APIC_irq(0);
8299 if (!no_timer_check && timer_irq_works()) {
8300- nmi_watchdog_default();
8301 if (nmi_watchdog == NMI_IO_APIC) {
8302- disable_8259A_irq(0);
8303 setup_nmi();
8304 enable_8259A_irq(0);
8305 }
8306@@ -1710,54 +1747,62 @@ static inline void __init check_timer(vo
8307 goto out;
8308 }
8309 clear_IO_APIC_pin(apic1, pin1);
8310- apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: 8254 timer not "
8311- "connected to IO-APIC\n");
8312- }
8313-
8314- apic_printk(APIC_VERBOSE,KERN_INFO "...trying to set up timer (IRQ0) "
8315- "through the 8259A ... ");
8316- if (pin2 != -1) {
8317- apic_printk(APIC_VERBOSE,"\n..... (found apic %d pin %d) ...",
8318- apic2, pin2);
8319+ if (!no_pin1)
8320+ apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
8321+ "8254 timer not connected to IO-APIC\n");
8322+
8323+ apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer "
8324+ "(IRQ0) through the 8259A ...\n");
8325+ apic_printk(APIC_QUIET, KERN_INFO
8326+ "..... (found apic %d pin %d) ...\n", apic2, pin2);
8327 /*
8328 * legacy devices should be connected to IO APIC #0
8329 */
8330- setup_ExtINT_IRQ0_pin(apic2, pin2, cfg->vector);
8331+ replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
8332+ setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
8333+ unmask_IO_APIC_irq(0);
8334+ enable_8259A_irq(0);
8335 if (timer_irq_works()) {
8336- apic_printk(APIC_VERBOSE," works.\n");
8337- nmi_watchdog_default();
8338+ apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
8339+ timer_through_8259 = 1;
8340 if (nmi_watchdog == NMI_IO_APIC) {
8341+ disable_8259A_irq(0);
8342 setup_nmi();
8343+ enable_8259A_irq(0);
8344 }
8345 goto out;
8346 }
8347 /*
8348 * Cleanup, just in case ...
8349 */
8350+ disable_8259A_irq(0);
8351 clear_IO_APIC_pin(apic2, pin2);
8352+ apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
8353 }
8354- apic_printk(APIC_VERBOSE," failed.\n");
8355
8356 if (nmi_watchdog == NMI_IO_APIC) {
8357- printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
8358- nmi_watchdog = 0;
8359+ apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work "
8360+ "through the IO-APIC - disabling NMI Watchdog!\n");
8361+ nmi_watchdog = NMI_NONE;
8362 }
8363
8364- apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
8365+ apic_printk(APIC_QUIET, KERN_INFO
8366+ "...trying to set up timer as Virtual Wire IRQ...\n");
8367
8368- disable_8259A_irq(0);
8369- irq_desc[0].chip = &lapic_irq_type;
8370+ lapic_register_intr(0);
8371 apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */
8372 enable_8259A_irq(0);
8373
8374 if (timer_irq_works()) {
8375- apic_printk(APIC_VERBOSE," works.\n");
8376+ apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
8377 goto out;
8378 }
8379+ disable_8259A_irq(0);
8380 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
8381- apic_printk(APIC_VERBOSE," failed.\n");
8382+ apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
8383
8384- apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as ExtINT IRQ...");
8385+ apic_printk(APIC_QUIET, KERN_INFO
8386+ "...trying to set up timer as ExtINT IRQ...\n");
8387
8388 init_8259A(0);
8389 make_8259A_irq(0);
8390@@ -1766,11 +1811,12 @@ static inline void __init check_timer(vo
8391 unlock_ExtINT_logic();
8392
8393 if (timer_irq_works()) {
8394- apic_printk(APIC_VERBOSE," works.\n");
8395+ apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
8396 goto out;
8397 }
8398- apic_printk(APIC_VERBOSE," failed :(.\n");
8399- panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n");
8400+ apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
8401+ panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
8402+ "report. Then try booting with the 'noapic' option.\n");
8403 out:
8404 local_irq_restore(flags);
8405 }
8406@@ -1788,10 +1834,21 @@ __setup("no_timer_check", notimercheck);
8407
8408 /*
8409 *
8410- * IRQs that are handled by the PIC in the MPS IOAPIC case.
8411- * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ.
8412- * Linux doesn't really care, as it's not actually used
8413- * for any interrupt handling anyway.
8414+ * Traditionally ISA IRQ2 is the cascade IRQ, and is not available
8415+ * to devices. However there may be an I/O APIC pin available for
8416+ * this interrupt regardless. The pin may be left unconnected, but
8417+ * typically it will be reused as an ExtINT cascade interrupt for
8418+ * the master 8259A. In the MPS case such a pin will normally be
8419+ * reported as an ExtINT interrupt in the MP table. With ACPI
8420+ * there is no provision for ExtINT interrupts, and in the absence
8421+ * of an override it would be treated as an ordinary ISA I/O APIC
8422+ * interrupt, that is edge-triggered and unmasked by default. We
8423+ * used to do this, but it caused problems on some systems because
8424+ * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using
8425+ * the same ExtINT cascade interrupt to drive the local APIC of the
8426+ * bootstrap processor. Therefore we refrain from routing IRQ2 to
8427+ * the I/O APIC in all cases now. No actual device should request
8428+ * it anyway. --macro
8429 */
8430 #define PIC_IRQS (1<<2)
8431
8432@@ -1799,10 +1856,7 @@ void __init setup_IO_APIC(void)
8433 {
8434 enable_IO_APIC();
8435
8436- if (acpi_ioapic)
8437- io_apic_irqs = ~0; /* all IRQs go through IOAPIC */
8438- else
8439- io_apic_irqs = ~PIC_IRQS;
8440+ io_apic_irqs = ~PIC_IRQS;
8441
8442 apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
8443
8444@@ -1851,8 +1905,8 @@ static int ioapic_resume(struct sys_devi
8445
8446 spin_lock_irqsave(&ioapic_lock, flags);
8447 reg_00.raw = io_apic_read(dev->id, 0);
8448- if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
8449- reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
8450+ if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) {
8451+ reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid;
8452 io_apic_write(dev->id, 0, reg_00.raw);
8453 }
8454 spin_unlock_irqrestore(&ioapic_lock, flags);
8455@@ -2254,8 +2308,8 @@ int acpi_get_override_irq(int bus_irq, i
8456 return -1;
8457
8458 for (i = 0; i < mp_irq_entries; i++)
8459- if (mp_irqs[i].mpc_irqtype == mp_INT &&
8460- mp_irqs[i].mpc_srcbusirq == bus_irq)
8461+ if (mp_irqs[i].mp_irqtype == mp_INT &&
8462+ mp_irqs[i].mp_srcbusirq == bus_irq)
8463 break;
8464 if (i >= mp_irq_entries)
8465 return -1;
8466@@ -2349,7 +2403,7 @@ void __init ioapic_init_mappings(void)
8467 ioapic_res = ioapic_setup_resources();
8468 for (i = 0; i < nr_ioapics; i++) {
8469 if (smp_found_config) {
8470- ioapic_phys = mp_ioapics[i].mpc_apicaddr;
8471+ ioapic_phys = mp_ioapics[i].mp_apicaddr;
8472 } else {
8473 ioapic_phys = (unsigned long)
8474 alloc_bootmem_pages(PAGE_SIZE);
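Both io_apic files also migrate from the raw MP-table record types (struct mpc_config_ioapic / mpc_config_intsrc with mpc_* fields) to kernel-side copies (struct mp_config_ioapic / mp_config_intsrc with mp_* fields). Only mp_apicid, mp_apicaddr and the mp_irq* members actually appear in the hunks above; the rough shape sketched here is an assumption for orientation, not a definition taken from the patch.

/* Hedged sketch of the kernel-side I/O APIC record used above; fields and
 * types other than mp_apicid and mp_apicaddr are assumptions. */
struct mp_config_ioapic_sketch {
	unsigned int	mp_apicaddr;	/* physical MMIO base of this I/O APIC        */
	unsigned char	mp_type;	/* assumed: MP-table record type              */
	unsigned char	mp_apicid;	/* APIC ID, compared/written in ioapic_resume */
	unsigned char	mp_apicver;	/* assumed: value of the version register     */
	unsigned char	mp_flags;	/* assumed: enabled/usable flag bits          */
};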
8475--- sle11-2009-06-04.orig/arch/x86/kernel/ipi-xen.c 2009-03-16 16:38:05.000000000 +0100
8476+++ sle11-2009-06-04/arch/x86/kernel/ipi-xen.c 2009-06-04 10:21:39.000000000 +0200
8477@@ -8,7 +8,6 @@
8478 #include <linux/kernel_stat.h>
8479 #include <linux/mc146818rtc.h>
8480 #include <linux/cache.h>
8481-#include <linux/interrupt.h>
8482 #include <linux/cpu.h>
8483 #include <linux/module.h>
8484
8485@@ -85,7 +84,7 @@ void __send_IPI_shortcut(unsigned int sh
8486 /*
8487 * Send the IPI. The write to APIC_ICR fires this off.
8488 */
8489- apic_write_around(APIC_ICR, cfg);
8490+ apic_write(APIC_ICR, cfg);
8491 #else
8492 int cpu;
8493
8494@@ -132,7 +131,7 @@ static inline void __send_IPI_dest_field
8495 * prepare target chip field
8496 */
8497 cfg = __prepare_ICR2(mask);
8498- apic_write_around(APIC_ICR2, cfg);
8499+ apic_write(APIC_ICR2, cfg);
8500
8501 /*
8502 * program the ICR
8503@@ -142,7 +141,7 @@ static inline void __send_IPI_dest_field
8504 /*
8505 * Send the IPI. The write to APIC_ICR fires this off.
8506 */
8507- apic_write_around(APIC_ICR, cfg);
8508+ apic_write(APIC_ICR, cfg);
8509 }
8510 #endif
8511
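The ipi-xen.c hunks only replace apic_write_around() with apic_write(); the send sequence itself is unchanged. As a reminder of that sequence, here is an illustrative sketch, not part of the patch; __prepare_ICR2 and apic_write are the helpers visible above, while the exact __prepare_ICR signature is an assumption.

/* Illustrative sketch: the destination goes into ICR2 first, and the write
 * to ICR is what actually fires the IPI. */
static inline void send_ipi_sketch(unsigned int mask, int vector)
{
	unsigned long cfg;

	cfg = __prepare_ICR2(mask);	/* target APIC ID / logical mask   */
	apic_write(APIC_ICR2, cfg);

	cfg = __prepare_ICR(0, vector);	/* delivery mode, level and vector */
	apic_write(APIC_ICR, cfg);	/* this write sends the IPI        */
}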
8512--- sle11-2009-06-04.orig/arch/x86/kernel/irq_32-xen.c 2009-03-16 16:38:05.000000000 +0100
8513+++ sle11-2009-06-04/arch/x86/kernel/irq_32-xen.c 2009-06-04 10:21:39.000000000 +0200
8514@@ -48,6 +48,29 @@ void ack_bad_irq(unsigned int irq)
8515 #endif
8516 }
8517
8518+#ifdef CONFIG_DEBUG_STACKOVERFLOW
8519+/* Debugging check for stack overflow: is there less than 1KB free? */
8520+static int check_stack_overflow(void)
8521+{
8522+ long sp;
8523+
8524+ __asm__ __volatile__("andl %%esp,%0" :
8525+ "=r" (sp) : "0" (THREAD_SIZE - 1));
8526+
8527+ return sp < (sizeof(struct thread_info) + STACK_WARN);
8528+}
8529+
8530+static void print_stack_overflow(void)
8531+{
8532+ printk(KERN_WARNING "low stack detected by irq handler\n");
8533+ dump_stack();
8534+}
8535+
8536+#else
8537+static inline int check_stack_overflow(void) { return 0; }
8538+static inline void print_stack_overflow(void) { }
8539+#endif
8540+
8541 #ifdef CONFIG_4KSTACKS
8542 /*
8543 * per-CPU IRQ handling contexts (thread information and stack)
8544@@ -59,48 +82,26 @@ union irq_ctx {
8545
8546 static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly;
8547 static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly;
8548-#endif
8549-
8550-/*
8551- * do_IRQ handles all normal device IRQ's (the special
8552- * SMP cross-CPU interrupts have their own specific
8553- * handlers).
8554- */
8555-unsigned int do_IRQ(struct pt_regs *regs)
8556-{
8557- struct pt_regs *old_regs;
8558- /* high bit used in ret_from_ code */
8559- int irq = ~regs->orig_ax;
8560- struct irq_desc *desc = irq_desc + irq;
8561-#ifdef CONFIG_4KSTACKS
8562- union irq_ctx *curctx, *irqctx;
8563- u32 *isp;
8564-#endif
8565
8566- if (unlikely((unsigned)irq >= NR_IRQS)) {
8567- printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
8568- __func__, irq);
8569- BUG();
8570- }
8571+static char softirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
8572+static char hardirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
8573
8574- old_regs = set_irq_regs(regs);
8575- /*irq_enter();*/
8576-#ifdef CONFIG_DEBUG_STACKOVERFLOW
8577- /* Debugging check for stack overflow: is there less than 1KB free? */
8578- {
8579- long sp;
8580-
8581- __asm__ __volatile__("andl %%esp,%0" :
8582- "=r" (sp) : "0" (THREAD_SIZE - 1));
8583- if (unlikely(sp < (sizeof(struct thread_info) + STACK_WARN))) {
8584- printk("do_IRQ: stack overflow: %ld\n",
8585- sp - sizeof(struct thread_info));
8586- dump_stack();
8587- }
8588- }
8589-#endif
8590+static void call_on_stack(void *func, void *stack)
8591+{
8592+ asm volatile("xchgl %%ebx,%%esp \n"
8593+ "call *%%edi \n"
8594+ "movl %%ebx,%%esp \n"
8595+ : "=b" (stack)
8596+ : "0" (stack),
8597+ "D"(func)
8598+ : "memory", "cc", "edx", "ecx", "eax");
8599+}
8600
8601-#ifdef CONFIG_4KSTACKS
8602+static inline int
8603+execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
8604+{
8605+ union irq_ctx *curctx, *irqctx;
8606+ u32 *isp, arg1, arg2;
8607
8608 curctx = (union irq_ctx *) current_thread_info();
8609 irqctx = hardirq_ctx[smp_processor_id()];
8610@@ -111,52 +112,39 @@ unsigned int do_IRQ(struct pt_regs *regs
8611 * handler) we can't do that and just have to keep using the
8612 * current stack (which is the irq stack already after all)
8613 */
8614- if (curctx != irqctx) {
8615- int arg1, arg2, bx;
8616-
8617- /* build the stack frame on the IRQ stack */
8618- isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
8619- irqctx->tinfo.task = curctx->tinfo.task;
8620- irqctx->tinfo.previous_esp = current_stack_pointer;
8621+ if (unlikely(curctx == irqctx))
8622+ return 0;
8623
8624- /*
8625- * Copy the softirq bits in preempt_count so that the
8626- * softirq checks work in the hardirq context.
8627- */
8628- irqctx->tinfo.preempt_count =
8629- (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) |
8630- (curctx->tinfo.preempt_count & SOFTIRQ_MASK);
8631-
8632- asm volatile(
8633- " xchgl %%ebx,%%esp \n"
8634- " call *%%edi \n"
8635- " movl %%ebx,%%esp \n"
8636- : "=a" (arg1), "=d" (arg2), "=b" (bx)
8637- : "0" (irq), "1" (desc), "2" (isp),
8638- "D" (desc->handle_irq)
8639- : "memory", "cc", "ecx"
8640- );
8641- } else
8642-#endif
8643- desc->handle_irq(irq, desc);
8644+ /* build the stack frame on the IRQ stack */
8645+ isp = (u32 *) ((char*)irqctx + sizeof(*irqctx));
8646+ irqctx->tinfo.task = curctx->tinfo.task;
8647+ irqctx->tinfo.previous_esp = current_stack_pointer;
8648
8649- /*irq_exit();*/
8650- set_irq_regs(old_regs);
8651+ /*
8652+ * Copy the softirq bits in preempt_count so that the
8653+ * softirq checks work in the hardirq context.
8654+ */
8655+ irqctx->tinfo.preempt_count =
8656+ (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) |
8657+ (curctx->tinfo.preempt_count & SOFTIRQ_MASK);
8658+
8659+ if (unlikely(overflow))
8660+ call_on_stack(print_stack_overflow, isp);
8661+
8662+ asm volatile("xchgl %%ebx,%%esp \n"
8663+ "call *%%edi \n"
8664+ "movl %%ebx,%%esp \n"
8665+ : "=a" (arg1), "=d" (arg2), "=b" (isp)
8666+ : "0" (irq), "1" (desc), "2" (isp),
8667+ "D" (desc->handle_irq)
8668+ : "memory", "cc", "ecx");
8669 return 1;
8670 }
8671
8672-#ifdef CONFIG_4KSTACKS
8673-
8674-static char softirq_stack[NR_CPUS * THREAD_SIZE]
8675- __attribute__((__section__(".bss.page_aligned")));
8676-
8677-static char hardirq_stack[NR_CPUS * THREAD_SIZE]
8678- __attribute__((__section__(".bss.page_aligned")));
8679-
8680 /*
8681 * allocate per-cpu stacks for hardirq and for softirq processing
8682 */
8683-void irq_ctx_init(int cpu)
8684+void __cpuinit irq_ctx_init(int cpu)
8685 {
8686 union irq_ctx *irqctx;
8687
8688@@ -164,25 +152,25 @@ void irq_ctx_init(int cpu)
8689 return;
8690
8691 irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE];
8692- irqctx->tinfo.task = NULL;
8693- irqctx->tinfo.exec_domain = NULL;
8694- irqctx->tinfo.cpu = cpu;
8695- irqctx->tinfo.preempt_count = HARDIRQ_OFFSET;
8696- irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
8697+ irqctx->tinfo.task = NULL;
8698+ irqctx->tinfo.exec_domain = NULL;
8699+ irqctx->tinfo.cpu = cpu;
8700+ irqctx->tinfo.preempt_count = HARDIRQ_OFFSET;
8701+ irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
8702
8703 hardirq_ctx[cpu] = irqctx;
8704
8705 irqctx = (union irq_ctx*) &softirq_stack[cpu*THREAD_SIZE];
8706- irqctx->tinfo.task = NULL;
8707- irqctx->tinfo.exec_domain = NULL;
8708- irqctx->tinfo.cpu = cpu;
8709- irqctx->tinfo.preempt_count = 0;
8710- irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
8711+ irqctx->tinfo.task = NULL;
8712+ irqctx->tinfo.exec_domain = NULL;
8713+ irqctx->tinfo.cpu = cpu;
8714+ irqctx->tinfo.preempt_count = 0;
8715+ irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
8716
8717 softirq_ctx[cpu] = irqctx;
8718
8719- printk("CPU %u irqstacks, hard=%p soft=%p\n",
8720- cpu,hardirq_ctx[cpu],softirq_ctx[cpu]);
8721+ printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n",
8722+ cpu,hardirq_ctx[cpu],softirq_ctx[cpu]);
8723 }
8724
8725 void irq_ctx_exit(int cpu)
8726@@ -211,25 +199,56 @@ asmlinkage void do_softirq(void)
8727 /* build the stack frame on the softirq stack */
8728 isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
8729
8730- asm volatile(
8731- " xchgl %%ebx,%%esp \n"
8732- " call __do_softirq \n"
8733- " movl %%ebx,%%esp \n"
8734- : "=b"(isp)
8735- : "0"(isp)
8736- : "memory", "cc", "edx", "ecx", "eax"
8737- );
8738+ call_on_stack(__do_softirq, isp);
8739 /*
8740 * Shouldnt happen, we returned above if in_interrupt():
8741- */
8742+ */
8743 WARN_ON_ONCE(softirq_count());
8744 }
8745
8746 local_irq_restore(flags);
8747 }
8748+
8749+#else
8750+static inline int
8751+execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) { return 0; }
8752 #endif
8753
8754 /*
8755+ * do_IRQ handles all normal device IRQ's (the special
8756+ * SMP cross-CPU interrupts have their own specific
8757+ * handlers).
8758+ */
8759+unsigned int do_IRQ(struct pt_regs *regs)
8760+{
8761+ struct pt_regs *old_regs;
8762+ /* high bit used in ret_from_ code */
8763+ int overflow, irq = ~regs->orig_ax;
8764+ struct irq_desc *desc = irq_desc + irq;
8765+
8766+ if (unlikely((unsigned)irq >= NR_IRQS)) {
8767+ printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
8768+ __func__, irq);
8769+ BUG();
8770+ }
8771+
8772+ old_regs = set_irq_regs(regs);
8773+ /*irq_enter();*/
8774+
8775+ overflow = check_stack_overflow();
8776+
8777+ if (!execute_on_irq_stack(overflow, desc, irq)) {
8778+ if (unlikely(overflow))
8779+ print_stack_overflow();
8780+ desc->handle_irq(irq, desc);
8781+ }
8782+
8783+ /*irq_exit();*/
8784+ set_irq_regs(old_regs);
8785+ return 1;
8786+}
8787+
8788+/*
8789 * Interrupt statistics:
8790 */
8791
8792@@ -337,6 +356,42 @@ skip:
8793 return 0;
8794 }
8795
8796+/*
8797+ * /proc/stat helpers
8798+ */
8799+u64 arch_irq_stat_cpu(unsigned int cpu)
8800+{
8801+ u64 sum = nmi_count(cpu);
8802+
8803+#ifdef CONFIG_X86_LOCAL_APIC
8804+ sum += per_cpu(irq_stat, cpu).apic_timer_irqs;
8805+#endif
8806+#ifdef CONFIG_SMP
8807+ sum += per_cpu(irq_stat, cpu).irq_resched_count;
8808+ sum += per_cpu(irq_stat, cpu).irq_call_count;
8809+#ifndef CONFIG_XEN
8810+ sum += per_cpu(irq_stat, cpu).irq_tlb_count;
8811+#endif
8812+#endif
8813+#ifdef CONFIG_X86_MCE
8814+ sum += per_cpu(irq_stat, cpu).irq_thermal_count;
8815+#endif
8816+#ifdef CONFIG_X86_LOCAL_APIC
8817+ sum += per_cpu(irq_stat, cpu).irq_spurious_count;
8818+#endif
8819+ return sum;
8820+}
8821+
8822+u64 arch_irq_stat(void)
8823+{
8824+ u64 sum = atomic_read(&irq_err_count);
8825+
8826+#ifdef CONFIG_X86_IO_APIC
8827+ sum += atomic_read(&irq_mis_count);
8828+#endif
8829+ return sum;
8830+}
8831+
8832 #ifdef CONFIG_HOTPLUG_CPU
8833
8834 void fixup_irqs(cpumask_t map)
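The new check_stack_overflow() above masks %esp with THREAD_SIZE-1, which yields the stack pointer's offset from the bottom of the current thread stack, i.e. roughly how many bytes remain before the thread_info structure at the bottom would be overwritten. A worked example, assuming 8KB stacks (THREAD_SIZE = 8192) and STACK_WARN = 1024; the addresses are made up for illustration.

/* Illustrative arithmetic only, not part of the patch:
 *
 *   esp = 0xc1235e00:  esp & (8192 - 1) = 0x1e00 = 7680 bytes left
 *                      -> well clear of the limit, no warning
 *
 *   esp = 0xc1234260:  esp & (8192 - 1) = 0x0260 = 608 bytes left
 *                      -> below sizeof(struct thread_info) + STACK_WARN,
 *                         so print_stack_overflow() fires
 */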
8835--- sle11-2009-06-04.orig/arch/x86/kernel/irq_64-xen.c 2009-03-16 16:33:40.000000000 +0100
8836+++ sle11-2009-06-04/arch/x86/kernel/irq_64-xen.c 2009-06-04 10:21:39.000000000 +0200
8837@@ -163,6 +163,34 @@ skip:
8838 }
8839
8840 /*
8841+ * /proc/stat helpers
8842+ */
8843+u64 arch_irq_stat_cpu(unsigned int cpu)
8844+{
8845+ u64 sum = cpu_pda(cpu)->__nmi_count;
8846+
8847+ sum += cpu_pda(cpu)->apic_timer_irqs;
8848+#ifdef CONFIG_SMP
8849+ sum += cpu_pda(cpu)->irq_resched_count;
8850+ sum += cpu_pda(cpu)->irq_call_count;
8851+#ifndef CONFIG_XEN
8852+ sum += cpu_pda(cpu)->irq_tlb_count;
8853+#endif
8854+#endif
8855+#ifdef CONFIG_X86_MCE
8856+ sum += cpu_pda(cpu)->irq_thermal_count;
8857+ sum += cpu_pda(cpu)->irq_threshold_count;
8858+#endif
8859+ sum += cpu_pda(cpu)->irq_spurious_count;
8860+ return sum;
8861+}
8862+
8863+u64 arch_irq_stat(void)
8864+{
8865+ return atomic_read(&irq_err_count);
8866+}
8867+
8868+/*
8869 * do_IRQ handles all normal device IRQ's (the special
8870 * SMP cross-CPU interrupts have their own specific
8871 * handlers).
8872--- sle11-2009-06-04.orig/arch/x86/kernel/ldt-xen.c 2009-03-16 16:33:40.000000000 +0100
8873+++ sle11-2009-06-04/arch/x86/kernel/ldt-xen.c 2009-06-04 10:21:39.000000000 +0200
8874@@ -20,9 +20,9 @@
8875 #include <asm/mmu_context.h>
8876
8877 #ifdef CONFIG_SMP
8878-static void flush_ldt(void *null)
8879+static void flush_ldt(void *current_mm)
8880 {
8881- if (current->active_mm)
8882+ if (current->active_mm == current_mm)
8883 load_LDT(&current->active_mm->context);
8884 }
8885 #endif
8886@@ -62,8 +62,6 @@ static int alloc_ldt(mm_context_t *pc, i
8887
8888 if (reload) {
8889 #ifdef CONFIG_SMP
8890- cpumask_t mask;
8891-
8892 preempt_disable();
8893 #endif
8894 make_pages_readonly(newldt,
8895@@ -71,9 +69,9 @@ static int alloc_ldt(mm_context_t *pc, i
8896 XENFEAT_writable_descriptor_tables);
8897 load_LDT(pc);
8898 #ifdef CONFIG_SMP
8899- mask = cpumask_of_cpu(smp_processor_id());
8900- if (!cpus_equal(current->mm->cpu_vm_mask, mask))
8901- smp_call_function(flush_ldt, NULL, 1, 1);
8902+ if (!cpus_equal(current->mm->cpu_vm_mask,
8903+ cpumask_of_cpu(smp_processor_id())))
8904+ smp_call_function(flush_ldt, current->mm, 1);
8905 preempt_enable();
8906 #endif
8907 }
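The ldt-xen.c change adapts to the 2.6.27 smp_call_function() signature (the separate nonatomic argument is gone) and, more usefully, passes the mm whose LDT was modified so that remote CPUs only reload if they are still running that mm. A minimal sketch of the resulting contract, mirroring the hunk above; it is not part of the patch.

/* Illustrative sketch, not part of the patch. */
static void flush_ldt_sketch(void *info)
{
	struct mm_struct *mm = info;		/* mm whose LDT just changed */

	if (current->active_mm == mm)		/* still the active mm here? */
		load_LDT(&mm->context);		/* then pick up the new LDT  */
}

/* caller side, roughly:
 *	smp_call_function(flush_ldt_sketch, current->mm, 1);	// 1 == wait
 */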
8908--- sle11-2009-06-04.orig/arch/x86/kernel/machine_kexec_32.c 2008-11-25 12:35:53.000000000 +0100
8909+++ sle11-2009-06-04/arch/x86/kernel/machine_kexec_32.c 2009-06-04 10:21:39.000000000 +0200
8910@@ -68,6 +68,8 @@ void machine_kexec_setup_load_arg(xen_ke
8911 xki->page_list[PA_PTE_0] = __ma(kexec_pte0);
8912 xki->page_list[PA_PTE_1] = __ma(kexec_pte1);
8913
8914+ if (image->type == KEXEC_TYPE_DEFAULT)
8915+ xki->page_list[PA_SWAP_PAGE] = page_to_phys(image->swap_page);
8916 }
8917
8918 int __init machine_kexec_setup_resources(struct resource *hypervisor,
8919--- sle11-2009-06-04.orig/arch/x86/kernel/microcode-xen.c 2009-03-16 16:38:05.000000000 +0100
8920+++ sle11-2009-06-04/arch/x86/kernel/microcode-xen.c 2009-06-04 10:21:39.000000000 +0200
8921@@ -5,13 +5,14 @@
8922 * 2006 Shaohua Li <shaohua.li@intel.com>
8923 *
8924 * This driver allows to upgrade microcode on Intel processors
8925- * belonging to IA-32 family - PentiumPro, Pentium II,
8926+ * belonging to IA-32 family - PentiumPro, Pentium II,
8927 * Pentium III, Xeon, Pentium 4, etc.
8928 *
8929- * Reference: Section 8.10 of Volume III, Intel Pentium 4 Manual,
8930- * Order Number 245472 or free download from:
8931- *
8932- * http://developer.intel.com/design/pentium4/manuals/245472.htm
8933+ * Reference: Section 8.11 of Volume 3a, IA-32 Intel® Architecture
8934+ * Software Developer's Manual
8935+ * Order Number 253668 or free download from:
8936+ *
8937+ * http://developer.intel.com/design/pentium4/manuals/253668.htm
8938 *
8939 * For more information, go to http://www.urbanmyth.org/microcode
8940 *
8941@@ -26,6 +27,7 @@
8942 #include <linux/kernel.h>
8943 #include <linux/init.h>
8944 #include <linux/sched.h>
8945+#include <linux/smp_lock.h>
8946 #include <linux/cpumask.h>
8947 #include <linux/module.h>
8948 #include <linux/slab.h>
8949@@ -86,6 +88,7 @@ static int do_microcode_update (const vo
8950
8951 static int microcode_open (struct inode *unused1, struct file *unused2)
8952 {
8953+ cycle_kernel_lock();
8954 return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
8955 }
8956
8957@@ -162,7 +165,7 @@ static int request_microcode(void)
8958 c->x86, c->x86_model, c->x86_mask);
8959 error = request_firmware(&firmware, name, &microcode_pdev->dev);
8960 if (error) {
8961- pr_debug("microcode: ucode data file %s load failed\n", name);
8962+ pr_debug("microcode: data file %s load failed\n", name);
8963 return error;
8964 }
8965
8966@@ -183,6 +186,9 @@ static int __init microcode_init (void)
8967 {
8968 int error;
8969
8970+ printk(KERN_INFO
8971+ "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@aivazian.fsnet.co.uk>\n");
8972+
8973 error = microcode_dev_init();
8974 if (error)
8975 return error;
8976@@ -195,8 +201,6 @@ static int __init microcode_init (void)
8977
8978 request_microcode();
8979
8980- printk(KERN_INFO
8981- "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@aivazian.fsnet.co.uk>\n");
8982 return 0;
8983 }
8984
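The microcode_open() hunk follows the 2.6.27 BKL pushdown: open() methods now take and drop the big kernel lock themselves via cycle_kernel_lock() (hence the new smp_lock.h include) instead of relying on the VFS. The resulting handler is tiny; this sketch only restates the lines shown above for context and is not part of the patch.

/* Illustrative sketch, not part of the patch. */
static int microcode_open_sketch(struct inode *inode, struct file *file)
{
	cycle_kernel_lock();				/* BKL pushdown          */
	return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;	/* raw-I/O capable only  */
}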
8985--- sle11-2009-06-04.orig/arch/x86/kernel/mpparse-xen.c 2009-03-16 16:38:05.000000000 +0100
8986+++ sle11-2009-06-04/arch/x86/kernel/mpparse-xen.c 2009-06-04 10:21:39.000000000 +0200
8987@@ -25,6 +25,9 @@
8988 #include <asm/proto.h>
8989 #include <asm/acpi.h>
8990 #include <asm/bios_ebda.h>
8991+#include <asm/e820.h>
8992+#include <asm/trampoline.h>
8993+#include <asm/setup.h>
8994
8995 #include <mach_apic.h>
8996 #ifdef CONFIG_X86_32
8997@@ -32,27 +35,10 @@
8998 #include <mach_mpparse.h>
8999 #endif
9000
9001-/* Have we found an MP table */
9002-int smp_found_config;
9003-
9004-/*
9005- * Various Linux-internal data structures created from the
9006- * MP-table.
9007- */
9008-#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
9009-int mp_bus_id_to_type[MAX_MP_BUSSES];
9010-#endif
9011-
9012-DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
9013-int mp_bus_id_to_pci_bus[MAX_MP_BUSSES] = {[0 ... MAX_MP_BUSSES - 1] = -1 };
9014-
9015-static int mp_current_pci_id;
9016-
9017-int pic_mode;
9018-
9019-/*
9020- * Intel MP BIOS table parsing routines:
9021- */
9022+static void *_bus_to_virt(unsigned long ma)
9023+{
9024+ return is_ISA_range(ma, ma) ? isa_bus_to_virt(ma) : bus_to_virt(ma);
9025+}
9026
9027 /*
9028 * Checksum an MP configuration block.
9029@@ -68,19 +54,7 @@ static int __init mpf_checksum(unsigned
9030 return sum & 0xFF;
9031 }
9032
9033-#ifdef CONFIG_X86_NUMAQ
9034-/*
9035- * Have to match translation table entries to main table entries by counter
9036- * hence the mpc_record variable .... can't see a less disgusting way of
9037- * doing this ....
9038- */
9039-
9040-static int mpc_record;
9041-static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY]
9042- __cpuinitdata;
9043-#endif
9044-
9045-static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
9046+static void __init MP_processor_info(struct mpc_config_processor *m)
9047 {
9048 #ifndef CONFIG_XEN
9049 	int apicid;
9050@@ -90,11 +64,12 @@ static void __cpuinit MP_processor_info(
9051 disabled_cpus++;
9052 return;
9053 }
9054-#ifdef CONFIG_X86_NUMAQ
9055- apicid = mpc_apic_id(m, translation_table[mpc_record]);
9056-#else
9057- apicid = m->mpc_apicid;
9058-#endif
9059+
9060+ if (x86_quirks->mpc_apic_id)
9061+ apicid = x86_quirks->mpc_apic_id(m);
9062+ else
9063+ apicid = m->mpc_apicid;
9064+
9065 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
9066 bootup_cpu = " (Bootup-CPU)";
9067 boot_cpu_physical_apicid = m->mpc_apicid;
9068@@ -107,18 +82,17 @@ static void __cpuinit MP_processor_info(
9069 #endif
9070 }
9071
9072+#ifdef CONFIG_X86_IO_APIC
9073 static void __init MP_bus_info(struct mpc_config_bus *m)
9074 {
9075 char str[7];
9076-
9077 memcpy(str, m->mpc_bustype, 6);
9078 str[6] = 0;
9079
9080-#ifdef CONFIG_X86_NUMAQ
9081- mpc_oem_bus_info(m, str, translation_table[mpc_record]);
9082-#else
9083- Dprintk("Bus #%d is %s\n", m->mpc_busid, str);
9084-#endif
9085+ if (x86_quirks->mpc_oem_bus_info)
9086+ x86_quirks->mpc_oem_bus_info(m, str);
9087+ else
9088+ apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->mpc_busid, str);
9089
9090 #if MAX_MP_BUSSES < 256
9091 if (m->mpc_busid >= MAX_MP_BUSSES) {
9092@@ -135,12 +109,10 @@ static void __init MP_bus_info(struct mp
9093 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
9094 #endif
9095 } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) {
9096-#ifdef CONFIG_X86_NUMAQ
9097- mpc_oem_pci_bus(m, translation_table[mpc_record]);
9098-#endif
9099+ if (x86_quirks->mpc_oem_pci_bus)
9100+ x86_quirks->mpc_oem_pci_bus(m);
9101+
9102 clear_bit(m->mpc_busid, mp_bus_not_pci);
9103- mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
9104- mp_current_pci_id++;
9105 #if defined(CONFIG_EISA) || defined (CONFIG_MCA)
9106 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
9107 } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) {
9108@@ -151,6 +123,7 @@ static void __init MP_bus_info(struct mp
9109 } else
9110 printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
9111 }
9112+#endif
9113
9114 #ifdef CONFIG_X86_IO_APIC
9115
9116@@ -180,117 +153,111 @@ static void __init MP_ioapic_info(struct
9117 if (bad_ioapic(m->mpc_apicaddr))
9118 return;
9119
9120- mp_ioapics[nr_ioapics] = *m;
9121+ mp_ioapics[nr_ioapics].mp_apicaddr = m->mpc_apicaddr;
9122+ mp_ioapics[nr_ioapics].mp_apicid = m->mpc_apicid;
9123+ mp_ioapics[nr_ioapics].mp_type = m->mpc_type;
9124+ mp_ioapics[nr_ioapics].mp_apicver = m->mpc_apicver;
9125+ mp_ioapics[nr_ioapics].mp_flags = m->mpc_flags;
9126 nr_ioapics++;
9127 }
9128
9129-static void __init MP_intsrc_info(struct mpc_config_intsrc *m)
9130+static void print_MP_intsrc_info(struct mpc_config_intsrc *m)
9131 {
9132- mp_irqs[mp_irq_entries] = *m;
9133- Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
9134+ apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
9135 " IRQ %02x, APIC ID %x, APIC INT %02x\n",
9136 m->mpc_irqtype, m->mpc_irqflag & 3,
9137 (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
9138 m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
9139- if (++mp_irq_entries == MAX_IRQ_SOURCES)
9140- panic("Max # of irq sources exceeded!!\n");
9141 }
9142
9143-#endif
9144-
9145-static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m)
9146+static void __init print_mp_irq_info(struct mp_config_intsrc *mp_irq)
9147 {
9148- Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
9149- " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
9150- m->mpc_irqtype, m->mpc_irqflag & 3,
9151- (m->mpc_irqflag >> 2) & 3, m->mpc_srcbusid,
9152- m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
9153+ apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
9154+ " IRQ %02x, APIC ID %x, APIC INT %02x\n",
9155+ mp_irq->mp_irqtype, mp_irq->mp_irqflag & 3,
9156+ (mp_irq->mp_irqflag >> 2) & 3, mp_irq->mp_srcbus,
9157+ mp_irq->mp_srcbusirq, mp_irq->mp_dstapic, mp_irq->mp_dstirq);
9158 }
9159
9160-#ifdef CONFIG_X86_NUMAQ
9161-static void __init MP_translation_info(struct mpc_config_translation *m)
9162+static void __init assign_to_mp_irq(struct mpc_config_intsrc *m,
9163+ struct mp_config_intsrc *mp_irq)
9164 {
9165- printk(KERN_INFO
9166- "Translation: record %d, type %d, quad %d, global %d, local %d\n",
9167- mpc_record, m->trans_type, m->trans_quad, m->trans_global,
9168- m->trans_local);
9169+ mp_irq->mp_dstapic = m->mpc_dstapic;
9170+ mp_irq->mp_type = m->mpc_type;
9171+ mp_irq->mp_irqtype = m->mpc_irqtype;
9172+ mp_irq->mp_irqflag = m->mpc_irqflag;
9173+ mp_irq->mp_srcbus = m->mpc_srcbus;
9174+ mp_irq->mp_srcbusirq = m->mpc_srcbusirq;
9175+ mp_irq->mp_dstirq = m->mpc_dstirq;
9176+}
9177
9178- if (mpc_record >= MAX_MPC_ENTRY)
9179- printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
9180- else
9181- translation_table[mpc_record] = m; /* stash this for later */
9182- if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
9183- node_set_online(m->trans_quad);
9184+static void __init assign_to_mpc_intsrc(struct mp_config_intsrc *mp_irq,
9185+ struct mpc_config_intsrc *m)
9186+{
9187+ m->mpc_dstapic = mp_irq->mp_dstapic;
9188+ m->mpc_type = mp_irq->mp_type;
9189+ m->mpc_irqtype = mp_irq->mp_irqtype;
9190+ m->mpc_irqflag = mp_irq->mp_irqflag;
9191+ m->mpc_srcbus = mp_irq->mp_srcbus;
9192+ m->mpc_srcbusirq = mp_irq->mp_srcbusirq;
9193+ m->mpc_dstirq = mp_irq->mp_dstirq;
9194 }
9195
9196-/*
9197- * Read/parse the MPC oem tables
9198- */
9199+static int __init mp_irq_mpc_intsrc_cmp(struct mp_config_intsrc *mp_irq,
9200+ struct mpc_config_intsrc *m)
9201+{
9202+ if (mp_irq->mp_dstapic != m->mpc_dstapic)
9203+ return 1;
9204+ if (mp_irq->mp_type != m->mpc_type)
9205+ return 2;
9206+ if (mp_irq->mp_irqtype != m->mpc_irqtype)
9207+ return 3;
9208+ if (mp_irq->mp_irqflag != m->mpc_irqflag)
9209+ return 4;
9210+ if (mp_irq->mp_srcbus != m->mpc_srcbus)
9211+ return 5;
9212+ if (mp_irq->mp_srcbusirq != m->mpc_srcbusirq)
9213+ return 6;
9214+ if (mp_irq->mp_dstirq != m->mpc_dstirq)
9215+ return 7;
cc90b958
BS
9216
9217-static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable,
9218- unsigned short oemsize)
00e5a55c
BS
9219+ return 0;
9220+}
9221+
cc90b958
BS
9222+static void __init MP_intsrc_info(struct mpc_config_intsrc *m)
9223 {
9224- int count = sizeof(*oemtable); /* the header size */
9225- unsigned char *oemptr = ((unsigned char *)oemtable) + count;
9226+ int i;
9227
9228- mpc_record = 0;
9229- printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n",
9230- oemtable);
9231- if (memcmp(oemtable->oem_signature, MPC_OEM_SIGNATURE, 4)) {
9232- printk(KERN_WARNING
9233- "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
9234- oemtable->oem_signature[0], oemtable->oem_signature[1],
9235- oemtable->oem_signature[2], oemtable->oem_signature[3]);
9236- return;
9237- }
9238- if (mpf_checksum((unsigned char *)oemtable, oemtable->oem_length)) {
9239- printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
9240- return;
9241- }
9242- while (count < oemtable->oem_length) {
9243- switch (*oemptr) {
9244- case MP_TRANSLATION:
9245- {
9246- struct mpc_config_translation *m =
9247- (struct mpc_config_translation *)oemptr;
9248- MP_translation_info(m);
9249- oemptr += sizeof(*m);
9250- count += sizeof(*m);
9251- ++mpc_record;
9252- break;
9253- }
9254- default:
9255- {
9256- printk(KERN_WARNING
9257- "Unrecognised OEM table entry type! - %d\n",
9258- (int)*oemptr);
9259- return;
9260- }
9261- }
9262+ print_MP_intsrc_info(m);
9263+
9264+ for (i = 0; i < mp_irq_entries; i++) {
9265+ if (!mp_irq_mpc_intsrc_cmp(&mp_irqs[i], m))
9266+ return;
9267 }
9268+
9269+ assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]);
9270+ if (++mp_irq_entries == MAX_IRQ_SOURCES)
9271+ panic("Max # of irq sources exceeded!!\n");
9272 }
9273
9274-static inline void mps_oem_check(struct mp_config_table *mpc, char *oem,
9275- char *productid)
9276+#endif
9277+
9278+static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m)
9279 {
9280- if (strncmp(oem, "IBM NUMA", 8))
9281- printk("Warning! May not be a NUMA-Q system!\n");
9282- if (mpc->mpc_oemptr)
9283- smp_read_mpc_oem((struct mp_config_oemtable *)mpc->mpc_oemptr,
9284- mpc->mpc_oemsize);
9285+ apic_printk(APIC_VERBOSE, "Lint: type %d, pol %d, trig %d, bus %02x,"
9286+ " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
9287+ m->mpc_irqtype, m->mpc_irqflag & 3,
9288+ (m->mpc_irqflag >> 2) & 3, m->mpc_srcbusid,
9289+ m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
9290 }
9291-#endif /* CONFIG_X86_NUMAQ */
9292
9293 /*
9294 * Read/parse the MPC
9295 */
9296
9297-static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
9298+static int __init smp_check_mpc(struct mp_config_table *mpc, char *oem,
9299+ char *str)
9300 {
9301- char str[16];
9302- char oem[10];
9303- int count = sizeof(*mpc);
9304- unsigned char *mpt = ((unsigned char *)mpc) + count;
9305
9306 if (memcmp(mpc->mpc_signature, MPC_SIGNATURE, 4)) {
9307 printk(KERN_ERR "MPTABLE: bad signature [%c%c%c%c]!\n",
9308@@ -313,19 +280,41 @@ static int __init smp_read_mpc(struct mp
9309 }
9310 memcpy(oem, mpc->mpc_oem, 8);
9311 oem[8] = 0;
9312- printk(KERN_INFO "MPTABLE: OEM ID: %s ", oem);
9313+ printk(KERN_INFO "MPTABLE: OEM ID: %s\n", oem);
9314
9315 memcpy(str, mpc->mpc_productid, 12);
9316 str[12] = 0;
9317- printk("Product ID: %s ", str);
9318
9319-#ifdef CONFIG_X86_32
9320- mps_oem_check(mpc, oem, str);
9321-#endif
9322- printk(KERN_INFO "MPTABLE: Product ID: %s ", str);
9323+ printk(KERN_INFO "MPTABLE: Product ID: %s\n", str);
9324
9325 printk(KERN_INFO "MPTABLE: APIC at: 0x%X\n", mpc->mpc_lapic);
9326
9327+ return 1;
9328+}
9329+
9330+static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
9331+{
9332+ char str[16];
9333+ char oem[10];
9334+
9335+ int count = sizeof(*mpc);
9336+ unsigned char *mpt = ((unsigned char *)mpc) + count;
9337+
9338+ if (!smp_check_mpc(mpc, oem, str))
9339+ return 0;
9340+
9341+#ifdef CONFIG_X86_32
9342+ /*
9343+ * need to make sure summit and es7000's mps_oem_check is safe to be
9344+ * called early via genericarch 's mps_oem_check
9345+ */
9346+ if (early) {
9347+#ifdef CONFIG_X86_NUMAQ
9348+ numaq_mps_oem_check(mpc, oem, str);
9349+#endif
9350+ } else
9351+ mps_oem_check(mpc, oem, str);
9352+#endif
9353 /* save the local APIC address, it might be non-default */
9354 if (!acpi_lapic)
9355 mp_lapic_addr = mpc->mpc_lapic;
9356@@ -333,12 +322,17 @@ static int __init smp_read_mpc(struct mp
9357 if (early)
9358 return 1;
9359
9360+ if (mpc->mpc_oemptr && x86_quirks->smp_read_mpc_oem) {
9361+ struct mp_config_oemtable *oem_table = (struct mp_config_oemtable *)(unsigned long)mpc->mpc_oemptr;
9362+ x86_quirks->smp_read_mpc_oem(oem_table, mpc->mpc_oemsize);
9363+ }
9364+
9365 /*
9366 * Now process the configuration blocks.
9367 */
9368-#ifdef CONFIG_X86_NUMAQ
9369- mpc_record = 0;
9370-#endif
9371+ if (x86_quirks->mpc_record)
9372+ *x86_quirks->mpc_record = 0;
9373+
9374 while (count < mpc->mpc_length) {
9375 switch (*mpt) {
9376 case MP_PROCESSOR:
9377@@ -356,7 +350,9 @@ static int __init smp_read_mpc(struct mp
9378 {
9379 struct mpc_config_bus *m =
9380 (struct mpc_config_bus *)mpt;
9381+#ifdef CONFIG_X86_IO_APIC
9382 MP_bus_info(m);
9383+#endif
9384 mpt += sizeof(*m);
9385 count += sizeof(*m);
9386 break;
9387@@ -402,10 +398,14 @@ static int __init smp_read_mpc(struct mp
9388 count = mpc->mpc_length;
9389 break;
9390 }
9391-#ifdef CONFIG_X86_NUMAQ
9392- ++mpc_record;
9393-#endif
9394+ if (x86_quirks->mpc_record)
9395+ (*x86_quirks->mpc_record)++;
9396 }
9397+
9398+#ifdef CONFIG_X86_GENERICARCH
9399+ generic_bigsmp_probe();
9400+#endif
9401+
9402 setup_apic_routing();
9403 if (!num_processors)
9404 printk(KERN_ERR "MPTABLE: no processors registered!\n");
9405@@ -431,7 +431,7 @@ static void __init construct_default_ioi
9406 intsrc.mpc_type = MP_INTSRC;
9407 intsrc.mpc_irqflag = 0; /* conforming */
9408 intsrc.mpc_srcbus = 0;
9409- intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
9410+ intsrc.mpc_dstapic = mp_ioapics[0].mp_apicid;
9411
9412 intsrc.mpc_irqtype = mp_INT;
9413
9414@@ -492,40 +492,11 @@ static void __init construct_default_ioi
9415 MP_intsrc_info(&intsrc);
9416 }
9417
9418-#endif
9419
9420-static inline void __init construct_default_ISA_mptable(int mpc_default_type)
9421+static void __init construct_ioapic_table(int mpc_default_type)
9422 {
9423- struct mpc_config_processor processor;
9424- struct mpc_config_bus bus;
9425-#ifdef CONFIG_X86_IO_APIC
9426 struct mpc_config_ioapic ioapic;
9427-#endif
9428- struct mpc_config_lintsrc lintsrc;
9429- int linttypes[2] = { mp_ExtINT, mp_NMI };
9430- int i;
9431-
9432- /*
9433- * local APIC has default address
9434- */
9435- mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
9436-
9437- /*
9438- * 2 CPUs, numbered 0 & 1.
9439- */
9440- processor.mpc_type = MP_PROCESSOR;
9441- /* Either an integrated APIC or a discrete 82489DX. */
9442- processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
9443- processor.mpc_cpuflag = CPU_ENABLED;
9444- processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
9445- (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
9446- processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
9447- processor.mpc_reserved[0] = 0;
9448- processor.mpc_reserved[1] = 0;
9449- for (i = 0; i < 2; i++) {
9450- processor.mpc_apicid = i;
9451- MP_processor_info(&processor);
9452- }
9453+ struct mpc_config_bus bus;
9454
9455 bus.mpc_type = MP_BUS;
9456 bus.mpc_busid = 0;
9457@@ -554,7 +525,6 @@ static inline void __init construct_defa
9458 MP_bus_info(&bus);
9459 }
9460
9461-#ifdef CONFIG_X86_IO_APIC
9462 ioapic.mpc_type = MP_IOAPIC;
9463 ioapic.mpc_apicid = 2;
9464 ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
9465@@ -566,7 +536,42 @@ static inline void __init construct_defa
9466 * We set up most of the low 16 IO-APIC pins according to MPS rules.
9467 */
9468 construct_default_ioirq_mptable(mpc_default_type);
9469+}
9470+#else
9471+static inline void __init construct_ioapic_table(int mpc_default_type) { }
9472 #endif
9473+
9474+static inline void __init construct_default_ISA_mptable(int mpc_default_type)
9475+{
9476+ struct mpc_config_processor processor;
9477+ struct mpc_config_lintsrc lintsrc;
9478+ int linttypes[2] = { mp_ExtINT, mp_NMI };
9479+ int i;
9480+
9481+ /*
9482+ * local APIC has default address
9483+ */
9484+ mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
9485+
9486+ /*
9487+ * 2 CPUs, numbered 0 & 1.
9488+ */
9489+ processor.mpc_type = MP_PROCESSOR;
9490+ /* Either an integrated APIC or a discrete 82489DX. */
9491+ processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
9492+ processor.mpc_cpuflag = CPU_ENABLED;
9493+ processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
9494+ (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
9495+ processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
9496+ processor.mpc_reserved[0] = 0;
9497+ processor.mpc_reserved[1] = 0;
9498+ for (i = 0; i < 2; i++) {
9499+ processor.mpc_apicid = i;
9500+ MP_processor_info(&processor);
9501+ }
9502+
9503+ construct_ioapic_table(mpc_default_type);
9504+
9505 lintsrc.mpc_type = MP_LINTSRC;
9506 lintsrc.mpc_irqflag = 0; /* conforming */
9507 lintsrc.mpc_srcbusid = 0;
9508@@ -584,10 +589,14 @@ static struct intel_mp_floating *mpf_fou
9509 /*
9510 * Scan the memory blocks for an SMP configuration block.
9511 */
9512-static void __init __get_smp_config(unsigned early)
9513+static void __init __get_smp_config(unsigned int early)
9514 {
9515 struct intel_mp_floating *mpf = mpf_found;
9516
9517+ if (x86_quirks->mach_get_smp_config) {
9518+ if (x86_quirks->mach_get_smp_config(early))
9519+ return;
9520+ }
9521 if (acpi_lapic && early)
9522 return;
9523 /*
9524@@ -604,7 +613,7 @@ static void __init __get_smp_config(unsi
9525
9526 printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n",
9527 mpf->mpf_specification);
9528-#ifdef CONFIG_X86_32
9529+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
9530 if (mpf->mpf_feature2 & (1 << 7)) {
9531 printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
9532 pic_mode = 1;
9533@@ -635,8 +644,10 @@ static void __init __get_smp_config(unsi
9534 * Read the physical hardware table. Anything here will
9535 * override the defaults.
9536 */
9537- if (!smp_read_mpc(isa_bus_to_virt(mpf->mpf_physptr), early)) {
9538+ if (!smp_read_mpc(_bus_to_virt(mpf->mpf_physptr), early)) {
9539+#ifdef CONFIG_X86_LOCAL_APIC
9540 smp_found_config = 0;
9541+#endif
9542 printk(KERN_ERR
9543 "BIOS bug, MP table errors detected!...\n");
9544 printk(KERN_ERR "... disabling SMP support. "
9545@@ -690,10 +701,11 @@ void __init get_smp_config(void)
9546 static int __init smp_scan_config(unsigned long base, unsigned long length,
9547 unsigned reserve)
9548 {
9549- unsigned int *bp = isa_bus_to_virt(base);
9550+ unsigned int *bp = _bus_to_virt(base);
9551 struct intel_mp_floating *mpf;
9552
9553- Dprintk("Scan SMP from %p for %ld bytes.\n", bp, length);
9554+ apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n",
9555+ bp, length);
9556 BUILD_BUG_ON(sizeof(*mpf) != 16);
9557
9558 while (length > 0) {
9559@@ -703,16 +715,22 @@ static int __init smp_scan_config(unsign
9560 !mpf_checksum((unsigned char *)bp, 16) &&
9561 ((mpf->mpf_specification == 1)
9562 || (mpf->mpf_specification == 4))) {
9563-
9564+#ifdef CONFIG_X86_LOCAL_APIC
9565 smp_found_config = 1;
9566+#endif
9567 mpf_found = mpf;
9568-#ifdef CONFIG_X86_32
9569+
9570 #ifndef CONFIG_XEN
9571 printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
9572 mpf, virt_to_phys(mpf));
9573- reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE,
9574+
9575+ if (!reserve)
9576+ return 1;
9577+ reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE,
9578 BOOTMEM_DEFAULT);
9579 if (mpf->mpf_physptr) {
9580+ unsigned long size = PAGE_SIZE;
9581+#ifdef CONFIG_X86_32
9582 /*
9583 * We cannot access to MPC table to compute
9584 * table size yet, as only few megabytes from
9585@@ -722,27 +740,18 @@ static int __init smp_scan_config(unsign
9586 * PAGE_SIZE from mpg->mpf_physptr yields BUG()
9587 * in reserve_bootmem.
9588 */
9589- unsigned long size = PAGE_SIZE;
9590 unsigned long end = max_low_pfn * PAGE_SIZE;
9591 if (mpf->mpf_physptr + size > end)
9592 size = end - mpf->mpf_physptr;
9593- reserve_bootmem(mpf->mpf_physptr, size,
9594+#endif
9595+ reserve_bootmem_generic(mpf->mpf_physptr, size,
9596 BOOTMEM_DEFAULT);
9597 }
9598 #else
9599 printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
9600- mpf, ((void *)bp - isa_bus_to_virt(base)) + base);
9601-#endif
9602-#elif !defined(CONFIG_XEN)
9603- if (!reserve)
9604- return 1;
9605-
9606- reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE);
9607- if (mpf->mpf_physptr)
9608- reserve_bootmem_generic(mpf->mpf_physptr,
9609- PAGE_SIZE);
9610+ mpf, ((void *)bp - _bus_to_virt(base)) + base);
9611 #endif
9612- return 1;
9613+ return 1;
9614 }
9615 bp += 4;
9616 length -= 16;
9617@@ -750,12 +759,16 @@ static int __init smp_scan_config(unsign
9618 return 0;
9619 }
9620
9621-static void __init __find_smp_config(unsigned reserve)
9622+static void __init __find_smp_config(unsigned int reserve)
9623 {
9624 #ifndef CONFIG_XEN
9625 unsigned int address;
9626 #endif
9627
9628+ if (x86_quirks->mach_find_smp_config) {
9629+ if (x86_quirks->mach_find_smp_config(reserve))
9630+ return;
9631+ }
9632 	/*
9633 * FIXME: Linux assumes you have 640K of base ram..
9634 * this continues the error...
9635@@ -802,300 +815,297 @@ void __init find_smp_config(void)
9636 __find_smp_config(1);
9637 }
9638
9639-/* --------------------------------------------------------------------------
9640- ACPI-based MP Configuration
9641- -------------------------------------------------------------------------- */
9642-
9643-/*
9644- * Keep this outside and initialized to 0, for !CONFIG_ACPI builds:
9645- */
9646-int es7000_plat;
9647-
9648-#ifdef CONFIG_ACPI
9649+#ifdef CONFIG_X86_IO_APIC
9650+static u8 __initdata irq_used[MAX_IRQ_SOURCES];
9651
9652-#ifdef CONFIG_X86_IO_APIC
9653+static int __init get_MP_intsrc_index(struct mpc_config_intsrc *m)
9654+{
9655+ int i;
9656
9657-#define MP_ISA_BUS 0
9658+ if (m->mpc_irqtype != mp_INT)
9659+ return 0;
9660
9661-extern struct mp_ioapic_routing mp_ioapic_routing[MAX_IO_APICS];
9662+ if (m->mpc_irqflag != 0x0f)
9663+ return 0;
9664
9665-static int mp_find_ioapic(int gsi)
9666-{
9667- int i = 0;
9668+ /* not legacy */
9669
9670- /* Find the IOAPIC that manages this GSI. */
9671- for (i = 0; i < nr_ioapics; i++) {
9672- if ((gsi >= mp_ioapic_routing[i].gsi_base)
9673- && (gsi <= mp_ioapic_routing[i].gsi_end))
9674- return i;
9675+ for (i = 0; i < mp_irq_entries; i++) {
9676+ if (mp_irqs[i].mp_irqtype != mp_INT)
9677+ continue;
9678+
9679+ if (mp_irqs[i].mp_irqflag != 0x0f)
9680+ continue;
9681+
9682+ if (mp_irqs[i].mp_srcbus != m->mpc_srcbus)
9683+ continue;
9684+ if (mp_irqs[i].mp_srcbusirq != m->mpc_srcbusirq)
9685+ continue;
9686+ if (irq_used[i]) {
9687+ /* already claimed */
9688+ return -2;
9689+ }
9690+ irq_used[i] = 1;
9691+ return i;
9692 }
9693
9694- printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
9695+ /* not found */
9696 return -1;
9697 }
9698
9699-static u8 __init uniq_ioapic_id(u8 id)
9700-{
9701-#ifdef CONFIG_X86_32
9702- if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
9703- !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
9704- return io_apic_get_unique_id(nr_ioapics, id);
9705- else
9706- return id;
9707-#else
9708- int i;
9709- DECLARE_BITMAP(used, 256);
9710- bitmap_zero(used, 256);
9711- for (i = 0; i < nr_ioapics; i++) {
9712- struct mpc_config_ioapic *ia = &mp_ioapics[i];
9713- __set_bit(ia->mpc_apicid, used);
9714- }
9715- if (!test_bit(id, used))
9716- return id;
9717- return find_first_zero_bit(used, 256);
9718+#define SPARE_SLOT_NUM 20
9719+
9720+static struct mpc_config_intsrc __initdata *m_spare[SPARE_SLOT_NUM];
9721 #endif
9722-}
9723
9724-void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
9725+static int __init replace_intsrc_all(struct mp_config_table *mpc,
9726+ unsigned long mpc_new_phys,
9727+ unsigned long mpc_new_length)
9728 {
9729- int idx = 0;
9730-
9731- if (bad_ioapic(address))
9732- return;
9733+#ifdef CONFIG_X86_IO_APIC
9734+ int i;
9735+ int nr_m_spare = 0;
9736+#endif
9737
9738- idx = nr_ioapics;
9739+ int count = sizeof(*mpc);
9740+ unsigned char *mpt = ((unsigned char *)mpc) + count;
9741
9742- mp_ioapics[idx].mpc_type = MP_IOAPIC;
9743- mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
9744- mp_ioapics[idx].mpc_apicaddr = address;
9745+ printk(KERN_INFO "mpc_length %x\n", mpc->mpc_length);
9746+ while (count < mpc->mpc_length) {
9747+ switch (*mpt) {
9748+ case MP_PROCESSOR:
9749+ {
9750+ struct mpc_config_processor *m =
9751+ (struct mpc_config_processor *)mpt;
9752+ mpt += sizeof(*m);
9753+ count += sizeof(*m);
9754+ break;
9755+ }
9756+ case MP_BUS:
9757+ {
9758+ struct mpc_config_bus *m =
9759+ (struct mpc_config_bus *)mpt;
9760+ mpt += sizeof(*m);
9761+ count += sizeof(*m);
9762+ break;
9763+ }
9764+ case MP_IOAPIC:
9765+ {
9766+ mpt += sizeof(struct mpc_config_ioapic);
9767+ count += sizeof(struct mpc_config_ioapic);
9768+ break;
9769+ }
9770+ case MP_INTSRC:
9771+ {
9772+#ifdef CONFIG_X86_IO_APIC
9773+ struct mpc_config_intsrc *m =
9774+ (struct mpc_config_intsrc *)mpt;
9775
9776-#ifndef CONFIG_XEN
9777- set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
9778+			apic_printk(APIC_VERBOSE, "OLD ");
9779+ print_MP_intsrc_info(m);
9780+ i = get_MP_intsrc_index(m);
9781+ if (i > 0) {
9782+ assign_to_mpc_intsrc(&mp_irqs[i], m);
9783+				apic_printk(APIC_VERBOSE, "NEW ");
9784+ print_mp_irq_info(&mp_irqs[i]);
9785+ } else if (!i) {
9786+ /* legacy, do nothing */
9787+ } else if (nr_m_spare < SPARE_SLOT_NUM) {
9788+ /*
9789+ * not found (-1), or duplicated (-2)
9790+ * are invalid entries,
9791+ * we need to use the slot later
9792+ */
9793+ m_spare[nr_m_spare] = m;
9794+ nr_m_spare++;
9795+ }
9796 #endif
9797- mp_ioapics[idx].mpc_apicid = uniq_ioapic_id(id);
9798-#ifdef CONFIG_X86_32
9799- mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
9800-#else
9801- mp_ioapics[idx].mpc_apicver = 0;
9802+ mpt += sizeof(struct mpc_config_intsrc);
9803+ count += sizeof(struct mpc_config_intsrc);
9804+ break;
9805+ }
9806+ case MP_LINTSRC:
9807+ {
9808+ struct mpc_config_lintsrc *m =
9809+ (struct mpc_config_lintsrc *)mpt;
9810+ mpt += sizeof(*m);
9811+ count += sizeof(*m);
9812+ break;
9813+ }
9814+ default:
9815+ /* wrong mptable */
9816+ printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n");
9817+ printk(KERN_ERR "type %x\n", *mpt);
9818+ print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16,
9819+ 1, mpc, mpc->mpc_length, 1);
9820+ goto out;
9821+ }
9822+ }
9823+
9824+#ifdef CONFIG_X86_IO_APIC
9825+ for (i = 0; i < mp_irq_entries; i++) {
9826+ if (irq_used[i])
9827+ continue;
9828+
9829+ if (mp_irqs[i].mp_irqtype != mp_INT)
9830+ continue;
9831+
9832+ if (mp_irqs[i].mp_irqflag != 0x0f)
9833+ continue;
9834+
9835+ if (nr_m_spare > 0) {
9836+			apic_printk(APIC_VERBOSE, "*NEW* found\n");
9837+ nr_m_spare--;
9838+ assign_to_mpc_intsrc(&mp_irqs[i], m_spare[nr_m_spare]);
9839+ m_spare[nr_m_spare] = NULL;
9840+ } else {
9841+ struct mpc_config_intsrc *m =
9842+ (struct mpc_config_intsrc *)mpt;
9843+ count += sizeof(struct mpc_config_intsrc);
9844+ if (!mpc_new_phys) {
9845+ printk(KERN_INFO "No spare slots, try to append...take your risk, new mpc_length %x\n", count);
9846+ } else {
9847+ if (count <= mpc_new_length)
9848+ printk(KERN_INFO "No spare slots, try to append..., new mpc_length %x\n", count);
9849+ else {
9850+ printk(KERN_ERR "mpc_new_length %lx is too small\n", mpc_new_length);
9851+ goto out;
9852+ }
9853+ }
9854+ assign_to_mpc_intsrc(&mp_irqs[i], m);
9855+ mpc->mpc_length = count;
9856+ mpt += sizeof(struct mpc_config_intsrc);
9857+ }
9858+ print_mp_irq_info(&mp_irqs[i]);
9859+ }
9860 #endif
9861- /*
9862- * Build basic GSI lookup table to facilitate gsi->io_apic lookups
9863- * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
9864- */
9865- mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
9866- mp_ioapic_routing[idx].gsi_base = gsi_base;
9867- mp_ioapic_routing[idx].gsi_end = gsi_base +
9868- io_apic_get_redir_entries(idx);
9869-
9870- printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
9871- "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
9872- mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
9873- mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end);
9874+out:
9875+ /* update checksum */
9876+ mpc->mpc_checksum = 0;
9877+ mpc->mpc_checksum -= mpf_checksum((unsigned char *)mpc,
9878+ mpc->mpc_length);
9879
9880- nr_ioapics++;
9881+ return 0;
9882 }
9883
9884-void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
9885-{
9886- struct mpc_config_intsrc intsrc;
9887- int ioapic = -1;
9888- int pin = -1;
9889-
9890- /*
9891- * Convert 'gsi' to 'ioapic.pin'.
9892- */
9893- ioapic = mp_find_ioapic(gsi);
9894- if (ioapic < 0)
9895- return;
9896- pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
9897+static int __initdata enable_update_mptable;
9898
9899- /*
9900- * TBD: This check is for faulty timer entries, where the override
9901- * erroneously sets the trigger to level, resulting in a HUGE
9902- * increase of timer interrupts!
9903- */
9904- if ((bus_irq == 0) && (trigger == 3))
9905- trigger = 1;
9906+static int __init update_mptable_setup(char *str)
9907+{
9908+ enable_update_mptable = 1;
9909+ return 0;
9910+}
9911+early_param("update_mptable", update_mptable_setup);
9912
9913- intsrc.mpc_type = MP_INTSRC;
9914- intsrc.mpc_irqtype = mp_INT;
9915- intsrc.mpc_irqflag = (trigger << 2) | polarity;
9916- intsrc.mpc_srcbus = MP_ISA_BUS;
9917- intsrc.mpc_srcbusirq = bus_irq; /* IRQ */
9918- intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */
9919- intsrc.mpc_dstirq = pin; /* INTIN# */
9920+static unsigned long __initdata mpc_new_phys;
9921+static unsigned long mpc_new_length __initdata = 4096;
9922
9923- MP_intsrc_info(&intsrc);
9924+/* alloc_mptable or alloc_mptable=4k */
9925+static int __initdata alloc_mptable;
9926+static int __init parse_alloc_mptable_opt(char *p)
9927+{
9928+ enable_update_mptable = 1;
9929+ alloc_mptable = 1;
9930+ if (!p)
9931+ return 0;
9932+	mpc_new_length = memparse(p, &p);
9933+ return 0;
9934 }
9935+early_param("alloc_mptable", parse_alloc_mptable_opt);
9936
9937-void __init mp_config_acpi_legacy_irqs(void)
9938+void __init early_reserve_e820_mpc_new(void)
9939 {
9940- struct mpc_config_intsrc intsrc;
9941- int i = 0;
9942- int ioapic = -1;
9943-
9944-#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
9945- /*
9946- * Fabricate the legacy ISA bus (bus #31).
9947- */
9948- mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
9949+ if (enable_update_mptable && alloc_mptable) {
9950+		u64 startt = 0;
9951+#ifdef CONFIG_X86_TRAMPOLINE
9952+ startt = TRAMPOLINE_BASE;
9953 #endif
9954- set_bit(MP_ISA_BUS, mp_bus_not_pci);
9955- Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
9956-
9957- /*
9958- * Older generations of ES7000 have no legacy identity mappings
9959- */
9960- if (es7000_plat == 1)
9961- return;
9962-
9963- /*
9964- * Locate the IOAPIC that manages the ISA IRQs (0-15).
9965- */
9966- ioapic = mp_find_ioapic(0);
9967- if (ioapic < 0)
9968- return;
9969-
9970- intsrc.mpc_type = MP_INTSRC;
9971- intsrc.mpc_irqflag = 0; /* Conforming */
9972- intsrc.mpc_srcbus = MP_ISA_BUS;
9973-#ifdef CONFIG_X86_IO_APIC
9974- intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
9975-#endif
9976- /*
9977- * Use the default configuration for the IRQs 0-15. Unless
9978- * overridden by (MADT) interrupt source override entries.
9979- */
9980- for (i = 0; i < 16; i++) {
9981- int idx;
9982-
9983- for (idx = 0; idx < mp_irq_entries; idx++) {
9984- struct mpc_config_intsrc *irq = mp_irqs + idx;
9985-
9986- /* Do we already have a mapping for this ISA IRQ? */
9987- if (irq->mpc_srcbus == MP_ISA_BUS
9988- && irq->mpc_srcbusirq == i)
9989- break;
9990-
9991- /* Do we already have a mapping for this IOAPIC pin */
9992- if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
9993- (irq->mpc_dstirq == i))
9994- break;
9995- }
9996-
9997- if (idx != mp_irq_entries) {
9998- printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
9999- continue; /* IRQ already used */
10000- }
10001-
10002- intsrc.mpc_irqtype = mp_INT;
10003- intsrc.mpc_srcbusirq = i; /* Identity mapped */
10004- intsrc.mpc_dstirq = i;
10005-
10006- MP_intsrc_info(&intsrc);
10007+		mpc_new_phys = early_reserve_e820(startt, mpc_new_length, 4);
10008 }
10009 }
10010
10011-int mp_register_gsi(u32 gsi, int triggering, int polarity)
10012+static int __init update_mp_table(void)
10013 {
10014- int ioapic;
10015- int ioapic_pin;
10016-#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
10017-#define MAX_GSI_NUM 4096
10018-#define IRQ_COMPRESSION_START 64
10019+ char str[16];
10020+ char oem[10];
10021+ struct intel_mp_floating *mpf;
10022+ struct mp_config_table *mpc;
10023+ struct mp_config_table *mpc_new;
10024+
10025+ if (!enable_update_mptable)
10026+ return 0;
10027+
10028+ mpf = mpf_found;
10029+ if (!mpf)
10030+ return 0;
10031
10032- static int pci_irq = IRQ_COMPRESSION_START;
10033 /*
10034- * Mapping between Global System Interrupts, which
10035- * represent all possible interrupts, and IRQs
10036- * assigned to actual devices.
10037+ * Now see if we need to go further.
10038 */
10039- static int gsi_to_irq[MAX_GSI_NUM];
10040-#else
10041+ if (mpf->mpf_feature1 != 0)
10042+ return 0;
10043
10044- if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
10045- return gsi;
10046-#endif
10047+ if (!mpf->mpf_physptr)
10048+ return 0;
10049
10050- /* Don't set up the ACPI SCI because it's already set up */
10051- if (acpi_gbl_FADT.sci_interrupt == gsi)
10052- return gsi;
10053+	mpc = _bus_to_virt(mpf->mpf_physptr);
10054
10055- ioapic = mp_find_ioapic(gsi);
10056- if (ioapic < 0) {
10057- printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
10058- return gsi;
10059- }
10060+ if (!smp_check_mpc(mpc, oem, str))
10061+ return 0;
10062
10063- ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
10064+	printk(KERN_INFO "mpf: %lx\n", (long)arbitrary_virt_to_machine(mpf));
10065+ printk(KERN_INFO "mpf_physptr: %x\n", mpf->mpf_physptr);
10066
10067-#ifndef CONFIG_X86_32
10068- if (ioapic_renumber_irq)
10069- gsi = ioapic_renumber_irq(ioapic, gsi);
10070-#endif
10071+ if (mpc_new_phys && mpc->mpc_length > mpc_new_length) {
10072+ mpc_new_phys = 0;
10073+ printk(KERN_INFO "mpc_new_length is %ld, please use alloc_mptable=8k\n",
10074+ mpc_new_length);
10075+ }
10076+
10077+ if (!mpc_new_phys) {
10078+ unsigned char old, new;
10079+ /* check if we can change the postion */
10080+ mpc->mpc_checksum = 0;
10081+ old = mpf_checksum((unsigned char *)mpc, mpc->mpc_length);
10082+ mpc->mpc_checksum = 0xff;
10083+ new = mpf_checksum((unsigned char *)mpc, mpc->mpc_length);
10084+ if (old == new) {
10085+ printk(KERN_INFO "mpc is readonly, please try alloc_mptable instead\n");
10086+ return 0;
10087+ }
10088+ printk(KERN_INFO "use in-positon replacing\n");
10089+ } else {
10090+ maddr_t mpc_new_bus;
10091
10092- /*
10093- * Avoid pin reprogramming. PRTs typically include entries
10094- * with redundant pin->gsi mappings (but unique PCI devices);
10095- * we only program the IOAPIC on the first.
10096- */
10097- if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
10098- printk(KERN_ERR "Invalid reference to IOAPIC pin "
10099- "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
10100- ioapic_pin);
10101- return gsi;
10102- }
10103- if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) {
10104- Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
10105- mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
10106-#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
10107- return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
10108-#else
10109- return gsi;
10110-#endif
10111+ mpc_new_bus = phys_to_machine(mpc_new_phys);
10112+ mpf->mpf_physptr = mpc_new_bus;
10113+ mpc_new = phys_to_virt(mpc_new_phys);
10114+ memcpy(mpc_new, mpc, mpc->mpc_length);
10115+ mpc = mpc_new;
10116+ /* check if we can modify that */
10117+ if (mpc_new_bus - mpf->mpf_physptr) {
10118+ struct intel_mp_floating *mpf_new;
10119+ /* steal 16 bytes from [0, 1k) */
10120+ printk(KERN_INFO "mpf new: %x\n", 0x400 - 16);
10121+ mpf_new = isa_bus_to_virt(0x400 - 16);
10122+ memcpy(mpf_new, mpf, 16);
10123+ mpf = mpf_new;
10124+ mpf->mpf_physptr = mpc_new_bus;
10125+ }
10126+ mpf->mpf_checksum = 0;
10127+ mpf->mpf_checksum -= mpf_checksum((unsigned char *)mpf, 16);
10128+ printk(KERN_INFO "mpf_physptr new: %x\n", mpf->mpf_physptr);
10129 }
10130
10131- set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed);
10132-#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
10133 /*
10134- * For GSI >= 64, use IRQ compression
10135+ * only replace the one with mp_INT and
10136+ * MP_IRQ_TRIGGER_LEVEL|MP_IRQ_POLARITY_LOW,
10137+ * already in mp_irqs , stored by ... and mp_config_acpi_gsi,
10138+ * may need pci=routeirq for all coverage
10139 */
10140- if ((gsi >= IRQ_COMPRESSION_START)
10141- && (triggering == ACPI_LEVEL_SENSITIVE)) {
10142- /*
10143- * For PCI devices assign IRQs in order, avoiding gaps
10144- * due to unused I/O APIC pins.
10145- */
10146- int irq = gsi;
10147- if (gsi < MAX_GSI_NUM) {
10148- /*
10149- * Retain the VIA chipset work-around (gsi > 15), but
10150- * avoid a problem where the 8254 timer (IRQ0) is setup
10151- * via an override (so it's not on pin 0 of the ioapic),
10152- * and at the same time, the pin 0 interrupt is a PCI
10153- * type. The gsi > 15 test could cause these two pins
10154- * to be shared as IRQ0, and they are not shareable.
10155- * So test for this condition, and if necessary, avoid
10156- * the pin collision.
10157- */
10158- gsi = pci_irq++;
10159- /*
10160- * Don't assign IRQ used by ACPI SCI
10161- */
10162- if (gsi == acpi_gbl_FADT.sci_interrupt)
10163- gsi = pci_irq++;
10164- gsi_to_irq[irq] = gsi;
10165- } else {
10166- printk(KERN_ERR "GSI %u is too high\n", gsi);
10167- return gsi;
10168- }
10169- }
10170-#endif
10171- io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
10172- triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
10173- polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
10174- return gsi;
10175+ replace_intsrc_all(mpc, mpc_new_phys, mpc_new_length);
10176+
10177+ return 0;
10178 }
10179
10180-#endif /* CONFIG_X86_IO_APIC */
10181-#endif /* CONFIG_ACPI */
10182+late_initcall(update_mp_table);
10183--- sle11-2009-06-04.orig/arch/x86/kernel/nmi.c 2009-06-04 11:08:07.000000000 +0200
10184+++ sle11-2009-06-04/arch/x86/kernel/nmi.c 2009-06-04 10:21:39.000000000 +0200
10185@@ -27,7 +27,9 @@
10186 #include <linux/kdebug.h>
10187 #include <linux/smp.h>
10188
10189+#ifndef CONFIG_XEN
10190 #include <asm/i8259.h>
10191+#endif
10192 #include <asm/io_apic.h>
10193 #include <asm/smp.h>
10194 #include <asm/nmi.h>
10195@@ -179,8 +181,10 @@ int __init check_nmi_watchdog(void)
10196 kfree(prev_nmi_count);
10197 return 0;
10198 error:
10199+#ifndef CONFIG_XEN
10200 if (nmi_watchdog == NMI_IO_APIC && !timer_through_8259)
10201 disable_8259A_irq(0);
10202+#endif
10203 #ifdef CONFIG_X86_32
10204 timer_ack = 0;
10205 #endif
10206--- sle11-2009-06-04.orig/arch/x86/kernel/pci-dma-xen.c 2009-03-16 16:38:05.000000000 +0100
10207+++ sle11-2009-06-04/arch/x86/kernel/pci-dma-xen.c 2009-06-04 10:21:39.000000000 +0200
10208@@ -5,13 +5,13 @@
10209
10210 #include <asm/proto.h>
10211 #include <asm/dma.h>
10212-#include <asm/gart.h>
10213+#include <asm/iommu.h>
10214 #include <asm/calgary.h>
10215+#include <asm/amd_iommu.h>
10216
10217-int forbid_dac __read_mostly;
10218-EXPORT_SYMBOL(forbid_dac);
10219+static int forbid_dac __read_mostly;
10220
10221-const struct dma_mapping_ops *dma_ops;
10222+struct dma_mapping_ops *dma_ops;
10223 EXPORT_SYMBOL(dma_ops);
10224
10225 static int iommu_sac_force __read_mostly;
10226@@ -74,13 +74,17 @@ early_param("dma32_size", parse_dma32_si
10227 void __init dma32_reserve_bootmem(void)
10228 {
10229 unsigned long size, align;
10230- if (end_pfn <= MAX_DMA32_PFN)
10231+ if (max_pfn <= MAX_DMA32_PFN)
10232 return;
10233
10234+ /*
10235+ * check aperture_64.c allocate_aperture() for reason about
10236+ * using 512M as goal
10237+ */
10238 align = 64ULL<<20;
10239 size = round_up(dma32_bootmem_size, align);
10240 dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align,
10241- __pa(MAX_DMA_ADDRESS));
10242+ 512ULL<<20);
10243 if (dma32_bootmem_ptr)
10244 dma32_bootmem_size = size;
10245 else
10246@@ -88,17 +92,14 @@ void __init dma32_reserve_bootmem(void)
10247 }
10248 static void __init dma32_free_bootmem(void)
10249 {
10250- int node;
10251
10252- if (end_pfn <= MAX_DMA32_PFN)
10253+ if (max_pfn <= MAX_DMA32_PFN)
10254 return;
10255
10256 if (!dma32_bootmem_ptr)
10257 return;
10258
10259- for_each_online_node(node)
10260- free_bootmem_node(NODE_DATA(node), __pa(dma32_bootmem_ptr),
10261- dma32_bootmem_size);
10262+ free_bootmem(__pa(dma32_bootmem_ptr), dma32_bootmem_size);
10263
10264 dma32_bootmem_ptr = NULL;
10265 dma32_bootmem_size = 0;
10266@@ -107,7 +108,7 @@ static void __init dma32_free_bootmem(vo
10267 #define dma32_free_bootmem() ((void)0)
10268 #endif
10269
10270-static const struct dma_mapping_ops swiotlb_dma_ops = {
10271+static struct dma_mapping_ops swiotlb_dma_ops = {
10272 .mapping_error = swiotlb_dma_mapping_error,
10273 .map_single = swiotlb_map_single_phys,
10274 .unmap_single = swiotlb_unmap_single,
10275@@ -130,25 +131,31 @@ void __init pci_iommu_alloc(void)
10276 * The order of these functions is important for
10277 * fall-back/fail-over reasons
10278 */
10279-#ifdef CONFIG_GART_IOMMU
10280 gart_iommu_hole_init();
10281-#endif
10282
10283-#ifdef CONFIG_CALGARY_IOMMU
10284 detect_calgary();
10285-#endif
10286
10287 detect_intel_iommu();
10288
10289-#ifdef CONFIG_SWIOTLB
10290+ amd_iommu_detect();
10291+
10292 swiotlb_init();
10293 if (swiotlb) {
10294 printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n");
10295 dma_ops = &swiotlb_dma_ops;
10296 }
10297-#endif
10298 }
10299
10300+#ifndef CONFIG_XEN
10301+unsigned long iommu_num_pages(unsigned long addr, unsigned long len)
10302+{
10303+ unsigned long size = roundup((addr & ~PAGE_MASK) + len, PAGE_SIZE);
10304+
10305+ return size >> PAGE_SHIFT;
10306+}
10307+EXPORT_SYMBOL(iommu_num_pages);
10308+#endif
10309+
10310 /*
10311 * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter
10312 * documentation.
10313@@ -201,9 +208,7 @@ static __init int iommu_setup(char *p)
10314 swiotlb = 1;
10315 #endif
10316
10317-#ifdef CONFIG_GART_IOMMU
10318 gart_parse_options(p);
10319-#endif
10320
10321 #ifdef CONFIG_CALGARY_IOMMU
10322 if (!strncmp(p, "calgary", 7))
10323@@ -245,136 +250,19 @@ int range_straddles_page_boundary(paddr_
10324 !check_pages_physically_contiguous(pfn, offset, size));
10325 }
10326
10327-#ifdef CONFIG_X86_32
10328-int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
10329- dma_addr_t device_addr, size_t size, int flags)
10330-{
10331- void __iomem *mem_base = NULL;
10332- int pages = size >> PAGE_SHIFT;
10333- int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long);
10334-
10335- if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0)
10336- goto out;
10337- if (!size)
10338- goto out;
10339- if (dev->dma_mem)
10340- goto out;
10341-
10342- /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */
10343-
10344- mem_base = ioremap(bus_addr, size);
10345- if (!mem_base)
10346- goto out;
10347-
10348- dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
10349- if (!dev->dma_mem)
10350- goto out;
10351- dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
10352- if (!dev->dma_mem->bitmap)
10353- goto free1_out;
10354-
10355- dev->dma_mem->virt_base = mem_base;
10356- dev->dma_mem->device_base = device_addr;
10357- dev->dma_mem->size = pages;
10358- dev->dma_mem->flags = flags;
10359-
10360- if (flags & DMA_MEMORY_MAP)
10361- return DMA_MEMORY_MAP;
10362-
10363- return DMA_MEMORY_IO;
10364-
10365- free1_out:
10366- kfree(dev->dma_mem);
10367- out:
10368- if (mem_base)
10369- iounmap(mem_base);
10370- return 0;
10371-}
10372-EXPORT_SYMBOL(dma_declare_coherent_memory);
10373-
10374-void dma_release_declared_memory(struct device *dev)
10375-{
10376- struct dma_coherent_mem *mem = dev->dma_mem;
10377-
10378- if (!mem)
10379- return;
10380- dev->dma_mem = NULL;
10381- iounmap(mem->virt_base);
10382- kfree(mem->bitmap);
10383- kfree(mem);
10384-}
10385-EXPORT_SYMBOL(dma_release_declared_memory);
10386-
10387-void *dma_mark_declared_memory_occupied(struct device *dev,
10388- dma_addr_t device_addr, size_t size)
10389-{
10390- struct dma_coherent_mem *mem = dev->dma_mem;
10391- int pos, err;
10392- int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1);
10393-
10394- pages >>= PAGE_SHIFT;
10395-
10396- if (!mem)
10397- return ERR_PTR(-EINVAL);
10398-
10399- pos = (device_addr - mem->device_base) >> PAGE_SHIFT;
10400- err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages));
10401- if (err != 0)
10402- return ERR_PTR(err);
10403- return mem->virt_base + (pos << PAGE_SHIFT);
10404-}
10405-EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
10406-
10407-static int dma_alloc_from_coherent_mem(struct device *dev, ssize_t size,
10408- dma_addr_t *dma_handle, void **ret)
10409-{
10410- struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
10411- int order = get_order(size);
10412-
10413- if (mem) {
10414- int page = bitmap_find_free_region(mem->bitmap, mem->size,
10415- order);
10416- if (page >= 0) {
10417- *dma_handle = mem->device_base + (page << PAGE_SHIFT);
10418- *ret = mem->virt_base + (page << PAGE_SHIFT);
10419- memset(*ret, 0, size);
10420- }
10421- if (mem->flags & DMA_MEMORY_EXCLUSIVE)
10422- *ret = NULL;
10423- }
10424- return (mem != NULL);
10425-}
10426-
10427-static int dma_release_coherent(struct device *dev, int order, void *vaddr)
10428-{
10429- struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
10430-
10431- if (mem && vaddr >= mem->virt_base && vaddr <
10432- (mem->virt_base + (mem->size << PAGE_SHIFT))) {
10433- int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
10434-
10435- bitmap_release_region(mem->bitmap, page, order);
10436- return 1;
10437- }
10438- return 0;
10439-}
10440-#else
10441-#define dma_alloc_from_coherent_mem(dev, size, handle, ret) (0)
10442-#define dma_release_coherent(dev, order, vaddr) (0)
10443-#endif /* CONFIG_X86_32 */
10444-
10445 int dma_supported(struct device *dev, u64 mask)
10446 {
10447+ struct dma_mapping_ops *ops = get_dma_ops(dev);
10448+
10449 #ifdef CONFIG_PCI
10450 if (mask > 0xffffffff && forbid_dac > 0) {
10451- printk(KERN_INFO "PCI: Disallowing DAC for device %s\n",
10452- dev->bus_id);
10453+ dev_info(dev, "PCI: Disallowing DAC for device\n");
10454 return 0;
10455 }
10456 #endif
10457
10458- if (dma_ops->dma_supported)
10459- return dma_ops->dma_supported(dev, mask);
10460+ if (ops->dma_supported)
10461+ return ops->dma_supported(dev, mask);
10462
10463 /* Copied from i386. Doesn't make much sense, because it will
10464 only work for pci_alloc_coherent.
10465@@ -395,8 +283,7 @@ int dma_supported(struct device *dev, u6
10466 type. Normally this doesn't make any difference, but gives
10467 more gentle handling of IOMMU overflow. */
10468 if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) {
10469- printk(KERN_INFO "%s: Force SAC with mask %Lx\n",
10470- dev->bus_id, mask);
10471+ dev_info(dev, "Force SAC with mask %Lx\n", mask);
10472 return 0;
10473 }
10474
10475@@ -422,6 +309,9 @@ void *
10476 dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
10477 gfp_t gfp)
10478 {
10479+#ifndef CONFIG_XEN
10480+ struct dma_mapping_ops *ops = get_dma_ops(dev);
10481+#endif
10482 void *memory = NULL;
10483 struct page *page;
10484 unsigned long dma_mask = 0;
10485@@ -431,7 +321,7 @@ dma_alloc_coherent(struct device *dev, s
10486 /* ignore region specifiers */
10487 gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
10488
10489- if (dma_alloc_from_coherent_mem(dev, size, dma_handle, &memory))
10490+ if (dma_alloc_from_coherent(dev, size, dma_handle, &memory))
10491 return memory;
10492
10493 if (!dev) {
10494@@ -491,8 +381,8 @@ dma_alloc_coherent(struct device *dev, s
10495 /* Let low level make its own zone decisions */
10496 gfp &= ~(GFP_DMA32|GFP_DMA);
10497
10498- if (dma_ops->alloc_coherent)
10499- return dma_ops->alloc_coherent(dev, size,
10500+ if (ops->alloc_coherent)
10501+ return ops->alloc_coherent(dev, size,
10502 dma_handle, gfp);
10503 return NULL;
10504 }
10505@@ -504,14 +394,14 @@ dma_alloc_coherent(struct device *dev, s
10506 }
10507 }
10508
10509- if (dma_ops->alloc_coherent) {
10510+ if (ops->alloc_coherent) {
10511 free_pages((unsigned long)memory, order);
10512 gfp &= ~(GFP_DMA|GFP_DMA32);
10513- return dma_ops->alloc_coherent(dev, size, dma_handle, gfp);
10514+ return ops->alloc_coherent(dev, size, dma_handle, gfp);
10515 }
10516
10517- if (dma_ops->map_simple) {
10518- *dma_handle = dma_ops->map_simple(dev, virt_to_bus(memory),
10519+ if (ops->map_simple) {
10520+ *dma_handle = ops->map_simple(dev, virt_to_bus(memory),
10521 size,
10522 PCI_DMA_BIDIRECTIONAL);
10523 if (*dma_handle != bad_dma_address)
10524@@ -542,13 +432,17 @@ EXPORT_SYMBOL(dma_alloc_coherent);
10525 void dma_free_coherent(struct device *dev, size_t size,
10526 void *vaddr, dma_addr_t bus)
10527 {
10528+#ifndef CONFIG_XEN
10529+ struct dma_mapping_ops *ops = get_dma_ops(dev);
10530+#endif
10531+
10532 int order = get_order(size);
10533 WARN_ON(irqs_disabled()); /* for portability */
10534- if (dma_release_coherent(dev, order, vaddr))
10535+ if (dma_release_from_coherent(dev, order, vaddr))
10536 return;
10537 #ifndef CONFIG_XEN
10538- if (dma_ops->unmap_single)
10539- dma_ops->unmap_single(dev, bus, size, 0);
10540+ if (ops->unmap_single)
10541+ ops->unmap_single(dev, bus, size, 0);
10542 #endif
10543 xen_destroy_contiguous_region((unsigned long)vaddr, order);
10544 free_pages((unsigned long)vaddr, order);
10545@@ -557,15 +451,13 @@ EXPORT_SYMBOL(dma_free_coherent);
10546
10547 static int __init pci_iommu_init(void)
10548 {
10549-#ifdef CONFIG_CALGARY_IOMMU
10550 calgary_iommu_init();
10551-#endif
10552
10553 intel_iommu_init();
10554
10555-#ifdef CONFIG_GART_IOMMU
10556+ amd_iommu_init();
10557+
10558 gart_iommu_init();
10559-#endif
10560
10561 no_iommu_init();
10562 return 0;
10563--- sle11-2009-06-04.orig/arch/x86/kernel/pci-nommu-xen.c 2009-03-16 16:38:05.000000000 +0100
10564+++ sle11-2009-06-04/arch/x86/kernel/pci-nommu-xen.c 2009-06-04 10:21:39.000000000 +0200
10565@@ -79,18 +79,12 @@ gnttab_unmap_single(struct device *dev,
10566 gnttab_dma_unmap_page(dma_addr);
10567 }
10568
10569-static int nommu_mapping_error(dma_addr_t dma_addr)
10570-{
10571- return (dma_addr == bad_dma_address);
10572-}
10573-
10574-static const struct dma_mapping_ops nommu_dma_ops = {
10575+static struct dma_mapping_ops nommu_dma_ops = {
10576 .map_single = gnttab_map_single,
10577 .unmap_single = gnttab_unmap_single,
10578 .map_sg = gnttab_map_sg,
10579 .unmap_sg = gnttab_unmap_sg,
10580 .dma_supported = swiotlb_dma_supported,
10581- .mapping_error = nommu_mapping_error
10582 };
10583
10584 void __init no_iommu_init(void)
10585--- sle11-2009-06-04.orig/arch/x86/kernel/probe_roms_32.c 2009-06-04 11:08:07.000000000 +0200
10586+++ sle11-2009-06-04/arch/x86/kernel/probe_roms_32.c 2009-06-04 10:21:39.000000000 +0200
10587@@ -99,6 +99,11 @@ void __init probe_roms(void)
10588 unsigned char c;
10589 int i;
10590
10591+#ifdef CONFIG_XEN
10592+ if (!is_initial_xendomain())
10593+ return;
10594+#endif
10595+
10596 /* video rom */
10597 upper = adapter_rom_resources[0].start;
10598 for (start = video_rom_resource.start; start < upper; start += 2048) {
10599@@ -131,7 +136,7 @@ void __init probe_roms(void)
10600 upper = system_rom_resource.start;
10601
10602 /* check for extension rom (ignore length byte!) */
10603- rom = isa_bus_to_virt(extension_rom_resource.start);
10604+ rom = isa_bus_to_virt((unsigned long)extension_rom_resource.start);
10605 if (romsignature(rom)) {
10606 length = extension_rom_resource.end - extension_rom_resource.start + 1;
10607 if (romchecksum(rom, length)) {
10608--- sle11-2009-06-04.orig/arch/x86/kernel/process-xen.c 2009-03-16 16:38:05.000000000 +0100
10609+++ sle11-2009-06-04/arch/x86/kernel/process-xen.c 2009-06-04 10:21:39.000000000 +0200
10610@@ -6,6 +6,13 @@
10611 #include <linux/sched.h>
10612 #include <linux/module.h>
10613 #include <linux/pm.h>
10614+#include <linux/clockchips.h>
10615+#include <asm/system.h>
10616+
10617+unsigned long idle_halt;
10618+EXPORT_SYMBOL(idle_halt);
10619+unsigned long idle_nomwait;
10620+EXPORT_SYMBOL(idle_nomwait);
10621
10622 struct kmem_cache *task_xstate_cachep;
10623
10624@@ -45,6 +52,70 @@ void arch_task_cache_init(void)
10625 SLAB_PANIC, NULL);
10626 }
10627
10628+/*
10629+ * Idle related variables and functions
10630+ */
10631+unsigned long boot_option_idle_override = 0;
10632+EXPORT_SYMBOL(boot_option_idle_override);
10633+
10634+/*
10635+ * Powermanagement idle function, if any..
10636+ */
10637+void (*pm_idle)(void);
10638+EXPORT_SYMBOL(pm_idle);
10639+
10640+#ifdef CONFIG_X86_32
10641+/*
10642+ * This halt magic was a workaround for ancient floppy DMA
10643+ * wreckage. It should be safe to remove.
10644+ */
10645+static int hlt_counter;
10646+void disable_hlt(void)
10647+{
10648+ hlt_counter++;
10649+}
10650+EXPORT_SYMBOL(disable_hlt);
10651+
10652+void enable_hlt(void)
10653+{
10654+ hlt_counter--;
10655+}
10656+EXPORT_SYMBOL(enable_hlt);
10657+
10658+static inline int hlt_use_halt(void)
10659+{
10660+ return (!hlt_counter && boot_cpu_data.hlt_works_ok);
10661+}
10662+#else
10663+static inline int hlt_use_halt(void)
10664+{
10665+ return 1;
10666+}
10667+#endif
10668+
10669+/*
10670+ * We use this if we don't have any better
10671+ * idle routine..
10672+ */
10673+void xen_idle(void)
10674+{
10675+ current_thread_info()->status &= ~TS_POLLING;
10676+ /*
10677+ * TS_POLLING-cleared state must be visible before we
10678+ * test NEED_RESCHED:
10679+ */
10680+ smp_mb();
10681+
10682+ if (!need_resched())
10683+ safe_halt(); /* enables interrupts racelessly */
10684+ else
10685+ local_irq_enable();
10686+ current_thread_info()->status |= TS_POLLING;
10687+}
10688+#ifdef CONFIG_APM_MODULE
10689+EXPORT_SYMBOL(default_idle);
10690+#endif
10691+
10692 static void do_nothing(void *unused)
10693 {
10694 }
10695@@ -61,7 +132,7 @@ void cpu_idle_wait(void)
10696 {
10697 smp_mb();
10698 /* kick all the CPUs so that they exit out of pm_idle */
10699- smp_call_function(do_nothing, NULL, 0, 1);
10700+ smp_call_function(do_nothing, NULL, 1);
10701 }
10702 EXPORT_SYMBOL_GPL(cpu_idle_wait);
10703
10704@@ -125,60 +196,175 @@ static void poll_idle(void)
10705 *
10706 * idle=mwait overrides this decision and forces the usage of mwait.
10707 */
10708+static int __cpuinitdata force_mwait;
10709+
10710+#define MWAIT_INFO 0x05
10711+#define MWAIT_ECX_EXTENDED_INFO 0x01
10712+#define MWAIT_EDX_C1 0xf0
10713+
10714 static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
10715 {
10716+ u32 eax, ebx, ecx, edx;
10717+
10718 if (force_mwait)
10719 return 1;
10720
10721- if (c->x86_vendor == X86_VENDOR_AMD) {
10722- switch(c->x86) {
10723- case 0x10:
10724- case 0x11:
10725- return 0;
10726+ if (c->cpuid_level < MWAIT_INFO)
10727+ return 0;
10728+
10729+ cpuid(MWAIT_INFO, &eax, &ebx, &ecx, &edx);
10730+ /* Check, whether EDX has extended info about MWAIT */
10731+ if (!(ecx & MWAIT_ECX_EXTENDED_INFO))
10732+ return 1;
10733+
10734+ /*
10735+ * edx enumeratios MONITOR/MWAIT extensions. Check, whether
10736+ * C1 supports MWAIT
10737+ */
10738+ return (edx & MWAIT_EDX_C1);
10739+}
10740+
10741+/*
10742+ * Check for AMD CPUs, which have potentially C1E support
10743+ */
10744+static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
10745+{
10746+ if (c->x86_vendor != X86_VENDOR_AMD)
10747+ return 0;
10748+
10749+ if (c->x86 < 0x0F)
10750+ return 0;
10751+
10752+ /* Family 0x0f models < rev F do not have C1E */
10753+ if (c->x86 == 0x0f && c->x86_model < 0x40)
10754+ return 0;
10755+
10756+ return 1;
10757+}
10758+
10759+static cpumask_t c1e_mask = CPU_MASK_NONE;
10760+static int c1e_detected;
10761+
10762+void c1e_remove_cpu(int cpu)
10763+{
10764+ cpu_clear(cpu, c1e_mask);
10765+}
10766+
10767+/*
10768+ * C1E aware idle routine. We check for C1E active in the interrupt
10769+ * pending message MSR. If we detect C1E, then we handle it the same
10770+ * way as C3 power states (local apic timer and TSC stop)
10771+ */
10772+static void c1e_idle(void)
10773+{
10774+ if (need_resched())
10775+ return;
10776+
10777+ if (!c1e_detected) {
10778+ u32 lo, hi;
10779+
10780+ rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
10781+ if (lo & K8_INTP_C1E_ACTIVE_MASK) {
10782+ c1e_detected = 1;
10783+ if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
10784+ mark_tsc_unstable("TSC halt in AMD C1E");
10785+ printk(KERN_INFO "System has AMD C1E enabled\n");
10786+ set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E);
10787 }
10788 }
10789- return 1;
10790+
10791+ if (c1e_detected) {
10792+ int cpu = smp_processor_id();
10793+
10794+ if (!cpu_isset(cpu, c1e_mask)) {
10795+ cpu_set(cpu, c1e_mask);
10796+ /*
10797+ * Force broadcast so ACPI can not interfere. Needs
10798+ * to run with interrupts enabled as it uses
10799+ * smp_function_call.
10800+ */
10801+ local_irq_enable();
10802+ clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE,
10803+ &cpu);
10804+ printk(KERN_INFO "Switch to broadcast mode on CPU%d\n",
10805+ cpu);
10806+ local_irq_disable();
10807+ }
10808+ clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
10809+
10810+ default_idle();
10811+
10812+ /*
10813+ * The switch back from broadcast mode needs to be
10814+ * called with interrupts disabled.
10815+ */
10816+ local_irq_disable();
10817+ clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
10818+ local_irq_enable();
10819+ } else
10820+ default_idle();
10821 }
10822 #endif
10823
10824 void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
10825 {
10826 #ifndef CONFIG_XEN
10827- static int selected;
10828-
10829- if (selected)
10830- return;
10831 #ifdef CONFIG_X86_SMP
10832 if (pm_idle == poll_idle && smp_num_siblings > 1) {
10833 printk(KERN_WARNING "WARNING: polling idle and HT enabled,"
10834 " performance may degrade.\n");
10835 }
10836 #endif
10837+ if (pm_idle)
10838+ return;
10839+
10840 if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) {
10841 /*
10842- * Skip, if setup has overridden idle.
10843 * One CPU supports mwait => All CPUs supports mwait
10844 */
10845- if (!pm_idle) {
10846- printk(KERN_INFO "using mwait in idle threads.\n");
10847- pm_idle = mwait_idle;
10848- }
10849- }
10850- selected = 1;
10851+ printk(KERN_INFO "using mwait in idle threads.\n");
10852+ pm_idle = mwait_idle;
10853+ } else if (check_c1e_idle(c)) {
10854+ printk(KERN_INFO "using C1E aware idle routine\n");
10855+ pm_idle = c1e_idle;
10856+ } else
10857+ pm_idle = default_idle;
10858 #endif
10859 }
10860
10861 static int __init idle_setup(char *str)
10862 {
10863+ if (!str)
10864+ return -EINVAL;
10865+
10866 if (!strcmp(str, "poll")) {
10867 printk("using polling idle threads.\n");
10868 pm_idle = poll_idle;
10869- }
10870 #ifndef CONFIG_XEN
10871- else if (!strcmp(str, "mwait"))
10872+ } else if (!strcmp(str, "mwait"))
10873 force_mwait = 1;
10874+ else if (!strcmp(str, "halt")) {
10875+ /*
10876+ * When the boot option of idle=halt is added, halt is
10877+ * forced to be used for CPU idle. In such case CPU C2/C3
10878+ * won't be used again.
10879+ * To continue to load the CPU idle driver, don't touch
10880+ * the boot_option_idle_override.
10881+ */
10882+ pm_idle = default_idle;
10883+ idle_halt = 1;
10884+ return 0;
10885+ } else if (!strcmp(str, "nomwait")) {
10886+ /*
10887+ * If the boot option of "idle=nomwait" is added,
10888+ * it means that mwait will be disabled for CPU C2/C3
10889+ * states. In such case it won't touch the variable
10890+ * of boot_option_idle_override.
10891+ */
10892+ idle_nomwait = 1;
10893+ return 0;
10894 #endif
10895- else
10896+ } else
10897 return -1;
10898
10899 boot_option_idle_override = 1;
10900--- sle11-2009-06-04.orig/arch/x86/kernel/process_32-xen.c 2009-03-16 16:38:05.000000000 +0100
10901+++ sle11-2009-06-04/arch/x86/kernel/process_32-xen.c 2009-06-04 10:21:39.000000000 +0200
10902@@ -59,15 +59,11 @@
10903 #include <asm/tlbflush.h>
10904 #include <asm/cpu.h>
10905 #include <asm/kdebug.h>
10906+#include <asm/idle.h>
10907
10908 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
10909 asmlinkage void cstar_ret_from_fork(void) __asm__("cstar_ret_from_fork");
10910
10911-static int hlt_counter;
10912-
10913-unsigned long boot_option_idle_override = 0;
10914-EXPORT_SYMBOL(boot_option_idle_override);
10915-
10916 DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
10917 EXPORT_PER_CPU_SYMBOL(current_task);
10918
10919@@ -82,46 +78,27 @@ unsigned long thread_saved_pc(struct tas
10920 return ((unsigned long *)tsk->thread.sp)[3];
10921 }
10922
10923-/*
10924- * Powermanagement idle function, if any..
10925- */
10926-void (*pm_idle)(void);
10927-EXPORT_SYMBOL(pm_idle);
10928+#ifdef CONFIG_HOTPLUG_CPU
10929+#ifndef CONFIG_XEN
10930+#include <asm/nmi.h>
10931
10932-void disable_hlt(void)
10933+static void cpu_exit_clear(void)
10934 {
10935- hlt_counter++;
10936-}
10937+ int cpu = raw_smp_processor_id();
10938
10939-EXPORT_SYMBOL(disable_hlt);
10940-
10941-void enable_hlt(void)
10942-{
10943- hlt_counter--;
10944-}
10945+ idle_task_exit();
10946
10947-EXPORT_SYMBOL(enable_hlt);
10948+ cpu_uninit();
10949+ irq_ctx_exit(cpu);
10950
10951-static void xen_idle(void)
10952-{
10953- current_thread_info()->status &= ~TS_POLLING;
10954- /*
10955- * TS_POLLING-cleared state must be visible before we
10956- * test NEED_RESCHED:
10957- */
10958- smp_mb();
10959+ cpu_clear(cpu, cpu_callout_map);
10960+ cpu_clear(cpu, cpu_callin_map);
10961
10962- if (!need_resched())
10963- safe_halt(); /* enables interrupts racelessly */
10964- else
10965- local_irq_enable();
10966- current_thread_info()->status |= TS_POLLING;
10967+ numa_remove_cpu(cpu);
10968+ c1e_remove_cpu(cpu);
10969 }
10970-#ifdef CONFIG_APM_MODULE
10971-EXPORT_SYMBOL(default_idle);
10972 #endif
10973
10974-#ifdef CONFIG_HOTPLUG_CPU
10975 static inline void play_dead(void)
10976 {
10977 idle_task_exit();
10978@@ -152,13 +129,11 @@ void cpu_idle(void)
10979
10980 /* endless idle loop with no priority at all */
10981 while (1) {
10982- tick_nohz_stop_sched_tick();
10983+ tick_nohz_stop_sched_tick(1);
10984 while (!need_resched()) {
10985- void (*idle)(void);
10986
10987 check_pgt_cache();
10988 rmb();
10989- idle = xen_idle; /* no alternatives */
10990
10991 if (rcu_pending(cpu))
10992 rcu_check_callbacks(cpu, 0);
10993@@ -168,7 +143,10 @@ void cpu_idle(void)
10994
10995 local_irq_disable();
10996 __get_cpu_var(irq_stat).idle_timestamp = jiffies;
10997- idle();
10998+ /* Don't trace irqs off for idle */
10999+ stop_critical_timings();
11000+ xen_idle();
11001+ start_critical_timings();
11002 }
11003 tick_nohz_restart_sched_tick();
11004 preempt_enable_no_resched();
11005--- sle11-2009-06-04.orig/arch/x86/kernel/process_64-xen.c 2009-03-16 16:38:05.000000000 +0100
11006+++ sle11-2009-06-04/arch/x86/kernel/process_64-xen.c 2009-06-04 10:21:39.000000000 +0200
11007@@ -65,15 +65,6 @@ asmlinkage extern void ret_from_fork(voi
11008
11009 unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
11010
11011-unsigned long boot_option_idle_override = 0;
11012-EXPORT_SYMBOL(boot_option_idle_override);
11013-
11014-/*
11015- * Powermanagement idle function, if any..
11016- */
11017-void (*pm_idle)(void);
11018-EXPORT_SYMBOL(pm_idle);
11019-
11020 static ATOMIC_NOTIFIER_HEAD(idle_notifier);
11021
11022 void idle_notifier_register(struct notifier_block *n)
11023@@ -103,25 +94,13 @@ void exit_idle(void)
11024 __exit_idle();
11025 }
11026
11027-static void xen_idle(void)
11028-{
11029- current_thread_info()->status &= ~TS_POLLING;
11030- /*
11031- * TS_POLLING-cleared state must be visible before we
11032- * test NEED_RESCHED:
11033- */
11034- smp_mb();
11035- if (!need_resched())
11036- safe_halt(); /* enables interrupts racelessly */
11037- else
11038- local_irq_enable();
11039- current_thread_info()->status |= TS_POLLING;
11040-}
11041-
11042 #ifdef CONFIG_HOTPLUG_CPU
11043 static inline void play_dead(void)
11044 {
11045 idle_task_exit();
11046+#ifndef CONFIG_XEN
11047+ c1e_remove_cpu(raw_smp_processor_id());
11048+#endif
11049 local_irq_disable();
11050 cpu_clear(smp_processor_id(), cpu_initialized);
11051 preempt_enable_no_resched();
11052@@ -146,12 +125,11 @@ void cpu_idle(void)
11053 current_thread_info()->status |= TS_POLLING;
11054 /* endless idle loop with no priority at all */
11055 while (1) {
11056- tick_nohz_stop_sched_tick();
11057+ tick_nohz_stop_sched_tick(1);
11058 while (!need_resched()) {
11059- void (*idle)(void);
11060
11061 rmb();
11062- idle = xen_idle; /* no alternatives */
11063+
11064 if (cpu_is_offline(smp_processor_id()))
11065 play_dead();
11066 /*
11067@@ -161,7 +139,10 @@ void cpu_idle(void)
11068 */
11069 local_irq_disable();
11070 enter_idle();
11071- idle();
11072+ /* Don't trace irqs off for idle */
11073+ stop_critical_timings();
11074+ xen_idle();
11075+ start_critical_timings();
11076 /* In many cases the interrupt that ended idle
11077 has already called exit_idle. But some idle
11078 loops can be woken up without interrupt. */
11079@@ -271,7 +252,7 @@ void exit_thread(void)
11080 }
11081 }
11082
11083-void load_gs_index(unsigned gs)
11084+void xen_load_gs_index(unsigned gs)
11085 {
11086 WARN_ON(HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, gs));
11087 }
11088@@ -373,10 +354,10 @@ int copy_thread(int nr, unsigned long cl
11089 p->thread.fs = me->thread.fs;
11090 p->thread.gs = me->thread.gs;
11091
11092- asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
11093- asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
11094- asm("mov %%es,%0" : "=m" (p->thread.es));
11095- asm("mov %%ds,%0" : "=m" (p->thread.ds));
11096+ savesegment(gs, p->thread.gsindex);
11097+ savesegment(fs, p->thread.fsindex);
11098+ savesegment(es, p->thread.es);
11099+ savesegment(ds, p->thread.ds);
11100
11101 if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
11102 p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
11103@@ -417,7 +398,9 @@ out:
11104 void
11105 start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
11106 {
11107- asm volatile("movl %0, %%fs; movl %0, %%es; movl %0, %%ds" :: "r"(0));
11108+ loadsegment(fs, 0);
11109+ loadsegment(es, 0);
11110+ loadsegment(ds, 0);
11111 load_gs_index(0);
11112 regs->ip = new_ip;
11113 regs->sp = new_sp;
11114@@ -557,8 +540,8 @@ static inline void __switch_to_xtra(stru
11115 struct task_struct *
11116 __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
11117 {
11118- struct thread_struct *prev = &prev_p->thread,
11119- *next = &next_p->thread;
11120+ struct thread_struct *prev = &prev_p->thread;
11121+ struct thread_struct *next = &next_p->thread;
11122 int cpu = smp_processor_id();
11123 #ifndef CONFIG_X86_NO_TSS
11124 struct tss_struct *tss = &per_cpu(init_tss, cpu);
11125@@ -661,12 +644,25 @@ __switch_to(struct task_struct *prev_p,
11126 */
11127 if (unlikely(next->es))
11128 loadsegment(es, next->es);
11129-
11130+
11131 if (unlikely(next->ds))
11132 loadsegment(ds, next->ds);
11133
11134+ /*
11135+ * Leave lazy mode, flushing any hypercalls made here.
11136+ * This must be done before restoring TLS segments so
11137+ * the GDT and LDT are properly updated, and must be
11138+ * done before math_state_restore, so the TS bit is up
11139+ * to date.
11140+ */
11141+ arch_leave_lazy_cpu_mode();
11142+
11143 /*
11144 * Switch FS and GS.
11145+ *
11146+ * Segment register != 0 always requires a reload. Also
11147+ * reload when it has changed. When prev process used 64bit
11148+ * base always reload to avoid an information leak.
11149 */
11150 if (unlikely(next->fsindex))
11151 loadsegment(fs, next->fsindex);
11152@@ -687,7 +683,8 @@ __switch_to(struct task_struct *prev_p,
11153 write_pda(oldrsp, next->usersp);
11154 write_pda(pcurrent, next_p);
11155 write_pda(kernelstack,
11156- (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
11157+ (unsigned long)task_stack_page(next_p) +
11158+ THREAD_SIZE - PDA_STACKOFFSET);
11159 #ifdef CONFIG_CC_STACKPROTECTOR
11160 write_pda(stack_canary, next_p->stack_canary);
11161
11162@@ -848,7 +845,7 @@ long do_arch_prctl(struct task_struct *t
11163 set_32bit_tls(task, FS_TLS, addr);
11164 if (doit) {
11165 load_TLS(&task->thread, cpu);
11166- asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
11167+ loadsegment(fs, FS_TLS_SEL);
11168 }
11169 task->thread.fsindex = FS_TLS_SEL;
11170 task->thread.fs = 0;
11171@@ -858,7 +855,7 @@ long do_arch_prctl(struct task_struct *t
11172 if (doit) {
11173 /* set the selector to 0 to not confuse
11174 __switch_to */
11175- asm volatile("movl %0,%%fs" :: "r" (0));
11176+ loadsegment(fs, 0);
11177 ret = HYPERVISOR_set_segment_base(SEGBASE_FS,
11178 addr);
11179 }
11180@@ -882,7 +879,7 @@ long do_arch_prctl(struct task_struct *t
11181 if (task->thread.gsindex == GS_TLS_SEL)
11182 base = read_32bit_tls(task, GS_TLS);
11183 else if (doit) {
11184- asm("movl %%gs,%0" : "=r" (gsindex));
11185+ savesegment(gs, gsindex);
11186 if (gsindex)
11187 rdmsrl(MSR_KERNEL_GS_BASE, base);
11188 else
11189--- sle11-2009-06-04.orig/arch/x86/kernel/quirks-xen.c 2009-03-16 16:33:40.000000000 +0100
11190+++ sle11-2009-06-04/arch/x86/kernel/quirks-xen.c 2009-06-04 10:21:39.000000000 +0200
11191@@ -63,6 +63,7 @@ static enum {
11192 ICH_FORCE_HPET_RESUME,
11193 VT8237_FORCE_HPET_RESUME,
11194 NVIDIA_FORCE_HPET_RESUME,
11195+ ATI_FORCE_HPET_RESUME,
11196 } force_hpet_resume_type;
11197
11198 static void __iomem *rcba_base;
11199@@ -156,6 +157,8 @@ static void ich_force_enable_hpet(struct
11200
11201 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ESB2_0,
11202 ich_force_enable_hpet);
11203+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH6_0,
11204+ ich_force_enable_hpet);
11205 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH6_1,
11206 ich_force_enable_hpet);
11207 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_0,
11208@@ -172,6 +175,12 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_I
11209
11210 static struct pci_dev *cached_dev;
11211
11212+static void hpet_print_force_info(void)
11213+{
11214+ printk(KERN_INFO "HPET not enabled in BIOS. "
11215+ "You might try hpet=force boot option\n");
11216+}
11217+
11218 static void old_ich_force_hpet_resume(void)
11219 {
11220 u32 val;
11221@@ -251,8 +260,12 @@ static void old_ich_force_enable_hpet_us
11222 {
11223 if (hpet_force_user)
11224 old_ich_force_enable_hpet(dev);
11225+ else
11226+ hpet_print_force_info();
11227 }
11228
11229+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ESB_1,
11230+ old_ich_force_enable_hpet_user);
11231 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_0,
11232 old_ich_force_enable_hpet_user);
11233 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_12,
11234@@ -288,9 +301,14 @@ static void vt8237_force_enable_hpet(str
11235 {
11236 u32 uninitialized_var(val);
11237
11238- if (!hpet_force_user || hpet_address || force_hpet_address)
11239+ if (hpet_address || force_hpet_address)
11240 return;
11241
11242+ if (!hpet_force_user) {
11243+ hpet_print_force_info();
11244+ return;
11245+ }
11246+
11247 pci_read_config_dword(dev, 0x68, &val);
11248 /*
11249 * Bit 7 is HPET enable bit.
11250@@ -328,6 +346,36 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_V
11251 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8237,
11252 vt8237_force_enable_hpet);
11253
11254+static void ati_force_hpet_resume(void)
11255+{
11256+ pci_write_config_dword(cached_dev, 0x14, 0xfed00000);
11257+ printk(KERN_DEBUG "Force enabled HPET at resume\n");
11258+}
11259+
11260+static void ati_force_enable_hpet(struct pci_dev *dev)
11261+{
11262+ u32 uninitialized_var(val);
11263+
11264+ if (hpet_address || force_hpet_address)
11265+ return;
11266+
11267+ if (!hpet_force_user) {
11268+ hpet_print_force_info();
11269+ return;
11270+ }
11271+
11272+ pci_write_config_dword(dev, 0x14, 0xfed00000);
11273+ pci_read_config_dword(dev, 0x14, &val);
11274+ force_hpet_address = val;
11275+ force_hpet_resume_type = ATI_FORCE_HPET_RESUME;
11276+ dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n",
11277+ force_hpet_address);
11278+ cached_dev = dev;
11279+ return;
11280+}
11281+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS,
11282+ ati_force_enable_hpet);
11283+
11284 /*
11285 * Undocumented chipset feature taken from LinuxBIOS.
11286 */
11287@@ -341,8 +389,13 @@ static void nvidia_force_enable_hpet(str
11288 {
11289 u32 uninitialized_var(val);
11290
11291- if (!hpet_force_user || hpet_address || force_hpet_address)
11292+ if (hpet_address || force_hpet_address)
11293+ return;
11294+
11295+ if (!hpet_force_user) {
11296+ hpet_print_force_info();
11297 return;
11298+ }
11299
11300 pci_write_config_dword(dev, 0x44, 0xfed00001);
11301 pci_read_config_dword(dev, 0x44, &val);
11302@@ -395,6 +448,9 @@ void force_hpet_resume(void)
11303 case NVIDIA_FORCE_HPET_RESUME:
11304 nvidia_force_hpet_resume();
11305 return;
11306+ case ATI_FORCE_HPET_RESUME:
11307+ ati_force_hpet_resume();
11308+ return;
11309 default:
11310 break;
11311 }
11312--- sle11-2009-06-04.orig/arch/x86/kernel/setup-xen.c 2009-03-16 16:38:05.000000000 +0100
11313+++ sle11-2009-06-04/arch/x86/kernel/setup-xen.c 2009-06-04 10:21:39.000000000 +0200
11314@@ -1,141 +1,1131 @@
11315-#include <linux/kernel.h>
11316+/*
11317+ * Copyright (C) 1995 Linus Torvalds
11318+ *
11319+ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
11320+ *
11321+ * Memory region support
11322+ * David Parsons <orc@pell.chi.il.us>, July-August 1999
11323+ *
11324+ * Added E820 sanitization routine (removes overlapping memory regions);
11325+ * Brian Moyle <bmoyle@mvista.com>, February 2001
11326+ *
11327+ * Moved CPU detection code to cpu/${cpu}.c
11328+ * Patrick Mochel <mochel@osdl.org>, March 2002
11329+ *
11330+ * Provisions for empty E820 memory regions (reported by certain BIOSes).
11331+ * Alex Achenbach <xela@slit.de>, December 2002.
11332+ *
11333+ */
11334+
11335+/*
11336+ * This file handles the architecture-dependent parts of initialization
11337+ */
11338+
11339+#include <linux/sched.h>
11340+#include <linux/mm.h>
11341+#include <linux/mmzone.h>
11342+#include <linux/screen_info.h>
11343+#include <linux/ioport.h>
11344+#include <linux/acpi.h>
11345+#include <linux/apm_bios.h>
11346+#include <linux/initrd.h>
11347+#include <linux/bootmem.h>
11348+#include <linux/seq_file.h>
11349+#include <linux/console.h>
11350+#include <linux/mca.h>
11351+#include <linux/root_dev.h>
11352+#include <linux/highmem.h>
11353 #include <linux/module.h>
11354+#include <linux/efi.h>
11355 #include <linux/init.h>
11356-#include <linux/bootmem.h>
11357+#include <linux/edd.h>
11358+#include <linux/iscsi_ibft.h>
11359+#include <linux/nodemask.h>
11360+#include <linux/kexec.h>
11361+#include <linux/dmi.h>
11362+#include <linux/pfn.h>
11363+#include <linux/pci.h>
11364+#include <asm/pci-direct.h>
11365+#include <linux/init_ohci1394_dma.h>
11366+#include <linux/kvm_para.h>
11367+
11368+#include <linux/errno.h>
11369+#include <linux/kernel.h>
11370+#include <linux/stddef.h>
11371+#include <linux/unistd.h>
11372+#include <linux/ptrace.h>
11373+#include <linux/slab.h>
11374+#include <linux/user.h>
11375+#include <linux/delay.h>
11376+
11377+#include <linux/kallsyms.h>
11378+#include <linux/cpufreq.h>
11379+#include <linux/dma-mapping.h>
11380+#include <linux/ctype.h>
11381+#include <linux/uaccess.h>
11382+
11383 #include <linux/percpu.h>
11384-#include <asm/smp.h>
11385-#include <asm/percpu.h>
11386+#include <linux/crash_dump.h>
11387+
11388+#include <video/edid.h>
11389+
11390+#include <asm/mtrr.h>
11391+#include <asm/apic.h>
11392+#include <asm/e820.h>
11393+#include <asm/mpspec.h>
11394+#include <asm/setup.h>
11395+#include <asm/arch_hooks.h>
11396+#include <asm/efi.h>
11397 #include <asm/sections.h>
11398+#include <asm/dmi.h>
11399+#include <asm/io_apic.h>
11400+#include <asm/ist.h>
11401+#include <asm/vmi.h>
11402+#include <setup_arch.h>
11403+#include <asm/bios_ebda.h>
11404+#include <asm/cacheflush.h>
11405 #include <asm/processor.h>
11406-#include <asm/setup.h>
11407+#include <asm/bugs.h>
11408+
11409+#include <asm/system.h>
11410+#include <asm/vsyscall.h>
11411+#include <asm/smp.h>
11412+#include <asm/desc.h>
11413+#include <asm/dma.h>
11414+#include <asm/iommu.h>
11415+#include <asm/mmu_context.h>
11416+#include <asm/proto.h>
11417+
11418+#include <mach_apic.h>
11419+#include <asm/paravirt.h>
11420+
11421+#include <asm/percpu.h>
11422 #include <asm/topology.h>
11423-#include <asm/mpspec.h>
11424 #include <asm/apicdef.h>
11425+#ifdef CONFIG_X86_64
11426+#include <asm/numa_64.h>
11427+#endif
11428+
11429+#ifdef CONFIG_XEN
11430+#include <asm/hypervisor.h>
11431+#include <xen/interface/kexec.h>
11432+#include <xen/interface/memory.h>
11433+#include <xen/interface/nmi.h>
11434+#include <xen/interface/physdev.h>
11435+#include <xen/features.h>
11436+#include <xen/firmware.h>
11437+#include <xen/xencons.h>
11438+
11439+shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
11440+EXPORT_SYMBOL(HYPERVISOR_shared_info);
11441
11442-#ifdef CONFIG_X86_LOCAL_APIC
11443-unsigned int num_processors;
11444-unsigned disabled_cpus __cpuinitdata;
11445-/* Processor that is doing the boot up */
11446-unsigned int boot_cpu_physical_apicid = -1U;
11447-EXPORT_SYMBOL(boot_cpu_physical_apicid);
11448+static int xen_panic_event(struct notifier_block *, unsigned long, void *);
11449+static struct notifier_block xen_panic_block = {
11450+ xen_panic_event, NULL, 0 /* try to go last */
11451+};
11452
11453-DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID;
11454-EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid);
11455+unsigned long *phys_to_machine_mapping;
11456+EXPORT_SYMBOL(phys_to_machine_mapping);
11457
11458-/* Bitmask of physically existing CPUs */
11459-physid_mask_t phys_cpu_present_map;
11460+unsigned long *pfn_to_mfn_frame_list_list,
11461+#ifdef CONFIG_X86_64
11462+ *pfn_to_mfn_frame_list[512];
11463+#else
11464+ *pfn_to_mfn_frame_list[128];
11465+#endif
11466+
11467+/* Raw start-of-day parameters from the hypervisor. */
11468+start_info_t *xen_start_info;
11469+EXPORT_SYMBOL(xen_start_info);
11470+#endif
11471+
11472+#ifndef ARCH_SETUP
11473+#define ARCH_SETUP
11474+#endif
11475+
11476+#ifndef CONFIG_XEN
11477+#ifndef CONFIG_DEBUG_BOOT_PARAMS
11478+struct boot_params __initdata boot_params;
11479+#else
11480+struct boot_params boot_params;
11481+#endif
11482 #endif
11483
11484-#if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_X86_SMP)
11485 /*
11486- * Copy data used in early init routines from the initial arrays to the
11487- * per cpu data areas. These arrays then become expendable and the
11488- * *_early_ptr's are zeroed indicating that the static arrays are gone.
11489+ * Machine setup..
11490 */
11491-static void __init setup_per_cpu_maps(void)
11492+static struct resource data_resource = {
11493+ .name = "Kernel data",
11494+ .start = 0,
11495+ .end = 0,
11496+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
11497+};
11498+
11499+static struct resource code_resource = {
11500+ .name = "Kernel code",
11501+ .start = 0,
11502+ .end = 0,
11503+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
11504+};
11505+
11506+static struct resource bss_resource = {
11507+ .name = "Kernel bss",
11508+ .start = 0,
11509+ .end = 0,
11510+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
11511+};
11512+
11513+
11514+#ifdef CONFIG_X86_32
11515+#ifndef CONFIG_XEN
11516+/* This value is set up by the early boot code to point to the value
11517+ immediately after the boot time page tables. It contains a *physical*
11518+ address, and must not be in the .bss segment! */
11519+unsigned long init_pg_tables_start __initdata = ~0UL;
11520+unsigned long init_pg_tables_end __initdata = ~0UL;
11521+#endif
11522+
11523+static struct resource video_ram_resource = {
11524+ .name = "Video RAM area",
11525+ .start = 0xa0000,
11526+ .end = 0xbffff,
11527+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
11528+};
11529+
11530+/* cpu data as detected by the assembly code in head.S */
11531+struct cpuinfo_x86 new_cpu_data __cpuinitdata = {0, 0, 0, 0, -1, 1, 0, 0, -1};
11532+/* common cpu data for all cpus */
11533+struct cpuinfo_x86 boot_cpu_data __read_mostly = {0, 0, 0, 0, -1, 1, 0, 0, -1};
11534+EXPORT_SYMBOL(boot_cpu_data);
11535+#ifndef CONFIG_XEN
11536+static void set_mca_bus(int x)
11537+{
11538+#ifdef CONFIG_MCA
11539+ MCA_bus = x;
11540+#endif
11541+}
11542+
11543+unsigned int def_to_bigsmp;
11544+
11545+/* for MCA, but anyone else can use it if they want */
11546+unsigned int machine_id;
11547+unsigned int machine_submodel_id;
11548+unsigned int BIOS_revision;
11549+
11550+struct apm_info apm_info;
11551+EXPORT_SYMBOL(apm_info);
11552+#endif
11553+
11554+#if defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
11555+struct ist_info ist_info;
11556+EXPORT_SYMBOL(ist_info);
11557+#elif defined(CONFIG_X86_SPEEDSTEP_SMI)
11558+struct ist_info ist_info;
11559+#endif
11560+
11561+#else
11562+struct cpuinfo_x86 boot_cpu_data __read_mostly;
11563+EXPORT_SYMBOL(boot_cpu_data);
11564+#endif
11565+
11566+
11567+#if !defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64)
11568+unsigned long mmu_cr4_features;
11569+#else
11570+unsigned long mmu_cr4_features = X86_CR4_PAE;
11571+#endif
11572+
11573+/* Boot loader ID as an integer, for the benefit of proc_dointvec */
11574+int bootloader_type;
11575+
11576+/*
11577+ * Early DMI memory
11578+ */
11579+int dmi_alloc_index;
11580+char dmi_alloc_data[DMI_MAX_DATA];
11581+
11582+/*
11583+ * Setup options
11584+ */
11585+struct screen_info screen_info;
11586+EXPORT_SYMBOL(screen_info);
11587+struct edid_info edid_info;
11588+EXPORT_SYMBOL_GPL(edid_info);
11589+
11590+extern int root_mountflags;
11591+
11592+unsigned long saved_video_mode;
11593+
11594+#define RAMDISK_IMAGE_START_MASK 0x07FF
11595+#define RAMDISK_PROMPT_FLAG 0x8000
11596+#define RAMDISK_LOAD_FLAG 0x4000
11597+
11598+static char __initdata command_line[COMMAND_LINE_SIZE];
11599+
11600+#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
11601+struct edd edd;
11602+#ifdef CONFIG_EDD_MODULE
11603+EXPORT_SYMBOL(edd);
11604+#endif
11605+#ifndef CONFIG_XEN
11606+/**
11607+ * copy_edd() - Copy the BIOS EDD information
11608+ * from boot_params into a safe place.
11609+ *
11610+ */
11611+static inline void copy_edd(void)
11612+{
11613+ memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer,
11614+ sizeof(edd.mbr_signature));
11615+ memcpy(edd.edd_info, boot_params.eddbuf, sizeof(edd.edd_info));
11616+ edd.mbr_signature_nr = boot_params.edd_mbr_sig_buf_entries;
11617+ edd.edd_info_nr = boot_params.eddbuf_entries;
11618+}
11619+#endif
11620+#else
11621+static inline void copy_edd(void)
11622+{
11623+}
11624+#endif
11625+
11626+#ifdef CONFIG_BLK_DEV_INITRD
11627+
11628+#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
11629+
11630+#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
11631+static void __init relocate_initrd(void)
11632+{
11633+
11634+ u64 ramdisk_image = boot_params.hdr.ramdisk_image;
11635+ u64 ramdisk_size = boot_params.hdr.ramdisk_size;
11636+ u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT;
11637+ u64 ramdisk_here;
11638+ unsigned long slop, clen, mapaddr;
11639+ char *p, *q;
11640+
11641+ /* We need to move the initrd down into lowmem */
11642+ ramdisk_here = find_e820_area(0, end_of_lowmem, ramdisk_size,
11643+ PAGE_SIZE);
11644+
11645+ if (ramdisk_here == -1ULL)
11646+ panic("Cannot find place for new RAMDISK of size %lld\n",
11647+ ramdisk_size);
11648+
11649+ /* Note: this includes all the lowmem currently occupied by
11650+ the initrd, we rely on that fact to keep the data intact. */
11651+ reserve_early(ramdisk_here, ramdisk_here + ramdisk_size,
11652+ "NEW RAMDISK");
11653+ initrd_start = ramdisk_here + PAGE_OFFSET;
11654+ initrd_end = initrd_start + ramdisk_size;
11655+ printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n",
11656+ ramdisk_here, ramdisk_here + ramdisk_size);
11657+
11658+ q = (char *)initrd_start;
11659+
11660+ /* Copy any lowmem portion of the initrd */
11661+ if (ramdisk_image < end_of_lowmem) {
11662+ clen = end_of_lowmem - ramdisk_image;
11663+ p = (char *)__va(ramdisk_image);
11664+ memcpy(q, p, clen);
11665+ q += clen;
11666+ ramdisk_image += clen;
11667+ ramdisk_size -= clen;
11668+ }
11669+
11670+ /* Copy the highmem portion of the initrd */
11671+ while (ramdisk_size) {
11672+ slop = ramdisk_image & ~PAGE_MASK;
11673+ clen = ramdisk_size;
11674+ if (clen > MAX_MAP_CHUNK-slop)
11675+ clen = MAX_MAP_CHUNK-slop;
11676+ mapaddr = ramdisk_image & PAGE_MASK;
11677+ p = early_ioremap(mapaddr, clen+slop);
11678+ memcpy(q, p+slop, clen);
11679+ early_iounmap(p, clen+slop);
11680+ q += clen;
11681+ ramdisk_image += clen;
11682+ ramdisk_size -= clen;
11683+ }
11684+ /* high pages is not converted by early_res_to_bootmem */
11685+ ramdisk_image = boot_params.hdr.ramdisk_image;
11686+ ramdisk_size = boot_params.hdr.ramdisk_size;
11687+ printk(KERN_INFO "Move RAMDISK from %016llx - %016llx to"
11688+ " %08llx - %08llx\n",
11689+ ramdisk_image, ramdisk_image + ramdisk_size - 1,
11690+ ramdisk_here, ramdisk_here + ramdisk_size - 1);
11691+}
11692+#endif
11693+
11694+static void __init reserve_initrd(void)
11695 {
11696 #ifndef CONFIG_XEN
11697- int cpu;
11698+ u64 ramdisk_image = boot_params.hdr.ramdisk_image;
11699+ u64 ramdisk_size = boot_params.hdr.ramdisk_size;
11700+ u64 ramdisk_end = ramdisk_image + ramdisk_size;
11701+ u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT;
11702+
11703+ if (!boot_params.hdr.type_of_loader ||
11704+ !ramdisk_image || !ramdisk_size)
11705+ return; /* No initrd provided by bootloader */
11706+#else
11707+ unsigned long ramdisk_image = __pa(xen_start_info->mod_start);
11708+ unsigned long ramdisk_size = xen_start_info->mod_len;
11709+ unsigned long ramdisk_end = ramdisk_image + ramdisk_size;
11710+ unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
11711
11712- for_each_possible_cpu(cpu) {
11713- per_cpu(x86_cpu_to_apicid, cpu) = x86_cpu_to_apicid_init[cpu];
11714- per_cpu(x86_bios_cpu_apicid, cpu) =
11715- x86_bios_cpu_apicid_init[cpu];
11716-#ifdef CONFIG_NUMA
11717- per_cpu(x86_cpu_to_node_map, cpu) =
11718- x86_cpu_to_node_map_init[cpu];
11719+ if (!xen_start_info->mod_start || !ramdisk_size)
11720+ return; /* No initrd provided by bootloader */
11721 #endif
11722+
11723+ initrd_start = 0;
11724+
11725+ if (ramdisk_size >= (end_of_lowmem>>1)) {
11726+ free_early(ramdisk_image, ramdisk_end);
11727+ printk(KERN_ERR "initrd too large to handle, "
11728+ "disabling initrd\n");
11729+ return;
11730 }
11731
11732- /* indicate the early static arrays will soon be gone */
11733- x86_cpu_to_apicid_early_ptr = NULL;
11734- x86_bios_cpu_apicid_early_ptr = NULL;
11735-#ifdef CONFIG_NUMA
11736- x86_cpu_to_node_map_early_ptr = NULL;
11737+ printk(KERN_INFO "RAMDISK: %08lx - %08lx\n", ramdisk_image,
11738+ ramdisk_end);
11739+
11740+
11741+ if (ramdisk_end <= end_of_lowmem) {
11742+ /* All in lowmem, easy case */
11743+ /*
11744+ * don't need to reserve again, already reserved early
11745+ * in i386_start_kernel
11746+ */
11747+ initrd_start = ramdisk_image + PAGE_OFFSET;
11748+ initrd_end = initrd_start + ramdisk_size;
11749+#ifdef CONFIG_X86_64_XEN
11750+ initrd_below_start_ok = 1;
11751 #endif
11752+ return;
11753+ }
11754+
11755+#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
11756+ relocate_initrd();
11757+#else
11758+ printk(KERN_ERR "initrd extends beyond end of memory "
11759+ "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
11760+ ramdisk_end, end_of_lowmem);
11761+ initrd_start = 0;
11762 #endif
11763+ free_early(ramdisk_image, ramdisk_end);
11764 }
11765+#else
11766+static void __init reserve_initrd(void)
11767+{
11768+}
11769+#endif /* CONFIG_BLK_DEV_INITRD */
11770+
11771+static void __init parse_setup_data(void)
11772+{
11773+#ifndef CONFIG_XEN
11774+ struct setup_data *data;
11775+ u64 pa_data;
11776
11777-#ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP
11778-cpumask_t *cpumask_of_cpu_map __read_mostly;
11779-EXPORT_SYMBOL(cpumask_of_cpu_map);
11780+ if (boot_params.hdr.version < 0x0209)
11781+ return;
11782+ pa_data = boot_params.hdr.setup_data;
11783+ while (pa_data) {
11784+ data = early_ioremap(pa_data, PAGE_SIZE);
11785+ switch (data->type) {
11786+ case SETUP_E820_EXT:
11787+ parse_e820_ext(data, pa_data);
11788+ break;
11789+ default:
11790+ break;
11791+ }
11792+ pa_data = data->next;
11793+ early_iounmap(data, PAGE_SIZE);
11794+ }
11795+#endif
11796+}
11797
11798-/* requires nr_cpu_ids to be initialized */
11799-static void __init setup_cpumask_of_cpu(void)
11800+static void __init e820_reserve_setup_data(void)
11801 {
11802- int i;
11803+#ifndef CONFIG_XEN
11804+ struct setup_data *data;
11805+ u64 pa_data;
11806+ int found = 0;
11807+
11808+ if (boot_params.hdr.version < 0x0209)
11809+ return;
11810+ pa_data = boot_params.hdr.setup_data;
11811+ while (pa_data) {
11812+ data = early_ioremap(pa_data, sizeof(*data));
11813+ e820_update_range(pa_data, sizeof(*data)+data->len,
11814+ E820_RAM, E820_RESERVED_KERN);
11815+ found = 1;
11816+ pa_data = data->next;
11817+ early_iounmap(data, sizeof(*data));
11818+ }
11819+ if (!found)
11820+ return;
11821
11822- /* alloc_bootmem zeroes memory */
11823- cpumask_of_cpu_map = alloc_bootmem_low(sizeof(cpumask_t) * nr_cpu_ids);
11824- for (i = 0; i < nr_cpu_ids; i++)
11825- cpu_set(i, cpumask_of_cpu_map[i]);
11826+ sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
11827+ memcpy(&e820_saved, &e820, sizeof(struct e820map));
11828+ printk(KERN_INFO "extended physical RAM map:\n");
11829+ e820_print_map("reserve setup_data");
11830+#endif
11831 }
11832-#else
11833-static inline void setup_cpumask_of_cpu(void) { }
11834+
11835+static void __init reserve_early_setup_data(void)
11836+{
11837+#ifndef CONFIG_XEN
11838+ struct setup_data *data;
11839+ u64 pa_data;
11840+ char buf[32];
11841+
11842+ if (boot_params.hdr.version < 0x0209)
11843+ return;
11844+ pa_data = boot_params.hdr.setup_data;
11845+ while (pa_data) {
11846+ data = early_ioremap(pa_data, sizeof(*data));
11847+ sprintf(buf, "setup data %x", data->type);
11848+ reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf);
11849+ pa_data = data->next;
11850+ early_iounmap(data, sizeof(*data));
11851+ }
11852 #endif
11853+}
11854
11855-#ifdef CONFIG_X86_32
11856 /*
11857- * Great future not-so-futuristic plan: make i386 and x86_64 do it
11858- * the same way
11859+ * --------- Crashkernel reservation ------------------------------
11860+ */
11861+
11862+#ifdef CONFIG_KEXEC
11863+
11864+#ifndef CONFIG_XEN
11865+/**
11866+ * Reserve @size bytes of crashkernel memory at any suitable offset.
11867+ *
11868+ * @size: Size of the crashkernel memory to reserve.
11869+ * Returns the base address on success, and -1ULL on failure.
11870+ */
11871+unsigned long long __init find_and_reserve_crashkernel(unsigned long long size)
11872+{
11873+ const unsigned long long alignment = 16<<20; /* 16M */
11874+ unsigned long long start = 0LL;
11875+
11876+ while (1) {
11877+ int ret;
11878+
11879+ start = find_e820_area(start, ULONG_MAX, size, alignment);
11880+ if (start == -1ULL)
11881+ return start;
11882+
11883+ /* try to reserve it */
11884+ ret = reserve_bootmem_generic(start, size, BOOTMEM_EXCLUSIVE);
11885+ if (ret >= 0)
11886+ return start;
11887+
11888+ start += alignment;
11889+ }
11890+}
11891+
11892+static inline unsigned long long get_total_mem(void)
11893+{
11894+ unsigned long long total;
11895+
11896+ total = max_low_pfn - min_low_pfn;
11897+#ifdef CONFIG_HIGHMEM
11898+ total += highend_pfn - highstart_pfn;
11899+#endif
11900+
11901+ return total << PAGE_SHIFT;
11902+}
11903+
11904+static void __init reserve_crashkernel(void)
11905+{
11906+ unsigned long long total_mem;
11907+ unsigned long long crash_size, crash_base;
11908+ int ret;
11909+
11910+ total_mem = get_total_mem();
11911+
11912+ ret = parse_crashkernel(boot_command_line, total_mem,
11913+ &crash_size, &crash_base);
11914+ if (ret != 0 || crash_size <= 0)
11915+ return;
11916+
11917+ /* 0 means: find the address automatically */
11918+ if (crash_base <= 0) {
11919+ crash_base = find_and_reserve_crashkernel(crash_size);
11920+ if (crash_base == -1ULL) {
11921+ pr_info("crashkernel reservation failed. "
11922+ "No suitable area found.\n");
11923+ return;
11924+ }
11925+ } else {
11926+ ret = reserve_bootmem_generic(crash_base, crash_size,
11927+ BOOTMEM_EXCLUSIVE);
11928+ if (ret < 0) {
11929+ pr_info("crashkernel reservation failed - "
11930+ "memory is in use\n");
11931+ return;
11932+ }
11933+ }
11934+
11935+ printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
11936+ "for crashkernel (System RAM: %ldMB)\n",
11937+ (unsigned long)(crash_size >> 20),
11938+ (unsigned long)(crash_base >> 20),
11939+ (unsigned long)(total_mem >> 20));
11940+
11941+ crashk_res.start = crash_base;
11942+ crashk_res.end = crash_base + crash_size - 1;
11943+ insert_resource(&iomem_resource, &crashk_res);
11944+}
11945+#else
11946+#define reserve_crashkernel xen_machine_kexec_setup_resources
11947+#endif
11948+#else
11949+static void __init reserve_crashkernel(void)
11950+{
11951+}
11952+#endif
11953+
11954+static struct resource standard_io_resources[] = {
11955+ { .name = "dma1", .start = 0x00, .end = 0x1f,
11956+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11957+ { .name = "pic1", .start = 0x20, .end = 0x21,
11958+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11959+ { .name = "timer0", .start = 0x40, .end = 0x43,
11960+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11961+ { .name = "timer1", .start = 0x50, .end = 0x53,
11962+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11963+ { .name = "keyboard", .start = 0x60, .end = 0x60,
11964+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11965+ { .name = "keyboard", .start = 0x64, .end = 0x64,
11966+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11967+ { .name = "dma page reg", .start = 0x80, .end = 0x8f,
11968+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11969+ { .name = "pic2", .start = 0xa0, .end = 0xa1,
11970+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11971+ { .name = "dma2", .start = 0xc0, .end = 0xdf,
11972+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11973+ { .name = "fpu", .start = 0xf0, .end = 0xff,
11974+ .flags = IORESOURCE_BUSY | IORESOURCE_IO }
11975+};
11976+
11977+static void __init reserve_standard_io_resources(void)
11978+{
11979+ int i;
11980+
11981+ /* Nothing to do if not running in dom0. */
11982+ if (!is_initial_xendomain())
11983+ return;
11984+
11985+ /* request I/O space for devices used on all i[345]86 PCs */
11986+ for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
11987+ request_resource(&ioport_resource, &standard_io_resources[i]);
11988+
11989+}
11990+
11991+#ifdef CONFIG_PROC_VMCORE
11992+/* elfcorehdr= specifies the location of elf core header
11993+ * stored by the crashed kernel. This option will be passed
11994+ * by kexec loader to the capture kernel.
11995 */
11996-unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
11997-EXPORT_SYMBOL(__per_cpu_offset);
11998+static int __init setup_elfcorehdr(char *arg)
11999+{
12000+ char *end;
12001+ if (!arg)
12002+ return -EINVAL;
12003+ elfcorehdr_addr = memparse(arg, &end);
12004+ return end > arg ? 0 : -EINVAL;
12005+}
12006+early_param("elfcorehdr", setup_elfcorehdr);
12007 #endif
12008
12009+static struct x86_quirks default_x86_quirks __initdata;
12010+
12011+struct x86_quirks *x86_quirks __initdata = &default_x86_quirks;
12012+
12013+/*
12014+ * Determine if we were loaded by an EFI loader. If so, then we have also been
12015+ * passed the efi memmap, systab, etc., so we should use these data structures
12016+ * for initialization. Note, the efi init code path is determined by the
12017+ * global efi_enabled. This allows the same kernel image to be used on existing
12018+ * systems (with a traditional BIOS) as well as on EFI systems.
12019+ */
12020 /*
12021- * Great future plan:
12022- * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
12023- * Always point %gs to its beginning
12024+ * setup_arch - architecture-specific boot-time initializations
12025+ *
12026+ * Note: On x86_64, fixmaps are ready for use even before this is called.
12027 */
12028-void __init setup_per_cpu_areas(void)
12029+
12030+void __init setup_arch(char **cmdline_p)
12031 {
12032- int i, highest_cpu = 0;
12033- unsigned long size;
12034+#ifdef CONFIG_XEN
12035+ unsigned int i;
12036+ unsigned long p2m_pages;
12037+ struct physdev_set_iopl set_iopl;
12038
12039-#ifdef CONFIG_HOTPLUG_CPU
12040- prefill_possible_map();
12041+#ifdef CONFIG_X86_32
12042+ /* Force a quick death if the kernel panics (not domain 0). */
12043+ extern int panic_timeout;
12044+ if (!panic_timeout && !is_initial_xendomain())
12045+ panic_timeout = 1;
12046 #endif
12047
12048- /* Copy section for each CPU (we discard the original) */
12049- size = PERCPU_ENOUGH_ROOM;
12050- printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n",
12051- size);
12052-
12053- for_each_possible_cpu(i) {
12054- char *ptr;
12055-#ifndef CONFIG_NEED_MULTIPLE_NODES
12056- ptr = alloc_bootmem_pages(size);
12057-#else
12058- int node = early_cpu_to_node(i);
12059- if (!node_online(node) || !NODE_DATA(node)) {
12060- ptr = alloc_bootmem_pages(size);
12061- printk(KERN_INFO
12062- "cpu %d has no node or node-local memory\n", i);
12063- }
12064- else
12065- ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
12066+ /* Register a call for panic conditions. */
12067+ atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
12068+
12069+ WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
12070+ VMASST_TYPE_writable_pagetables));
12071+#ifdef CONFIG_X86_32
12072+ WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
12073+ VMASST_TYPE_4gb_segments));
12074+#endif
12075+#endif /* CONFIG_XEN */
12076+
12077+#ifdef CONFIG_X86_32
12078+ memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
12079+ visws_early_detect();
12080+ pre_setup_arch_hook();
12081+#else
12082+ printk(KERN_INFO "Command line: %s\n", boot_command_line);
12083+#endif
12084+
12085+ early_cpu_init();
12086+ early_ioremap_init();
12087+
12088+#ifndef CONFIG_XEN
12089+ ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
12090+ screen_info = boot_params.screen_info;
12091+ edid_info = boot_params.edid_info;
12092+#ifdef CONFIG_X86_32
12093+ apm_info.bios = boot_params.apm_bios_info;
12094+ ist_info = boot_params.ist_info;
12095+ if (boot_params.sys_desc_table.length != 0) {
12096+ set_mca_bus(boot_params.sys_desc_table.table[3] & 0x2);
12097+ machine_id = boot_params.sys_desc_table.table[0];
12098+ machine_submodel_id = boot_params.sys_desc_table.table[1];
12099+ BIOS_revision = boot_params.sys_desc_table.table[2];
12100+ }
12101+#endif
12102+ saved_video_mode = boot_params.hdr.vid_mode;
12103+ bootloader_type = boot_params.hdr.type_of_loader;
12104+
12105+#ifdef CONFIG_BLK_DEV_RAM
12106+ rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
12107+ rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
12108+ rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
12109+#endif
12110+#ifdef CONFIG_EFI
12111+ if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
12112+#ifdef CONFIG_X86_32
12113+ "EL32",
12114+#else
12115+ "EL64",
12116 #endif
12117- if (!ptr)
12118- panic("Cannot allocate cpu data for CPU %d\n", i);
12119+ 4)) {
12120+ efi_enabled = 1;
12121+ efi_reserve_early();
12122+ }
12123+#endif
12124+#else /* CONFIG_XEN */
12125+#ifdef CONFIG_X86_32
12126+ /* This must be initialized to UNNAMED_MAJOR for ipconfig to work
12127+ properly. Setting ROOT_DEV to default to /dev/ram0 breaks initrd.
12128+ */
12129+ ROOT_DEV = MKDEV(UNNAMED_MAJOR,0);
12130+#else
12131+ ROOT_DEV = MKDEV(RAMDISK_MAJOR,0);
12132+#endif
12133+ if (is_initial_xendomain()) {
12134+ const struct dom0_vga_console_info *info =
12135+ (void *)((char *)xen_start_info +
12136+ xen_start_info->console.dom0.info_off);
12137+
12138+ dom0_init_screen_info(info,
12139+ xen_start_info->console.dom0.info_size);
12140+ xen_start_info->console.domU.mfn = 0;
12141+ xen_start_info->console.domU.evtchn = 0;
12142+ } else
12143+ screen_info.orig_video_isVGA = 0;
12144+ copy_edid();
12145+#endif /* CONFIG_XEN */
12146+
12147+ ARCH_SETUP
12148+
12149+ setup_memory_map();
12150+ parse_setup_data();
12151+ /* update the e820_saved too */
12152+ e820_reserve_setup_data();
12153+
12154+ copy_edd();
12155+
12156+#ifndef CONFIG_XEN
12157+ if (!boot_params.hdr.root_flags)
12158+ root_mountflags &= ~MS_RDONLY;
12159+#endif
12160+ init_mm.start_code = (unsigned long) _text;
12161+ init_mm.end_code = (unsigned long) _etext;
12162+ init_mm.end_data = (unsigned long) _edata;
12163+#ifdef CONFIG_X86_32
12164+#ifndef CONFIG_XEN
12165+ init_mm.brk = init_pg_tables_end + PAGE_OFFSET;
12166+#else
12167+ init_mm.brk = (PFN_UP(__pa(xen_start_info->pt_base)) +
12168+ xen_start_info->nr_pt_frames) << PAGE_SHIFT;
12169+#endif
12170+#else
12171+ init_mm.brk = (unsigned long) &_end;
12172+#endif
12173+
12174+ code_resource.start = virt_to_phys(_text);
12175+ code_resource.end = virt_to_phys(_etext)-1;
12176+ data_resource.start = virt_to_phys(_etext);
12177+ data_resource.end = virt_to_phys(_edata)-1;
12178+ bss_resource.start = virt_to_phys(&__bss_start);
12179+ bss_resource.end = virt_to_phys(&__bss_stop)-1;
12180+
12181+ strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
12182+ *cmdline_p = command_line;
12183+
12184+ parse_early_param();
12185+
12186 #ifdef CONFIG_X86_64
12187- cpu_pda(i)->data_offset = ptr - __per_cpu_start;
12188+ check_efer();
12189+#endif
12190+
12191+#if defined(CONFIG_VMI) && defined(CONFIG_X86_32)
12192+ /*
12193+ * Must be before kernel pagetables are setup
12194+ * or fixmap area is touched.
12195+ */
12196+ vmi_init();
12197+#endif
12198+
12199+ /* after early param, so could get panic from serial */
12200+ reserve_early_setup_data();
12201+
12202+ if (acpi_mps_check()) {
12203+#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN)
12204+ disable_apic = 1;
12205+#endif
12206+ setup_clear_cpu_cap(X86_FEATURE_APIC);
12207+ }
12208+
12209+#ifdef CONFIG_PCI
12210+ if (pci_early_dump_regs)
12211+ early_dump_pci_devices();
12212+#endif
12213+
12214+ finish_e820_parsing();
12215+
12216+#ifdef CONFIG_X86_32
12217+ probe_roms();
12218+#endif
12219+
12220+#ifndef CONFIG_XEN
12221+ /* after parse_early_param, so could debug it */
12222+ insert_resource(&iomem_resource, &code_resource);
12223+ insert_resource(&iomem_resource, &data_resource);
12224+ insert_resource(&iomem_resource, &bss_resource);
12225+
12226+ if (efi_enabled)
12227+ efi_init();
12228+
12229+#ifdef CONFIG_X86_32
12230+ if (ppro_with_ram_bug()) {
12231+ e820_update_range(0x70000000ULL, 0x40000ULL, E820_RAM,
12232+ E820_RESERVED);
12233+ sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
12234+ printk(KERN_INFO "fixed physical RAM map:\n");
12235+ e820_print_map("bad_ppro");
12236+ }
12237 #else
12238- __per_cpu_offset[i] = ptr - __per_cpu_start;
12239+ early_gart_iommu_check();
12240 #endif
12241- memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
12242+#endif /* CONFIG_XEN */
12243
12244- highest_cpu = i;
12245+ /*
12246+ * partially used pages are not usable - thus
12247+ * we are rounding upwards:
12248+ */
12249+ max_pfn = e820_end_of_ram_pfn();
12250+
12251+ /* preallocate 4k for mptable mpc */
12252+ early_reserve_e820_mpc_new();
12253+ /* update e820 for memory not covered by WB MTRRs */
12254+ mtrr_bp_init();
12255+#ifndef CONFIG_XEN
12256+ if (mtrr_trim_uncached_memory(max_pfn))
12257+ max_pfn = e820_end_of_ram_pfn();
12258+#endif
12259+
12260+#ifdef CONFIG_X86_32
12261+ /* max_low_pfn get updated here */
12262+ find_low_pfn_range();
12263+#else
12264+ num_physpages = max_pfn;
12265+ max_mapnr = max_pfn;
12266+
12267+
12268+ /* How many end-of-memory variables you have, grandma! */
12269+ /* need this before calling reserve_initrd */
12270+ if (max_pfn > (1UL<<(32 - PAGE_SHIFT)))
12271+ max_low_pfn = e820_end_of_low_ram_pfn();
12272+ else
12273+ max_low_pfn = max_pfn;
12274+
12275+ high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
12276+#endif
12277+
12278+ /* max_pfn_mapped is updated here */
12279+ max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
12280+ max_pfn_mapped = max_low_pfn_mapped;
12281+
12282+#ifdef CONFIG_X86_64
12283+ if (max_pfn > max_low_pfn) {
12284+ max_pfn_mapped = init_memory_mapping(1UL<<32,
12285+ max_pfn<<PAGE_SHIFT);
12286+		/* can we preserve max_low_pfn? */
12287+ max_low_pfn = max_pfn;
12288 }
12289+#endif
12290
12291- nr_cpu_ids = highest_cpu + 1;
12292- printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d\n", NR_CPUS, nr_cpu_ids);
12293+ /*
12294+ * NOTE: On x86-32, only from this point on, fixmaps are ready for use.
12295+ */
12296
12297- /* Setup percpu data maps */
12298- setup_per_cpu_maps();
12299+#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
12300+ if (init_ohci1394_dma_early)
12301+ init_ohci1394_dma_on_all_controllers();
12302+#endif
12303
12304- /* Setup cpumask_of_cpu map */
12305- setup_cpumask_of_cpu();
12306-}
12307+ reserve_initrd();
12308+
12309+#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
12310+ vsmp_init();
12311+#endif
12312+
12313+ if (is_initial_xendomain())
12314+ dmi_scan_machine();
12315+
12316+ io_delay_init();
12317+
12318+#ifdef CONFIG_ACPI
12319+ if (!is_initial_xendomain()) {
12320+ printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
12321+ disable_acpi();
12322+ }
12323+#endif
12324+
12325+ /*
12326+ * Parse the ACPI tables for possible boot-time SMP configuration.
12327+ */
12328+ acpi_boot_table_init();
12329+
12330+#ifdef CONFIG_ACPI_NUMA
12331+ /*
12332+ * Parse SRAT to discover nodes.
12333+ */
12334+ acpi_numa_init();
12335+#endif
12336+
12337+ initmem_init(0, max_pfn);
12338
12339+#ifdef CONFIG_ACPI_SLEEP
12340+ /*
12341+ * Reserve low memory region for sleep support.
12342+ */
12343+ acpi_reserve_bootmem();
12344 #endif
12345+#ifdef CONFIG_X86_FIND_SMP_CONFIG
12346+ /*
12347+ * Find and reserve possible boot-time SMP configuration:
12348+ */
12349+ find_smp_config();
12350+#endif
12351+ reserve_crashkernel();
12352+
12353+#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
12354+ /*
12355+ * dma32_reserve_bootmem() allocates bootmem which may conflict
12356+ * with the crashkernel command line, so do that after
12357+ * reserve_crashkernel()
12358+ */
12359+ dma32_reserve_bootmem();
12360+#endif
12361+
12362+ reserve_ibft_region();
12363+
12364+#ifdef CONFIG_KVM_CLOCK
12365+ kvmclock_init();
12366+#endif
12367+
12368+ xen_pagetable_setup_start(swapper_pg_dir);
12369+ paging_init();
12370+ xen_pagetable_setup_done(swapper_pg_dir);
12371+ paravirt_post_allocator_init();
12372+
12373+#ifdef CONFIG_X86_64
12374+ map_vsyscall();
12375+#endif
12376+
12377+#ifdef CONFIG_XEN
12378+ p2m_pages = max_pfn;
12379+ if (xen_start_info->nr_pages > max_pfn) {
12380+ /*
12381+ * the max_pfn was shrunk (probably by mem= or highmem=
12382+ * kernel parameter); shrink reservation with the HV
12383+ */
12384+ struct xen_memory_reservation reservation = {
12385+ .address_bits = 0,
12386+ .extent_order = 0,
12387+ .domid = DOMID_SELF
12388+ };
12389+ unsigned int difference;
12390+ int ret;
12391+
12392+ difference = xen_start_info->nr_pages - max_pfn;
12393+
12394+ set_xen_guest_handle(reservation.extent_start,
12395+ ((unsigned long *)xen_start_info->mfn_list) + max_pfn);
12396+ reservation.nr_extents = difference;
12397+ ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
12398+ &reservation);
12399+ BUG_ON(ret != difference);
12400+ }
12401+ else if (max_pfn > xen_start_info->nr_pages)
12402+ p2m_pages = xen_start_info->nr_pages;
12403+
12404+ if (!xen_feature(XENFEAT_auto_translated_physmap)) {
12405+ unsigned long i, j;
12406+ unsigned int k, fpp;
12407+
12408+ /* Make sure we have a large enough P->M table. */
12409+ phys_to_machine_mapping = alloc_bootmem_pages(
12410+ max_pfn * sizeof(unsigned long));
12411+ memset(phys_to_machine_mapping, ~0,
12412+ max_pfn * sizeof(unsigned long));
12413+ memcpy(phys_to_machine_mapping,
12414+ (unsigned long *)xen_start_info->mfn_list,
12415+ p2m_pages * sizeof(unsigned long));
12416+ free_bootmem(
12417+ __pa(xen_start_info->mfn_list),
12418+ PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
12419+ sizeof(unsigned long))));
12420+
12421+ /*
12422+ * Initialise the list of the frames that specify the list of
12423+ * frames that make up the p2m table. Used by save/restore.
12424+ */
12425+ pfn_to_mfn_frame_list_list = alloc_bootmem_pages(PAGE_SIZE);
12426+
12427+ fpp = PAGE_SIZE/sizeof(unsigned long);
12428+ for (i = j = 0, k = -1; i < max_pfn; i += fpp, j++) {
12429+ if (j == fpp)
12430+ j = 0;
12431+ if (j == 0) {
12432+ k++;
12433+ BUG_ON(k>=ARRAY_SIZE(pfn_to_mfn_frame_list));
12434+ pfn_to_mfn_frame_list[k] =
12435+ alloc_bootmem_pages(PAGE_SIZE);
12436+ pfn_to_mfn_frame_list_list[k] =
12437+ virt_to_mfn(pfn_to_mfn_frame_list[k]);
12438+ }
12439+ pfn_to_mfn_frame_list[k][j] =
12440+ virt_to_mfn(&phys_to_machine_mapping[i]);
12441+ }
12442+ HYPERVISOR_shared_info->arch.max_pfn = max_pfn;
12443+ HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
12444+ virt_to_mfn(pfn_to_mfn_frame_list_list);
12445+ }
12446+
12447+ /* Mark all ISA DMA channels in-use - using them wouldn't work. */
12448+ for (i = 0; i < MAX_DMA_CHANNELS; ++i)
12449+ if (i != 4 && request_dma(i, "xen") != 0)
12450+ BUG();
12451+#endif /* CONFIG_XEN */
12452+
12453+#ifdef CONFIG_X86_GENERICARCH
12454+ generic_apic_probe();
12455+#endif
12456+
12457+#ifndef CONFIG_XEN
12458+ early_quirks();
12459+#endif
12460+
12461+ /*
12462+ * Read APIC and some other early information from ACPI tables.
12463+ */
12464+ acpi_boot_init();
12465+
12466+#if defined(CONFIG_X86_MPPARSE) || defined(CONFIG_X86_VISWS)
12467+ /*
12468+ * get boot-time SMP configuration:
12469+ */
12470+ if (smp_found_config)
12471+ get_smp_config();
12472+#endif
12473+
12474+ prefill_possible_map();
12475+#ifdef CONFIG_X86_64
12476+ init_cpu_to_node();
12477+#endif
12478+
12479+#ifndef CONFIG_XEN
12480+ init_apic_mappings();
12481+ ioapic_init_mappings();
12482+
12483+ kvm_guest_init();
12484+
12485+ e820_reserve_resources();
12486+ e820_mark_nosave_regions(max_low_pfn);
12487+#else
12488+ if (is_initial_xendomain())
12489+ e820_reserve_resources();
12490+#endif
12491+
12492+#ifdef CONFIG_X86_32
12493+ request_resource(&iomem_resource, &video_ram_resource);
12494+#endif
12495+ reserve_standard_io_resources();
12496+
12497+#ifndef CONFIG_XEN
12498+ e820_setup_gap();
12499+
12500+#ifdef CONFIG_VT
12501+#if defined(CONFIG_VGA_CONSOLE)
12502+ if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
12503+ conswitchp = &vga_con;
12504+#elif defined(CONFIG_DUMMY_CONSOLE)
12505+ conswitchp = &dummy_con;
12506+#endif
12507+#endif
12508+#else /* CONFIG_XEN */
12509+ if (is_initial_xendomain())
12510+ e820_setup_gap();
12511+
12512+ set_iopl.iopl = 1;
12513+ WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
12514+
12515+#ifdef CONFIG_VT
12516+#ifdef CONFIG_DUMMY_CONSOLE
12517+ conswitchp = &dummy_con;
12518+#endif
12519+#ifdef CONFIG_VGA_CONSOLE
12520+ if (is_initial_xendomain())
12521+ conswitchp = &vga_con;
12522+#endif
12523+#endif
12524+#endif /* CONFIG_XEN */
12525+}
12526+
12527+#ifdef CONFIG_XEN
12528+static int
12529+xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
12530+{
12531+ HYPERVISOR_shutdown(SHUTDOWN_crash);
12532+ /* we're never actually going to get here... */
12533+ return NOTIFY_DONE;
12534+}
12535+#endif /* CONFIG_XEN */
12536--- sle11-2009-06-04.orig/arch/x86/kernel/setup64-xen.c 2009-03-16 16:38:05.000000000 +0100
12537+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
12538@@ -1,370 +0,0 @@
12539-/*
12540- * X86-64 specific CPU setup.
12541- * Copyright (C) 1995 Linus Torvalds
12542- * Copyright 2001, 2002, 2003 SuSE Labs / Andi Kleen.
12543- * See setup.c for older changelog.
12544- *
12545- * Jun Nakajima <jun.nakajima@intel.com>
12546- * Modified for Xen
12547- *
12548- */
12549-#include <linux/init.h>
12550-#include <linux/kernel.h>
12551-#include <linux/sched.h>
12552-#include <linux/string.h>
12553-#include <linux/bootmem.h>
12554-#include <linux/bitops.h>
12555-#include <linux/module.h>
12556-#include <linux/kgdb.h>
12557-#include <asm/pda.h>
12558-#include <asm/pgtable.h>
12559-#include <asm/processor.h>
12560-#include <asm/desc.h>
12561-#include <asm/atomic.h>
12562-#include <asm/mmu_context.h>
12563-#include <asm/smp.h>
12564-#include <asm/i387.h>
12565-#include <asm/percpu.h>
12566-#include <asm/proto.h>
12567-#include <asm/sections.h>
12568-#include <asm/setup.h>
12569-#include <asm/genapic.h>
12570-#ifdef CONFIG_XEN
12571-#include <asm/hypervisor.h>
12572-#endif
12573-
12574-#ifndef CONFIG_DEBUG_BOOT_PARAMS
12575-struct boot_params __initdata boot_params;
12576-#else
12577-struct boot_params boot_params;
12578-#endif
12579-
12580-cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
12581-
12582-struct x8664_pda *_cpu_pda[NR_CPUS] __read_mostly;
12583-EXPORT_SYMBOL(_cpu_pda);
12584-struct x8664_pda boot_cpu_pda[NR_CPUS] __cacheline_aligned;
12585-
12586-#ifndef CONFIG_X86_NO_IDT
12587-struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
12588-#endif
12589-
12590-char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned")));
12591-
12592-unsigned long __supported_pte_mask __read_mostly = ~0UL;
12593-EXPORT_SYMBOL(__supported_pte_mask);
12594-
12595-static int do_not_nx __cpuinitdata = 0;
12596-
12597-/* noexec=on|off
12598-Control non executable mappings for 64bit processes.
12599-
12600-on Enable(default)
12601-off Disable
12602-*/
12603-static int __init nonx_setup(char *str)
12604-{
12605- if (!str)
12606- return -EINVAL;
12607- if (!strncmp(str, "on", 2)) {
12608- __supported_pte_mask |= _PAGE_NX;
12609- do_not_nx = 0;
12610- } else if (!strncmp(str, "off", 3)) {
12611- do_not_nx = 1;
12612- __supported_pte_mask &= ~_PAGE_NX;
12613- }
12614- return 0;
12615-}
12616-early_param("noexec", nonx_setup);
12617-
12618-int force_personality32 = 0;
12619-
12620-/* noexec32=on|off
12621-Control non executable heap for 32bit processes.
12622-To control the stack too use noexec=off
12623-
12624-on PROT_READ does not imply PROT_EXEC for 32bit processes (default)
12625-off PROT_READ implies PROT_EXEC
12626-*/
12627-static int __init nonx32_setup(char *str)
12628-{
12629- if (!strcmp(str, "on"))
12630- force_personality32 &= ~READ_IMPLIES_EXEC;
12631- else if (!strcmp(str, "off"))
12632- force_personality32 |= READ_IMPLIES_EXEC;
12633- return 1;
12634-}
12635-__setup("noexec32=", nonx32_setup);
12636-
12637-#ifdef CONFIG_XEN
12638-static void __init_refok switch_pt(int cpu)
12639-{
12640- if (cpu == 0)
12641- xen_init_pt();
12642- xen_pt_switch(__pa_symbol(init_level4_pgt));
12643- xen_new_user_pt(__pa_symbol(__user_pgd(init_level4_pgt)));
12644-}
12645-#define switch_pt() switch_pt(cpu)
12646-
12647-static void __cpuinit cpu_gdt_init(const struct desc_ptr *gdt_descr)
12648-{
12649- unsigned long frames[16];
12650- unsigned long va;
12651- int f;
12652-
12653- for (va = gdt_descr->address, f = 0;
12654- va < gdt_descr->address + gdt_descr->size;
12655- va += PAGE_SIZE, f++) {
12656- frames[f] = virt_to_mfn(va);
12657- make_page_readonly(
12658- (void *)va, XENFEAT_writable_descriptor_tables);
12659- }
12660- if (HYPERVISOR_set_gdt(frames, (gdt_descr->size + 1) /
12661- sizeof (struct desc_struct)))
12662- BUG();
12663-}
12664-#else
12665-static void switch_pt(void)
12666-{
12667- asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt)));
12668-}
12669-
12670-static void __cpuinit cpu_gdt_init(const struct desc_ptr *gdt_descr)
12671-{
12672- load_gdt(gdt_descr);
12673- load_idt(idt_descr);
12674-}
12675-#endif
12676-
12677-void pda_init(int cpu)
12678-{
12679- struct x8664_pda *pda = cpu_pda(cpu);
12680-
12681- /* Setup up data that may be needed in __get_free_pages early */
12682- asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0));
12683-#ifndef CONFIG_XEN
12684- /* Memory clobbers used to order PDA accessed */
12685- mb();
12686- wrmsrl(MSR_GS_BASE, pda);
12687- mb();
12688-#else
12689- if (HYPERVISOR_set_segment_base(SEGBASE_GS_KERNEL,
12690- (unsigned long)pda))
12691- BUG();
12692-#endif
12693- pda->cpunumber = cpu;
12694- pda->irqcount = -1;
12695- pda->kernelstack =
12696- (unsigned long)stack_thread_info() - PDA_STACKOFFSET + THREAD_SIZE;
12697- pda->active_mm = &init_mm;
12698- pda->mmu_state = 0;
12699-
12700- if (cpu == 0) {
12701- /* others are initialized in smpboot.c */
12702- pda->pcurrent = &init_task;
12703- pda->irqstackptr = boot_cpu_stack;
12704- } else {
12705- pda->irqstackptr = (char *)
12706- __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
12707- if (!pda->irqstackptr)
12708- panic("cannot allocate irqstack for cpu %d", cpu);
12709- }
12710-
12711- switch_pt();
12712-
12713- pda->irqstackptr += IRQSTACKSIZE-64;
12714-}
12715-
12716-#ifndef CONFIG_X86_NO_TSS
12717-char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]
12718-__attribute__((section(".bss.page_aligned")));
12719-#endif
12720-
12721-extern asmlinkage void ignore_sysret(void);
12722-
12723-/* May not be marked __init: used by software suspend */
12724-void syscall_init(void)
12725-{
12726-#ifndef CONFIG_XEN
12727- /*
12728- * LSTAR and STAR live in a bit strange symbiosis.
12729- * They both write to the same internal register. STAR allows to set CS/DS
12730- * but only a 32bit target. LSTAR sets the 64bit rip.
12731- */
12732- wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
12733- wrmsrl(MSR_LSTAR, system_call);
12734- wrmsrl(MSR_CSTAR, ignore_sysret);
12735-
12736- /* Flags to clear on syscall */
12737- wrmsrl(MSR_SYSCALL_MASK,
12738- X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
12739-#endif
12740-#ifdef CONFIG_IA32_EMULATION
12741- syscall32_cpu_init ();
12742-#else
12743- {
12744- static const struct callback_register cstar = {
12745- .type = CALLBACKTYPE_syscall32,
12746- .address = (unsigned long)ignore_sysret
12747- };
12748- if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar))
12749- printk(KERN_WARN "Unable to register CSTAR callback\n");
12750- }
12751-#endif
12752-}
12753-
12754-void __cpuinit check_efer(void)
12755-{
12756- unsigned long efer;
12757-
12758- rdmsrl(MSR_EFER, efer);
12759- if (!(efer & EFER_NX) || do_not_nx) {
12760- __supported_pte_mask &= ~_PAGE_NX;
12761- }
12762-}
12763-
12764-unsigned long kernel_eflags;
12765-
12766-#ifndef CONFIG_X86_NO_TSS
12767-/*
12768- * Copies of the original ist values from the tss are only accessed during
12769- * debugging, no special alignment required.
12770- */
12771-DEFINE_PER_CPU(struct orig_ist, orig_ist);
12772-#endif
12773-
12774-/*
12775- * cpu_init() initializes state that is per-CPU. Some data is already
12776- * initialized (naturally) in the bootstrap process, such as the GDT
12777- * and IDT. We reload them nevertheless, this function acts as a
12778- * 'CPU state barrier', nothing should get across.
12779- * A lot of state is already set up in PDA init.
12780- */
12781-void __cpuinit cpu_init (void)
12782-{
12783- int cpu = stack_smp_processor_id();
12784-#ifndef CONFIG_X86_NO_TSS
12785- struct tss_struct *t = &per_cpu(init_tss, cpu);
12786- struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
12787- unsigned long v;
12788- char *estacks = NULL;
12789- unsigned i;
12790-#endif
12791- struct task_struct *me;
12792-
12793- /* CPU 0 is initialised in head64.c */
12794- if (cpu != 0) {
12795- pda_init(cpu);
12796- }
12797-#ifndef CONFIG_X86_NO_TSS
12798- else
12799- estacks = boot_exception_stacks;
12800-#endif
12801-
12802- me = current;
12803-
12804- if (cpu_test_and_set(cpu, cpu_initialized))
12805- panic("CPU#%d already initialized!\n", cpu);
12806-
12807- printk("Initializing CPU#%d\n", cpu);
12808-
12809- clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
12810-
12811- /*
12812- * Initialize the per-CPU GDT with the boot GDT,
12813- * and set up the GDT descriptor:
12814- */
12815-#ifndef CONFIG_XEN
12816- if (cpu)
12817- memcpy(get_cpu_gdt_table(cpu), cpu_gdt_table, GDT_SIZE);
12818-#endif
12819-
12820- cpu_gdt_descr[cpu].size = GDT_SIZE;
12821- cpu_gdt_init(&cpu_gdt_descr[cpu]);
12822-
12823- memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
12824- syscall_init();
12825-
12826- wrmsrl(MSR_FS_BASE, 0);
12827- wrmsrl(MSR_KERNEL_GS_BASE, 0);
12828- barrier();
12829-
12830- check_efer();
12831-
12832-#ifndef CONFIG_X86_NO_TSS
12833- /*
12834- * set up and load the per-CPU TSS
12835- */
12836- for (v = 0; v < N_EXCEPTION_STACKS; v++) {
12837- static const unsigned int order[N_EXCEPTION_STACKS] = {
12838- [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
12839- [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
12840- };
12841- if (cpu) {
12842- estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
12843- if (!estacks)
12844- panic("Cannot allocate exception stack %ld %d\n",
12845- v, cpu);
12846- }
12847- estacks += PAGE_SIZE << order[v];
12848- orig_ist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks;
12849- }
12850-
12851- t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
12852- /*
12853- * <= is required because the CPU will access up to
12854- * 8 bits beyond the end of the IO permission bitmap.
12855- */
12856- for (i = 0; i <= IO_BITMAP_LONGS; i++)
12857- t->io_bitmap[i] = ~0UL;
12858-#endif
12859-
12860- atomic_inc(&init_mm.mm_count);
12861- me->active_mm = &init_mm;
12862- if (me->mm)
12863- BUG();
12864- enter_lazy_tlb(&init_mm, me);
12865-
12866-#ifndef CONFIG_X86_NO_TSS
12867- set_tss_desc(cpu, t);
12868-#endif
12869-#ifndef CONFIG_XEN
12870- load_TR_desc();
12871-#endif
12872- load_LDT(&init_mm.context);
12873-
12874-#ifdef CONFIG_KGDB
12875- /*
12876- * If the kgdb is connected no debug regs should be altered. This
12877- * is only applicable when KGDB and a KGDB I/O module are built
12878- * into the kernel and you are using early debugging with
12879- * kgdbwait. KGDB will control the kernel HW breakpoint registers.
12880- */
12881- if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
12882- arch_kgdb_ops.correct_hw_break();
12883- else {
12884-#endif
12885- /*
12886- * Clear all 6 debug registers:
12887- */
12888-
12889- set_debugreg(0UL, 0);
12890- set_debugreg(0UL, 1);
12891- set_debugreg(0UL, 2);
12892- set_debugreg(0UL, 3);
12893- set_debugreg(0UL, 6);
12894- set_debugreg(0UL, 7);
12895-#ifdef CONFIG_KGDB
12896- /* If the kgdb is connected no debug regs should be altered. */
12897- }
12898-#endif
12899-
12900- fpu_init();
12901-
12902- asm ("pushfq; popq %0" : "=rm" (kernel_eflags));
12903- if (raw_irqs_disabled())
12904- kernel_eflags &= ~X86_EFLAGS_IF;
12905-
12906- if (is_uv_system())
12907- uv_cpu_init();
12908-}
12909--- sle11-2009-06-04.orig/arch/x86/kernel/setup_32-xen.c 2009-03-16 16:38:05.000000000 +0100
12910+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
12911@@ -1,1151 +0,0 @@
12912-/*
12913- * Copyright (C) 1995 Linus Torvalds
12914- *
12915- * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
12916- *
12917- * Memory region support
12918- * David Parsons <orc@pell.chi.il.us>, July-August 1999
12919- *
12920- * Added E820 sanitization routine (removes overlapping memory regions);
12921- * Brian Moyle <bmoyle@mvista.com>, February 2001
12922- *
12923- * Moved CPU detection code to cpu/${cpu}.c
12924- * Patrick Mochel <mochel@osdl.org>, March 2002
12925- *
12926- * Provisions for empty E820 memory regions (reported by certain BIOSes).
12927- * Alex Achenbach <xela@slit.de>, December 2002.
12928- *
12929- */
12930-
12931-/*
12932- * This file handles the architecture-dependent parts of initialization
12933- */
12934-
12935-#include <linux/sched.h>
12936-#include <linux/mm.h>
12937-#include <linux/mmzone.h>
12938-#include <linux/screen_info.h>
12939-#include <linux/ioport.h>
12940-#include <linux/acpi.h>
12941-#include <linux/apm_bios.h>
12942-#include <linux/initrd.h>
12943-#include <linux/bootmem.h>
12944-#include <linux/seq_file.h>
12945-#include <linux/console.h>
12946-#include <linux/mca.h>
12947-#include <linux/root_dev.h>
12948-#include <linux/highmem.h>
12949-#include <linux/module.h>
12950-#include <linux/efi.h>
12951-#include <linux/init.h>
12952-#include <linux/edd.h>
12953-#include <linux/iscsi_ibft.h>
12954-#include <linux/nodemask.h>
12955-#include <linux/kernel.h>
12956-#include <linux/percpu.h>
12957-#include <linux/notifier.h>
12958-#include <linux/kexec.h>
12959-#include <linux/crash_dump.h>
12960-#include <linux/dmi.h>
12961-#include <linux/pfn.h>
12962-#include <linux/pci.h>
12963-#include <linux/init_ohci1394_dma.h>
12964-#include <linux/kvm_para.h>
12965-
12966-#include <video/edid.h>
12967-
12968-#include <asm/mtrr.h>
12969-#include <asm/apic.h>
12970-#include <asm/e820.h>
12971-#include <asm/mpspec.h>
12972-#include <asm/mmzone.h>
12973-#include <asm/setup.h>
12974-#include <asm/arch_hooks.h>
12975-#include <asm/sections.h>
12976-#include <asm/io_apic.h>
12977-#include <asm/ist.h>
12978-#include <asm/io.h>
12979-#include <asm/hypervisor.h>
12980-#include <xen/interface/physdev.h>
12981-#include <xen/interface/memory.h>
12982-#include <xen/features.h>
12983-#include <xen/firmware.h>
12984-#include <xen/xencons.h>
12985-#include <setup_arch.h>
12986-#include <asm/bios_ebda.h>
12987-#include <asm/cacheflush.h>
12988-#include <asm/processor.h>
12989-
12990-#ifdef CONFIG_XEN
12991-#include <xen/interface/kexec.h>
12992-#endif
12993-
12994-static int xen_panic_event(struct notifier_block *, unsigned long, void *);
12995-static struct notifier_block xen_panic_block = {
12996- xen_panic_event, NULL, 0 /* try to go last */
12997-};
12998-
12999-/*
13000- * Machine setup..
13001- */
13002-static struct resource data_resource = {
13003- .name = "Kernel data",
13004- .start = 0,
13005- .end = 0,
13006- .flags = IORESOURCE_BUSY | IORESOURCE_MEM
13007-};
13008-
13009-static struct resource code_resource = {
13010- .name = "Kernel code",
13011- .start = 0,
13012- .end = 0,
13013- .flags = IORESOURCE_BUSY | IORESOURCE_MEM
13014-};
13015-
13016-static struct resource bss_resource = {
13017- .name = "Kernel bss",
13018- .start = 0,
13019- .end = 0,
13020- .flags = IORESOURCE_BUSY | IORESOURCE_MEM
13021-};
13022-
13023-static struct resource video_ram_resource = {
13024- .name = "Video RAM area",
13025- .start = 0xa0000,
13026- .end = 0xbffff,
13027- .flags = IORESOURCE_BUSY | IORESOURCE_MEM
13028-};
13029-
13030-static struct resource standard_io_resources[] = { {
13031- .name = "dma1",
13032- .start = 0x0000,
13033- .end = 0x001f,
13034- .flags = IORESOURCE_BUSY | IORESOURCE_IO
13035-}, {
13036- .name = "pic1",
13037- .start = 0x0020,
13038- .end = 0x0021,
13039- .flags = IORESOURCE_BUSY | IORESOURCE_IO
13040-}, {
13041- .name = "timer0",
13042- .start = 0x0040,
13043- .end = 0x0043,
13044- .flags = IORESOURCE_BUSY | IORESOURCE_IO
13045-}, {
13046- .name = "timer1",
13047- .start = 0x0050,
13048- .end = 0x0053,
13049- .flags = IORESOURCE_BUSY | IORESOURCE_IO
13050-}, {
13051- .name = "keyboard",
13052- .start = 0x0060,
13053- .end = 0x0060,
13054- .flags = IORESOURCE_BUSY | IORESOURCE_IO
13055-}, {
13056- .name = "keyboard",
13057- .start = 0x0064,
13058- .end = 0x0064,
13059- .flags = IORESOURCE_BUSY | IORESOURCE_IO
13060-}, {
13061- .name = "dma page reg",
13062- .start = 0x0080,
13063- .end = 0x008f,
13064- .flags = IORESOURCE_BUSY | IORESOURCE_IO
13065-}, {
13066- .name = "pic2",
13067- .start = 0x00a0,
13068- .end = 0x00a1,
13069- .flags = IORESOURCE_BUSY | IORESOURCE_IO
13070-}, {
13071- .name = "dma2",
13072- .start = 0x00c0,
13073- .end = 0x00df,
13074- .flags = IORESOURCE_BUSY | IORESOURCE_IO
13075-}, {
13076- .name = "fpu",
13077- .start = 0x00f0,
13078- .end = 0x00ff,
13079- .flags = IORESOURCE_BUSY | IORESOURCE_IO
13080-} };
13081-
13082-/* cpu data as detected by the assembly code in head.S */
13083-struct cpuinfo_x86 new_cpu_data __cpuinitdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
13084-/* common cpu data for all cpus */
13085-struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
13086-EXPORT_SYMBOL(boot_cpu_data);
13087-
13088-unsigned int def_to_bigsmp;
13089-
13090-#ifndef CONFIG_X86_PAE
13091-unsigned long mmu_cr4_features;
13092-#else
13093-unsigned long mmu_cr4_features = X86_CR4_PAE;
13094-#endif
13095-
13096-/* for MCA, but anyone else can use it if they want */
13097-unsigned int machine_id;
13098-unsigned int machine_submodel_id;
13099-unsigned int BIOS_revision;
13100-
13101-/* Boot loader ID as an integer, for the benefit of proc_dointvec */
13102-int bootloader_type;
13103-
13104-/* user-defined highmem size */
13105-static unsigned int highmem_pages = -1;
13106-
13107-/*
13108- * Setup options
13109- */
13110-struct screen_info screen_info;
13111-EXPORT_SYMBOL(screen_info);
13112-struct apm_info apm_info;
13113-EXPORT_SYMBOL(apm_info);
13114-struct edid_info edid_info;
13115-EXPORT_SYMBOL_GPL(edid_info);
13116-#ifndef CONFIG_XEN
13117-#define copy_edid() (edid_info = boot_params.edid_info)
13118-#endif
13119-struct ist_info ist_info;
13120-#if defined(CONFIG_X86_SPEEDSTEP_SMI) || \
13121- defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
13122-EXPORT_SYMBOL(ist_info);
13123-#endif
13124-
13125-extern void early_cpu_init(void);
13126-extern int root_mountflags;
13127-
13128-unsigned long saved_video_mode;
13129-
13130-#define RAMDISK_IMAGE_START_MASK 0x07FF
13131-#define RAMDISK_PROMPT_FLAG 0x8000
13132-#define RAMDISK_LOAD_FLAG 0x4000
13133-
13134-static char __initdata command_line[COMMAND_LINE_SIZE];
13135-
13136-#ifndef CONFIG_DEBUG_BOOT_PARAMS
13137-struct boot_params __initdata boot_params;
13138-#else
13139-struct boot_params boot_params;
13140-#endif
13141-
13142-/*
13143- * Point at the empty zero page to start with. We map the real shared_info
13144- * page as soon as fixmap is up and running.
13145- */
13146-shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
13147-EXPORT_SYMBOL(HYPERVISOR_shared_info);
13148-
13149-unsigned long *phys_to_machine_mapping;
13150-unsigned long *pfn_to_mfn_frame_list_list, *pfn_to_mfn_frame_list[16];
13151-EXPORT_SYMBOL(phys_to_machine_mapping);
13152-
13153-/* Raw start-of-day parameters from the hypervisor. */
13154-start_info_t *xen_start_info;
13155-EXPORT_SYMBOL(xen_start_info);
13156-
13157-#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
13158-struct edd edd;
13159-#ifdef CONFIG_EDD_MODULE
13160-EXPORT_SYMBOL(edd);
13161-#endif
13162-#ifndef CONFIG_XEN
13163-/**
13164- * copy_edd() - Copy the BIOS EDD information
13165- * from boot_params into a safe place.
13166- *
13167- */
13168-static inline void copy_edd(void)
13169-{
13170- memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer,
13171- sizeof(edd.mbr_signature));
13172- memcpy(edd.edd_info, boot_params.eddbuf, sizeof(edd.edd_info));
13173- edd.mbr_signature_nr = boot_params.edd_mbr_sig_buf_entries;
13174- edd.edd_info_nr = boot_params.eddbuf_entries;
13175-}
13176-#endif
13177-#else
13178-static inline void copy_edd(void)
13179-{
13180-}
13181-#endif
13182-
13183-int __initdata user_defined_memmap;
13184-
13185-/*
13186- * "mem=nopentium" disables the 4MB page tables.
13187- * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM
13188- * to <mem>, overriding the bios size.
13189- * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from
13190- * <start> to <start>+<mem>, overriding the bios size.
13191- *
13192- * HPA tells me bootloaders need to parse mem=, so no new
13193- * option should be mem= [also see Documentation/i386/boot.txt]
13194- */
13195-static int __init parse_mem(char *arg)
13196-{
13197- if (!arg)
13198- return -EINVAL;
13199-
13200- if (strcmp(arg, "nopentium") == 0) {
13201- setup_clear_cpu_cap(X86_FEATURE_PSE);
13202- } else {
13203- /* If the user specifies memory size, we
13204- * limit the BIOS-provided memory map to
13205- * that size. exactmap can be used to specify
13206- * the exact map. mem=number can be used to
13207- * trim the existing memory map.
13208- */
13209- unsigned long long mem_size;
13210-
13211- mem_size = memparse(arg, &arg);
13212- limit_regions(mem_size);
13213- user_defined_memmap = 1;
13214- }
13215- return 0;
13216-}
13217-early_param("mem", parse_mem);
13218-
13219-#ifdef CONFIG_PROC_VMCORE
13220-/* elfcorehdr= specifies the location of elf core header
13221- * stored by the crashed kernel.
13222- */
13223-static int __init parse_elfcorehdr(char *arg)
13224-{
13225- if (!arg)
13226- return -EINVAL;
13227-
13228- elfcorehdr_addr = memparse(arg, &arg);
13229- return 0;
13230-}
13231-early_param("elfcorehdr", parse_elfcorehdr);
13232-#endif /* CONFIG_PROC_VMCORE */
13233-
13234-/*
13235- * highmem=size forces highmem to be exactly 'size' bytes.
13236- * This works even on boxes that have no highmem otherwise.
13237- * This also works to reduce highmem size on bigger boxes.
13238- */
13239-static int __init parse_highmem(char *arg)
13240-{
13241- if (!arg)
13242- return -EINVAL;
13243-
13244- highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT;
13245- return 0;
13246-}
13247-early_param("highmem", parse_highmem);
13248-
13249-/*
13250- * vmalloc=size forces the vmalloc area to be exactly 'size'
13251- * bytes. This can be used to increase (or decrease) the
13252- * vmalloc area - the default is 128m.
13253- */
13254-static int __init parse_vmalloc(char *arg)
13255-{
13256- if (!arg)
13257- return -EINVAL;
13258-
13259- __VMALLOC_RESERVE = memparse(arg, &arg);
13260- return 0;
13261-}
13262-early_param("vmalloc", parse_vmalloc);
13263-
13264-#ifndef CONFIG_XEN
13265-/*
13266- * reservetop=size reserves a hole at the top of the kernel address space which
13267- * a hypervisor can load into later. Needed for dynamically loaded hypervisors,
13268- * so relocating the fixmap can be done before paging initialization.
13269- */
13270-static int __init parse_reservetop(char *arg)
13271-{
13272- unsigned long address;
13273-
13274- if (!arg)
13275- return -EINVAL;
13276-
13277- address = memparse(arg, &arg);
13278- reserve_top_address(address);
13279- return 0;
13280-}
13281-early_param("reservetop", parse_reservetop);
13282-#endif
13283-
13284-/*
13285- * Determine low and high memory ranges:
13286- */
13287-unsigned long __init find_max_low_pfn(void)
13288-{
13289- unsigned long max_low_pfn;
13290-
13291- max_low_pfn = max_pfn;
13292- if (max_low_pfn > MAXMEM_PFN) {
13293- if (highmem_pages == -1)
13294- highmem_pages = max_pfn - MAXMEM_PFN;
13295- if (highmem_pages + MAXMEM_PFN < max_pfn)
13296- max_pfn = MAXMEM_PFN + highmem_pages;
13297- if (highmem_pages + MAXMEM_PFN > max_pfn) {
13298- printk("only %luMB highmem pages available, ignoring highmem size of %uMB.\n", pages_to_mb(max_pfn - MAXMEM_PFN), pages_to_mb(highmem_pages));
13299- highmem_pages = 0;
13300- }
13301- max_low_pfn = MAXMEM_PFN;
13302-#ifndef CONFIG_HIGHMEM
13303- /* Maximum memory usable is what is directly addressable */
13304- printk(KERN_WARNING "Warning only %ldMB will be used.\n",
13305- MAXMEM>>20);
13306- if (max_pfn > MAX_NONPAE_PFN)
13307- printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
13308- else
13309- printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
13310- max_pfn = MAXMEM_PFN;
13311-#else /* !CONFIG_HIGHMEM */
13312-#ifndef CONFIG_HIGHMEM64G
13313- if (max_pfn > MAX_NONPAE_PFN) {
13314- max_pfn = MAX_NONPAE_PFN;
13315- printk(KERN_WARNING "Warning only 4GB will be used.\n");
13316- printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
13317- }
13318-#endif /* !CONFIG_HIGHMEM64G */
13319-#endif /* !CONFIG_HIGHMEM */
13320- } else {
13321- if (highmem_pages == -1)
13322- highmem_pages = 0;
13323-#ifdef CONFIG_HIGHMEM
13324- if (highmem_pages >= max_pfn) {
13325- printk(KERN_ERR "highmem size specified (%uMB) is bigger than pages available (%luMB)!.\n", pages_to_mb(highmem_pages), pages_to_mb(max_pfn));
13326- highmem_pages = 0;
13327- }
13328- if (highmem_pages) {
13329- if (max_low_pfn-highmem_pages < 64*1024*1024/PAGE_SIZE){
13330- printk(KERN_ERR "highmem size %uMB results in smaller than 64MB lowmem, ignoring it.\n", pages_to_mb(highmem_pages));
13331- highmem_pages = 0;
13332- }
13333- max_low_pfn -= highmem_pages;
13334- }
13335-#else
13336- if (highmem_pages)
13337- printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n");
13338-#endif
13339- }
13340- return max_low_pfn;
13341-}
13342-
13343-#ifndef CONFIG_XEN
13344-#define BIOS_LOWMEM_KILOBYTES 0x413
13345-
13346-/*
13347- * The BIOS places the EBDA/XBDA at the top of conventional
13348- * memory, and usually decreases the reported amount of
13349- * conventional memory (int 0x12) too. This also contains a
13350- * workaround for Dell systems that neglect to reserve EBDA.
13351- * The same workaround also avoids a problem with the AMD768MPX
13352- * chipset: reserve a page before VGA to prevent PCI prefetch
13353- * into it (errata #56). Usually the page is reserved anyways,
13354- * unless you have no PS/2 mouse plugged in.
13355- */
13356-static void __init reserve_ebda_region(void)
13357-{
13358- unsigned int lowmem, ebda_addr;
13359-
13360- /* To determine the position of the EBDA and the */
13361- /* end of conventional memory, we need to look at */
13362- /* the BIOS data area. In a paravirtual environment */
13363- /* that area is absent. We'll just have to assume */
13364- /* that the paravirt case can handle memory setup */
13365- /* correctly, without our help. */
13366- if (paravirt_enabled())
13367- return;
13368-
13369- /* end of low (conventional) memory */
13370- lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
13371- lowmem <<= 10;
13372-
13373- /* start of EBDA area */
13374- ebda_addr = get_bios_ebda();
13375-
13376- /* Fixup: bios puts an EBDA in the top 64K segment */
13377- /* of conventional memory, but does not adjust lowmem. */
13378- if ((lowmem - ebda_addr) <= 0x10000)
13379- lowmem = ebda_addr;
13380-
13381- /* Fixup: bios does not report an EBDA at all. */
13382- /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
13383- if ((ebda_addr == 0) && (lowmem >= 0x9f000))
13384- lowmem = 0x9f000;
13385-
13386- /* Paranoia: should never happen, but... */
13387- if ((lowmem == 0) || (lowmem >= 0x100000))
13388- lowmem = 0x9f000;
13389-
13390- /* reserve all memory between lowmem and the 1MB mark */
13391- reserve_bootmem(lowmem, 0x100000 - lowmem, BOOTMEM_DEFAULT);
13392-}
13393-#endif
13394-
13395-#ifndef CONFIG_NEED_MULTIPLE_NODES
13396-static void __init setup_bootmem_allocator(void);
13397-static unsigned long __init setup_memory(void)
13398-{
13399- /*
13400- * partially used pages are not usable - thus
13401- * we are rounding upwards:
13402- */
13403- min_low_pfn = PFN_UP(__pa(xen_start_info->pt_base)) +
13404- xen_start_info->nr_pt_frames;
13405-
13406- max_low_pfn = find_max_low_pfn();
13407-
13408-#ifdef CONFIG_HIGHMEM
13409- highstart_pfn = highend_pfn = max_pfn;
13410- if (max_pfn > max_low_pfn) {
13411- highstart_pfn = max_low_pfn;
13412- }
13413- printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
13414- pages_to_mb(highend_pfn - highstart_pfn));
13415- num_physpages = highend_pfn;
13416- high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
13417-#else
13418- num_physpages = max_low_pfn;
13419- high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
13420-#endif
13421-#ifdef CONFIG_FLATMEM
13422- max_mapnr = num_physpages;
13423-#endif
13424- printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
13425- pages_to_mb(max_low_pfn));
13426-
13427- setup_bootmem_allocator();
13428-
13429- return max_low_pfn;
13430-}
13431-
13432-static void __init zone_sizes_init(void)
13433-{
13434- unsigned long max_zone_pfns[MAX_NR_ZONES];
13435- memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
13436- max_zone_pfns[ZONE_DMA] =
13437- virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
13438- max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
13439-#ifdef CONFIG_HIGHMEM
13440- max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
13441- add_active_range(0, 0, highend_pfn);
13442-#else
13443- add_active_range(0, 0, max_low_pfn);
13444-#endif
13445-
13446- free_area_init_nodes(max_zone_pfns);
13447-}
13448-#else
13449-extern unsigned long __init setup_memory(void);
13450-extern void zone_sizes_init(void);
13451-#endif /* !CONFIG_NEED_MULTIPLE_NODES */
13452-
13453-static inline unsigned long long get_total_mem(void)
13454-{
13455- unsigned long long total;
13456-
13457- total = max_low_pfn - min_low_pfn;
13458-#ifdef CONFIG_HIGHMEM
13459- total += highend_pfn - highstart_pfn;
13460-#endif
13461-
13462- return total << PAGE_SHIFT;
13463-}
13464-
13465-#ifdef CONFIG_KEXEC
13466-#ifndef CONFIG_XEN
13467-static void __init reserve_crashkernel(void)
13468-{
13469- unsigned long long total_mem;
13470- unsigned long long crash_size, crash_base;
13471- int ret;
13472-
13473- total_mem = get_total_mem();
13474-
13475- ret = parse_crashkernel(boot_command_line, total_mem,
13476- &crash_size, &crash_base);
13477- if (ret == 0 && crash_size > 0) {
13478- if (crash_base > 0) {
13479- printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
13480- "for crashkernel (System RAM: %ldMB)\n",
13481- (unsigned long)(crash_size >> 20),
13482- (unsigned long)(crash_base >> 20),
13483- (unsigned long)(total_mem >> 20));
13484-
13485- if (reserve_bootmem(crash_base, crash_size,
13486- BOOTMEM_EXCLUSIVE) < 0) {
13487- printk(KERN_INFO "crashkernel reservation "
13488- "failed - memory is in use\n");
13489- return;
13490- }
13491-
13492- crashk_res.start = crash_base;
13493- crashk_res.end = crash_base + crash_size - 1;
13494- } else
13495- printk(KERN_INFO "crashkernel reservation failed - "
13496- "you have to specify a base address\n");
13497- }
13498-}
13499-#else
13500-#define reserve_crashkernel xen_machine_kexec_setup_resources
13501-#endif
13502-#else
13503-static inline void __init reserve_crashkernel(void)
13504-{}
13505-#endif
13506-
13507-#ifdef CONFIG_BLK_DEV_INITRD
13508-
13509-static bool do_relocate_initrd = false;
13510-
13511-static void __init reserve_initrd(void)
13512-{
13513- unsigned long ramdisk_image = __pa(xen_start_info->mod_start);
13514- unsigned long ramdisk_size = xen_start_info->mod_len;
13515- unsigned long ramdisk_end = ramdisk_image + ramdisk_size;
13516- unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
13517- unsigned long ramdisk_here;
13518-
13519- initrd_start = 0;
13520-
13521- if (!xen_start_info->mod_start || !ramdisk_size)
13522- return; /* No initrd provided by bootloader */
13523-
13524- if (ramdisk_end < ramdisk_image) {
13525- printk(KERN_ERR "initrd wraps around end of memory, "
13526- "disabling initrd\n");
13527- return;
13528- }
13529- if (ramdisk_size >= end_of_lowmem/2) {
13530- printk(KERN_ERR "initrd too large to handle, "
13531- "disabling initrd\n");
13532- return;
13533- }
13534- if (ramdisk_end <= end_of_lowmem) {
13535- /* All in lowmem, easy case */
13536- reserve_bootmem(ramdisk_image, ramdisk_size, BOOTMEM_DEFAULT);
13537- initrd_start = ramdisk_image + PAGE_OFFSET;
13538- initrd_end = initrd_start+ramdisk_size;
13539- return;
13540- }
13541-
13542- /* We need to move the initrd down into lowmem */
13543- ramdisk_here = (end_of_lowmem - ramdisk_size) & PAGE_MASK;
13544-
13545- /* Note: this includes all the lowmem currently occupied by
13546- the initrd, we rely on that fact to keep the data intact. */
13547- reserve_bootmem(ramdisk_here, ramdisk_size, BOOTMEM_DEFAULT);
13548- initrd_start = ramdisk_here + PAGE_OFFSET;
13549- initrd_end = initrd_start + ramdisk_size;
13550-
13551- do_relocate_initrd = true;
13552-}
13553-
13554-#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
13555-
13556-static void __init relocate_initrd(void)
13557-{
13558- unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
13559- unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
13560- unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
13561- unsigned long ramdisk_here;
13562- unsigned long slop, clen, mapaddr;
13563- char *p, *q;
13564-
13565- if (!do_relocate_initrd)
13566- return;
13567-
13568- ramdisk_here = initrd_start - PAGE_OFFSET;
13569-
13570- q = (char *)initrd_start;
13571-
13572- /* Copy any lowmem portion of the initrd */
13573- if (ramdisk_image < end_of_lowmem) {
13574- clen = end_of_lowmem - ramdisk_image;
13575- p = (char *)__va(ramdisk_image);
13576- memcpy(q, p, clen);
13577- q += clen;
13578- ramdisk_image += clen;
13579- ramdisk_size -= clen;
13580- }
13581-
13582- /* Copy the highmem portion of the initrd */
13583- while (ramdisk_size) {
13584- slop = ramdisk_image & ~PAGE_MASK;
13585- clen = ramdisk_size;
13586- if (clen > MAX_MAP_CHUNK-slop)
13587- clen = MAX_MAP_CHUNK-slop;
13588- mapaddr = ramdisk_image & PAGE_MASK;
13589- p = early_ioremap(mapaddr, clen+slop);
13590- memcpy(q, p+slop, clen);
13591- early_iounmap(p, clen+slop);
13592- q += clen;
13593- ramdisk_image += clen;
13594- ramdisk_size -= clen;
13595- }
13596-}
13597-
13598-#endif /* CONFIG_BLK_DEV_INITRD */
13599-
13600-void __init setup_bootmem_allocator(void)
13601-{
13602- unsigned long bootmap_size;
13603- /*
13604- * Initialize the boot-time allocator (with low memory only):
13605- */
13606- bootmap_size = init_bootmem(min_low_pfn, max_low_pfn);
13607-
13608- register_bootmem_low_pages(max_low_pfn);
13609-
13610- /*
13611- * Reserve the bootmem bitmap itself as well. We do this in two
13612- * steps (first step was init_bootmem()) because this catches
13613- * the (very unlikely) case of us accidentally initializing the
13614- * bootmem allocator with an invalid RAM area.
13615- */
13616- reserve_bootmem(__pa_symbol(_text), (PFN_PHYS(min_low_pfn) +
13617- bootmap_size + PAGE_SIZE-1) - __pa_symbol(_text),
13618- BOOTMEM_DEFAULT);
13619-
13620-#ifndef CONFIG_XEN
13621- /*
13622- * reserve physical page 0 - it's a special BIOS page on many boxes,
13623- * enabling clean reboots, SMP operation, laptop functions.
13624- */
13625- reserve_bootmem(0, PAGE_SIZE, BOOTMEM_DEFAULT);
13626-
13627- /* reserve EBDA region */
13628- reserve_ebda_region();
13629-
13630-#ifdef CONFIG_SMP
13631- /*
13632- * But first pinch a few for the stack/trampoline stuff
13633- * FIXME: Don't need the extra page at 4K, but need to fix
13634- * trampoline before removing it. (see the GDT stuff)
13635- */
13636- reserve_bootmem(PAGE_SIZE, PAGE_SIZE, BOOTMEM_DEFAULT);
13637-#endif
13638-#ifdef CONFIG_ACPI_SLEEP
13639- /*
13640- * Reserve low memory region for sleep support.
13641- */
13642- acpi_reserve_bootmem();
13643-#endif
13644-#endif /* !CONFIG_XEN */
13645-
13646-#ifdef CONFIG_BLK_DEV_INITRD
13647- reserve_initrd();
13648-#endif
13649- numa_kva_reserve();
13650- reserve_crashkernel();
13651-
13652- reserve_ibft_region();
13653-}
13654-
13655-/*
13656- * The node 0 pgdat is initialized before all of these because
13657- * it's needed for bootmem. node>0 pgdats have their virtual
13658- * space allocated before the pagetables are in place to access
13659- * them, so they can't be cleared then.
13660- *
13661- * This should all compile down to nothing when NUMA is off.
13662- */
13663-static void __init remapped_pgdat_init(void)
13664-{
13665- int nid;
13666-
13667- for_each_online_node(nid) {
13668- if (nid != 0)
13669- memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
13670- }
13671-}
13672-
13673-#ifdef CONFIG_MCA
13674-static void set_mca_bus(int x)
13675-{
13676- MCA_bus = x;
13677-}
13678-#else
13679-static void set_mca_bus(int x) { }
13680-#endif
13681-
13682-/* Overridden in paravirt.c if CONFIG_PARAVIRT */
13683-char * __init __attribute__((weak)) memory_setup(void)
13684-{
13685- return machine_specific_memory_setup();
13686-}
13687-
13688-#ifdef CONFIG_NUMA
13689-/*
13690- * In the golden day, when everything among i386 and x86_64 will be
13691- * integrated, this will not live here
13692- */
13693-void *x86_cpu_to_node_map_early_ptr;
13694-int x86_cpu_to_node_map_init[NR_CPUS] = {
13695- [0 ... NR_CPUS-1] = NUMA_NO_NODE
13696-};
13697-DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE;
13698-#endif
13699-
13700-/*
13701- * Determine if we were loaded by an EFI loader. If so, then we have also been
13702- * passed the efi memmap, systab, etc., so we should use these data structures
13703- * for initialization. Note, the efi init code path is determined by the
13704- * global efi_enabled. This allows the same kernel image to be used on existing
13705- * systems (with a traditional BIOS) as well as on EFI systems.
13706- */
13707-void __init setup_arch(char **cmdline_p)
13708-{
13709- int i, j, k, fpp;
13710- struct physdev_set_iopl set_iopl;
13711- unsigned long max_low_pfn;
13712- unsigned long p2m_pages;
13713-
13714- /* Force a quick death if the kernel panics (not domain 0). */
13715- extern int panic_timeout;
13716- if (!panic_timeout && !is_initial_xendomain())
13717- panic_timeout = 1;
13718-
13719- /* Register a call for panic conditions. */
13720- atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
13721-
13722- WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
13723- VMASST_TYPE_4gb_segments));
13724- WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
13725- VMASST_TYPE_writable_pagetables));
13726-
13727- memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
13728- pre_setup_arch_hook();
13729- early_cpu_init();
13730- early_ioremap_init();
13731-#ifdef CONFIG_SMP
13732- prefill_possible_map();
13733-#endif
13734-
13735-#ifdef CONFIG_EFI
13736- if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
13737- "EL32", 4))
13738- efi_enabled = 1;
13739-#endif
13740-
13741- /* This must be initialized to UNNAMED_MAJOR for ipconfig to work
13742- properly. Setting ROOT_DEV to default to /dev/ram0 breaks initrd.
13743- */
13744- ROOT_DEV = MKDEV(UNNAMED_MAJOR,0);
13745- screen_info = boot_params.screen_info;
13746- copy_edid();
13747- apm_info.bios = boot_params.apm_bios_info;
13748- ist_info = boot_params.ist_info;
13749- saved_video_mode = boot_params.hdr.vid_mode;
13750- if( boot_params.sys_desc_table.length != 0 ) {
13751- set_mca_bus(boot_params.sys_desc_table.table[3] & 0x2);
13752- machine_id = boot_params.sys_desc_table.table[0];
13753- machine_submodel_id = boot_params.sys_desc_table.table[1];
13754- BIOS_revision = boot_params.sys_desc_table.table[2];
13755- }
13756- bootloader_type = boot_params.hdr.type_of_loader;
13757-
13758- if (is_initial_xendomain()) {
13759- const struct dom0_vga_console_info *info =
13760- (void *)((char *)xen_start_info +
13761- xen_start_info->console.dom0.info_off);
13762-
13763- dom0_init_screen_info(info,
13764- xen_start_info->console.dom0.info_size);
13765- xen_start_info->console.domU.mfn = 0;
13766- xen_start_info->console.domU.evtchn = 0;
13767- } else
13768- screen_info.orig_video_isVGA = 0;
13769-
13770-#ifdef CONFIG_BLK_DEV_RAM
13771- rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
13772- rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
13773- rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
13774-#endif
13775-
13776- ARCH_SETUP
13777-
13778- printk(KERN_INFO "BIOS-provided physical RAM map:\n");
13779- print_memory_map(memory_setup());
13780-
13781- copy_edd();
13782-
13783- if (!boot_params.hdr.root_flags)
13784- root_mountflags &= ~MS_RDONLY;
13785- init_mm.start_code = (unsigned long) _text;
13786- init_mm.end_code = (unsigned long) _etext;
13787- init_mm.end_data = (unsigned long) _edata;
13788- init_mm.brk = (PFN_UP(__pa(xen_start_info->pt_base)) +
13789- xen_start_info->nr_pt_frames) << PAGE_SHIFT;
13790-
13791- code_resource.start = virt_to_phys(_text);
13792- code_resource.end = virt_to_phys(_etext)-1;
13793- data_resource.start = virt_to_phys(_etext);
13794- data_resource.end = virt_to_phys(_edata)-1;
13795- bss_resource.start = virt_to_phys(&__bss_start);
13796- bss_resource.end = virt_to_phys(&__bss_stop)-1;
13797-
13798- if ((i = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE)
13799- i = COMMAND_LINE_SIZE;
13800- memcpy(boot_command_line, xen_start_info->cmd_line, i);
13801- boot_command_line[i - 1] = '\0';
13802- parse_early_param();
13803-
13804- if (user_defined_memmap) {
13805- printk(KERN_INFO "user-defined physical RAM map:\n");
13806- print_memory_map("user");
13807- }
13808-
13809- strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
13810- *cmdline_p = command_line;
13811-
13812- if (efi_enabled)
13813- efi_init();
13814-
13815- /* update e820 for memory not covered by WB MTRRs */
13816- propagate_e820_map();
13817- mtrr_bp_init();
13818-#ifndef CONFIG_XEN
13819- if (mtrr_trim_uncached_memory(max_pfn))
13820- propagate_e820_map();
13821-#endif
13822-
13823- max_low_pfn = setup_memory();
13824-
13825-#ifdef CONFIG_KVM_CLOCK
13826- kvmclock_init();
13827-#endif
13828-
13829-#ifdef CONFIG_VMI
13830- /*
13831- * Must be after max_low_pfn is determined, and before kernel
13832- * pagetables are setup.
13833- */
13834- vmi_init();
13835-#endif
13836- kvm_guest_init();
13837-
13838- /*
13839- * NOTE: before this point _nobody_ is allowed to allocate
13840- * any memory using the bootmem allocator. Although the
13841- * allocator is now initialised only the first 8Mb of the kernel
13842- * virtual address space has been mapped. All allocations before
13843- * paging_init() has completed must use the alloc_bootmem_low_pages()
13844- * variant (which allocates DMA'able memory) and care must be taken
13845- * not to exceed the 8Mb limit.
13846- */
13847-
13848-#ifdef CONFIG_SMP
13849- smp_alloc_memory(); /* AP processor realmode stacks in low memory*/
13850-#endif
13851- paging_init();
13852-
13853- /*
13854- * NOTE: On x86-32, only from this point on, fixmaps are ready for use.
13855- */
13856-
13857-#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
13858- if (init_ohci1394_dma_early)
13859- init_ohci1394_dma_on_all_controllers();
13860-#endif
13861-
13862- remapped_pgdat_init();
13863- sparse_init();
13864- zone_sizes_init();
13865-
13866-#ifdef CONFIG_X86_FIND_SMP_CONFIG
13867- /*
13868- * Find and reserve possible boot-time SMP configuration:
13869- */
13870- find_smp_config();
13871-#endif
13872-
13873- p2m_pages = max_pfn;
13874- if (xen_start_info->nr_pages > max_pfn) {
13875- /*
13876- * the max_pfn was shrunk (probably by mem= or highmem=
13877- * kernel parameter); shrink reservation with the HV
13878- */
13879- struct xen_memory_reservation reservation = {
13880- .address_bits = 0,
13881- .extent_order = 0,
13882- .domid = DOMID_SELF
13883- };
13884- unsigned int difference;
13885- int ret;
13886-
13887- difference = xen_start_info->nr_pages - max_pfn;
13888-
13889- set_xen_guest_handle(reservation.extent_start,
13890- ((unsigned long *)xen_start_info->mfn_list) + max_pfn);
13891- reservation.nr_extents = difference;
13892- ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
13893- &reservation);
13894- BUG_ON (ret != difference);
13895- }
13896- else if (max_pfn > xen_start_info->nr_pages)
13897- p2m_pages = xen_start_info->nr_pages;
13898-
13899- /* Make sure we have a correctly sized P->M table. */
13900- if (!xen_feature(XENFEAT_auto_translated_physmap)) {
13901- phys_to_machine_mapping = alloc_bootmem_low_pages(
13902- max_pfn * sizeof(unsigned long));
13903- memset(phys_to_machine_mapping, ~0,
13904- max_pfn * sizeof(unsigned long));
13905- memcpy(phys_to_machine_mapping,
13906- (unsigned long *)xen_start_info->mfn_list,
13907- p2m_pages * sizeof(unsigned long));
13908- free_bootmem(
13909- __pa(xen_start_info->mfn_list),
13910- PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
13911- sizeof(unsigned long))));
13912-
13913- /*
13914- * Initialise the list of the frames that specify the list of
13915- * frames that make up the p2m table. Used by save/restore
13916- */
13917- pfn_to_mfn_frame_list_list = alloc_bootmem_low_pages(PAGE_SIZE);
13918-
13919- fpp = PAGE_SIZE/sizeof(unsigned long);
13920- for (i=0, j=0, k=-1; i< max_pfn; i+=fpp, j++) {
13921- if ((j % fpp) == 0) {
13922- k++;
13923- BUG_ON(k>=16);
13924- pfn_to_mfn_frame_list[k] =
13925- alloc_bootmem_low_pages(PAGE_SIZE);
13926- pfn_to_mfn_frame_list_list[k] =
13927- virt_to_mfn(pfn_to_mfn_frame_list[k]);
13928- j=0;
13929- }
13930- pfn_to_mfn_frame_list[k][j] =
13931- virt_to_mfn(&phys_to_machine_mapping[i]);
13932- }
13933- HYPERVISOR_shared_info->arch.max_pfn = max_pfn;
13934- HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
13935- virt_to_mfn(pfn_to_mfn_frame_list_list);
13936- }
13937-
13938- /* Mark all ISA DMA channels in-use - using them wouldn't work. */
13939- for (i = 0; i < MAX_DMA_CHANNELS; ++i)
13940- if (i != 4 && request_dma(i, "xen") != 0)
13941- BUG();
13942-
13943- /*
13944- * NOTE: at this point the bootmem allocator is fully available.
13945- */
13946-
13947-#ifdef CONFIG_BLK_DEV_INITRD
13948- relocate_initrd();
13949-#endif
13950-
13951- paravirt_post_allocator_init();
13952-
13953- if (is_initial_xendomain())
13954- dmi_scan_machine();
13955-
13956- io_delay_init();
13957-
13958-#if defined(CONFIG_X86_SMP) && !defined(CONFIG_XEN)
13959- /*
13960- * setup to use the early static init tables during kernel startup
13961- * X86_SMP will exclude sub-arches that don't deal well with it.
13962- */
13963- x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init;
13964- x86_bios_cpu_apicid_early_ptr = (void *)x86_bios_cpu_apicid_init;
13965-#ifdef CONFIG_NUMA
13966- x86_cpu_to_node_map_early_ptr = (void *)x86_cpu_to_node_map_init;
13967-#endif
13968-#endif
13969-
13970-#ifdef CONFIG_X86_GENERICARCH
13971- generic_apic_probe();
13972-#endif
13973-
13974- set_iopl.iopl = 1;
13975- WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
13976-
13977-#ifdef CONFIG_ACPI
13978- if (!is_initial_xendomain()) {
13979- printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
13980- acpi_disabled = 1;
13981- acpi_ht = 0;
13982- }
13983-
13984- /*
13985- * Parse the ACPI tables for possible boot-time SMP configuration.
13986- */
13987- acpi_boot_table_init();
13988-#endif
13989-
13990-#ifndef CONFIG_XEN
13991- early_quirks();
13992-#endif
13993-
13994-#ifdef CONFIG_ACPI
13995- acpi_boot_init();
13996-
13997-#if defined(CONFIG_SMP) && defined(CONFIG_X86_PC)
13998- if (def_to_bigsmp)
13999- printk(KERN_WARNING "More than 8 CPUs detected and "
14000- "CONFIG_X86_PC cannot handle it.\nUse "
14001- "CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n");
14002-#endif
14003-#endif
14004-#ifdef CONFIG_X86_LOCAL_APIC
14005- if (smp_found_config)
14006- get_smp_config();
14007-#endif
14008-
14009- e820_register_memory();
14010- e820_mark_nosave_regions();
14011-
14012- if (is_initial_xendomain()) {
14013-#ifdef CONFIG_VT
14014-#if defined(CONFIG_VGA_CONSOLE)
14015- if (!efi_enabled ||
14016- (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
14017- conswitchp = &vga_con;
14018-#elif defined(CONFIG_DUMMY_CONSOLE)
14019- conswitchp = &dummy_con;
14020-#endif
14021-#endif
14022- } else {
14023-#if defined(CONFIG_VT) && defined(CONFIG_DUMMY_CONSOLE)
14024- conswitchp = &dummy_con;
14025-#endif
14026- }
14027-}
14028-
14029-static int
14030-xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
14031-{
14032- HYPERVISOR_shutdown(SHUTDOWN_crash);
14033- /* we're never actually going to get here... */
14034- return NOTIFY_DONE;
14035-}
14036-
14037-/*
14038- * Request address space for all standard resources
14039- *
14040- * This is called just before pcibios_init(), which is also a
14041- * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
14042- */
14043-static int __init request_standard_resources(void)
14044-{
14045- int i;
14046-
14047- /* Nothing to do if not running in dom0. */
14048- if (!is_initial_xendomain())
14049- return 0;
14050-
14051- printk(KERN_INFO "Setting up standard PCI resources\n");
14052- init_iomem_resources(&code_resource, &data_resource, &bss_resource);
14053-
14054- request_resource(&iomem_resource, &video_ram_resource);
14055-
14056- /* request I/O space for devices used on all i[345]86 PCs */
14057- for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
14058- request_resource(&ioport_resource, &standard_io_resources[i]);
14059- return 0;
14060-}
14061-
14062-subsys_initcall(request_standard_resources);
14063--- sle11-2009-06-04.orig/arch/x86/kernel/setup_64-xen.c 2009-03-16 16:38:05.000000000 +0100
14064+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
14065@@ -1,1433 +0,0 @@
14066-/*
14067- * Copyright (C) 1995 Linus Torvalds
14068- */
14069-
14070-/*
14071- * This file handles the architecture-dependent parts of initialization
14072- */
14073-
14074-#include <linux/errno.h>
14075-#include <linux/sched.h>
14076-#include <linux/kernel.h>
14077-#include <linux/mm.h>
14078-#include <linux/stddef.h>
14079-#include <linux/unistd.h>
14080-#include <linux/ptrace.h>
14081-#include <linux/slab.h>
14082-#include <linux/user.h>
14083-#include <linux/screen_info.h>
14084-#include <linux/ioport.h>
14085-#include <linux/delay.h>
14086-#include <linux/init.h>
14087-#include <linux/initrd.h>
14088-#include <linux/highmem.h>
14089-#include <linux/bootmem.h>
14090-#include <linux/module.h>
14091-#include <asm/processor.h>
14092-#include <linux/console.h>
14093-#include <linux/seq_file.h>
14094-#include <linux/crash_dump.h>
14095-#include <linux/root_dev.h>
14096-#include <linux/pci.h>
14097-#include <asm/pci-direct.h>
14098-#include <linux/efi.h>
14099-#include <linux/acpi.h>
14100-#include <linux/kallsyms.h>
14101-#include <linux/edd.h>
14102-#include <linux/iscsi_ibft.h>
14103-#include <linux/mmzone.h>
14104-#include <linux/kexec.h>
14105-#include <linux/cpufreq.h>
14106-#include <linux/dmi.h>
14107-#include <linux/dma-mapping.h>
14108-#include <linux/ctype.h>
14109-#include <linux/sort.h>
14110-#include <linux/uaccess.h>
14111-#include <linux/init_ohci1394_dma.h>
14112-#include <linux/kvm_para.h>
14113-
14114-#include <asm/mtrr.h>
14115-#include <asm/uaccess.h>
14116-#include <asm/system.h>
14117-#include <asm/vsyscall.h>
14118-#include <asm/io.h>
14119-#include <asm/smp.h>
14120-#include <asm/msr.h>
14121-#include <asm/desc.h>
14122-#include <video/edid.h>
14123-#include <asm/e820.h>
14124-#include <asm/dma.h>
14125-#include <asm/gart.h>
14126-#include <asm/mpspec.h>
14127-#include <asm/mmu_context.h>
14128-#include <asm/proto.h>
14129-#include <asm/setup.h>
14130-#include <asm/numa.h>
14131-#include <asm/sections.h>
14132-#include <asm/dmi.h>
14133-#include <asm/cacheflush.h>
14134-#include <asm/mce.h>
14135-#include <asm/ds.h>
14136-#include <asm/topology.h>
14137-#include <asm/pat.h>
14138-
14139-#include <mach_apic.h>
14140-#ifdef CONFIG_XEN
14141-#include <linux/percpu.h>
14142-#include <xen/interface/physdev.h>
14143-#include "setup_arch_pre.h"
14144-#include <asm/hypervisor.h>
14145-#include <xen/interface/nmi.h>
14146-#include <xen/features.h>
14147-#include <xen/firmware.h>
14148-#include <xen/xencons.h>
14149-#define PFN_UP(x) (((x) + PAGE_SIZE-1) >> PAGE_SHIFT)
14150-#define PFN_PHYS(x) ((x) << PAGE_SHIFT)
14151-#include <asm/mach-xen/setup_arch_post.h>
14152-#include <xen/interface/memory.h>
14153-
14154-#ifdef CONFIG_XEN
14155-#include <xen/interface/kexec.h>
14156-#endif
14157-
14158-extern unsigned long start_pfn;
14159-extern struct edid_info edid_info;
14160-
14161-shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
14162-EXPORT_SYMBOL(HYPERVISOR_shared_info);
14163-
14164-static int xen_panic_event(struct notifier_block *, unsigned long, void *);
14165-static struct notifier_block xen_panic_block = {
14166- xen_panic_event, NULL, 0 /* try to go last */
14167-};
14168-
14169-unsigned long *phys_to_machine_mapping;
14170-unsigned long *pfn_to_mfn_frame_list_list, *pfn_to_mfn_frame_list[512];
14171-
14172-EXPORT_SYMBOL(phys_to_machine_mapping);
14173-
14174-DEFINE_PER_CPU(multicall_entry_t, multicall_list[8]);
14175-DEFINE_PER_CPU(int, nr_multicall_ents);
14176-
14177-/* Raw start-of-day parameters from the hypervisor. */
14178-start_info_t *xen_start_info;
14179-EXPORT_SYMBOL(xen_start_info);
14180-#endif
14181-
14182-/*
14183- * Machine setup..
14184- */
14185-
14186-struct cpuinfo_x86 boot_cpu_data __read_mostly;
14187-EXPORT_SYMBOL(boot_cpu_data);
14188-
14189-__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
14190-
14191-unsigned long mmu_cr4_features;
14192-
14193-/* Boot loader ID as an integer, for the benefit of proc_dointvec */
14194-int bootloader_type;
14195-
14196-unsigned long saved_video_mode;
14197-
14198-int force_mwait __cpuinitdata;
14199-
14200-/*
14201- * Early DMI memory
14202- */
14203-int dmi_alloc_index;
14204-char dmi_alloc_data[DMI_MAX_DATA];
14205-
14206-/*
14207- * Setup options
14208- */
14209-struct screen_info screen_info;
14210-EXPORT_SYMBOL(screen_info);
14211-struct sys_desc_table_struct {
14212- unsigned short length;
14213- unsigned char table[0];
14214-};
14215-
14216-struct edid_info edid_info;
14217-EXPORT_SYMBOL_GPL(edid_info);
14218-
14219-extern int root_mountflags;
14220-
14221-char __initdata command_line[COMMAND_LINE_SIZE];
14222-
14223-static struct resource standard_io_resources[] = {
14224- { .name = "dma1", .start = 0x00, .end = 0x1f,
14225- .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14226- { .name = "pic1", .start = 0x20, .end = 0x21,
14227- .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14228- { .name = "timer0", .start = 0x40, .end = 0x43,
14229- .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14230- { .name = "timer1", .start = 0x50, .end = 0x53,
14231- .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14232- { .name = "keyboard", .start = 0x60, .end = 0x60,
14233- .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14234- { .name = "keyboard", .start = 0x64, .end = 0x64,
14235- .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14236- { .name = "dma page reg", .start = 0x80, .end = 0x8f,
14237- .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14238- { .name = "pic2", .start = 0xa0, .end = 0xa1,
14239- .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14240- { .name = "dma2", .start = 0xc0, .end = 0xdf,
14241- .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14242- { .name = "fpu", .start = 0xf0, .end = 0xff,
14243- .flags = IORESOURCE_BUSY | IORESOURCE_IO }
14244-};
14245-
14246-#define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM)
14247-
14248-static struct resource data_resource = {
14249- .name = "Kernel data",
14250- .start = 0,
14251- .end = 0,
14252- .flags = IORESOURCE_RAM,
14253-};
14254-static struct resource code_resource = {
14255- .name = "Kernel code",
14256- .start = 0,
14257- .end = 0,
14258- .flags = IORESOURCE_RAM,
14259-};
14260-static struct resource bss_resource = {
14261- .name = "Kernel bss",
14262- .start = 0,
14263- .end = 0,
14264- .flags = IORESOURCE_RAM,
14265-};
14266-
14267-static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c);
14268-
14269-#ifdef CONFIG_PROC_VMCORE
14270-/* elfcorehdr= specifies the location of elf core header
14271- * stored by the crashed kernel. This option will be passed
14272- * by kexec loader to the capture kernel.
14273- */
14274-static int __init setup_elfcorehdr(char *arg)
14275-{
14276- char *end;
14277- if (!arg)
14278- return -EINVAL;
14279- elfcorehdr_addr = memparse(arg, &end);
14280- return end > arg ? 0 : -EINVAL;
14281-}
14282-early_param("elfcorehdr", setup_elfcorehdr);
14283-#endif
14284-
14285-#ifndef CONFIG_NUMA
14286-static void __init
14287-contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
14288-{
14289- unsigned long bootmap_size, bootmap;
14290-
14291- bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
14292- bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size,
14293- PAGE_SIZE);
14294- if (bootmap == -1L)
14295- panic("Cannot find bootmem map of size %ld\n", bootmap_size);
14296- bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
14297- e820_register_active_regions(0, start_pfn, end_pfn);
14298-#ifdef CONFIG_XEN
14299- free_bootmem_with_active_regions(0, xen_start_info->nr_pages);
14300- early_res_to_bootmem(0, xen_start_info->nr_pages<<PAGE_SHIFT);
14301-#else
14302- free_bootmem_with_active_regions(0, end_pfn);
14303- early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
14304-#endif
14305- reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
14306-}
14307-#endif
14308-
14309-#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
14310-struct edd edd;
14311-#ifdef CONFIG_EDD_MODULE
14312-EXPORT_SYMBOL(edd);
14313-#endif
14314-#ifndef CONFIG_XEN
14315-/**
14316- * copy_edd() - Copy the BIOS EDD information
14317- * from boot_params into a safe place.
14318- *
14319- */
14320-static inline void copy_edd(void)
14321-{
14322- memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer,
14323- sizeof(edd.mbr_signature));
14324- memcpy(edd.edd_info, boot_params.eddbuf, sizeof(edd.edd_info));
14325- edd.mbr_signature_nr = boot_params.edd_mbr_sig_buf_entries;
14326- edd.edd_info_nr = boot_params.eddbuf_entries;
14327-}
14328-#endif
14329-#else
14330-static inline void copy_edd(void)
14331-{
14332-}
14333-#endif
14334-
14335-#ifdef CONFIG_KEXEC
14336-#ifndef CONFIG_XEN
14337-static void __init reserve_crashkernel(void)
14338-{
14339- unsigned long long total_mem;
14340- unsigned long long crash_size, crash_base;
14341- int ret;
14342-
14343- total_mem = ((unsigned long long)max_low_pfn - min_low_pfn) << PAGE_SHIFT;
14344-
14345- ret = parse_crashkernel(boot_command_line, total_mem,
14346- &crash_size, &crash_base);
14347- if (ret == 0 && crash_size) {
14348- if (crash_base <= 0) {
14349- printk(KERN_INFO "crashkernel reservation failed - "
14350- "you have to specify a base address\n");
14351- return;
14352- }
14353-
14354- if (reserve_bootmem(crash_base, crash_size,
14355- BOOTMEM_EXCLUSIVE) < 0) {
14356- printk(KERN_INFO "crashkernel reservation failed - "
14357- "memory is in use\n");
14358- return;
14359- }
14360-
14361- printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
14362- "for crashkernel (System RAM: %ldMB)\n",
14363- (unsigned long)(crash_size >> 20),
14364- (unsigned long)(crash_base >> 20),
14365- (unsigned long)(total_mem >> 20));
14366- crashk_res.start = crash_base;
14367- crashk_res.end = crash_base + crash_size - 1;
14368- insert_resource(&iomem_resource, &crashk_res);
14369- }
14370-}
14371-#else
14372-#define reserve_crashkernel xen_machine_kexec_setup_resources
14373-#endif
14374-#else
14375-static inline void __init reserve_crashkernel(void)
14376-{}
14377-#endif
14378-
14379-/* Overridden in paravirt.c if CONFIG_PARAVIRT */
14380-void __attribute__((weak)) __init memory_setup(void)
14381-{
14382- machine_specific_memory_setup();
14383-}
14384-
14385-static void __init parse_setup_data(void)
14386-{
14387- struct setup_data *data;
14388- unsigned long pa_data;
14389-
14390- if (boot_params.hdr.version < 0x0209)
14391- return;
14392- pa_data = boot_params.hdr.setup_data;
14393- while (pa_data) {
14394- data = early_ioremap(pa_data, PAGE_SIZE);
14395- switch (data->type) {
14396- default:
14397- break;
14398- }
14399-#ifndef CONFIG_DEBUG_BOOT_PARAMS
14400- free_early(pa_data, pa_data+sizeof(*data)+data->len);
14401-#endif
14402- pa_data = data->next;
14403- early_iounmap(data, PAGE_SIZE);
14404- }
14405-}
14406-
14407-#ifdef CONFIG_PCI_MMCONFIG
14408-extern void __cpuinit fam10h_check_enable_mmcfg(void);
14409-extern void __init check_enable_amd_mmconf_dmi(void);
14410-#else
14411-void __cpuinit fam10h_check_enable_mmcfg(void)
14412-{
14413-}
14414-void __init check_enable_amd_mmconf_dmi(void)
14415-{
14416-}
14417-#endif
14418-
14419-/*
14420- * setup_arch - architecture-specific boot-time initializations
14421- *
14422- * Note: On x86_64, fixmaps are ready for use even before this is called.
14423- */
14424-void __init setup_arch(char **cmdline_p)
14425-{
14426- unsigned i;
14427-
14428-#ifdef CONFIG_XEN
14429- extern struct e820map machine_e820;
14430-
14431- printk(KERN_INFO "Command line: %s\n", boot_command_line);
14432-
14433- /* Register a call for panic conditions. */
14434- atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
14435-
14436- WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
14437- VMASST_TYPE_writable_pagetables));
14438-
14439- early_ioremap_init();
14440-
14441- ROOT_DEV = MKDEV(RAMDISK_MAJOR,0);
14442- screen_info = boot_params.screen_info;
14443-
14444- if (is_initial_xendomain()) {
14445- const struct dom0_vga_console_info *info =
14446- (void *)((char *)xen_start_info +
14447- xen_start_info->console.dom0.info_off);
14448-
14449- dom0_init_screen_info(info,
14450- xen_start_info->console.dom0.info_size);
14451- xen_start_info->console.domU.mfn = 0;
14452- xen_start_info->console.domU.evtchn = 0;
14453- } else
14454- screen_info.orig_video_isVGA = 0;
14455-
14456- copy_edid();
14457-#else
14458- printk(KERN_INFO "Command line: %s\n", boot_command_line);
14459-
14460- ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
14461- screen_info = boot_params.screen_info;
14462- edid_info = boot_params.edid_info;
14463-#endif /* !CONFIG_XEN */
14464- saved_video_mode = boot_params.hdr.vid_mode;
14465- bootloader_type = boot_params.hdr.type_of_loader;
14466-
14467-#ifdef CONFIG_BLK_DEV_RAM
14468- rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
14469- rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
14470- rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
14471-#endif
14472-#ifdef CONFIG_EFI
14473- if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
14474- "EL64", 4))
14475- efi_enabled = 1;
14476-#endif
14477-
14478- ARCH_SETUP
14479-
14480- memory_setup();
14481- copy_edd();
14482-
14483- if (!boot_params.hdr.root_flags)
14484- root_mountflags &= ~MS_RDONLY;
14485- init_mm.start_code = (unsigned long) &_text;
14486- init_mm.end_code = (unsigned long) &_etext;
14487- init_mm.end_data = (unsigned long) &_edata;
14488- init_mm.brk = (unsigned long) &_end;
14489-
14490- code_resource.start = virt_to_phys(&_text);
14491- code_resource.end = virt_to_phys(&_etext)-1;
14492- data_resource.start = virt_to_phys(&_etext);
14493- data_resource.end = virt_to_phys(&_edata)-1;
14494- bss_resource.start = virt_to_phys(&__bss_start);
14495- bss_resource.end = virt_to_phys(&__bss_stop)-1;
14496-
14497- early_identify_cpu(&boot_cpu_data);
14498-
14499- strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
14500- *cmdline_p = command_line;
14501-
14502- parse_setup_data();
14503-
14504- parse_early_param();
14505-
14506-#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
14507- if (init_ohci1394_dma_early)
14508- init_ohci1394_dma_on_all_controllers();
14509-#endif
14510-
14511- finish_e820_parsing();
14512-
14513-#ifndef CONFIG_XEN
14514- /* after parse_early_param, so could debug it */
14515- insert_resource(&iomem_resource, &code_resource);
14516- insert_resource(&iomem_resource, &data_resource);
14517- insert_resource(&iomem_resource, &bss_resource);
14518-#endif
14519-
14520- early_gart_iommu_check();
14521-
14522- e820_register_active_regions(0, 0, -1UL);
14523- /*
14524- * partially used pages are not usable - thus
14525- * we are rounding upwards:
14526- */
14527- end_pfn = e820_end_of_ram();
14528- /* update e820 for memory not covered by WB MTRRs */
14529- mtrr_bp_init();
14530-#ifndef CONFIG_XEN
14531- if (mtrr_trim_uncached_memory(end_pfn)) {
14532- e820_register_active_regions(0, 0, -1UL);
14533- end_pfn = e820_end_of_ram();
14534- }
14535-#endif
14536-
14537- num_physpages = end_pfn;
14538- max_mapnr = end_pfn;
14539-
14540- check_efer();
14541-
14542- max_pfn_mapped = init_memory_mapping(0, (max_pfn_mapped << PAGE_SHIFT));
14543- if (efi_enabled)
14544- efi_init();
14545-
14546-#ifndef CONFIG_XEN
14547- vsmp_init();
14548-#endif
14549-
14550- if (is_initial_xendomain())
14551- dmi_scan_machine();
14552-
14553- io_delay_init();
14554-
14555-#ifdef CONFIG_KVM_CLOCK
14556- kvmclock_init();
14557-#endif
14558-
14559-#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
14560- /* setup to use the early static init tables during kernel startup */
14561- x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init;
14562- x86_bios_cpu_apicid_early_ptr = (void *)x86_bios_cpu_apicid_init;
14563-#ifdef CONFIG_NUMA
14564- x86_cpu_to_node_map_early_ptr = (void *)x86_cpu_to_node_map_init;
14565-#endif
14566-#endif
14567-
14568- /* How many end-of-memory variables you have, grandma! */
14569- max_low_pfn = end_pfn;
14570- max_pfn = end_pfn;
14571- high_memory = (void *)__va(end_pfn * PAGE_SIZE - 1) + 1;
14572-
14573- /* Remove active ranges so rediscovery with NUMA-awareness happens */
14574- remove_all_active_ranges();
14575-
14576-#ifdef CONFIG_ACPI_NUMA
14577- /*
14578- * Parse SRAT to discover nodes.
14579- */
14580- acpi_numa_init();
14581-#endif
14582-
14583-#ifdef CONFIG_NUMA
14584- numa_initmem_init(0, end_pfn);
14585-#else
14586- contig_initmem_init(0, end_pfn);
14587-#endif
14588-
14589-#ifndef CONFIG_XEN
14590- dma32_reserve_bootmem();
14591-
14592-#ifdef CONFIG_ACPI_SLEEP
14593- /*
14594- * Reserve low memory region for sleep support.
14595- */
14596- acpi_reserve_bootmem();
14597-#endif
14598-
14599- if (efi_enabled)
14600- efi_reserve_bootmem();
14601-#endif
14602-
14603-#ifdef CONFIG_BLK_DEV_INITRD
14604-#ifdef CONFIG_XEN
14605- if (xen_start_info->mod_start) {
14606- unsigned long ramdisk_image = __pa(xen_start_info->mod_start);
14607- unsigned long ramdisk_size = xen_start_info->mod_len;
14608-#else
14609- if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
14610- unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
14611- unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
14612-#endif
14613- unsigned long ramdisk_end = ramdisk_image + ramdisk_size;
14614- unsigned long end_of_mem = end_pfn << PAGE_SHIFT;
14615-
14616- if (ramdisk_end <= end_of_mem) {
14617- /*
14618- * don't need to reserve again, already reserved early
14619- * in x86_64_start_kernel, and early_res_to_bootmem
14620- * convert that to reserved in bootmem
14621- */
14622- initrd_start = ramdisk_image + PAGE_OFFSET;
14623- initrd_end = initrd_start+ramdisk_size;
14624-#ifdef CONFIG_XEN
14625- initrd_below_start_ok = 1;
14626-#endif
14627- } else {
14628- free_bootmem(ramdisk_image, ramdisk_size);
14629- printk(KERN_ERR "initrd extends beyond end of memory "
14630- "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
14631- ramdisk_end, end_of_mem);
14632- initrd_start = 0;
14633- }
14634- }
14635-#endif
14636- reserve_crashkernel();
14637-
14638- reserve_ibft_region();
14639-
14640- paging_init();
14641- map_vsyscall();
14642-#ifdef CONFIG_X86_LOCAL_APIC
14643- /*
14644- * Find and reserve possible boot-time SMP configuration:
14645- */
14646- find_smp_config();
14647-#endif
14648-#ifdef CONFIG_XEN
14649- {
14650- int i, j, k, fpp;
14651- unsigned long p2m_pages;
14652-
14653- p2m_pages = end_pfn;
14654- if (xen_start_info->nr_pages > end_pfn) {
14655- /*
14656- * the end_pfn was shrunk (probably by mem= or highmem=
14657- * kernel parameter); shrink reservation with the HV
14658- */
14659- struct xen_memory_reservation reservation = {
14660- .address_bits = 0,
14661- .extent_order = 0,
14662- .domid = DOMID_SELF
14663- };
14664- unsigned int difference;
14665- int ret;
14666-
14667- difference = xen_start_info->nr_pages - end_pfn;
14668-
14669- set_xen_guest_handle(reservation.extent_start,
14670- ((unsigned long *)xen_start_info->mfn_list) + end_pfn);
14671- reservation.nr_extents = difference;
14672- ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
14673- &reservation);
14674- BUG_ON (ret != difference);
14675- }
14676- else if (end_pfn > xen_start_info->nr_pages)
14677- p2m_pages = xen_start_info->nr_pages;
14678-
14679- if (!xen_feature(XENFEAT_auto_translated_physmap)) {
14680- /* Make sure we have a large enough P->M table. */
14681- phys_to_machine_mapping = alloc_bootmem_pages(
14682- end_pfn * sizeof(unsigned long));
14683- memset(phys_to_machine_mapping, ~0,
14684- end_pfn * sizeof(unsigned long));
14685- memcpy(phys_to_machine_mapping,
14686- (unsigned long *)xen_start_info->mfn_list,
14687- p2m_pages * sizeof(unsigned long));
14688- free_bootmem(
14689- __pa(xen_start_info->mfn_list),
14690- PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
14691- sizeof(unsigned long))));
14692-
14693- /*
14694- * Initialise the list of the frames that specify the
14695- * list of frames that make up the p2m table. Used by
14696- * save/restore.
14697- */
14698- pfn_to_mfn_frame_list_list = alloc_bootmem_pages(PAGE_SIZE);
14699-
14700- fpp = PAGE_SIZE/sizeof(unsigned long);
14701- for (i=0, j=0, k=-1; i< end_pfn; i+=fpp, j++) {
14702- if ((j % fpp) == 0) {
14703- k++;
14704- BUG_ON(k>=fpp);
14705- pfn_to_mfn_frame_list[k] =
14706- alloc_bootmem_pages(PAGE_SIZE);
14707- pfn_to_mfn_frame_list_list[k] =
14708- virt_to_mfn(pfn_to_mfn_frame_list[k]);
14709- j=0;
14710- }
14711- pfn_to_mfn_frame_list[k][j] =
14712- virt_to_mfn(&phys_to_machine_mapping[i]);
14713- }
14714- HYPERVISOR_shared_info->arch.max_pfn = end_pfn;
14715- HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
14716- virt_to_mfn(pfn_to_mfn_frame_list_list);
14717- }
14718-
14719- /* Mark all ISA DMA channels in-use - using them wouldn't work. */
14720- for (i = 0; i < MAX_DMA_CHANNELS; ++i)
14721- if (i != 4 && request_dma(i, "xen") != 0)
14722- BUG();
14723- }
14724-
14725-#ifdef CONFIG_ACPI
14726- if (!is_initial_xendomain()) {
14727- acpi_disabled = 1;
14728- acpi_ht = 0;
14729- }
14730-#endif
14731-#endif
14732-
14733-#ifndef CONFIG_XEN
14734- early_quirks();
14735-#endif
14736-
14737-#ifdef CONFIG_ACPI
14738- /*
14739- * Initialize the ACPI boot-time table parser (gets the RSDP and SDT).
14740- * Call this early for SRAT node setup.
14741- */
14742- acpi_boot_table_init();
14743-
14744- /*
14745- * Read APIC and some other early information from ACPI tables.
14746- */
14747- acpi_boot_init();
14748-#endif
14749-
14750- init_cpu_to_node();
14751-
14752-#ifdef CONFIG_X86_LOCAL_APIC
14753- /*
14754- * get boot-time SMP configuration:
14755- */
14756- if (smp_found_config)
14757- get_smp_config();
14758-#ifndef CONFIG_XEN
14759- init_apic_mappings();
14760- ioapic_init_mappings();
14761-#endif
14762-#endif
14763-#if defined(CONFIG_XEN) && defined(CONFIG_SMP) && !defined(CONFIG_HOTPLUG_CPU)
14764- prefill_possible_map();
14765-#endif
14766-
14767- kvm_guest_init();
14768-
14769- /*
14770- * We trust e820 completely. No explicit ROM probing in memory.
14771- */
14772-#ifdef CONFIG_XEN
14773- if (is_initial_xendomain())
14774- e820_reserve_resources(machine_e820.map, machine_e820.nr_map);
14775-#else
14776- e820_reserve_resources(e820.map, e820.nr_map);
14777- e820_mark_nosave_regions();
14778-#endif
14779-
14780- /* request I/O space for devices used on all i[345]86 PCs */
14781- for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
14782- request_resource(&ioport_resource, &standard_io_resources[i]);
14783-
14784-#ifdef CONFIG_XEN
14785- if (is_initial_xendomain())
14786- e820_setup_gap(machine_e820.map, machine_e820.nr_map);
14787-#else
14788- e820_setup_gap(e820.map, e820.nr_map);
14789-#endif
14790-
14791-#ifdef CONFIG_XEN
14792- {
14793- struct physdev_set_iopl set_iopl;
14794-
14795- set_iopl.iopl = 1;
14796- WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
14797-
14798- if (is_initial_xendomain()) {
14799-#ifdef CONFIG_VT
14800-#if defined(CONFIG_VGA_CONSOLE)
14801- conswitchp = &vga_con;
14802-#elif defined(CONFIG_DUMMY_CONSOLE)
14803- conswitchp = &dummy_con;
14804-#endif
14805-#endif
14806- } else {
14807-#if defined(CONFIG_VT) && defined(CONFIG_DUMMY_CONSOLE)
14808- conswitchp = &dummy_con;
14809-#endif
14810- }
14811- }
14812-#else /* CONFIG_XEN */
14813-
14814-#ifdef CONFIG_VT
14815-#if defined(CONFIG_VGA_CONSOLE)
14816- if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
14817- conswitchp = &vga_con;
14818-#elif defined(CONFIG_DUMMY_CONSOLE)
14819- conswitchp = &dummy_con;
14820-#endif
14821-#endif
14822-
14823-#endif /* !CONFIG_XEN */
14824-
14825- /* do this before identify_cpu for boot cpu */
14826- check_enable_amd_mmconf_dmi();
14827-}
14828-
14829-#ifdef CONFIG_XEN
14830-static int
14831-xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
14832-{
14833- HYPERVISOR_shutdown(SHUTDOWN_crash);
14834- /* we're never actually going to get here... */
14835- return NOTIFY_DONE;
14836-}
14837-#endif /* !CONFIG_XEN */
14838-
14839-
14840-static int __cpuinit get_model_name(struct cpuinfo_x86 *c)
14841-{
14842- unsigned int *v;
14843-
14844- if (c->extended_cpuid_level < 0x80000004)
14845- return 0;
14846-
14847- v = (unsigned int *) c->x86_model_id;
14848- cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
14849- cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
14850- cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
14851- c->x86_model_id[48] = 0;
14852- return 1;
14853-}
14854-
14855-
14856-static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
14857-{
14858- unsigned int n, dummy, eax, ebx, ecx, edx;
14859-
14860- n = c->extended_cpuid_level;
14861-
14862- if (n >= 0x80000005) {
14863- cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
14864- printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), "
14865- "D cache %dK (%d bytes/line)\n",
14866- edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
14867- c->x86_cache_size = (ecx>>24) + (edx>>24);
14868- /* On K8 L1 TLB is inclusive, so don't count it */
14869- c->x86_tlbsize = 0;
14870- }
14871-
14872- if (n >= 0x80000006) {
14873- cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
14874- ecx = cpuid_ecx(0x80000006);
14875- c->x86_cache_size = ecx >> 16;
14876- c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
14877-
14878- printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
14879- c->x86_cache_size, ecx & 0xFF);
14880- }
14881- if (n >= 0x80000008) {
14882- cpuid(0x80000008, &eax, &dummy, &dummy, &dummy);
14883- c->x86_virt_bits = (eax >> 8) & 0xff;
14884- c->x86_phys_bits = eax & 0xff;
14885- }
14886-}
14887-
14888-#ifdef CONFIG_NUMA
14889-static int __cpuinit nearby_node(int apicid)
14890-{
14891- int i, node;
14892-
14893- for (i = apicid - 1; i >= 0; i--) {
14894- node = apicid_to_node[i];
14895- if (node != NUMA_NO_NODE && node_online(node))
14896- return node;
14897- }
14898- for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
14899- node = apicid_to_node[i];
14900- if (node != NUMA_NO_NODE && node_online(node))
14901- return node;
14902- }
14903- return first_node(node_online_map); /* Shouldn't happen */
14904-}
14905-#endif
14906-
14907-/*
14908- * On a AMD dual core setup the lower bits of the APIC id distingush the cores.
14909- * Assumes number of cores is a power of two.
14910- */
14911-static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
14912-{
14913-#ifdef CONFIG_SMP
14914- unsigned bits;
14915-#ifdef CONFIG_NUMA
14916- int cpu = smp_processor_id();
14917- int node = 0;
14918- unsigned apicid = hard_smp_processor_id();
14919-#endif
14920- bits = c->x86_coreid_bits;
14921-
14922- /* Low order bits define the core id (index of core in socket) */
14923- c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
14924- /* Convert the initial APIC ID into the socket ID */
14925- c->phys_proc_id = c->initial_apicid >> bits;
14926-
14927-#ifdef CONFIG_NUMA
14928- node = c->phys_proc_id;
14929- if (apicid_to_node[apicid] != NUMA_NO_NODE)
14930- node = apicid_to_node[apicid];
14931- if (!node_online(node)) {
14932- /* Two possibilities here:
14933- - The CPU is missing memory and no node was created.
14934- In that case try picking one from a nearby CPU
14935- - The APIC IDs differ from the HyperTransport node IDs
14936- which the K8 northbridge parsing fills in.
14937- Assume they are all increased by a constant offset,
14938- but in the same order as the HT nodeids.
14939- If that doesn't result in a usable node fall back to the
14940- path for the previous case. */
14941-
14942- int ht_nodeid = c->initial_apicid;
14943-
14944- if (ht_nodeid >= 0 &&
14945- apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
14946- node = apicid_to_node[ht_nodeid];
14947- /* Pick a nearby node */
14948- if (!node_online(node))
14949- node = nearby_node(apicid);
14950- }
14951- numa_set_node(cpu, node);
14952-
14953- printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
14954-#endif
14955-#endif
14956-}
14957-
14958-static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c)
14959-{
14960-#ifdef CONFIG_SMP
14961- unsigned bits, ecx;
14962-
14963- /* Multi core CPU? */
14964- if (c->extended_cpuid_level < 0x80000008)
14965- return;
14966-
14967- ecx = cpuid_ecx(0x80000008);
14968-
14969- c->x86_max_cores = (ecx & 0xff) + 1;
14970-
14971- /* CPU telling us the core id bits shift? */
14972- bits = (ecx >> 12) & 0xF;
14973-
14974- /* Otherwise recompute */
14975- if (bits == 0) {
14976- while ((1 << bits) < c->x86_max_cores)
14977- bits++;
14978- }
14979-
14980- c->x86_coreid_bits = bits;
14981-
14982-#endif
14983-}
14984-
14985-#define ENABLE_C1E_MASK 0x18000000
14986-#define CPUID_PROCESSOR_SIGNATURE 1
14987-#define CPUID_XFAM 0x0ff00000
14988-#define CPUID_XFAM_K8 0x00000000
14989-#define CPUID_XFAM_10H 0x00100000
14990-#define CPUID_XFAM_11H 0x00200000
14991-#define CPUID_XMOD 0x000f0000
14992-#define CPUID_XMOD_REV_F 0x00040000
14993-
14994-#ifndef CONFIG_XEN
14995-/* AMD systems with C1E don't have a working lAPIC timer. Check for that. */
14996-static __cpuinit int amd_apic_timer_broken(void)
14997-{
14998- u32 lo, hi, eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
14999-
15000- switch (eax & CPUID_XFAM) {
15001- case CPUID_XFAM_K8:
15002- if ((eax & CPUID_XMOD) < CPUID_XMOD_REV_F)
15003- break;
15004- case CPUID_XFAM_10H:
15005- case CPUID_XFAM_11H:
15006- rdmsr(MSR_K8_ENABLE_C1E, lo, hi);
15007- if (lo & ENABLE_C1E_MASK)
15008- return 1;
15009- break;
15010- default:
15011- /* err on the side of caution */
15012- return 1;
15013- }
15014- return 0;
15015-}
15016-#endif
15017-
15018-static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
15019-{
15020- early_init_amd_mc(c);
15021-
15022- /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
15023- if (c->x86_power & (1<<8))
15024- set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
15025-}
15026-
15027-static void __cpuinit init_amd(struct cpuinfo_x86 *c)
15028-{
15029- unsigned level;
15030-
15031-#ifdef CONFIG_SMP
15032- unsigned long value;
15033-
15034- /*
15035- * Disable TLB flush filter by setting HWCR.FFDIS on K8
15036- * bit 6 of msr C001_0015
15037- *
15038- * Errata 63 for SH-B3 steppings
15039- * Errata 122 for all steppings (F+ have it disabled by default)
15040- */
15041- if (c->x86 == 15) {
15042- rdmsrl(MSR_K8_HWCR, value);
15043- value |= 1 << 6;
15044- wrmsrl(MSR_K8_HWCR, value);
15045- }
15046-#endif
15047-
15048- /* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
15049- 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
15050- clear_cpu_cap(c, 0*32+31);
15051-
15052- /* On C+ stepping K8 rep microcode works well for copy/memset */
15053- level = cpuid_eax(1);
15054- if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) ||
15055- level >= 0x0f58))
15056- set_cpu_cap(c, X86_FEATURE_REP_GOOD);
15057- if (c->x86 == 0x10 || c->x86 == 0x11)
15058- set_cpu_cap(c, X86_FEATURE_REP_GOOD);
15059-
15060- /* Enable workaround for FXSAVE leak */
15061- if (c->x86 >= 6)
15062- set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
15063-
15064- level = get_model_name(c);
15065- if (!level) {
15066- switch (c->x86) {
15067- case 15:
15068- /* Should distinguish Models here, but this is only
15069- a fallback anyways. */
15070- strcpy(c->x86_model_id, "Hammer");
15071- break;
15072- }
15073- }
15074- display_cacheinfo(c);
15075-
15076- /* Multi core CPU? */
15077- if (c->extended_cpuid_level >= 0x80000008)
15078- amd_detect_cmp(c);
15079-
15080- if (c->extended_cpuid_level >= 0x80000006 &&
15081- (cpuid_edx(0x80000006) & 0xf000))
15082- num_cache_leaves = 4;
15083- else
15084- num_cache_leaves = 3;
15085-
15086- if (c->x86 == 0xf || c->x86 == 0x10 || c->x86 == 0x11)
15087- set_cpu_cap(c, X86_FEATURE_K8);
15088-
15089- /* MFENCE stops RDTSC speculation */
15090- set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
15091-
15092- if (c->x86 == 0x10)
15093- fam10h_check_enable_mmcfg();
15094-
15095-#ifndef CONFIG_XEN
15096- if (amd_apic_timer_broken())
15097- disable_apic_timer = 1;
15098-
15099- if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
15100- unsigned long long tseg;
15101-
15102- /*
15103- * Split up direct mapping around the TSEG SMM area.
15104- * Don't do it for gbpages because there seems very little
15105- * benefit in doing so.
15106- */
15107- if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg) &&
15108- (tseg >> PMD_SHIFT) < (max_pfn_mapped >> (PMD_SHIFT-PAGE_SHIFT)))
15109- set_memory_4k((unsigned long)__va(tseg), 1);
15110- }
15111-#endif
15112-}
15113-
15114-void __cpuinit detect_ht(struct cpuinfo_x86 *c)
15115-{
15116-#ifdef CONFIG_SMP
15117- u32 eax, ebx, ecx, edx;
15118- int index_msb, core_bits;
15119-
15120- cpuid(1, &eax, &ebx, &ecx, &edx);
15121-
15122-
15123- if (!cpu_has(c, X86_FEATURE_HT))
15124- return;
15125- if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
15126- goto out;
15127-
15128- smp_num_siblings = (ebx & 0xff0000) >> 16;
15129-
15130- if (smp_num_siblings == 1) {
15131- printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
15132- } else if (smp_num_siblings > 1) {
15133-
15134- if (smp_num_siblings > NR_CPUS) {
15135- printk(KERN_WARNING "CPU: Unsupported number of "
15136- "siblings %d", smp_num_siblings);
15137- smp_num_siblings = 1;
15138- return;
15139- }
15140-
15141- index_msb = get_count_order(smp_num_siblings);
15142- c->phys_proc_id = phys_pkg_id(index_msb);
15143-
15144- smp_num_siblings = smp_num_siblings / c->x86_max_cores;
15145-
15146- index_msb = get_count_order(smp_num_siblings);
15147-
15148- core_bits = get_count_order(c->x86_max_cores);
15149-
15150- c->cpu_core_id = phys_pkg_id(index_msb) &
15151- ((1 << core_bits) - 1);
15152- }
15153-out:
15154- if ((c->x86_max_cores * smp_num_siblings) > 1) {
15155- printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
15156- c->phys_proc_id);
15157- printk(KERN_INFO "CPU: Processor Core ID: %d\n",
15158- c->cpu_core_id);
15159- }
15160-
15161-#endif
15162-}
15163-
15164-/*
15165- * find out the number of processor cores on the die
15166- */
15167-static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
15168-{
15169- unsigned int eax, t;
15170-
15171- if (c->cpuid_level < 4)
15172- return 1;
15173-
15174- cpuid_count(4, 0, &eax, &t, &t, &t);
15175-
15176- if (eax & 0x1f)
15177- return ((eax >> 26) + 1);
15178- else
15179- return 1;
15180-}
15181-
15182-static void __cpuinit srat_detect_node(void)
15183-{
15184-#ifdef CONFIG_NUMA
15185- unsigned node;
15186- int cpu = smp_processor_id();
15187- int apicid = hard_smp_processor_id();
15188-
15189- /* Don't do the funky fallback heuristics the AMD version employs
15190- for now. */
15191- node = apicid_to_node[apicid];
15192- if (node == NUMA_NO_NODE || !node_online(node))
15193- node = first_node(node_online_map);
15194- numa_set_node(cpu, node);
15195-
15196- printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
15197-#endif
15198-}
15199-
15200-static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
15201-{
15202- if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
15203- (c->x86 == 0x6 && c->x86_model >= 0x0e))
15204- set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
15205-}
15206-
15207-static void __cpuinit init_intel(struct cpuinfo_x86 *c)
15208-{
15209- /* Cache sizes */
15210- unsigned n;
15211-
15212- init_intel_cacheinfo(c);
15213- if (c->cpuid_level > 9) {
15214- unsigned eax = cpuid_eax(10);
15215- /* Check for version and the number of counters */
15216- if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
15217- set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
15218- }
15219-
15220- if (cpu_has_ds) {
15221- unsigned int l1, l2;
15222- rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
15223- if (!(l1 & (1<<11)))
15224- set_cpu_cap(c, X86_FEATURE_BTS);
15225- if (!(l1 & (1<<12)))
15226- set_cpu_cap(c, X86_FEATURE_PEBS);
15227- }
15228-
15229-
15230- if (cpu_has_bts)
15231- ds_init_intel(c);
15232-
15233- n = c->extended_cpuid_level;
15234- if (n >= 0x80000008) {
15235- unsigned eax = cpuid_eax(0x80000008);
15236- c->x86_virt_bits = (eax >> 8) & 0xff;
15237- c->x86_phys_bits = eax & 0xff;
15238- /* CPUID workaround for Intel 0F34 CPU */
15239- if (c->x86_vendor == X86_VENDOR_INTEL &&
15240- c->x86 == 0xF && c->x86_model == 0x3 &&
15241- c->x86_mask == 0x4)
15242- c->x86_phys_bits = 36;
15243- }
15244-
15245- if (c->x86 == 15)
15246- c->x86_cache_alignment = c->x86_clflush_size * 2;
15247- if (c->x86 == 6)
15248- set_cpu_cap(c, X86_FEATURE_REP_GOOD);
15249- set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
15250- c->x86_max_cores = intel_num_cpu_cores(c);
15251-
15252- srat_detect_node();
15253-}
15254-
15255-static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
15256-{
15257- if (c->x86 == 0x6 && c->x86_model >= 0xf)
15258- set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
15259-}
15260-
15261-static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
15262-{
15263- /* Cache sizes */
15264- unsigned n;
15265-
15266- n = c->extended_cpuid_level;
15267- if (n >= 0x80000008) {
15268- unsigned eax = cpuid_eax(0x80000008);
15269- c->x86_virt_bits = (eax >> 8) & 0xff;
15270- c->x86_phys_bits = eax & 0xff;
15271- }
15272-
15273- if (c->x86 == 0x6 && c->x86_model >= 0xf) {
15274- c->x86_cache_alignment = c->x86_clflush_size * 2;
15275- set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
15276- set_cpu_cap(c, X86_FEATURE_REP_GOOD);
15277- }
15278- set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
15279-}
15280-
15281-static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
15282-{
15283- char *v = c->x86_vendor_id;
15284-
15285- if (!strcmp(v, "AuthenticAMD"))
15286- c->x86_vendor = X86_VENDOR_AMD;
15287- else if (!strcmp(v, "GenuineIntel"))
15288- c->x86_vendor = X86_VENDOR_INTEL;
15289- else if (!strcmp(v, "CentaurHauls"))
15290- c->x86_vendor = X86_VENDOR_CENTAUR;
15291- else
15292- c->x86_vendor = X86_VENDOR_UNKNOWN;
15293-}
15294-
15295-/* Do some early cpuid on the boot CPU to get some parameter that are
15296- needed before check_bugs. Everything advanced is in identify_cpu
15297- below. */
15298-static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
15299-{
15300- u32 tfms, xlvl;
15301-
15302- c->loops_per_jiffy = loops_per_jiffy;
15303- c->x86_cache_size = -1;
15304- c->x86_vendor = X86_VENDOR_UNKNOWN;
15305- c->x86_model = c->x86_mask = 0; /* So far unknown... */
15306- c->x86_vendor_id[0] = '\0'; /* Unset */
15307- c->x86_model_id[0] = '\0'; /* Unset */
15308- c->x86_clflush_size = 64;
15309- c->x86_cache_alignment = c->x86_clflush_size;
15310- c->x86_max_cores = 1;
15311- c->x86_coreid_bits = 0;
15312- c->extended_cpuid_level = 0;
15313- memset(&c->x86_capability, 0, sizeof c->x86_capability);
15314-
15315- /* Get vendor name */
15316- cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
15317- (unsigned int *)&c->x86_vendor_id[0],
15318- (unsigned int *)&c->x86_vendor_id[8],
15319- (unsigned int *)&c->x86_vendor_id[4]);
15320-
15321- get_cpu_vendor(c);
15322-
15323- /* Initialize the standard set of capabilities */
15324- /* Note that the vendor-specific code below might override */
15325-
15326- /* Intel-defined flags: level 0x00000001 */
15327- if (c->cpuid_level >= 0x00000001) {
15328- __u32 misc;
15329- cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4],
15330- &c->x86_capability[0]);
15331- c->x86 = (tfms >> 8) & 0xf;
15332- c->x86_model = (tfms >> 4) & 0xf;
15333- c->x86_mask = tfms & 0xf;
15334- if (c->x86 == 0xf)
15335- c->x86 += (tfms >> 20) & 0xff;
15336- if (c->x86 >= 0x6)
15337- c->x86_model += ((tfms >> 16) & 0xF) << 4;
15338- if (test_cpu_cap(c, X86_FEATURE_CLFLSH))
15339- c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
15340- } else {
15341- /* Have CPUID level 0 only - unheard of */
15342- c->x86 = 4;
15343- }
15344-
15345- c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xff;
15346-#ifdef CONFIG_SMP
15347- c->phys_proc_id = c->initial_apicid;
15348-#endif
15349- /* AMD-defined flags: level 0x80000001 */
15350- xlvl = cpuid_eax(0x80000000);
15351- c->extended_cpuid_level = xlvl;
15352- if ((xlvl & 0xffff0000) == 0x80000000) {
15353- if (xlvl >= 0x80000001) {
15354- c->x86_capability[1] = cpuid_edx(0x80000001);
15355- c->x86_capability[6] = cpuid_ecx(0x80000001);
15356- }
15357- if (xlvl >= 0x80000004)
15358- get_model_name(c); /* Default name */
15359- }
15360-
15361- /* Transmeta-defined flags: level 0x80860001 */
15362- xlvl = cpuid_eax(0x80860000);
15363- if ((xlvl & 0xffff0000) == 0x80860000) {
15364- /* Don't set x86_cpuid_level here for now to not confuse. */
15365- if (xlvl >= 0x80860001)
15366- c->x86_capability[2] = cpuid_edx(0x80860001);
15367- }
15368-
15369- c->extended_cpuid_level = cpuid_eax(0x80000000);
15370- if (c->extended_cpuid_level >= 0x80000007)
15371- c->x86_power = cpuid_edx(0x80000007);
15372-
15373- switch (c->x86_vendor) {
15374- case X86_VENDOR_AMD:
15375- early_init_amd(c);
15376- break;
15377- case X86_VENDOR_INTEL:
15378- early_init_intel(c);
15379- break;
15380- case X86_VENDOR_CENTAUR:
15381- early_init_centaur(c);
15382- break;
15383- }
15384-
15385- validate_pat_support(c);
15386-}
15387-
15388-/*
15389- * This does the hard work of actually picking apart the CPU stuff...
15390- */
15391-void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
15392-{
15393- int i;
15394-
15395- early_identify_cpu(c);
15396-
15397- init_scattered_cpuid_features(c);
15398-
15399- c->apicid = phys_pkg_id(0);
15400-
15401- /*
15402- * Vendor-specific initialization. In this section we
15403- * canonicalize the feature flags, meaning if there are
15404- * features a certain CPU supports which CPUID doesn't
15405- * tell us, CPUID claiming incorrect flags, or other bugs,
15406- * we handle them here.
15407- *
15408- * At the end of this section, c->x86_capability better
15409- * indicate the features this CPU genuinely supports!
15410- */
15411- switch (c->x86_vendor) {
15412- case X86_VENDOR_AMD:
15413- init_amd(c);
15414- break;
15415-
15416- case X86_VENDOR_INTEL:
15417- init_intel(c);
15418- break;
15419-
15420- case X86_VENDOR_CENTAUR:
15421- init_centaur(c);
15422- break;
15423-
15424- case X86_VENDOR_UNKNOWN:
15425- default:
15426- display_cacheinfo(c);
15427- break;
15428- }
15429-
15430- detect_ht(c);
15431-
15432- /*
15433- * On SMP, boot_cpu_data holds the common feature set between
15434- * all CPUs; so make sure that we indicate which features are
15435- * common between the CPUs. The first time this routine gets
15436- * executed, c == &boot_cpu_data.
15437- */
15438- if (c != &boot_cpu_data) {
15439- /* AND the already accumulated flags with these */
15440- for (i = 0; i < NCAPINTS; i++)
15441- boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
15442- }
15443-
15444- /* Clear all flags overriden by options */
15445- for (i = 0; i < NCAPINTS; i++)
15446- c->x86_capability[i] &= ~cleared_cpu_caps[i];
15447-
15448-#ifdef CONFIG_X86_MCE
15449- mcheck_init(c);
15450-#endif
15451- select_idle_routine(c);
15452-
15453-#ifdef CONFIG_NUMA
15454- numa_add_cpu(smp_processor_id());
15455-#endif
15456-
15457-}
15458-
15459-void __cpuinit identify_boot_cpu(void)
15460-{
15461- identify_cpu(&boot_cpu_data);
15462-}
15463-
15464-void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
15465-{
15466- BUG_ON(c == &boot_cpu_data);
15467- identify_cpu(c);
15468- mtrr_ap_init();
15469-}
15470-
15471-static __init int setup_noclflush(char *arg)
15472-{
15473- setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
15474- return 1;
15475-}
15476-__setup("noclflush", setup_noclflush);
15477-
15478-void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
15479-{
15480- if (c->x86_model_id[0])
15481- printk(KERN_CONT "%s", c->x86_model_id);
15482-
15483- if (c->x86_mask || c->cpuid_level >= 0)
15484- printk(KERN_CONT " stepping %02x\n", c->x86_mask);
15485- else
15486- printk(KERN_CONT "\n");
15487-}
15488-
15489-static __init int setup_disablecpuid(char *arg)
15490-{
15491- int bit;
15492- if (get_option(&arg, &bit) && bit < NCAPINTS*32)
15493- setup_clear_cpu_cap(bit);
15494- else
15495- return 0;
15496- return 1;
15497-}
15498-__setup("clearcpuid=", setup_disablecpuid);
15499--- /dev/null 1970-01-01 00:00:00.000000000 +0000
15500+++ sle11-2009-06-04/arch/x86/kernel/setup_percpu-xen.c 2009-06-04 10:21:39.000000000 +0200
15501@@ -0,0 +1,385 @@
15502+#include <linux/kernel.h>
15503+#include <linux/module.h>
15504+#include <linux/init.h>
15505+#include <linux/bootmem.h>
15506+#include <linux/percpu.h>
15507+#include <linux/kexec.h>
15508+#include <linux/crash_dump.h>
15509+#include <asm/smp.h>
15510+#include <asm/percpu.h>
15511+#include <asm/sections.h>
15512+#include <asm/processor.h>
15513+#include <asm/setup.h>
15514+#include <asm/topology.h>
15515+#include <asm/mpspec.h>
15516+#include <asm/apicdef.h>
15517+#include <asm/highmem.h>
15518+
15519+#ifdef CONFIG_X86_LOCAL_APIC
15520+unsigned int num_processors;
15521+unsigned disabled_cpus __cpuinitdata;
15522+/* Processor that is doing the boot up */
15523+unsigned int boot_cpu_physical_apicid = -1U;
15524+unsigned int max_physical_apicid;
15525+EXPORT_SYMBOL(boot_cpu_physical_apicid);
15526+
15527+/* Bitmask of physically existing CPUs */
15528+physid_mask_t phys_cpu_present_map;
15529+#endif
15530+
15531+/* map cpu index to physical APIC ID */
15532+#ifndef CONFIG_XEN
15533+DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID);
15534+DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID);
15535+EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
15536+EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
15537+#else
15538+DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID;
15539+EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid);
15540+#endif
15541+
15542+#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
15543+#define X86_64_NUMA 1
15544+
15545+/* map cpu index to node index */
15546+DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
15547+EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
15548+
15549+/* which logical CPUs are on which nodes */
15550+cpumask_t *node_to_cpumask_map;
15551+EXPORT_SYMBOL(node_to_cpumask_map);
15552+
15553+/* setup node_to_cpumask_map */
15554+static void __init setup_node_to_cpumask_map(void);
15555+
15556+#else
15557+static inline void setup_node_to_cpumask_map(void) { }
15558+#endif
15559+
15560+#if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_X86_SMP)
15561+/*
15562+ * Copy data used in early init routines from the initial arrays to the
15563+ * per cpu data areas. These arrays then become expendable and the
15564+ * *_early_ptr's are zeroed indicating that the static arrays are gone.
15565+ */
15566+static void __init setup_per_cpu_maps(void)
15567+{
15568+#ifndef CONFIG_XEN
15569+ int cpu;
15570+
15571+ for_each_possible_cpu(cpu) {
15572+ per_cpu(x86_cpu_to_apicid, cpu) =
15573+ early_per_cpu_map(x86_cpu_to_apicid, cpu);
15574+ per_cpu(x86_bios_cpu_apicid, cpu) =
15575+ early_per_cpu_map(x86_bios_cpu_apicid, cpu);
15576+#ifdef X86_64_NUMA
15577+ per_cpu(x86_cpu_to_node_map, cpu) =
15578+ early_per_cpu_map(x86_cpu_to_node_map, cpu);
15579+#endif
15580+ }
15581+
15582+ /* indicate the early static arrays will soon be gone */
15583+ early_per_cpu_ptr(x86_cpu_to_apicid) = NULL;
15584+ early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
15585+#ifdef X86_64_NUMA
15586+ early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
15587+#endif
15588+#endif
15589+}
15590+
15591+#ifdef CONFIG_X86_32
15592+/*
15593+ * Great future not-so-futuristic plan: make i386 and x86_64 do it
15594+ * the same way
15595+ */
15596+unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
15597+EXPORT_SYMBOL(__per_cpu_offset);
15598+static inline void setup_cpu_pda_map(void) { }
15599+
15600+#elif !defined(CONFIG_SMP)
15601+static inline void setup_cpu_pda_map(void) { }
15602+
15603+#else /* CONFIG_SMP && CONFIG_X86_64 */
15604+
15605+/*
15606+ * Allocate cpu_pda pointer table and array via alloc_bootmem.
15607+ */
15608+static void __init setup_cpu_pda_map(void)
15609+{
15610+ char *pda;
15611+ struct x8664_pda **new_cpu_pda;
15612+ unsigned long size;
15613+ int cpu;
15614+
15615+ size = roundup(sizeof(struct x8664_pda), cache_line_size());
15616+
15617+ /* allocate cpu_pda array and pointer table */
15618+ {
15619+ unsigned long tsize = nr_cpu_ids * sizeof(void *);
15620+ unsigned long asize = size * (nr_cpu_ids - 1);
15621+
15622+ tsize = roundup(tsize, cache_line_size());
15623+ new_cpu_pda = alloc_bootmem(tsize + asize);
15624+ pda = (char *)new_cpu_pda + tsize;
15625+ }
15626+
15627+ /* initialize pointer table to static pda's */
15628+ for_each_possible_cpu(cpu) {
15629+ if (cpu == 0) {
15630+ /* leave boot cpu pda in place */
15631+ new_cpu_pda[0] = cpu_pda(0);
15632+ continue;
15633+ }
15634+ new_cpu_pda[cpu] = (struct x8664_pda *)pda;
15635+ new_cpu_pda[cpu]->in_bootmem = 1;
15636+ pda += size;
15637+ }
15638+
15639+ /* point to new pointer table */
15640+ _cpu_pda = new_cpu_pda;
15641+}
15642+#endif
15643+
15644+/*
15645+ * Great future plan:
15646+ * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
15647+ * Always point %gs to its beginning
15648+ */
15649+void __init setup_per_cpu_areas(void)
15650+{
15651+ ssize_t size = PERCPU_ENOUGH_ROOM;
15652+ char *ptr;
15653+ int cpu;
15654+
15655+ /* Setup cpu_pda map */
15656+ setup_cpu_pda_map();
15657+
15658+ /* Copy section for each CPU (we discard the original) */
15659+ size = PERCPU_ENOUGH_ROOM;
15660+ printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n",
15661+ size);
15662+
15663+ for_each_possible_cpu(cpu) {
15664+#ifndef CONFIG_NEED_MULTIPLE_NODES
15665+ ptr = alloc_bootmem_pages(size);
15666+#else
15667+ int node = early_cpu_to_node(cpu);
15668+ if (!node_online(node) || !NODE_DATA(node)) {
15669+ ptr = alloc_bootmem_pages(size);
15670+ printk(KERN_INFO
15671+ "cpu %d has no node %d or node-local memory\n",
15672+ cpu, node);
15673+ }
15674+ else
15675+ ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
15676+#endif
15677+ per_cpu_offset(cpu) = ptr - __per_cpu_start;
15678+ memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
15679+
15680+ }
15681+
15682+ printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n",
15683+ NR_CPUS, nr_cpu_ids, nr_node_ids);
15684+
15685+ /* Setup percpu data maps */
15686+ setup_per_cpu_maps();
15687+
15688+ /* Setup node to cpumask map */
15689+ setup_node_to_cpumask_map();
15690+}
15691+
15692+#endif
15693+
15694+#ifdef X86_64_NUMA
15695+
15696+/*
15697+ * Allocate node_to_cpumask_map based on number of available nodes
15698+ * Requires node_possible_map to be valid.
15699+ *
15700+ * Note: node_to_cpumask() is not valid until after this is done.
15701+ */
15702+static void __init setup_node_to_cpumask_map(void)
15703+{
15704+ unsigned int node, num = 0;
15705+ cpumask_t *map;
15706+
15707+ /* setup nr_node_ids if not done yet */
15708+ if (nr_node_ids == MAX_NUMNODES) {
15709+ for_each_node_mask(node, node_possible_map)
15710+ num = node;
15711+ nr_node_ids = num + 1;
15712+ }
15713+
15714+ /* allocate the map */
15715+ map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t));
15716+
15717+ pr_debug(KERN_DEBUG "Node to cpumask map at %p for %d nodes\n",
15718+ map, nr_node_ids);
15719+
15720+ /* node_to_cpumask() will now work */
15721+ node_to_cpumask_map = map;
15722+}
15723+
15724+void __cpuinit numa_set_node(int cpu, int node)
15725+{
15726+ int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
15727+
15728+ if (cpu_pda(cpu) && node != NUMA_NO_NODE)
15729+ cpu_pda(cpu)->nodenumber = node;
15730+
15731+ if (cpu_to_node_map)
15732+ cpu_to_node_map[cpu] = node;
15733+
15734+ else if (per_cpu_offset(cpu))
15735+ per_cpu(x86_cpu_to_node_map, cpu) = node;
15736+
15737+ else
15738+ pr_debug("Setting node for non-present cpu %d\n", cpu);
15739+}
15740+
15741+void __cpuinit numa_clear_node(int cpu)
15742+{
15743+ numa_set_node(cpu, NUMA_NO_NODE);
15744+}
15745+
15746+#ifndef CONFIG_DEBUG_PER_CPU_MAPS
15747+
15748+void __cpuinit numa_add_cpu(int cpu)
15749+{
15750+ cpu_set(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
15751+}
15752+
15753+void __cpuinit numa_remove_cpu(int cpu)
15754+{
15755+ cpu_clear(cpu, node_to_cpumask_map[cpu_to_node(cpu)]);
15756+}
15757+
15758+#else /* CONFIG_DEBUG_PER_CPU_MAPS */
15759+
15760+/*
15761+ * --------- debug versions of the numa functions ---------
15762+ */
15763+static void __cpuinit numa_set_cpumask(int cpu, int enable)
15764+{
15765+ int node = cpu_to_node(cpu);
15766+ cpumask_t *mask;
15767+ char buf[64];
15768+
15769+ if (node_to_cpumask_map == NULL) {
15770+ printk(KERN_ERR "node_to_cpumask_map NULL\n");
15771+ dump_stack();
15772+ return;
15773+ }
15774+
15775+ mask = &node_to_cpumask_map[node];
15776+ if (enable)
15777+ cpu_set(cpu, *mask);
15778+ else
15779+ cpu_clear(cpu, *mask);
15780+
15781+ cpulist_scnprintf(buf, sizeof(buf), *mask);
15782+ printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
15783+ enable? "numa_add_cpu":"numa_remove_cpu", cpu, node, buf);
15784+ }
15785+
15786+void __cpuinit numa_add_cpu(int cpu)
15787+{
15788+ numa_set_cpumask(cpu, 1);
15789+}
15790+
15791+void __cpuinit numa_remove_cpu(int cpu)
15792+{
15793+ numa_set_cpumask(cpu, 0);
15794+}
15795+
15796+int cpu_to_node(int cpu)
15797+{
15798+ if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
15799+ printk(KERN_WARNING
15800+ "cpu_to_node(%d): usage too early!\n", cpu);
15801+ dump_stack();
15802+ return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
15803+ }
15804+ return per_cpu(x86_cpu_to_node_map, cpu);
15805+}
15806+EXPORT_SYMBOL(cpu_to_node);
15807+
15808+/*
15809+ * Same function as cpu_to_node() but used if called before the
15810+ * per_cpu areas are setup.
15811+ */
15812+int early_cpu_to_node(int cpu)
15813+{
15814+ if (early_per_cpu_ptr(x86_cpu_to_node_map))
15815+ return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
15816+
15817+ if (!per_cpu_offset(cpu)) {
15818+ printk(KERN_WARNING
15819+ "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
15820+ dump_stack();
15821+ return NUMA_NO_NODE;
15822+ }
15823+ return per_cpu(x86_cpu_to_node_map, cpu);
15824+}
15825+
15826+
15827+/* empty cpumask */
15828+static const cpumask_t cpu_mask_none;
15829+
15830+/*
15831+ * Returns a pointer to the bitmask of CPUs on Node 'node'.
15832+ */
15833+const cpumask_t *_node_to_cpumask_ptr(int node)
15834+{
15835+ if (node_to_cpumask_map == NULL) {
15836+ printk(KERN_WARNING
15837+ "_node_to_cpumask_ptr(%d): no node_to_cpumask_map!\n",
15838+ node);
15839+ dump_stack();
15840+ return (const cpumask_t *)&cpu_online_map;
15841+ }
15842+ if (node >= nr_node_ids) {
15843+ printk(KERN_WARNING
15844+ "_node_to_cpumask_ptr(%d): node > nr_node_ids(%d)\n",
15845+ node, nr_node_ids);
15846+ dump_stack();
15847+ return &cpu_mask_none;
15848+ }
15849+ return &node_to_cpumask_map[node];
15850+}
15851+EXPORT_SYMBOL(_node_to_cpumask_ptr);
15852+
15853+/*
15854+ * Returns a bitmask of CPUs on Node 'node'.
15855+ *
15856+ * Side note: this function creates the returned cpumask on the stack
15857+ * so with a high NR_CPUS count, excessive stack space is used. The
15858+ * node_to_cpumask_ptr function should be used whenever possible.
15859+ */
15860+cpumask_t node_to_cpumask(int node)
15861+{
15862+ if (node_to_cpumask_map == NULL) {
15863+ printk(KERN_WARNING
15864+ "node_to_cpumask(%d): no node_to_cpumask_map!\n", node);
15865+ dump_stack();
15866+ return cpu_online_map;
15867+ }
15868+ if (node >= nr_node_ids) {
15869+ printk(KERN_WARNING
15870+ "node_to_cpumask(%d): node > nr_node_ids(%d)\n",
15871+ node, nr_node_ids);
15872+ dump_stack();
15873+ return cpu_mask_none;
15874+ }
15875+ return node_to_cpumask_map[node];
15876+}
15877+EXPORT_SYMBOL(node_to_cpumask);
15878+
15879+/*
15880+ * --------- end of debug versions of the numa functions ---------
15881+ */
15882+
15883+#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
15884+
15885+#endif /* X86_64_NUMA */
15886+
15887--- sle11-2009-06-04.orig/arch/x86/kernel/smp-xen.c 2009-03-16 16:38:05.000000000 +0100
15888+++ sle11-2009-06-04/arch/x86/kernel/smp-xen.c 2009-06-04 10:21:39.000000000 +0200
15889@@ -121,132 +121,14 @@ void xen_smp_send_reschedule(int cpu)
15890 send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
15891 }
15892
15893-/*
15894- * Structure and data for smp_call_function(). This is designed to minimise
15895- * static memory requirements. It also looks cleaner.
15896- */
15897-static DEFINE_SPINLOCK(call_lock);
15898-
15899-struct call_data_struct {
15900- void (*func) (void *info);
15901- void *info;
15902- atomic_t started;
15903- atomic_t finished;
15904- int wait;
15905-};
15906-
15907-void lock_ipi_call_lock(void)
15908+void xen_send_call_func_single_ipi(int cpu)
15909 {
15910- spin_lock_irq(&call_lock);
15911+ send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNC_SINGLE_VECTOR);
15912 }
15913
15914-void unlock_ipi_call_lock(void)
15915+void xen_send_call_func_ipi(cpumask_t mask)
15916 {
15917- spin_unlock_irq(&call_lock);
15918-}
15919-
15920-static struct call_data_struct *call_data;
15921-
15922-static void __smp_call_function(void (*func) (void *info), void *info,
15923- int nonatomic, int wait)
15924-{
15925- struct call_data_struct data;
15926- int cpus = num_online_cpus() - 1;
15927-
15928- if (!cpus)
15929- return;
15930-
15931- data.func = func;
15932- data.info = info;
15933- atomic_set(&data.started, 0);
15934- data.wait = wait;
15935- if (wait)
15936- atomic_set(&data.finished, 0);
15937-
15938- call_data = &data;
15939- mb();
15940-
15941- /* Send a message to all other CPUs and wait for them to respond */
15942- send_IPI_allbutself(CALL_FUNCTION_VECTOR);
15943-
15944- /* Wait for response */
15945- while (atomic_read(&data.started) != cpus)
15946- cpu_relax();
15947-
15948- if (wait)
15949- while (atomic_read(&data.finished) != cpus)
15950- cpu_relax();
15951-}
15952-
15953-
15954-/**
15955- * smp_call_function_mask(): Run a function on a set of other CPUs.
15956- * @mask: The set of cpus to run on. Must not include the current cpu.
15957- * @func: The function to run. This must be fast and non-blocking.
15958- * @info: An arbitrary pointer to pass to the function.
15959- * @wait: If true, wait (atomically) until function has completed on other CPUs.
15960- *
15961- * Returns 0 on success, else a negative status code.
15962- *
15963- * If @wait is true, then returns once @func has returned; otherwise
15964- * it returns just before the target cpu calls @func.
15965- *
15966- * You must not call this function with disabled interrupts or from a
15967- * hardware interrupt handler or from a bottom half handler.
15968- */
15969-int
15970-xen_smp_call_function_mask(cpumask_t mask,
15971- void (*func)(void *), void *info,
15972- int wait)
15973-{
15974- struct call_data_struct data;
15975- cpumask_t allbutself;
15976- int cpus;
15977-
15978- /* Can deadlock when called with interrupts disabled */
15979- WARN_ON(irqs_disabled());
15980-
15981- /* Holding any lock stops cpus from going down. */
15982- spin_lock(&call_lock);
15983-
15984- allbutself = cpu_online_map;
15985- cpu_clear(smp_processor_id(), allbutself);
15986-
15987- cpus_and(mask, mask, allbutself);
15988- cpus = cpus_weight(mask);
15989-
15990- if (!cpus) {
15991- spin_unlock(&call_lock);
15992- return 0;
15993- }
15994-
15995- data.func = func;
15996- data.info = info;
15997- atomic_set(&data.started, 0);
15998- data.wait = wait;
15999- if (wait)
16000- atomic_set(&data.finished, 0);
16001-
16002- call_data = &data;
16003- wmb();
16004-
16005- /* Send a message to other CPUs */
16006- if (cpus_equal(mask, allbutself) &&
16007- cpus_equal(cpu_online_map, cpu_callout_map))
16008- send_IPI_allbutself(CALL_FUNCTION_VECTOR);
16009- else
16010- send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
16011-
16012- /* Wait for response */
16013- while (atomic_read(&data.started) != cpus)
16014- cpu_relax();
16015-
16016- if (wait)
16017- while (atomic_read(&data.finished) != cpus)
16018- cpu_relax();
16019- spin_unlock(&call_lock);
16020-
16021- return 0;
16022+ send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
16023 }
16024
16025 static void stop_this_cpu(void *dummy)
16026@@ -268,15 +150,10 @@ static void stop_this_cpu(void *dummy)
16027
16028 void xen_smp_send_stop(void)
16029 {
16030- int nolock;
16031 unsigned long flags;
16032
16033- /* Don't deadlock on the call lock in panic */
16034- nolock = !spin_trylock(&call_lock);
16035+ smp_call_function(stop_this_cpu, NULL, 0);
16036 local_irq_save(flags);
16037- __smp_call_function(stop_this_cpu, NULL, 0, 0);
16038- if (!nolock)
16039- spin_unlock(&call_lock);
16040 disable_all_local_evtchn();
16041 local_irq_restore(flags);
16042 }
16043@@ -298,21 +175,8 @@ irqreturn_t smp_reschedule_interrupt(int
16044
16045 irqreturn_t smp_call_function_interrupt(int irq, void *dev_id)
16046 {
16047- void (*func) (void *info) = call_data->func;
16048- void *info = call_data->info;
16049- int wait = call_data->wait;
16050-
16051- /*
16052- * Notify initiating CPU that I've grabbed the data and am
16053- * about to execute the function
16054- */
16055- mb();
16056- atomic_inc(&call_data->started);
16057- /*
16058- * At this point the info structure may be out of scope unless wait==1
16059- */
16060 irq_enter();
16061- (*func)(info);
16062+ generic_smp_call_function_interrupt();
16063 #ifdef CONFIG_X86_32
16064 __get_cpu_var(irq_stat).irq_call_count++;
16065 #else
16066@@ -320,10 +184,19 @@ irqreturn_t smp_call_function_interrupt(
16067 #endif
16068 irq_exit();
16069
16070- if (wait) {
16071- mb();
16072- atomic_inc(&call_data->finished);
16073- }
16074+ return IRQ_HANDLED;
16075+}
16076+
16077+irqreturn_t smp_call_function_single_interrupt(int irq, void *dev_id)
16078+{
16079+ irq_enter();
16080+ generic_smp_call_function_single_interrupt();
16081+#ifdef CONFIG_X86_32
16082+ __get_cpu_var(irq_stat).irq_call_count++;
16083+#else
16084+ add_pda(irq_call_count, 1);
16085+#endif
16086+ irq_exit();
16087
16088 return IRQ_HANDLED;
16089 }
16090--- sle11-2009-06-04.orig/arch/x86/kernel/time_32-xen.c 2009-03-24 10:13:09.000000000 +0100
16091+++ sle11-2009-06-04/arch/x86/kernel/time_32-xen.c 2009-06-04 10:21:39.000000000 +0200
16092@@ -468,7 +468,7 @@ irqreturn_t timer_interrupt(int irq, voi
16093
16094 /* Keep nmi watchdog up to date */
16095 #ifdef __i386__
16096- per_cpu(irq_stat, smp_processor_id()).irq0_irqs++;
16097+ x86_add_percpu(irq_stat.irq0_irqs, 1);
16098 #else
16099 add_pda(irq0_irqs, 1);
16100 #endif
16101@@ -746,9 +746,7 @@ void __init time_init(void)
16102
16103 update_wallclock();
16104
16105-#ifndef CONFIG_X86_64
16106 use_tsc_delay();
16107-#endif
16108
16109 /* Cannot request_irq() until kmem is initialised. */
16110 late_time_init = setup_cpu0_timer_irq;
16111@@ -805,7 +803,8 @@ static void stop_hz_timer(void)
16112
16113 /* Leave ourselves in tick mode if rcu or softirq or timer pending. */
16114 if (rcu_needs_cpu(cpu) || local_softirq_pending() ||
16115- (j = next_timer_interrupt(), time_before_eq(j, jiffies))) {
16116+ (j = get_next_timer_interrupt(jiffies),
16117+ time_before_eq(j, jiffies))) {
16118 cpu_clear(cpu, nohz_cpu_mask);
16119 j = jiffies + 1;
16120 }
16121--- sle11-2009-06-04.orig/arch/x86/kernel/traps_32-xen.c 2009-03-16 16:38:05.000000000 +0100
16122+++ sle11-2009-06-04/arch/x86/kernel/traps_32-xen.c 2009-06-04 10:21:39.000000000 +0200
16123@@ -1,5 +1,6 @@
16124 /*
16125 * Copyright (C) 1991, 1992 Linus Torvalds
16126+ * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
16127 *
16128 * Pentium III FXSR, SSE support
16129 * Gareth Hughes <gareth@valinux.com>, May 2000
16130@@ -57,11 +58,10 @@
16131 #include <asm/nmi.h>
16132 #include <asm/smp.h>
16133 #include <asm/io.h>
16134+#include <asm/traps.h>
16135
16136 #include "mach_traps.h"
16137
16138-int panic_on_unrecovered_nmi;
16139-
16140 #ifndef CONFIG_XEN
16141 DECLARE_BITMAP(used_vectors, NR_VECTORS);
16142 EXPORT_SYMBOL_GPL(used_vectors);
16143@@ -82,43 +82,22 @@ gate_desc idt_table[256]
16144 __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, };
16145 #endif
16146
16147-asmlinkage void divide_error(void);
16148-asmlinkage void debug(void);
16149-asmlinkage void nmi(void);
16150-asmlinkage void int3(void);
16151-asmlinkage void overflow(void);
16152-asmlinkage void bounds(void);
16153-asmlinkage void invalid_op(void);
16154-asmlinkage void device_not_available(void);
16155-asmlinkage void coprocessor_segment_overrun(void);
16156-asmlinkage void invalid_TSS(void);
16157-asmlinkage void segment_not_present(void);
16158-asmlinkage void stack_segment(void);
16159-asmlinkage void general_protection(void);
16160-asmlinkage void page_fault(void);
16161-asmlinkage void coprocessor_error(void);
16162-asmlinkage void simd_coprocessor_error(void);
16163-asmlinkage void alignment_check(void);
16164-#ifndef CONFIG_XEN
16165-asmlinkage void spurious_interrupt_bug(void);
16166-#else
16167-asmlinkage void fixup_4gb_segment(void);
16168-#endif
16169-asmlinkage void machine_check(void);
16170-
16171+int panic_on_unrecovered_nmi;
16172 int kstack_depth_to_print = 24;
16173 static unsigned int code_bytes = 64;
16174+static int ignore_nmis;
16175+static int die_counter;
16176
16177 void printk_address(unsigned long address, int reliable)
16178 {
16179 #ifdef CONFIG_KALLSYMS
16180- char namebuf[KSYM_NAME_LEN];
16181 unsigned long offset = 0;
16182 unsigned long symsize;
16183 const char *symname;
16184- char reliab[4] = "";
16185- char *delim = ":";
16186 char *modname;
16187+ char *delim = ":";
16188+ char namebuf[KSYM_NAME_LEN];
16189+ char reliab[4] = "";
16190
16191 symname = kallsyms_lookup(address, &symsize, &offset,
16192 &modname, namebuf);
16193@@ -138,22 +117,23 @@ void printk_address(unsigned long addres
16194 #endif
16195 }
16196
16197-static inline int valid_stack_ptr(struct thread_info *tinfo, void *p, unsigned size)
16198+static inline int valid_stack_ptr(struct thread_info *tinfo,
16199+ void *p, unsigned int size)
16200 {
16201- return p > (void *)tinfo &&
16202- p <= (void *)tinfo + THREAD_SIZE - size;
16203+ void *t = tinfo;
16204+ return p > t && p <= t + THREAD_SIZE - size;
16205 }
16206
16207 /* The form of the top of the frame on the stack */
16208 struct stack_frame {
16209- struct stack_frame *next_frame;
16210- unsigned long return_address;
16211+ struct stack_frame *next_frame;
16212+ unsigned long return_address;
16213 };
16214
16215 static inline unsigned long
16216 print_context_stack(struct thread_info *tinfo,
16217- unsigned long *stack, unsigned long bp,
16218- const struct stacktrace_ops *ops, void *data)
16219+ unsigned long *stack, unsigned long bp,
16220+ const struct stacktrace_ops *ops, void *data)
16221 {
16222 struct stack_frame *frame = (struct stack_frame *)bp;
16223
16224@@ -175,8 +155,6 @@ print_context_stack(struct thread_info *
16225 return bp;
16226 }
16227
16228-#define MSG(msg) ops->warning(data, msg)
16229-
16230 void dump_trace(struct task_struct *task, struct pt_regs *regs,
16231 unsigned long *stack, unsigned long bp,
16232 const struct stacktrace_ops *ops, void *data)
16233@@ -186,7 +164,6 @@ void dump_trace(struct task_struct *task
16234
16235 if (!stack) {
16236 unsigned long dummy;
16237-
16238 stack = &dummy;
16239 if (task != current)
16240 stack = (unsigned long *)task->thread.sp;
16241@@ -204,7 +181,7 @@ void dump_trace(struct task_struct *task
16242 }
16243 #endif
16244
16245- while (1) {
16246+ for (;;) {
16247 struct thread_info *context;
16248
16249 context = (struct thread_info *)
16250@@ -256,15 +233,15 @@ static void print_trace_address(void *da
16251 }
16252
16253 static const struct stacktrace_ops print_trace_ops = {
16254- .warning = print_trace_warning,
16255- .warning_symbol = print_trace_warning_symbol,
16256- .stack = print_trace_stack,
16257- .address = print_trace_address,
16258+ .warning = print_trace_warning,
16259+ .warning_symbol = print_trace_warning_symbol,
16260+ .stack = print_trace_stack,
16261+ .address = print_trace_address,
16262 };
16263
16264 static void
16265 show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
16266- unsigned long *stack, unsigned long bp, char *log_lvl)
16267+ unsigned long *stack, unsigned long bp, char *log_lvl)
16268 {
16269 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
16270 printk("%s =======================\n", log_lvl);
16271@@ -359,15 +336,14 @@ void show_registers(struct pt_regs *regs
16272 printk(KERN_EMERG "Code: ");
16273
16274 ip = (u8 *)regs->ip - code_prologue;
16275- if (ip < (u8 *)PAGE_OFFSET ||
16276- probe_kernel_address(ip, c)) {
16277+ if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
16278 /* try starting at EIP */
16279 ip = (u8 *)regs->ip;
16280 code_len = code_len - code_prologue + 1;
16281 }
16282 for (i = 0; i < code_len; i++, ip++) {
16283 if (ip < (u8 *)PAGE_OFFSET ||
16284- probe_kernel_address(ip, c)) {
16285+ probe_kernel_address(ip, c)) {
16286 printk(" Bad EIP value.");
16287 break;
16288 }
16289@@ -392,7 +368,53 @@ int is_valid_bugaddr(unsigned long ip)
16290 return ud2 == 0x0b0f;
16291 }
16292
16293-static int die_counter;
16294+static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
16295+static int die_owner = -1;
16296+static unsigned int die_nest_count;
16297+
16298+unsigned __kprobes long oops_begin(void)
16299+{
16300+ unsigned long flags;
16301+
16302+ oops_enter();
16303+
16304+ if (die_owner != raw_smp_processor_id()) {
16305+ console_verbose();
16306+ raw_local_irq_save(flags);
16307+ __raw_spin_lock(&die_lock);
16308+ die_owner = smp_processor_id();
16309+ die_nest_count = 0;
16310+ bust_spinlocks(1);
16311+ } else {
16312+ raw_local_irq_save(flags);
16313+ }
16314+ die_nest_count++;
16315+ return flags;
16316+}
16317+
16318+void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
16319+{
16320+ bust_spinlocks(0);
16321+ die_owner = -1;
16322+ add_taint(TAINT_DIE);
16323+ __raw_spin_unlock(&die_lock);
16324+ raw_local_irq_restore(flags);
16325+
16326+ if (!regs)
16327+ return;
16328+
16329+ if (kexec_should_crash(current))
16330+ crash_kexec(regs);
16331+
16332+ if (in_interrupt())
16333+ panic("Fatal exception in interrupt");
16334+
16335+ if (panic_on_oops)
16336+ panic("Fatal exception");
16337+
16338+ oops_exit();
16339+ do_exit(signr);
16340+}
16341
16342 int __kprobes __die(const char *str, struct pt_regs *regs, long err)
16343 {
16344@@ -410,26 +432,22 @@ int __kprobes __die(const char *str, str
16345 printk("DEBUG_PAGEALLOC");
16346 #endif
16347 printk("\n");
16348-
16349 if (notify_die(DIE_OOPS, str, regs, err,
16350- current->thread.trap_no, SIGSEGV) != NOTIFY_STOP) {
16351+ current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
16352+ return 1;
16353
16354- show_registers(regs);
16355- /* Executive summary in case the oops scrolled away */
16356- sp = (unsigned long) (&regs->sp);
16357- savesegment(ss, ss);
16358- if (user_mode(regs)) {
16359- sp = regs->sp;
16360- ss = regs->ss & 0xffff;
16361- }
16362- printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
16363- print_symbol("%s", regs->ip);
16364- printk(" SS:ESP %04x:%08lx\n", ss, sp);
16365-
16366- return 0;
16367- }
16368-
16369- return 1;
16370+ show_registers(regs);
16371+ /* Executive summary in case the oops scrolled away */
16372+ sp = (unsigned long) (&regs->sp);
16373+ savesegment(ss, ss);
16374+ if (user_mode(regs)) {
16375+ sp = regs->sp;
16376+ ss = regs->ss & 0xffff;
16377+ }
16378+ printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
16379+ print_symbol("%s", regs->ip);
16380+ printk(" SS:ESP %04x:%08lx\n", ss, sp);
16381+ return 0;
16382 }
16383
16384 /*
16385@@ -438,31 +456,9 @@ int __kprobes __die(const char *str, str
16386 */
16387 void die(const char *str, struct pt_regs *regs, long err)
16388 {
16389- static struct {
16390- raw_spinlock_t lock;
16391- u32 lock_owner;
16392- int lock_owner_depth;
16393- } die = {
16394- .lock = __RAW_SPIN_LOCK_UNLOCKED,
16395- .lock_owner = -1,
16396- .lock_owner_depth = 0
16397- };
16398- unsigned long flags;
16399-
16400- oops_enter();
16401+ unsigned long flags = oops_begin();
16402
16403- if (die.lock_owner != raw_smp_processor_id()) {
16404- console_verbose();
16405- raw_local_irq_save(flags);
16406- __raw_spin_lock(&die.lock);
16407- die.lock_owner = smp_processor_id();
16408- die.lock_owner_depth = 0;
16409- bust_spinlocks(1);
16410- } else {
16411- raw_local_irq_save(flags);
16412- }
16413-
16414- if (++die.lock_owner_depth < 3) {
16415+ if (die_nest_count < 3) {
16416 report_bug(regs->ip, regs);
16417
16418 if (__die(str, regs, err))
16419@@ -471,26 +467,7 @@ void die(const char *str, struct pt_regs
16420 printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
16421 }
16422
16423- bust_spinlocks(0);
16424- die.lock_owner = -1;
16425- add_taint(TAINT_DIE);
16426- __raw_spin_unlock(&die.lock);
16427- raw_local_irq_restore(flags);
16428-
16429- if (!regs)
16430- return;
16431-
16432- if (kexec_should_crash(current))
16433- crash_kexec(regs);
16434-
16435- if (in_interrupt())
16436- panic("Fatal exception in interrupt");
16437-
16438- if (panic_on_oops)
16439- panic("Fatal exception");
16440-
16441- oops_exit();
16442- do_exit(SIGSEGV);
16443+ oops_end(flags, regs, SIGSEGV);
16444 }
16445
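
The two hunks above replace the function-local `die` lock structure in the 32-bit die() with file-scope die_lock/die_owner/die_nest_count and shared oops_begin()/oops_end() helpers, matching the existing 64-bit code. A simplified usage sketch of the resulting pattern; the recursion limit and SIGSEGV value are as shown in the hunk, but report_bug() and the __die() return handling are omitted here for brevity:

        void die(const char *str, struct pt_regs *regs, long err)
        {
                unsigned long flags = oops_begin();     /* takes die_lock once per CPU */

                if (die_nest_count < 3)                 /* give up on deep recursion */
                        __die(str, regs, err);
                else
                        printk(KERN_EMERG "Recursive die() failure, output suppressed\n");

                oops_end(flags, regs, SIGSEGV);         /* unlock, taint, maybe panic */
        }
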
16446 static inline void
16447@@ -554,7 +531,7 @@ void do_##name(struct pt_regs *regs, lon
16448 { \
16449 trace_hardirqs_fixup(); \
16450 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
16451- == NOTIFY_STOP) \
16452+ == NOTIFY_STOP) \
16453 return; \
16454 do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \
16455 }
16456@@ -570,7 +547,7 @@ void do_##name(struct pt_regs *regs, lon
16457 info.si_code = sicode; \
16458 info.si_addr = (void __user *)siaddr; \
16459 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
16460- == NOTIFY_STOP) \
16461+ == NOTIFY_STOP) \
16462 return; \
16463 do_trap(trapnr, signr, str, 0, regs, error_code, &info); \
16464 }
16465@@ -579,7 +556,7 @@ void do_##name(struct pt_regs *regs, lon
16466 void do_##name(struct pt_regs *regs, long error_code) \
16467 { \
16468 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
16469- == NOTIFY_STOP) \
16470+ == NOTIFY_STOP) \
16471 return; \
16472 do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \
16473 }
16474@@ -594,28 +571,29 @@ void do_##name(struct pt_regs *regs, lon
16475 info.si_addr = (void __user *)siaddr; \
16476 trace_hardirqs_fixup(); \
16477 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
16478- == NOTIFY_STOP) \
16479+ == NOTIFY_STOP) \
16480 return; \
16481 do_trap(trapnr, signr, str, 1, regs, error_code, &info); \
16482 }
16483
16484-DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
16485+DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
16486 #ifndef CONFIG_KPROBES
16487 DO_VM86_ERROR(3, SIGTRAP, "int3", int3)
16488 #endif
16489 DO_VM86_ERROR(4, SIGSEGV, "overflow", overflow)
16490 DO_VM86_ERROR(5, SIGSEGV, "bounds", bounds)
16491-DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0)
16492-DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
16493+DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0)
16494+DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
16495 DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
16496-DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
16497-DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
16498+DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
16499+DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
16500 DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0)
16501 DO_ERROR_INFO(32, SIGILL, "iret exception", iret_error, ILL_BADSTK, 0, 1)
16502
16503-void __kprobes do_general_protection(struct pt_regs * regs,
16504- long error_code)
16505+void __kprobes
16506+do_general_protection(struct pt_regs *regs, long error_code)
16507 {
16508+ struct task_struct *tsk;
16509 struct thread_struct *thread;
16510
16511 thread = &current->thread;
16512@@ -623,23 +601,24 @@ void __kprobes do_general_protection(str
16513 if (regs->flags & X86_VM_MASK)
16514 goto gp_in_vm86;
16515
16516+ tsk = current;
16517 if (!user_mode(regs))
16518 goto gp_in_kernel;
16519
16520- current->thread.error_code = error_code;
16521- current->thread.trap_no = 13;
16522+ tsk->thread.error_code = error_code;
16523+ tsk->thread.trap_no = 13;
16524
16525- if (show_unhandled_signals && unhandled_signal(current, SIGSEGV) &&
16526- printk_ratelimit()) {
16527+ if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
16528+ printk_ratelimit()) {
16529 printk(KERN_INFO
16530- "%s[%d] general protection ip:%lx sp:%lx error:%lx",
16531- current->comm, task_pid_nr(current),
16532- regs->ip, regs->sp, error_code);
16533+ "%s[%d] general protection ip:%lx sp:%lx error:%lx",
16534+ tsk->comm, task_pid_nr(tsk),
16535+ regs->ip, regs->sp, error_code);
16536 print_vma_addr(" in ", regs->ip);
16537 printk("\n");
16538 }
16539
16540- force_sig(SIGSEGV, current);
16541+ force_sig(SIGSEGV, tsk);
16542 return;
16543
16544 gp_in_vm86:
16545@@ -648,14 +627,15 @@ gp_in_vm86:
16546 return;
16547
16548 gp_in_kernel:
16549- if (!fixup_exception(regs)) {
16550- current->thread.error_code = error_code;
16551- current->thread.trap_no = 13;
16552- if (notify_die(DIE_GPF, "general protection fault", regs,
16553+ if (fixup_exception(regs))
16554+ return;
16555+
16556+ tsk->thread.error_code = error_code;
16557+ tsk->thread.trap_no = 13;
16558+ if (notify_die(DIE_GPF, "general protection fault", regs,
16559 error_code, 13, SIGSEGV) == NOTIFY_STOP)
16560- return;
16561- die("general protection fault", regs, error_code);
16562- }
16563+ return;
16564+ die("general protection fault", regs, error_code);
16565 }
16566
16567 static notrace __kprobes void
16568@@ -722,9 +702,9 @@ unknown_nmi_error(unsigned char reason,
16569
16570 static DEFINE_SPINLOCK(nmi_print_lock);
16571
16572-void notrace __kprobes die_nmi(struct pt_regs *regs, const char *msg)
16573+void notrace __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic)
16574 {
16575- if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 2, SIGINT) == NOTIFY_STOP)
16576+ if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
16577 return;
16578
16579 spin_lock(&nmi_print_lock);
16580@@ -733,10 +713,12 @@ void notrace __kprobes die_nmi(struct pt
16581 * to get a message out:
16582 */
16583 bust_spinlocks(1);
16584- printk(KERN_EMERG "%s", msg);
16585+ printk(KERN_EMERG "%s", str);
16586 printk(" on CPU%d, ip %08lx, registers:\n",
16587 smp_processor_id(), regs->ip);
16588 show_registers(regs);
16589+ if (do_panic)
16590+ panic("Non maskable interrupt");
16591 console_silent();
16592 spin_unlock(&nmi_print_lock);
16593 bust_spinlocks(0);
16594@@ -756,14 +738,17 @@ void notrace __kprobes die_nmi(struct pt
16595 static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
16596 {
16597 unsigned char reason = 0;
16598+ int cpu;
16599
16600- /* Only the BSP gets external NMIs from the system: */
16601- if (!smp_processor_id())
16602+ cpu = smp_processor_id();
16603+
16604+ /* Only the BSP gets external NMIs from the system. */
16605+ if (!cpu)
16606 reason = get_nmi_reason();
16607
16608 if (!(reason & 0xc0)) {
16609 if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
16610- == NOTIFY_STOP)
16611+ == NOTIFY_STOP)
16612 return;
16613 #ifdef CONFIG_X86_LOCAL_APIC
16614 /*
16615@@ -772,7 +757,7 @@ static notrace __kprobes void default_do
16616 */
16617 if (nmi_watchdog_tick(regs, reason))
16618 return;
16619- if (!do_nmi_callback(regs, smp_processor_id()))
16620+ if (!do_nmi_callback(regs, cpu))
16621 unknown_nmi_error(reason, regs);
16622 #else
16623 unknown_nmi_error(reason, regs);
16624@@ -782,6 +767,8 @@ static notrace __kprobes void default_do
16625 }
16626 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
16627 return;
16628+
16629+ /* AK: following checks seem to be broken on modern chipsets. FIXME */
16630 if (reason & 0x80)
16631 mem_parity_error(reason, regs);
16632 if (reason & 0x40)
16633@@ -793,8 +780,6 @@ static notrace __kprobes void default_do
16634 reassert_nmi();
16635 }
16636
16637-static int ignore_nmis;
16638-
16639 notrace __kprobes void do_nmi(struct pt_regs *regs, long error_code)
16640 {
16641 int cpu;
16642@@ -879,7 +864,7 @@ void __kprobes do_debug(struct pt_regs *
16643 tsk->thread.debugctlmsr = 0;
16644
16645 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
16646- SIGTRAP) == NOTIFY_STOP)
16647+ SIGTRAP) == NOTIFY_STOP)
16648 return;
16649 /* It's safe to allow irq's after DR6 has been saved */
16650 if (regs->flags & X86_EFLAGS_IF)
16651@@ -940,9 +925,8 @@ clear_TF_reenable:
16652 void math_error(void __user *ip)
16653 {
16654 struct task_struct *task;
16655- unsigned short cwd;
16656- unsigned short swd;
16657 siginfo_t info;
16658+ unsigned short cwd, swd;
16659
16660 /*
16661 * Save the info for the exception handler and clear the error.
16662@@ -961,7 +945,7 @@ void math_error(void __user *ip)
16663 * C1 reg you need in case of a stack fault, 0x040 is the stack
16664 * fault bit. We should only be taking one exception at a time,
16665 * so if this combination doesn't produce any single exception,
16666- * then we have a bad program that isn't syncronizing its FPU usage
16667+ * then we have a bad program that isn't synchronizing its FPU usage
16668 * and it will suffer the consequences since we won't be able to
16669 * fully reproduce the context of the exception
16670 */
16671@@ -970,7 +954,7 @@ void math_error(void __user *ip)
16672 switch (swd & ~cwd & 0x3f) {
16673 case 0x000: /* No unmasked exception */
16674 return;
16675- default: /* Multiple exceptions */
16676+ default: /* Multiple exceptions */
16677 break;
16678 case 0x001: /* Invalid Op */
16679 /*
16680@@ -1006,8 +990,8 @@ void do_coprocessor_error(struct pt_regs
16681 static void simd_math_error(void __user *ip)
16682 {
16683 struct task_struct *task;
16684- unsigned short mxcsr;
16685 siginfo_t info;
16686+ unsigned short mxcsr;
16687
16688 /*
16689 * Save the info for the exception handler and clear the error.
16690@@ -1084,7 +1068,7 @@ void do_spurious_interrupt_bug(struct pt
16691
16692 unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp)
16693 {
16694- struct desc_struct *gdt = __get_cpu_var(gdt_page).gdt;
16695+ struct desc_struct *gdt = get_cpu_gdt_table(smp_processor_id());
16696 unsigned long base = (kesp - uesp) & -THREAD_SIZE;
16697 unsigned long new_kesp = kesp - base;
16698 unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT;
00e5a55c
BS
16699--- sle11-2009-06-04.orig/arch/x86/kernel/traps_64-xen.c 2009-03-16 16:38:05.000000000 +0100
16700+++ sle11-2009-06-04/arch/x86/kernel/traps_64-xen.c 2009-06-04 10:21:39.000000000 +0200
cc90b958
BS
16701@@ -10,73 +10,56 @@
16702 * 'Traps.c' handles hardware traps and faults after we have saved some
16703 * state in 'entry.S'.
16704 */
16705-#include <linux/sched.h>
16706+#include <linux/moduleparam.h>
16707+#include <linux/interrupt.h>
16708+#include <linux/kallsyms.h>
16709+#include <linux/spinlock.h>
16710+#include <linux/kprobes.h>
16711+#include <linux/uaccess.h>
16712+#include <linux/utsname.h>
16713+#include <linux/kdebug.h>
16714 #include <linux/kernel.h>
16715+#include <linux/module.h>
16716+#include <linux/ptrace.h>
16717 #include <linux/string.h>
16718+#include <linux/unwind.h>
16719+#include <linux/delay.h>
16720 #include <linux/errno.h>
16721-#include <linux/ptrace.h>
16722+#include <linux/kexec.h>
16723+#include <linux/sched.h>
16724 #include <linux/timer.h>
16725-#include <linux/mm.h>
16726 #include <linux/init.h>
16727-#include <linux/delay.h>
16728-#include <linux/spinlock.h>
16729-#include <linux/interrupt.h>
16730-#include <linux/kallsyms.h>
16731-#include <linux/module.h>
16732-#include <linux/moduleparam.h>
16733-#include <linux/nmi.h>
16734-#include <linux/kprobes.h>
16735-#include <linux/kexec.h>
16736-#include <linux/unwind.h>
16737-#include <linux/uaccess.h>
16738 #include <linux/bug.h>
16739-#include <linux/kdebug.h>
16740-#include <linux/utsname.h>
16741-
16742-#include <mach_traps.h>
16743+#include <linux/nmi.h>
16744+#include <linux/mm.h>
16745
16746 #if defined(CONFIG_EDAC)
16747 #include <linux/edac.h>
16748 #endif
16749
16750-#include <asm/system.h>
16751-#include <asm/io.h>
16752-#include <asm/atomic.h>
16753+#include <asm/stacktrace.h>
16754+#include <asm/processor.h>
16755 #include <asm/debugreg.h>
16756+#include <asm/atomic.h>
16757+#include <asm/system.h>
16758+#include <asm/unwind.h>
16759 #include <asm/desc.h>
16760 #include <asm/i387.h>
16761-#include <asm/processor.h>
16762-#include <asm/unwind.h>
16763+#include <asm/nmi.h>
16764 #include <asm/smp.h>
16765+#include <asm/io.h>
16766 #include <asm/pgalloc.h>
16767-#include <asm/pda.h>
16768 #include <asm/proto.h>
16769-#include <asm/nmi.h>
16770-#include <asm/stacktrace.h>
16771+#include <asm/pda.h>
16772+#include <asm/traps.h>
16773
16774-asmlinkage void divide_error(void);
16775-asmlinkage void debug(void);
16776-asmlinkage void nmi(void);
16777-asmlinkage void int3(void);
16778-asmlinkage void overflow(void);
16779-asmlinkage void bounds(void);
16780-asmlinkage void invalid_op(void);
16781-asmlinkage void device_not_available(void);
16782-asmlinkage void double_fault(void);
16783-asmlinkage void coprocessor_segment_overrun(void);
16784-asmlinkage void invalid_TSS(void);
16785-asmlinkage void segment_not_present(void);
16786-asmlinkage void stack_segment(void);
16787-asmlinkage void general_protection(void);
16788-asmlinkage void page_fault(void);
16789-asmlinkage void coprocessor_error(void);
16790-asmlinkage void simd_coprocessor_error(void);
16791-asmlinkage void reserved(void);
16792-asmlinkage void alignment_check(void);
16793-asmlinkage void machine_check(void);
16794-asmlinkage void spurious_interrupt_bug(void);
16795+#include <mach_traps.h>
16796
16797+int panic_on_unrecovered_nmi;
16798+int kstack_depth_to_print = 12;
16799 static unsigned int code_bytes = 64;
16800+static int ignore_nmis;
16801+static int die_counter;
16802
16803 static inline void conditional_sti(struct pt_regs *regs)
16804 {
16805@@ -100,34 +83,9 @@ static inline void preempt_conditional_c
16806 dec_preempt_count();
16807 }
16808
16809-int kstack_depth_to_print = 12;
16810-
16811 void printk_address(unsigned long address, int reliable)
16812 {
16813-#ifdef CONFIG_KALLSYMS
16814- unsigned long offset = 0, symsize;
16815- const char *symname;
16816- char *modname;
16817- char *delim = ":";
16818- char namebuf[KSYM_NAME_LEN];
16819- char reliab[4] = "";
16820-
16821- symname = kallsyms_lookup(address, &symsize, &offset,
16822- &modname, namebuf);
16823- if (!symname) {
16824- printk(" [<%016lx>]\n", address);
16825- return;
16826- }
16827- if (!reliable)
16828- strcpy(reliab, "? ");
16829-
16830- if (!modname)
16831- modname = delim = "";
16832- printk(" [<%016lx>] %s%s%s%s%s+0x%lx/0x%lx\n",
16833- address, reliab, delim, modname, delim, symname, offset, symsize);
16834-#else
16835- printk(" [<%016lx>]\n", address);
16836-#endif
16837+ printk(" [<%016lx>] %s%pS\n", address, reliable ? "": "? ", (void *) address);
16838 }
16839
16840 static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
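
The x86-64 printk_address() above now relies on the %pS printk extension, which resolves a kernel text address to symbol+offset via kallsyms, instead of open-coding kallsyms_lookup(). The single remaining line, with comments added for context:

        /* %pS prints symbol+offset for a kernel text pointer (kallsyms-backed);
         * "? " marks addresses the unwinder could not verify as reliable. */
        printk(" [<%016lx>] %s%pS\n", address, reliable ? "" : "? ", (void *)address);
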
16841@@ -206,8 +164,6 @@ static unsigned long *in_exception_stack
16842 return NULL;
16843 }
16844
16845-#define MSG(txt) ops->warning(data, txt)
16846-
16847 /*
16848 * x86-64 can have up to three kernel stacks:
16849 * process stack
16850@@ -234,11 +190,11 @@ struct stack_frame {
16851 unsigned long return_address;
16852 };
16853
16854-
16855-static inline unsigned long print_context_stack(struct thread_info *tinfo,
16856- unsigned long *stack, unsigned long bp,
16857- const struct stacktrace_ops *ops, void *data,
16858- unsigned long *end)
16859+static inline unsigned long
16860+print_context_stack(struct thread_info *tinfo,
16861+ unsigned long *stack, unsigned long bp,
16862+ const struct stacktrace_ops *ops, void *data,
16863+ unsigned long *end)
16864 {
16865 struct stack_frame *frame = (struct stack_frame *)bp;
16866
16867@@ -260,7 +216,7 @@ static inline unsigned long print_contex
16868 return bp;
16869 }
16870
16871-void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
16872+void dump_trace(struct task_struct *task, struct pt_regs *regs,
16873 unsigned long *stack, unsigned long bp,
16874 const struct stacktrace_ops *ops, void *data)
16875 {
16876@@ -269,36 +225,34 @@ void dump_trace(struct task_struct *tsk,
16877 unsigned used = 0;
16878 struct thread_info *tinfo;
16879
16880- if (!tsk)
16881- tsk = current;
16882- tinfo = task_thread_info(tsk);
16883+ if (!task)
16884+ task = current;
16885
16886 if (!stack) {
16887 unsigned long dummy;
16888 stack = &dummy;
16889- if (tsk && tsk != current)
16890- stack = (unsigned long *)tsk->thread.sp;
16891+ if (task && task != current)
16892+ stack = (unsigned long *)task->thread.sp;
16893 }
16894
16895 #ifdef CONFIG_FRAME_POINTER
16896 if (!bp) {
16897- if (tsk == current) {
16898+ if (task == current) {
16899 /* Grab bp right from our regs */
16900- asm("movq %%rbp, %0" : "=r" (bp):);
16901+ asm("movq %%rbp, %0" : "=r" (bp) :);
16902 } else {
16903 /* bp is the last reg pushed by switch_to */
16904- bp = *(unsigned long *) tsk->thread.sp;
16905+ bp = *(unsigned long *) task->thread.sp;
16906 }
16907 }
16908 #endif
16909
16910-
16911-
16912 /*
16913 * Print function call entries in all stacks, starting at the
16914 * current stack address. If the stacks consist of nested
16915 * exceptions
16916 */
16917+ tinfo = task_thread_info(task);
16918 for (;;) {
16919 char *id;
16920 unsigned long *estack_end;
16921@@ -383,18 +337,24 @@ static const struct stacktrace_ops print
16922 .address = print_trace_address,
16923 };
16924
16925-void
16926-show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack,
16927- unsigned long bp)
16928+static void
16929+show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
16930+ unsigned long *stack, unsigned long bp, char *log_lvl)
16931 {
16932 printk("\nCall Trace:\n");
16933- dump_trace(tsk, regs, stack, bp, &print_trace_ops, NULL);
16934+ dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
16935 printk("\n");
16936 }
16937
16938+void show_trace(struct task_struct *task, struct pt_regs *regs,
16939+ unsigned long *stack, unsigned long bp)
16940+{
16941+ show_trace_log_lvl(task, regs, stack, bp, "");
16942+}
16943+
16944 static void
16945-_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *sp,
16946- unsigned long bp)
16947+show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
16948+ unsigned long *sp, unsigned long bp, char *log_lvl)
16949 {
16950 unsigned long *stack;
16951 int i;
16952@@ -406,14 +366,14 @@ _show_stack(struct task_struct *tsk, str
16953 // back trace for this cpu.
16954
16955 if (sp == NULL) {
16956- if (tsk)
16957- sp = (unsigned long *)tsk->thread.sp;
16958+ if (task)
16959+ sp = (unsigned long *)task->thread.sp;
16960 else
16961 sp = (unsigned long *)&sp;
16962 }
16963
16964 stack = sp;
16965- for(i=0; i < kstack_depth_to_print; i++) {
16966+ for (i = 0; i < kstack_depth_to_print; i++) {
16967 if (stack >= irqstack && stack <= irqstack_end) {
16968 if (stack == irqstack_end) {
16969 stack = (unsigned long *) (irqstack_end[-1]);
16970@@ -428,12 +388,12 @@ _show_stack(struct task_struct *tsk, str
16971 printk(" %016lx", *stack++);
16972 touch_nmi_watchdog();
16973 }
16974- show_trace(tsk, regs, sp, bp);
16975+ show_trace_log_lvl(task, regs, sp, bp, log_lvl);
16976 }
16977
16978-void show_stack(struct task_struct *tsk, unsigned long * sp)
16979+void show_stack(struct task_struct *task, unsigned long *sp)
16980 {
16981- _show_stack(tsk, NULL, sp, 0);
16982+ show_stack_log_lvl(task, NULL, sp, 0, "");
16983 }
16984
16985 /*
16986@@ -441,8 +401,8 @@ void show_stack(struct task_struct *tsk,
16987 */
16988 void dump_stack(void)
16989 {
16990- unsigned long dummy;
16991 unsigned long bp = 0;
16992+ unsigned long stack;
16993
16994 #ifdef CONFIG_FRAME_POINTER
16995 if (!bp)
16996@@ -454,7 +414,7 @@ void dump_stack(void)
16997 init_utsname()->release,
16998 (int)strcspn(init_utsname()->version, " "),
16999 init_utsname()->version);
17000- show_trace(NULL, NULL, &dummy, bp);
17001+ show_trace(NULL, NULL, &stack, bp);
17002 }
17003
17004 EXPORT_SYMBOL(dump_stack);
17005@@ -465,12 +425,8 @@ void show_registers(struct pt_regs *regs
17006 unsigned long sp;
17007 const int cpu = smp_processor_id();
17008 struct task_struct *cur = cpu_pda(cpu)->pcurrent;
17009- u8 *ip;
17010- unsigned int code_prologue = code_bytes * 43 / 64;
17011- unsigned int code_len = code_bytes;
17012
17013 sp = regs->sp;
17014- ip = (u8 *) regs->ip - code_prologue;
17015 printk("CPU %d ", cpu);
17016 __show_regs(regs);
17017 printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
17018@@ -481,15 +437,22 @@ void show_registers(struct pt_regs *regs
17019 * time of the fault..
17020 */
17021 if (!user_mode(regs)) {
17022+ unsigned int code_prologue = code_bytes * 43 / 64;
17023+ unsigned int code_len = code_bytes;
17024 unsigned char c;
17025+ u8 *ip;
17026+
17027 printk("Stack: ");
17028- _show_stack(NULL, regs, (unsigned long *)sp, regs->bp);
17029+ show_stack_log_lvl(NULL, regs, (unsigned long *)sp,
17030+ regs->bp, "");
17031 printk("\n");
17032
17033 printk(KERN_EMERG "Code: ");
17034+
17035+ ip = (u8 *)regs->ip - code_prologue;
17036 if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
17037 /* try starting at RIP */
17038- ip = (u8 *) regs->ip;
17039+ ip = (u8 *)regs->ip;
17040 code_len = code_len - code_prologue + 1;
17041 }
17042 for (i = 0; i < code_len; i++, ip++) {
17043@@ -505,7 +468,7 @@ void show_registers(struct pt_regs *regs
17044 }
17045 }
17046 printk("\n");
17047-}
17048+}
17049
17050 int is_valid_bugaddr(unsigned long ip)
17051 {
17052@@ -545,7 +508,7 @@ unsigned __kprobes long oops_begin(void)
17053 }
17054
17055 void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
17056-{
17057+{
17058 die_owner = -1;
17059 bust_spinlocks(0);
17060 die_nest_count--;
17061@@ -563,10 +526,9 @@ void __kprobes oops_end(unsigned long fl
17062 do_exit(signr);
17063 }
17064
17065-int __kprobes __die(const char * str, struct pt_regs * regs, long err)
17066+int __kprobes __die(const char *str, struct pt_regs *regs, long err)
17067 {
17068- static int die_counter;
17069- printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter);
17070+ printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff, ++die_counter);
17071 #ifdef CONFIG_PREEMPT
17072 printk("PREEMPT ");
17073 #endif
17074@@ -577,8 +539,10 @@ int __kprobes __die(const char * str, st
17075 printk("DEBUG_PAGEALLOC");
17076 #endif
17077 printk("\n");
17078- if (notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
17079+ if (notify_die(DIE_OOPS, str, regs, err,
17080+ current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
17081 return 1;
17082+
17083 show_registers(regs);
17084 add_taint(TAINT_DIE);
17085 /* Executive summary in case the oops scrolled away */
17086@@ -590,7 +554,7 @@ int __kprobes __die(const char * str, st
17087 return 0;
17088 }
17089
17090-void die(const char * str, struct pt_regs * regs, long err)
17091+void die(const char *str, struct pt_regs *regs, long err)
17092 {
17093 unsigned long flags = oops_begin();
17094
17095@@ -608,8 +572,7 @@ die_nmi(char *str, struct pt_regs *regs,
17096 {
17097 unsigned long flags;
17098
17099- if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) ==
17100- NOTIFY_STOP)
17101+ if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
17102 return;
17103
17104 flags = oops_begin();
17105@@ -617,7 +580,9 @@ die_nmi(char *str, struct pt_regs *regs,
17106 * We are in trouble anyway, lets at least try
17107 * to get a message out.
17108 */
17109- printk(str, smp_processor_id());
17110+ printk(KERN_EMERG "%s", str);
17111+ printk(" on CPU%d, ip %08lx, registers:\n",
17112+ smp_processor_id(), regs->ip);
17113 show_registers(regs);
17114 if (kexec_should_crash(current))
17115 crash_kexec(regs);
17116@@ -630,44 +595,44 @@ die_nmi(char *str, struct pt_regs *regs,
17117 }
17118 #endif
17119
17120-static void __kprobes do_trap(int trapnr, int signr, char *str,
17121- struct pt_regs * regs, long error_code,
17122- siginfo_t *info)
17123+static void __kprobes
17124+do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
17125+ long error_code, siginfo_t *info)
17126 {
17127 struct task_struct *tsk = current;
17128
17129- if (user_mode(regs)) {
17130- /*
17131- * We want error_code and trap_no set for userspace
17132- * faults and kernelspace faults which result in
17133- * die(), but not kernelspace faults which are fixed
17134- * up. die() gives the process no chance to handle
17135- * the signal and notice the kernel fault information,
17136- * so that won't result in polluting the information
17137- * about previously queued, but not yet delivered,
17138- * faults. See also do_general_protection below.
17139- */
17140- tsk->thread.error_code = error_code;
17141- tsk->thread.trap_no = trapnr;
17142+ if (!user_mode(regs))
17143+ goto kernel_trap;
17144
17145- if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
17146- printk_ratelimit()) {
17147- printk(KERN_INFO
17148- "%s[%d] trap %s ip:%lx sp:%lx error:%lx",
17149- tsk->comm, tsk->pid, str,
17150- regs->ip, regs->sp, error_code);
17151- print_vma_addr(" in ", regs->ip);
17152- printk("\n");
17153- }
17154+ /*
17155+ * We want error_code and trap_no set for userspace faults and
17156+ * kernelspace faults which result in die(), but not
17157+ * kernelspace faults which are fixed up. die() gives the
17158+ * process no chance to handle the signal and notice the
17159+ * kernel fault information, so that won't result in polluting
17160+ * the information about previously queued, but not yet
17161+ * delivered, faults. See also do_general_protection below.
17162+ */
17163+ tsk->thread.error_code = error_code;
17164+ tsk->thread.trap_no = trapnr;
17165
17166- if (info)
17167- force_sig_info(signr, info, tsk);
17168- else
17169- force_sig(signr, tsk);
17170- return;
17171+ if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
17172+ printk_ratelimit()) {
17173+ printk(KERN_INFO
17174+ "%s[%d] trap %s ip:%lx sp:%lx error:%lx",
17175+ tsk->comm, tsk->pid, str,
17176+ regs->ip, regs->sp, error_code);
17177+ print_vma_addr(" in ", regs->ip);
17178+ printk("\n");
17179 }
17180
17181+ if (info)
17182+ force_sig_info(signr, info, tsk);
17183+ else
17184+ force_sig(signr, tsk);
17185+ return;
17186
17187+kernel_trap:
17188 if (!fixup_exception(regs)) {
17189 tsk->thread.error_code = error_code;
17190 tsk->thread.trap_no = trapnr;
17191@@ -677,41 +642,39 @@ static void __kprobes do_trap(int trapnr
17192 }
17193
17194 #define DO_ERROR(trapnr, signr, str, name) \
17195-asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
17196-{ \
17197- if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
17198- == NOTIFY_STOP) \
17199- return; \
17200+asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
17201+{ \
17202+ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
17203+ == NOTIFY_STOP) \
17204+ return; \
17205 conditional_sti(regs); \
17206- do_trap(trapnr, signr, str, regs, error_code, NULL); \
17207+ do_trap(trapnr, signr, str, regs, error_code, NULL); \
17208 }
17209
17210-#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
17211-asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
17212-{ \
17213- siginfo_t info; \
17214- info.si_signo = signr; \
17215- info.si_errno = 0; \
17216- info.si_code = sicode; \
17217- info.si_addr = (void __user *)siaddr; \
17218- trace_hardirqs_fixup(); \
17219- if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
17220- == NOTIFY_STOP) \
17221- return; \
17222+#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
17223+asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
17224+{ \
17225+ siginfo_t info; \
17226+ info.si_signo = signr; \
17227+ info.si_errno = 0; \
17228+ info.si_code = sicode; \
17229+ info.si_addr = (void __user *)siaddr; \
17230+ trace_hardirqs_fixup(); \
17231+ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
17232+ == NOTIFY_STOP) \
17233+ return; \
17234 conditional_sti(regs); \
17235- do_trap(trapnr, signr, str, regs, error_code, &info); \
17236+ do_trap(trapnr, signr, str, regs, error_code, &info); \
17237 }
17238
17239-DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
17240-DO_ERROR( 4, SIGSEGV, "overflow", overflow)
17241-DO_ERROR( 5, SIGSEGV, "bounds", bounds)
17242-DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip)
17243-DO_ERROR( 7, SIGSEGV, "device not available", device_not_available)
17244-DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
17245+DO_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
17246+DO_ERROR(4, SIGSEGV, "overflow", overflow)
17247+DO_ERROR(5, SIGSEGV, "bounds", bounds)
17248+DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip)
17249+DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
17250 DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
17251-DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
17252+DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
17253 DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
17254-DO_ERROR(18, SIGSEGV, "reserved", reserved)
17255
17256 /* Runs on IST stack */
17257 asmlinkage void do_stack_segment(struct pt_regs *regs, long error_code)
17258@@ -741,31 +704,34 @@ asmlinkage void do_double_fault(struct p
17259 die(str, regs, error_code);
17260 }
17261
17262-asmlinkage void __kprobes do_general_protection(struct pt_regs * regs,
17263- long error_code)
17264+asmlinkage void __kprobes
17265+do_general_protection(struct pt_regs *regs, long error_code)
17266 {
17267- struct task_struct *tsk = current;
17268+ struct task_struct *tsk;
17269
17270 conditional_sti(regs);
17271
17272- if (user_mode(regs)) {
17273- tsk->thread.error_code = error_code;
17274- tsk->thread.trap_no = 13;
17275+ tsk = current;
17276+ if (!user_mode(regs))
17277+ goto gp_in_kernel;
17278
17279- if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
17280- printk_ratelimit()) {
17281- printk(KERN_INFO
17282- "%s[%d] general protection ip:%lx sp:%lx error:%lx",
17283- tsk->comm, tsk->pid,
17284- regs->ip, regs->sp, error_code);
17285- print_vma_addr(" in ", regs->ip);
17286- printk("\n");
17287- }
17288+ tsk->thread.error_code = error_code;
17289+ tsk->thread.trap_no = 13;
17290
17291- force_sig(SIGSEGV, tsk);
17292- return;
17293- }
17294+ if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
17295+ printk_ratelimit()) {
17296+ printk(KERN_INFO
17297+ "%s[%d] general protection ip:%lx sp:%lx error:%lx",
17298+ tsk->comm, tsk->pid,
17299+ regs->ip, regs->sp, error_code);
17300+ print_vma_addr(" in ", regs->ip);
17301+ printk("\n");
17302+ }
17303
17304+ force_sig(SIGSEGV, tsk);
17305+ return;
17306+
17307+gp_in_kernel:
17308 if (fixup_exception(regs))
17309 return;
17310
17311@@ -778,14 +744,14 @@ asmlinkage void __kprobes do_general_pro
17312 }
17313
17314 static notrace __kprobes void
17315-mem_parity_error(unsigned char reason, struct pt_regs * regs)
17316+mem_parity_error(unsigned char reason, struct pt_regs *regs)
17317 {
17318 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
17319 reason);
17320 printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
17321
17322 #if defined(CONFIG_EDAC)
17323- if(edac_handler_set()) {
17324+ if (edac_handler_set()) {
17325 edac_atomic_assert_error();
17326 return;
17327 }
17328@@ -801,7 +767,7 @@ mem_parity_error(unsigned char reason, s
17329 }
17330
17331 static notrace __kprobes void
17332-io_check_error(unsigned char reason, struct pt_regs * regs)
17333+io_check_error(unsigned char reason, struct pt_regs *regs)
17334 {
17335 printk("NMI: IOCK error (debug interrupt?)\n");
17336 show_registers(regs);
17337@@ -827,14 +793,14 @@ unknown_nmi_error(unsigned char reason,
17338
17339 /* Runs on IST stack. This code must keep interrupts off all the time.
17340 Nested NMIs are prevented by the CPU. */
17341-asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs)
17342+asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs)
17343 {
17344 unsigned char reason = 0;
17345 int cpu;
17346
17347 cpu = smp_processor_id();
17348
17349- /* Only the BSP gets external NMIs from the system. */
17350+ /* Only the BSP gets external NMIs from the system. */
17351 if (!cpu)
17352 reason = get_nmi_reason();
17353
17354@@ -847,33 +813,58 @@ asmlinkage notrace __kprobes void defau
17355 * Ok, so this is none of the documented NMI sources,
17356 * so it must be the NMI watchdog.
17357 */
17358- if (nmi_watchdog_tick(regs,reason))
17359+ if (nmi_watchdog_tick(regs, reason))
17360 return;
17361 #endif
17362- if (!do_nmi_callback(regs,cpu))
17363+ if (!do_nmi_callback(regs, cpu))
17364 unknown_nmi_error(reason, regs);
17365
17366 return;
17367 }
17368 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
17369- return;
17370+ return;
17371
17372 /* AK: following checks seem to be broken on modern chipsets. FIXME */
17373-
17374 if (reason & 0x80)
17375 mem_parity_error(reason, regs);
17376 if (reason & 0x40)
17377 io_check_error(reason, regs);
17378 }
17379
17380+asmlinkage notrace __kprobes void
17381+do_nmi(struct pt_regs *regs, long error_code)
17382+{
17383+ nmi_enter();
17384+
17385+ add_pda(__nmi_count, 1);
17386+
17387+ if (!ignore_nmis)
17388+ default_do_nmi(regs);
17389+
17390+ nmi_exit();
17391+}
17392+
17393+void stop_nmi(void)
17394+{
17395+ acpi_nmi_disable();
17396+ ignore_nmis++;
17397+}
17398+
17399+void restart_nmi(void)
17400+{
17401+ ignore_nmis--;
17402+ acpi_nmi_enable();
17403+}
17404+
17405 /* runs on IST stack. */
17406 asmlinkage void __kprobes do_int3(struct pt_regs * regs, long error_code)
17407 {
17408 trace_hardirqs_fixup();
17409
17410- if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) {
17411+ if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
17412+ == NOTIFY_STOP)
17413 return;
17414- }
17415+
17416 preempt_conditional_sti(regs);
17417 do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
17418 preempt_conditional_cli(regs);
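
The hunk above moves do_nmi() into this file and adds stop_nmi()/restart_nmi(). Because NMIs cannot be masked, the kernel quiesces them by raising the ignore_nmis counter (checked in do_nmi()) and disabling the ACPI NMI watchdog source. A sketch of a hypothetical caller, only to show the intended pairing; the function bodies are exactly the + lines above:

        stop_nmi();             /* acpi_nmi_disable(), then ignore_nmis++ */
        /* ... section that must not see NMI watchdog callbacks ... */
        restart_nmi();          /* ignore_nmis--, then acpi_nmi_enable() */
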
17419@@ -904,8 +895,8 @@ asmlinkage __kprobes struct pt_regs *syn
17420 asmlinkage void __kprobes do_debug(struct pt_regs * regs,
17421 unsigned long error_code)
17422 {
17423- unsigned long condition;
17424 struct task_struct *tsk = current;
17425+ unsigned long condition;
17426 siginfo_t info;
17427
17428 trace_hardirqs_fixup();
17429@@ -926,21 +917,19 @@ asmlinkage void __kprobes do_debug(struc
17430
17431 /* Mask out spurious debug traps due to lazy DR7 setting */
17432 if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
17433- if (!tsk->thread.debugreg7) {
17434+ if (!tsk->thread.debugreg7)
17435 goto clear_dr7;
17436- }
17437 }
17438
17439 tsk->thread.debugreg6 = condition;
17440
17441-
17442 /*
17443 * Single-stepping through TF: make sure we ignore any events in
17444 * kernel space (but re-enable TF when returning to user mode).
17445 */
17446 if (condition & DR_STEP) {
17447- if (!user_mode(regs))
17448- goto clear_TF_reenable;
17449+ if (!user_mode(regs))
17450+ goto clear_TF_reenable;
17451 }
17452
17453 /* Ok, finally something we can handle */
17454@@ -953,7 +942,7 @@ asmlinkage void __kprobes do_debug(struc
17455 force_sig_info(SIGTRAP, &info, tsk);
17456
17457 clear_dr7:
17458- set_debugreg(0UL, 7);
17459+ set_debugreg(0, 7);
17460 preempt_conditional_cli(regs);
17461 return;
17462
17463@@ -961,6 +950,7 @@ clear_TF_reenable:
17464 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
17465 regs->flags &= ~X86_EFLAGS_TF;
17466 preempt_conditional_cli(regs);
17467+ return;
17468 }
17469
17470 static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
17471@@ -983,7 +973,7 @@ static int kernel_math_error(struct pt_r
17472 asmlinkage void do_coprocessor_error(struct pt_regs *regs)
17473 {
17474 void __user *ip = (void __user *)(regs->ip);
17475- struct task_struct * task;
17476+ struct task_struct *task;
17477 siginfo_t info;
17478 unsigned short cwd, swd;
17479
17480@@ -1016,30 +1006,30 @@ asmlinkage void do_coprocessor_error(str
17481 cwd = get_fpu_cwd(task);
17482 swd = get_fpu_swd(task);
17483 switch (swd & ~cwd & 0x3f) {
17484- case 0x000:
17485- default:
17486- break;
17487- case 0x001: /* Invalid Op */
17488- /*
17489- * swd & 0x240 == 0x040: Stack Underflow
17490- * swd & 0x240 == 0x240: Stack Overflow
17491- * User must clear the SF bit (0x40) if set
17492- */
17493- info.si_code = FPE_FLTINV;
17494- break;
17495- case 0x002: /* Denormalize */
17496- case 0x010: /* Underflow */
17497- info.si_code = FPE_FLTUND;
17498- break;
17499- case 0x004: /* Zero Divide */
17500- info.si_code = FPE_FLTDIV;
17501- break;
17502- case 0x008: /* Overflow */
17503- info.si_code = FPE_FLTOVF;
17504- break;
17505- case 0x020: /* Precision */
17506- info.si_code = FPE_FLTRES;
17507- break;
17508+ case 0x000: /* No unmasked exception */
17509+ default: /* Multiple exceptions */
17510+ break;
17511+ case 0x001: /* Invalid Op */
17512+ /*
17513+ * swd & 0x240 == 0x040: Stack Underflow
17514+ * swd & 0x240 == 0x240: Stack Overflow
17515+ * User must clear the SF bit (0x40) if set
17516+ */
17517+ info.si_code = FPE_FLTINV;
17518+ break;
17519+ case 0x002: /* Denormalize */
17520+ case 0x010: /* Underflow */
17521+ info.si_code = FPE_FLTUND;
17522+ break;
17523+ case 0x004: /* Zero Divide */
17524+ info.si_code = FPE_FLTDIV;
17525+ break;
17526+ case 0x008: /* Overflow */
17527+ info.si_code = FPE_FLTOVF;
17528+ break;
17529+ case 0x020: /* Precision */
17530+ info.si_code = FPE_FLTRES;
17531+ break;
17532 }
17533 force_sig_info(SIGFPE, &info, task);
17534 }
17535@@ -1052,7 +1042,7 @@ asmlinkage void bad_intr(void)
17536 asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
17537 {
17538 void __user *ip = (void __user *)(regs->ip);
17539- struct task_struct * task;
17540+ struct task_struct *task;
17541 siginfo_t info;
17542 unsigned short mxcsr;
17543
17544@@ -1080,25 +1070,25 @@ asmlinkage void do_simd_coprocessor_erro
17545 */
17546 mxcsr = get_fpu_mxcsr(task);
17547 switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
17548- case 0x000:
17549- default:
17550- break;
17551- case 0x001: /* Invalid Op */
17552- info.si_code = FPE_FLTINV;
17553- break;
17554- case 0x002: /* Denormalize */
17555- case 0x010: /* Underflow */
17556- info.si_code = FPE_FLTUND;
17557- break;
17558- case 0x004: /* Zero Divide */
17559- info.si_code = FPE_FLTDIV;
17560- break;
17561- case 0x008: /* Overflow */
17562- info.si_code = FPE_FLTOVF;
17563- break;
17564- case 0x020: /* Precision */
17565- info.si_code = FPE_FLTRES;
17566- break;
17567+ case 0x000:
17568+ default:
17569+ break;
17570+ case 0x001: /* Invalid Op */
17571+ info.si_code = FPE_FLTINV;
17572+ break;
17573+ case 0x002: /* Denormalize */
17574+ case 0x010: /* Underflow */
17575+ info.si_code = FPE_FLTUND;
17576+ break;
17577+ case 0x004: /* Zero Divide */
17578+ info.si_code = FPE_FLTDIV;
17579+ break;
17580+ case 0x008: /* Overflow */
17581+ info.si_code = FPE_FLTOVF;
17582+ break;
17583+ case 0x020: /* Precision */
17584+ info.si_code = FPE_FLTRES;
17585+ break;
17586 }
17587 force_sig_info(SIGFPE, &info, task);
17588 }
17589@@ -1118,7 +1108,7 @@ asmlinkage void __attribute__((weak)) mc
17590 }
17591
17592 /*
17593- * 'math_state_restore()' saves the current math information in the
17594+ * 'math_state_restore()' saves the current math information in the
17595 * old math state array, and gets the new ones from the current task
17596 *
17597 * Careful.. There are problems with IBM-designed IRQ13 behaviour.
17598@@ -1145,7 +1135,14 @@ asmlinkage void math_state_restore(void)
17599
17600 /* clts(); */ /* 'clts' is done for us by Xen during virtual trap. */
17601
17602- restore_fpu_checking(&me->thread.xstate->fxsave);
17603+ /*
17604+ * Paranoid restore. send a SIGSEGV if we fail to restore the state.
17605+ */
17606+ if (unlikely(restore_fpu_checking(&me->thread.xstate->fxsave))) {
17607+ stts();
17608+ force_sig(SIGSEGV, me);
17609+ return;
17610+ }
17611 task_thread_info(me)->status |= TS_USEDFPU;
17612 me->fpu_counter++;
17613 }
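
math_state_restore() above now treats a failed FPU restore as fatal for the task: if restore_fpu_checking() reports an error (for example a corrupted saved FPU image), the FPU is re-disabled with stts() and the task gets SIGSEGV instead of running with undefined FPU state. The guard, same code as the + lines above with comments added:

        /* Paranoid restore: never mark the FPU usable if the saved image is bad. */
        if (unlikely(restore_fpu_checking(&me->thread.xstate->fxsave))) {
                stts();                         /* keep FPU use trapping */
                force_sig(SIGSEGV, me);
                return;
        }
        task_thread_info(me)->status |= TS_USEDFPU;
        me->fpu_counter++;
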
17614@@ -1190,13 +1187,12 @@ void __init trap_init(void)
17615 ret = HYPERVISOR_set_trap_table(trap_table);
17616 if (ret)
17617 printk("HYPERVISOR_set_trap_table failed: error %d\n", ret);
17618-
17619 /*
17620 * initialize the per thread extended state:
17621 */
17622- init_thread_xstate();
17623+ init_thread_xstate();
17624 /*
17625- * Should be a barrier for any external CPU state.
17626+ * Should be a barrier for any external CPU state:
17627 */
17628 cpu_init();
17629 }
17630@@ -1212,27 +1208,25 @@ void __cpuinit smp_trap_init(trap_info_t
17631 }
17632 }
17633
17634-
17635 static int __init oops_setup(char *s)
17636-{
17637+{
17638 if (!s)
17639 return -EINVAL;
17640 if (!strcmp(s, "panic"))
17641 panic_on_oops = 1;
17642 return 0;
17643-}
17644+}
17645 early_param("oops", oops_setup);
17646
17647 static int __init kstack_setup(char *s)
17648 {
17649 if (!s)
17650 return -EINVAL;
17651- kstack_depth_to_print = simple_strtoul(s,NULL,0);
17652+ kstack_depth_to_print = simple_strtoul(s, NULL, 0);
17653 return 0;
17654 }
17655 early_param("kstack", kstack_setup);
17656
17657-
17658 static int __init code_bytes_setup(char *s)
17659 {
17660 code_bytes = simple_strtoul(s, NULL, 0);
00e5a55c
BS
17661--- sle11-2009-06-04.orig/arch/x86/kernel/vsyscall_64-xen.c 2009-03-16 16:38:05.000000000 +0100
17662+++ sle11-2009-06-04/arch/x86/kernel/vsyscall_64-xen.c 2009-06-04 10:21:39.000000000 +0200
cc90b958
BS
17663@@ -42,7 +42,8 @@
17664 #include <asm/topology.h>
17665 #include <asm/vgtod.h>
17666
17667-#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
17668+#define __vsyscall(nr) \
17669+ __attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace
17670 #define __syscall_clobber "r11","cx","memory"
17671
17672 /*
17673@@ -264,10 +265,7 @@ static void __cpuinit vsyscall_set_cpu(i
17674 d |= cpu;
17675 d |= (node & 0xf) << 12;
17676 d |= (node >> 4) << 48;
17677- if (HYPERVISOR_update_descriptor(virt_to_machine(get_cpu_gdt_table(cpu)
17678- + GDT_ENTRY_PER_CPU),
17679- d))
17680- BUG();
17681+ write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
17682 }
17683
17684 static void __cpuinit cpu_vsyscall_init(void *arg)
17685@@ -281,7 +279,7 @@ cpu_vsyscall_notifier(struct notifier_bl
17686 {
17687 long cpu = (long)arg;
17688 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
17689- smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 0, 1);
17690+ smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1);
17691 return NOTIFY_DONE;
17692 }
17693
17694@@ -311,7 +309,7 @@ static int __init vsyscall_init(void)
17695 #ifdef CONFIG_SYSCTL
17696 register_sysctl_table(kernel_root_table2);
17697 #endif
17698- on_each_cpu(cpu_vsyscall_init, NULL, 0, 1);
17699+ on_each_cpu(cpu_vsyscall_init, NULL, 1);
17700 hotcpu_notifier(cpu_vsyscall_notifier, 0);
17701 return 0;
17702 }
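
Two API updates drive the vsyscall_64-xen.c hunks: write_gdt_entry() replaces the raw HYPERVISOR_update_descriptor() hypercall for installing the per-CPU GDT entry, and the 2.6.27 SMP cross-call helpers lost their retry/nonatomic argument, so callers now pass only a wait flag. The new call shapes used above are:

        /* cross-call one CPU and wait for completion (no retry argument anymore) */
        smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1);
        /* run on every online CPU, waiting for all of them */
        on_each_cpu(cpu_vsyscall_init, NULL, 1);
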
00e5a55c
BS
17703--- sle11-2009-06-04.orig/arch/x86/mach-xen/setup.c 2009-03-16 16:33:40.000000000 +0100
17704+++ sle11-2009-06-04/arch/x86/mach-xen/setup.c 2009-06-04 10:21:39.000000000 +0200
cc90b958
BS
17705@@ -17,6 +17,8 @@
17706 #include <xen/interface/callback.h>
17707 #include <xen/interface/memory.h>
17708
17709+#ifdef CONFIG_X86_32
17710+
17711 #ifdef CONFIG_HOTPLUG_CPU
17712 #define DEFAULT_SEND_IPI (1)
17713 #else
17714@@ -44,51 +46,6 @@ static int __init print_ipi_mode(void)
17715
17716 late_initcall(print_ipi_mode);
17717
17718-/**
17719- * machine_specific_memory_setup - Hook for machine specific memory setup.
17720- *
17721- * Description:
17722- * This is included late in kernel/setup.c so that it can make
17723- * use of all of the static functions.
17724- **/
17725-
17726-char * __init machine_specific_memory_setup(void)
17727-{
17728- int rc;
17729- struct xen_memory_map memmap;
17730- /*
17731- * This is rather large for a stack variable but this early in
17732- * the boot process we know we have plenty slack space.
17733- */
17734- struct e820entry map[E820MAX];
17735-
17736- memmap.nr_entries = E820MAX;
17737- set_xen_guest_handle(memmap.buffer, map);
17738-
17739- rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
17740- if ( rc == -ENOSYS ) {
17741- memmap.nr_entries = 1;
17742- map[0].addr = 0ULL;
17743- map[0].size = PFN_PHYS((unsigned long long)xen_start_info->nr_pages);
17744- /* 8MB slack (to balance backend allocations). */
17745- map[0].size += 8ULL << 20;
17746- map[0].type = E820_RAM;
17747- rc = 0;
17748- }
17749- BUG_ON(rc);
17750-
17751- sanitize_e820_map(map, (char *)&memmap.nr_entries);
17752-
17753- BUG_ON(copy_e820_map(map, (char)memmap.nr_entries) < 0);
17754-
17755- return "Xen";
17756-}
17757-
17758-
17759-extern void hypervisor_callback(void);
17760-extern void failsafe_callback(void);
17761-extern void nmi(void);
17762-
17763 unsigned long *machine_to_phys_mapping = (void *)MACH2PHYS_VIRT_START;
17764 EXPORT_SYMBOL(machine_to_phys_mapping);
17765 unsigned int machine_to_phys_order;
17766@@ -121,33 +78,66 @@ void __init pre_setup_arch_hook(void)
17767 (unsigned long *)xen_start_info->mfn_list;
17768 }
17769
17770+#endif /* CONFIG_X86_32 */
17771+
17772+extern void hypervisor_callback(void);
17773+extern void failsafe_callback(void);
17774+extern void nmi(void);
17775+
17776+#ifdef CONFIG_X86_64
17777+#include <asm/proto.h>
17778+#define CALLBACK_ADDR(fn) ((unsigned long)(fn))
17779+#else
17780+#define CALLBACK_ADDR(fn) { __KERNEL_CS, (unsigned long)(fn) }
17781+#endif
17782+
17783 void __init machine_specific_arch_setup(void)
17784 {
17785 int ret;
17786 static struct callback_register __initdata event = {
17787 .type = CALLBACKTYPE_event,
17788- .address = { __KERNEL_CS, (unsigned long)hypervisor_callback },
17789+ .address = CALLBACK_ADDR(hypervisor_callback)
17790 };
17791 static struct callback_register __initdata failsafe = {
17792 .type = CALLBACKTYPE_failsafe,
17793- .address = { __KERNEL_CS, (unsigned long)failsafe_callback },
17794+ .address = CALLBACK_ADDR(failsafe_callback)
17795+ };
17796+#ifdef CONFIG_X86_64
17797+ static struct callback_register __initdata syscall = {
17798+ .type = CALLBACKTYPE_syscall,
17799+ .address = CALLBACK_ADDR(system_call)
17800 };
17801+#endif
17802+#if defined(CONFIG_X86_LOCAL_APIC) || defined(CONFIG_X86_32)
17803 static struct callback_register __initdata nmi_cb = {
17804 .type = CALLBACKTYPE_nmi,
17805- .address = { __KERNEL_CS, (unsigned long)nmi },
17806+ .address = CALLBACK_ADDR(nmi)
17807 };
17808+#endif
17809
17810 ret = HYPERVISOR_callback_op(CALLBACKOP_register, &event);
17811 if (ret == 0)
17812 ret = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe);
17813+#ifdef CONFIG_X86_64
17814+ if (ret == 0)
17815+ ret = HYPERVISOR_callback_op(CALLBACKOP_register, &syscall);
17816+#endif
17817 #if CONFIG_XEN_COMPAT <= 0x030002
17818+#ifdef CONFIG_X86_32
17819 if (ret == -ENOSYS)
17820 ret = HYPERVISOR_set_callbacks(
17821 event.address.cs, event.address.eip,
17822 failsafe.address.cs, failsafe.address.eip);
17823+#else
17824+ ret = HYPERVISOR_set_callbacks(
17825+ event.address,
17826+ failsafe.address,
17827+ syscall.address);
17828+#endif
17829 #endif
17830 BUG_ON(ret);
17831
17832+#if defined(CONFIG_X86_LOCAL_APIC) || defined(CONFIG_X86_32)
17833 ret = HYPERVISOR_callback_op(CALLBACKOP_register, &nmi_cb);
17834 #if CONFIG_XEN_COMPAT <= 0x030002
17835 if (ret == -ENOSYS) {
17836@@ -158,15 +148,43 @@ void __init machine_specific_arch_setup(
17837 HYPERVISOR_nmi_op(XENNMI_register_callback, &cb);
17838 }
17839 #endif
17840+#endif
17841
17842+#ifdef CONFIG_X86_32
17843 /* Do an early initialization of the fixmap area */
17844 {
17845 extern pte_t swapper_pg_fixmap[PTRS_PER_PTE];
17846 unsigned long addr = __fix_to_virt(FIX_EARLYCON_MEM_BASE);
17847 pud_t *pud = pud_offset(swapper_pg_dir + pgd_index(addr), addr);
17848 pmd_t *pmd = pmd_offset(pud, addr);
17849+ unsigned int i;
17850
17851 make_lowmem_page_readonly(swapper_pg_fixmap, XENFEAT_writable_page_tables);
17852 set_pmd(pmd, __pmd(__pa_symbol(swapper_pg_fixmap) | _PAGE_TABLE));
17853+
17854+#define __FIXADDR_TOP (-PAGE_SIZE)
17855+#define FIX_BUG_ON(fix) BUILD_BUG_ON(pmd_index(__fix_to_virt(FIX_##fix)) \
17856+ != pmd_index(__fix_to_virt(FIX_EARLYCON_MEM_BASE)))
17857+ FIX_BUG_ON(SHARED_INFO);
17858+ FIX_BUG_ON(ISAMAP_BEGIN);
17859+ FIX_BUG_ON(ISAMAP_END);
17860+#undef __FIXADDR_TOP
17861+ BUG_ON(pte_index(hypervisor_virt_start));
17862+
17863+ /* Switch to the real shared_info page, and clear the
17864+ * dummy page. */
17865+ set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
17866+ HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
17867+ memset(empty_zero_page, 0, sizeof(empty_zero_page));
17868+
17869+ /* Setup mapping of lower 1st MB */
17870+ for (i = 0; i < NR_FIX_ISAMAPS; i++)
17871+ if (is_initial_xendomain())
17872+ set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE);
17873+ else
17874+ __set_fixmap(FIX_ISAMAP_BEGIN - i,
17875+ virt_to_machine(empty_zero_page),
17876+ PAGE_KERNEL_RO);
17877 }
17878+#endif
17879 }
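
machine_specific_arch_setup() above is made bi-arch: the Xen callback addresses are wrapped in CALLBACK_ADDR(), which expands to a plain function address on x86-64 but to a selector/offset pair on 32-bit, and x86-64 additionally registers a CALLBACKTYPE_syscall callback. The macro, verbatim from the hunk, with a comment added:

        /* 32-bit: callbacks are {cs, eip} pairs; 64-bit: plain addresses. */
        #ifdef CONFIG_X86_64
        #define CALLBACK_ADDR(fn)       ((unsigned long)(fn))
        #else
        #define CALLBACK_ADDR(fn)       { __KERNEL_CS, (unsigned long)(fn) }
        #endif
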
00e5a55c
BS
17880--- sle11-2009-06-04.orig/arch/x86/mm/fault-xen.c 2009-03-16 16:38:05.000000000 +0100
17881+++ sle11-2009-06-04/arch/x86/mm/fault-xen.c 2009-06-04 10:21:39.000000000 +0200
cc90b958
BS
17882@@ -10,6 +10,7 @@
17883 #include <linux/string.h>
17884 #include <linux/types.h>
17885 #include <linux/ptrace.h>
17886+#include <linux/mmiotrace.h>
17887 #include <linux/mman.h>
17888 #include <linux/mm.h>
17889 #include <linux/smp.h>
17890@@ -49,17 +50,23 @@
17891 #define PF_RSVD (1<<3)
17892 #define PF_INSTR (1<<4)
17893
17894+static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
17895+{
17896+#ifdef CONFIG_MMIOTRACE_HOOKS
17897+ if (unlikely(is_kmmio_active()))
17898+ if (kmmio_handler(regs, addr) == 1)
17899+ return -1;
17900+#endif
17901+ return 0;
17902+}
17903+
17904 static inline int notify_page_fault(struct pt_regs *regs)
17905 {
17906 #ifdef CONFIG_KPROBES
17907 int ret = 0;
17908
17909 /* kprobe_running() needs smp_processor_id() */
17910-#ifdef CONFIG_X86_32
17911 if (!user_mode_vm(regs)) {
17912-#else
17913- if (!user_mode(regs)) {
17914-#endif
17915 preempt_disable();
17916 if (kprobe_running() && kprobe_fault_handler(regs, 14))
17917 ret = 1;
17918@@ -409,11 +416,7 @@ static void show_fault_oops(struct pt_re
17919 printk(KERN_CONT "NULL pointer dereference");
17920 else
17921 printk(KERN_CONT "paging request");
17922-#ifdef CONFIG_X86_32
17923- printk(KERN_CONT " at %08lx\n", address);
17924-#else
17925- printk(KERN_CONT " at %016lx\n", address);
17926-#endif
17927+ printk(KERN_CONT " at %p\n", (void *) address);
17928 printk(KERN_ALERT "IP:");
17929 printk_address(regs->ip, 1);
17930 dump_pagetable(address);
17931@@ -628,6 +631,8 @@ void __kprobes do_page_fault(struct pt_r
17932
17933 if (notify_page_fault(regs))
17934 return;
17935+ if (unlikely(kmmio_fault(regs, address)))
17936+ return;
17937
17938 /*
17939 * We fault-in kernel-space virtual memory on-demand. The
17940@@ -831,14 +836,10 @@ bad_area_nosemaphore:
17941 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
17942 printk_ratelimit()) {
17943 printk(
17944-#ifdef CONFIG_X86_32
17945- "%s%s[%d]: segfault at %lx ip %08lx sp %08lx error %lx",
17946-#else
17947- "%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx",
17948-#endif
17949+ "%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
17950 task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
17951- tsk->comm, task_pid_nr(tsk), address, regs->ip,
17952- regs->sp, error_code);
17953+ tsk->comm, task_pid_nr(tsk), address,
17954+ (void *) regs->ip, (void *) regs->sp, error_code);
17955 print_vma_addr(" in ", regs->ip);
17956 printk("\n");
17957 }
17958@@ -946,81 +947,45 @@ LIST_HEAD(pgd_list);
17959 void vmalloc_sync_all(void)
17960 {
17961 #ifdef CONFIG_X86_32
17962- /*
17963- * Note that races in the updates of insync and start aren't
17964- * problematic: insync can only get set bits added, and updates to
17965- * start are only improving performance (without affecting correctness
17966- * if undone).
17967- * XEN: To work on PAE, we need to iterate over PMDs rather than PGDs.
17968- * This change works just fine with 2-level paging too.
17969- */
17970-#define sync_index(a) ((a) >> PMD_SHIFT)
17971- static DECLARE_BITMAP(insync, PTRS_PER_PGD*PTRS_PER_PMD);
17972- static unsigned long start = TASK_SIZE;
17973- unsigned long address;
17974+ unsigned long address = VMALLOC_START & PGDIR_MASK;
17975
17976 if (SHARED_KERNEL_PMD)
17977 return;
17978
17979 BUILD_BUG_ON(TASK_SIZE & ~PMD_MASK);
17980- for (address = start;
17981- address < hypervisor_virt_start;
17982- address += PMD_SIZE) {
17983- if (!test_bit(sync_index(address), insync)) {
17984- unsigned long flags;
17985- struct page *page;
17986-
17987- spin_lock_irqsave(&pgd_lock, flags);
17988- /* XEN: failure path assumes non-empty pgd_list. */
17989- if (unlikely(list_empty(&pgd_list))) {
17990- spin_unlock_irqrestore(&pgd_lock, flags);
17991- return;
17992- }
17993- list_for_each_entry(page, &pgd_list, lru) {
17994- if (!vmalloc_sync_one(page_address(page),
17995- address))
17996- break;
17997- }
17998- spin_unlock_irqrestore(&pgd_lock, flags);
17999- if (!page)
18000- set_bit(sync_index(address), insync);
18001+ for (; address < hypervisor_virt_start; address += PMD_SIZE) {
18002+ unsigned long flags;
18003+ struct page *page;
18004+
18005+ spin_lock_irqsave(&pgd_lock, flags);
18006+ list_for_each_entry(page, &pgd_list, lru) {
18007+ if (!vmalloc_sync_one(page_address(page),
18008+ address))
18009+ break;
18010 }
18011- if (address == start && test_bit(sync_index(address), insync))
18012- start = address + PMD_SIZE;
18013+ spin_unlock_irqrestore(&pgd_lock, flags);
18014 }
18015 #else /* CONFIG_X86_64 */
18016- /*
18017- * Note that races in the updates of insync and start aren't
18018- * problematic: insync can only get set bits added, and updates to
18019- * start are only improving performance (without affecting correctness
18020- * if undone).
18021- */
18022- static DECLARE_BITMAP(insync, PTRS_PER_PGD);
18023- static unsigned long start = VMALLOC_START & PGDIR_MASK;
18024+ unsigned long start = VMALLOC_START & PGDIR_MASK;
18025 unsigned long address;
18026
18027 for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
18028- if (!test_bit(pgd_index(address), insync)) {
18029- const pgd_t *pgd_ref = pgd_offset_k(address);
18030- unsigned long flags;
18031- struct page *page;
18032-
18033- if (pgd_none(*pgd_ref))
18034- continue;
18035- spin_lock_irqsave(&pgd_lock, flags);
18036- list_for_each_entry(page, &pgd_list, lru) {
18037- pgd_t *pgd;
18038- pgd = (pgd_t *)page_address(page) + pgd_index(address);
18039- if (pgd_none(*pgd))
18040- set_pgd(pgd, *pgd_ref);
18041- else
18042- BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
18043- }
18044- spin_unlock_irqrestore(&pgd_lock, flags);
18045- set_bit(pgd_index(address), insync);
18046+ const pgd_t *pgd_ref = pgd_offset_k(address);
18047+ unsigned long flags;
18048+ struct page *page;
18049+
18050+ if (pgd_none(*pgd_ref))
18051+ continue;
18052+ spin_lock_irqsave(&pgd_lock, flags);
18053+ list_for_each_entry(page, &pgd_list, lru) {
18054+ pgd_t *pgd;
18055+ pgd = (pgd_t *)page_address(page) + pgd_index(address);
18056+ if (pgd_none(*pgd))
18057+ set_pgd(pgd, *pgd_ref);
18058+ else
18059+ BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
18060 }
18061- if (address == start)
18062- start = address + PGDIR_SIZE;
18063+ spin_unlock_irqrestore(&pgd_lock, flags);
18064 }
18065 #endif
18066 }
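The rewritten vmalloc_sync_all() above drops the old insync bitmap and cached start address: every call now walks the whole vmalloc range and copies any missing kernel mapping into each process page directory on pgd_list. A minimal userspace sketch of that propagation (toy arrays stand in for the pgds; none of this is the kernel's data structure):

#include <assert.h>
#include <stdio.h>

#define PGD_SLOTS 8     /* pretend the vmalloc area spans 8 pgd entries */
#define NR_TASKS  3

static unsigned long reference_pgd[PGD_SLOTS] = { 0, 0x1000, 0, 0x3000, 0, 0, 0x6000, 0 };
static unsigned long task_pgd[NR_TASKS][PGD_SLOTS];    /* starts out empty */

static void vmalloc_sync_all(void)
{
        for (int slot = 0; slot < PGD_SLOTS; slot++) {
                if (!reference_pgd[slot])
                        continue;               /* nothing mapped here yet */
                for (int t = 0; t < NR_TASKS; t++) {
                        if (!task_pgd[t][slot])
                                task_pgd[t][slot] = reference_pgd[slot];
                        else    /* an existing entry must already agree */
                                assert(task_pgd[t][slot] == reference_pgd[slot]);
                }
        }
}

int main(void)
{
        vmalloc_sync_all();
        printf("task 0, slot 3 now points at %#lx\n", task_pgd[0][3]);
        return 0;
}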
18067--- sle11-2009-06-04.orig/arch/x86/mm/hypervisor.c 2009-05-14 11:18:39.000000000 +0200
18068+++ sle11-2009-06-04/arch/x86/mm/hypervisor.c 2009-06-04 10:21:39.000000000 +0200
18069@@ -709,6 +709,72 @@ void xen_destroy_contiguous_region(unsig
18070 }
18071 EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region);
18072
18073+int __init early_create_contiguous_region(unsigned long pfn,
18074+ unsigned int order,
18075+ unsigned int address_bits)
18076+{
18077+ unsigned long *in_frames = discontig_frames, out_frame = pfn;
18078+ unsigned int i;
18079+ int rc, success;
18080+ struct xen_memory_exchange exchange = {
18081+ .in = {
18082+ .nr_extents = 1UL << order,
18083+ .extent_order = 0,
18084+ .domid = DOMID_SELF
18085+ },
18086+ .out = {
18087+ .nr_extents = 1,
18088+ .extent_order = order,
18089+ .address_bits = address_bits,
18090+ .domid = DOMID_SELF
18091+ }
18092+ };
18093+
18094+ if (xen_feature(XENFEAT_auto_translated_physmap))
18095+ return 0;
18096+
18097+ if (unlikely(order > MAX_CONTIG_ORDER))
18098+ return -ENOMEM;
18099+
18100+ for (i = 0; i < (1U << order); ++i) {
18101+ in_frames[i] = pfn_to_mfn(pfn + i);
18102+ set_phys_to_machine(pfn + i, INVALID_P2M_ENTRY);
18103+ }
18104+
18105+ set_xen_guest_handle(exchange.in.extent_start, in_frames);
18106+ set_xen_guest_handle(exchange.out.extent_start, &out_frame);
18107+
18108+ rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
18109+ success = (exchange.nr_exchanged == (1UL << order));
18110+ BUG_ON(!success && (exchange.nr_exchanged || !rc));
18111+ BUG_ON(success && rc);
18112+#if CONFIG_XEN_COMPAT <= 0x030002
18113+ if (unlikely(rc == -ENOSYS)) {
18114+ /* Compatibility when XENMEM_exchange is unavailable. */
18115+ if (HYPERVISOR_memory_op(XENMEM_decrease_reservation,
18116+ &exchange.in) != (1UL << order))
18117+ BUG();
18118+ success = (HYPERVISOR_memory_op(XENMEM_populate_physmap,
18119+ &exchange.out) == 1);
18120+ if (!success) {
18121+ for (i = 0; i < (1U << order); ++i)
18122+ in_frames[i] = pfn + i;
18123+ if (HYPERVISOR_memory_op(XENMEM_populate_physmap,
18124+ &exchange.in) != (1UL << order))
18125+ BUG();
18126+ }
18127+ }
18128+#endif
18129+
18130+ for (i = 0; i < (1U << order); ++i, ++out_frame) {
18131+ if (!success)
18132+ out_frame = in_frames[i];
18133+ set_phys_to_machine(pfn + i, out_frame);
18134+ }
18135+
18136+ return success ? 0 : -ENOMEM;
18137+}
18138+
18139 static void undo_limit_pages(struct page *pages, unsigned int order)
18140 {
18141 BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
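early_create_contiguous_region(), added in the hunk above, trades 2^order scattered machine frames for one machine-contiguous extent via XENMEM_exchange and then rewrites the phys-to-machine map to match. Its final bookkeeping loop can be sketched in plain C as follows (toy p2m array; the hypercall and set_phys_to_machine() themselves are left out):

#include <stdio.h>

#define ORDER 2
#define N     (1U << ORDER)

static unsigned long p2m[N];            /* toy pfn -> mfn table for the region */

/* mirrors the final fixup loop of early_create_contiguous_region() */
static void fixup_p2m(const unsigned long in_frames[N], unsigned long out_frame,
                      int success)
{
        for (unsigned int i = 0; i < N; i++, out_frame++) {
                if (!success)
                        out_frame = in_frames[i];   /* put the old frame back */
                p2m[i] = out_frame;                 /* set_phys_to_machine() */
        }
}

int main(void)
{
        unsigned long old[N] = { 7, 42, 13, 99 };   /* scattered machine frames */

        fixup_p2m(old, 1024, 1);                    /* exchange succeeded */
        for (unsigned int i = 0; i < N; i++)
                printf("pfn %u -> mfn %lu\n", i, p2m[i]);
        return 0;
}

On success the region ends up backed by mfns 1024..1027; on failure the original scattered frames are written back, so the p2m map stays consistent either way.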
18142@@ -875,42 +941,9 @@ int write_ldt_entry(struct desc_struct *
18143 return HYPERVISOR_update_descriptor(mach_lp, *(const u64*)desc);
18144 }
18145
18146-#define MAX_BATCHED_FULL_PTES 32
18147-
18148-int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
18149- unsigned long addr, unsigned long end, pgprot_t newprot,
18150- int dirty_accountable)
18151+int write_gdt_entry(struct desc_struct *gdt, int entry, const void *desc,
18152+ int type)
18153 {
18154- int rc = 0, i = 0;
18155- mmu_update_t u[MAX_BATCHED_FULL_PTES];
18156- pte_t *pte;
18157- spinlock_t *ptl;
18158-
18159- if (!xen_feature(XENFEAT_mmu_pt_update_preserve_ad))
18160- return 0;
18161-
18162- pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
18163- do {
18164- if (pte_present(*pte)) {
18165- pte_t ptent = pte_modify(*pte, newprot);
18166-
18167- if (dirty_accountable && pte_dirty(ptent))
18168- ptent = pte_mkwrite(ptent);
18169- u[i].ptr = (__pmd_val(*pmd) & PHYSICAL_PAGE_MASK)
18170- | ((unsigned long)pte & ~PAGE_MASK)
18171- | MMU_PT_UPDATE_PRESERVE_AD;
18172- u[i].val = __pte_val(ptent);
18173- if (++i == MAX_BATCHED_FULL_PTES) {
18174- if ((rc = HYPERVISOR_mmu_update(
18175- &u[0], i, NULL, DOMID_SELF)) != 0)
18176- break;
18177- i = 0;
18178- }
18179- }
18180- } while (pte++, addr += PAGE_SIZE, addr != end);
18181- if (i)
18182- rc = HYPERVISOR_mmu_update( &u[0], i, NULL, DOMID_SELF);
18183- pte_unmap_unlock(pte - 1, ptl);
18184- BUG_ON(rc && rc != -ENOSYS);
18185- return !rc;
18186+ maddr_t mach_gp = virt_to_machine(gdt + entry);
18187+ return HYPERVISOR_update_descriptor(mach_gp, *(const u64*)desc);
18188 }
18189--- sle11-2009-06-04.orig/arch/x86/mm/init_32-xen.c 2009-03-16 16:38:05.000000000 +0100
18190+++ sle11-2009-06-04/arch/x86/mm/init_32-xen.c 2009-06-04 10:21:39.000000000 +0200
18191@@ -54,6 +54,7 @@
18192
18193 unsigned int __VMALLOC_RESERVE = 128 << 20;
18194
18195+unsigned long max_low_pfn_mapped;
18196 unsigned long max_pfn_mapped;
18197
18198 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
18199@@ -61,6 +62,27 @@ unsigned long highstart_pfn, highend_pfn
18200
18201 static noinline int do_test_wp_bit(void);
18202
18203+
18204+static unsigned long __initdata table_start;
18205+static unsigned long __initdata table_end;
18206+static unsigned long __initdata table_top;
18207+
18208+static int __initdata after_init_bootmem;
18209+
18210+static __init void *alloc_low_page(unsigned long *phys)
18211+{
18212+ unsigned long pfn = table_end++;
18213+ void *adr;
18214+
18215+ if (pfn >= table_top)
18216+ panic("alloc_low_page: ran out of memory");
18217+
18218+ adr = __va(pfn * PAGE_SIZE);
18219+ memset(adr, 0, PAGE_SIZE);
18220+ *phys = pfn * PAGE_SIZE;
18221+ return adr;
18222+}
18223+
18224 /*
18225 * Creates a middle page table and puts a pointer to it in the
18226 * given global directory entry. This only returns the gd entry
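Before bootmem is initialised, the 32-bit code in the hunk above hands out page-table pages with a simple bump allocator over the [table_start, table_top) frame window. A self-contained sketch of that allocator (a flat malloc'ed buffer stands in for the kernel direct mapping, which is an assumption of the sketch):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define PAGE_SIZE 4096UL

static unsigned long table_end;                 /* next free pfn */
static unsigned long table_top;                 /* first pfn past the pool */
static unsigned char *pool;                     /* stands in for __va(0) */

static void *alloc_low_page(unsigned long *phys)
{
        unsigned long pfn = table_end++;
        void *adr;

        if (pfn >= table_top) {
                fprintf(stderr, "alloc_low_page: ran out of memory\n");
                exit(1);                        /* panic() in the kernel */
        }
        adr = pool + pfn * PAGE_SIZE;           /* __va(pfn * PAGE_SIZE) */
        memset(adr, 0, PAGE_SIZE);
        *phys = pfn * PAGE_SIZE;
        return adr;
}

int main(void)
{
        unsigned long phys;
        void *p;

        table_end = 0;
        table_top = 4;                          /* a 4-page pool */
        pool = calloc(table_top, PAGE_SIZE);

        p = alloc_low_page(&phys);
        printf("got page at %p, phys %#lx\n", p, phys);
        free(pool);
        return 0;
}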
18227@@ -72,9 +94,12 @@ static pmd_t * __init one_md_table_init(
18228 pmd_t *pmd_table;
18229
18230 #ifdef CONFIG_X86_PAE
18231+ unsigned long phys;
18232 if (!(__pgd_val(*pgd) & _PAGE_PRESENT)) {
18233- pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
18234-
18235+ if (after_init_bootmem)
18236+ pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
18237+ else
18238+ pmd_table = (pmd_t *)alloc_low_page(&phys);
18239 paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
18240 make_lowmem_page_readonly(pmd_table, XENFEAT_writable_page_tables);
18241 set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
18242@@ -101,12 +126,16 @@ static pte_t * __init one_page_table_ini
18243 #endif
18244 pte_t *page_table = NULL;
18245
18246+ if (after_init_bootmem) {
18247 #ifdef CONFIG_DEBUG_PAGEALLOC
18248- page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
18249+ page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
18250 #endif
18251- if (!page_table) {
18252- page_table =
18253+ if (!page_table)
18254+ page_table =
18255 (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE);
18256+ } else {
18257+ unsigned long phys;
18258+ page_table = (pte_t *)alloc_low_page(&phys);
18259 }
18260
18261 paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT);
18262@@ -167,24 +196,24 @@ static inline int is_kernel_text(unsigne
18263 * of max_low_pfn pages, by creating page tables starting from address
18264 * PAGE_OFFSET:
18265 */
18266-static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
18267+static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
18268+ unsigned long start_pfn,
18269+ unsigned long end_pfn,
18270+ int use_pse)
18271 {
18272 int pgd_idx, pmd_idx, pte_ofs;
18273 unsigned long pfn;
18274 pgd_t *pgd;
18275 pmd_t *pmd;
18276 pte_t *pte;
18277+ unsigned pages_2m = 0, pages_4k = 0;
18278
18279- unsigned long max_ram_pfn = xen_start_info->nr_pages;
18280- if (max_ram_pfn > max_low_pfn)
18281- max_ram_pfn = max_low_pfn;
18282+ if (!cpu_has_pse)
18283+ use_pse = 0;
18284
18285- pgd_idx = pgd_index(PAGE_OFFSET);
18286+ pfn = start_pfn;
18287+ pgd_idx = pgd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
18288 pgd = pgd_base + pgd_idx;
18289- pfn = 0;
18290- pmd_idx = pmd_index(PAGE_OFFSET);
18291- pte_ofs = pte_index(PAGE_OFFSET);
18292-
18293 for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
18294 #ifdef CONFIG_XEN
18295 /*
18296@@ -198,10 +227,16 @@ static void __init kernel_physical_mappi
18297 #else
18298 pmd = one_md_table_init(pgd);
18299 #endif
18300- if (pfn >= max_low_pfn)
18301+
18302+ if (pfn >= end_pfn)
18303 continue;
18304+#ifdef CONFIG_X86_PAE
18305+ pmd_idx = pmd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
18306 pmd += pmd_idx;
18307- for (; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn;
18308+#else
18309+ pmd_idx = 0;
18310+#endif
18311+ for (; pmd_idx < PTRS_PER_PMD && pfn < end_pfn;
18312 pmd++, pmd_idx++) {
18313 unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET;
18314
18315@@ -211,13 +246,8 @@ static void __init kernel_physical_mappi
18316 /*
18317 * Map with big pages if possible, otherwise
18318 * create normal page tables:
18319- *
18320- * Don't use a large page for the first 2/4MB of memory
18321- * because there are often fixed size MTRRs in there
18322- * and overlapping MTRRs into large pages can cause
18323- * slowdowns.
18324 */
18325- if (cpu_has_pse && !(pgd_idx == 0 && pmd_idx == 0)) {
18326+ if (use_pse) {
18327 unsigned int addr2;
18328 pgprot_t prot = PAGE_KERNEL_LARGE;
18329
18330@@ -228,49 +258,35 @@ static void __init kernel_physical_mappi
18331 is_kernel_text(addr2))
18332 prot = PAGE_KERNEL_LARGE_EXEC;
18333
18334+ pages_2m++;
18335 set_pmd(pmd, pfn_pmd(pfn, prot));
18336
18337 pfn += PTRS_PER_PTE;
18338- max_pfn_mapped = pfn;
18339 continue;
18340 }
18341 pte = one_page_table_init(pmd);
18342
18343- for (pte += pte_ofs;
18344- pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn;
18345+ pte_ofs = pte_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
18346+ pte += pte_ofs;
18347+ for (; pte_ofs < PTRS_PER_PTE && pfn < end_pfn;
18348 pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) {
18349 pgprot_t prot = PAGE_KERNEL;
18350
18351 /* XEN: Only map initial RAM allocation. */
18352- if ((pfn >= max_ram_pfn) || pte_present(*pte))
18353+ if (pfn >= xen_start_info->nr_pages || pte_present(*pte))
18354 continue;
18355 if (is_kernel_text(addr))
18356 prot = PAGE_KERNEL_EXEC;
18357
18358+ pages_4k++;
18359 set_pte(pte, pfn_pte(pfn, prot));
18360 }
18361- max_pfn_mapped = pfn;
18362- pte_ofs = 0;
18363 }
18364- pmd_idx = 0;
18365 }
18366+ update_page_count(PG_LEVEL_2M, pages_2m);
18367+ update_page_count(PG_LEVEL_4K, pages_4k);
18368 }
18369
18370-#ifndef CONFIG_XEN
18371-
18372-static inline int page_kills_ppro(unsigned long pagenr)
18373-{
18374- if (pagenr >= 0x70000 && pagenr <= 0x7003F)
18375- return 1;
18376- return 0;
18377-}
18378-
18379-#else
18380-
18381-#define page_kills_ppro(p) 0
18382-
18383-#endif
18384-
18385 /*
18386 * devmem_is_allowed() checks to see if /dev/mem access to a certain address
18387 * is valid. The argument is a physical page number.
18388@@ -331,30 +347,63 @@ static void __init permanent_kmaps_init(
18389 pkmap_page_table = pte;
18390 }
18391
18392-void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro)
18393+static void __init add_one_highpage_init(struct page *page, int pfn)
18394 {
18395- if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) {
18396- ClearPageReserved(page);
18397- init_page_count(page);
18398- if (pfn < xen_start_info->nr_pages)
18399- __free_page(page);
18400- totalhigh_pages++;
18401- } else
18402- SetPageReserved(page);
18403+ ClearPageReserved(page);
18404+ init_page_count(page);
18405+ if (pfn < xen_start_info->nr_pages)
18406+ __free_page(page);
18407+ totalhigh_pages++;
18408+}
18409+
18410+struct add_highpages_data {
18411+ unsigned long start_pfn;
18412+ unsigned long end_pfn;
18413+};
18414+
18415+static int __init add_highpages_work_fn(unsigned long start_pfn,
18416+ unsigned long end_pfn, void *datax)
18417+{
18418+ int node_pfn;
18419+ struct page *page;
18420+ unsigned long final_start_pfn, final_end_pfn;
18421+ struct add_highpages_data *data;
18422+
18423+ data = (struct add_highpages_data *)datax;
18424+
18425+ final_start_pfn = max(start_pfn, data->start_pfn);
18426+ final_end_pfn = min(end_pfn, data->end_pfn);
18427+ if (final_start_pfn >= final_end_pfn)
18428+ return 0;
18429+
18430+ for (node_pfn = final_start_pfn; node_pfn < final_end_pfn;
18431+ node_pfn++) {
18432+ if (!pfn_valid(node_pfn))
18433+ continue;
18434+ page = pfn_to_page(node_pfn);
18435+ add_one_highpage_init(page, node_pfn);
18436+ }
18437+
18438+ return 0;
18439+
18440+}
18441+
18442+void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn,
18443+ unsigned long end_pfn)
18444+{
18445+ struct add_highpages_data data;
18446+
18447+ data.start_pfn = start_pfn;
18448+ data.end_pfn = end_pfn;
18449+
18450+ work_with_active_regions(nid, add_highpages_work_fn, &data);
18451 }
18452
18453 #ifndef CONFIG_NUMA
18454-static void __init set_highmem_pages_init(int bad_ppro)
18455+static void __init set_highmem_pages_init(void)
18456 {
18457- int pfn;
18458+ add_highpages_with_active_regions(0, highstart_pfn, highend_pfn);
18459
18460- for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) {
18461- /*
18462- * Holes under sparsemem might not have no mem_map[]:
18463- */
18464- if (pfn_valid(pfn))
18465- add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro);
18466- }
18467 totalram_pages += totalhigh_pages;
18468 }
18469 #endif /* !CONFIG_NUMA */
18470@@ -362,24 +411,11 @@ static void __init set_highmem_pages_ini
18471 #else
18472 # define kmap_init() do { } while (0)
18473 # define permanent_kmaps_init(pgd_base) do { } while (0)
18474-# define set_highmem_pages_init(bad_ppro) do { } while (0)
18475+# define set_highmem_pages_init() do { } while (0)
18476 #endif /* CONFIG_HIGHMEM */
18477
18478-pteval_t __PAGE_KERNEL = _PAGE_KERNEL;
18479-EXPORT_SYMBOL(__PAGE_KERNEL);
18480-
18481-pteval_t __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
18482-
18483 pgd_t *swapper_pg_dir;
18484
18485-static void __init xen_pagetable_setup_start(pgd_t *base)
18486-{
18487-}
18488-
18489-static void __init xen_pagetable_setup_done(pgd_t *base)
18490-{
18491-}
18492-
18493 /*
18494 * Build a proper pagetable for the kernel mappings. Up until this
18495 * point, we've been running on some set of pagetables constructed by
18496@@ -399,27 +435,10 @@ static void __init xen_pagetable_setup_d
18497 * be partially populated, and so it avoids stomping on any existing
18498 * mappings.
18499 */
18500-static void __init pagetable_init(void)
18501+static void __init early_ioremap_page_table_range_init(pgd_t *pgd_base)
18502 {
18503- pgd_t *pgd_base = (pgd_t *)xen_start_info->pt_base;
18504 unsigned long vaddr, end;
18505
18506- xen_pagetable_setup_start(pgd_base);
18507-
18508- /* Enable PSE if available */
18509- if (cpu_has_pse)
18510- set_in_cr4(X86_CR4_PSE);
18511-
18512- /* Enable PGE if available */
18513- if (cpu_has_pge) {
18514- set_in_cr4(X86_CR4_PGE);
18515- __PAGE_KERNEL |= _PAGE_GLOBAL;
18516- __PAGE_KERNEL_EXEC |= _PAGE_GLOBAL;
18517- }
18518-
18519- kernel_physical_mapping_init(pgd_base);
18520- remap_numa_kva();
18521-
18522 /*
18523 * Fixed mappings, only the page table structure has to be
18524 * created - mappings will be set by set_fixmap():
18525@@ -429,10 +448,13 @@ static void __init pagetable_init(void)
18526 end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
18527 page_table_range_init(vaddr, end, pgd_base);
18528 early_ioremap_reset();
18529+}
18530
18531- permanent_kmaps_init(pgd_base);
18532+static void __init pagetable_init(void)
18533+{
18534+ pgd_t *pgd_base = (pgd_t *)xen_start_info->pt_base;
18535
18536- xen_pagetable_setup_done(pgd_base);
18537+ permanent_kmaps_init(pgd_base);
18538 }
18539
18540 #if defined(CONFIG_ACPI_SLEEP) && !defined(CONFIG_XEN)
18541@@ -475,7 +497,7 @@ void zap_low_mappings(void)
18542
18543 int nx_enabled;
18544
18545-pteval_t __supported_pte_mask __read_mostly = ~_PAGE_NX;
18546+pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL);
18547 EXPORT_SYMBOL_GPL(__supported_pte_mask);
18548
18549 #ifdef CONFIG_X86_PAE
18550@@ -528,42 +550,364 @@ static void __init set_nx(void)
18551 }
18552 #endif
18553
18554+/* user-defined highmem size */
18555+static unsigned int highmem_pages = -1;
18556+
18557 /*
18558- * paging_init() sets up the page tables - note that the first 8MB are
18559- * already mapped by head.S.
18560- *
18561- * This routines also unmaps the page at virtual kernel address 0, so
18562- * that we can trap those pesky NULL-reference errors in the kernel.
18563+ * highmem=size forces highmem to be exactly 'size' bytes.
18564+ * This works even on boxes that have no highmem otherwise.
18565+ * This also works to reduce highmem size on bigger boxes.
18566 */
18567-void __init paging_init(void)
18568+static int __init parse_highmem(char *arg)
18569+{
18570+ if (!arg)
18571+ return -EINVAL;
18572+
18573+ highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT;
18574+ return 0;
18575+}
18576+early_param("highmem", parse_highmem);
18577+
18578+/*
18579+ * Determine low and high memory ranges:
18580+ */
18581+void __init find_low_pfn_range(void)
18582+{
18583+ /* it could update max_pfn */
18584+
18585+ /* max_low_pfn is 0, we already have early_res support */
18586+
18587+ max_low_pfn = max_pfn;
18588+ if (max_low_pfn > MAXMEM_PFN) {
18589+ if (highmem_pages == -1)
18590+ highmem_pages = max_pfn - MAXMEM_PFN;
18591+ if (highmem_pages + MAXMEM_PFN < max_pfn)
18592+ max_pfn = MAXMEM_PFN + highmem_pages;
18593+ if (highmem_pages + MAXMEM_PFN > max_pfn) {
18594+ printk(KERN_WARNING "only %luMB highmem pages "
18595+ "available, ignoring highmem size of %uMB.\n",
18596+ pages_to_mb(max_pfn - MAXMEM_PFN),
18597+ pages_to_mb(highmem_pages));
18598+ highmem_pages = 0;
18599+ }
18600+ max_low_pfn = MAXMEM_PFN;
18601+#ifndef CONFIG_HIGHMEM
18602+ /* Maximum memory usable is what is directly addressable */
18603+ printk(KERN_WARNING "Warning only %ldMB will be used.\n",
18604+ MAXMEM>>20);
18605+ if (max_pfn > MAX_NONPAE_PFN)
18606+ printk(KERN_WARNING
18607+ "Use a HIGHMEM64G enabled kernel.\n");
18608+ else
18609+ printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
18610+ max_pfn = MAXMEM_PFN;
18611+#else /* !CONFIG_HIGHMEM */
18612+#ifndef CONFIG_HIGHMEM64G
18613+ if (max_pfn > MAX_NONPAE_PFN) {
18614+ max_pfn = MAX_NONPAE_PFN;
18615+ printk(KERN_WARNING "Warning only 4GB will be used."
18616+ "Use a HIGHMEM64G enabled kernel.\n");
18617+ }
18618+#endif /* !CONFIG_HIGHMEM64G */
18619+#endif /* !CONFIG_HIGHMEM */
18620+ } else {
18621+ if (highmem_pages == -1)
18622+ highmem_pages = 0;
18623+#ifdef CONFIG_HIGHMEM
18624+ if (highmem_pages >= max_pfn) {
18625+ printk(KERN_ERR "highmem size specified (%uMB) is "
18626+ "bigger than pages available (%luMB)!.\n",
18627+ pages_to_mb(highmem_pages),
18628+ pages_to_mb(max_pfn));
18629+ highmem_pages = 0;
18630+ }
18631+ if (highmem_pages) {
18632+ if (max_low_pfn - highmem_pages <
18633+ 64*1024*1024/PAGE_SIZE){
18634+ printk(KERN_ERR "highmem size %uMB results in "
18635+ "smaller than 64MB lowmem, ignoring it.\n"
18636+ , pages_to_mb(highmem_pages));
18637+ highmem_pages = 0;
18638+ }
18639+ max_low_pfn -= highmem_pages;
18640+ }
18641+#else
18642+ if (highmem_pages)
18643+ printk(KERN_ERR "ignoring highmem size on non-highmem"
18644+ " kernel!\n");
18645+#endif
18646+ }
18647+}
18648+
18649+#ifndef CONFIG_NEED_MULTIPLE_NODES
18650+void __init initmem_init(unsigned long start_pfn,
18651+ unsigned long end_pfn)
18652+{
18653+#ifdef CONFIG_HIGHMEM
18654+ highstart_pfn = highend_pfn = max_pfn;
18655+ if (max_pfn > max_low_pfn)
18656+ highstart_pfn = max_low_pfn;
18657+ memory_present(0, 0, highend_pfn);
18658+ e820_register_active_regions(0, 0, highend_pfn);
18659+ printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
18660+ pages_to_mb(highend_pfn - highstart_pfn));
18661+ num_physpages = highend_pfn;
18662+ high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
18663+#else
18664+ memory_present(0, 0, max_low_pfn);
18665+ e820_register_active_regions(0, 0, max_low_pfn);
18666+ num_physpages = max_low_pfn;
18667+ high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
18668+#endif
18669+#ifdef CONFIG_FLATMEM
18670+ max_mapnr = num_physpages;
18671+#endif
18672+ printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
18673+ pages_to_mb(max_low_pfn));
18674+
18675+ setup_bootmem_allocator();
18676+}
18677+#endif /* !CONFIG_NEED_MULTIPLE_NODES */
18678+
18679+static void __init zone_sizes_init(void)
18680+{
18681+ unsigned long max_zone_pfns[MAX_NR_ZONES];
18682+ memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
18683+ max_zone_pfns[ZONE_DMA] =
18684+ virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
18685+ max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
18686+#ifdef CONFIG_HIGHMEM
18687+ max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
18688+#endif
18689+
18690+ free_area_init_nodes(max_zone_pfns);
18691+}
18692+
18693+void __init setup_bootmem_allocator(void)
18694 {
18695 int i;
18696+ unsigned long bootmap_size, bootmap;
18697+ unsigned long end_pfn = min(max_low_pfn, xen_start_info->nr_pages);
18698+
18699+ /*
18700+ * Initialize the boot-time allocator (with low memory only):
18701+ */
18702+ bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
18703+ bootmap = find_e820_area(min_low_pfn<<PAGE_SHIFT,
18704+ min(max_pfn_mapped, xen_start_info->nr_pages)<<PAGE_SHIFT,
18705+ bootmap_size, PAGE_SIZE);
18706+ if (bootmap == -1L)
18707+ panic("Cannot find bootmem map of size %ld\n", bootmap_size);
18708+ reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
18709+
18710+ /* don't touch min_low_pfn */
18711+ bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
18712+ min_low_pfn, end_pfn);
18713+ printk(KERN_INFO " mapped low ram: 0 - %08lx\n",
18714+ max_pfn_mapped<<PAGE_SHIFT);
18715+ printk(KERN_INFO " low ram: %08lx - %08lx\n",
18716+ min_low_pfn<<PAGE_SHIFT, max_low_pfn<<PAGE_SHIFT);
18717+ printk(KERN_INFO " bootmap %08lx - %08lx\n",
18718+ bootmap, bootmap + bootmap_size);
18719+ for_each_online_node(i)
18720+ free_bootmem_with_active_regions(i, end_pfn);
18721+ early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
18722+
18723+ after_init_bootmem = 1;
18724+}
18725+
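setup_bootmem_allocator() above reserves a bitmap with one bit per low-memory page frame and then hands the rest of lowmem to the boot-time allocator. A rough sketch of the bitmap sizing (this simplifies bootmem_bootmap_pages(); the kernel's exact rounding differs slightly):

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

static unsigned long bootmem_bitmap_bytes(unsigned long pages)
{
        unsigned long bytes = (pages + 7) / 8;          /* one bit per page */

        return (bytes + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);      /* whole pages */
}

int main(void)
{
        unsigned long end_pfn = 0x38000;        /* 896 MiB of lowmem */

        printf("bitmap: %lu bytes (%lu pages) for %lu pfns\n",
               bootmem_bitmap_bytes(end_pfn),
               bootmem_bitmap_bytes(end_pfn) >> PAGE_SHIFT, end_pfn);
        return 0;
}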
18726+static unsigned long __init extend_init_mapping(unsigned long tables_space)
18727+{
18728+ unsigned long start_pfn = (__pa(xen_start_info->pt_base) >> PAGE_SHIFT)
18729+ + xen_start_info->nr_pt_frames;
18730+	unsigned long start = start_pfn, va = (unsigned long)&_text;
18731+ pgd_t *pgd;
18732+ pud_t *pud;
18733+ pmd_t *pmd;
18734+ pte_t *pte;
18735+
18736+ /* Ensure init mappings cover kernel text/data and initial tables. */
18737+ while (va < PAGE_OFFSET + (start_pfn << PAGE_SHIFT) + tables_space) {
18738+ pgd = pgd_offset_k(va);
18739+ pud = pud_offset(pgd, va);
18740+ pmd = pmd_offset(pud, va);
18741+ if (pmd_none(*pmd)) {
18742+ unsigned long pa = start_pfn++ << PAGE_SHIFT;
18743+
18744+ memset(__va(pa), 0, PAGE_SIZE);
18745+ make_lowmem_page_readonly(__va(pa),
18746+ XENFEAT_writable_page_tables);
18747+ xen_l2_entry_update(pmd, __pmd(pa | _KERNPG_TABLE));
18748+ }
18749+ pte = pte_offset_kernel(pmd, va);
18750+ if (pte_none(*pte)) {
18751+ pte_t new_pte = __pte(__pa(va) | _KERNPG_TABLE);
18752+
18753+ if (HYPERVISOR_update_va_mapping(va, new_pte, 0))
18754+ BUG();
18755+ }
18756+ va += PAGE_SIZE;
18757+ }
18758+
18759+ /* Finally, blow away any spurious initial mappings. */
18760+ while (1) {
18761+ pgd = pgd_offset_k(va);
18762+ pud = pud_offset(pgd, va);
18763+ pmd = pmd_offset(pud, va);
18764+ if (pmd_none(*pmd))
18765+ break;
18766+ if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
18767+ BUG();
18768+ va += PAGE_SIZE;
18769+ }
18770+
18771+ if (start_pfn > start)
18772+ reserve_early(start << PAGE_SHIFT,
18773+ start_pfn << PAGE_SHIFT, "INITMAP");
18774+
18775+ return start_pfn;
18776+}
18777+
18778+static void __init find_early_table_space(unsigned long end)
18779+{
18780+ unsigned long puds, pmds, ptes, tables;
18781+
18782+ puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
18783+ tables = PAGE_ALIGN(puds * sizeof(pud_t));
18784+
18785+ pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
18786+ tables += PAGE_ALIGN(pmds * sizeof(pmd_t));
18787+
18788+ if (cpu_has_pse) {
18789+ unsigned long extra;
18790+
18791+ extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
18792+ extra += PMD_SIZE;
18793+ ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
18794+ } else
18795+ ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
18796+
18797+ tables += PAGE_ALIGN(ptes * sizeof(pte_t));
18798+
18799+ /* for fixmap */
18800+ tables += PAGE_SIZE
18801+ * ((((FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK)
18802+ - (__fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK))
18803+ >> PMD_SHIFT);
18804+
18805+ table_start = extend_init_mapping(tables);
18806+
18807+ table_end = table_start;
18808+ table_top = table_start + (tables>>PAGE_SHIFT);
18809+
18810+ printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
18811+ end, table_start << PAGE_SHIFT,
18812+ (table_start << PAGE_SHIFT) + tables);
18813+}
18814+
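find_early_table_space() above sizes the worst-case page-table pool needed to direct-map `end` bytes before any allocator is available. The core arithmetic, reproduced as a standalone calculation (PAE-style shifts and 8-byte table entries are assumed; the fixmap term and extend_init_mapping() are omitted):

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PMD_SHIFT  21                   /* 2 MiB with PAE */
#define PMD_SIZE   (1UL << PMD_SHIFT)
#define PUD_SHIFT  30
#define PUD_SIZE   (1UL << PUD_SHIFT)

#define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

int main(void)
{
        unsigned long end = 768UL << 20;        /* map 768 MiB of lowmem */
        unsigned long puds, pmds, ptes, tables;

        puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
        tables = PAGE_ALIGN(puds * 8);          /* sizeof(pud_t) assumed 8 */

        pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
        tables += PAGE_ALIGN(pmds * 8);         /* sizeof(pmd_t) assumed 8 */

        /* worst case: no PSE, every page needs its own pte entry */
        ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
        tables += PAGE_ALIGN(ptes * 8);         /* sizeof(pte_t) assumed 8 */

        printf("%lu bytes of tables (%lu pages) for %lu MiB\n",
               tables, tables >> PAGE_SHIFT, end >> 20);
        return 0;
}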
18815+unsigned long __init_refok init_memory_mapping(unsigned long start,
18816+ unsigned long end)
18817+{
18818+ pgd_t *pgd_base = swapper_pg_dir;
18819+ unsigned long start_pfn, end_pfn;
18820+ unsigned long big_page_start;
18821+
18822+ /*
18823+ * Find space for the kernel direct mapping tables.
18824+ */
18825+ if (!after_init_bootmem)
18826+ find_early_table_space(end);
18827
18828 #ifdef CONFIG_X86_PAE
18829 set_nx();
18830 if (nx_enabled)
18831 printk(KERN_INFO "NX (Execute Disable) protection: active\n");
18832 #endif
18833+
18834+ /* Enable PSE if available */
18835+ if (cpu_has_pse)
18836+ set_in_cr4(X86_CR4_PSE);
18837+
18838+ /* Enable PGE if available */
18839+ if (cpu_has_pge) {
18840+ set_in_cr4(X86_CR4_PGE);
18841+ __supported_pte_mask |= _PAGE_GLOBAL;
18842+ }
18843+
18844+ /*
18845+ * Don't use a large page for the first 2/4MB of memory
18846+ * because there are often fixed size MTRRs in there
18847+ * and overlapping MTRRs into large pages can cause
18848+ * slowdowns.
18849+ */
18850+ big_page_start = PMD_SIZE;
18851+
18852+ if (start < big_page_start) {
18853+ start_pfn = start >> PAGE_SHIFT;
18854+ end_pfn = min(big_page_start>>PAGE_SHIFT, end>>PAGE_SHIFT);
18855+ } else {
18856+		/* head is not big-page aligned? */
18857+ start_pfn = start >> PAGE_SHIFT;
18858+ end_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
18859+ << (PMD_SHIFT - PAGE_SHIFT);
18860+ }
18861+ if (start_pfn < end_pfn)
18862+ kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn, 0);
18863+
18864+ /* big page range */
18865+ start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
18866+ << (PMD_SHIFT - PAGE_SHIFT);
18867+ if (start_pfn < (big_page_start >> PAGE_SHIFT))
18868+ start_pfn = big_page_start >> PAGE_SHIFT;
18869+ end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
18870+ if (start_pfn < end_pfn)
18871+ kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn,
18872+ cpu_has_pse);
18873+
18874+	/* tail is not big-page aligned? */
18875+ start_pfn = end_pfn;
18876+ if (start_pfn > (big_page_start>>PAGE_SHIFT)) {
18877+ end_pfn = end >> PAGE_SHIFT;
18878+ if (start_pfn < end_pfn)
18879+ kernel_physical_mapping_init(pgd_base, start_pfn,
18880+ end_pfn, 0);
18881+ }
18882+
18883+ early_ioremap_page_table_range_init(pgd_base);
18884+
18885+ __flush_tlb_all();
18886+
18887+ if (!after_init_bootmem)
18888+ reserve_early(table_start << PAGE_SHIFT,
18889+ table_end << PAGE_SHIFT, "PGTABLE");
18890+
18891+ if (!after_init_bootmem)
18892+ early_memtest(start, end);
18893+
18894+ return end >> PAGE_SHIFT;
18895+}
18896+
18897+
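init_memory_mapping() above splits [start, end) into up to three pieces, so the first 2/4 MiB keeps 4 KiB mappings (fixed-size MTRRs often overlap it) while the PMD-aligned middle can use large pages and any unaligned tail falls back to 4 KiB pages again. The same range arithmetic as a runnable sketch (non-PAE 4 MiB PMDs assumed; the example range is deliberately not large-page aligned at the top):

#include <stdio.h>

#define PAGE_SHIFT 12
#define PMD_SHIFT  22                   /* 4 MiB large pages, non-PAE x86-32 */
#define PMD_SIZE   (1UL << PMD_SHIFT)

#define min(a, b) ((a) < (b) ? (a) : (b))

static void show(const char *how, unsigned long s, unsigned long e)
{
        if (s < e)
                printf("%-5s pfn %#lx - %#lx\n", how, s, e);
}

int main(void)
{
        unsigned long start = 0, end = 0x3fe0000;       /* ~64 MiB, unaligned end */
        unsigned long big_page_start = PMD_SIZE;
        unsigned long start_pfn, end_pfn;

        /* head below the first 2/4 MiB boundary: 4 KiB pages only */
        if (start < big_page_start) {
                start_pfn = start >> PAGE_SHIFT;
                end_pfn = min(big_page_start >> PAGE_SHIFT, end >> PAGE_SHIFT);
        } else {
                start_pfn = start >> PAGE_SHIFT;
                end_pfn = ((start + (PMD_SIZE - 1)) >> PMD_SHIFT)
                          << (PMD_SHIFT - PAGE_SHIFT);
        }
        show("4KiB", start_pfn, end_pfn);

        /* middle: large pages over the PMD-aligned part */
        start_pfn = ((start + (PMD_SIZE - 1)) >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
        if (start_pfn < (big_page_start >> PAGE_SHIFT))
                start_pfn = big_page_start >> PAGE_SHIFT;
        end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
        show("large", start_pfn, end_pfn);

        /* tail: whatever is left above the last full large page */
        start_pfn = end_pfn;
        if (start_pfn > (big_page_start >> PAGE_SHIFT)) {
                end_pfn = end >> PAGE_SHIFT;
                show("4KiB", start_pfn, end_pfn);
        }
        return 0;
}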
18898+/*
18899+ * paging_init() sets up the page tables - note that the first 8MB are
18900+ * already mapped by head.S.
18901+ *
18902+ * This routines also unmaps the page at virtual kernel address 0, so
18903+ * that we can trap those pesky NULL-reference errors in the kernel.
18904+ */
18905+void __init paging_init(void)
18906+{
18907 pagetable_init();
18908
18909 __flush_tlb_all();
18910
18911 kmap_init();
18912
18913- /* Switch to the real shared_info page, and clear the
18914- * dummy page. */
18915- set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
18916- HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
18917- memset(empty_zero_page, 0, sizeof(empty_zero_page));
18918-
18919- /* Setup mapping of lower 1st MB */
18920- for (i = 0; i < NR_FIX_ISAMAPS; i++)
18921- if (is_initial_xendomain())
18922- set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE);
18923- else
18924- __set_fixmap(FIX_ISAMAP_BEGIN - i,
18925- virt_to_machine(empty_zero_page),
18926- PAGE_KERNEL_RO);
18927+ /*
18928+ * NOTE: at this point the bootmem allocator is fully available.
18929+ */
18930+ sparse_init();
18931+ zone_sizes_init();
18932 }
18933
18934 /*
18935@@ -598,7 +942,7 @@ static struct kcore_list kcore_mem, kcor
18936 void __init mem_init(void)
18937 {
18938 int codesize, reservedpages, datasize, initsize;
18939- int tmp, bad_ppro;
18940+ int tmp;
18941 unsigned long pfn;
18942
18943 pci_iommu_alloc();
18944@@ -606,19 +950,6 @@ void __init mem_init(void)
18945 #ifdef CONFIG_FLATMEM
18946 BUG_ON(!mem_map);
18947 #endif
18948- bad_ppro = ppro_with_ram_bug();
18949-
18950-#ifdef CONFIG_HIGHMEM
18951- /* check that fixmap and pkmap do not overlap */
18952- if (PKMAP_BASE + LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
18953- printk(KERN_ERR
18954- "fixmap and kmap areas overlap - this will crash\n");
18955- printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n",
18956- PKMAP_BASE, PKMAP_BASE + LAST_PKMAP*PAGE_SIZE,
18957- FIXADDR_START);
18958- BUG();
18959- }
18960-#endif
18961 /* this will put all low memory onto the freelists */
18962 totalram_pages += free_all_bootmem();
18963 /* XEN: init and count low-mem pages outside initial allocation. */
18964@@ -636,7 +967,7 @@ void __init mem_init(void)
18965 if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
18966 reservedpages++;
18967
18968- set_highmem_pages_init(bad_ppro);
18969+ set_highmem_pages_init();
18970
18971 codesize = (unsigned long) &_etext - (unsigned long) &_text;
18972 datasize = (unsigned long) &_edata - (unsigned long) &_etext;
18973@@ -657,7 +988,6 @@ void __init mem_init(void)
18974 (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10))
18975 );
18976
18977-#if 1 /* double-sanity-check paranoia */
18978 printk(KERN_INFO "virtual kernel memory layout:\n"
18979 " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
18980 #ifdef CONFIG_HIGHMEM
18981@@ -698,7 +1028,6 @@ void __init mem_init(void)
18982 #endif
18983 BUG_ON(VMALLOC_START > VMALLOC_END);
18984 BUG_ON((unsigned long)high_memory > VMALLOC_START);
18985-#endif /* double-sanity-check paranoia */
18986
18987 if (boot_cpu_data.wp_works_ok < 0)
18988 test_wp_bit();
18989@@ -755,6 +1084,8 @@ void mark_rodata_ro(void)
18990 unsigned long start = PFN_ALIGN(_text);
18991 unsigned long size = PFN_ALIGN(_etext) - start;
18992
18993+#ifndef CONFIG_DYNAMIC_FTRACE
18994+ /* Dynamic tracing modifies the kernel text section */
18995 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
18996 printk(KERN_INFO "Write protecting the kernel text: %luk\n",
18997 size >> 10);
18998@@ -767,6 +1098,8 @@ void mark_rodata_ro(void)
18999 printk(KERN_INFO "Testing CPA: write protecting again\n");
19000 set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
19001 #endif
19002+#endif /* CONFIG_DYNAMIC_FTRACE */
19003+
19004 start += size;
19005 size = (unsigned long)__end_rodata - start;
19006 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
19007@@ -829,3 +1162,9 @@ void free_initrd_mem(unsigned long start
19008 free_init_pages("initrd memory", start, end);
19009 }
19010 #endif
19011+
19012+int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
19013+ int flags)
19014+{
19015+ return reserve_bootmem(phys, len, flags);
19016+}
19017--- sle11-2009-06-04.orig/arch/x86/mm/init_64-xen.c 2009-03-16 16:38:05.000000000 +0100
19018+++ sle11-2009-06-04/arch/x86/mm/init_64-xen.c 2009-06-04 10:21:39.000000000 +0200
19019@@ -21,6 +21,7 @@
19020 #include <linux/swap.h>
19021 #include <linux/smp.h>
19022 #include <linux/init.h>
19023+#include <linux/initrd.h>
19024 #include <linux/pagemap.h>
19025 #include <linux/bootmem.h>
19026 #include <linux/proc_fs.h>
19027@@ -52,6 +53,14 @@
19028
19029 #include <xen/features.h>
19030
19031+/*
19032+ * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
19033+ * The direct mapping extends to max_pfn_mapped, so that we can directly access
19034+ * apertures, ACPI and other tables without having to play with fixmaps.
19035+ */
19036+unsigned long max_low_pfn_mapped;
19037+unsigned long max_pfn_mapped;
19038+
19039 #if CONFIG_XEN_COMPAT <= 0x030002
19040 unsigned int __kernel_page_user;
19041 EXPORT_SYMBOL(__kernel_page_user);
19042@@ -60,13 +69,12 @@ EXPORT_SYMBOL(__kernel_page_user);
19043 int after_bootmem;
19044
19045 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
19046-extern unsigned long start_pfn;
19047
19048 extern pmd_t level2_fixmap_pgt[PTRS_PER_PMD];
19049 extern pte_t level1_fixmap_pgt[PTRS_PER_PTE];
19050
19051 #ifndef CONFIG_XEN
19052-int direct_gbpages __meminitdata
19053+int direct_gbpages
19054 #ifdef CONFIG_DIRECT_GBPAGES
19055 = 1
19056 #endif
19057@@ -145,55 +153,23 @@ void __meminit early_make_page_readonly(
19058 * around without checking the pgd every time.
19059 */
19060
19061-void show_mem(void)
19062-{
19063- long i, total = 0, reserved = 0;
19064- long shared = 0, cached = 0;
19065- struct page *page;
19066- pg_data_t *pgdat;
19067-
19068- printk(KERN_INFO "Mem-info:\n");
19069- show_free_areas();
19070- for_each_online_pgdat(pgdat) {
19071- for (i = 0; i < pgdat->node_spanned_pages; ++i) {
19072- /*
19073- * This loop can take a while with 256 GB and
19074- * 4k pages so defer the NMI watchdog:
19075- */
19076- if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
19077- touch_nmi_watchdog();
19078-
19079- if (!pfn_valid(pgdat->node_start_pfn + i))
19080- continue;
19081-
19082- page = pfn_to_page(pgdat->node_start_pfn + i);
19083- total++;
19084- if (PageReserved(page))
19085- reserved++;
19086- else if (PageSwapCache(page))
19087- cached++;
19088- else if (page_count(page))
19089- shared += page_count(page) - 1;
19090- }
19091- }
19092- printk(KERN_INFO "%lu pages of RAM\n", total);
19093- printk(KERN_INFO "%lu reserved pages\n", reserved);
19094- printk(KERN_INFO "%lu pages shared\n", shared);
19095- printk(KERN_INFO "%lu pages swap cached\n", cached);
19096-}
19097-
19098 static unsigned long __meminitdata table_start;
19099-static unsigned long __meminitdata table_end;
19100+static unsigned long __meminitdata table_cur;
19101+static unsigned long __meminitdata table_top;
19102
19103-static __init void *spp_getpage(void)
19104+/*
19105+ * NOTE: This function is marked __ref because it calls __init function
19106+ * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
19107+ */
19108+static __ref void *spp_getpage(void)
19109 {
19110 void *ptr;
19111
19112 if (after_bootmem)
19113 ptr = (void *) get_zeroed_page(GFP_ATOMIC);
19114- else if (start_pfn < table_end) {
19115- ptr = __va(start_pfn << PAGE_SHIFT);
19116- start_pfn++;
19117+	else if (table_cur < table_top) {
19118+ ptr = __va(table_cur << PAGE_SHIFT);
19119+ table_cur++;
19120 memset(ptr, 0, PAGE_SIZE);
19121 } else
19122 ptr = alloc_bootmem_pages(PAGE_SIZE);
19123@@ -208,30 +184,18 @@ static __init void *spp_getpage(void)
19124 return ptr;
19125 }
19126
19127-#define pgd_offset_u(address) (__user_pgd(init_level4_pgt) + pgd_index(address))
19128-#define pud_offset_u(address) (level3_user_pgt + pud_index(address))
19129-
19130-static __init void
19131-set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot, int user_mode)
19132+void
19133+set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
19134 {
19135- pgd_t *pgd;
19136 pud_t *pud;
19137 pmd_t *pmd;
19138- pte_t *pte, new_pte;
19139-
19140- pr_debug("set_pte_phys %lx to %lx\n", vaddr, phys);
19141+ pte_t *pte;
19142
19143- pgd = (user_mode ? pgd_offset_u(vaddr) : pgd_offset_k(vaddr));
19144- if (pgd_none(*pgd)) {
19145- printk(KERN_ERR
19146- "PGD FIXMAP MISSING, it should be setup in head.S!\n");
19147- return;
19148- }
19149- pud = (user_mode ? pud_offset_u(vaddr) : pud_offset(pgd, vaddr));
19150+ pud = pud_page + pud_index(vaddr);
19151 if (pud_none(*pud)) {
19152 pmd = (pmd_t *) spp_getpage();
19153 make_page_readonly(pmd, XENFEAT_writable_page_tables);
19154- set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
19155+ pud_populate(&init_mm, pud, pmd);
19156 if (pmd != pmd_offset(pud, 0)) {
19157 printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
19158 pmd, pmd_offset(pud, 0));
19159@@ -242,19 +206,20 @@ set_pte_phys(unsigned long vaddr, unsign
19160 if (pmd_none(*pmd)) {
19161 pte = (pte_t *) spp_getpage();
19162 make_page_readonly(pte, XENFEAT_writable_page_tables);
19163- set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
19164+ pmd_populate_kernel(&init_mm, pmd, pte);
19165 if (pte != pte_offset_kernel(pmd, 0)) {
19166 printk(KERN_ERR "PAGETABLE BUG #02!\n");
19167 return;
19168 }
19169 }
19170- if (pgprot_val(prot))
19171- new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);
19172- else
19173- new_pte = __pte(0);
19174
19175 pte = pte_offset_kernel(pmd, vaddr);
19176 if (!pte_none(*pte) && __pte_val(new_pte) &&
19177+#ifdef CONFIG_ACPI
19178+ /* __acpi_map_table() fails to properly call clear_fixmap() */
19179+ (vaddr < __fix_to_virt(FIX_ACPI_END) ||
19180+ vaddr > __fix_to_virt(FIX_ACPI_BEGIN)) &&
19181+#endif
19182 __pte_val(*pte) != (__pte_val(new_pte) & __supported_pte_mask))
19183 pte_ERROR(*pte);
19184 set_pte(pte, new_pte);
19185@@ -266,15 +231,13 @@ set_pte_phys(unsigned long vaddr, unsign
19186 __flush_tlb_one(vaddr);
19187 }
19188
19189-static __init void
19190-set_pte_phys_ma(unsigned long vaddr, unsigned long phys, pgprot_t prot)
19191+void
19192+set_pte_vaddr(unsigned long vaddr, pte_t pteval)
19193 {
19194 pgd_t *pgd;
19195- pud_t *pud;
19196- pmd_t *pmd;
19197- pte_t *pte, new_pte;
19198+ pud_t *pud_page;
19199
19200- pr_debug("set_pte_phys_ma %lx to %lx\n", vaddr, phys);
19201+ pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, __pte_val(pteval));
19202
19203 pgd = pgd_offset_k(vaddr);
19204 if (pgd_none(*pgd)) {
19205@@ -282,47 +245,51 @@ set_pte_phys_ma(unsigned long vaddr, uns
19206 "PGD FIXMAP MISSING, it should be setup in head.S!\n");
19207 return;
19208 }
19209- pud = pud_offset(pgd, vaddr);
19210- if (pud_none(*pud)) {
19211- pmd = (pmd_t *) spp_getpage();
19212- make_page_readonly(pmd, XENFEAT_writable_page_tables);
19213- set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
19214- if (pmd != pmd_offset(pud, 0)) {
19215- printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
19216- pmd, pmd_offset(pud, 0));
19217+ pud_page = (pud_t*)pgd_page_vaddr(*pgd);
19218+ set_pte_vaddr_pud(pud_page, vaddr, pteval);
19219+}
19220+
19221+#ifndef CONFIG_XEN
19222+/*
19223+ * Create large page table mappings for a range of physical addresses.
19224+ */
19225+static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
19226+ pgprot_t prot)
19227+{
19228+ pgd_t *pgd;
19229+ pud_t *pud;
19230+ pmd_t *pmd;
19231+
19232+ BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK));
19233+ for (; size; phys += PMD_SIZE, size -= PMD_SIZE) {
19234+ pgd = pgd_offset_k((unsigned long)__va(phys));
19235+ if (pgd_none(*pgd)) {
19236+ pud = (pud_t *) spp_getpage();
19237+ set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE |
19238+ _PAGE_USER));
19239 }
19240- }
19241- pmd = pmd_offset(pud, vaddr);
19242- if (pmd_none(*pmd)) {
19243- pte = (pte_t *) spp_getpage();
19244- make_page_readonly(pte, XENFEAT_writable_page_tables);
19245- set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
19246- if (pte != pte_offset_kernel(pmd, 0)) {
19247- printk(KERN_ERR "PAGETABLE BUG #02!\n");
19248- return;
19249+ pud = pud_offset(pgd, (unsigned long)__va(phys));
19250+ if (pud_none(*pud)) {
19251+ pmd = (pmd_t *) spp_getpage();
19252+ set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE |
19253+ _PAGE_USER));
19254 }
19255+ pmd = pmd_offset(pud, phys);
19256+ BUG_ON(!pmd_none(*pmd));
19257+ set_pmd(pmd, __pmd(phys | pgprot_val(prot)));
19258 }
19259- new_pte = pfn_pte_ma(phys >> PAGE_SHIFT, prot);
19260+}
19261
19262- pte = pte_offset_kernel(pmd, vaddr);
19263- if (!pte_none(*pte) && __pte_val(new_pte) &&
19264-#ifdef CONFIG_ACPI
19265- /* __acpi_map_table() fails to properly call clear_fixmap() */
19266- (vaddr < __fix_to_virt(FIX_ACPI_END) ||
19267- vaddr > __fix_to_virt(FIX_ACPI_BEGIN)) &&
19268-#endif
19269- __pte_val(*pte) != (__pte_val(new_pte) & __supported_pte_mask))
19270- pte_ERROR(*pte);
19271- set_pte(pte, new_pte);
19272+void __init init_extra_mapping_wb(unsigned long phys, unsigned long size)
19273+{
19274+ __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE);
19275+}
19276
19277- /*
19278- * It's enough to flush this one mapping.
19279- * (PGE mappings get flushed as well)
19280- */
19281- __flush_tlb_one(vaddr);
19282+void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
19283+{
19284+ __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE_NOCACHE);
19285 }
19286
19287-#ifndef CONFIG_XEN
19288 /*
19289 * The head.S code sets up the kernel high mapping:
19290 *
19291@@ -352,63 +319,52 @@ void __init cleanup_highmap(void)
19292 }
19293 #endif
19294
19295-/* NOTE: this is meant to be run only at boot */
19296-void __init __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
19297-{
19298- unsigned long address = __fix_to_virt(idx);
19299-
19300- if (idx >= __end_of_fixed_addresses) {
19301- printk(KERN_ERR "Invalid __set_fixmap\n");
19302- return;
19303- }
19304- switch (idx) {
19305- case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
19306- set_pte_phys(address, phys, prot, 0);
19307- set_pte_phys(address, phys, prot, 1);
19308- break;
19309- case FIX_EARLYCON_MEM_BASE:
19310- xen_l1_entry_update(level1_fixmap_pgt + pte_index(address),
19311- pfn_pte_ma(phys >> PAGE_SHIFT, prot));
19312- break;
19313- default:
19314- set_pte_phys_ma(address, phys, prot);
19315- break;
19316- }
19317-}
19318-
19319-static __meminit void *alloc_static_page(unsigned long *phys)
19320+static __ref void *alloc_low_page(unsigned long *phys)
19321 {
19322- unsigned long va = (start_pfn << PAGE_SHIFT) + __START_KERNEL_map;
19323+ unsigned long pfn;
19324+ void *adr;
19325
19326 if (after_bootmem) {
19327- void *adr = (void *)get_zeroed_page(GFP_ATOMIC);
19328+ adr = (void *)get_zeroed_page(GFP_ATOMIC);
19329 *phys = __pa(adr);
19330
19331 return adr;
19332 }
19333
19334- *phys = start_pfn << PAGE_SHIFT;
19335- start_pfn++;
19336- memset((void *)va, 0, PAGE_SIZE);
19337- return (void *)va;
19338+ BUG_ON(!table_cur);
19339+ pfn = table_cur++;
19340+ if (pfn >= table_top)
19341+ panic("alloc_low_page: ran out of memory");
19342+
19343+ adr = early_ioremap(pfn_to_mfn(pfn) * PAGE_SIZE, PAGE_SIZE);
19344+ memset(adr, 0, PAGE_SIZE);
19345+ *phys = pfn * PAGE_SIZE;
19346+ return adr;
19347 }
19348
19349-#define PTE_SIZE PAGE_SIZE
19350+static __ref void unmap_low_page(void *adr)
19351+{
19352+ if (after_bootmem)
19353+ return;
19354+
19355+ early_iounmap(adr, PAGE_SIZE);
19356+}
19357
19358 static inline int __meminit make_readonly(unsigned long paddr)
19359 {
19360 extern char __vsyscall_0;
19361 int readonly = 0;
19362
19363- /* Make new page tables read-only. */
19364+ /* Make new page tables read-only on the first pass. */
19365 if (!xen_feature(XENFEAT_writable_page_tables)
19366+ && !max_pfn_mapped
19367 && (paddr >= (table_start << PAGE_SHIFT))
19368- && (paddr < (table_end << PAGE_SHIFT)))
19369+ && (paddr < (table_top << PAGE_SHIFT)))
19370 readonly = 1;
19371 /* Make old page tables read-only. */
19372 if (!xen_feature(XENFEAT_writable_page_tables)
19373 && (paddr >= (xen_start_info->pt_base - __START_KERNEL_map))
19374- && (paddr < (start_pfn << PAGE_SHIFT)))
19375+ && (paddr < (table_cur << PAGE_SHIFT)))
19376 readonly = 1;
19377
19378 /*
19379@@ -425,118 +381,131 @@ static inline int __meminit make_readonl
19380 return readonly;
19381 }
19382
19383-#ifndef CONFIG_XEN
19384-/* Must run before zap_low_mappings */
19385-__meminit void *early_ioremap(unsigned long addr, unsigned long size)
19386+static unsigned long __meminit
19387+phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end)
19388 {
19389- pmd_t *pmd, *last_pmd;
19390- unsigned long vaddr;
19391- int i, pmds;
19392-
19393- pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
19394- vaddr = __START_KERNEL_map;
19395- pmd = level2_kernel_pgt;
19396- last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1;
19397-
19398- for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) {
19399- for (i = 0; i < pmds; i++) {
19400- if (pmd_present(pmd[i]))
19401- goto continue_outer_loop;
19402- }
19403- vaddr += addr & ~PMD_MASK;
19404- addr &= PMD_MASK;
19405+ unsigned pages = 0;
19406+ unsigned long last_map_addr = end;
19407+ int i;
19408+
19409+ pte_t *pte = pte_page + pte_index(addr);
19410+
19411+ for(i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) {
19412+ unsigned long pteval = addr | __PAGE_KERNEL;
19413
19414- for (i = 0; i < pmds; i++, addr += PMD_SIZE)
19415- set_pmd(pmd+i, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
19416- __flush_tlb_all();
19417-
19418- return (void *)vaddr;
19419-continue_outer_loop:
19420- ;
19421+ if (addr >= end ||
19422+ (!after_bootmem &&
19423+ (addr >> PAGE_SHIFT) >= xen_start_info->nr_pages))
19424+ break;
19425+
19426+ if (__pte_val(*pte))
19427+ continue;
19428+
19429+ if (make_readonly(addr))
19430+ pteval &= ~_PAGE_RW;
19431+ if (0)
19432+ printk(" pte=%p addr=%lx pte=%016lx\n",
19433+ pte, addr, pteval);
19434+ if (!after_bootmem)
19435+ *pte = __pte(pteval & __supported_pte_mask);
19436+ else
19437+ set_pte(pte, __pte(pteval & __supported_pte_mask));
19438+ last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
19439+ pages++;
19440 }
19441- printk("early_ioremap(0x%lx, %lu) failed\n", addr, size);
19442- return NULL;
19443+ update_page_count(PG_LEVEL_4K, pages);
19444+
19445+ return last_map_addr;
19446 }
19447
19448-/*
19449- * To avoid virtual aliases later:
19450- */
19451-__meminit void early_iounmap(void *addr, unsigned long size)
19452+static unsigned long __meminit
19453+phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end)
19454 {
19455- unsigned long vaddr;
19456- pmd_t *pmd;
19457- int i, pmds;
19458-
19459- vaddr = (unsigned long)addr;
19460- pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
19461- pmd = level2_kernel_pgt + pmd_index(vaddr);
19462-
19463- for (i = 0; i < pmds; i++)
19464- pmd_clear(pmd + i);
19465+ pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd);
19466
19467-	__flush_tlb_all();
19468+	BUG_ON(!max_pfn_mapped);
19469+ return phys_pte_init(pte, address, end);
19470 }
19471-#endif
19472
19473 static unsigned long __meminit
19474-phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
19475+phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
19476+ unsigned long page_size_mask)
19477 {
19478+ unsigned long pages = 0;
19479+ unsigned long last_map_addr = end;
19480+ unsigned long start = address;
19481+
19482 int i = pmd_index(address);
19483
19484- for (; i < PTRS_PER_PMD; i++) {
19485+ for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
19486 unsigned long pte_phys;
19487- pmd_t *pmd = pmd_page + i;
19488- pte_t *pte, *pte_save;
19489- int k;
19490+ pmd_t *pmd = pmd_page + pmd_index(address);
19491+ pte_t *pte;
19492
19493 if (address >= end)
19494 break;
19495
19496 if (__pmd_val(*pmd)) {
19497- address += PMD_SIZE;
19498+ if (!pmd_large(*pmd)) {
19499+ spin_lock(&init_mm.page_table_lock);
19500+ last_map_addr = phys_pte_update(pmd, address,
19501+ end);
19502+ spin_unlock(&init_mm.page_table_lock);
19503+ }
19504+ /* Count entries we're using from level2_ident_pgt */
19505+ if (start == 0)
19506+ pages++;
19507 continue;
19508 }
19509
19510- pte = alloc_static_page(&pte_phys);
19511- pte_save = pte;
19512- for (k = 0; k < PTRS_PER_PTE; pte++, k++, address += PTE_SIZE) {
19513- unsigned long pteval = address | _PAGE_NX | _KERNPG_TABLE;
19514-
19515- if (address >= (after_bootmem
19516- ? end
19517- : xen_start_info->nr_pages << PAGE_SHIFT))
19518- pteval = 0;
19519- else if (make_readonly(address))
19520- pteval &= ~_PAGE_RW;
19521- set_pte(pte, __pte(pteval & __supported_pte_mask));
19522+ if (page_size_mask & (1<<PG_LEVEL_2M)) {
19523+ pages++;
19524+ spin_lock(&init_mm.page_table_lock);
19525+ set_pte((pte_t *)pmd,
19526+ pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
19527+ spin_unlock(&init_mm.page_table_lock);
19528+ last_map_addr = (address & PMD_MASK) + PMD_SIZE;
19529+ continue;
19530 }
19531+
19532+		pte = alloc_low_page(&pte_phys);
19533+ last_map_addr = phys_pte_init(pte, address, end);
19534+ unmap_low_page(pte);
19535+
19536 if (!after_bootmem) {
19537- early_make_page_readonly(pte_save, XENFEAT_writable_page_tables);
19538- *pmd = __pmd(pte_phys | _KERNPG_TABLE);
19539+ if (max_pfn_mapped)
19540+ make_page_readonly(__va(pte_phys),
19541+ XENFEAT_writable_page_tables);
19542+ *pmd = __pmd(pte_phys | _PAGE_TABLE);
19543 } else {
19544- make_page_readonly(pte_save, XENFEAT_writable_page_tables);
19545- set_pmd(pmd, __pmd(pte_phys | _KERNPG_TABLE));
19546+ make_page_readonly(pte, XENFEAT_writable_page_tables);
19547+ spin_lock(&init_mm.page_table_lock);
19548+ pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
19549+ spin_unlock(&init_mm.page_table_lock);
19550 }
19551 }
19552- return address;
19553+ update_page_count(PG_LEVEL_2M, pages);
19554+ return last_map_addr;
19555 }
19556
19557 static unsigned long __meminit
19558-phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
19559+phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end,
19560+ unsigned long page_size_mask)
19561 {
19562 pmd_t *pmd = pmd_offset(pud, 0);
19563 unsigned long last_map_addr;
19564
19565- spin_lock(&init_mm.page_table_lock);
19566- last_map_addr = phys_pmd_init(pmd, address, end);
19567- spin_unlock(&init_mm.page_table_lock);
19568+	BUG_ON(!max_pfn_mapped);
19569+ last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask);
19570 __flush_tlb_all();
19571 return last_map_addr;
19572 }
19573
19574 static unsigned long __meminit
19575-phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
19576+phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
19577+ unsigned long page_size_mask)
19578 {
19579+ unsigned long pages = 0;
19580 unsigned long last_map_addr = end;
19581 int i = pud_index(addr);
19582
19583@@ -550,29 +519,55 @@ phys_pud_init(pud_t *pud_page, unsigned
19584
19585 if (__pud_val(*pud)) {
19586 if (!pud_large(*pud))
19587- last_map_addr = phys_pmd_update(pud, addr, end);
19588+ last_map_addr = phys_pmd_update(pud, addr, end,
19589+ page_size_mask);
19590 continue;
19591 }
19592
19593- if (direct_gbpages) {
19594+ if (page_size_mask & (1<<PG_LEVEL_1G)) {
19595+ pages++;
19596+ spin_lock(&init_mm.page_table_lock);
19597 set_pte((pte_t *)pud,
19598 pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
19599+ spin_unlock(&init_mm.page_table_lock);
19600 last_map_addr = (addr & PUD_MASK) + PUD_SIZE;
19601 continue;
19602 }
19603
19604- pmd = alloc_static_page(&pmd_phys);
19605-
19606- spin_lock(&init_mm.page_table_lock);
19607- *pud = __pud(pmd_phys | _KERNPG_TABLE);
19608- last_map_addr = phys_pmd_init(pmd, addr, end);
19609- spin_unlock(&init_mm.page_table_lock);
19610+ pmd = alloc_low_page(&pmd_phys);
19611+ last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask);
19612+ unmap_low_page(pmd);
19613
19614- early_make_page_readonly(pmd, XENFEAT_writable_page_tables);
19615+ if (!after_bootmem) {
19616+ if (max_pfn_mapped)
19617+ make_page_readonly(__va(pmd_phys),
19618+ XENFEAT_writable_page_tables);
19619+ if (page_size_mask & (1 << PG_LEVEL_NUM))
19620+ xen_l3_entry_update(pud, __pud(pmd_phys | _PAGE_TABLE));
19621+ else
19622+ *pud = __pud(pmd_phys | _PAGE_TABLE);
19623+ } else {
19624+ make_page_readonly(pmd, XENFEAT_writable_page_tables);
19625+ spin_lock(&init_mm.page_table_lock);
19626+ pud_populate(&init_mm, pud, __va(pmd_phys));
19627+ spin_unlock(&init_mm.page_table_lock);
19628+ }
19629 }
19630 __flush_tlb_all();
19631+ update_page_count(PG_LEVEL_1G, pages);
19632
19633- return last_map_addr >> PAGE_SHIFT;
19634+ return last_map_addr;
19635+}
19636+
19637+static unsigned long __meminit
19638+phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
19639+ unsigned long page_size_mask)
19640+{
19641+ pud_t *pud;
19642+
19643+ pud = (pud_t *)pgd_page_vaddr(*pgd);
19644+
19645+ return phys_pud_init(pud, addr, end, page_size_mask | (1 << PG_LEVEL_NUM));
19646 }
19647
19648 void __init xen_init_pt(void)
19649@@ -651,86 +646,36 @@ void __init xen_init_pt(void)
19650 }
19651 }
19652
19653-static void __init extend_init_mapping(unsigned long tables_space)
19654-{
19655- unsigned long va = __START_KERNEL_map;
19656- unsigned long start = start_pfn;
19657- unsigned long phys, addr, *pte_page;
19658- pmd_t *pmd;
19659- pte_t *pte, new_pte;
19660- unsigned long *page = (unsigned long *)init_level4_pgt;
19661-
19662- addr = page[pgd_index(va)];
19663- addr_to_page(addr, page);
19664- addr = page[pud_index(va)];
19665- addr_to_page(addr, page);
19666-
19667- /* Kill mapping of low 1MB. */
19668- while (va < (unsigned long)&_text) {
19669- if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
19670- BUG();
19671- va += PAGE_SIZE;
19672- }
19673-
19674- /* Ensure init mappings cover kernel text/data and initial tables. */
19675- while (va < (__START_KERNEL_map
19676- + (start_pfn << PAGE_SHIFT)
19677- + tables_space)) {
19678- pmd = (pmd_t *)&page[pmd_index(va)];
19679- if (pmd_none(*pmd)) {
19680- pte_page = alloc_static_page(&phys);
19681- early_make_page_readonly(
19682- pte_page, XENFEAT_writable_page_tables);
19683- set_pmd(pmd, __pmd(phys | _KERNPG_TABLE));
19684- } else {
19685- addr = page[pmd_index(va)];
19686- addr_to_page(addr, pte_page);
19687- }
19688- pte = (pte_t *)&pte_page[pte_index(va)];
19689- if (pte_none(*pte)) {
19690- new_pte = pfn_pte(
19691- (va - __START_KERNEL_map) >> PAGE_SHIFT,
19692- __pgprot(_KERNPG_TABLE));
19693- xen_l1_entry_update(pte, new_pte);
19694- }
19695- va += PAGE_SIZE;
19696- }
19697-
19698- /* Finally, blow away any spurious initial mappings. */
19699- while (1) {
19700- pmd = (pmd_t *)&page[pmd_index(va)];
19701- if (pmd_none(*pmd))
19702- break;
19703- if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
19704- BUG();
19705- va += PAGE_SIZE;
19706- }
19707-
19708- if (start_pfn > start)
19709- reserve_early(start << PAGE_SHIFT,
19710- start_pfn << PAGE_SHIFT, "INITMAP");
19711-}
19712-
19713 static void __init find_early_table_space(unsigned long end)
19714 {
19715 unsigned long puds, pmds, ptes, tables;
19716
19717 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
19718+ tables = round_up(puds * sizeof(pud_t), PAGE_SIZE);
19719 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
19720- ptes = (end + PTE_SIZE - 1) >> PAGE_SHIFT;
19721+ tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE);
19722
19723- tables = round_up(puds * 8, PAGE_SIZE) +
19724- round_up(pmds * 8, PAGE_SIZE) +
19725- round_up(ptes * 8, PAGE_SIZE);
19726+ ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
19727+ tables += round_up(ptes * sizeof(pte_t), PAGE_SIZE);
19728
19729- extend_init_mapping(tables);
19730+ if (!table_top) {
19731+ table_start = (__pa(xen_start_info->pt_base) >> PAGE_SHIFT) +
19732+ xen_start_info->nr_pt_frames;
19733+ table_cur = table_start;
19734+ } else {
19735+ /*
19736+ * [table_start, table_top) gets passed to reserve_early(),
19737+ * so we must not use table_cur here, despite continuing
19738+ * to allocate from there. table_cur possibly being below
19739+ * table_start is otoh not a problem.
19740+ */
19741+ table_start = table_top;
19742+ }
19743
19744- table_start = start_pfn;
19745- table_end = table_start + (tables>>PAGE_SHIFT);
19746+ table_top = table_cur + (tables >> PAGE_SHIFT);
19747
19748- early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
19749- end, table_start << PAGE_SHIFT,
19750- (table_start << PAGE_SHIFT) + tables);
19751+ printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
19752+ end, table_cur << PAGE_SHIFT, table_top << PAGE_SHIFT);
19753 }
19754
19755 static void __init xen_finish_init_mapping(void)
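For reference, the worst-case sizing performed by the reworked find_early_table_space() above can be reproduced in isolation. The stand-alone sketch below assumes the usual x86-64 constants (4k pages, 8-byte table entries standing in for sizeof(pud_t/pmd_t/pte_t)); it is illustrative arithmetic only, not kernel code.

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PMD_SHIFT  21
#define PUD_SHIFT  30

static unsigned long round_up(unsigned long x, unsigned long align)
{
	return (x + align - 1) & ~(align - 1);
}

int main(void)
{
	unsigned long end  = 4UL << 30;		/* pretend we map the first 4 GiB */
	unsigned long puds = (end + (1UL << PUD_SHIFT) - 1) >> PUD_SHIFT;
	unsigned long pmds = (end + (1UL << PMD_SHIFT) - 1) >> PMD_SHIFT;
	unsigned long ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
	unsigned long tables = round_up(puds * 8, PAGE_SIZE) +
			       round_up(pmds * 8, PAGE_SIZE) +
			       round_up(ptes * 8, PAGE_SIZE);

	printf("up to %lu KiB of page tables for %lu MiB of memory\n",
	       tables >> 10, end >> 20);
	return 0;
}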
19756@@ -752,18 +697,18 @@ static void __init xen_finish_init_mappi
19757 xen_start_info->mod_start = (unsigned long)
19758 __va(__pa(xen_start_info->mod_start));
19759
19760- /* Destroy the Xen-created mappings beyond the kernel image as
19761- * well as the temporary mappings created above. Prevents
19762- * overlap with modules area (if init mapping is very big).
19763- */
19764+ /* Destroy the Xen-created mappings beyond the kernel image. */
19765 start = PAGE_ALIGN((unsigned long)_end);
19766- end = __START_KERNEL_map + (table_end << PAGE_SHIFT);
19767+ end = __START_KERNEL_map + (table_start << PAGE_SHIFT);
19768 for (; start < end; start += PAGE_SIZE)
19769 if (HYPERVISOR_update_va_mapping(start, __pte_ma(0), 0))
19770 BUG();
19771
19772- /* Allocate pte's for initial fixmaps from 'start_pfn' allocator. */
19773- table_end = ~0UL;
19774+ /* Allocate pte's for initial fixmaps from 'table_cur' allocator. */
19775+ start = table_top;
19776+ WARN(table_cur != start, "start=%lx cur=%lx top=%lx\n",
19777+ table_start, table_cur, start);
19778+ table_top = ~0UL;
19779
19780 /* Switch to the real shared_info page, and clear the dummy page. */
19781 set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
19782@@ -780,8 +725,7 @@ static void __init xen_finish_init_mappi
19783 << PAGE_SHIFT,
19784 PAGE_KERNEL_RO);
19785
19786- /* Disable the 'start_pfn' allocator. */
19787- table_end = start_pfn;
19788+ table_top = max(table_cur, start);
19789 }
19790
19791 static void __init init_gbpages(void)
19792@@ -794,126 +738,91 @@ static void __init init_gbpages(void)
19793 #endif
19794 }
19795
19796-#ifdef CONFIG_MEMTEST_BOOTPARAM
19797-
19798-static void __init memtest(unsigned long start_phys, unsigned long size,
19799- unsigned pattern)
19800+static unsigned long __init kernel_physical_mapping_init(unsigned long start,
19801+ unsigned long end,
19802+ unsigned long page_size_mask)
19803 {
19804- unsigned long i;
19805- unsigned long *start;
19806- unsigned long start_bad;
19807- unsigned long last_bad;
19808- unsigned long val;
19809- unsigned long start_phys_aligned;
19810- unsigned long count;
19811- unsigned long incr;
19812-
19813- switch (pattern) {
19814- case 0:
19815- val = 0UL;
19816- break;
19817- case 1:
19818- val = -1UL;
19819- break;
19820- case 2:
19821- val = 0x5555555555555555UL;
19822- break;
19823- case 3:
19824- val = 0xaaaaaaaaaaaaaaaaUL;
19825- break;
19826- default:
19827- return;
19828- }
19829-
19830- incr = sizeof(unsigned long);
19831- start_phys_aligned = ALIGN(start_phys, incr);
19832- count = (size - (start_phys_aligned - start_phys))/incr;
19833- start = __va(start_phys_aligned);
19834- start_bad = 0;
19835- last_bad = 0;
19836-
19837- for (i = 0; i < count; i++)
19838- start[i] = val;
19839- for (i = 0; i < count; i++, start++, start_phys_aligned += incr) {
19840- if (*start != val) {
19841- if (start_phys_aligned == last_bad + incr) {
19842- last_bad += incr;
19843- } else {
19844- if (start_bad) {
19845- printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved",
19846- val, start_bad, last_bad + incr);
19847- reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
19848- }
19849- start_bad = last_bad = start_phys_aligned;
19850- }
19851- }
19852- }
19853- if (start_bad) {
19854- printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved",
19855- val, start_bad, last_bad + incr);
19856- reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
19857- }
19858
19859-}
19860+ unsigned long next, last_map_addr = end;
19861
19862-static int memtest_pattern __initdata = CONFIG_MEMTEST_BOOTPARAM_VALUE;
19863+ start = (unsigned long)__va(start);
19864+ end = (unsigned long)__va(end);
19865
19866-static int __init parse_memtest(char *arg)
19867-{
19868- if (arg)
19869- memtest_pattern = simple_strtoul(arg, NULL, 0);
19870- return 0;
19871-}
19872+ for (; start < end; start = next) {
19873+ pgd_t *pgd = pgd_offset_k(start);
19874+ unsigned long pud_phys;
19875+ pud_t *pud;
19876
19877-early_param("memtest", parse_memtest);
19878+ next = (start + PGDIR_SIZE) & PGDIR_MASK;
19879+ if (next > end)
19880+ next = end;
19881
19882-static void __init early_memtest(unsigned long start, unsigned long end)
19883-{
19884- u64 t_start, t_size;
19885- unsigned pattern;
19886+ if (__pgd_val(*pgd)) {
19887+ last_map_addr = phys_pud_update(pgd, __pa(start),
19888+ __pa(end), page_size_mask);
19889+ continue;
19890+ }
19891
19892- if (!memtest_pattern)
19893- return;
19894+ pud = alloc_low_page(&pud_phys);
19895+ last_map_addr = phys_pud_init(pud, __pa(start), __pa(next),
19896+ page_size_mask);
19897+ unmap_low_page(pud);
19898+
19899+ if(!after_bootmem) {
19900+ if (max_pfn_mapped)
19901+ make_page_readonly(__va(pud_phys),
19902+ XENFEAT_writable_page_tables);
19903+ xen_l4_entry_update(pgd, __pgd(pud_phys | _PAGE_TABLE));
19904+ } else {
19905+ make_page_readonly(pud, XENFEAT_writable_page_tables);
19906+ spin_lock(&init_mm.page_table_lock);
19907+ pgd_populate(&init_mm, pgd, __va(pud_phys));
19908+ spin_unlock(&init_mm.page_table_lock);
19909+ }
19910+ }
19911
19912- printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern);
19913- for (pattern = 0; pattern < memtest_pattern; pattern++) {
19914- t_start = start;
19915- t_size = 0;
19916- while (t_start < end) {
19917- t_start = find_e820_area_size(t_start, &t_size, 1);
19918+ return last_map_addr;
19919+}
19920
19921- /* done ? */
19922- if (t_start >= end)
19923- break;
19924- if (t_start + t_size > end)
19925- t_size = end - t_start;
19926+struct map_range {
19927+ unsigned long start;
19928+ unsigned long end;
19929+ unsigned page_size_mask;
19930+};
19931
19932- printk(KERN_CONT "\n %016llx - %016llx pattern %d",
19933- (unsigned long long)t_start,
19934- (unsigned long long)t_start + t_size, pattern);
19935+#define NR_RANGE_MR 5
19936
19937- memtest(t_start, t_size, pattern);
19938+static int save_mr(struct map_range *mr, int nr_range,
19939+ unsigned long start_pfn, unsigned long end_pfn,
19940+ unsigned long page_size_mask)
19941+{
19942
19943- t_start += t_size;
19944- }
19945+ if (start_pfn < end_pfn) {
19946+ if (nr_range >= NR_RANGE_MR)
19947+ panic("run out of range for init_memory_mapping\n");
19948+ mr[nr_range].start = start_pfn<<PAGE_SHIFT;
19949+ mr[nr_range].end = end_pfn<<PAGE_SHIFT;
19950+ mr[nr_range].page_size_mask = page_size_mask;
19951+ nr_range++;
19952 }
19953- printk(KERN_CONT "\n");
19954-}
19955-#else
19956-static void __init early_memtest(unsigned long start, unsigned long end)
19957-{
19958+
19959+ return nr_range;
19960 }
19961-#endif
19962
19963 /*
19964 * Setup the direct mapping of the physical memory at PAGE_OFFSET.
19965 * This runs before bootmem is initialized and gets pages directly from
19966 * the physical memory. To access them they are temporarily mapped.
19967 */
19968-unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned long end)
19969+unsigned long __init_refok init_memory_mapping(unsigned long start,
19970+ unsigned long end)
19971 {
19972- unsigned long next, last_map_addr = end;
19973- unsigned long start_phys = start, end_phys = end;
19974+ unsigned long last_map_addr = 0;
19975+ unsigned long page_size_mask = 0;
19976+ unsigned long start_pfn, end_pfn;
19977+
19978+ struct map_range mr[NR_RANGE_MR];
19979+ int nr_range, i;
19980
19981 printk(KERN_INFO "init_memory_mapping\n");
19982
19983@@ -924,51 +833,150 @@ unsigned long __init_refok init_memory_m
19984 * memory mapped. Unfortunately this is done currently before the
19985 * nodes are discovered.
19986 */
19987- if (!after_bootmem) {
19988+ if (!after_bootmem)
19989 init_gbpages();
19990- find_early_table_space(end);
19991+
19992+ if (direct_gbpages)
19993+ page_size_mask |= 1 << PG_LEVEL_1G;
19994+ if (cpu_has_pse)
19995+ page_size_mask |= 1 << PG_LEVEL_2M;
19996+
19997+ memset(mr, 0, sizeof(mr));
19998+ nr_range = 0;
19999+
20000+ /* head if not big page alignment ?*/
20001+ start_pfn = start >> PAGE_SHIFT;
20002+ end_pfn = ((start + (PMD_SIZE - 1)) >> PMD_SHIFT)
20003+ << (PMD_SHIFT - PAGE_SHIFT);
20004+ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
20005+
20006+ /* big page (2M) range*/
20007+ start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
20008+ << (PMD_SHIFT - PAGE_SHIFT);
20009+ end_pfn = ((start + (PUD_SIZE - 1))>>PUD_SHIFT)
20010+ << (PUD_SHIFT - PAGE_SHIFT);
20011+ if (end_pfn > ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT)))
20012+ end_pfn = ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT));
20013+ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
20014+ page_size_mask & (1<<PG_LEVEL_2M));
20015+
20016+ /* big page (1G) range */
20017+ start_pfn = end_pfn;
20018+ end_pfn = (end>>PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
20019+ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
20020+ page_size_mask &
20021+ ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
20022+
20023+ /* tail is not big page (1G) alignment */
20024+ start_pfn = end_pfn;
20025+ end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
20026+ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
20027+ page_size_mask & (1<<PG_LEVEL_2M));
20028+
20029+ /* tail is not big page (2M) alignment */
20030+ start_pfn = end_pfn;
20031+ end_pfn = end>>PAGE_SHIFT;
20032+ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
20033+
20034+ /* try to merge same page size and continuous */
20035+ for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
20036+ unsigned long old_start;
20037+ if (mr[i].end != mr[i+1].start ||
20038+ mr[i].page_size_mask != mr[i+1].page_size_mask)
20039+ continue;
20040+ /* move it */
20041+ old_start = mr[i].start;
20042+ memmove(&mr[i], &mr[i+1],
20043+ (nr_range - 1 - i) * sizeof (struct map_range));
20044+ mr[i--].start = old_start;
20045+ nr_range--;
20046 }
20047
20048- start = (unsigned long)__va(start);
20049- end = (unsigned long)__va(end);
20050+ for (i = 0; i < nr_range; i++)
20051+ printk(KERN_DEBUG " %010lx - %010lx page %s\n",
20052+ mr[i].start, mr[i].end,
20053+ (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
20054+ (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
20055
20056- for (; start < end; start = next) {
20057- pgd_t *pgd = pgd_offset_k(start);
20058- unsigned long pud_phys;
20059- pud_t *pud;
20060+ if (!after_bootmem)
20061+ find_early_table_space(end);
20062
20063- if (after_bootmem)
20064- pud = pud_offset(pgd, start & PGDIR_MASK);
20065- else
20066- pud = alloc_static_page(&pud_phys);
20067- next = start + PGDIR_SIZE;
20068- if (next > end)
20069- next = end;
20070- last_map_addr = phys_pud_init(pud, __pa(start), __pa(next));
20071- if (!after_bootmem) {
20072- early_make_page_readonly(pud, XENFEAT_writable_page_tables);
20073- set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
20074+ if (!start) {
20075+ unsigned long addr, va = __START_KERNEL_map;
20076+ unsigned long *page = (unsigned long *)init_level4_pgt;
20077+
20078+ /* Kill mapping of memory below _text. */
20079+ while (va < (unsigned long)&_text) {
20080+ if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
20081+ BUG();
20082+ va += PAGE_SIZE;
20083+ }
20084+
20085+ /* Blow away any spurious initial mappings. */
20086+ va = __START_KERNEL_map + (table_start << PAGE_SHIFT);
20087+ addr = page[pgd_index(va)];
20088+ addr_to_page(addr, page);
20089+ addr = page[pud_index(va)];
20090+ addr_to_page(addr, page);
20091+ while (pmd_index(va) | pte_index(va)) {
20092+ if (pmd_none(*(pmd_t *)&page[pmd_index(va)]))
20093+ break;
20094+ if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
20095+ BUG();
20096+ va += PAGE_SIZE;
20097 }
20098 }
20099
20100- if (!after_bootmem) {
20101- BUG_ON(start_pfn != table_end);
20102+ for (i = 0; i < nr_range; i++)
20103+ last_map_addr = kernel_physical_mapping_init(
20104+ mr[i].start, mr[i].end,
20105+ mr[i].page_size_mask);
20106+
20107+ BUG_ON(table_cur > table_top);
20108+ if (!start)
20109 xen_finish_init_mapping();
20110- }
20111+ else if (table_cur < table_top)
20112+ /* Disable the 'table_cur' allocator. */
20113+ table_top = table_cur;
20114
20115 __flush_tlb_all();
20116
20117- if (!after_bootmem)
20118+ if (!after_bootmem && table_top > table_start)
20119 reserve_early(table_start << PAGE_SHIFT,
20120- table_end << PAGE_SHIFT, "PGTABLE");
20121+ table_top << PAGE_SHIFT, "PGTABLE");
20122+
20123+ printk(KERN_INFO "last_map_addr: %lx end: %lx\n",
20124+ last_map_addr, end);
20125
20126 if (!after_bootmem)
20127- early_memtest(start_phys, end_phys);
20128+ early_memtest(start, end);
20129
20130- return last_map_addr;
20131+ return last_map_addr >> PAGE_SHIFT;
20132 }
20133
20134 #ifndef CONFIG_NUMA
20135+void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn)
20136+{
20137+ unsigned long bootmap_size, bootmap;
20138+
20139+ e820_register_active_regions(0, start_pfn, end_pfn);
20140+#ifdef CONFIG_XEN
20141+ if (end_pfn > xen_start_info->nr_pages)
20142+ end_pfn = xen_start_info->nr_pages;
20143+#endif
20144+ bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
20145+ bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size,
20146+ PAGE_SIZE);
20147+ if (bootmap == -1L)
20148+ panic("Cannot find bootmem map of size %ld\n", bootmap_size);
20149+ /* don't touch min_low_pfn */
20150+ bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
20151+ 0, end_pfn);
20152+ free_bootmem_with_active_regions(0, end_pfn);
20153+ early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
20154+ reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
20155+}
20156+
20157 void __init paging_init(void)
20158 {
20159 unsigned long max_zone_pfns[MAX_NR_ZONES];
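The init_memory_mapping() hunk above first splits [start, end) into up to five ranges (unaligned head, 2M body, 1G body, 2M tail, 4k tail) and then merges neighbours that are contiguous and share a page_size_mask. The merge step can be exercised on its own; the sketch below uses made-up ranges and prints the result instead of mapping anything.

#include <stdio.h>
#include <string.h>

#define PAGE_SHIFT  12
#define PG_LEVEL_2M 1
#define NR_RANGE_MR 5

struct map_range {
	unsigned long start, end;	/* byte addresses */
	unsigned long page_size_mask;
};

static int save_mr(struct map_range *mr, int nr, unsigned long s_pfn,
		   unsigned long e_pfn, unsigned long mask)
{
	if (s_pfn < e_pfn && nr < NR_RANGE_MR) {
		mr[nr].start = s_pfn << PAGE_SHIFT;
		mr[nr].end = e_pfn << PAGE_SHIFT;
		mr[nr].page_size_mask = mask;
		nr++;
	}
	return nr;
}

int main(void)
{
	struct map_range mr[NR_RANGE_MR];
	int nr_range = 0, i;

	/* 4k head plus two adjacent 2M ranges; the last two should merge */
	nr_range = save_mr(mr, nr_range, 0x001, 0x200, 0);
	nr_range = save_mr(mr, nr_range, 0x200, 0x600, 1 << PG_LEVEL_2M);
	nr_range = save_mr(mr, nr_range, 0x600, 0x800, 1 << PG_LEVEL_2M);

	/* merge neighbours that are contiguous and use the same mask */
	for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
		unsigned long old_start;

		if (mr[i].end != mr[i + 1].start ||
		    mr[i].page_size_mask != mr[i + 1].page_size_mask)
			continue;
		old_start = mr[i].start;
		memmove(&mr[i], &mr[i + 1],
			(nr_range - 1 - i) * sizeof(struct map_range));
		mr[i--].start = old_start;
		nr_range--;
	}

	for (i = 0; i < nr_range; i++)
		printf("%#010lx - %#010lx %s\n", mr[i].start, mr[i].end,
		       (mr[i].page_size_mask & (1 << PG_LEVEL_2M)) ? "2M" : "4k");
	return 0;
}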
20160@@ -976,9 +984,9 @@ void __init paging_init(void)
20161 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
20162 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
20163 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
20164- max_zone_pfns[ZONE_NORMAL] = end_pfn;
20165+ max_zone_pfns[ZONE_NORMAL] = max_pfn;
20166
20167- memory_present(0, 0, end_pfn);
20168+ memory_present(0, 0, max_pfn);
20169 sparse_init();
20170 free_area_init_nodes(max_zone_pfns);
20171
20172@@ -1069,8 +1077,8 @@ void __init mem_init(void)
20173 init_page_count(pfn_to_page(pfn));
20174 totalram_pages++;
20175 }
20176- reservedpages = end_pfn - totalram_pages -
20177- absent_pages_in_range(0, end_pfn);
20178+ reservedpages = max_pfn - totalram_pages -
20179+ absent_pages_in_range(0, max_pfn);
20180 after_bootmem = 1;
20181
20182 codesize = (unsigned long) &_etext - (unsigned long) &_text;
20183@@ -1089,7 +1097,7 @@ void __init mem_init(void)
20184 printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
20185 "%ldk reserved, %ldk data, %ldk init)\n",
20186 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
20187- end_pfn << (PAGE_SHIFT-10),
20188+ max_pfn << (PAGE_SHIFT-10),
20189 codesize >> 10,
20190 reservedpages << (PAGE_SHIFT-10),
20191 datasize >> 10,
20192@@ -1152,6 +1160,13 @@ EXPORT_SYMBOL_GPL(rodata_test_data);
20193 void mark_rodata_ro(void)
20194 {
20195 unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata);
20196+ unsigned long rodata_start =
20197+ ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
20198+
20199+#ifdef CONFIG_DYNAMIC_FTRACE
20200+ /* Dynamic tracing modifies the kernel text section */
20201+ start = rodata_start;
20202+#endif
20203
20204 printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
20205 (end - start) >> 10);
20206@@ -1161,8 +1176,7 @@ void mark_rodata_ro(void)
20207 * The rodata section (but not the kernel text!) should also be
20208 * not-executable.
20209 */
20210- start = ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
20211- set_memory_nx(start, (end - start) >> PAGE_SHIFT);
20212+ set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT);
20213
20214 rodata_test();
20215
20216@@ -1184,24 +1198,26 @@ void free_initrd_mem(unsigned long start
20217 }
20218 #endif
20219
20220-void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
20221+int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
20222+ int flags)
20223 {
20224 #ifdef CONFIG_NUMA
20225 int nid, next_nid;
20226+ int ret;
20227 #endif
20228 unsigned long pfn = phys >> PAGE_SHIFT;
20229
20230- if (pfn >= end_pfn) {
20231+ if (pfn >= max_pfn) {
20232 /*
20233 * This can happen with kdump kernels when accessing
20234 * firmware tables:
20235 */
20236 if (pfn < max_pfn_mapped)
20237- return;
20238+ return -EFAULT;
20239
20240- printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
20241+ printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %lu\n",
20242 phys, len);
20243- return;
20244+ return -EFAULT;
20245 }
20246
20247 /* Should check here against the e820 map to avoid double free */
20248@@ -1209,9 +1225,13 @@ void __init reserve_bootmem_generic(unsi
20249 nid = phys_to_nid(phys);
20250 next_nid = phys_to_nid(phys + len - 1);
20251 if (nid == next_nid)
20252- reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT);
20253+ ret = reserve_bootmem_node(NODE_DATA(nid), phys, len, flags);
20254 else
20255- reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
20256+ ret = reserve_bootmem(phys, len, flags);
20257+
20258+ if (ret != 0)
20259+ return ret;
20260+
20261 #else
20262 reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
20263 #endif
20264@@ -1222,6 +1242,8 @@ void __init reserve_bootmem_generic(unsi
20265 set_dma_reserve(dma_reserve);
20266 }
20267 #endif
20268+
20269+ return 0;
20270 }
20271
20272 int kern_addr_valid(unsigned long addr)
20273@@ -1326,7 +1348,7 @@ vmemmap_populate(struct page *start_page
20274 pmd_t *pmd;
20275
20276 for (; addr < end; addr = next) {
20277- next = pmd_addr_end(addr, end);
20278+ void *p = NULL;
20279
20280 pgd = vmemmap_pgd_populate(addr, node);
20281 if (!pgd)
20282@@ -1336,33 +1358,51 @@ vmemmap_populate(struct page *start_page
20283 if (!pud)
20284 return -ENOMEM;
20285
20286- pmd = pmd_offset(pud, addr);
20287- if (pmd_none(*pmd)) {
20288- pte_t entry;
20289- void *p;
20290+ if (!cpu_has_pse) {
20291+ next = (addr + PAGE_SIZE) & PAGE_MASK;
20292+ pmd = vmemmap_pmd_populate(pud, addr, node);
20293+
20294+ if (!pmd)
20295+ return -ENOMEM;
20296+
20297+ p = vmemmap_pte_populate(pmd, addr, node);
20298
20299- p = vmemmap_alloc_block(PMD_SIZE, node);
20300 if (!p)
20301 return -ENOMEM;
20302
20303- entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
20304- PAGE_KERNEL_LARGE);
20305- set_pmd(pmd, __pmd_ma(__pte_val(entry)));
20306-
20307- /* check to see if we have contiguous blocks */
20308- if (p_end != p || node_start != node) {
20309- if (p_start)
20310- printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
20311- addr_start, addr_end-1, p_start, p_end-1, node_start);
20312- addr_start = addr;
20313- node_start = node;
20314- p_start = p;
20315- }
20316- addr_end = addr + PMD_SIZE;
20317- p_end = p + PMD_SIZE;
20318+ addr_end = addr + PAGE_SIZE;
20319+ p_end = p + PAGE_SIZE;
20320 } else {
20321- vmemmap_verify((pte_t *)pmd, node, addr, next);
20322+ next = pmd_addr_end(addr, end);
20323+
20324+ pmd = pmd_offset(pud, addr);
20325+ if (pmd_none(*pmd)) {
20326+ pte_t entry;
20327+
20328+ p = vmemmap_alloc_block(PMD_SIZE, node);
20329+ if (!p)
20330+ return -ENOMEM;
20331+
20332+ entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
20333+ PAGE_KERNEL_LARGE);
20334+ set_pmd(pmd, __pmd_ma(__pte_val(entry)));
20335+
20336+ /* check to see if we have contiguous blocks */
20337+ if (p_end != p || node_start != node) {
20338+ if (p_start)
20339+ printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
20340+ addr_start, addr_end-1, p_start, p_end-1, node_start);
20341+ addr_start = addr;
20342+ node_start = node;
20343+ p_start = p;
20344+ }
20345+
20346+ addr_end = addr + PMD_SIZE;
20347+ p_end = p + PMD_SIZE;
20348+ } else
20349+ vmemmap_verify((pte_t *)pmd, node, addr, next);
20350 }
20351+
20352 }
20353 return 0;
20354 }
20355--- sle11-2009-06-04.orig/arch/x86/mm/ioremap-xen.c 2009-03-16 16:38:05.000000000 +0100
20356+++ sle11-2009-06-04/arch/x86/mm/ioremap-xen.c 2009-06-04 10:21:39.000000000 +0200
20357@@ -13,6 +13,7 @@
20358 #include <linux/pfn.h>
20359 #include <linux/slab.h>
20360 #include <linux/vmalloc.h>
20361+#include <linux/mmiotrace.h>
20362
20363 #include <asm/cacheflush.h>
20364 #include <asm/e820.h>
20365@@ -274,7 +275,8 @@ int ioremap_check_change_attr(unsigned l
20366 for (sz = rc = 0; sz < size && !rc; ++mfn, sz += PAGE_SIZE) {
20367 unsigned long pfn = mfn_to_local_pfn(mfn);
20368
20369- if (pfn >= max_pfn_mapped)
20370+ if (pfn >= max_low_pfn_mapped &&
20371+ (pfn < (1UL<<(32 - PAGE_SHIFT)) || pfn >= max_pfn_mapped))
20372 continue;
20373 rc = ioremap_change_attr((unsigned long)__va(pfn << PAGE_SHIFT),
20374 PAGE_SIZE, prot_val);
20375@@ -297,11 +299,14 @@ static void __iomem *__ioremap_caller(re
20376 {
20377 unsigned long mfn, offset, vaddr;
20378 resource_size_t last_addr;
20379+ const resource_size_t unaligned_phys_addr = phys_addr;
20380+ const unsigned long unaligned_size = size;
20381 struct vm_struct *area;
20382 unsigned long new_prot_val;
20383 pgprot_t prot;
20384 int retval;
20385 domid_t domid = DOMID_IO;
20386+ void __iomem *ret_addr;
20387
20388 /* Don't allow wraparound or zero size */
20389 last_addr = phys_addr + size - 1;
20390@@ -318,7 +323,7 @@ static void __iomem *__ioremap_caller(re
20391 /*
20392 * Don't remap the low PCI/ISA area, it's always mapped..
20393 */
20394- if (is_initial_xendomain() && last_addr < ISA_END_ADDRESS)
20395+ if (is_initial_xendomain() && is_ISA_range(phys_addr, last_addr))
20396 return (__force void __iomem *)isa_bus_to_virt((unsigned long)phys_addr);
20397
20398 /*
20399@@ -342,7 +347,7 @@ static void __iomem *__ioremap_caller(re
20400 phys_addr &= PAGE_MASK;
20401 size = PAGE_ALIGN(last_addr+1) - phys_addr;
20402
20403- retval = reserve_memtype(phys_addr, phys_addr + size,
20404+ retval = reserve_memtype(phys_addr, (u64)phys_addr + size,
20405 prot_val, &new_prot_val);
20406 if (retval) {
20407 pr_debug("Warning: reserve_memtype returned %d\n", retval);
20408@@ -410,7 +415,10 @@ static void __iomem *__ioremap_caller(re
20409 return NULL;
20410 }
20411
20412- return (void __iomem *) (vaddr + offset);
20413+ ret_addr = (void __iomem *) (vaddr + offset);
20414+ mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr);
20415+
20416+ return ret_addr;
20417 }
20418
20419 /**
20420@@ -438,7 +446,7 @@ void __iomem *ioremap_nocache(resource_s
20421 {
20422 /*
20423 * Ideally, this should be:
20424- * pat_wc_enabled ? _PAGE_CACHE_UC : _PAGE_CACHE_UC_MINUS;
20425+ * pat_enabled ? _PAGE_CACHE_UC : _PAGE_CACHE_UC_MINUS;
20426 *
20427 * Till we fix all X drivers to use ioremap_wc(), we will use
20428 * UC MINUS.
20429@@ -462,7 +470,7 @@ EXPORT_SYMBOL(ioremap_nocache);
20430 */
20431 void __iomem *ioremap_wc(unsigned long phys_addr, unsigned long size)
20432 {
20433- if (pat_wc_enabled)
20434+ if (pat_enabled)
20435 return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC,
20436 __builtin_return_address(0));
20437 else
20438@@ -502,6 +510,14 @@ static void __iomem *ioremap_default(res
20439 }
20440 #endif
20441
20442+void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
20443+ unsigned long prot_val)
20444+{
20445+ return __ioremap_caller(phys_addr, size, (prot_val & _PAGE_CACHE_MASK),
20446+ __builtin_return_address(0));
20447+}
20448+EXPORT_SYMBOL(ioremap_prot);
20449+
20450 /**
20451 * iounmap - Free a IO remapping
20452 * @addr: virtual address from ioremap_*
20453@@ -526,6 +542,8 @@ void iounmap(volatile void __iomem *addr
20454 addr = (volatile void __iomem *)
20455 (PAGE_MASK & (unsigned long __force)addr);
20456
20457+ mmiotrace_iounmap(addr);
20458+
20459 /* Use the vm area unlocked, assuming the caller
20460 ensures there isn't another iounmap for the same address
20461 in parallel. Reuse of the virtual address is prevented by
20462@@ -533,7 +551,7 @@ void iounmap(volatile void __iomem *addr
20463 cpa takes care of the direct mappings. */
20464 read_lock(&vmlist_lock);
20465 for (p = vmlist; p; p = p->next) {
20466- if (p->addr == addr)
20467+ if (p->addr == (void __force *)addr)
20468 break;
20469 }
20470 read_unlock(&vmlist_lock);
20471@@ -547,7 +565,7 @@ void iounmap(volatile void __iomem *addr
20472 free_memtype(p->phys_addr, p->phys_addr + get_vm_area_size(p));
20473
20474 /* Finally remove it */
20475- o = remove_vm_area((void *)addr);
20476+ o = remove_vm_area((void __force *)addr);
20477 BUG_ON(p != o || o == NULL);
20478 kfree(p);
20479 }
20480@@ -567,7 +585,7 @@ void *xlate_dev_mem_ptr(unsigned long ph
20481 if (page_is_ram(start >> PAGE_SHIFT))
20482 return __va(phys);
20483
20484- addr = (void *)ioremap_default(start, PAGE_SIZE);
20485+ addr = (void __force *)ioremap_default(start, PAGE_SIZE);
20486 if (addr)
20487 addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK));
20488
20489@@ -595,8 +613,7 @@ static int __init early_ioremap_debug_se
20490 early_param("early_ioremap_debug", early_ioremap_debug_setup);
20491
20492 static __initdata int after_paging_init;
20493-static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)]
20494- __section(.bss.page_aligned);
20495+static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss;
20496
20497 #ifdef CONFIG_X86_32
20498 static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
20499@@ -695,10 +712,11 @@ static void __init __early_set_fixmap(en
20500 return;
20501 }
20502 pte = early_ioremap_pte(addr);
20503+
20504 if (pgprot_val(flags))
20505 set_pte(pte, pfn_pte_ma(phys >> PAGE_SHIFT, flags));
20506 else
20507- pte_clear(NULL, addr, pte);
20508+ pte_clear(&init_mm, addr, pte);
20509 __flush_tlb_one(addr);
20510 }
20511
20512@@ -726,13 +744,11 @@ static int __init check_early_ioremap_le
20513 {
20514 if (!early_ioremap_nested)
20515 return 0;
20516-
20517- printk(KERN_WARNING
20518+ WARN(1, KERN_WARNING
20519 "Debug warning: early ioremap leak of %d areas detected.\n",
20520- early_ioremap_nested);
20521+ early_ioremap_nested);
20522 printk(KERN_WARNING
20523- "please boot with early_ioremap_debug and report the dmesg.\n");
20524- WARN_ON(1);
20525+ "please boot with early_ioremap_debug and report the dmesg.\n");
20526
20527 return 1;
20528 }
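As the ioremap-xen.c changes above show, ioremap_wc() only asks for write-combining when PAT is active and otherwise degrades to the UC_MINUS behaviour of ioremap_nocache(). A toy version of that decision, with stand-in constants rather than the kernel's definitions, looks like this:

#include <stdio.h>

#define _PAGE_CACHE_WC		0x01UL
#define _PAGE_CACHE_UC_MINUS	0x02UL

static int pat_enabled = 1;

static unsigned long wc_mapping_mode(void)
{
	/* with PAT: real write-combining; without it: UC_MINUS fallback */
	return pat_enabled ? _PAGE_CACHE_WC : _PAGE_CACHE_UC_MINUS;
}

int main(void)
{
	printf("mode with PAT:    %#lx\n", wc_mapping_mode());
	pat_enabled = 0;
	printf("mode without PAT: %#lx\n", wc_mapping_mode());
	return 0;
}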
20529--- sle11-2009-06-04.orig/arch/x86/mm/pageattr-xen.c 2009-03-16 16:38:05.000000000 +0100
20530+++ sle11-2009-06-04/arch/x86/mm/pageattr-xen.c 2009-06-04 10:21:39.000000000 +0200
20531@@ -34,6 +34,47 @@ struct cpa_data {
20532 unsigned force_split : 1;
20533 };
20534
20535+#ifdef CONFIG_PROC_FS
20536+static unsigned long direct_pages_count[PG_LEVEL_NUM];
20537+
20538+void update_page_count(int level, unsigned long pages)
20539+{
20540+ unsigned long flags;
20541+
20542+ /* Protect against CPA */
20543+ spin_lock_irqsave(&pgd_lock, flags);
20544+ direct_pages_count[level] += pages;
20545+ spin_unlock_irqrestore(&pgd_lock, flags);
20546+}
20547+
20548+static void split_page_count(int level)
20549+{
20550+ direct_pages_count[level]--;
20551+ direct_pages_count[level - 1] += PTRS_PER_PTE;
20552+}
20553+
20554+int arch_report_meminfo(char *page)
20555+{
20556+ int n = sprintf(page, "DirectMap4k: %8lu kB\n",
20557+ direct_pages_count[PG_LEVEL_4K] << 2);
20558+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
20559+ n += sprintf(page + n, "DirectMap2M: %8lu kB\n",
20560+ direct_pages_count[PG_LEVEL_2M] << 11);
20561+#else
20562+ n += sprintf(page + n, "DirectMap4M: %8lu kB\n",
20563+ direct_pages_count[PG_LEVEL_2M] << 12);
20564+#endif
20565+#ifdef CONFIG_X86_64
20566+ if (direct_gbpages)
20567+ n += sprintf(page + n, "DirectMap1G: %8lu kB\n",
20568+ direct_pages_count[PG_LEVEL_1G] << 20);
20569+#endif
20570+ return n;
20571+}
20572+#else
20573+static inline void split_page_count(int level) { }
20574+#endif
20575+
20576 #ifdef CONFIG_X86_64
20577
20578 static inline unsigned long highmap_start_pfn(void)
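The DirectMap counters added above are reported in kilobytes by shifting the page counts: a 4k page is 4 kB (<<2), a 2M page is 2048 kB (<<11) and a 1G page is 1048576 kB (<<20). A stand-alone illustration with made-up counts:

#include <stdio.h>

int main(void)
{
	unsigned long pages_4k = 1024, pages_2m = 512, pages_1g = 3;

	printf("DirectMap4k: %8lu kB\n", pages_4k << 2);	/* 4 kB per page */
	printf("DirectMap2M: %8lu kB\n", pages_2m << 11);	/* 2048 kB per page */
	printf("DirectMap1G: %8lu kB\n", pages_1g << 20);	/* 1 GiB per page */
	return 0;
}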
20579@@ -106,7 +147,7 @@ static void cpa_flush_all(unsigned long
20580 {
20581 BUG_ON(irqs_disabled());
20582
20583- on_each_cpu(__cpa_flush_all, (void *) cache, 1, 1);
20584+ on_each_cpu(__cpa_flush_all, (void *) cache, 1);
20585 }
20586
20587 static void __cpa_flush_range(void *arg)
20588@@ -127,7 +168,7 @@ static void cpa_flush_range(unsigned lon
20589 BUG_ON(irqs_disabled());
20590 WARN_ON(PAGE_ALIGN(start) != start);
20591
20592- on_each_cpu(__cpa_flush_range, NULL, 1, 1);
20593+ on_each_cpu(__cpa_flush_range, NULL, 1);
20594
20595 if (!cache)
20596 return;
20597@@ -229,6 +270,7 @@ pte_t *lookup_address(unsigned long addr
20598
20599 return pte_offset_kernel(pmd, address);
20600 }
20601+EXPORT_SYMBOL_GPL(lookup_address);
20602
20603 /*
20604 * Set the new pmd in all the pgds we know about:
20605@@ -509,6 +551,16 @@ static int split_large_page(pte_t *kpte,
20606 }
20607 #endif
20608
20609+ if (address >= (unsigned long)__va(0) &&
20610+ address < (unsigned long)__va(max_low_pfn_mapped << PAGE_SHIFT))
20611+ split_page_count(level);
20612+
20613+#ifdef CONFIG_X86_64
20614+ if (address >= (unsigned long)__va(1UL<<32) &&
20615+ address < (unsigned long)__va(max_pfn_mapped << PAGE_SHIFT))
20616+ split_page_count(level);
20617+#endif
20618+
20619 /*
20620 * Get the target mfn from the original entry:
20621 */
20622@@ -566,10 +618,9 @@ repeat:
20623 if (!__pte_val(old_pte)) {
20624 if (!primary)
20625 return 0;
20626- printk(KERN_WARNING "CPA: called for zero pte. "
20627+ WARN(1, KERN_WARNING "CPA: called for zero pte. "
20628 "vaddr = %lx cpa->vaddr = %lx\n", address,
20629 cpa->vaddr);
20630- WARN_ON(1);
20631 return -EINVAL;
20632 }
20633
20634@@ -634,15 +685,24 @@ static int cpa_process_alias(struct cpa_
20635 struct cpa_data alias_cpa;
20636 int ret = 0;
20637
20638- if (cpa->pfn > max_pfn_mapped)
20639+ if (cpa->pfn >= max_pfn_mapped)
20640 return 0;
20641
20642+#ifdef CONFIG_X86_64
20643+ if (cpa->pfn >= max_low_pfn_mapped && cpa->pfn < (1UL<<(32-PAGE_SHIFT)))
20644+ return 0;
20645+#endif
20646 /*
20647 * No need to redo, when the primary call touched the direct
20648 * mapping already:
20649 */
20650- if (!within(cpa->vaddr, PAGE_OFFSET,
20651- PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) {
20652+ if (!(within(cpa->vaddr, PAGE_OFFSET,
20653+ PAGE_OFFSET + (max_low_pfn_mapped << PAGE_SHIFT))
20654+#ifdef CONFIG_X86_64
20655+ || within(cpa->vaddr, PAGE_OFFSET + (1UL<<32),
20656+ PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))
20657+#endif
20658+ )) {
20659
20660 alias_cpa = *cpa;
20661 alias_cpa.vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);
20662@@ -796,6 +856,51 @@ static inline int change_page_attr_clear
20663 return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0);
20664 }
20665
20666+#ifdef CONFIG_XEN
20667+static void _free_memtype(u64 pstart, u64 pend)
20668+{
20669+ u64 pa = pstart &= __PHYSICAL_MASK;
20670+ u64 ma = phys_to_machine(pa);
20671+
20672+ while ((pa += PAGE_SIZE) < pend) {
20673+ if (phys_to_machine(pa) != ma + (pa - pstart)) {
20674+ free_memtype(ma, ma + (pa - pstart));
20675+ pstart = pa;
20676+ ma = phys_to_machine(pa);
20677+ }
20678+ }
20679+ free_memtype(ma, ma + (pend - pstart));
20680+}
20681+#define free_memtype _free_memtype
20682+
20683+static int _reserve_memtype(u64 pstart, u64 pend, unsigned long req_type)
20684+{
20685+ u64 pcur = pstart &= __PHYSICAL_MASK, pa = pcur;
20686+ u64 ma = phys_to_machine(pa);
20687+ int rc = 0;
20688+
20689+ while ((pa += PAGE_SIZE) < pend) {
20690+ if (phys_to_machine(pa) != ma + (pa - pcur)) {
20691+ rc = reserve_memtype(ma, ma + (pa - pcur),
20692+ req_type, NULL);
20693+ if (rc)
20694+ break;
20695+ pcur = pa;
20696+ ma = phys_to_machine(pa);
20697+ }
20698+ }
20699+ if (likely(!rc))
20700+ rc = reserve_memtype(ma, ma + (pend - pcur), req_type, NULL);
20701+
20702+ if (unlikely(!rc) && pstart < pcur)
20703+ _free_memtype(pstart, pcur);
20704+
20705+ return rc;
20706+}
20707+#define reserve_memtype(s, e, r, n) \
20708+ _reserve_memtype(s, e, BUILD_BUG_ON_ZERO(n) ?: (r))
20709+#endif
20710+
20711 int _set_memory_uc(unsigned long addr, int numpages)
20712 {
20713 /*
20714@@ -810,7 +915,7 @@ int set_memory_uc(unsigned long addr, in
cc90b958
BS
20715 /*
20716 * for now UC MINUS. see comments in ioremap_nocache()
20717 */
20718- if (reserve_memtype(addr, addr + numpages * PAGE_SIZE,
20719+ if (reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
20720 _PAGE_CACHE_UC_MINUS, NULL))
20721 return -EINVAL;
20722
20723@@ -826,10 +931,10 @@ int _set_memory_wc(unsigned long addr, i
20724
20725 int set_memory_wc(unsigned long addr, int numpages)
20726 {
20727- if (!pat_wc_enabled)
20728+ if (!pat_enabled)
20729 return set_memory_uc(addr, numpages);
20730
20731- if (reserve_memtype(addr, addr + numpages * PAGE_SIZE,
20732+ if (reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
20733 _PAGE_CACHE_WC, NULL))
20734 return -EINVAL;
20735
20736@@ -845,7 +950,7 @@ int _set_memory_wb(unsigned long addr, i
20737
20738 int set_memory_wb(unsigned long addr, int numpages)
20739 {
20740- free_memtype(addr, addr + numpages * PAGE_SIZE);
20741+ free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
20742
20743 return _set_memory_wb(addr, numpages);
20744 }
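The Xen-only _reserve_memtype()/_free_memtype() wrappers added above walk a pseudo-physical range page by page and cut it wherever the machine addresses stop being contiguous, handling each machine-contiguous chunk separately. The sketch below reproduces that chunking with a toy phys_to_machine(); the real lookup goes through the p2m table and the real chunks are handed to reserve_memtype().

#include <stdio.h>

#define PAGE_SIZE 4096UL

/* Toy p2m: pages 0-3 are machine-contiguous, pages 4+ live elsewhere. */
static unsigned long phys_to_machine(unsigned long pa)
{
	unsigned long pfn = pa / PAGE_SIZE;

	return (pfn < 4 ? 0x100000UL + pfn * PAGE_SIZE
			: 0x800000UL + pfn * PAGE_SIZE) + (pa & (PAGE_SIZE - 1));
}

static void reserve_chunk(unsigned long ma_start, unsigned long ma_end)
{
	printf("reserve_memtype(%#lx, %#lx)\n", ma_start, ma_end);
}

static void reserve_split(unsigned long pstart, unsigned long pend)
{
	unsigned long pcur = pstart, pa = pstart;
	unsigned long ma = phys_to_machine(pa);

	while ((pa += PAGE_SIZE) < pend) {
		if (phys_to_machine(pa) != ma + (pa - pcur)) {
			reserve_chunk(ma, ma + (pa - pcur));
			pcur = pa;
			ma = phys_to_machine(pa);
		}
	}
	reserve_chunk(ma, ma + (pend - pcur));
}

int main(void)
{
	reserve_split(0, 6 * PAGE_SIZE);	/* splits after the 4th page */
	return 0;
}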
20745--- sle11-2009-06-04.orig/arch/x86/mm/pat-xen.c 2009-03-16 16:38:05.000000000 +0100
20746+++ sle11-2009-06-04/arch/x86/mm/pat-xen.c 2009-06-04 10:21:39.000000000 +0200
20747@@ -12,6 +12,8 @@
20748 #include <linux/gfp.h>
20749 #include <linux/fs.h>
20750 #include <linux/bootmem.h>
20751+#include <linux/debugfs.h>
20752+#include <linux/seq_file.h>
20753
20754 #include <asm/msr.h>
20755 #include <asm/tlbflush.h>
20756@@ -26,11 +28,11 @@
20757 #include <asm/io.h>
20758
20759 #ifdef CONFIG_X86_PAT
20760-int __read_mostly pat_wc_enabled = 1;
20761+int __read_mostly pat_enabled = 1;
20762
20763 void __cpuinit pat_disable(char *reason)
20764 {
20765- pat_wc_enabled = 0;
20766+ pat_enabled = 0;
20767 printk(KERN_INFO "%s\n", reason);
20768 }
20769
20770@@ -42,6 +44,19 @@ static int __init nopat(char *str)
20771 early_param("nopat", nopat);
20772 #endif
20773
20774+
20775+static int debug_enable;
20776+static int __init pat_debug_setup(char *str)
20777+{
20778+ debug_enable = 1;
20779+ return 0;
20780+}
20781+__setup("debugpat", pat_debug_setup);
20782+
20783+#define dprintk(fmt, arg...) \
20784+ do { if (debug_enable) printk(KERN_INFO fmt, ##arg); } while (0)
20785+
20786+
20787 static u64 __read_mostly boot_pat_state;
20788
20789 enum {
20790@@ -53,24 +68,25 @@ enum {
20791 PAT_UC_MINUS = 7, /* UC, but can be overriden by MTRR */
20792 };
20793
20794-#define PAT(x,y) ((u64)PAT_ ## y << ((x)*8))
20795+#define PAT(x, y) ((u64)PAT_ ## y << ((x)*8))
20796
20797 void pat_init(void)
20798 {
20799 u64 pat;
20800
20801- if (!pat_wc_enabled)
20802+ if (!pat_enabled)
20803 return;
20804
20805 /* Paranoia check. */
20806- if (!cpu_has_pat) {
20807- printk(KERN_ERR "PAT enabled, but CPU feature cleared\n");
20808+ if (!cpu_has_pat && boot_pat_state) {
20809 /*
20810- * Panic if this happens on the secondary CPU, and we
20811+ * If this happens we are on a secondary CPU, but
20812 * switched to PAT on the boot CPU. We have no way to
20813 * undo PAT.
20814- */
20815- BUG_ON(boot_pat_state);
20816+ */
20817+ printk(KERN_ERR "PAT enabled, "
20818+ "but not supported by secondary CPU\n");
20819+ BUG();
20820 }
20821
20822 #ifndef CONFIG_XEN
20823@@ -87,8 +103,8 @@ void pat_init(void)
20824 * 011 UC _PAGE_CACHE_UC
20825 * PAT bit unused
20826 */
20827- pat = PAT(0,WB) | PAT(1,WC) | PAT(2,UC_MINUS) | PAT(3,UC) |
20828- PAT(4,WB) | PAT(5,WC) | PAT(6,UC_MINUS) | PAT(7,UC);
20829+ pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
20830+ PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC);
20831
20832 /* Boot CPU check */
20833 if (!boot_pat_state)
20834@@ -113,13 +129,13 @@ void pat_init(void)
20835 static char *cattr_name(unsigned long flags)
20836 {
20837 switch (flags & _PAGE_CACHE_MASK) {
20838- case _PAGE_CACHE_UC: return "uncached";
20839- case _PAGE_CACHE_UC_MINUS: return "uncached-minus";
20840- case _PAGE_CACHE_WB: return "write-back";
20841- case _PAGE_CACHE_WC: return "write-combining";
20842- case _PAGE_CACHE_WP: return "write-protected";
20843- case _PAGE_CACHE_WT: return "write-through";
20844- default: return "broken";
20845+ case _PAGE_CACHE_UC: return "uncached";
20846+ case _PAGE_CACHE_UC_MINUS: return "uncached-minus";
20847+ case _PAGE_CACHE_WB: return "write-back";
20848+ case _PAGE_CACHE_WC: return "write-combining";
20849+ case _PAGE_CACHE_WP: return "write-protected";
20850+ case _PAGE_CACHE_WT: return "write-through";
20851+ default: return "broken";
20852 }
20853 }
20854
20855@@ -157,49 +173,55 @@ static DEFINE_SPINLOCK(memtype_lock); /
20856 * The intersection is based on "Effective Memory Type" tables in IA-32
20857 * SDM vol 3a
20858 */
20859-static int pat_x_mtrr_type(u64 start, u64 end, unsigned long prot,
20860- unsigned long *ret_prot)
20861+static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type)
20862 {
20863- unsigned long pat_type;
20864- u8 mtrr_type;
20865-
20866- pat_type = prot & _PAGE_CACHE_MASK;
20867- prot &= (~_PAGE_CACHE_MASK);
20868-
20869- /*
20870- * We return the PAT request directly for types where PAT takes
20871- * precedence with respect to MTRR and for UC_MINUS.
20872- * Consistency checks with other PAT requests is done later
20873- * while going through memtype list.
20874- */
20875- if (pat_type == _PAGE_CACHE_WC) {
20876- *ret_prot = prot | _PAGE_CACHE_WC;
20877- return 0;
20878- } else if (pat_type == _PAGE_CACHE_UC_MINUS) {
20879- *ret_prot = prot | _PAGE_CACHE_UC_MINUS;
20880- return 0;
20881- } else if (pat_type == _PAGE_CACHE_UC) {
20882- *ret_prot = prot | _PAGE_CACHE_UC;
20883- return 0;
20884- }
20885-
20886 /*
20887 * Look for MTRR hint to get the effective type in case where PAT
20888 * request is for WB.
20889 */
20890- mtrr_type = mtrr_type_lookup(start, end);
20891+ if (req_type == _PAGE_CACHE_WB) {
20892+ u8 mtrr_type;
20893
20894- if (mtrr_type == MTRR_TYPE_UNCACHABLE) {
20895- *ret_prot = prot | _PAGE_CACHE_UC;
20896- } else if (mtrr_type == MTRR_TYPE_WRCOMB) {
20897- *ret_prot = prot | _PAGE_CACHE_WC;
20898- } else {
20899- *ret_prot = prot | _PAGE_CACHE_WB;
20900+ mtrr_type = mtrr_type_lookup(start, end);
20901+ if (mtrr_type == MTRR_TYPE_UNCACHABLE)
20902+ return _PAGE_CACHE_UC;
20903+ if (mtrr_type == MTRR_TYPE_WRCOMB)
20904+ return _PAGE_CACHE_WC;
20905+ }
20906+
20907+ return req_type;
20908+}
20909+
20910+static int chk_conflict(struct memtype *new, struct memtype *entry,
20911+ unsigned long *type)
20912+{
20913+ if (new->type != entry->type) {
20914+ if (type) {
20915+ new->type = entry->type;
20916+ *type = entry->type;
20917+ } else
20918+ goto conflict;
20919 }
20920
20921+ /* check overlaps with more than one entry in the list */
20922+ list_for_each_entry_continue(entry, &memtype_list, nd) {
20923+ if (new->end <= entry->start)
20924+ break;
20925+ else if (new->type != entry->type)
20926+ goto conflict;
20927+ }
20928 return 0;
20929+
20930+ conflict:
20931+ printk(KERN_INFO "%s:%d conflicting memory types "
20932+ "%Lx-%Lx %s<->%s\n", current->comm, current->pid, new->start,
20933+ new->end, cattr_name(new->type), cattr_name(entry->type));
20934+ return -EBUSY;
20935 }
20936
20937+static struct memtype *cached_entry;
20938+static u64 cached_start;
20939+
20940 /*
20941 * req_type typically has one of the:
20942 * - _PAGE_CACHE_WB
20943@@ -210,37 +232,36 @@ static int pat_x_mtrr_type(u64 start, u6
20944 * req_type will have a special case value '-1', when requester want to inherit
20945 * the memory type from mtrr (if WB), existing PAT, defaulting to UC_MINUS.
20946 *
20947- * If ret_type is NULL, function will return an error if it cannot reserve the
20948- * region with req_type. If ret_type is non-null, function will return
20949- * available type in ret_type in case of no error. In case of any error
20950+ * If new_type is NULL, function will return an error if it cannot reserve the
20951+ * region with req_type. If new_type is non-NULL, function will return
20952+ * available type in new_type in case of no error. In case of any error
20953 * it will return a negative return value.
20954 */
20955 int reserve_memtype(u64 start, u64 end, unsigned long req_type,
20956- unsigned long *ret_type)
20957+ unsigned long *new_type)
20958 {
20959- struct memtype *new_entry = NULL;
20960- struct memtype *parse;
20961+ struct memtype *new, *entry;
20962 unsigned long actual_type;
20963+ struct list_head *where;
20964 int err = 0;
20965
20966- /* Only track when pat_wc_enabled */
20967- if (!pat_wc_enabled) {
20968+ BUG_ON(start >= end); /* end is exclusive */
20969+
20970+ if (!pat_enabled) {
20971 /* This is identical to page table setting without PAT */
20972- if (ret_type) {
20973- if (req_type == -1) {
20974- *ret_type = _PAGE_CACHE_WB;
20975- } else {
20976- *ret_type = req_type;
20977- }
20978+ if (new_type) {
20979+ if (req_type == -1)
20980+ *new_type = _PAGE_CACHE_WB;
20981+ else
20982+ *new_type = req_type & _PAGE_CACHE_MASK;
20983 }
20984 return 0;
20985 }
20986
20987 /* Low ISA region is always mapped WB in page table. No need to track */
20988- if (start >= ISA_START_ADDRESS && (end - 1) <= ISA_END_ADDRESS) {
20989- if (ret_type)
20990- *ret_type = _PAGE_CACHE_WB;
20991-
20992+ if (is_ISA_range(start, end - 1)) {
20993+ if (new_type)
20994+ *new_type = _PAGE_CACHE_WB;
20995 return 0;
20996 }
20997
20998@@ -253,206 +274,118 @@ int reserve_memtype(u64 start, u64 end,
20999 */
21000 u8 mtrr_type = mtrr_type_lookup(start, end);
21001
21002- if (mtrr_type == MTRR_TYPE_WRBACK) {
21003- req_type = _PAGE_CACHE_WB;
21004+ if (mtrr_type == MTRR_TYPE_WRBACK)
21005 actual_type = _PAGE_CACHE_WB;
21006- } else {
21007- req_type = _PAGE_CACHE_UC_MINUS;
21008+ else
21009 actual_type = _PAGE_CACHE_UC_MINUS;
21010- }
21011- } else {
21012- req_type &= _PAGE_CACHE_MASK;
21013- err = pat_x_mtrr_type(start, end, req_type, &actual_type);
21014- }
21015-
21016- if (err) {
21017- if (ret_type)
21018- *ret_type = actual_type;
21019+ } else
21020+ actual_type = pat_x_mtrr_type(start, end,
21021+ req_type & _PAGE_CACHE_MASK);
21022
21023- return -EINVAL;
21024- }
21025-
21026- new_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL);
21027- if (!new_entry)
21028+ new = kmalloc(sizeof(struct memtype), GFP_KERNEL);
21029+ if (!new)
21030 return -ENOMEM;
21031
21032- new_entry->start = start;
21033- new_entry->end = end;
21034- new_entry->type = actual_type;
21035+ new->start = start;
21036+ new->end = end;
21037+ new->type = actual_type;
21038
21039- if (ret_type)
21040- *ret_type = actual_type;
21041+ if (new_type)
21042+ *new_type = actual_type;
21043
21044 spin_lock(&memtype_lock);
21045
21046- /* Search for existing mapping that overlaps the current range */
21047- list_for_each_entry(parse, &memtype_list, nd) {
21048- struct memtype *saved_ptr;
21049+ if (cached_entry && start >= cached_start)
21050+ entry = cached_entry;
21051+ else
21052+ entry = list_entry(&memtype_list, struct memtype, nd);
21053
21054- if (parse->start >= end) {
21055- pr_debug("New Entry\n");
21056- list_add(&new_entry->nd, parse->nd.prev);
21057- new_entry = NULL;
21058+ /* Search for existing mapping that overlaps the current range */
21059+ where = NULL;
21060+ list_for_each_entry_continue(entry, &memtype_list, nd) {
21061+ if (end <= entry->start) {
21062+ where = entry->nd.prev;
21063+ cached_entry = list_entry(where, struct memtype, nd);
21064 break;
21065- }
21066-
21067- if (start <= parse->start && end >= parse->start) {
21068- if (actual_type != parse->type && ret_type) {
21069- actual_type = parse->type;
21070- *ret_type = actual_type;
21071- new_entry->type = actual_type;
21072- }
21073-
21074- if (actual_type != parse->type) {
21075- printk(
21076- KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
21077- current->comm, current->pid,
21078- start, end,
21079- cattr_name(actual_type),
21080- cattr_name(parse->type));
21081- err = -EBUSY;
21082- break;
21083- }
21084-
21085- saved_ptr = parse;
21086- /*
21087- * Check to see whether the request overlaps more
21088- * than one entry in the list
21089- */
21090- list_for_each_entry_continue(parse, &memtype_list, nd) {
21091- if (end <= parse->start) {
21092- break;
21093- }
21094-
21095- if (actual_type != parse->type) {
21096- printk(
21097- KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
21098- current->comm, current->pid,
21099- start, end,
21100- cattr_name(actual_type),
21101- cattr_name(parse->type));
21102- err = -EBUSY;
21103- break;
21104- }
21105- }
21106-
21107- if (err) {
21108- break;
21109+ } else if (start <= entry->start) { /* end > entry->start */
21110+ err = chk_conflict(new, entry, new_type);
21111+ if (!err) {
21112+ dprintk("Overlap at 0x%Lx-0x%Lx\n",
21113+ entry->start, entry->end);
21114+ where = entry->nd.prev;
21115+ cached_entry = list_entry(where,
21116+ struct memtype, nd);
21117 }
21118-
21119- pr_debug("Overlap at 0x%Lx-0x%Lx\n",
21120- saved_ptr->start, saved_ptr->end);
21121- /* No conflict. Go ahead and add this new entry */
21122- list_add(&new_entry->nd, saved_ptr->nd.prev);
21123- new_entry = NULL;
21124 break;
21125- }
21126-
21127- if (start < parse->end) {
21128- if (actual_type != parse->type && ret_type) {
21129- actual_type = parse->type;
21130- *ret_type = actual_type;
21131- new_entry->type = actual_type;
21132- }
21133-
21134- if (actual_type != parse->type) {
21135- printk(
21136- KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
21137- current->comm, current->pid,
21138- start, end,
21139- cattr_name(actual_type),
21140- cattr_name(parse->type));
21141- err = -EBUSY;
21142- break;
21143- }
21144-
21145- saved_ptr = parse;
21146- /*
21147- * Check to see whether the request overlaps more
21148- * than one entry in the list
21149- */
21150- list_for_each_entry_continue(parse, &memtype_list, nd) {
21151- if (end <= parse->start) {
21152- break;
21153- }
21154-
21155- if (actual_type != parse->type) {
21156- printk(
21157- KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
21158- current->comm, current->pid,
21159- start, end,
21160- cattr_name(actual_type),
21161- cattr_name(parse->type));
21162- err = -EBUSY;
21163- break;
21164+ } else if (start < entry->end) { /* start > entry->start */
21165+ err = chk_conflict(new, entry, new_type);
21166+ if (!err) {
21167+ dprintk("Overlap at 0x%Lx-0x%Lx\n",
21168+ entry->start, entry->end);
21169+ cached_entry = list_entry(entry->nd.prev,
21170+ struct memtype, nd);
21171+
21172+ /*
21173+ * Move to right position in the linked
21174+ * list to add this new entry
21175+ */
21176+ list_for_each_entry_continue(entry,
21177+ &memtype_list, nd) {
21178+ if (start <= entry->start) {
21179+ where = entry->nd.prev;
21180+ break;
21181+ }
21182 }
21183 }
21184-
21185- if (err) {
21186- break;
21187- }
21188-
21189- pr_debug(KERN_INFO "Overlap at 0x%Lx-0x%Lx\n",
21190- saved_ptr->start, saved_ptr->end);
21191- /* No conflict. Go ahead and add this new entry */
21192- list_add(&new_entry->nd, &saved_ptr->nd);
21193- new_entry = NULL;
21194 break;
21195 }
21196 }
21197
21198 if (err) {
21199- printk(KERN_INFO
21200- "reserve_memtype failed 0x%Lx-0x%Lx, track %s, req %s\n",
21201- start, end, cattr_name(new_entry->type),
21202- cattr_name(req_type));
21203- kfree(new_entry);
21204+ printk(KERN_INFO "reserve_memtype failed 0x%Lx-0x%Lx, "
21205+ "track %s, req %s\n",
21206+ start, end, cattr_name(new->type), cattr_name(req_type));
21207+ kfree(new);
21208 spin_unlock(&memtype_lock);
21209 return err;
21210 }
21211
21212- if (new_entry) {
21213- /* No conflict. Not yet added to the list. Add to the tail */
21214- list_add_tail(&new_entry->nd, &memtype_list);
21215- pr_debug("New Entry\n");
21216- }
21217+ cached_start = start;
21218
21219- if (ret_type) {
21220- pr_debug(
21221- "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n",
21222- start, end, cattr_name(actual_type),
21223- cattr_name(req_type), cattr_name(*ret_type));
21224- } else {
21225- pr_debug(
21226- "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s\n",
21227- start, end, cattr_name(actual_type),
21228- cattr_name(req_type));
21229- }
21230+ if (where)
21231+ list_add(&new->nd, where);
21232+ else
21233+ list_add_tail(&new->nd, &memtype_list);
21234
21235 spin_unlock(&memtype_lock);
21236+
21237+ dprintk("reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n",
21238+ start, end, cattr_name(new->type), cattr_name(req_type),
21239+ new_type ? cattr_name(*new_type) : "-");
21240+
21241 return err;
21242 }
21243
21244 int free_memtype(u64 start, u64 end)
21245 {
21246- struct memtype *ml;
21247+ struct memtype *entry;
21248 int err = -EINVAL;
21249
21250- /* Only track when pat_wc_enabled */
21251- if (!pat_wc_enabled) {
21252+ if (!pat_enabled)
21253 return 0;
21254- }
21255
21256 /* Low ISA region is always mapped WB. No need to track */
21257- if (start >= ISA_START_ADDRESS && end <= ISA_END_ADDRESS) {
21258+ if (is_ISA_range(start, end - 1))
21259 return 0;
21260- }
21261
21262 spin_lock(&memtype_lock);
21263- list_for_each_entry(ml, &memtype_list, nd) {
21264- if (ml->start == start && ml->end == end) {
21265- list_del(&ml->nd);
21266- kfree(ml);
21267+ list_for_each_entry(entry, &memtype_list, nd) {
21268+ if (entry->start == start && entry->end == end) {
21269+ if (cached_entry == entry || cached_start == start)
21270+ cached_entry = NULL;
21271+
21272+ list_del(&entry->nd);
21273+ kfree(entry);
21274 err = 0;
21275 break;
21276 }
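In the rewritten reserve_memtype() above, an overlap is no longer an automatic failure: chk_conflict() lets the request inherit the type of the entry it overlaps when the caller supplied a new_type pointer, and only returns -EBUSY when the caller insisted on its own type. A reduced model of that rule (single-entry overlap only, toy type values, not the kernel's structures) is:

#include <stdio.h>

#define EBUSY		16
#define CACHE_WB	0
#define CACHE_UC_MINUS	1

struct memtype { unsigned long start, end, type; };

static int chk_conflict(struct memtype *new, const struct memtype *entry,
			unsigned long *type)
{
	if (new->type != entry->type) {
		if (!type)
			return -EBUSY;		/* caller insisted on its type */
		new->type = entry->type;	/* inherit the established type */
		*type = entry->type;
	}
	return 0;
}

int main(void)
{
	struct memtype existing = { 0x1000, 0x3000, CACHE_UC_MINUS };
	struct memtype relaxed  = { 0x2000, 0x4000, CACHE_WB };
	struct memtype strict   = { 0x2000, 0x4000, CACHE_WB };
	unsigned long got = relaxed.type;

	if (chk_conflict(&relaxed, &existing, &got) == 0)
		printf("relaxed request inherits type %lu\n", got);
	if (chk_conflict(&strict, &existing, NULL) == -EBUSY)
		printf("strict request fails with -EBUSY\n");
	return 0;
}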
21277@@ -464,27 +397,19 @@ int free_memtype(u64 start, u64 end)
21278 current->comm, current->pid, start, end);
21279 }
21280
21281- pr_debug("free_memtype request 0x%Lx-0x%Lx\n", start, end);
21282+ dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end);
21283 return err;
21284 }
21285
21286
21287-/*
21288- * /dev/mem mmap interface. The memtype used for mapping varies:
21289- * - Use UC for mappings with O_SYNC flag
21290- * - Without O_SYNC flag, if there is any conflict in reserve_memtype,
21291- * inherit the memtype from existing mapping.
21292- * - Else use UC_MINUS memtype (for backward compatibility with existing
21293- * X drivers.
21294- */
21295 pgprot_t phys_mem_access_prot(struct file *file, unsigned long mfn,
21296 unsigned long size, pgprot_t vma_prot)
21297 {
21298 return vma_prot;
21299 }
21300
21301-#ifdef CONFIG_NONPROMISC_DEVMEM
21302-/* This check is done in drivers/char/mem.c in case of NONPROMISC_DEVMEM*/
21303+#ifdef CONFIG_STRICT_DEVMEM
21304+/* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM*/
21305 static inline int range_is_allowed(unsigned long mfn, unsigned long size)
21306 {
21307 return 1;
21308@@ -508,20 +433,20 @@ static inline int range_is_allowed(unsig
21309 }
21310 return 1;
21311 }
21312-#endif /* CONFIG_NONPROMISC_DEVMEM */
21313+#endif /* CONFIG_STRICT_DEVMEM */
21314
21315 int phys_mem_access_prot_allowed(struct file *file, unsigned long mfn,
21316 unsigned long size, pgprot_t *vma_prot)
21317 {
21318 u64 addr = (u64)mfn << PAGE_SHIFT;
21319- unsigned long flags = _PAGE_CACHE_UC_MINUS;
21320+ unsigned long flags = -1;
21321 int retval;
21322
21323 if (!range_is_allowed(mfn, size))
21324 return 0;
21325
21326 if (file->f_flags & O_SYNC) {
21327- flags = _PAGE_CACHE_UC;
21328+ flags = _PAGE_CACHE_UC_MINUS;
21329 }
21330
21331 #ifndef CONFIG_X86_32
21332@@ -534,25 +459,26 @@ int phys_mem_access_prot_allowed(struct
21333 * caching for the high addresses through the KEN pin, but
21334 * we maintain the tradition of paranoia in this code.
21335 */
21336- if (!pat_wc_enabled &&
21337- ! ( test_bit(X86_FEATURE_MTRR, boot_cpu_data.x86_capability) ||
21338- test_bit(X86_FEATURE_K6_MTRR, boot_cpu_data.x86_capability) ||
21339- test_bit(X86_FEATURE_CYRIX_ARR, boot_cpu_data.x86_capability) ||
21340- test_bit(X86_FEATURE_CENTAUR_MCR, boot_cpu_data.x86_capability)) &&
21341- (pfn << PAGE_SHIFT) >= __pa(high_memory)) {
21342+ if (!pat_enabled &&
21343+ !(boot_cpu_has(X86_FEATURE_MTRR) ||
21344+ boot_cpu_has(X86_FEATURE_K6_MTRR) ||
21345+ boot_cpu_has(X86_FEATURE_CYRIX_ARR) ||
21346+ boot_cpu_has(X86_FEATURE_CENTAUR_MCR)) &&
21347+ (pfn << PAGE_SHIFT) >= __pa(high_memory)) {
21348 flags = _PAGE_CACHE_UC;
21349 }
21350 #endif
21351 #endif
21352
21353 /*
21354- * With O_SYNC, we can only take UC mapping. Fail if we cannot.
21355+ * With O_SYNC, we can only take UC_MINUS mapping. Fail if we cannot.
21356+ *
21357 * Without O_SYNC, we want to get
21358 * - WB for WB-able memory and no other conflicting mappings
21359 * - UC_MINUS for non-WB-able memory with no other conflicting mappings
21360 * - Inherit from confliting mappings otherwise
21361 */
21362- if (flags != _PAGE_CACHE_UC_MINUS) {
21363+ if (flags != -1) {
21364 retval = reserve_memtype(addr, addr + size, flags, NULL);
21365 } else {
21366 retval = reserve_memtype(addr, addr + size, -1, &flags);
21367@@ -600,3 +526,88 @@ void unmap_devmem(unsigned long mfn, uns
21368 free_memtype(addr, addr + size);
21369 }
21370
21371+#if defined(CONFIG_DEBUG_FS)
21372+
21373+/* get Nth element of the linked list */
21374+static struct memtype *memtype_get_idx(loff_t pos)
21375+{
21376+ struct memtype *list_node, *print_entry;
21377+ int i = 1;
21378+
21379+ print_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL);
21380+ if (!print_entry)
21381+ return NULL;
21382+
21383+ spin_lock(&memtype_lock);
21384+ list_for_each_entry(list_node, &memtype_list, nd) {
21385+ if (pos == i) {
21386+ *print_entry = *list_node;
21387+ spin_unlock(&memtype_lock);
21388+ return print_entry;
21389+ }
21390+ ++i;
21391+ }
21392+ spin_unlock(&memtype_lock);
21393+ kfree(print_entry);
21394+ return NULL;
21395+}
21396+
21397+static void *memtype_seq_start(struct seq_file *seq, loff_t *pos)
21398+{
21399+ if (*pos == 0) {
21400+ ++*pos;
21401+ seq_printf(seq, "PAT memtype list:\n");
21402+ }
21403+
21404+ return memtype_get_idx(*pos);
21405+}
21406+
21407+static void *memtype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
21408+{
21409+ ++*pos;
21410+ return memtype_get_idx(*pos);
21411+}
21412+
21413+static void memtype_seq_stop(struct seq_file *seq, void *v)
21414+{
21415+}
21416+
21417+static int memtype_seq_show(struct seq_file *seq, void *v)
21418+{
21419+ struct memtype *print_entry = (struct memtype *)v;
21420+
21421+ seq_printf(seq, "%s @ 0x%Lx-0x%Lx\n", cattr_name(print_entry->type),
21422+ print_entry->start, print_entry->end);
21423+ kfree(print_entry);
21424+ return 0;
21425+}
21426+
21427+static struct seq_operations memtype_seq_ops = {
21428+ .start = memtype_seq_start,
21429+ .next = memtype_seq_next,
21430+ .stop = memtype_seq_stop,
21431+ .show = memtype_seq_show,
21432+};
21433+
21434+static int memtype_seq_open(struct inode *inode, struct file *file)
21435+{
21436+ return seq_open(file, &memtype_seq_ops);
21437+}
21438+
21439+static const struct file_operations memtype_fops = {
21440+ .open = memtype_seq_open,
21441+ .read = seq_read,
21442+ .llseek = seq_lseek,
21443+ .release = seq_release,
21444+};
21445+
21446+static int __init pat_memtype_list_init(void)
21447+{
21448+ debugfs_create_file("pat_memtype_list", S_IRUSR, arch_debugfs_dir,
21449+ NULL, &memtype_fops);
21450+ return 0;
21451+}
21452+
21453+late_initcall(pat_memtype_list_init);
21454+
21455+#endif /* CONFIG_DEBUG_FS */
21456--- sle11-2009-06-04.orig/arch/x86/mm/pgtable-xen.c 2009-03-16 16:38:05.000000000 +0100
21457+++ sle11-2009-06-04/arch/x86/mm/pgtable-xen.c 2009-06-04 10:21:39.000000000 +0200
21458@@ -4,6 +4,7 @@
21459 #include <asm/pgalloc.h>
21460 #include <asm/pgtable.h>
21461 #include <asm/tlb.h>
21462+#include <asm/fixmap.h>
21463 #include <asm/hypervisor.h>
21464 #include <asm/mmu_context.h>
21465
21466@@ -410,15 +411,9 @@ static inline void pgd_list_del(pgd_t *p
21467 static void pgd_ctor(void *p)
21468 {
21469 pgd_t *pgd = p;
21470- unsigned long flags;
21471
21472 pgd_test_and_unpin(pgd);
21473
21474- /* Clear usermode parts of PGD */
21475- memset(pgd, 0, KERNEL_PGD_BOUNDARY*sizeof(pgd_t));
21476-
21477- spin_lock_irqsave(&pgd_lock, flags);
21478-
21479 /* If the pgd points to a shared pagetable level (either the
21480 ptes in non-PAE, or shared PMD in PAE), then just copy the
21481 references from swapper_pg_dir. */
21482@@ -440,13 +435,9 @@ static void pgd_ctor(void *p)
21483 __pgd(__pa_symbol(level3_user_pgt) | _PAGE_TABLE);
21484 #endif
21485
21486-#ifndef CONFIG_X86_PAE
21487 /* list required to sync kernel mapping updates */
21488 if (!SHARED_KERNEL_PMD)
21489 pgd_list_add(pgd);
21490-#endif
21491-
21492- spin_unlock_irqrestore(&pgd_lock, flags);
21493 }
21494
21495 static void pgd_dtor(void *pgd)
21496@@ -475,33 +466,6 @@ static void pgd_dtor(void *pgd)
21497
21498 #ifdef CONFIG_X86_PAE
21499 /*
21500- * Mop up any pmd pages which may still be attached to the pgd.
21501- * Normally they will be freed by munmap/exit_mmap, but any pmd we
21502- * preallocate which never got a corresponding vma will need to be
21503- * freed manually.
21504- */
21505-static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
21506-{
21507- int i;
21508-
21509- for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) {
21510- pgd_t pgd = pgdp[i];
21511-
21512- if (__pgd_val(pgd) != 0) {
21513- pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
21514-
21515- pgdp[i] = xen_make_pgd(0);
21516-
21517- paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
21518- pmd_free(mm, pmd);
21519- }
21520- }
21521-
21522- if (!xen_feature(XENFEAT_pae_pgdir_above_4gb))
21523- xen_destroy_contiguous_region((unsigned long)pgdp, 0);
21524-}
21525-
21526-/*
21527 * In PAE mode, we need to do a cr3 reload (=tlb flush) when
21528 * updating the top-level pagetable entries to guarantee the
21529 * processor notices the update. Since this is expensive, and
21530@@ -512,61 +476,7 @@ static void pgd_mop_up_pmds(struct mm_st
21531 * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
21532 * and initialize the kernel pmds here.
21533 */
21534-static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
21535-{
21536- pud_t *pud;
21537- pmd_t *pmds[UNSHARED_PTRS_PER_PGD];
21538- unsigned long addr, flags;
21539- int i;
21540-
21541- /*
21542- * We can race save/restore (if we sleep during a GFP_KERNEL memory
21543- * allocation). We therefore store virtual addresses of pmds as they
21544- * do not change across save/restore, and poke the machine addresses
21545- * into the pgdir under the pgd_lock.
21546- */
21547- for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD; i++, addr += PUD_SIZE) {
21548- pmds[i] = pmd_alloc_one(mm, addr);
21549- if (!pmds[i])
21550- goto out_oom;
21551- }
21552-
21553- spin_lock_irqsave(&pgd_lock, flags);
21554-
21555- /* Protect against save/restore: move below 4GB under pgd_lock. */
21556- if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)
21557- && xen_create_contiguous_region((unsigned long)pgd, 0, 32)) {
21558- spin_unlock_irqrestore(&pgd_lock, flags);
21559-out_oom:
21560- while (i--)
21561- pmd_free(mm, pmds[i]);
21562- return 0;
21563- }
21564-
21565- /* Copy kernel pmd contents and write-protect the new pmds. */
21566- pud = pud_offset(pgd, 0);
21567- for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD;
21568- i++, pud++, addr += PUD_SIZE) {
21569- if (i >= KERNEL_PGD_BOUNDARY) {
21570- memcpy(pmds[i],
21571- (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
21572- sizeof(pmd_t) * PTRS_PER_PMD);
21573- make_lowmem_page_readonly(
21574- pmds[i], XENFEAT_writable_page_tables);
21575- }
21576-
21577- /* It is safe to poke machine addresses of pmds under the pgd_lock. */
21578- pud_populate(mm, pud, pmds[i]);
21579- }
21580-
21581- /* List required to sync kernel mapping updates and
21582- * to pin/unpin on save/restore. */
21583- pgd_list_add(pgd);
21584-
21585- spin_unlock_irqrestore(&pgd_lock, flags);
21586-
21587- return 1;
21588-}
21589+#define PREALLOCATED_PMDS UNSHARED_PTRS_PER_PGD
21590
21591 void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
21592 {
 21593@@ -596,16 +506,101 void pud_populate(struct mm_struct *mm,
21594 xen_tlb_flush();
21595 }
21596 #else /* !CONFIG_X86_PAE */
21597+
21598 /* No need to prepopulate any pagetable entries in non-PAE modes. */
21599-static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
21600+#define PREALLOCATED_PMDS 0
21601+
21602+#endif /* CONFIG_X86_PAE */
21603+
21604+static void free_pmds(pmd_t *pmds[], struct mm_struct *mm, bool contig)
21605 {
21606- return 1;
21607+ int i;
21608+
 21609+#ifdef CONFIG_X86_PAE
21610+ if (contig)
21611+ xen_destroy_contiguous_region((unsigned long)mm->pgd, 0);
 21612+#endif
21613+
21614+ for(i = 0; i < PREALLOCATED_PMDS; i++)
21615+ if (pmds[i])
21616+ pmd_free(mm, pmds[i]);
21617 }
21618
21619-static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgd)
21620+static int preallocate_pmds(pmd_t *pmds[], struct mm_struct *mm)
21621 {
21622+ int i;
21623+ bool failed = false;
21624+
21625+ for(i = 0; i < PREALLOCATED_PMDS; i++) {
21626+ pmd_t *pmd = pmd_alloc_one(mm, i << PUD_SHIFT);
21627+ if (pmd == NULL)
21628+ failed = true;
21629+ pmds[i] = pmd;
21630+ }
21631+
21632+ if (failed) {
21633+ free_pmds(pmds, mm, false);
21634+ return -ENOMEM;
21635+ }
21636+
21637+ return 0;
21638+}
21639+
21640+/*
21641+ * Mop up any pmd pages which may still be attached to the pgd.
21642+ * Normally they will be freed by munmap/exit_mmap, but any pmd we
21643+ * preallocate which never got a corresponding vma will need to be
21644+ * freed manually.
21645+ */
21646+static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
21647+{
21648+ int i;
21649+
21650+ for(i = 0; i < PREALLOCATED_PMDS; i++) {
21651+ pgd_t pgd = pgdp[i];
21652+
21653+ if (__pgd_val(pgd) != 0) {
21654+ pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
21655+
21656+ pgdp[i] = xen_make_pgd(0);
21657+
21658+ paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
21659+ pmd_free(mm, pmd);
21660+ }
21661+ }
21662+
 21663+#ifdef CONFIG_X86_PAE
21664+ if (!xen_feature(XENFEAT_pae_pgdir_above_4gb))
21665+ xen_destroy_contiguous_region((unsigned long)pgdp, 0);
 21666+#endif
21667+}
21668+
21669+static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
21670+{
21671+ pud_t *pud;
21672+ unsigned long addr;
21673+ int i;
21674+
21675+ if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */
21676+ return;
21677+
21678+ pud = pud_offset(pgd, 0);
21679+ for (addr = i = 0; i < PREALLOCATED_PMDS;
21680+ i++, pud++, addr += PUD_SIZE) {
21681+ pmd_t *pmd = pmds[i];
21682+
21683+ if (i >= KERNEL_PGD_BOUNDARY) {
21684+ memcpy(pmd,
21685+ (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
21686+ sizeof(pmd_t) * PTRS_PER_PMD);
21687+ make_lowmem_page_readonly(
21688+ pmd, XENFEAT_writable_page_tables);
21689+ }
21690+
21691+ /* It is safe to poke machine addresses of pmds under the pgd_lock. */
21692+ pud_populate(mm, pud, pmd);
21693+ }
21694 }
21695-#endif /* CONFIG_X86_PAE */
21696
21697 #ifdef CONFIG_X86_64
21698 /* We allocate two contiguous pages for kernel and user. */
 21699@@ -616,19 +611,52 static void pgd_mop_up_pmds(struct mm_st
21700
21701 pgd_t *pgd_alloc(struct mm_struct *mm)
21702 {
21703- pgd_t *pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, PGD_ORDER);
21704+ pgd_t *pgd;
21705+ pmd_t *pmds[PREALLOCATED_PMDS];
21706+ unsigned long flags;
21707+
21708+ pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, PGD_ORDER);
21709+
21710+ if (pgd == NULL)
21711+ goto out;
21712
21713- /* so that alloc_pd can use it */
21714 mm->pgd = pgd;
21715- if (pgd)
21716- pgd_ctor(pgd);
21717
21718- if (pgd && !pgd_prepopulate_pmd(mm, pgd)) {
21719- free_pages((unsigned long)pgd, PGD_ORDER);
21720- pgd = NULL;
21721+ if (preallocate_pmds(pmds, mm) != 0)
21722+ goto out_free_pgd;
21723+
21724+ if (paravirt_pgd_alloc(mm) != 0)
21725+ goto out_free_pmds;
21726+
21727+ /*
21728+ * Make sure that pre-populating the pmds is atomic with
21729+ * respect to anything walking the pgd_list, so that they
21730+ * never see a partially populated pgd.
21731+ */
21732+ spin_lock_irqsave(&pgd_lock, flags);
21733+
21734+#ifdef CONFIG_X86_PAE
21735+ /* Protect against save/restore: move below 4GB under pgd_lock. */
21736+ if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)
21737+ && xen_create_contiguous_region((unsigned long)pgd, 0, 32)) {
21738+ spin_unlock_irqrestore(&pgd_lock, flags);
21739+ goto out_free_pmds;
21740 }
21741+#endif
21742+
21743+ pgd_ctor(pgd);
21744+ pgd_prepopulate_pmd(mm, pgd, pmds);
21745+
21746+ spin_unlock_irqrestore(&pgd_lock, flags);
21747
21748 return pgd;
21749+
21750+out_free_pmds:
21751+ free_pmds(pmds, mm, !xen_feature(XENFEAT_pae_pgdir_above_4gb));
21752+out_free_pgd:
21753+ free_pages((unsigned long)pgd, PGD_ORDER);
21754+out:
21755+ return NULL;
21756 }
21757
21758 void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 21759@@ -644,6 +672,7 void pgd_free(struct mm_struct *mm, pgd_
21760 pgd_dtor(pgd);
21761
21762 pgd_mop_up_pmds(mm, pgd);
21763+ paravirt_pgd_free(mm, pgd);
21764 free_pages((unsigned long)pgd, PGD_ORDER);
21765 }
21766
 21767@@ -685,7 +714,7 int ptep_test_and_clear_young(struct vm_
21768
21769 if (pte_young(*ptep))
21770 ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
21771- &ptep->pte);
21772+ (unsigned long *) &ptep->pte);
21773
21774 if (ret)
21775 pte_update(vma->vm_mm, addr, ptep);
 21776@@ -707,3 +736,42 int ptep_clear_flush_young(struct vm_are
21777
21778 return young;
21779 }
21780+
21781+int fixmaps_set;
21782+
21783+void xen_set_fixmap(enum fixed_addresses idx, maddr_t phys, pgprot_t flags)
21784+{
21785+ unsigned long address = __fix_to_virt(idx);
21786+ pte_t pte;
21787+
21788+ if (idx >= __end_of_fixed_addresses) {
21789+ BUG();
21790+ return;
21791+ }
21792+
21793+ switch (idx) {
21794+#ifdef CONFIG_X86_64
21795+ extern pte_t level1_fixmap_pgt[PTRS_PER_PTE];
21796+
21797+ case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
21798+ pte = pfn_pte(phys >> PAGE_SHIFT, flags);
21799+ set_pte_vaddr_pud(level3_user_pgt, address, pte);
21800+ break;
21801+ case FIX_EARLYCON_MEM_BASE:
21802+ xen_l1_entry_update(level1_fixmap_pgt + pte_index(address),
21803+ pfn_pte_ma(phys >> PAGE_SHIFT, flags));
21804+ fixmaps_set++;
21805+ return;
21806+#else
21807+ case FIX_WP_TEST:
21808+ case FIX_VDSO:
21809+ pte = pfn_pte(phys >> PAGE_SHIFT, flags);
21810+ break;
21811+#endif
21812+ default:
21813+ pte = pfn_pte_ma(phys >> PAGE_SHIFT, flags);
21814+ break;
21815+ }
21816+ set_pte_vaddr(address, pte);
21817+ fixmaps_set++;
21818+}
21819--- sle11-2009-06-04.orig/arch/x86/mm/pgtable_32-xen.c 2009-03-16 16:38:05.000000000 +0100
21820+++ sle11-2009-06-04/arch/x86/mm/pgtable_32-xen.c 2009-06-04 10:21:39.000000000 +0200
21821@@ -25,51 +25,49 @@
21822 #include <xen/features.h>
21823 #include <asm/hypervisor.h>
21824
21825-void show_mem(void)
21826+/*
21827+ * Associate a virtual page frame with a given physical page frame
21828+ * and protection flags for that frame.
21829+ */
21830+void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
21831 {
21832- int total = 0, reserved = 0;
21833- int shared = 0, cached = 0;
21834- int highmem = 0;
21835- struct page *page;
21836- pg_data_t *pgdat;
21837- unsigned long i;
21838- unsigned long flags;
21839-
21840- printk(KERN_INFO "Mem-info:\n");
21841- show_free_areas();
21842- for_each_online_pgdat(pgdat) {
21843- pgdat_resize_lock(pgdat, &flags);
21844- for (i = 0; i < pgdat->node_spanned_pages; ++i) {
21845- if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
21846- touch_nmi_watchdog();
21847- page = pgdat_page_nr(pgdat, i);
21848- total++;
21849- if (PageHighMem(page))
21850- highmem++;
21851- if (PageReserved(page))
21852- reserved++;
21853- else if (PageSwapCache(page))
21854- cached++;
21855- else if (page_count(page))
21856- shared += page_count(page) - 1;
21857- }
21858- pgdat_resize_unlock(pgdat, &flags);
21859- }
21860- printk(KERN_INFO "%d pages of RAM\n", total);
21861- printk(KERN_INFO "%d pages of HIGHMEM\n", highmem);
21862- printk(KERN_INFO "%d reserved pages\n", reserved);
21863- printk(KERN_INFO "%d pages shared\n", shared);
21864- printk(KERN_INFO "%d pages swap cached\n", cached);
21865-
21866- printk(KERN_INFO "%lu pages dirty\n", global_page_state(NR_FILE_DIRTY));
21867- printk(KERN_INFO "%lu pages writeback\n",
21868- global_page_state(NR_WRITEBACK));
21869- printk(KERN_INFO "%lu pages mapped\n", global_page_state(NR_FILE_MAPPED));
21870- printk(KERN_INFO "%lu pages slab\n",
21871- global_page_state(NR_SLAB_RECLAIMABLE) +
21872- global_page_state(NR_SLAB_UNRECLAIMABLE));
21873- printk(KERN_INFO "%lu pages pagetables\n",
21874- global_page_state(NR_PAGETABLE));
21875+#ifndef CONFIG_XEN
21876+ pgd_t *pgd;
21877+ pud_t *pud;
21878+ pmd_t *pmd;
21879+ pte_t *pte;
21880+
21881+ pgd = swapper_pg_dir + pgd_index(vaddr);
21882+ if (pgd_none(*pgd)) {
21883+ BUG();
21884+ return;
21885+ }
21886+ pud = pud_offset(pgd, vaddr);
21887+ if (pud_none(*pud)) {
21888+ BUG();
21889+ return;
21890+ }
21891+ pmd = pmd_offset(pud, vaddr);
21892+ if (pmd_none(*pmd)) {
21893+ BUG();
21894+ return;
21895+ }
21896+ pte = pte_offset_kernel(pmd, vaddr);
21897+ if (pte_val(pteval))
21898+ set_pte_present(&init_mm, vaddr, pte, pteval);
21899+ else
21900+ pte_clear(&init_mm, vaddr, pte);
21901+
21902+ /*
21903+ * It's enough to flush this one mapping.
21904+ * (PGE mappings get flushed as well)
21905+ */
21906+ __flush_tlb_one(vaddr);
21907+#else
21908+ if (HYPERVISOR_update_va_mapping(vaddr, pteval,
21909+ UVMF_INVLPG|UVMF_ALL))
21910+ BUG();
21911+#endif
21912 }
21913
21914 /*
21915@@ -107,35 +105,10 @@ void set_pmd_pfn(unsigned long vaddr, un
21916 __flush_tlb_one(vaddr);
21917 }
21918
21919-static int fixmaps;
21920 unsigned long hypervisor_virt_start = HYPERVISOR_VIRT_START;
21921 unsigned long __FIXADDR_TOP = (HYPERVISOR_VIRT_START - PAGE_SIZE);
21922 EXPORT_SYMBOL(__FIXADDR_TOP);
21923
21924-void __set_fixmap (enum fixed_addresses idx, maddr_t phys, pgprot_t flags)
21925-{
21926- unsigned long address = __fix_to_virt(idx);
21927- pte_t pte;
21928-
21929- if (idx >= __end_of_fixed_addresses) {
21930- BUG();
21931- return;
21932- }
21933- switch (idx) {
21934- case FIX_WP_TEST:
21935- case FIX_VDSO:
21936- pte = pfn_pte(phys >> PAGE_SHIFT, flags);
21937- break;
21938- default:
21939- pte = pfn_pte_ma(phys >> PAGE_SHIFT, flags);
21940- break;
21941- }
21942- if (HYPERVISOR_update_va_mapping(address, pte,
21943- UVMF_INVLPG|UVMF_ALL))
21944- BUG();
21945- fixmaps++;
21946-}
21947-
21948 /**
21949 * reserve_top_address - reserves a hole in the top of kernel address space
21950 * @reserve - size of hole to reserve
21951@@ -145,13 +118,48 @@ void __set_fixmap (enum fixed_addresses
21952 */
21953 void __init reserve_top_address(unsigned long reserve)
21954 {
21955- BUG_ON(fixmaps > 0);
21956+ BUG_ON(fixmaps_set > 0);
21957 printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
21958 (int)-reserve);
21959 __FIXADDR_TOP = -reserve - PAGE_SIZE;
21960 __VMALLOC_RESERVE += reserve;
21961 }
21962
21963+/*
21964+ * vmalloc=size forces the vmalloc area to be exactly 'size'
21965+ * bytes. This can be used to increase (or decrease) the
21966+ * vmalloc area - the default is 128m.
21967+ */
21968+static int __init parse_vmalloc(char *arg)
21969+{
21970+ if (!arg)
21971+ return -EINVAL;
21972+
21973+ __VMALLOC_RESERVE = memparse(arg, &arg);
21974+ return 0;
21975+}
21976+early_param("vmalloc", parse_vmalloc);
21977+
21978+#ifndef CONFIG_XEN
21979+/*
21980+ * reservetop=size reserves a hole at the top of the kernel address space which
21981+ * a hypervisor can load into later. Needed for dynamically loaded hypervisors,
21982+ * so relocating the fixmap can be done before paging initialization.
21983+ */
21984+static int __init parse_reservetop(char *arg)
21985+{
21986+ unsigned long address;
21987+
21988+ if (!arg)
21989+ return -EINVAL;
21990+
21991+ address = memparse(arg, &arg);
21992+ reserve_top_address(address);
21993+ return 0;
21994+}
21995+early_param("reservetop", parse_reservetop);
21996+#endif
21997+
21998 void make_lowmem_page_readonly(void *va, unsigned int feature)
21999 {
22000 pte_t *pte;
22001--- sle11-2009-06-04.orig/arch/x86/pci/amd_bus.c 2009-06-04 11:08:07.000000000 +0200
22002+++ sle11-2009-06-04/arch/x86/pci/amd_bus.c 2009-06-04 10:21:39.000000000 +0200
22003@@ -607,6 +607,14 @@ static int __init pci_io_ecs_init(void)
22004 for_each_online_cpu(cpu)
22005 amd_cpu_notify(&amd_cpu_notifier, (unsigned long)CPU_ONLINE,
22006 (void *)(long)cpu);
22007+#ifdef CONFIG_XEN
22008+ {
22009+ u64 reg;
22010+ rdmsrl(MSR_AMD64_NB_CFG, reg);
22011+ if (!(reg & ENABLE_CF8_EXT_CFG))
22012+ return 0;
22013+ }
22014+#endif
22015 pci_probe |= PCI_HAS_IO_ECS;
22016
22017 return 0;
22018@@ -614,6 +622,10 @@ static int __init pci_io_ecs_init(void)
22019
22020 static int __init amd_postcore_init(void)
22021 {
22022+#ifdef CONFIG_XEN
22023+ if (!is_initial_xendomain())
22024+ return 0;
22025+#endif
22026 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
22027 return 0;
22028
22029--- sle11-2009-06-04.orig/arch/x86/pci/irq-xen.c 2009-03-16 16:38:05.000000000 +0100
22030+++ sle11-2009-06-04/arch/x86/pci/irq-xen.c 2009-06-04 10:21:39.000000000 +0200
22031@@ -11,8 +11,8 @@
22032 #include <linux/slab.h>
22033 #include <linux/interrupt.h>
22034 #include <linux/dmi.h>
22035-#include <asm/io.h>
22036-#include <asm/smp.h>
22037+#include <linux/io.h>
22038+#include <linux/smp.h>
22039 #include <asm/io_apic.h>
22040 #include <linux/irq.h>
22041 #include <linux/acpi.h>
22042@@ -45,7 +45,8 @@ struct irq_router {
22043 char *name;
22044 u16 vendor, device;
22045 int (*get)(struct pci_dev *router, struct pci_dev *dev, int pirq);
22046- int (*set)(struct pci_dev *router, struct pci_dev *dev, int pirq, int new);
22047+ int (*set)(struct pci_dev *router, struct pci_dev *dev, int pirq,
22048+ int new);
22049 };
22050
22051 struct irq_router_handler {
22052@@ -61,7 +62,7 @@ void (*pcibios_disable_irq)(struct pci_d
22053 * and perform checksum verification.
22054 */
22055
22056-static inline struct irq_routing_table * pirq_check_routing_table(u8 *addr)
22057+static inline struct irq_routing_table *pirq_check_routing_table(u8 *addr)
22058 {
22059 struct irq_routing_table *rt;
22060 int i;
22061@@ -74,10 +75,11 @@ static inline struct irq_routing_table *
22062 rt->size < sizeof(struct irq_routing_table))
22063 return NULL;
22064 sum = 0;
22065- for (i=0; i < rt->size; i++)
22066+ for (i = 0; i < rt->size; i++)
22067 sum += addr[i];
22068 if (!sum) {
22069- DBG(KERN_DEBUG "PCI: Interrupt Routing Table found at 0x%p\n", rt);
22070+ DBG(KERN_DEBUG "PCI: Interrupt Routing Table found at 0x%p\n",
22071+ rt);
22072 return rt;
22073 }
22074 return NULL;
22075@@ -104,7 +106,9 @@ static struct irq_routing_table * __init
22076 return rt;
22077 printk(KERN_WARNING "PCI: PIRQ table NOT found at pirqaddr\n");
22078 }
22079- for(addr = (u8 *) isa_bus_to_virt(0xf0000); addr < (u8 *) isa_bus_to_virt(0x100000); addr += 16) {
22080+ for (addr = (u8 *) isa_bus_to_virt(0xf0000);
22081+ addr < (u8 *) isa_bus_to_virt(0x100000);
22082+ addr += 16) {
22083 rt = pirq_check_routing_table(addr);
22084 if (rt)
22085 return rt;
22086@@ -126,20 +130,20 @@ static void __init pirq_peer_trick(void)
22087 struct irq_info *e;
22088
22089 memset(busmap, 0, sizeof(busmap));
22090- for(i=0; i < (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info); i++) {
22091+ for (i = 0; i < (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info); i++) {
22092 e = &rt->slots[i];
22093 #ifdef DEBUG
22094 {
22095 int j;
22096 DBG(KERN_DEBUG "%02x:%02x slot=%02x", e->bus, e->devfn/8, e->slot);
22097- for(j=0; j<4; j++)
22098+ for (j = 0; j < 4; j++)
22099 DBG(" %d:%02x/%04x", j, e->irq[j].link, e->irq[j].bitmap);
22100 DBG("\n");
22101 }
22102 #endif
22103 busmap[e->bus] = 1;
22104 }
22105- for(i = 1; i < 256; i++) {
22106+ for (i = 1; i < 256; i++) {
22107 int node;
22108 if (!busmap[i] || pci_find_bus(0, i))
22109 continue;
22110@@ -187,7 +191,8 @@ static unsigned int read_config_nybble(s
22111 return (nr & 1) ? (x >> 4) : (x & 0xf);
22112 }
22113
22114-static void write_config_nybble(struct pci_dev *router, unsigned offset, unsigned nr, unsigned int val)
22115+static void write_config_nybble(struct pci_dev *router, unsigned offset,
22116+ unsigned nr, unsigned int val)
22117 {
22118 u8 x;
22119 unsigned reg = offset + (nr >> 1);
22120@@ -289,7 +294,7 @@ static int pirq_ite_get(struct pci_dev *
22121 static const unsigned char pirqmap[4] = { 1, 0, 2, 3 };
22122
22123 WARN_ON_ONCE(pirq > 4);
22124- return read_config_nybble(router,0x43, pirqmap[pirq-1]);
22125+ return read_config_nybble(router, 0x43, pirqmap[pirq-1]);
22126 }
22127
22128 static int pirq_ite_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
22129@@ -318,7 +323,7 @@ static int pirq_opti_set(struct pci_dev
22130
22131 /*
22132 * Cyrix: nibble offset 0x5C
22133- * 0x5C bits 7:4 is INTB bits 3:0 is INTA
22134+ * 0x5C bits 7:4 is INTB bits 3:0 is INTA
22135 * 0x5D bits 7:4 is INTD bits 3:0 is INTC
22136 */
22137 static int pirq_cyrix_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
22138@@ -354,7 +359,7 @@ static int pirq_cyrix_set(struct pci_dev
22139 * Apparently there are systems implementing PCI routing table using
22140 * link values 0x01-0x04 and others using 0x41-0x44 for PCI INTA..D.
22141 * We try our best to handle both link mappings.
22142- *
22143+ *
22144 * Currently (2003-05-21) it appears most SiS chipsets follow the
22145 * definition of routing registers from the SiS-5595 southbridge.
22146 * According to the SiS 5595 datasheets the revision id's of the
22147@@ -374,7 +379,7 @@ static int pirq_cyrix_set(struct pci_dev
22148 *
22149 * 0x62: USBIRQ:
22150 * bit 6 OHCI function disabled (0), enabled (1)
22151- *
22152+ *
22153 * 0x6a: ACPI/SCI IRQ: bits 4-6 reserved
22154 *
22155 * 0x7e: Data Acq. Module IRQ - bits 4-6 reserved
22156@@ -437,7 +442,7 @@ static int pirq_vlsi_get(struct pci_dev
22157 {
22158 WARN_ON_ONCE(pirq >= 9);
22159 if (pirq > 8) {
22160- printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
22161+ dev_info(&dev->dev, "VLSI router PIRQ escape (%d)\n", pirq);
22162 return 0;
22163 }
22164 return read_config_nybble(router, 0x74, pirq-1);
22165@@ -447,7 +452,7 @@ static int pirq_vlsi_set(struct pci_dev
22166 {
22167 WARN_ON_ONCE(pirq >= 9);
22168 if (pirq > 8) {
22169- printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
22170+ dev_info(&dev->dev, "VLSI router PIRQ escape (%d)\n", pirq);
22171 return 0;
22172 }
22173 write_config_nybble(router, 0x74, pirq-1, irq);
22174@@ -471,7 +476,8 @@ static int pirq_serverworks_get(struct p
22175 return inb(0xc01) & 0xf;
22176 }
22177
22178-static int pirq_serverworks_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
22179+static int pirq_serverworks_set(struct pci_dev *router, struct pci_dev *dev,
22180+ int pirq, int irq)
22181 {
22182 outb(pirq, 0xc00);
22183 outb(irq, 0xc01);
22184@@ -491,22 +497,20 @@ static int pirq_amd756_get(struct pci_de
22185 u8 irq;
22186 irq = 0;
22187 if (pirq <= 4)
22188- {
22189 irq = read_config_nybble(router, 0x56, pirq - 1);
22190- }
22191- printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d get irq : %2d\n",
22192- dev->vendor, dev->device, pirq, irq);
22193+ dev_info(&dev->dev,
22194+ "AMD756: dev [%04x/%04x], router PIRQ %d get IRQ %d\n",
22195+ dev->vendor, dev->device, pirq, irq);
22196 return irq;
22197 }
22198
22199 static int pirq_amd756_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
22200 {
22201- printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d SET irq : %2d\n",
22202- dev->vendor, dev->device, pirq, irq);
22203+ dev_info(&dev->dev,
22204+ "AMD756: dev [%04x/%04x], router PIRQ %d set IRQ %d\n",
22205+ dev->vendor, dev->device, pirq, irq);
22206 if (pirq <= 4)
22207- {
22208 write_config_nybble(router, 0x56, pirq - 1, irq);
22209- }
22210 return 1;
22211 }
22212
22213@@ -553,50 +557,51 @@ static __init int intel_router_probe(str
22214 if (pci_dev_present(pirq_440gx))
22215 return 0;
22216
22217- switch(device)
22218- {
22219- case PCI_DEVICE_ID_INTEL_82371FB_0:
22220- case PCI_DEVICE_ID_INTEL_82371SB_0:
22221- case PCI_DEVICE_ID_INTEL_82371AB_0:
22222- case PCI_DEVICE_ID_INTEL_82371MX:
22223- case PCI_DEVICE_ID_INTEL_82443MX_0:
22224- case PCI_DEVICE_ID_INTEL_82801AA_0:
22225- case PCI_DEVICE_ID_INTEL_82801AB_0:
22226- case PCI_DEVICE_ID_INTEL_82801BA_0:
22227- case PCI_DEVICE_ID_INTEL_82801BA_10:
22228- case PCI_DEVICE_ID_INTEL_82801CA_0:
22229- case PCI_DEVICE_ID_INTEL_82801CA_12:
22230- case PCI_DEVICE_ID_INTEL_82801DB_0:
22231- case PCI_DEVICE_ID_INTEL_82801E_0:
22232- case PCI_DEVICE_ID_INTEL_82801EB_0:
22233- case PCI_DEVICE_ID_INTEL_ESB_1:
22234- case PCI_DEVICE_ID_INTEL_ICH6_0:
22235- case PCI_DEVICE_ID_INTEL_ICH6_1:
22236- case PCI_DEVICE_ID_INTEL_ICH7_0:
22237- case PCI_DEVICE_ID_INTEL_ICH7_1:
22238- case PCI_DEVICE_ID_INTEL_ICH7_30:
22239- case PCI_DEVICE_ID_INTEL_ICH7_31:
22240- case PCI_DEVICE_ID_INTEL_ESB2_0:
22241- case PCI_DEVICE_ID_INTEL_ICH8_0:
22242- case PCI_DEVICE_ID_INTEL_ICH8_1:
22243- case PCI_DEVICE_ID_INTEL_ICH8_2:
22244- case PCI_DEVICE_ID_INTEL_ICH8_3:
22245- case PCI_DEVICE_ID_INTEL_ICH8_4:
22246- case PCI_DEVICE_ID_INTEL_ICH9_0:
22247- case PCI_DEVICE_ID_INTEL_ICH9_1:
22248- case PCI_DEVICE_ID_INTEL_ICH9_2:
22249- case PCI_DEVICE_ID_INTEL_ICH9_3:
22250- case PCI_DEVICE_ID_INTEL_ICH9_4:
22251- case PCI_DEVICE_ID_INTEL_ICH9_5:
22252- case PCI_DEVICE_ID_INTEL_TOLAPAI_0:
22253- case PCI_DEVICE_ID_INTEL_ICH10_0:
22254- case PCI_DEVICE_ID_INTEL_ICH10_1:
22255- case PCI_DEVICE_ID_INTEL_ICH10_2:
22256- case PCI_DEVICE_ID_INTEL_ICH10_3:
22257- r->name = "PIIX/ICH";
22258- r->get = pirq_piix_get;
22259- r->set = pirq_piix_set;
22260- return 1;
22261+ switch (device) {
22262+ case PCI_DEVICE_ID_INTEL_82371FB_0:
22263+ case PCI_DEVICE_ID_INTEL_82371SB_0:
22264+ case PCI_DEVICE_ID_INTEL_82371AB_0:
22265+ case PCI_DEVICE_ID_INTEL_82371MX:
22266+ case PCI_DEVICE_ID_INTEL_82443MX_0:
22267+ case PCI_DEVICE_ID_INTEL_82801AA_0:
22268+ case PCI_DEVICE_ID_INTEL_82801AB_0:
22269+ case PCI_DEVICE_ID_INTEL_82801BA_0:
22270+ case PCI_DEVICE_ID_INTEL_82801BA_10:
22271+ case PCI_DEVICE_ID_INTEL_82801CA_0:
22272+ case PCI_DEVICE_ID_INTEL_82801CA_12:
22273+ case PCI_DEVICE_ID_INTEL_82801DB_0:
22274+ case PCI_DEVICE_ID_INTEL_82801E_0:
22275+ case PCI_DEVICE_ID_INTEL_82801EB_0:
22276+ case PCI_DEVICE_ID_INTEL_ESB_1:
22277+ case PCI_DEVICE_ID_INTEL_ICH6_0:
22278+ case PCI_DEVICE_ID_INTEL_ICH6_1:
22279+ case PCI_DEVICE_ID_INTEL_ICH7_0:
22280+ case PCI_DEVICE_ID_INTEL_ICH7_1:
22281+ case PCI_DEVICE_ID_INTEL_ICH7_30:
22282+ case PCI_DEVICE_ID_INTEL_ICH7_31:
22283+ case PCI_DEVICE_ID_INTEL_ESB2_0:
22284+ case PCI_DEVICE_ID_INTEL_ICH8_0:
22285+ case PCI_DEVICE_ID_INTEL_ICH8_1:
22286+ case PCI_DEVICE_ID_INTEL_ICH8_2:
22287+ case PCI_DEVICE_ID_INTEL_ICH8_3:
22288+ case PCI_DEVICE_ID_INTEL_ICH8_4:
22289+ case PCI_DEVICE_ID_INTEL_ICH9_0:
22290+ case PCI_DEVICE_ID_INTEL_ICH9_1:
22291+ case PCI_DEVICE_ID_INTEL_ICH9_2:
22292+ case PCI_DEVICE_ID_INTEL_ICH9_3:
22293+ case PCI_DEVICE_ID_INTEL_ICH9_4:
22294+ case PCI_DEVICE_ID_INTEL_ICH9_5:
22295+ case PCI_DEVICE_ID_INTEL_TOLAPAI_0:
22296+ case PCI_DEVICE_ID_INTEL_ICH10_0:
22297+ case PCI_DEVICE_ID_INTEL_ICH10_1:
22298+ case PCI_DEVICE_ID_INTEL_ICH10_2:
22299+ case PCI_DEVICE_ID_INTEL_ICH10_3:
22300+ case PCI_DEVICE_ID_INTEL_PCH_0:
22301+ case PCI_DEVICE_ID_INTEL_PCH_1:
22302+ r->name = "PIIX/ICH";
22303+ r->get = pirq_piix_get;
22304+ r->set = pirq_piix_set;
22305+ return 1;
22306 }
22307 return 0;
22308 }
22309@@ -610,7 +615,7 @@ static __init int via_router_probe(struc
22310 * workarounds for some buggy BIOSes
22311 */
22312 if (device == PCI_DEVICE_ID_VIA_82C586_0) {
22313- switch(router->device) {
22314+ switch (router->device) {
22315 case PCI_DEVICE_ID_VIA_82C686:
22316 /*
22317 * Asus k7m bios wrongly reports 82C686A
22318@@ -635,7 +640,7 @@ static __init int via_router_probe(struc
22319 }
22320 }
22321
22322- switch(device) {
22323+ switch (device) {
22324 case PCI_DEVICE_ID_VIA_82C586_0:
22325 r->name = "VIA";
22326 r->get = pirq_via586_get;
22327@@ -658,28 +663,27 @@ static __init int via_router_probe(struc
22328
22329 static __init int vlsi_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
22330 {
22331- switch(device)
22332- {
22333- case PCI_DEVICE_ID_VLSI_82C534:
22334- r->name = "VLSI 82C534";
22335- r->get = pirq_vlsi_get;
22336- r->set = pirq_vlsi_set;
22337- return 1;
22338+ switch (device) {
22339+ case PCI_DEVICE_ID_VLSI_82C534:
22340+ r->name = "VLSI 82C534";
22341+ r->get = pirq_vlsi_get;
22342+ r->set = pirq_vlsi_set;
22343+ return 1;
22344 }
22345 return 0;
22346 }
22347
22348
22349-static __init int serverworks_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
22350+static __init int serverworks_router_probe(struct irq_router *r,
22351+ struct pci_dev *router, u16 device)
22352 {
22353- switch(device)
22354- {
22355- case PCI_DEVICE_ID_SERVERWORKS_OSB4:
22356- case PCI_DEVICE_ID_SERVERWORKS_CSB5:
22357- r->name = "ServerWorks";
22358- r->get = pirq_serverworks_get;
22359- r->set = pirq_serverworks_set;
22360- return 1;
22361+ switch (device) {
22362+ case PCI_DEVICE_ID_SERVERWORKS_OSB4:
22363+ case PCI_DEVICE_ID_SERVERWORKS_CSB5:
22364+ r->name = "ServerWorks";
22365+ r->get = pirq_serverworks_get;
22366+ r->set = pirq_serverworks_set;
22367+ return 1;
22368 }
22369 return 0;
22370 }
22371@@ -688,7 +692,7 @@ static __init int sis_router_probe(struc
22372 {
22373 if (device != PCI_DEVICE_ID_SI_503)
22374 return 0;
22375-
22376+
22377 r->name = "SIS";
22378 r->get = pirq_sis_get;
22379 r->set = pirq_sis_set;
22380@@ -697,50 +701,45 @@ static __init int sis_router_probe(struc
22381
22382 static __init int cyrix_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
22383 {
22384- switch(device)
22385- {
22386- case PCI_DEVICE_ID_CYRIX_5520:
22387- r->name = "NatSemi";
22388- r->get = pirq_cyrix_get;
22389- r->set = pirq_cyrix_set;
22390- return 1;
22391+ switch (device) {
22392+ case PCI_DEVICE_ID_CYRIX_5520:
22393+ r->name = "NatSemi";
22394+ r->get = pirq_cyrix_get;
22395+ r->set = pirq_cyrix_set;
22396+ return 1;
22397 }
22398 return 0;
22399 }
22400
22401 static __init int opti_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
22402 {
22403- switch(device)
22404- {
22405- case PCI_DEVICE_ID_OPTI_82C700:
22406- r->name = "OPTI";
22407- r->get = pirq_opti_get;
22408- r->set = pirq_opti_set;
22409- return 1;
22410+ switch (device) {
22411+ case PCI_DEVICE_ID_OPTI_82C700:
22412+ r->name = "OPTI";
22413+ r->get = pirq_opti_get;
22414+ r->set = pirq_opti_set;
22415+ return 1;
22416 }
22417 return 0;
22418 }
22419
22420 static __init int ite_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
22421 {
22422- switch(device)
22423- {
22424- case PCI_DEVICE_ID_ITE_IT8330G_0:
22425- r->name = "ITE";
22426- r->get = pirq_ite_get;
22427- r->set = pirq_ite_set;
22428- return 1;
22429+ switch (device) {
22430+ case PCI_DEVICE_ID_ITE_IT8330G_0:
22431+ r->name = "ITE";
22432+ r->get = pirq_ite_get;
22433+ r->set = pirq_ite_set;
22434+ return 1;
22435 }
22436 return 0;
22437 }
22438
22439 static __init int ali_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
22440 {
22441- switch(device)
22442- {
22443+ switch (device) {
22444 case PCI_DEVICE_ID_AL_M1533:
22445 case PCI_DEVICE_ID_AL_M1563:
22446- printk(KERN_DEBUG "PCI: Using ALI IRQ Router\n");
22447 r->name = "ALI";
22448 r->get = pirq_ali_get;
22449 r->set = pirq_ali_set;
22450@@ -751,25 +750,24 @@ static __init int ali_router_probe(struc
22451
22452 static __init int amd_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
22453 {
22454- switch(device)
22455- {
22456- case PCI_DEVICE_ID_AMD_VIPER_740B:
22457- r->name = "AMD756";
22458- break;
22459- case PCI_DEVICE_ID_AMD_VIPER_7413:
22460- r->name = "AMD766";
22461- break;
22462- case PCI_DEVICE_ID_AMD_VIPER_7443:
22463- r->name = "AMD768";
22464- break;
22465- default:
22466- return 0;
22467+ switch (device) {
22468+ case PCI_DEVICE_ID_AMD_VIPER_740B:
22469+ r->name = "AMD756";
22470+ break;
22471+ case PCI_DEVICE_ID_AMD_VIPER_7413:
22472+ r->name = "AMD766";
22473+ break;
22474+ case PCI_DEVICE_ID_AMD_VIPER_7443:
22475+ r->name = "AMD768";
22476+ break;
22477+ default:
22478+ return 0;
22479 }
22480 r->get = pirq_amd756_get;
22481 r->set = pirq_amd756_set;
22482 return 1;
22483 }
22484-
22485+
22486 static __init int pico_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
22487 {
22488 switch (device) {
22489@@ -811,7 +809,7 @@ static struct pci_dev *pirq_router_dev;
22490 * FIXME: should we have an option to say "generic for
22491 * chipset" ?
22492 */
22493-
22494+
22495 static void __init pirq_find_router(struct irq_router *r)
22496 {
22497 struct irq_routing_table *rt = pirq_table;
22498@@ -830,7 +828,7 @@ static void __init pirq_find_router(stru
22499 r->name = "default";
22500 r->get = NULL;
22501 r->set = NULL;
22502-
22503+
22504 DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for %04x:%04x\n",
22505 rt->rtr_vendor, rt->rtr_device);
22506
22507@@ -841,19 +839,19 @@ static void __init pirq_find_router(stru
22508 return;
22509 }
22510
22511- for( h = pirq_routers; h->vendor; h++) {
22512+ for (h = pirq_routers; h->vendor; h++) {
22513 /* First look for a router match */
22514- if (rt->rtr_vendor == h->vendor && h->probe(r, pirq_router_dev, rt->rtr_device))
22515+ if (rt->rtr_vendor == h->vendor &&
22516+ h->probe(r, pirq_router_dev, rt->rtr_device))
22517 break;
22518 /* Fall back to a device match */
22519- if (pirq_router_dev->vendor == h->vendor && h->probe(r, pirq_router_dev, pirq_router_dev->device))
22520+ if (pirq_router_dev->vendor == h->vendor &&
22521+ h->probe(r, pirq_router_dev, pirq_router_dev->device))
22522 break;
22523 }
22524- printk(KERN_INFO "PCI: Using IRQ router %s [%04x/%04x] at %s\n",
22525- pirq_router.name,
22526- pirq_router_dev->vendor,
22527- pirq_router_dev->device,
22528- pci_name(pirq_router_dev));
22529+ dev_info(&pirq_router_dev->dev, "%s IRQ router [%04x/%04x]\n",
22530+ pirq_router.name,
22531+ pirq_router_dev->vendor, pirq_router_dev->device);
22532
22533 /* The device remains referenced for the kernel lifetime */
22534 }
22535@@ -861,11 +859,13 @@ static void __init pirq_find_router(stru
22536 static struct irq_info *pirq_get_info(struct pci_dev *dev)
22537 {
22538 struct irq_routing_table *rt = pirq_table;
22539- int entries = (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info);
22540+ int entries = (rt->size - sizeof(struct irq_routing_table)) /
22541+ sizeof(struct irq_info);
22542 struct irq_info *info;
22543
22544 for (info = rt->slots; entries--; info++)
22545- if (info->bus == dev->bus->number && PCI_SLOT(info->devfn) == PCI_SLOT(dev->devfn))
22546+ if (info->bus == dev->bus->number &&
22547+ PCI_SLOT(info->devfn) == PCI_SLOT(dev->devfn))
22548 return info;
22549 return NULL;
22550 }
22551@@ -884,7 +884,7 @@ static int pcibios_lookup_irq(struct pci
22552 /* Find IRQ pin */
22553 pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
22554 if (!pin) {
22555- DBG(KERN_DEBUG " -> no interrupt pin\n");
22556+ dev_dbg(&dev->dev, "no interrupt pin\n");
22557 return 0;
22558 }
22559 pin = pin - 1;
22560@@ -893,20 +893,21 @@ static int pcibios_lookup_irq(struct pci
22561
22562 if (!pirq_table)
22563 return 0;
22564-
22565- DBG(KERN_DEBUG "IRQ for %s[%c]", pci_name(dev), 'A' + pin);
22566+
22567 info = pirq_get_info(dev);
22568 if (!info) {
22569- DBG(" -> not found in routing table\n" KERN_DEBUG);
22570+ dev_dbg(&dev->dev, "PCI INT %c not found in routing table\n",
22571+ 'A' + pin);
22572 return 0;
22573 }
22574 pirq = info->irq[pin].link;
22575 mask = info->irq[pin].bitmap;
22576 if (!pirq) {
22577- DBG(" -> not routed\n" KERN_DEBUG);
22578+ dev_dbg(&dev->dev, "PCI INT %c not routed\n", 'A' + pin);
22579 return 0;
22580 }
22581- DBG(" -> PIRQ %02x, mask %04x, excl %04x", pirq, mask, pirq_table->exclusive_irqs);
22582+ dev_dbg(&dev->dev, "PCI INT %c -> PIRQ %02x, mask %04x, excl %04x",
22583+ 'A' + pin, pirq, mask, pirq_table->exclusive_irqs);
22584 mask &= pcibios_irq_mask;
22585
22586 /* Work around broken HP Pavilion Notebooks which assign USB to
22587@@ -919,7 +920,8 @@ static int pcibios_lookup_irq(struct pci
22588 }
22589
22590 /* same for Acer Travelmate 360, but with CB and irq 11 -> 10 */
22591- if (acer_tm360_irqrouting && dev->irq == 11 && dev->vendor == PCI_VENDOR_ID_O2) {
22592+ if (acer_tm360_irqrouting && dev->irq == 11 &&
22593+ dev->vendor == PCI_VENDOR_ID_O2) {
22594 pirq = 0x68;
22595 mask = 0x400;
22596 dev->irq = r->get(pirq_router_dev, dev, pirq);
22597@@ -932,51 +934,50 @@ static int pcibios_lookup_irq(struct pci
22598 */
22599 newirq = dev->irq;
22600 if (newirq && !((1 << newirq) & mask)) {
22601- if ( pci_probe & PCI_USE_PIRQ_MASK) newirq = 0;
22602- else printk("\n" KERN_WARNING
22603- "PCI: IRQ %i for device %s doesn't match PIRQ mask "
22604- "- try pci=usepirqmask\n" KERN_DEBUG, newirq,
22605- pci_name(dev));
22606+ if (pci_probe & PCI_USE_PIRQ_MASK)
22607+ newirq = 0;
22608+ else
22609+ dev_warn(&dev->dev, "IRQ %d doesn't match PIRQ mask "
22610+ "%#x; try pci=usepirqmask\n", newirq, mask);
22611 }
22612 if (!newirq && assign) {
22613 for (i = 0; i < 16; i++) {
22614 if (!(mask & (1 << i)))
22615 continue;
22616- if (pirq_penalty[i] < pirq_penalty[newirq] && can_request_irq(i, IRQF_SHARED))
22617+ if (pirq_penalty[i] < pirq_penalty[newirq] &&
22618+ can_request_irq(i, IRQF_SHARED))
22619 newirq = i;
22620 }
22621 }
22622- DBG(" -> newirq=%d", newirq);
22623+ dev_dbg(&dev->dev, "PCI INT %c -> newirq %d", 'A' + pin, newirq);
22624
22625 /* Check if it is hardcoded */
22626 if ((pirq & 0xf0) == 0xf0) {
22627 irq = pirq & 0xf;
22628- DBG(" -> hardcoded IRQ %d\n", irq);
22629- msg = "Hardcoded";
22630- } else if ( r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \
22631- ((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask)) ) {
22632- DBG(" -> got IRQ %d\n", irq);
22633- msg = "Found";
22634+ msg = "hardcoded";
22635+ } else if (r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \
22636+ ((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask))) {
22637+ msg = "found";
22638 eisa_set_level_irq(irq);
22639- } else if (newirq && r->set && (dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) {
22640- DBG(" -> assigning IRQ %d", newirq);
22641+ } else if (newirq && r->set &&
22642+ (dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) {
22643 if (r->set(pirq_router_dev, dev, pirq, newirq)) {
22644 eisa_set_level_irq(newirq);
22645- DBG(" ... OK\n");
22646- msg = "Assigned";
22647+ msg = "assigned";
22648 irq = newirq;
22649 }
22650 }
22651
22652 if (!irq) {
22653- DBG(" ... failed\n");
22654 if (newirq && mask == (1 << newirq)) {
22655- msg = "Guessed";
22656+ msg = "guessed";
22657 irq = newirq;
22658- } else
22659+ } else {
22660+ dev_dbg(&dev->dev, "can't route interrupt\n");
22661 return 0;
22662+ }
22663 }
22664- printk(KERN_INFO "PCI: %s IRQ %d for device %s\n", msg, irq, pci_name(dev));
22665+ dev_info(&dev->dev, "%s PCI INT %c -> IRQ %d\n", msg, 'A' + pin, irq);
22666
22667 /* Update IRQ for all devices with the same pirq value */
22668 while ((dev2 = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev2)) != NULL) {
22669@@ -988,20 +989,25 @@ static int pcibios_lookup_irq(struct pci
22670 if (!info)
22671 continue;
22672 if (info->irq[pin].link == pirq) {
22673- /* We refuse to override the dev->irq information. Give a warning! */
22674- if ( dev2->irq && dev2->irq != irq && \
22675+ /*
22676+ * We refuse to override the dev->irq
22677+ * information. Give a warning!
22678+ */
22679+ if (dev2->irq && dev2->irq != irq && \
22680 (!(pci_probe & PCI_USE_PIRQ_MASK) || \
22681- ((1 << dev2->irq) & mask)) ) {
22682+ ((1 << dev2->irq) & mask))) {
22683 #ifndef CONFIG_PCI_MSI
22684- printk(KERN_INFO "IRQ routing conflict for %s, have irq %d, want irq %d\n",
22685- pci_name(dev2), dev2->irq, irq);
22686+ dev_info(&dev2->dev, "IRQ routing conflict: "
22687+ "have IRQ %d, want IRQ %d\n",
22688+ dev2->irq, irq);
22689 #endif
22690- continue;
22691- }
22692+ continue;
22693+ }
22694 dev2->irq = irq;
22695 pirq_penalty[irq]++;
22696 if (dev != dev2)
22697- printk(KERN_INFO "PCI: Sharing IRQ %d with %s\n", irq, pci_name(dev2));
22698+ dev_info(&dev->dev, "sharing IRQ %d with %s\n",
22699+ irq, pci_name(dev2));
22700 }
22701 }
22702 return 1;
22703@@ -1015,15 +1021,20 @@ static void __init pcibios_fixup_irqs(vo
22704 DBG(KERN_DEBUG "PCI: IRQ fixup\n");
22705 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
22706 /*
22707- * If the BIOS has set an out of range IRQ number, just ignore it.
22708- * Also keep track of which IRQ's are already in use.
22709+ * If the BIOS has set an out of range IRQ number, just
22710+ * ignore it. Also keep track of which IRQ's are
22711+ * already in use.
22712 */
22713 if (dev->irq >= 16) {
22714- DBG(KERN_DEBUG "%s: ignoring bogus IRQ %d\n", pci_name(dev), dev->irq);
22715+ dev_dbg(&dev->dev, "ignoring bogus IRQ %d\n", dev->irq);
22716 dev->irq = 0;
22717 }
22718- /* If the IRQ is already assigned to a PCI device, ignore its ISA use penalty */
22719- if (pirq_penalty[dev->irq] >= 100 && pirq_penalty[dev->irq] < 100000)
22720+ /*
22721+ * If the IRQ is already assigned to a PCI device,
22722+ * ignore its ISA use penalty
22723+ */
22724+ if (pirq_penalty[dev->irq] >= 100 &&
22725+ pirq_penalty[dev->irq] < 100000)
22726 pirq_penalty[dev->irq] = 0;
22727 pirq_penalty[dev->irq]++;
22728 }
22729@@ -1035,13 +1046,17 @@ static void __init pcibios_fixup_irqs(vo
22730 /*
22731 * Recalculate IRQ numbers if we use the I/O APIC.
22732 */
22733- if (io_apic_assign_pci_irqs)
22734- {
22735+ if (io_apic_assign_pci_irqs) {
22736 int irq;
22737
22738 if (pin) {
22739- pin--; /* interrupt pins are numbered starting from 1 */
22740- irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin);
22741+ /*
22742+ * interrupt pins are numbered starting
22743+ * from 1
22744+ */
22745+ pin--;
22746+ irq = IO_APIC_get_PCI_irq_vector(dev->bus->number,
22747+ PCI_SLOT(dev->devfn), pin);
22748 /*
22749 * Busses behind bridges are typically not listed in the MP-table.
22750 * In this case we have to look up the IRQ based on the parent bus,
22751@@ -1049,18 +1064,18 @@ static void __init pcibios_fixup_irqs(vo
22752 * busses itself so we should get into this branch reliably.
22753 */
22754 if (irq < 0 && dev->bus->parent) { /* go back to the bridge */
22755- struct pci_dev * bridge = dev->bus->self;
22756+ struct pci_dev *bridge = dev->bus->self;
22757
22758 pin = (pin + PCI_SLOT(dev->devfn)) % 4;
22759- irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
22760+ irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
22761 PCI_SLOT(bridge->devfn), pin);
22762 if (irq >= 0)
22763- printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n",
22764- pci_name(bridge), 'A' + pin, irq);
22765+ dev_warn(&dev->dev, "using bridge %s INT %c to get IRQ %d\n",
22766+ pci_name(bridge),
22767+ 'A' + pin, irq);
22768 }
22769 if (irq >= 0) {
22770- printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n",
22771- pci_name(dev), 'A' + pin, irq);
22772+ dev_info(&dev->dev, "PCI->APIC IRQ transform: INT %c -> IRQ %d\n", 'A' + pin, irq);
22773 dev->irq = irq;
22774 }
22775 }
22776@@ -1082,7 +1097,8 @@ static int __init fix_broken_hp_bios_irq
22777 {
22778 if (!broken_hp_bios_irq9) {
22779 broken_hp_bios_irq9 = 1;
22780- printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", d->ident);
22781+ printk(KERN_INFO "%s detected - fixing broken IRQ routing\n",
22782+ d->ident);
22783 }
22784 return 0;
22785 }
22786@@ -1095,7 +1111,8 @@ static int __init fix_acer_tm360_irqrout
22787 {
22788 if (!acer_tm360_irqrouting) {
22789 acer_tm360_irqrouting = 1;
22790- printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", d->ident);
22791+ printk(KERN_INFO "%s detected - fixing broken IRQ routing\n",
22792+ d->ident);
22793 }
22794 return 0;
22795 }
22796@@ -1107,7 +1124,8 @@ static struct dmi_system_id __initdata p
22797 .matches = {
22798 DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
22799 DMI_MATCH(DMI_BIOS_VERSION, "GE.M1.03"),
22800- DMI_MATCH(DMI_PRODUCT_VERSION, "HP Pavilion Notebook Model GE"),
22801+ DMI_MATCH(DMI_PRODUCT_VERSION,
22802+ "HP Pavilion Notebook Model GE"),
22803 DMI_MATCH(DMI_BOARD_VERSION, "OmniBook N32N-736"),
22804 },
22805 },
22806@@ -1122,7 +1140,7 @@ static struct dmi_system_id __initdata p
22807 { }
22808 };
22809
22810-static int __init pcibios_irq_init(void)
22811+int __init pcibios_irq_init(void)
22812 {
22813 DBG(KERN_DEBUG "PCI: IRQ init\n");
22814
22815@@ -1142,11 +1160,14 @@ static int __init pcibios_irq_init(void)
22816 pirq_find_router(&pirq_router);
22817 if (pirq_table->exclusive_irqs) {
22818 int i;
22819- for (i=0; i<16; i++)
22820+ for (i = 0; i < 16; i++)
22821 if (!(pirq_table->exclusive_irqs & (1 << i)))
22822 pirq_penalty[i] += 100;
22823 }
22824- /* If we're using the I/O APIC, avoid using the PCI IRQ routing table */
22825+ /*
22826+ * If we're using the I/O APIC, avoid using the PCI IRQ
22827+ * routing table
22828+ */
22829 if (io_apic_assign_pci_irqs)
22830 pirq_table = NULL;
22831 }
22832@@ -1157,9 +1178,6 @@ static int __init pcibios_irq_init(void)
22833 return 0;
22834 }
22835
22836-subsys_initcall(pcibios_irq_init);
22837-
22838-
22839 static void pirq_penalize_isa_irq(int irq, int active)
22840 {
22841 /*
22842@@ -1193,7 +1211,7 @@ static int pirq_enable_irq(struct pci_de
22843 if (pin && !pcibios_lookup_irq(dev, 1) && !dev->irq) {
22844 char *msg = "";
22845
22846- pin--; /* interrupt pins are numbered starting from 1 */
22847+ pin--; /* interrupt pins are numbered starting from 1 */
22848
22849 if (io_apic_assign_pci_irqs) {
22850 int irq;
22851@@ -1207,35 +1225,41 @@ static int pirq_enable_irq(struct pci_de
22852 */
22853 temp_dev = dev;
22854 while (irq < 0 && dev->bus->parent) { /* go back to the bridge */
22855- struct pci_dev * bridge = dev->bus->self;
22856+ struct pci_dev *bridge = dev->bus->self;
22857
22858 pin = (pin + PCI_SLOT(dev->devfn)) % 4;
22859- irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
22860+ irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
22861 PCI_SLOT(bridge->devfn), pin);
22862 if (irq >= 0)
22863- printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n",
22864- pci_name(bridge), 'A' + pin, irq);
22865+ dev_warn(&dev->dev, "using bridge %s "
22866+ "INT %c to get IRQ %d\n",
22867+ pci_name(bridge), 'A' + pin,
22868+ irq);
22869 dev = bridge;
22870 }
22871 dev = temp_dev;
22872 if (irq >= 0) {
22873- printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n",
22874- pci_name(dev), 'A' + pin, irq);
22875+ dev_info(&dev->dev, "PCI->APIC IRQ transform: "
22876+ "INT %c -> IRQ %d\n", 'A' + pin, irq);
22877 dev->irq = irq;
22878 return 0;
22879 } else
22880- msg = " Probably buggy MP table.";
22881+ msg = "; probably buggy MP table";
22882 } else if (pci_probe & PCI_BIOS_IRQ_SCAN)
22883 msg = "";
22884 else
22885- msg = " Please try using pci=biosirq.";
22886+ msg = "; please try using pci=biosirq";
22887
22888- /* With IDE legacy devices the IRQ lookup failure is not a problem.. */
22889- if (dev->class >> 8 == PCI_CLASS_STORAGE_IDE && !(dev->class & 0x5))
22890+ /*
22891+ * With IDE legacy devices the IRQ lookup failure is not
22892+ * a problem..
22893+ */
22894+ if (dev->class >> 8 == PCI_CLASS_STORAGE_IDE &&
22895+ !(dev->class & 0x5))
22896 return 0;
22897
22898- printk(KERN_WARNING "PCI: No IRQ known for interrupt pin %c of device %s.%s\n",
22899- 'A' + pin, pci_name(dev), msg);
22900+ dev_warn(&dev->dev, "can't find IRQ for PCI INT %c%s\n",
22901+ 'A' + pin, msg);
22902 }
22903 return 0;
22904 }
22905--- sle11-2009-06-04.orig/arch/x86/vdso/Makefile 2009-03-16 16:33:40.000000000 +0100
22906+++ sle11-2009-06-04/arch/x86/vdso/Makefile 2009-06-04 10:21:39.000000000 +0200
22907@@ -65,9 +65,7 @@ obj-$(VDSO32-y) += vdso32-syms.lds
22908 vdso32.so-$(VDSO32-y) += int80
22909 vdso32.so-$(CONFIG_COMPAT) += syscall
22910 vdso32.so-$(VDSO32-y) += sysenter
22911-xen-vdso32-$(subst 1,$(CONFIG_COMPAT),$(shell expr $(CONFIG_XEN_COMPAT)0 '<' 0x0302000)) += int80
22912-xen-vdso32-$(CONFIG_X86_32) += syscall
22913-vdso32.so-$(CONFIG_XEN) += $(xen-vdso32-y)
22914+vdso32.so-$(CONFIG_X86_XEN) += syscall
22915
22916 vdso32-images = $(vdso32.so-y:%=vdso32-%.so)
22917
22918--- sle11-2009-06-04.orig/arch/x86/vdso/vdso32.S 2009-03-16 16:33:40.000000000 +0100
22919+++ sle11-2009-06-04/arch/x86/vdso/vdso32.S 2009-06-04 10:21:39.000000000 +0200
22920@@ -9,7 +9,7 @@ vdso32_int80_end:
22921
22922 .globl vdso32_syscall_start, vdso32_syscall_end
22923 vdso32_syscall_start:
22924-#ifdef CONFIG_COMPAT
22925+#if defined(CONFIG_COMPAT) || defined(CONFIG_X86_XEN)
22926 .incbin "arch/x86/vdso/vdso32-syscall.so"
22927 #endif
22928 vdso32_syscall_end:
22929@@ -19,16 +19,4 @@ vdso32_sysenter_start:
22930 .incbin "arch/x86/vdso/vdso32-sysenter.so"
22931 vdso32_sysenter_end:
22932
22933-#if defined(CONFIG_X86_64_XEN) && CONFIG_XEN_COMPAT < 0x030200
22934- .globl vdso32_int80_start, vdso32_int80_end
22935-vdso32_int80_start:
22936- .incbin "arch/x86/vdso/vdso32-int80.so"
22937-vdso32_int80_end:
22938-#elif defined(CONFIG_X86_XEN)
22939- .globl vdso32_syscall_start, vdso32_syscall_end
22940-vdso32_syscall_start:
22941- .incbin "arch/x86/vdso/vdso32-syscall.so"
22942-vdso32_syscall_end:
22943-#endif
22944-
22945 __FINIT
22946--- sle11-2009-06-04.orig/arch/x86/vdso/vdso32-setup-xen.c 2009-03-16 16:38:05.000000000 +0100
22947+++ sle11-2009-06-04/arch/x86/vdso/vdso32-setup-xen.c 2009-06-04 10:21:39.000000000 +0200
22948@@ -195,50 +195,28 @@ static __init void relocate_vdso(Elf32_E
22949 }
22950 }
22951
22952-/*
22953- * These symbols are defined by vdso32.S to mark the bounds
22954- * of the ELF DSO images included therein.
22955- */
22956-extern const char vdso32_default_start, vdso32_default_end;
22957-extern const char vdso32_sysenter_start, vdso32_sysenter_end;
22958 static struct page *vdso32_pages[1];
22959
22960 #ifdef CONFIG_X86_64
22961
22962-#if CONFIG_XEN_COMPAT < 0x030200
22963-static int use_int80 = 1;
22964-#endif
22965-static int use_sysenter __read_mostly = -1;
22966-
22967-#define vdso32_sysenter() (use_sysenter > 0)
22968+#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SYSENTER32))
22969+#define vdso32_syscall() (boot_cpu_has(X86_FEATURE_SYSCALL32))
22970
22971-/* May not be __init: called during resume */
22972-void syscall32_cpu_init(void)
22973+void __cpuinit syscall32_cpu_init(void)
22974 {
22975- static const struct callback_register cstar = {
 22976+ static const struct callback_register __cpuinitconst cstar = {
22977 .type = CALLBACKTYPE_syscall32,
22978 .address = (unsigned long)ia32_cstar_target
22979 };
22980- static const struct callback_register sysenter = {
22981+	static const struct callback_register __cpuinitconst sysenter = {
22982 .type = CALLBACKTYPE_sysenter,
22983 .address = (unsigned long)ia32_sysenter_target
22984 };
22985
22986- if ((HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) < 0) ||
22987- (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) < 0))
22988-#if CONFIG_XEN_COMPAT < 0x030200
22989- return;
22990- use_int80 = 0;
22991-#else
22992- BUG();
22993-#endif
22994-
22995- if (use_sysenter < 0) {
22996- if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
22997- use_sysenter = 1;
22998- if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR)
22999- use_sysenter = 1;
23000- }
23001+ if (HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) < 0)
23002+ setup_clear_cpu_cap(X86_FEATURE_SYSENTER32);
23003+ if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) < 0)
23004+ setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
23005 }
23006
23007 #define compat_uses_vma 1
23008@@ -250,6 +228,7 @@ static inline void map_compat_vdso(int m
23009 #else /* CONFIG_X86_32 */
23010
23011 #define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SEP))
23012+#define vdso32_syscall() (boot_cpu_has(X86_FEATURE_SYSCALL32))
23013
23014 extern asmlinkage void ia32pv_cstar_target(void);
23015 static const struct callback_register __cpuinitconst cstar = {
23016@@ -265,13 +244,13 @@ void __cpuinit enable_sep_cpu(void)
23017 .address = { __KERNEL_CS, (unsigned long)ia32pv_sysenter_target },
23018 };
23019
23020- if (boot_cpu_has(X86_FEATURE_SYSCALL)) {
23021+ if (vdso32_syscall()) {
23022 if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) != 0)
23023 BUG();
23024 return;
23025 }
23026
23027- if (!boot_cpu_has(X86_FEATURE_SEP))
23028+ if (!vdso32_sysenter())
23029 return;
23030
23031 if (xen_feature(XENFEAT_supervisor_mode_kernel))
23032@@ -341,34 +320,26 @@ int __init sysenter_setup(void)
23033
23034 #ifdef CONFIG_X86_32
23035 gate_vma_init();
23036-#endif
23037
23038-#if defined(CONFIG_X86_64) && CONFIG_XEN_COMPAT < 0x030200
23039- if (use_int80) {
23040- extern const char vdso32_int80_start, vdso32_int80_end;
23041-
23042- vsyscall = &vdso32_int80_start;
23043- vsyscall_len = &vdso32_int80_end - &vdso32_int80_start;
23044- } else
23045-#elif defined(CONFIG_X86_32)
23046- if (boot_cpu_has(X86_FEATURE_SYSCALL)
23047- && (boot_cpu_data.x86_vendor != X86_VENDOR_AMD
23048- || HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) != 0))
23049- setup_clear_cpu_cap(X86_FEATURE_SYSCALL);
23050- barrier(); /* until clear_bit()'s constraints are correct ... */
23051 if (boot_cpu_has(X86_FEATURE_SYSCALL)) {
23052- extern const char vdso32_syscall_start, vdso32_syscall_end;
23053-
23054+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD
23055+ && HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) == 0)
23056+ setup_force_cpu_cap(X86_FEATURE_SYSCALL32);
23057+ else {
23058+ setup_clear_cpu_cap(X86_FEATURE_SYSCALL);
23059+ setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
23060+ }
23061+ }
23062+#endif
23063+ if (vdso32_syscall()) {
23064 vsyscall = &vdso32_syscall_start;
23065 vsyscall_len = &vdso32_syscall_end - &vdso32_syscall_start;
23066- } else
23067-#endif
23068- if (!vdso32_sysenter()) {
23069- vsyscall = &vdso32_default_start;
23070- vsyscall_len = &vdso32_default_end - &vdso32_default_start;
23071- } else {
23072+ } else if (vdso32_sysenter()){
23073 vsyscall = &vdso32_sysenter_start;
23074 vsyscall_len = &vdso32_sysenter_end - &vdso32_sysenter_start;
23075+ } else {
23076+ vsyscall = &vdso32_int80_start;
23077+ vsyscall_len = &vdso32_int80_end - &vdso32_int80_start;
23078 }
23079
23080 memcpy(syscall_page, vsyscall, vsyscall_len);
23081--- sle11-2009-06-04.orig/arch/x86/xen/Kconfig 2009-02-16 16:17:21.000000000 +0100
23082+++ sle11-2009-06-04/arch/x86/xen/Kconfig 2009-06-04 10:21:39.000000000 +0200
23083@@ -17,7 +17,7 @@ config XEN_MAX_DOMAIN_MEMORY
23084 int "Maximum allowed size of a domain in gigabytes"
23085 default 8 if X86_32
23086 default 32 if X86_64
23087- depends on XEN
23088+ depends on PARAVIRT_XEN
23089 help
23090 The pseudo-physical to machine address array is sized
23091 according to the maximum possible memory size of a Xen
23092@@ -26,5 +26,5 @@ config XEN_MAX_DOMAIN_MEMORY
23093
23094 config XEN_SAVE_RESTORE
23095 bool
23096- depends on PM
23097+ depends on PARAVIRT_XEN && PM
23098 default y
23099\ No newline at end of file
23100--- sle11-2009-06-04.orig/drivers/acpi/processor_core.c 2009-03-16 16:38:05.000000000 +0100
23101+++ sle11-2009-06-04/drivers/acpi/processor_core.c 2009-06-04 10:21:39.000000000 +0200
23102@@ -721,9 +721,11 @@ static int __cpuinit acpi_processor_star
23103 if (result)
23104 goto end;
23105
23106- sysdev = get_cpu_sysdev(pr->id);
23107- if (sysfs_create_link(&device->dev.kobj, &sysdev->kobj, "sysdev"))
23108- return -EFAULT;
23109+ if (pr->id != -1) {
23110+ sysdev = get_cpu_sysdev(pr->id);
23111+ if (sysfs_create_link(&device->dev.kobj, &sysdev->kobj, "sysdev"))
23112+ return -EFAULT;
23113+ }
23114
23115 status = acpi_install_notify_handler(pr->handle, ACPI_DEVICE_NOTIFY,
23116 acpi_processor_notify, pr);
23117@@ -895,7 +897,8 @@ static int acpi_processor_remove(struct
23118 status = acpi_remove_notify_handler(pr->handle, ACPI_DEVICE_NOTIFY,
23119 acpi_processor_notify);
23120
23121- sysfs_remove_link(&device->dev.kobj, "sysdev");
23122+ if (pr->id != -1)
23123+ sysfs_remove_link(&device->dev.kobj, "sysdev");
23124
23125 acpi_processor_remove_fs(device);
23126
23127--- sle11-2009-06-04.orig/drivers/char/tpm/tpm_vtpm.c 2009-02-16 15:58:14.000000000 +0100
23128+++ sle11-2009-06-04/drivers/char/tpm/tpm_vtpm.c 2009-06-04 10:21:39.000000000 +0200
23129@@ -347,7 +347,7 @@ static int _vtpm_send_queued(struct tpm_
23130 {
23131 int rc;
23132 int error = 0;
23133- long flags;
23134+ unsigned long flags;
23135 unsigned char buffer[1];
23136 struct vtpm_state *vtpms;
23137 vtpms = (struct vtpm_state *)chip_get_private(chip);
23138--- sle11-2009-06-04.orig/drivers/misc/Kconfig 2009-06-04 11:08:07.000000000 +0200
23139+++ sle11-2009-06-04/drivers/misc/Kconfig 2009-06-04 10:21:39.000000000 +0200
23140@@ -438,7 +438,7 @@ config ENCLOSURE_SERVICES
23141 config SGI_XP
23142 tristate "Support communication between SGI SSIs"
23143 depends on NET
23144- depends on (IA64_GENERIC || IA64_SGI_SN2 || IA64_SGI_UV || X86_64) && SMP
23145+ depends on (IA64_GENERIC || IA64_SGI_SN2 || IA64_SGI_UV || X86_64) && SMP && !XEN
23146 select IA64_UNCACHED_ALLOCATOR if IA64_GENERIC || IA64_SGI_SN2
23147 select GENERIC_ALLOCATOR if IA64_GENERIC || IA64_SGI_SN2
23148 select SGI_GRU if (IA64_GENERIC || IA64_SGI_UV || X86_64) && SMP
23149@@ -465,7 +465,7 @@ config HP_ILO
23150
23151 config SGI_GRU
23152 tristate "SGI GRU driver"
23153- depends on (X86_64 || IA64_SGI_UV || IA64_GENERIC) && SMP
23154+ depends on (X86_64 || IA64_SGI_UV || IA64_GENERIC) && SMP && !XEN
23155 default n
23156 select MMU_NOTIFIER
23157 ---help---
23158--- sle11-2009-06-04.orig/drivers/pci/msi-xen.c 2009-03-16 16:38:05.000000000 +0100
23159+++ sle11-2009-06-04/drivers/pci/msi-xen.c 2009-06-04 10:21:39.000000000 +0200
23160@@ -90,12 +90,10 @@ arch_teardown_msi_irqs(struct pci_dev *d
23161 }
23162 #endif
23163
23164-static void msi_set_enable(struct pci_dev *dev, int enable)
23165+static void __msi_set_enable(struct pci_dev *dev, int pos, int enable)
23166 {
23167- int pos;
23168 u16 control;
23169
23170- pos = pci_find_capability(dev, PCI_CAP_ID_MSI);
23171 if (pos) {
23172 pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &control);
23173 control &= ~PCI_MSI_FLAGS_ENABLE;
23174@@ -105,6 +103,11 @@ static void msi_set_enable(struct pci_de
23175 }
23176 }
23177
23178+static void msi_set_enable(struct pci_dev *dev, int enable)
23179+{
23180+ __msi_set_enable(dev, pci_find_capability(dev, PCI_CAP_ID_MSI), enable);
23181+}
23182+
23183 static void msix_set_enable(struct pci_dev *dev, int enable)
23184 {
23185 int pos;
23186@@ -568,9 +571,8 @@ int pci_enable_msi(struct pci_dev* dev)
23187
23188 /* Check whether driver already requested for MSI-X irqs */
23189 if (dev->msix_enabled) {
23190- printk(KERN_INFO "PCI: %s: Can't enable MSI. "
23191- "Device already has MSI-X enabled\n",
23192- pci_name(dev));
23193+ dev_info(&dev->dev, "can't enable MSI "
23194+ "(MSI-X already enabled)\n");
23195 return -EINVAL;
23196 }
23197
23198@@ -702,9 +704,8 @@ int pci_enable_msix(struct pci_dev* dev,
23199 temp = dev->irq;
23200 /* Check whether driver already requested for MSI vector */
23201 if (dev->msi_enabled) {
23202- printk(KERN_INFO "PCI: %s: Can't enable MSI-X. "
23203- "Device already has an MSI irq assigned\n",
23204- pci_name(dev));
23205+ dev_info(&dev->dev, "can't enable MSI-X "
23206+ "(MSI IRQ already assigned)\n");
23207 return -EINVAL;
23208 }
23209
23210--- sle11-2009-06-04.orig/drivers/pci/quirks.c 2009-06-04 11:08:07.000000000 +0200
23211+++ sle11-2009-06-04/drivers/pci/quirks.c 2009-06-04 10:21:39.000000000 +0200
23212@@ -44,9 +44,8 @@ static void __devinit quirk_release_reso
23213 /* PCI Host Bridge isn't a target device */
23214 return;
23215 }
23216- printk(KERN_INFO
23217-		"PCI: Disable memory decoding and release memory resources [%s].\n",
23218-		pci_name(dev));
23219+ dev_info(&dev->dev,
23220+ "disable memory decoding and release memory resources\n");
23221 pci_read_config_word(dev, PCI_COMMAND, &command);
23222 command &= ~PCI_COMMAND_MEMORY;
23223 pci_write_config_word(dev, PCI_COMMAND, command);
23224--- sle11-2009-06-04.orig/drivers/pci/setup-res.c 2009-06-04 11:08:07.000000000 +0200
23225+++ sle11-2009-06-04/drivers/pci/setup-res.c 2009-06-04 10:21:39.000000000 +0200
23226@@ -129,7 +129,7 @@ int pci_claim_resource(struct pci_dev *d
23227 #ifdef CONFIG_PCI_REASSIGN
23228 void pci_disable_bridge_window(struct pci_dev *dev)
23229 {
23230- printk(KERN_DEBUG "PCI: Disable bridge window on %s\n", pci_name(dev));
23231+ dev_dbg(&dev->dev, "disable bridge window\n");
23232
23233 /* MMIO Base/Limit */
23234 pci_write_config_dword(dev, PCI_MEMORY_BASE, 0x0000fff0);
23235@@ -190,8 +190,8 @@ int pci_assign_resource(struct pci_dev *
23236 res->flags &= ~IORESOURCE_STARTALIGN;
23237 if (resno < PCI_BRIDGE_RESOURCES) {
23238 #ifdef CONFIG_PCI_REASSIGN
23239- printk(KERN_DEBUG "PCI: Assign resource(%d) on %s "
23240- "%016llx - %016llx\n", resno, pci_name(dev),
23241+ dev_dbg(&dev->dev, "assign resource(%d) "
23242+ "%016llx - %016llx\n", resno,
23243 (unsigned long long)res->start,
23244 (unsigned long long)res->end);
23245 #endif
23246@@ -235,8 +235,8 @@ int pci_assign_resource_fixed(struct pci
23247 (unsigned long long)res->end);
23248 } else if (resno < PCI_BRIDGE_RESOURCES) {
23249 #ifdef CONFIG_PCI_REASSIGN
23250- printk(KERN_DEBUG "PCI: Assign resource(%d) on %s "
23251- "%016llx - %016llx\n", resno, pci_name(dev),
23252+ dev_dbg(&dev->dev, "assign resource(%d) "
23253+ "%016llx - %016llx\n", resno,
23254 (unsigned long long)res->start,
23255 (unsigned long long)res->end);
23256 #endif
23257--- sle11-2009-06-04.orig/drivers/xen/Makefile 2009-03-16 16:38:05.000000000 +0100
23258+++ sle11-2009-06-04/drivers/xen/Makefile 2009-06-04 10:21:39.000000000 +0200
23259@@ -1,4 +1,4 @@
23260-obj-$(CONFIG_PARAVIRT_XEN) += grant-table.o features.o events.o
23261+obj-$(CONFIG_PARAVIRT_XEN) += grant-table.o features.o events.o manage.o
23262 xen-xencomm-$(CONFIG_PARAVIRT_XEN) := xencomm.o
23263 xen-balloon-$(CONFIG_PARAVIRT_XEN) := balloon.o
23264
23265--- sle11-2009-06-04.orig/drivers/xen/balloon/sysfs.c 2009-03-16 16:33:40.000000000 +0100
23266+++ sle11-2009-06-04/drivers/xen/balloon/sysfs.c 2009-06-04 10:21:39.000000000 +0200
23267@@ -45,6 +45,7 @@
23268
23269 #define BALLOON_SHOW(name, format, args...) \
23270 static ssize_t show_##name(struct sys_device *dev, \
23271+ struct sysdev_attribute *attr, \
23272 char *buf) \
23273 { \
23274 return sprintf(buf, format, ##args); \
23275@@ -59,14 +60,15 @@ BALLOON_SHOW(hard_limit_kb,
23276 (bs.hard_limit!=~0UL) ? PAGES2KB(bs.hard_limit) : 0);
23277 BALLOON_SHOW(driver_kb, "%lu\n", PAGES2KB(bs.driver_pages));
23278
23279-static ssize_t show_target_kb(struct sys_device *dev, char *buf)
23280+static ssize_t show_target_kb(struct sys_device *dev,
23281+ struct sysdev_attribute *attr, char *buf)
23282 {
23283 return sprintf(buf, "%lu\n", PAGES2KB(bs.target_pages));
23284 }
23285
23286 static ssize_t store_target_kb(struct sys_device *dev,
23287- const char *buf,
23288- size_t count)
23289+ struct sysdev_attribute *attr,
23290+ const char *buf, size_t count)
23291 {
23292 char memstring[64], *endchar;
23293 unsigned long long target_bytes;
23294--- sle11-2009-06-04.orig/drivers/xen/blktap/blktap.c 2009-04-20 11:40:14.000000000 +0200
23295+++ sle11-2009-06-04/drivers/xen/blktap/blktap.c 2009-06-04 10:21:39.000000000 +0200
23296@@ -54,6 +54,7 @@
23297 #include <linux/gfp.h>
23298 #include <linux/poll.h>
23299 #include <linux/delay.h>
23300+#include <linux/nsproxy.h>
23301 #include <asm/tlbflush.h>
23302
23303 #define MAX_TAP_DEV 256 /*the maximum number of tapdisk ring devices */
23304@@ -498,7 +499,7 @@ found:
23305
23306 if ((class = get_xen_class()) != NULL)
23307 device_create(class, NULL, MKDEV(blktap_major, minor),
23308- "blktap%d", minor);
23309+ NULL, "blktap%d", minor);
23310 }
23311
23312 out:
23313@@ -1683,7 +1684,8 @@ static int __init blkif_init(void)
23314 * We only create the device when a request of a new device is
23315 * made.
23316 */
23317- device_create(class, NULL, MKDEV(blktap_major, 0), "blktap0");
23318+ device_create(class, NULL, MKDEV(blktap_major, 0), NULL,
23319+ "blktap0");
23320 } else {
23321 /* this is bad, but not fatal */
23322 WPRINTK("blktap: sysfs xen_class not created\n");
23323--- sle11-2009-06-04.orig/drivers/xen/char/mem.c 2009-03-16 16:38:05.000000000 +0100
23324+++ sle11-2009-06-04/drivers/xen/char/mem.c 2009-06-04 10:21:39.000000000 +0200
23325@@ -35,7 +35,7 @@ static inline int uncached_access(struct
23326
23327 static inline int range_is_allowed(unsigned long pfn, unsigned long size)
23328 {
23329-#ifdef CONFIG_NONPROMISC_DEVMEM
23330+#ifdef CONFIG_STRICT_DEVMEM
23331 u64 from = ((u64)pfn) << PAGE_SHIFT;
23332 u64 to = from + size;
23333 u64 cursor = from;
23334@@ -172,7 +172,10 @@ static void mmap_mem_close(struct vm_are
23335
23336 static struct vm_operations_struct mmap_mem_ops = {
23337 .open = mmap_mem_open,
23338- .close = mmap_mem_close
23339+ .close = mmap_mem_close,
23340+#ifdef CONFIG_HAVE_IOREMAP_PROT
23341+ .access = generic_access_phys
23342+#endif
23343 };
23344
23345 static int xen_mmap_mem(struct file * file, struct vm_area_struct * vma)
23346--- sle11-2009-06-04.orig/drivers/xen/console/console.c 2009-03-16 16:38:05.000000000 +0100
23347+++ sle11-2009-06-04/drivers/xen/console/console.c 2009-06-04 10:21:39.000000000 +0200
23348@@ -432,9 +432,7 @@ static void __xencons_tx_flush(void)
23349
23350 if (work_done && (xencons_tty != NULL)) {
23351 wake_up_interruptible(&xencons_tty->write_wait);
23352- if ((xencons_tty->flags & (1 << TTY_DO_WRITE_WAKEUP)) &&
23353- (xencons_tty->ldisc.write_wakeup != NULL))
23354- (xencons_tty->ldisc.write_wakeup)(xencons_tty);
23355+ tty_wakeup(xencons_tty);
23356 }
23357 }
23358
23359@@ -635,8 +633,8 @@ static void xencons_close(struct tty_str
23360 tty->closing = 1;
23361 tty_wait_until_sent(tty, 0);
23362 tty_driver_flush_buffer(tty);
23363- if (tty->ldisc.flush_buffer != NULL)
23364- tty->ldisc.flush_buffer(tty);
23365+ if (tty->ldisc.ops->flush_buffer != NULL)
23366+ tty->ldisc.ops->flush_buffer(tty);
23367 tty->closing = 0;
23368 spin_lock_irqsave(&xencons_lock, flags);
23369 xencons_tty = NULL;
23370--- sle11-2009-06-04.orig/drivers/xen/core/evtchn.c 2009-03-16 16:33:40.000000000 +0100
23371+++ sle11-2009-06-04/drivers/xen/core/evtchn.c 2009-06-04 10:21:39.000000000 +0200
23372@@ -746,8 +746,9 @@ static struct irq_chip dynirq_chip = {
23373 };
23374
23375 /* Bitmap indicating which PIRQs require Xen to be notified on unmask. */
23376-static int pirq_eoi_does_unmask;
23377+static bool pirq_eoi_does_unmask;
23378 static unsigned long *pirq_needs_eoi;
23379+static DECLARE_BITMAP(probing_pirq, NR_PIRQS);
23380
23381 static void pirq_unmask_and_notify(unsigned int evtchn, unsigned int irq)
23382 {
23383@@ -794,25 +795,31 @@ static inline void pirq_query_unmask(int
23384 set_bit(irq - PIRQ_BASE, pirq_needs_eoi);
23385 }
23386
23387-/*
23388- * On startup, if there is no action associated with the IRQ then we are
23389- * probing. In this case we should not share with others as it will confuse us.
23390- */
23391-#define probing_irq(_irq) (irq_desc[(_irq)].action == NULL)
23392+static int set_type_pirq(unsigned int irq, unsigned int type)
23393+{
23394+ if (type != IRQ_TYPE_PROBE)
23395+ return -EINVAL;
23396+ set_bit(irq - PIRQ_BASE, probing_pirq);
23397+ return 0;
23398+}
23399
23400 static unsigned int startup_pirq(unsigned int irq)
23401 {
23402 struct evtchn_bind_pirq bind_pirq;
23403 int evtchn = evtchn_from_irq(irq);
23404
23405- if (VALID_EVTCHN(evtchn))
23406+ if (VALID_EVTCHN(evtchn)) {
23407+ clear_bit(irq - PIRQ_BASE, probing_pirq);
23408 goto out;
23409+ }
23410
23411 bind_pirq.pirq = evtchn_get_xen_pirq(irq);
23412 /* NB. We are happy to share unless we are probing. */
23413- bind_pirq.flags = probing_irq(irq) ? 0 : BIND_PIRQ__WILL_SHARE;
23414+ bind_pirq.flags = test_and_clear_bit(irq - PIRQ_BASE, probing_pirq)
23415+ || (irq_desc[irq].status & IRQ_AUTODETECT)
23416+ ? 0 : BIND_PIRQ__WILL_SHARE;
23417 if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind_pirq) != 0) {
23418- if (!probing_irq(irq))
23419+ if (bind_pirq.flags)
23420 printk(KERN_INFO "Failed to obtain physical IRQ %d\n",
23421 irq);
23422 return 0;
23423@@ -891,6 +898,7 @@ static struct irq_chip pirq_chip = {
23424 .mask_ack = ack_pirq,
23425 .ack = ack_pirq,
23426 .end = end_pirq,
23427+ .set_type = set_type_pirq,
23428 #ifdef CONFIG_SMP
23429 .set_affinity = set_affinity_irq,
23430 #endif
23431@@ -1003,6 +1011,7 @@ void xen_poll_irq(int irq)
23432 BUG();
23433 }
23434
23435+#ifdef CONFIG_PM_SLEEP
23436 static void restore_cpu_virqs(unsigned int cpu)
23437 {
23438 struct evtchn_bind_virq bind_virq;
23439@@ -1095,6 +1104,7 @@ void irq_resume(void)
23440 }
23441
23442 }
23443+#endif
23444
23445 #if defined(CONFIG_X86_IO_APIC)
23446 #define identity_mapped_irq(irq) (!IO_APIC_IRQ((irq) - PIRQ_BASE))
23447@@ -1177,7 +1187,7 @@ void __init xen_init_IRQ(void)
23448 * BITS_TO_LONGS(ALIGN(NR_PIRQS, PAGE_SIZE * 8)));
23449 eoi_gmfn.gmfn = virt_to_machine(pirq_needs_eoi) >> PAGE_SHIFT;
23450 if (HYPERVISOR_physdev_op(PHYSDEVOP_pirq_eoi_gmfn, &eoi_gmfn) == 0)
23451- pirq_eoi_does_unmask = 1;
23452+ pirq_eoi_does_unmask = true;
23453
23454 /* No event channels are 'live' right now. */
23455 for (i = 0; i < NR_EVENT_CHANNELS; i++)
23456--- sle11-2009-06-04.orig/drivers/xen/core/gnttab.c 2008-12-01 11:25:57.000000000 +0100
23457+++ sle11-2009-06-04/drivers/xen/core/gnttab.c 2009-06-04 10:21:39.000000000 +0200
23458@@ -449,6 +449,7 @@ static int map_pte_fn(pte_t *pte, struct
23459 return 0;
23460 }
23461
23462+#ifdef CONFIG_PM_SLEEP
23463 static int unmap_pte_fn(pte_t *pte, struct page *pmd_page,
23464 unsigned long addr, void *data)
23465 {
23466@@ -456,6 +457,7 @@ static int unmap_pte_fn(pte_t *pte, stru
23467 set_pte_at(&init_mm, addr, pte, __pte(0));
23468 return 0;
23469 }
23470+#endif
23471
23472 void *arch_gnttab_alloc_shared(unsigned long *frames)
23473 {
23474@@ -633,6 +635,75 @@ void __gnttab_dma_map_page(struct page *
23475 } while (unlikely(read_seqretry(&gnttab_dma_lock, seq)));
23476 }
23477
23478+#ifdef __HAVE_ARCH_PTE_SPECIAL
23479+
23480+static unsigned int GNTMAP_pte_special;
23481+
23482+bool gnttab_pre_map_adjust(unsigned int cmd, struct gnttab_map_grant_ref *map,
23483+ unsigned int count)
23484+{
23485+ unsigned int i;
23486+
23487+ if (unlikely(cmd != GNTTABOP_map_grant_ref))
23488+ count = 0;
23489+
23490+ for (i = 0; i < count; ++i, ++map) {
23491+ if (!(map->flags & GNTMAP_host_map)
23492+ || !(map->flags & GNTMAP_application_map))
23493+ continue;
23494+ if (GNTMAP_pte_special)
23495+ map->flags |= GNTMAP_pte_special;
23496+ else {
23497+ BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
23498+ return true;
23499+ }
23500+ }
23501+
23502+ return false;
23503+}
23504+EXPORT_SYMBOL(gnttab_pre_map_adjust);
23505+
23506+#if CONFIG_XEN_COMPAT < 0x030400
23507+int gnttab_post_map_adjust(const struct gnttab_map_grant_ref *map, unsigned int count)
23508+{
23509+ unsigned int i;
23510+ int rc = 0;
23511+
23512+ for (i = 0; i < count && rc == 0; ++i, ++map) {
23513+ pte_t pte;
23514+
23515+ if (!(map->flags & GNTMAP_host_map)
23516+ || !(map->flags & GNTMAP_application_map))
23517+ continue;
23518+
23519+#ifdef CONFIG_X86
23520+ pte = __pte_ma((map->dev_bus_addr | _PAGE_PRESENT | _PAGE_USER
23521+ | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_NX
23522+ | _PAGE_SPECIAL)
23523+ & __supported_pte_mask);
23524+#else
23525+#error Architecture not yet supported.
23526+#endif
23527+ if (!(map->flags & GNTMAP_readonly))
23528+ pte = pte_mkwrite(pte);
23529+
23530+ if (map->flags & GNTMAP_contains_pte) {
23531+ mmu_update_t u;
23532+
23533+ u.ptr = map->host_addr;
23534+ u.val = __pte_val(pte);
23535+ rc = HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF);
23536+ } else
23537+ rc = HYPERVISOR_update_va_mapping(map->host_addr, pte, 0);
23538+ }
23539+
23540+ return rc;
23541+}
23542+EXPORT_SYMBOL(gnttab_post_map_adjust);
23543+#endif
23544+
23545+#endif /* __HAVE_ARCH_PTE_SPECIAL */
23546+
23547 int gnttab_resume(void)
23548 {
23549 if (max_nr_grant_frames() < nr_grant_frames)
23550@@ -640,6 +711,7 @@ int gnttab_resume(void)
23551 return gnttab_map(0, nr_grant_frames - 1);
23552 }
23553
23554+#ifdef CONFIG_PM_SLEEP
23555 int gnttab_suspend(void)
23556 {
23557 #ifdef CONFIG_X86
23558@@ -649,6 +721,7 @@ int gnttab_suspend(void)
23559 #endif
23560 return 0;
23561 }
23562+#endif
23563
23564 #else /* !CONFIG_XEN */
23565
23566@@ -759,6 +832,18 @@ int __devinit gnttab_init(void)
23567 gnttab_free_count = nr_init_grefs - NR_RESERVED_ENTRIES;
23568 gnttab_free_head = NR_RESERVED_ENTRIES;
23569
23570+#if defined(CONFIG_XEN) && defined(__HAVE_ARCH_PTE_SPECIAL)
23571+ if (!xen_feature(XENFEAT_auto_translated_physmap)
23572+ && xen_feature(XENFEAT_gnttab_map_avail_bits)) {
23573+#ifdef CONFIG_X86
23574+ GNTMAP_pte_special = (__pte_val(pte_mkspecial(__pte_ma(0)))
23575+ >> _PAGE_BIT_UNUSED1) << _GNTMAP_guest_avail0;
23576+#else
23577+#error Architecture not yet supported.
23578+#endif
23579+ }
23580+#endif
23581+
23582 return 0;
23583
23584 ini_nomem:
00e5a55c
BS
23585--- sle11-2009-06-04.orig/drivers/xen/core/machine_kexec.c 2009-03-16 16:38:05.000000000 +0100
23586+++ sle11-2009-06-04/drivers/xen/core/machine_kexec.c 2009-06-04 10:21:39.000000000 +0200
23587@@ -91,7 +91,7 @@ void __init xen_machine_kexec_setup_reso
23588 xen_hypervisor_res.start = range.start;
23589 xen_hypervisor_res.end = range.start + range.size - 1;
23590 xen_hypervisor_res.flags = IORESOURCE_BUSY | IORESOURCE_MEM;
23591-#ifdef CONFIG_X86_64
23592+#ifdef CONFIG_X86
23593 insert_resource(&iomem_resource, &xen_hypervisor_res);
23594 #endif
23595
23596@@ -106,7 +106,7 @@ void __init xen_machine_kexec_setup_reso
23597 if (range.size) {
23598 crashk_res.start = range.start;
23599 crashk_res.end = range.start + range.size - 1;
23600-#ifdef CONFIG_X86_64
23601+#ifdef CONFIG_X86
23602 insert_resource(&iomem_resource, &crashk_res);
23603 #endif
23604 }
23605@@ -160,7 +160,7 @@ void __init xen_machine_kexec_setup_reso
23606 return;
23607 }
23608
23609-#ifndef CONFIG_X86_64
23610+#ifndef CONFIG_X86
23611 void __init xen_machine_kexec_register_resources(struct resource *res)
23612 {
23613 request_resource(res, &xen_hypervisor_res);
23614--- sle11-2009-06-04.orig/drivers/xen/core/machine_reboot.c 2009-06-04 11:08:07.000000000 +0200
23615+++ sle11-2009-06-04/drivers/xen/core/machine_reboot.c 2009-06-04 10:21:39.000000000 +0200
23616@@ -57,6 +57,7 @@ EXPORT_SYMBOL(machine_restart);
23617 EXPORT_SYMBOL(machine_halt);
23618 EXPORT_SYMBOL(machine_power_off);
23619
23620+#ifdef CONFIG_PM_SLEEP
23621 static void pre_suspend(void)
23622 {
23623 HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
23624@@ -111,6 +112,7 @@ static void post_suspend(int suspend_can
23625 HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
23626 virt_to_mfn(pfn_to_mfn_frame_list_list);
23627 }
23628+#endif
23629
23630 #else /* !(defined(__i386__) || defined(__x86_64__)) */
23631
23632@@ -129,6 +131,7 @@ static void post_suspend(int suspend_can
23633
23634 #endif
23635
23636+#ifdef CONFIG_PM_SLEEP
23637 struct suspend {
23638 int fast_suspend;
23639 void (*resume_notifier)(int);
23640@@ -222,7 +225,8 @@ int __xen_suspend(int fast_suspend, void
23641
23642 if (fast_suspend) {
23643 xenbus_suspend();
23644- err = stop_machine_run(take_machine_down, &suspend, 0);
23645+ err = stop_machine(take_machine_down, &suspend,
23646+ &cpumask_of_cpu(0));
23647 if (err < 0)
23648 xenbus_suspend_cancel();
23649 } else {
23650@@ -245,3 +249,4 @@ int __xen_suspend(int fast_suspend, void
23651
23652 return 0;
23653 }
23654+#endif
23655--- sle11-2009-06-04.orig/drivers/xen/core/reboot.c 2009-02-16 16:17:21.000000000 +0100
23656+++ sle11-2009-06-04/drivers/xen/core/reboot.c 2009-06-04 10:21:39.000000000 +0200
23657@@ -29,17 +29,12 @@ MODULE_LICENSE("Dual BSD/GPL");
23658 /* Ignore multiple shutdown requests. */
23659 static int shutting_down = SHUTDOWN_INVALID;
23660
23661-/* Was last suspend request cancelled? */
23662-static int suspend_cancelled;
23663-
23664 /* Can we leave APs online when we suspend? */
23665 static int fast_suspend;
23666
23667 static void __shutdown_handler(struct work_struct *unused);
23668 static DECLARE_DELAYED_WORK(shutdown_work, __shutdown_handler);
23669
23670-static int setup_suspend_evtchn(void);
23671-
23672 int __xen_suspend(int fast_suspend, void (*resume_notifier)(int));
23673
23674 static int shutdown_process(void *__unused)
23675@@ -69,6 +64,13 @@ static int shutdown_process(void *__unus
23676 return 0;
23677 }
23678
23679+#ifdef CONFIG_PM_SLEEP
23680+
23681+static int setup_suspend_evtchn(void);
23682+
23683+/* Was last suspend request cancelled? */
23684+static int suspend_cancelled;
23685+
23686 static void xen_resume_notifier(int _suspend_cancelled)
23687 {
23688 int old_state = xchg(&shutting_down, SHUTDOWN_RESUMING);
23689@@ -118,6 +120,10 @@ static int xen_suspend(void *__unused)
23690 return 0;
23691 }
23692
23693+#else
23694+# define xen_suspend NULL
23695+#endif
23696+
23697 static void switch_shutdown_state(int new_state)
23698 {
23699 int prev_state, old_state = SHUTDOWN_INVALID;
23700@@ -194,8 +200,10 @@ static void shutdown_handler(struct xenb
23701 new_state = SHUTDOWN_POWEROFF;
23702 else if (strcmp(str, "reboot") == 0)
23703 ctrl_alt_del();
23704+#ifdef CONFIG_PM_SLEEP
23705 else if (strcmp(str, "suspend") == 0)
23706 new_state = SHUTDOWN_SUSPEND;
23707+#endif
23708 else if (strcmp(str, "halt") == 0)
23709 new_state = SHUTDOWN_HALT;
23710 else
23711@@ -247,6 +255,7 @@ static struct xenbus_watch sysrq_watch =
23712 .callback = sysrq_handler
23713 };
23714
23715+#ifdef CONFIG_PM_SLEEP
23716 static irqreturn_t suspend_int(int irq, void* dev_id)
23717 {
23718 switch_shutdown_state(SHUTDOWN_SUSPEND);
23719@@ -274,6 +283,9 @@ static int setup_suspend_evtchn(void)
23720
23721 return 0;
23722 }
23723+#else
23724+#define setup_suspend_evtchn() 0
23725+#endif
23726
23727 static int setup_shutdown_watcher(void)
23728 {
23729--- sle11-2009-06-04.orig/drivers/xen/core/smpboot.c 2009-03-16 16:38:05.000000000 +0100
23730+++ sle11-2009-06-04/drivers/xen/core/smpboot.c 2009-06-04 10:21:39.000000000 +0200
23731@@ -27,6 +27,7 @@
23732
23733 extern irqreturn_t smp_reschedule_interrupt(int, void *);
23734 extern irqreturn_t smp_call_function_interrupt(int, void *);
23735+extern irqreturn_t smp_call_function_single_interrupt(int, void *);
23736
23737 extern int local_setup_timer(unsigned int cpu);
23738 extern void local_teardown_timer(unsigned int cpu);
23739@@ -50,8 +51,10 @@ EXPORT_PER_CPU_SYMBOL(cpu_info);
23740
23741 static DEFINE_PER_CPU(int, resched_irq);
23742 static DEFINE_PER_CPU(int, callfunc_irq);
23743+static DEFINE_PER_CPU(int, call1func_irq);
23744 static char resched_name[NR_CPUS][15];
23745 static char callfunc_name[NR_CPUS][15];
23746+static char call1func_name[NR_CPUS][15];
23747
23748 #ifdef CONFIG_X86_LOCAL_APIC
23749 #define set_cpu_to_apicid(cpu, apicid) (per_cpu(x86_cpu_to_apicid, cpu) = (apicid))
23750@@ -73,15 +76,13 @@ void __init prefill_possible_map(void)
23751
23752 for (i = 0; i < NR_CPUS; i++) {
23753 rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
23754- if (rc >= 0)
23755+ if (rc >= 0) {
23756 cpu_set(i, cpu_possible_map);
23757+ nr_cpu_ids = i + 1;
23758+ }
23759 }
23760 }
23761
23762-void __init smp_alloc_memory(void)
23763-{
23764-}
23765-
23766 static inline void
23767 set_cpu_sibling_map(unsigned int cpu)
23768 {
23769@@ -110,7 +111,8 @@ static int __cpuinit xen_smp_intr_init(u
23770 {
23771 int rc;
23772
23773- per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1;
23774+ per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) =
23775+ per_cpu(call1func_irq, cpu) = -1;
23776
23777 sprintf(resched_name[cpu], "resched%u", cpu);
23778 rc = bind_ipi_to_irqhandler(RESCHEDULE_VECTOR,
23779@@ -134,6 +136,17 @@ static int __cpuinit xen_smp_intr_init(u
23780 goto fail;
23781 per_cpu(callfunc_irq, cpu) = rc;
23782
23783+ sprintf(call1func_name[cpu], "call1func%u", cpu);
23784+ rc = bind_ipi_to_irqhandler(CALL_FUNC_SINGLE_VECTOR,
23785+ cpu,
23786+ smp_call_function_single_interrupt,
23787+ IRQF_DISABLED|IRQF_NOBALANCING,
23788+ call1func_name[cpu],
23789+ NULL);
23790+ if (rc < 0)
23791+ goto fail;
23792+ per_cpu(call1func_irq, cpu) = rc;
23793+
23794 rc = xen_spinlock_init(cpu);
23795 if (rc < 0)
23796 goto fail;
23797@@ -148,6 +161,8 @@ static int __cpuinit xen_smp_intr_init(u
23798 unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
23799 if (per_cpu(callfunc_irq, cpu) >= 0)
23800 unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
23801+ if (per_cpu(call1func_irq, cpu) >= 0)
23802+ unbind_from_irqhandler(per_cpu(call1func_irq, cpu), NULL);
23803 xen_spinlock_cleanup(cpu);
23804 return rc;
23805 }
23806@@ -160,6 +175,7 @@ static void __cpuexit xen_smp_intr_exit(
23807
23808 unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
23809 unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
23810+ unbind_from_irqhandler(per_cpu(call1func_irq, cpu), NULL);
23811 xen_spinlock_cleanup(cpu);
23812 }
23813 #endif
23814@@ -167,11 +183,7 @@ static void __cpuexit xen_smp_intr_exit(
23815 void __cpuinit cpu_bringup(void)
23816 {
23817 cpu_init();
23818-#ifdef __i386__
23819 identify_secondary_cpu(&current_cpu_data);
23820-#else
23821- identify_cpu(&current_cpu_data);
23822-#endif
23823 touch_softlockup_watchdog();
23824 preempt_disable();
23825 local_irq_enable();
23826@@ -251,9 +263,6 @@ void __init smp_prepare_cpus(unsigned in
23827 struct task_struct *idle;
23828 int apicid;
23829 struct vcpu_get_physid cpu_id;
23830-#ifdef __x86_64__
23831- struct desc_ptr *gdt_descr;
23832-#endif
23833 void *gdt_addr;
23834
23835 apicid = 0;
23836@@ -266,7 +275,7 @@ void __init smp_prepare_cpus(unsigned in
23837
23838 current_thread_info()->cpu = 0;
23839
23840- for (cpu = 0; cpu < NR_CPUS; cpu++) {
23841+ for_each_possible_cpu (cpu) {
23842 cpus_clear(per_cpu(cpu_sibling_map, cpu));
23843 cpus_clear(per_cpu(cpu_core_map, cpu));
23844 }
23845@@ -293,21 +302,10 @@ void __init smp_prepare_cpus(unsigned in
23846 if (IS_ERR(idle))
23847 panic("failed fork for CPU %d", cpu);
23848
23849-#ifdef __x86_64__
23850- gdt_descr = &cpu_gdt_descr[cpu];
23851- gdt_descr->address = get_zeroed_page(GFP_KERNEL);
23852- if (unlikely(!gdt_descr->address)) {
23853- printk(KERN_CRIT "CPU%d failed to allocate GDT\n",
23854- cpu);
23855- continue;
23856- }
23857- gdt_descr->size = GDT_SIZE;
23858- memcpy((void *)gdt_descr->address, cpu_gdt_table, GDT_SIZE);
23859- gdt_addr = (void *)gdt_descr->address;
23860-#else
23861+#ifdef __i386__
23862 init_gdt(cpu);
23863- gdt_addr = get_cpu_gdt_table(cpu);
23864 #endif
23865+ gdt_addr = get_cpu_gdt_table(cpu);
23866 make_page_readonly(gdt_addr, XENFEAT_writable_descriptor_tables);
23867
23868 apicid = cpu;
23869@@ -353,8 +351,8 @@ void __init smp_prepare_boot_cpu(void)
23870 {
23871 #ifdef __i386__
23872 init_gdt(smp_processor_id());
23873- switch_to_new_gdt();
23874 #endif
23875+ switch_to_new_gdt();
23876 prefill_possible_map();
23877 }
23878
23879--- sle11-2009-06-04.orig/drivers/xen/core/spinlock.c 2009-03-16 16:33:40.000000000 +0100
23880+++ sle11-2009-06-04/drivers/xen/core/spinlock.c 2009-06-04 10:36:24.000000000 +0200
23881@@ -5,6 +5,8 @@
23882 * portions of this file.
23883 */
23884
23885+#if CONFIG_XEN_COMPAT >= 0x030200
23886+
23887 #include <linux/init.h>
23888 #include <linux/irq.h>
23889 #include <linux/kernel.h>
23890@@ -73,9 +75,9 @@ int xen_spin_wait(raw_spinlock_t *lock,
23891 /* announce we're spinning */
23892 spinning.ticket = token;
23893 spinning.lock = lock;
23894- spinning.prev = __get_cpu_var(spinning);
23895+ spinning.prev = x86_read_percpu(spinning);
23896 smp_wmb();
23897- __get_cpu_var(spinning) = &spinning;
23898+ x86_write_percpu(spinning, &spinning);
23899
23900 /* clear pending */
23901 xen_clear_irq_pending(irq);
23902@@ -102,7 +104,7 @@ int xen_spin_wait(raw_spinlock_t *lock,
23903 kstat_this_cpu.irqs[irq] += !rc;
23904
23905 /* announce we're done */
23906- __get_cpu_var(spinning) = spinning.prev;
23907+ x86_write_percpu(spinning, spinning.prev);
23908 rm_lock = &__get_cpu_var(spinning_rm_lock);
23909 raw_local_irq_save(flags);
23910 __raw_write_lock(rm_lock);
23911@@ -159,3 +161,5 @@ void xen_spin_kick(raw_spinlock_t *lock,
23912 }
23913 }
23914 EXPORT_SYMBOL(xen_spin_kick);
23915+
23916+#endif /* CONFIG_XEN_COMPAT >= 0x030200 */
23917--- sle11-2009-06-04.orig/drivers/xen/fbfront/xenfb.c 2009-03-16 16:38:05.000000000 +0100
23918+++ sle11-2009-06-04/drivers/xen/fbfront/xenfb.c 2009-06-04 10:21:39.000000000 +0200
cc90b958
BS
23919@@ -18,6 +18,7 @@
23920 * frame buffer.
23921 */
23922
23923+#include <linux/console.h>
23924 #include <linux/kernel.h>
23925 #include <linux/errno.h>
23926 #include <linux/fb.h>
23927@@ -545,6 +546,28 @@ static unsigned long vmalloc_to_mfn(void
23928 return pfn_to_mfn(vmalloc_to_pfn(address));
23929 }
23930
23931+static __devinit void
23932+xenfb_make_preferred_console(void)
23933+{
23934+ struct console *c;
23935+
23936+ if (console_set_on_cmdline)
23937+ return;
23938+
23939+ acquire_console_sem();
23940+ for (c = console_drivers; c; c = c->next) {
23941+ if (!strcmp(c->name, "tty") && c->index == 0)
23942+ break;
23943+ }
23944+ release_console_sem();
23945+ if (c) {
23946+ unregister_console(c);
23947+ c->flags |= CON_CONSDEV;
23948+ c->flags &= ~CON_PRINTBUFFER; /* don't print again */
23949+ register_console(c);
23950+ }
23951+}
23952+
23953 static int __devinit xenfb_probe(struct xenbus_device *dev,
23954 const struct xenbus_device_id *id)
23955 {
23956@@ -665,6 +688,7 @@ static int __devinit xenfb_probe(struct
23957 if (ret < 0)
23958 		goto error;
23959
23960+ xenfb_make_preferred_console();
23961 return 0;
23962
23963 error_nomem:
23964@@ -882,4 +906,5 @@ static void __exit xenfb_cleanup(void)
23965 module_init(xenfb_init);
23966 module_exit(xenfb_cleanup);
23967
23968+MODULE_DESCRIPTION("Xen virtual framebuffer device frontend");
23969 MODULE_LICENSE("GPL");
23970--- sle11-2009-06-04.orig/drivers/xen/fbfront/xenkbd.c 2009-03-04 11:25:55.000000000 +0100
23971+++ sle11-2009-06-04/drivers/xen/fbfront/xenkbd.c 2009-06-04 10:21:39.000000000 +0200
23972@@ -350,4 +350,5 @@ static void __exit xenkbd_cleanup(void)
23973 module_init(xenkbd_init);
23974 module_exit(xenkbd_cleanup);
23975
23976+MODULE_DESCRIPTION("Xen virtual keyboard/pointer device frontend");
23977 MODULE_LICENSE("GPL");
23978--- sle11-2009-06-04.orig/drivers/xen/gntdev/gntdev.c 2009-03-16 16:38:05.000000000 +0100
23979+++ sle11-2009-06-04/drivers/xen/gntdev/gntdev.c 2009-06-04 10:21:39.000000000 +0200
23980@@ -418,7 +418,7 @@ static int __init gntdev_init(void)
23981 }
23982
23983 device = device_create(class, NULL, MKDEV(gntdev_major, 0),
23984- GNTDEV_NAME);
23985+ NULL, GNTDEV_NAME);
23986 if (IS_ERR(device)) {
23987 printk(KERN_ERR "Error creating gntdev device in xen_class\n");
23988 printk(KERN_ERR "gntdev created with major number = %d\n",
23989--- sle11-2009-06-04.orig/drivers/xen/netfront/accel.c 2009-03-30 16:39:19.000000000 +0200
23990+++ sle11-2009-06-04/drivers/xen/netfront/accel.c 2009-06-04 10:21:39.000000000 +0200
23991@@ -28,6 +28,7 @@
23992 * IN THE SOFTWARE.
23993 */
23994
23995+#include <linux/version.h>
23996 #include <linux/netdevice.h>
23997 #include <linux/skbuff.h>
23998 #include <linux/list.h>
23999--- sle11-2009-06-04.orig/drivers/xen/netfront/netfront.c 2009-03-30 16:40:17.000000000 +0200
24000+++ sle11-2009-06-04/drivers/xen/netfront/netfront.c 2009-06-04 10:21:39.000000000 +0200
24001@@ -640,7 +640,7 @@ static int network_open(struct net_devic
24002 }
24003 spin_unlock_bh(&np->rx_lock);
24004
24005- network_maybe_wake_tx(dev);
24006+ netif_start_queue(dev);
24007
24008 return 0;
24009 }
24010--- sle11-2009-06-04.orig/drivers/xen/sfc_netback/accel.h 2009-03-30 16:00:09.000000000 +0200
24011+++ sle11-2009-06-04/drivers/xen/sfc_netback/accel.h 2009-06-04 10:21:39.000000000 +0200
24012@@ -25,6 +25,7 @@
24013 #ifndef NETBACK_ACCEL_H
24014 #define NETBACK_ACCEL_H
24015
24016+#include <linux/version.h>
24017 #include <linux/slab.h>
24018 #include <linux/ip.h>
24019 #include <linux/tcp.h>
24020--- sle11-2009-06-04.orig/drivers/xen/sfc_netfront/accel.h 2009-03-30 16:34:56.000000000 +0200
24021+++ sle11-2009-06-04/drivers/xen/sfc_netfront/accel.h 2009-06-04 10:21:39.000000000 +0200
24022@@ -35,6 +35,7 @@
24023 #include <xen/evtchn.h>
24024
24025 #include <linux/kernel.h>
24026+#include <linux/version.h>
24027 #include <linux/list.h>
24028
24029 enum netfront_accel_post_status {
24030--- sle11-2009-06-04.orig/drivers/xen/xenbus/xenbus_client.c 2009-03-24 10:13:17.000000000 +0100
24031+++ sle11-2009-06-04/drivers/xen/xenbus/xenbus_client.c 2009-06-04 10:21:39.000000000 +0200
24032@@ -150,7 +150,7 @@ int xenbus_watch_pathfmt(struct xenbus_d
24033 char *path;
24034
24035 va_start(ap, pathfmt);
24036- path = kvasprintf(GFP_KERNEL, pathfmt, ap);
24037+ path = kvasprintf(GFP_NOIO | __GFP_HIGH, pathfmt, ap);
24038 va_end(ap);
24039
24040 if (!path) {
24041--- sle11-2009-06-04.orig/drivers/xen/xenbus/xenbus_comms.c 2009-02-16 16:17:21.000000000 +0100
24042+++ sle11-2009-06-04/drivers/xen/xenbus/xenbus_comms.c 2009-06-04 10:21:39.000000000 +0200
24043@@ -228,14 +228,11 @@ int xb_init_comms(void)
24044 intf->rsp_cons = intf->rsp_prod;
24045 }
24046
24047+#if defined(CONFIG_XEN) || defined(MODULE)
24048 if (xenbus_irq)
24049 unbind_from_irqhandler(xenbus_irq, &xb_waitq);
24050
24051-#if defined(CONFIG_XEN) || defined(MODULE)
24052 err = bind_caller_port_to_irqhandler(
24053-#else
24054- err = bind_evtchn_to_irqhandler(
24055-#endif
24056 xen_store_evtchn, wake_waiting,
24057 0, "xenbus", &xb_waitq);
24058 if (err <= 0) {
24059@@ -244,6 +241,20 @@ int xb_init_comms(void)
24060 }
24061
24062 xenbus_irq = err;
24063+#else
24064+ if (xenbus_irq) {
24065+ /* Already have an irq; assume we're resuming */
24066+ rebind_evtchn_irq(xen_store_evtchn, xenbus_irq);
24067+ } else {
24068+ err = bind_evtchn_to_irqhandler(xen_store_evtchn, wake_waiting,
24069+ 0, "xenbus", &xb_waitq);
24070+ if (err <= 0) {
24071+ printk(KERN_ERR "XENBUS request irq failed %i\n", err);
24072+ return err;
24073+ }
24074+ xenbus_irq = err;
24075+ }
24076+#endif
24077
24078 return 0;
24079 }
24080--- sle11-2009-06-04.orig/drivers/xen/xenbus/xenbus_probe.c 2009-03-16 16:38:05.000000000 +0100
24081+++ sle11-2009-06-04/drivers/xen/xenbus/xenbus_probe.c 2009-06-04 10:21:39.000000000 +0200
24082@@ -36,6 +36,7 @@
24083 __FUNCTION__, __LINE__, ##args)
24084
24085 #include <linux/kernel.h>
24086+#include <linux/version.h>
24087 #include <linux/err.h>
24088 #include <linux/string.h>
24089 #include <linux/ctype.h>
24090--- sle11-2009-06-04.orig/fs/aio.c 2009-03-24 10:13:25.000000000 +0100
24091+++ sle11-2009-06-04/fs/aio.c 2009-06-04 10:21:39.000000000 +0200
24092@@ -1335,7 +1335,7 @@ static int make_aio_fd(struct kioctx *io
24093 int fd;
24094 struct file *file;
24095
24096- fd = anon_inode_getfd("[aioq]", &aioq_fops, ioctx);
24097+ fd = anon_inode_getfd("[aioq]", &aioq_fops, ioctx, 0);
24098 if (fd < 0)
24099 return fd;
24100
24101--- sle11-2009-06-04.orig/include/asm-generic/pgtable.h 2009-03-04 11:28:34.000000000 +0100
24102+++ sle11-2009-06-04/include/asm-generic/pgtable.h 2009-06-04 10:21:39.000000000 +0200
24103@@ -99,10 +99,6 @@ static inline void ptep_set_wrprotect(st
24104 }
24105 #endif
24106
24107-#ifndef arch_change_pte_range
24108-#define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) 0
24109-#endif
24110-
24111 #ifndef __HAVE_ARCH_PTE_SAME
24112 #define pte_same(A,B) (pte_val(A) == pte_val(B))
24113 #endif
24114--- sle11-2009-06-04.orig/include/asm-x86/dma-mapping.h 2009-03-16 16:38:05.000000000 +0100
24115+++ sle11-2009-06-04/include/asm-x86/dma-mapping.h 2009-06-04 10:21:39.000000000 +0200
24116@@ -74,7 +74,7 @@ static inline struct dma_mapping_ops *ge
24117 /* Make sure we keep the same behaviour */
24118 static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
24119 {
24120-#ifdef CONFIG_X86_32
24121+#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
24122 return 0;
24123 #else
24124 struct dma_mapping_ops *ops = get_dma_ops(dev);
24125--- sle11-2009-06-04.orig/include/asm-x86/kexec.h 2008-12-01 11:11:08.000000000 +0100
24126+++ sle11-2009-06-04/include/asm-x86/kexec.h 2009-06-04 10:21:39.000000000 +0200
24127@@ -10,6 +10,7 @@
24128 # define VA_PTE_0 5
24129 # define PA_PTE_1 6
24130 # define VA_PTE_1 7
24131+# ifndef CONFIG_XEN
24132 # define PA_SWAP_PAGE 8
24133 # ifdef CONFIG_X86_PAE
24134 # define PA_PMD_0 9
24135@@ -20,6 +21,18 @@
24136 # else
24137 # define PAGES_NR 9
24138 # endif
24139+# else /* CONFIG_XEN */
24140+/*
24141+ * The hypervisor interface implicitly requires that all entries (except
24142+ * for possibly the final one) are arranged in matching PA_/VA_ pairs.
24143+ */
24144+# define PA_PMD_0 8
24145+# define VA_PMD_0 9
24146+# define PA_PMD_1 10
24147+# define VA_PMD_1 11
24148+# define PA_SWAP_PAGE 12
24149+# define PAGES_NR 13
24150+# endif /* CONFIG_XEN */
24151 #else
24152 # define PA_CONTROL_PAGE 0
24153 # define VA_CONTROL_PAGE 1
24154--- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/desc.h 2009-03-16 16:38:05.000000000 +0100
24155+++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/desc.h 2009-06-04 10:21:39.000000000 +0200
24156@@ -31,11 +31,17 @@ extern struct desc_ptr idt_descr;
24157 extern gate_desc idt_table[];
24158 #endif
24159
24160+struct gdt_page {
24161+ struct desc_struct gdt[GDT_ENTRIES];
24162+} __attribute__((aligned(PAGE_SIZE)));
24163+DECLARE_PER_CPU(struct gdt_page, gdt_page);
24164+
24165+static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
24166+{
24167+ return per_cpu(gdt_page, cpu).gdt;
24168+}
24169+
24170 #ifdef CONFIG_X86_64
24171-extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
24172-extern struct desc_ptr cpu_gdt_descr[];
24173-/* the cpu gdt accessor */
24174-#define get_cpu_gdt_table(x) ((struct desc_struct *)cpu_gdt_descr[x].address)
24175
24176 static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func,
24177 unsigned dpl, unsigned ist, unsigned seg)
24178@@ -53,16 +59,6 @@ static inline void pack_gate(gate_desc *
24179 }
24180
24181 #else
24182-struct gdt_page {
24183- struct desc_struct gdt[GDT_ENTRIES];
24184-} __attribute__((aligned(PAGE_SIZE)));
24185-DECLARE_PER_CPU(struct gdt_page, gdt_page);
24186-
24187-static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
24188-{
24189- return per_cpu(gdt_page, cpu).gdt;
24190-}
24191-
24192 static inline void pack_gate(gate_desc *gate, unsigned char type,
24193 unsigned long base, unsigned dpl, unsigned flags,
24194 unsigned short seg)
24195@@ -333,6 +329,28 @@ static inline void set_intr_gate(unsigne
24196 _set_gate(n, GATE_INTERRUPT, addr, 0, 0, __KERNEL_CS);
24197 }
24198
24199+#define SYS_VECTOR_FREE 0
24200+#define SYS_VECTOR_ALLOCED 1
24201+
24202+extern int first_system_vector;
24203+extern char system_vectors[];
24204+
24205+static inline void alloc_system_vector(int vector)
24206+{
24207+ if (system_vectors[vector] == SYS_VECTOR_FREE) {
24208+ system_vectors[vector] = SYS_VECTOR_ALLOCED;
24209+ if (first_system_vector > vector)
24210+ first_system_vector = vector;
24211+ } else
24212+ BUG();
24213+}
24214+
24215+static inline void alloc_intr_gate(unsigned int n, void *addr)
24216+{
24217+ alloc_system_vector(n);
24218+ set_intr_gate(n, addr);
24219+}
24220+
24221 /*
24222 * This routine sets up an interrupt gate at directory privilege level 3.
24223 */
24224--- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/fixmap.h 2009-03-16 16:38:05.000000000 +0100
24225+++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/fixmap.h 2009-06-04 10:21:39.000000000 +0200
24226@@ -7,7 +7,58 @@
24227 # include "fixmap_64.h"
24228 #endif
24229
24230+extern int fixmaps_set;
24231+
24232+void xen_set_fixmap(enum fixed_addresses, maddr_t, pgprot_t);
24233+
24234+static inline void __set_fixmap(enum fixed_addresses idx,
24235+ maddr_t phys, pgprot_t flags)
24236+{
24237+ xen_set_fixmap(idx, phys, flags);
24238+}
24239+
24240+#define set_fixmap(idx, phys) \
24241+ __set_fixmap(idx, phys, PAGE_KERNEL)
24242+
24243+/*
24244+ * Some hardware wants to get fixmapped without caching.
24245+ */
24246+#define set_fixmap_nocache(idx, phys) \
24247+ __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
24248+
24249 #define clear_fixmap(idx) \
24250 __set_fixmap(idx, 0, __pgprot(0))
24251
24252+#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
24253+#define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
24254+
24255+extern void __this_fixmap_does_not_exist(void);
24256+
24257+/*
24258+ * 'index to address' translation. If anyone tries to use the idx
24259+ * directly without translation, we catch the bug with a NULL-deference
24260+ * kernel oops. Illegal ranges of incoming indices are caught too.
24261+ */
24262+static __always_inline unsigned long fix_to_virt(const unsigned int idx)
24263+{
24264+ /*
24265+ * this branch gets completely eliminated after inlining,
24266+ * except when someone tries to use fixaddr indices in an
24267+ * illegal way. (such as mixing up address types or using
24268+ * out-of-range indices).
24269+ *
24270+ * If it doesn't get removed, the linker will complain
24271+ * loudly with a reasonably clear error message..
24272+ */
24273+ if (idx >= __end_of_fixed_addresses)
24274+ __this_fixmap_does_not_exist();
24275+
24276+ return __fix_to_virt(idx);
24277+}
24278+
24279+static inline unsigned long virt_to_fix(const unsigned long vaddr)
24280+{
24281+ BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
24282+ return __virt_to_fix(vaddr);
24283+}
24284 #endif
24285--- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/fixmap_32.h 2009-03-16 16:38:05.000000000 +0100
24286+++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/fixmap_32.h 2009-06-04 10:21:39.000000000 +0200
24287@@ -58,10 +58,17 @@ enum fixed_addresses {
24288 #ifdef CONFIG_X86_LOCAL_APIC
24289 FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */
24290 #endif
24291-#if defined(CONFIG_X86_IO_APIC) && !defined(CONFIG_XEN)
24292+#ifndef CONFIG_XEN
24293+#ifdef CONFIG_X86_IO_APIC
24294 FIX_IO_APIC_BASE_0,
24295 FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1,
24296 #endif
24297+#else
24298+ FIX_SHARED_INFO,
24299+#define NR_FIX_ISAMAPS 256
24300+ FIX_ISAMAP_END,
24301+ FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
24302+#endif
24303 #ifdef CONFIG_X86_VISWS_APIC
24304 FIX_CO_CPU, /* Cobalt timer */
24305 FIX_CO_APIC, /* Cobalt APIC Redirection Table */
24306@@ -78,51 +85,38 @@ enum fixed_addresses {
24307 FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
24308 FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
24309 #endif
24310-#ifdef CONFIG_ACPI
24311- FIX_ACPI_BEGIN,
24312- FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
24313-#endif
24314 #ifdef CONFIG_PCI_MMCONFIG
24315 FIX_PCIE_MCFG,
24316 #endif
24317 #ifdef CONFIG_PARAVIRT
24318 FIX_PARAVIRT_BOOTMAP,
24319 #endif
24320- FIX_SHARED_INFO,
24321-#define NR_FIX_ISAMAPS 256
24322- FIX_ISAMAP_END,
24323- FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
24324 __end_of_permanent_fixed_addresses,
24325 /*
24326 * 256 temporary boot-time mappings, used by early_ioremap(),
24327 * before ioremap() is functional.
24328 *
24329- * We round it up to the next 512 pages boundary so that we
24330+ * We round it up to the next 256 pages boundary so that we
24331 * can have a single pgd entry and a single pte table:
24332 */
24333 #define NR_FIX_BTMAPS 64
24334 #define FIX_BTMAPS_NESTING 4
24335- FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 512 -
24336- (__end_of_permanent_fixed_addresses & 511),
24337+ FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 -
24338+ (__end_of_permanent_fixed_addresses & 255),
24339 FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_NESTING - 1,
24340 FIX_WP_TEST,
24341+#ifdef CONFIG_ACPI
24342+ FIX_ACPI_BEGIN,
24343+ FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
24344+#endif
24345 #ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
24346 FIX_OHCI1394_BASE,
24347 #endif
24348 __end_of_fixed_addresses
24349 };
24350
24351-extern void __set_fixmap(enum fixed_addresses idx,
24352- maddr_t phys, pgprot_t flags);
24353 extern void reserve_top_address(unsigned long reserve);
24354
24355-#define set_fixmap(idx, phys) \
24356- __set_fixmap(idx, phys, PAGE_KERNEL)
24357-/*
24358- * Some hardware wants to get fixmapped without caching.
24359- */
24360-#define set_fixmap_nocache(idx, phys) \
24361- __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
24362
24363 #define FIXADDR_TOP ((unsigned long)__FIXADDR_TOP)
24364
24365@@ -131,38 +125,5 @@ extern void reserve_top_address(unsigned
24366 #define FIXADDR_START (FIXADDR_TOP - __FIXADDR_SIZE)
24367 #define FIXADDR_BOOT_START (FIXADDR_TOP - __FIXADDR_BOOT_SIZE)
24368
24369-#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
24370-#define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
24371-
24372-extern void __this_fixmap_does_not_exist(void);
24373-
24374-/*
24375- * 'index to address' translation. If anyone tries to use the idx
24376- * directly without tranlation, we catch the bug with a NULL-deference
24377- * kernel oops. Illegal ranges of incoming indices are caught too.
24378- */
24379-static __always_inline unsigned long fix_to_virt(const unsigned int idx)
24380-{
24381- /*
24382- * this branch gets completely eliminated after inlining,
24383- * except when someone tries to use fixaddr indices in an
24384- * illegal way. (such as mixing up address types or using
24385- * out-of-range indices).
24386- *
24387- * If it doesn't get removed, the linker will complain
24388- * loudly with a reasonably clear error message..
24389- */
24390- if (idx >= __end_of_fixed_addresses)
24391- __this_fixmap_does_not_exist();
24392-
24393- return __fix_to_virt(idx);
24394-}
24395-
24396-static inline unsigned long virt_to_fix(const unsigned long vaddr)
24397-{
24398- BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
24399- return __virt_to_fix(vaddr);
24400-}
24401-
24402 #endif /* !__ASSEMBLY__ */
24403 #endif
24404--- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/fixmap_64.h 2009-03-16 16:38:05.000000000 +0100
24405+++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/fixmap_64.h 2009-06-04 10:21:39.000000000 +0200
24406@@ -12,6 +12,7 @@
24407 #define _ASM_FIXMAP_64_H
24408
24409 #include <linux/kernel.h>
24410+#include <asm/acpi.h>
24411 #include <asm/apicdef.h>
24412 #include <asm/page.h>
24413 #include <asm/vsyscall.h>
24414@@ -40,7 +41,6 @@ enum fixed_addresses {
24415 VSYSCALL_HPET,
24416 FIX_DBGP_BASE,
24417 FIX_EARLYCON_MEM_BASE,
24418- FIX_HPET_BASE,
24419 #ifdef CONFIG_X86_LOCAL_APIC
24420 FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */
24421 #endif
24422@@ -53,14 +53,21 @@ enum fixed_addresses {
24423 FIX_EFI_IO_MAP_FIRST_PAGE = FIX_EFI_IO_MAP_LAST_PAGE
24424 + MAX_EFI_IO_PAGES - 1,
24425 #endif
24426+#ifdef CONFIG_PARAVIRT
24427+ FIX_PARAVIRT_BOOTMAP,
24428+#else
24429+ FIX_SHARED_INFO,
24430+#endif
24431 #ifdef CONFIG_ACPI
24432 FIX_ACPI_BEGIN,
24433 FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
24434 #endif
24435- FIX_SHARED_INFO,
24436 #define NR_FIX_ISAMAPS 256
24437 FIX_ISAMAP_END,
24438 FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
24439+#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
24440+ FIX_OHCI1394_BASE,
24441+#endif
24442 __end_of_permanent_fixed_addresses,
24443 /*
24444 * 256 temporary boot-time mappings, used by early_ioremap(),
24445@@ -71,27 +78,12 @@ enum fixed_addresses {
24446 */
24447 #define NR_FIX_BTMAPS 64
24448 #define FIX_BTMAPS_NESTING 4
24449- FIX_BTMAP_END =
24450- __end_of_permanent_fixed_addresses + 512 -
24451+ FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 512 -
24452 (__end_of_permanent_fixed_addresses & 511),
24453 FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_NESTING - 1,
24454-#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
24455- FIX_OHCI1394_BASE,
24456-#endif
24457 __end_of_fixed_addresses
24458 };
24459
24460-extern void __set_fixmap(enum fixed_addresses idx,
24461- unsigned long phys, pgprot_t flags);
24462-
24463-#define set_fixmap(idx, phys) \
24464- __set_fixmap(idx, phys, PAGE_KERNEL)
24465-/*
24466- * Some hardware wants to get fixmapped without caching.
24467- */
24468-#define set_fixmap_nocache(idx, phys) \
24469- __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
24470-
24471 #define FIXADDR_TOP (VSYSCALL_END-PAGE_SIZE)
24472 #define FIXADDR_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
24473 #define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE)
24474@@ -100,30 +92,4 @@ extern void __set_fixmap(enum fixed_addr
24475 #define FIXADDR_USER_START ((unsigned long)VSYSCALL32_VSYSCALL)
24476 #define FIXADDR_USER_END (FIXADDR_USER_START + PAGE_SIZE)
24477
24478-#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
24479-
24480-extern void __this_fixmap_does_not_exist(void);
24481-
24482-/*
24483- * 'index to address' translation. If anyone tries to use the idx
24484- * directly without translation, we catch the bug with a NULL-deference
24485- * kernel oops. Illegal ranges of incoming indices are caught too.
24486- */
24487-static __always_inline unsigned long fix_to_virt(const unsigned int idx)
24488-{
24489- /*
24490- * this branch gets completely eliminated after inlining,
24491- * except when someone tries to use fixaddr indices in an
24492- * illegal way. (such as mixing up address types or using
24493- * out-of-range indices).
24494- *
24495- * If it doesn't get removed, the linker will complain
24496- * loudly with a reasonably clear error message..
24497- */
24498- if (idx >= __end_of_fixed_addresses)
24499- __this_fixmap_does_not_exist();
24500-
24501- return __fix_to_virt(idx);
24502-}
24503-
24504 #endif
24505--- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/highmem.h 2009-03-16 16:38:05.000000000 +0100
24506+++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/highmem.h 2009-06-04 10:21:39.000000000 +0200
24507@@ -74,6 +74,9 @@ struct page *kmap_atomic_to_page(void *p
24508
24509 #define flush_cache_kmaps() do { } while (0)
24510
24511+extern void add_highpages_with_active_regions(int nid, unsigned long start_pfn,
24512+ unsigned long end_pfn);
24513+
24514 void clear_highpage(struct page *);
24515 static inline void clear_user_highpage(struct page *page, unsigned long vaddr)
24516 {
24517--- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/hypercall.h 2009-02-16 16:18:36.000000000 +0100
24518+++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/hypercall.h 2009-06-04 10:21:39.000000000 +0200
24519@@ -323,9 +323,19 @@ static inline int __must_check
24520 HYPERVISOR_grant_table_op(
24521 unsigned int cmd, void *uop, unsigned int count)
24522 {
24523+ bool fixup = false;
24524+ int rc;
24525+
24526 if (arch_use_lazy_mmu_mode())
24527 xen_multicall_flush(false);
24528- return _hypercall3(int, grant_table_op, cmd, uop, count);
24529+#ifdef GNTTABOP_map_grant_ref
24530+ if (cmd == GNTTABOP_map_grant_ref)
24531+#endif
24532+ fixup = gnttab_pre_map_adjust(cmd, uop, count);
24533+ rc = _hypercall3(int, grant_table_op, cmd, uop, count);
24534+ if (rc == 0 && fixup)
24535+ rc = gnttab_post_map_adjust(uop, count);
24536+ return rc;
24537 }
24538
24539 static inline int __must_check
24540--- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/hypervisor.h 2009-03-16 16:33:40.000000000 +0100
24541+++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/hypervisor.h 2009-06-04 10:21:39.000000000 +0200
24542@@ -35,7 +35,6 @@
24543
24544 #include <linux/types.h>
24545 #include <linux/kernel.h>
24546-#include <linux/version.h>
24547 #include <linux/errno.h>
24548 #include <xen/interface/xen.h>
24549 #include <xen/interface/platform.h>
24550@@ -112,6 +111,8 @@ int xen_create_contiguous_region(
24551 unsigned long vstart, unsigned int order, unsigned int address_bits);
24552 void xen_destroy_contiguous_region(
24553 unsigned long vstart, unsigned int order);
24554+int early_create_contiguous_region(unsigned long pfn, unsigned int order,
24555+ unsigned int address_bits);
24556
24557 struct page;
24558
24559@@ -181,6 +182,29 @@ static inline void xen_multicall_flush(b
24560
24561 #endif /* CONFIG_XEN && !MODULE */
24562
24563+#ifdef CONFIG_XEN
24564+
24565+struct gnttab_map_grant_ref;
24566+bool gnttab_pre_map_adjust(unsigned int cmd, struct gnttab_map_grant_ref *,
24567+ unsigned int count);
24568+#if CONFIG_XEN_COMPAT < 0x030400
24569+int gnttab_post_map_adjust(const struct gnttab_map_grant_ref *, unsigned int);
24570+#else
24571+static inline int gnttab_post_map_adjust(const struct gnttab_map_grant_ref *m,
24572+ unsigned int count)
24573+{
24574+ BUG();
24575+ return -ENOSYS;
24576+}
24577+#endif
24578+
24579+#else /* !CONFIG_XEN */
24580+
24581+#define gnttab_pre_map_adjust(...) false
24582+#define gnttab_post_map_adjust(...) ({ BUG(); -ENOSYS; })
24583+
24584+#endif /* CONFIG_XEN */
24585+
24586 #if defined(CONFIG_X86_64)
24587 #define MULTI_UVMFLAGS_INDEX 2
24588 #define MULTI_UVMDOMID_INDEX 3
24589--- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/io.h 2009-03-16 16:38:05.000000000 +0100
24590+++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/io.h 2009-06-04 10:21:39.000000000 +0200
24591@@ -3,20 +3,139 @@
24592
24593 #define ARCH_HAS_IOREMAP_WC
24594
24595+#include <linux/compiler.h>
24596+
24597+/*
24598+ * early_ioremap() and early_iounmap() are for temporary early boot-time
24599+ * mappings, before the real ioremap() is functional.
24600+ * A boot-time mapping is currently limited to at most 16 pages.
24601+ */
24602+#ifndef __ASSEMBLY__
24603+extern void early_ioremap_init(void);
24604+extern void early_ioremap_clear(void);
24605+extern void early_ioremap_reset(void);
24606+extern void *early_ioremap(unsigned long offset, unsigned long size);
24607+extern void early_iounmap(void *addr, unsigned long size);
24608+extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
24609+#endif
24610+
24611+#define build_mmio_read(name, size, type, reg, barrier) \
24612+static inline type name(const volatile void __iomem *addr) \
24613+{ type ret; asm volatile("mov" size " %1,%0":reg (ret) \
24614+:"m" (*(volatile type __force *)addr) barrier); return ret; }
24615+
24616+#define build_mmio_write(name, size, type, reg, barrier) \
24617+static inline void name(type val, volatile void __iomem *addr) \
24618+{ asm volatile("mov" size " %0,%1": :reg (val), \
24619+"m" (*(volatile type __force *)addr) barrier); }
24620+
24621+build_mmio_read(readb, "b", unsigned char, "=q", :"memory")
24622+build_mmio_read(readw, "w", unsigned short, "=r", :"memory")
24623+build_mmio_read(readl, "l", unsigned int, "=r", :"memory")
24624+
24625+build_mmio_read(__readb, "b", unsigned char, "=q", )
24626+build_mmio_read(__readw, "w", unsigned short, "=r", )
24627+build_mmio_read(__readl, "l", unsigned int, "=r", )
24628+
24629+build_mmio_write(writeb, "b", unsigned char, "q", :"memory")
24630+build_mmio_write(writew, "w", unsigned short, "r", :"memory")
24631+build_mmio_write(writel, "l", unsigned int, "r", :"memory")
24632+
24633+build_mmio_write(__writeb, "b", unsigned char, "q", )
24634+build_mmio_write(__writew, "w", unsigned short, "r", )
24635+build_mmio_write(__writel, "l", unsigned int, "r", )
24636+
24637+#define readb_relaxed(a) __readb(a)
24638+#define readw_relaxed(a) __readw(a)
24639+#define readl_relaxed(a) __readl(a)
24640+#define __raw_readb __readb
24641+#define __raw_readw __readw
24642+#define __raw_readl __readl
24643+
24644+#define __raw_writeb __writeb
24645+#define __raw_writew __writew
24646+#define __raw_writel __writel
24647+
24648+#define mmiowb() barrier()
24649+
24650+#ifdef CONFIG_X86_64
24651+build_mmio_read(readq, "q", unsigned long, "=r", :"memory")
24652+build_mmio_read(__readq, "q", unsigned long, "=r", )
24653+build_mmio_write(writeq, "q", unsigned long, "r", :"memory")
24654+build_mmio_write(__writeq, "q", unsigned long, "r", )
24655+
24656+#define readq_relaxed(a) __readq(a)
24657+#define __raw_readq __readq
24658+#define __raw_writeq writeq
24659+
24660+/* Let people know we have them */
24661+#define readq readq
24662+#define writeq writeq
24663+#endif
24664+
24665+#define native_io_delay xen_io_delay
24666+
24667 #ifdef CONFIG_X86_32
24668-# include "io_32.h"
24669+# include "../../io_32.h"
24670 #else
24671-# include "io_64.h"
24672+# include "../../io_64.h"
24673+#endif
24674+
24675+#if defined(__KERNEL__) && !defined(__ASSEMBLY__)
24676+
24677+/* We will be supplying our own /dev/mem implementation */
24678+#define ARCH_HAS_DEV_MEM
24679+
24680+#define page_to_pseudophys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT)
24681+#undef page_to_phys
24682+#define page_to_phys(page) (phys_to_machine(page_to_pseudophys(page)))
24683+#define page_to_bus(page) (phys_to_machine(page_to_pseudophys(page)))
24684+
24685+#define bvec_to_pseudophys(bv) (page_to_pseudophys((bv)->bv_page) + \
24686+ (unsigned long) (bv)->bv_offset)
24687+
24688+#define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \
24689+ (bvec_to_phys(vec1) + (vec1)->bv_len == bvec_to_phys(vec2) \
24690+ && bvec_to_pseudophys(vec1) + (vec1)->bv_len \
24691+ == bvec_to_pseudophys(vec2))
24692+
24693+#undef virt_to_bus
24694+#undef bus_to_virt
24695+#define virt_to_bus(_x) phys_to_machine(__pa(_x))
24696+#define bus_to_virt(_x) __va(machine_to_phys(_x))
24697+
24698+#include <asm/fixmap.h>
24699+
24700+#undef isa_virt_to_bus
24701+#undef isa_page_to_bus
24702+#undef isa_bus_to_virt
24703+#define isa_virt_to_bus(_x) ({ BUG(); virt_to_bus(_x); })
24704+#define isa_page_to_bus(_x) isa_page_to_bus_is_UNSUPPORTED->_x
24705+#define isa_bus_to_virt(_x) ((void *)__fix_to_virt(FIX_ISAMAP_BEGIN) + (_x))
24706+
24707+#undef __ISA_IO_base
24708+#define __ISA_IO_base ((char __iomem *)fix_to_virt(FIX_ISAMAP_BEGIN))
24709+
24710 #endif
24711
24712 extern void *xlate_dev_mem_ptr(unsigned long phys);
24713 extern void unxlate_dev_mem_ptr(unsigned long phys, void *addr);
24714
24715-extern void map_devmem(unsigned long pfn, unsigned long len, pgprot_t);
24716-extern void unmap_devmem(unsigned long pfn, unsigned long len, pgprot_t);
24717-
24718 extern int ioremap_check_change_attr(unsigned long mfn, unsigned long size,
24719 unsigned long prot_val);
24720 extern void __iomem *ioremap_wc(unsigned long offset, unsigned long size);
24721
24722+/*
24723+ * early_ioremap() and early_iounmap() are for temporary early boot-time
24724+ * mappings, before the real ioremap() is functional.
24725+ * A boot-time mapping is currently limited to at most 16 pages.
24726+ */
24727+extern void early_ioremap_init(void);
24728+extern void early_ioremap_clear(void);
24729+extern void early_ioremap_reset(void);
24730+extern void *early_ioremap(unsigned long offset, unsigned long size);
24731+extern void early_iounmap(void *addr, unsigned long size);
24732+extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
24733+
24734+
24735 #endif /* _ASM_X86_IO_H */
24736--- /dev/null 1970-01-01 00:00:00.000000000 +0000
24737+++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/irq_vectors.h 2009-06-04 10:21:39.000000000 +0200
24738@@ -0,0 +1,52 @@
24739+#ifndef _ASM_IRQ_VECTORS_H
24740+#define _ASM_IRQ_VECTORS_H
24741+
24742+#ifdef CONFIG_X86_32
24743+# define SYSCALL_VECTOR 0x80
24744+#else
24745+# define IA32_SYSCALL_VECTOR 0x80
24746+#endif
24747+
24748+#define RESCHEDULE_VECTOR 0
24749+#define CALL_FUNCTION_VECTOR 1
24750+#define CALL_FUNC_SINGLE_VECTOR 2
24751+#define SPIN_UNLOCK_VECTOR 3
24752+#define NR_IPIS 4
24753+
24754+/*
24755+ * The maximum number of vectors supported by i386 processors
24756+ * is limited to 256. For processors other than i386, NR_VECTORS
24757+ * should be changed accordingly.
24758+ */
24759+#define NR_VECTORS 256
24760+
24761+#define FIRST_VM86_IRQ 3
24762+#define LAST_VM86_IRQ 15
24763+#define invalid_vm86_irq(irq) ((irq) < 3 || (irq) > 15)
24764+
24765+/*
24766+ * The flat IRQ space is divided into two regions:
24767+ * 1. A one-to-one mapping of real physical IRQs. This space is only used
24768+ * if we have physical device-access privilege. This region is at the
24769+ * start of the IRQ space so that existing device drivers do not need
24770+ * to be modified to translate physical IRQ numbers into our IRQ space.
24771+ * 3. A dynamic mapping of inter-domain and Xen-sourced virtual IRQs. These
24772+ * are bound using the provided bind/unbind functions.
24773+ */
24774+
24775+#define PIRQ_BASE 0
24776+#if defined(NR_CPUS) && defined(MAX_IO_APICS)
24777+# if NR_CPUS < MAX_IO_APICS
24778+# define NR_PIRQS (NR_VECTORS + 32 * NR_CPUS)
24779+# else
24780+# define NR_PIRQS (NR_VECTORS + 32 * MAX_IO_APICS)
24781+# endif
24782+#endif
24783+
24784+#define DYNIRQ_BASE (PIRQ_BASE + NR_PIRQS)
24785+#define NR_DYNIRQS 256
24786+
24787+#define NR_IRQS (NR_PIRQS + NR_DYNIRQS)
24788+#define NR_IRQ_VECTORS NR_IRQS
24789+
24790+#endif /* _ASM_IRQ_VECTORS_H */
24791--- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/irqflags.h 2009-03-16 16:38:05.000000000 +0100
24792+++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/irqflags.h 2009-06-04 10:21:39.000000000 +0200
24793@@ -118,7 +118,7 @@ static inline void halt(void)
24794
24795 #ifndef CONFIG_X86_64
24796 #define INTERRUPT_RETURN iret
24797-#define ENABLE_INTERRUPTS_SYSCALL_RET __ENABLE_INTERRUPTS ; \
24798+#define ENABLE_INTERRUPTS_SYSEXIT __ENABLE_INTERRUPTS ; \
24799 sysexit_scrit: /**** START OF SYSEXIT CRITICAL REGION ****/ ; \
24800 __TEST_PENDING ; \
24801 jnz 14f /* process more events if necessary... */ ; \
24802@@ -175,18 +175,6 @@ static inline void trace_hardirqs_fixup_
24803 #else
24804
24805 #ifdef CONFIG_X86_64
24806-/*
24807- * Currently paravirt can't handle swapgs nicely when we
24808- * don't have a stack we can rely on (such as a user space
24809- * stack). So we either find a way around these or just fault
24810- * and emulate if a guest tries to call swapgs directly.
24811- *
24812- * Either way, this is a good way to document that we don't
24813- * have a reliable stack. x86_64 only.
24814- */
24815-#define SWAPGS_UNSAFE_STACK swapgs
24816-#define ARCH_TRACE_IRQS_ON call trace_hardirqs_on_thunk
24817-#define ARCH_TRACE_IRQS_OFF call trace_hardirqs_off_thunk
24818 #define ARCH_LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk
24819 #define ARCH_LOCKDEP_SYS_EXIT_IRQ \
24820 TRACE_IRQS_ON; \
24821@@ -198,24 +186,6 @@ static inline void trace_hardirqs_fixup_
24822 TRACE_IRQS_OFF;
24823
24824 #else
24825-#define ARCH_TRACE_IRQS_ON \
24826- pushl %eax; \
24827- pushl %ecx; \
24828- pushl %edx; \
24829- call trace_hardirqs_on; \
24830- popl %edx; \
24831- popl %ecx; \
24832- popl %eax;
24833-
24834-#define ARCH_TRACE_IRQS_OFF \
24835- pushl %eax; \
24836- pushl %ecx; \
24837- pushl %edx; \
24838- call trace_hardirqs_off; \
24839- popl %edx; \
24840- popl %ecx; \
24841- popl %eax;
24842-
24843 #define ARCH_LOCKDEP_SYS_EXIT \
24844 pushl %eax; \
24845 pushl %ecx; \
24846@@ -229,8 +199,8 @@ static inline void trace_hardirqs_fixup_
24847 #endif
24848
24849 #ifdef CONFIG_TRACE_IRQFLAGS
24850-# define TRACE_IRQS_ON ARCH_TRACE_IRQS_ON
24851-# define TRACE_IRQS_OFF ARCH_TRACE_IRQS_OFF
24852+# define TRACE_IRQS_ON call trace_hardirqs_on_thunk;
24853+# define TRACE_IRQS_OFF call trace_hardirqs_off_thunk;
24854 #else
24855 # define TRACE_IRQS_ON
24856 # define TRACE_IRQS_OFF
24857--- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/mmu_context.h 2009-02-16 16:18:36.000000000 +0100
24858+++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/mmu_context.h 2009-06-04 10:21:39.000000000 +0200
24859@@ -1,5 +1,42 @@
24860+#ifndef __ASM_X86_MMU_CONTEXT_H
24861+#define __ASM_X86_MMU_CONTEXT_H
24862+
24863+#include <asm/desc.h>
24864+#include <asm/atomic.h>
24865+#include <asm/pgalloc.h>
24866+#include <asm/tlbflush.h>
24867+
24868+void arch_exit_mmap(struct mm_struct *mm);
24869+void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
24870+
24871+void mm_pin(struct mm_struct *mm);
24872+void mm_unpin(struct mm_struct *mm);
24873+void mm_pin_all(void);
24874+
24875+static inline void xen_activate_mm(struct mm_struct *prev,
24876+ struct mm_struct *next)
24877+{
24878+ if (!PagePinned(virt_to_page(next->pgd)))
24879+ mm_pin(next);
24880+}
24881+
24882+/*
24883+ * Used for LDT copy/destruction.
24884+ */
24885+int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
24886+void destroy_context(struct mm_struct *mm);
24887+
24888 #ifdef CONFIG_X86_32
24889 # include "mmu_context_32.h"
24890 #else
24891 # include "mmu_context_64.h"
24892 #endif
24893+
24894+#define activate_mm(prev, next) \
24895+do { \
24896+ xen_activate_mm(prev, next); \
24897+ switch_mm((prev), (next), NULL); \
24898+} while (0);
24899+
24900+
24901+#endif /* __ASM_X86_MMU_CONTEXT_H */
24902--- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/mmu_context_32.h 2009-03-16 16:38:05.000000000 +0100
24903+++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/mmu_context_32.h 2009-06-04 10:21:39.000000000 +0200
24904@@ -1,32 +1,6 @@
24905 #ifndef __I386_SCHED_H
24906 #define __I386_SCHED_H
24907
24908-#include <asm/desc.h>
24909-#include <asm/atomic.h>
24910-#include <asm/pgalloc.h>
24911-#include <asm/tlbflush.h>
24912-
24913-void arch_exit_mmap(struct mm_struct *mm);
24914-void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
24915-
24916-void mm_pin(struct mm_struct *mm);
24917-void mm_unpin(struct mm_struct *mm);
24918-void mm_pin_all(void);
24919-
24920-static inline void xen_activate_mm(struct mm_struct *prev,
24921- struct mm_struct *next)
24922-{
24923- if (!PagePinned(virt_to_page(next->pgd)))
24924- mm_pin(next);
24925-}
24926-
24927-/*
24928- * Used for LDT copy/destruction.
24929- */
24930-int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
24931-void destroy_context(struct mm_struct *mm);
24932-
24933-
24934 static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
24935 {
24936 #if 0 /* XEN: no lazy tlb */
24937@@ -107,10 +81,4 @@ static inline void switch_mm(struct mm_s
24938 #define deactivate_mm(tsk, mm) \
24939 asm("movl %0,%%gs": :"r" (0));
24940
24941-#define activate_mm(prev, next) \
24942-do { \
24943- xen_activate_mm(prev, next); \
24944- switch_mm((prev), (next), NULL); \
24945-} while (0)
24946-
24947 #endif
24948--- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/mmu_context_64.h 2009-03-16 16:38:05.000000000 +0100
24949+++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/mmu_context_64.h 2009-06-04 10:21:39.000000000 +0200
24950@@ -1,23 +1,6 @@
24951 #ifndef __X86_64_MMU_CONTEXT_H
24952 #define __X86_64_MMU_CONTEXT_H
24953
24954-#include <asm/desc.h>
24955-#include <asm/atomic.h>
24956-#include <asm/pgalloc.h>
24957-#include <asm/page.h>
24958-#include <asm/pda.h>
24959-#include <asm/pgtable.h>
24960-#include <asm/tlbflush.h>
24961-
24962-void arch_exit_mmap(struct mm_struct *mm);
24963-void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
24964-
24965-/*
24966- * possibly do the LDT unload here?
24967- */
24968-int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
24969-void destroy_context(struct mm_struct *mm);
24970-
24971 static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
24972 {
24973 #if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
24974@@ -58,10 +41,6 @@ static inline void __prepare_arch_switch
24975 }
24976 }
24977
24978-extern void mm_pin(struct mm_struct *mm);
24979-extern void mm_unpin(struct mm_struct *mm);
24980-void mm_pin_all(void);
24981-
24982 static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
24983 struct task_struct *tsk)
24984 {
24985@@ -124,11 +103,4 @@ do { \
24986 asm volatile("movl %0,%%fs"::"r"(0)); \
24987 } while (0)
24988
24989-static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next)
24990-{
24991- if (!PagePinned(virt_to_page(next->pgd)))
24992- mm_pin(next);
24993- switch_mm(prev, next, NULL);
24994-}
24995-
24996 #endif
24997--- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/page.h 2009-03-16 16:38:05.000000000 +0100
24998+++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/page.h 2009-06-04 10:21:39.000000000 +0200
24999@@ -16,9 +16,9 @@
25000 * below. The preprocessor will warn if the two definitions aren't identical.
25001 */
25002 #define _PAGE_BIT_PRESENT 0
25003-#define _PAGE_PRESENT (_AC(1, L)<<_PAGE_BIT_PRESENT)
25004-#define _PAGE_BIT_IO 9
25005-#define _PAGE_IO (_AC(1, L)<<_PAGE_BIT_IO)
25006+#define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT)
25007+#define _PAGE_BIT_IO 11
25008+#define _PAGE_IO (_AT(pteval_t, 1) << _PAGE_BIT_IO)
25009
25010 #define __PHYSICAL_MASK ((phys_addr_t)(1ULL << __PHYSICAL_MASK_SHIFT) - 1)
25011 #define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1)
25012@@ -28,8 +28,11 @@
25013 (ie, 32-bit PAE). */
25014 #define PHYSICAL_PAGE_MASK (((signed long)PAGE_MASK) & __PHYSICAL_MASK)
25015
25016-/* PTE_MASK extracts the PFN from a (pte|pmd|pud|pgd)val_t */
25017-#define PTE_MASK ((pteval_t)PHYSICAL_PAGE_MASK)
25018+/* PTE_PFN_MASK extracts the PFN from a (pte|pmd|pud|pgd)val_t */
25019+#define PTE_PFN_MASK ((pteval_t)PHYSICAL_PAGE_MASK)
25020+
25021+/* PTE_FLAGS_MASK extracts the flags from a (pte|pmd|pud|pgd)val_t */
25022+#define PTE_FLAGS_MASK (~PTE_PFN_MASK)
25023
25024 #define PMD_PAGE_SIZE (_AC(1, UL) << PMD_SHIFT)
25025 #define PMD_PAGE_MASK (~(PMD_PAGE_SIZE-1))
25026@@ -39,8 +42,7 @@
25027 #define HPAGE_MASK (~(HPAGE_SIZE - 1))
25028 #define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
25029
25030-/* to align the pointer to the (next) page boundary */
25031-#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK)
25032+#define HUGE_MAX_HSTATE 2
25033
25034 #ifndef __ASSEMBLY__
25035 #include <linux/types.h>
25036@@ -61,9 +63,17 @@
25037
25038 #ifndef __ASSEMBLY__
25039
25040+typedef struct { pgdval_t pgd; } pgd_t;
25041+typedef struct { pgprotval_t pgprot; } pgprot_t;
25042+
25043 extern int page_is_ram(unsigned long pagenr);
25044 extern int devmem_is_allowed(unsigned long pagenr);
25045+extern void map_devmem(unsigned long pfn, unsigned long size,
25046+ pgprot_t vma_prot);
25047+extern void unmap_devmem(unsigned long pfn, unsigned long size,
25048+ pgprot_t vma_prot);
25049
25050+extern unsigned long max_low_pfn_mapped;
25051 extern unsigned long max_pfn_mapped;
25052
25053 struct page;
25054@@ -84,15 +94,11 @@ static inline void copy_user_page(void *
25055 alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr)
25056 #define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
25057
25058-typedef struct { pgprotval_t pgprot; } pgprot_t;
25059-
25060 #define pgprot_val(x) ((x).pgprot)
25061 #define __pgprot(x) ((pgprot_t) { (x) } )
25062
25063 #include <asm/maddr.h>
25064
25065-typedef struct { pgdval_t pgd; } pgd_t;
25066-
25067 #define __pgd_ma(x) ((pgd_t) { (x) } )
25068 static inline pgd_t xen_make_pgd(pgdval_t val)
25069 {
25070@@ -196,6 +202,11 @@ static inline pteval_t xen_pte_val(pte_t
25071 return ret;
25072 }
25073
25074+static inline pteval_t xen_pte_flags(pte_t pte)
25075+{
25076+ return __pte_val(pte) & PTE_FLAGS_MASK;
25077+}
25078+
25079 #define pgd_val(x) xen_pgd_val(x)
25080 #define __pgd(x) xen_make_pgd(x)
25081
25082@@ -210,6 +221,7 @@ static inline pteval_t xen_pte_val(pte_t
25083 #endif
25084
25085 #define pte_val(x) xen_pte_val(x)
25086+#define pte_flags(x) xen_pte_flags(x)
25087 #define __pte(x) xen_make_pte(x)
25088
25089 #define __pa(x) __phys_addr((unsigned long)(x))
25090--- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/page_64.h 2009-03-16 16:38:05.000000000 +0100
25091+++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/page_64.h 2009-06-04 10:21:39.000000000 +0200
25092@@ -26,6 +26,12 @@
25093 #define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT)
25094 #define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1))
25095
25096+/*
25097+ * Set __PAGE_OFFSET to the most negative possible address +
25098+ * PGDIR_SIZE*16 (pgd slot 272). The gap is to allow a space for a
25099+ * hypervisor to fit. Choosing 16 slots here is arbitrary, but it's
25100+ * what Xen requires.
25101+ */
25102 #define __PAGE_OFFSET _AC(0xffff880000000000, UL)
25103
25104 #define __PHYSICAL_START CONFIG_PHYSICAL_START
25105@@ -63,7 +69,8 @@
25106 void clear_page(void *page);
25107 void copy_page(void *to, void *from);
25108
25109-extern unsigned long end_pfn;
25110+/* duplicated to the one in bootmem.h */
25111+extern unsigned long max_pfn;
25112
25113 static inline unsigned long __phys_addr(unsigned long x)
25114 {
25115@@ -91,6 +98,11 @@ typedef union { pteval_t pte; unsigned i
25116 extern unsigned long init_memory_mapping(unsigned long start,
25117 unsigned long end);
25118
25119+extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn);
25120+
25121+extern void init_extra_mapping_uc(unsigned long phys, unsigned long size);
25122+extern void init_extra_mapping_wb(unsigned long phys, unsigned long size);
25123+
25124 #endif /* !__ASSEMBLY__ */
25125
25126 #ifdef CONFIG_FLATMEM
25127--- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/pci.h 2009-03-16 16:38:05.000000000 +0100
25128+++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/pci.h 2009-06-04 10:21:39.000000000 +0200
25129@@ -21,6 +21,8 @@ struct pci_sysdata {
25130 #endif
25131 };
25132
25133+extern int pci_routeirq;
25134+
25135 /* scan a bus after allocating a pci_sysdata for it */
25136 extern struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops,
25137 int node);
25138--- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/pci_32.h 2009-02-16 16:18:36.000000000 +0100
25139+++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/pci_32.h 2009-06-04 10:21:39.000000000 +0200
25140@@ -38,12 +38,14 @@ struct pci_dev;
25141 #define PCI_DMA_BUS_IS_PHYS (1)
25142
25143 /* pci_unmap_{page,single} is a nop so... */
25144-#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME)
25145-#define DECLARE_PCI_UNMAP_LEN(LEN_NAME)
25146-#define pci_unmap_addr(PTR, ADDR_NAME) (0)
25147-#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) do { } while (0)
25148-#define pci_unmap_len(PTR, LEN_NAME) (0)
25149-#define pci_unmap_len_set(PTR, LEN_NAME, VAL) do { } while (0)
25150+#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) dma_addr_t ADDR_NAME[0];
25151+#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) unsigned LEN_NAME[0];
25152+#define pci_unmap_addr(PTR, ADDR_NAME) sizeof((PTR)->ADDR_NAME)
25153+#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \
25154+ do { break; } while (pci_unmap_addr(PTR, ADDR_NAME))
25155+#define pci_unmap_len(PTR, LEN_NAME) sizeof((PTR)->LEN_NAME)
25156+#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \
25157+ do { break; } while (pci_unmap_len(PTR, LEN_NAME))
25158
25159 #endif
25160
25161--- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/pgalloc.h 2009-03-16 16:38:05.000000000 +0100
25162+++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/pgalloc.h 2009-06-04 10:21:39.000000000 +0200
25163@@ -7,6 +7,9 @@
25164
25165 #include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */
25166
25167+static inline int paravirt_pgd_alloc(struct mm_struct *mm) { return 0; }
25168+static inline void paravirt_pgd_free(struct mm_struct *mm, pgd_t *pgd) {}
25169+
25170 static inline void paravirt_alloc_pte(struct mm_struct *mm, unsigned long pfn) {}
25171 static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn) {}
25172 static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long clonepfn,
25173--- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/pgtable.h 2009-03-16 16:38:05.000000000 +0100
25174+++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/pgtable.h 2009-06-04 10:21:39.000000000 +0200
25175@@ -13,11 +13,12 @@
25176 #define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */
25177 #define _PAGE_BIT_PAT 7 /* on 4KB pages */
25178 #define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
25179-#define _PAGE_BIT_IO 9 /* Mapped page is I/O or foreign and
25180+#define _PAGE_BIT_UNUSED1 9 /* available for programmer */
25181+#define _PAGE_BIT_UNUSED2 10
25182+#define _PAGE_BIT_IO 11 /* Mapped page is I/O or foreign and
25183 * has no associated page struct. */
25184-#define _PAGE_BIT_UNUSED2 10 /* available for programmer */
25185-#define _PAGE_BIT_UNUSED3 11
25186 #define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
25187+#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1
25188 #define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
25189
25190 /* If _PAGE_BIT_PRESENT is clear, we use these: */
25191@@ -28,34 +29,31 @@
25192 /* if the user mapped it with PROT_NONE; pte_present gives true */
25193 #define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL
25194
25195-/*
25196- * Note: we use _AC(1, L) instead of _AC(1, UL) so that we get a
25197- * sign-extended value on 32-bit with all 1's in the upper word,
25198- * which preserves the upper pte values on 64-bit ptes:
25199- */
25200-#define _PAGE_PRESENT (_AC(1, L)<<_PAGE_BIT_PRESENT)
25201-#define _PAGE_RW (_AC(1, L)<<_PAGE_BIT_RW)
25202-#define _PAGE_USER (_AC(1, L)<<_PAGE_BIT_USER)
25203-#define _PAGE_PWT (_AC(1, L)<<_PAGE_BIT_PWT)
25204-#define _PAGE_PCD (_AC(1, L)<<_PAGE_BIT_PCD)
25205-#define _PAGE_ACCESSED (_AC(1, L)<<_PAGE_BIT_ACCESSED)
25206-#define _PAGE_DIRTY (_AC(1, L)<<_PAGE_BIT_DIRTY)
25207-#define _PAGE_PSE (_AC(1, L)<<_PAGE_BIT_PSE) /* 2MB page */
25208-#define _PAGE_GLOBAL (_AC(1, L)<<_PAGE_BIT_GLOBAL) /* Global TLB entry */
25209-#define _PAGE_IO (_AC(1, L)<<_PAGE_BIT_IO)
25210-#define _PAGE_UNUSED2 (_AC(1, L)<<_PAGE_BIT_UNUSED2)
25211-#define _PAGE_UNUSED3 (_AC(1, L)<<_PAGE_BIT_UNUSED3)
25212-#define _PAGE_PAT (_AC(1, L)<<_PAGE_BIT_PAT)
25213-#define _PAGE_PAT_LARGE (_AC(1, L)<<_PAGE_BIT_PAT_LARGE)
25214+#define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT)
25215+#define _PAGE_RW (_AT(pteval_t, 1) << _PAGE_BIT_RW)
25216+#define _PAGE_USER (_AT(pteval_t, 1) << _PAGE_BIT_USER)
25217+#define _PAGE_PWT (_AT(pteval_t, 1) << _PAGE_BIT_PWT)
25218+#define _PAGE_PCD (_AT(pteval_t, 1) << _PAGE_BIT_PCD)
25219+#define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED)
25220+#define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
25221+#define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE)
25222+#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
25223+#define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1)
25224+#define _PAGE_UNUSED2 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED2)
25225+#define _PAGE_IO (_AT(pteval_t, 1) << _PAGE_BIT_IO)
25226+#define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
25227+#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
25228+#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
25229+#define __HAVE_ARCH_PTE_SPECIAL
25230
25231 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
25232-#define _PAGE_NX (_AC(1, ULL) << _PAGE_BIT_NX)
25233+#define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX)
25234 #else
25235-#define _PAGE_NX 0
25236+#define _PAGE_NX (_AT(pteval_t, 0))
25237 #endif
25238
25239-#define _PAGE_FILE (_AC(1, L)<<_PAGE_BIT_FILE)
25240-#define _PAGE_PROTNONE (_AC(1, L)<<_PAGE_BIT_PROTNONE)
25241+#define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE)
25242+#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
25243
25244 #ifndef __ASSEMBLY__
25245 #if defined(CONFIG_X86_64) && CONFIG_XEN_COMPAT <= 0x030002
25246@@ -71,8 +69,8 @@ extern unsigned int __kernel_page_user;
25247 _PAGE_DIRTY | __kernel_page_user)
25248
25249 /* Set of bits not changed in pte_modify */
25250-#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_CACHE_MASK | _PAGE_IO | \
25251- _PAGE_ACCESSED | _PAGE_DIRTY)
25252+#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_CACHE_MASK | _PAGE_IO | \
25253+ _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY)
25254
25255 /*
25256 * PAT settings are part of the hypervisor interface, which sets the
25257@@ -102,19 +100,9 @@ extern unsigned int __kernel_page_user;
25258 #define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
25259 _PAGE_ACCESSED)
25260
25261-#ifdef CONFIG_X86_32
25262-#define _PAGE_KERNEL_EXEC \
25263- (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED)
25264-#define _PAGE_KERNEL (_PAGE_KERNEL_EXEC | _PAGE_NX)
25265-
25266-#ifndef __ASSEMBLY__
25267-extern pteval_t __PAGE_KERNEL, __PAGE_KERNEL_EXEC;
25268-#endif /* __ASSEMBLY__ */
25269-#else
25270 #define __PAGE_KERNEL_EXEC \
25271 (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | __kernel_page_user)
25272 #define __PAGE_KERNEL (__PAGE_KERNEL_EXEC | _PAGE_NX)
25273-#endif
25274
25275 #define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW)
25276 #define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW)
25277@@ -125,25 +113,22 @@ extern pteval_t __PAGE_KERNEL, __PAGE_KE
25278 #define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER)
25279 #define __PAGE_KERNEL_VSYSCALL_NOCACHE (__PAGE_KERNEL_VSYSCALL | _PAGE_PCD | _PAGE_PWT)
25280 #define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE)
25281+#define __PAGE_KERNEL_LARGE_NOCACHE (__PAGE_KERNEL | _PAGE_CACHE_UC | _PAGE_PSE)
25282 #define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE)
25283
25284-/*
25285- * We don't support GLOBAL page in xenolinux64
25286- */
25287-#define MAKE_GLOBAL(x) __pgprot((x))
25288-
25289-#define PAGE_KERNEL MAKE_GLOBAL(__PAGE_KERNEL)
25290-#define PAGE_KERNEL_RO MAKE_GLOBAL(__PAGE_KERNEL_RO)
25291-#define PAGE_KERNEL_EXEC MAKE_GLOBAL(__PAGE_KERNEL_EXEC)
25292-#define PAGE_KERNEL_RX MAKE_GLOBAL(__PAGE_KERNEL_RX)
25293-#define PAGE_KERNEL_WC MAKE_GLOBAL(__PAGE_KERNEL_WC)
25294-#define PAGE_KERNEL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_NOCACHE)
25295-#define PAGE_KERNEL_UC_MINUS MAKE_GLOBAL(__PAGE_KERNEL_UC_MINUS)
25296-#define PAGE_KERNEL_EXEC_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_EXEC_NOCACHE)
25297-#define PAGE_KERNEL_LARGE MAKE_GLOBAL(__PAGE_KERNEL_LARGE)
25298-#define PAGE_KERNEL_LARGE_EXEC MAKE_GLOBAL(__PAGE_KERNEL_LARGE_EXEC)
25299-#define PAGE_KERNEL_VSYSCALL MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL)
25300-#define PAGE_KERNEL_VSYSCALL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL_NOCACHE)
25301+#define PAGE_KERNEL __pgprot(__PAGE_KERNEL)
25302+#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO)
25303+#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC)
25304+#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX)
25305+#define PAGE_KERNEL_WC __pgprot(__PAGE_KERNEL_WC)
25306+#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE)
25307+#define PAGE_KERNEL_UC_MINUS __pgprot(__PAGE_KERNEL_UC_MINUS)
25308+#define PAGE_KERNEL_EXEC_NOCACHE __pgprot(__PAGE_KERNEL_EXEC_NOCACHE)
25309+#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE)
25310+#define PAGE_KERNEL_LARGE_NOCACHE __pgprot(__PAGE_KERNEL_LARGE_NOCACHE)
25311+#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC)
25312+#define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL)
25313+#define PAGE_KERNEL_VSYSCALL_NOCACHE __pgprot(__PAGE_KERNEL_VSYSCALL_NOCACHE)
25314
25315 /* xwr */
25316 #define __P000 PAGE_NONE
25317@@ -182,27 +167,27 @@ extern struct list_head pgd_list;
25318 */
25319 static inline int pte_dirty(pte_t pte)
25320 {
25321- return __pte_val(pte) & _PAGE_DIRTY;
25322+ return pte_flags(pte) & _PAGE_DIRTY;
25323 }
25324
25325 static inline int pte_young(pte_t pte)
25326 {
25327- return __pte_val(pte) & _PAGE_ACCESSED;
25328+ return pte_flags(pte) & _PAGE_ACCESSED;
25329 }
25330
25331 static inline int pte_write(pte_t pte)
25332 {
25333- return __pte_val(pte) & _PAGE_RW;
25334+ return pte_flags(pte) & _PAGE_RW;
25335 }
25336
25337 static inline int pte_file(pte_t pte)
25338 {
25339- return __pte_val(pte) & _PAGE_FILE;
25340+ return pte_flags(pte) & _PAGE_FILE;
25341 }
25342
25343 static inline int pte_huge(pte_t pte)
25344 {
25345- return __pte_val(pte) & _PAGE_PSE;
25346+ return pte_flags(pte) & _PAGE_PSE;
25347 }
25348
25349 static inline int pte_global(pte_t pte)
25350@@ -212,12 +197,12 @@ static inline int pte_global(pte_t pte)
25351
25352 static inline int pte_exec(pte_t pte)
25353 {
25354- return !(__pte_val(pte) & _PAGE_NX);
25355+ return !(pte_flags(pte) & _PAGE_NX);
25356 }
25357
25358 static inline int pte_special(pte_t pte)
25359 {
25360- return 0;
25361+ return pte_flags(pte) & _PAGE_SPECIAL;
25362 }
25363
25364 static inline int pmd_large(pmd_t pte)
25365@@ -228,22 +213,22 @@ static inline int pmd_large(pmd_t pte)
25366
25367 static inline pte_t pte_mkclean(pte_t pte)
25368 {
25369- return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_DIRTY);
25370+ return __pte_ma(__pte_val(pte) & ~_PAGE_DIRTY);
25371 }
25372
25373 static inline pte_t pte_mkold(pte_t pte)
25374 {
25375- return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_ACCESSED);
25376+ return __pte_ma(__pte_val(pte) & ~_PAGE_ACCESSED);
25377 }
25378
25379 static inline pte_t pte_wrprotect(pte_t pte)
25380 {
25381- return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_RW);
25382+ return __pte_ma(__pte_val(pte) & ~_PAGE_RW);
25383 }
25384
25385 static inline pte_t pte_mkexec(pte_t pte)
25386 {
25387- return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_NX);
25388+ return __pte_ma(__pte_val(pte) & ~_PAGE_NX);
25389 }
25390
25391 static inline pte_t pte_mkdirty(pte_t pte)
25392@@ -268,7 +253,7 @@ static inline pte_t pte_mkhuge(pte_t pte
25393
25394 static inline pte_t pte_clrhuge(pte_t pte)
25395 {
25396- return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_PSE);
25397+ return __pte_ma(__pte_val(pte) & ~_PAGE_PSE);
25398 }
25399
25400 static inline pte_t pte_mkglobal(pte_t pte)
25401@@ -283,35 +268,46 @@ static inline pte_t pte_clrglobal(pte_t
25402
25403 static inline pte_t pte_mkspecial(pte_t pte)
25404 {
25405- return pte;
25406+ return __pte_ma(__pte_val(pte) | _PAGE_SPECIAL);
25407 }
25408
25409 extern pteval_t __supported_pte_mask;
25410
25411 static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
25412 {
25413- return __pte((((phys_addr_t)page_nr << PAGE_SHIFT) |
25414- pgprot_val(pgprot)) & __supported_pte_mask);
25415+ pgprotval_t prot = pgprot_val(pgprot);
25416+
25417+ if (prot & _PAGE_PRESENT)
25418+ prot &= __supported_pte_mask;
25419+ return __pte(((phys_addr_t)page_nr << PAGE_SHIFT) | prot);
25420 }
25421
25422 static inline pte_t pfn_pte_ma(unsigned long page_nr, pgprot_t pgprot)
25423 {
25424- return __pte_ma((((phys_addr_t)page_nr << PAGE_SHIFT) |
25425- pgprot_val(pgprot)) & __supported_pte_mask);
25426+ pgprotval_t prot = pgprot_val(pgprot);
25427+
25428+ if (prot & _PAGE_PRESENT)
25429+ prot &= __supported_pte_mask;
25430+ return __pte_ma(((phys_addr_t)page_nr << PAGE_SHIFT) | prot);
25431 }
25432
25433 static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
25434 {
25435- return __pmd((((phys_addr_t)page_nr << PAGE_SHIFT) |
25436- pgprot_val(pgprot)) & __supported_pte_mask);
25437+ pgprotval_t prot = pgprot_val(pgprot);
25438+
25439+ if (prot & _PAGE_PRESENT)
25440+ prot &= __supported_pte_mask;
25441+ return __pmd(((phys_addr_t)page_nr << PAGE_SHIFT) | prot);
25442 }
25443
25444 static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
25445 {
25446- pteval_t val = pte_val(pte);
25447+ pgprotval_t prot = pgprot_val(newprot);
25448+ pteval_t val = pte_val(pte) & _PAGE_CHG_MASK;
25449
25450- val &= _PAGE_CHG_MASK;
25451- val |= pgprot_val(newprot) & (~_PAGE_CHG_MASK) & __supported_pte_mask;
25452+ if (prot & _PAGE_PRESENT)
25453+ prot &= __supported_pte_mask;
25454+ val |= prot & ~_PAGE_CHG_MASK;
25455
25456 return __pte(val);
25457 }
25458@@ -325,9 +321,11 @@ static inline pgprot_t pgprot_modify(pgp
25459 return __pgprot(preservebits | addbits);
25460 }
25461
25462-#define pte_pgprot(x) __pgprot(__pte_val(x) & ~PTE_MASK)
25463+#define pte_pgprot(x) __pgprot(pte_flags(x) & PTE_FLAGS_MASK)
25464
25465-#define canon_pgprot(p) __pgprot(pgprot_val(p) & __supported_pte_mask)
25466+#define canon_pgprot(p) __pgprot(pgprot_val(p) & _PAGE_PRESENT \
25467+ ? pgprot_val(p) & __supported_pte_mask \
25468+ : pgprot_val(p))
25469
25470 #ifndef __ASSEMBLY__
25471 #define __HAVE_PHYS_MEM_ACCESS_PROT
25472@@ -338,6 +336,17 @@ int phys_mem_access_prot_allowed(struct
25473 unsigned long size, pgprot_t *vma_prot);
25474 #endif
25475
25476+/* Install a pte for a particular vaddr in kernel space. */
25477+void set_pte_vaddr(unsigned long vaddr, pte_t pte);
25478+
25479+#ifndef CONFIG_XEN
25480+extern void native_pagetable_setup_start(pgd_t *base);
25481+extern void native_pagetable_setup_done(pgd_t *base);
25482+#else
25483+static inline void xen_pagetable_setup_start(pgd_t *base) {}
25484+static inline void xen_pagetable_setup_done(pgd_t *base) {}
25485+#endif
25486+
25487 #define set_pte(ptep, pte) xen_set_pte(ptep, pte)
25488 #define set_pte_at(mm, addr, ptep, pte) xen_set_pte_at(mm, addr, ptep, pte)
25489
25490@@ -373,6 +382,26 @@ int phys_mem_access_prot_allowed(struct
25491 # include "pgtable_64.h"
25492 #endif
25493
25494+/*
25495+ * the pgd page can be thought of an array like this: pgd_t[PTRS_PER_PGD]
25496+ *
25497+ * this macro returns the index of the entry in the pgd page which would
25498+ * control the given virtual address
25499+ */
25500+#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
25501+
25502+/*
25503+ * pgd_offset() returns a (pgd_t *)
25504+ * pgd_index() is used get the offset into the pgd page's array of pgd_t's;
25505+ */
25506+#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address)))
25507+/*
25508+ * a shortcut which implies the use of the kernel's pgd, instead
25509+ * of a process's
25510+ */
25511+#define pgd_offset_k(address) pgd_offset(&init_mm, (address))
25512+
25513+
25514 #define KERNEL_PGD_BOUNDARY pgd_index(PAGE_OFFSET)
25515 #define KERNEL_PGD_PTRS (PTRS_PER_PGD - KERNEL_PGD_BOUNDARY)
25516
25517@@ -383,8 +412,15 @@ enum {
25518 PG_LEVEL_4K,
25519 PG_LEVEL_2M,
25520 PG_LEVEL_1G,
25521+ PG_LEVEL_NUM
25522 };
25523
25524+#ifdef CONFIG_PROC_FS
25525+extern void update_page_count(int level, unsigned long pages);
25526+#else
25527+static inline void update_page_count(int level, unsigned long pages) { }
25528+#endif
25529+
25530 /*
25531 * Helper function that returns the kernel pagetable entry controlling
25532 * the virtual address 'address'. NULL means no pagetable entry present.
25533@@ -441,6 +477,8 @@ static inline void xen_pte_clear(struct
25534 * race with other CPU's that might be updating the dirty
25535 * bit at the same time.
25536 */
25537+struct vm_area_struct;
25538+
25539 #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
25540 extern int ptep_set_access_flags(struct vm_area_struct *vma,
25541 unsigned long address, pte_t *ptep,
25542@@ -523,9 +561,6 @@ static inline void clone_pgd_range(pgd_t
25543 memcpy(dst, src, count * sizeof(pgd_t));
25544 }
25545
25546-#define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) \
25547- xen_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable)
25548-
25549 #define arbitrary_virt_to_machine(va) \
25550 ({ \
25551 unsigned int __lvl; \
25552@@ -548,6 +583,34 @@ struct page *kmap_atomic_to_page(void *)
25553 #define ptep_to_machine(ptep) virt_to_machine(ptep)
25554 #endif
25555
25556+#define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
25557+static inline pte_t ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr,
25558+ pte_t *ptep)
25559+{
25560+#if CONFIG_XEN_COMPAT < 0x030300
25561+ if (unlikely(!xen_feature(XENFEAT_mmu_pt_update_preserve_ad)))
25562+ return ptep_get_and_clear(mm, addr, ptep);
25563+#endif
25564+ return *ptep;
25565+}
25566+
25567+static inline void ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
25568+ pte_t *ptep, pte_t pte)
25569+{
25570+ mmu_update_t u;
25571+
25572+#if CONFIG_XEN_COMPAT < 0x030300
25573+ if (unlikely(!xen_feature(XENFEAT_mmu_pt_update_preserve_ad))) {
25574+ set_pte_at(mm, addr, ptep, pte);
25575+ return;
25576+ }
25577+#endif
25578+	u.ptr = ptep_to_machine(ptep) | MMU_PT_UPDATE_PRESERVE_AD;
25579+ u.val = __pte_val(pte);
25580+ if (HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF))
25581+ BUG();
25582+}
25583+
25584 #include <asm-generic/pgtable.h>
25585
25586 #include <xen/features.h>
25587@@ -576,10 +639,6 @@ int touch_pte_range(struct mm_struct *mm
25588 unsigned long address,
25589 unsigned long size);
25590
25591-int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
25592- unsigned long addr, unsigned long end, pgprot_t newprot,
25593- int dirty_accountable);
25594-
25595 #endif /* __ASSEMBLY__ */
25596
25597 #endif /* _ASM_X86_PGTABLE_H */
25598--- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/pgtable-3level.h 2009-03-16 16:38:05.000000000 +0100
25599+++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/pgtable-3level.h 2009-06-04 10:21:39.000000000 +0200
25600@@ -14,11 +14,11 @@
25601 #define pmd_ERROR(e) \
25602 printk("%s:%d: bad pmd %p(%016Lx pfn %08Lx).\n", \
25603 __FILE__, __LINE__, &(e), __pmd_val(e), \
25604- (pmd_val(e) & PTE_MASK) >> PAGE_SHIFT)
25605+ (pmd_val(e) & PTE_PFN_MASK) >> PAGE_SHIFT)
25606 #define pgd_ERROR(e) \
25607 printk("%s:%d: bad pgd %p(%016Lx pfn %08Lx).\n", \
25608 __FILE__, __LINE__, &(e), __pgd_val(e), \
25609- (pgd_val(e) & PTE_MASK) >> PAGE_SHIFT)
25610+ (pgd_val(e) & PTE_PFN_MASK) >> PAGE_SHIFT)
25611
25612 static inline int pud_none(pud_t pud)
25613 {
25614@@ -27,7 +27,7 @@ static inline int pud_none(pud_t pud)
25615 }
25616 static inline int pud_bad(pud_t pud)
25617 {
25618- return (__pud_val(pud) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER)) != 0;
25619+ return (__pud_val(pud) & ~(PTE_PFN_MASK | _KERNPG_TABLE | _PAGE_USER)) != 0;
25620 }
25621
25622 static inline int pud_present(pud_t pud)
25623@@ -102,9 +102,9 @@ static inline void pud_clear(pud_t *pudp
25624 xen_tlb_flush();
25625 }
25626
25627-#define pud_page(pud) ((struct page *) __va(pud_val(pud) & PTE_MASK))
25628+#define pud_page(pud) ((struct page *) __va(pud_val(pud) & PTE_PFN_MASK))
25629
25630-#define pud_page_vaddr(pud) ((unsigned long) __va(pud_val(pud) & PTE_MASK))
25631+#define pud_page_vaddr(pud) ((unsigned long) __va(pud_val(pud) & PTE_PFN_MASK))
25632
25633
25634 /* Find an entry in the second-level page table.. */
25635--- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/pgtable_32.h 2009-03-16 16:38:05.000000000 +0100
25636+++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/pgtable_32.h 2009-06-04 10:21:39.000000000 +0200
25637@@ -89,10 +89,10 @@ extern unsigned long pg0[];
25638 /* pmd_present doesn't just test the _PAGE_PRESENT bit since wr.p.t.
25639 can temporarily clear it. */
25640 #define pmd_present(x) (__pmd_val(x))
25641-#define pmd_bad(x) ((__pmd_val(x) & (~PTE_MASK & ~_PAGE_USER & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT))
25642+#define pmd_bad(x) ((__pmd_val(x) & (PTE_FLAGS_MASK & ~_PAGE_USER & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT))
25643 #else
25644 #define pmd_present(x) (__pmd_val(x) & _PAGE_PRESENT)
25645-#define pmd_bad(x) ((__pmd_val(x) & (~PTE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
25646+#define pmd_bad(x) ((__pmd_val(x) & (PTE_FLAGS_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
25647 #endif
25648
25649
25650@@ -119,26 +119,6 @@ extern unsigned long pg0[];
25651 */
25652 #define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
25653
25654-/*
25655- * the pgd page can be thought of an array like this: pgd_t[PTRS_PER_PGD]
25656- *
25657- * this macro returns the index of the entry in the pgd page which would
25658- * control the given virtual address
25659- */
25660-#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
25661-#define pgd_index_k(addr) pgd_index((addr))
25662-
25663-/*
25664- * pgd_offset() returns a (pgd_t *)
25665- * pgd_index() is used get the offset into the pgd page's array of pgd_t's;
25666- */
25667-#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address)))
25668-
25669-/*
25670- * a shortcut which implies the use of the kernel's pgd, instead
25671- * of a process's
25672- */
25673-#define pgd_offset_k(address) pgd_offset(&init_mm, (address))
25674
25675 static inline int pud_large(pud_t pud) { return 0; }
25676
25677@@ -165,7 +145,7 @@ static inline int pud_large(pud_t pud) {
25678 #define pmd_page(pmd) (pfn_to_page(pmd_val((pmd)) >> PAGE_SHIFT))
25679
25680 #define pmd_page_vaddr(pmd) \
25681- ((unsigned long)__va(pmd_val((pmd)) & PTE_MASK))
25682+ ((unsigned long)__va(pmd_val((pmd)) & PTE_PFN_MASK))
25683
25684 #if defined(CONFIG_HIGHPTE)
25685 #define pte_offset_map(dir, address) \
25686--- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/pgtable_64.h 2009-03-16 16:38:05.000000000 +0100
25687+++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/pgtable_64.h 2009-06-04 10:21:39.000000000 +0200
25688@@ -23,6 +23,8 @@ extern void xen_init_pt(void);
25689 extern pud_t level3_kernel_pgt[512];
25690 extern pud_t level3_ident_pgt[512];
25691 extern pmd_t level2_kernel_pgt[512];
25692+extern pmd_t level2_fixmap_pgt[512];
25693+extern pmd_t level2_ident_pgt[512];
25694 extern pgd_t init_level4_pgt[];
25695
25696 #define swapper_pg_dir init_level4_pgt
25697@@ -79,6 +81,9 @@ extern void paging_init(void);
25698
25699 struct mm_struct;
25700
25701+void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte);
25702+
25703+
25704 #define __xen_pte_clear(ptep) xen_set_pte(ptep, __pte(0))
25705
25706 static inline void xen_set_pte(pte_t *ptep, pte_t pte)
25707@@ -145,29 +150,29 @@ static inline void xen_pgd_clear(pgd_t *
25708 #define PGDIR_MASK (~(PGDIR_SIZE - 1))
25709
25710
25711-#define MAXMEM _AC(0x00003fffffffffff, UL)
25712+#define MAXMEM _AC(0x000004ffffffffff, UL)
25713 #define VMALLOC_START _AC(0xffffc20000000000, UL)
25714 #define VMALLOC_END _AC(0xffffe1ffffffffff, UL)
25715 #define VMEMMAP_START _AC(0xffffe20000000000, UL)
25716 #define MODULES_VADDR _AC(0xffffffffa0000000, UL)
25717-#define MODULES_END _AC(0xfffffffffff00000, UL)
25718+#define MODULES_END _AC(0xffffffffff000000, UL)
25719 #define MODULES_LEN (MODULES_END - MODULES_VADDR)
25720
25721 #ifndef __ASSEMBLY__
25722
25723 static inline int pgd_bad(pgd_t pgd)
25724 {
25725- return (__pgd_val(pgd) & ~(PTE_MASK | _PAGE_USER)) != _KERNPG_TABLE;
25726+ return (__pgd_val(pgd) & ~(PTE_PFN_MASK | _PAGE_USER)) != _KERNPG_TABLE;
25727 }
25728
25729 static inline int pud_bad(pud_t pud)
25730 {
25731- return (__pud_val(pud) & ~(PTE_MASK | _PAGE_USER)) != _KERNPG_TABLE;
25732+ return (__pud_val(pud) & ~(PTE_PFN_MASK | _PAGE_USER)) != _KERNPG_TABLE;
25733 }
25734
25735 static inline int pmd_bad(pmd_t pmd)
25736 {
25737- return (__pmd_val(pmd) & ~(PTE_MASK | _PAGE_USER)) != _KERNPG_TABLE;
25738+ return (__pmd_val(pmd) & ~(PTE_PFN_MASK | _PAGE_USER)) != _KERNPG_TABLE;
25739 }
25740
25741 #define pte_none(x) (!(x).pte)
25742@@ -175,7 +180,7 @@ static inline int pmd_bad(pmd_t pmd)
25743
25744 #define pages_to_mb(x) ((x) >> (20 - PAGE_SHIFT)) /* FIXME: is this right? */
25745
25746-#define __pte_mfn(_pte) (((_pte).pte & PTE_MASK) >> PAGE_SHIFT)
25747+#define __pte_mfn(_pte) (((_pte).pte & PTE_PFN_MASK) >> PAGE_SHIFT)
25748 #define pte_mfn(_pte) ((_pte).pte & _PAGE_PRESENT ? \
25749 __pte_mfn(_pte) : pfn_to_mfn(__pte_mfn(_pte)))
25750 #define pte_pfn(_pte) ((_pte).pte & _PAGE_IO ? max_mapnr : \
25751@@ -200,11 +205,8 @@ static inline int pmd_bad(pmd_t pmd)
25752 * Level 4 access.
25753 */
25754 #define pgd_page_vaddr(pgd) \
25755- ((unsigned long)__va((unsigned long)pgd_val((pgd)) & PTE_MASK))
25756+ ((unsigned long)__va((unsigned long)pgd_val((pgd)) & PTE_PFN_MASK))
25757 #define pgd_page(pgd) (pfn_to_page(pgd_val((pgd)) >> PAGE_SHIFT))
25758-#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
25759-#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address)))
25760-#define pgd_offset_k(address) (init_level4_pgt + pgd_index((address)))
25761 #define pgd_present(pgd) (__pgd_val(pgd) & _PAGE_PRESENT)
25762 static inline int pgd_large(pgd_t pgd) { return 0; }
25763 #define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE)
25764@@ -226,7 +228,7 @@ static inline int pud_large(pud_t pte)
25765 }
25766
25767 /* PMD - Level 2 access */
25768-#define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val((pmd)) & PTE_MASK))
25769+#define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val((pmd)) & PTE_PFN_MASK))
25770 #define pmd_page(pmd) (pfn_to_page(pmd_val((pmd)) >> PAGE_SHIFT))
25771
25772 #define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))
25773--- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/processor.h 2009-03-16 16:38:05.000000000 +0100
25774+++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/processor.h 2009-06-04 10:21:39.000000000 +0200
25775@@ -134,7 +134,7 @@ extern __u32 cleared_cpu_caps[NCAPINTS
25776 #ifdef CONFIG_SMP
25777 DECLARE_PER_CPU(struct cpuinfo_x86, cpu_info);
25778 #define cpu_data(cpu) per_cpu(cpu_info, cpu)
25779-#define current_cpu_data cpu_data(smp_processor_id())
25780+#define current_cpu_data __get_cpu_var(cpu_info)
25781 #else
25782 #define cpu_data(cpu) boot_cpu_data
25783 #define current_cpu_data boot_cpu_data
25784@@ -153,7 +153,7 @@ static inline int hlt_works(int cpu)
25785
25786 extern void cpu_detect(struct cpuinfo_x86 *c);
25787
25788-extern void identify_cpu(struct cpuinfo_x86 *);
25789+extern void early_cpu_init(void);
25790 extern void identify_boot_cpu(void);
25791 extern void identify_secondary_cpu(struct cpuinfo_x86 *);
25792 extern void print_cpu_info(struct cpuinfo_x86 *);
25793@@ -267,15 +267,11 @@ struct tss_struct {
25794 struct thread_struct *io_bitmap_owner;
25795
25796 /*
25797- * Pad the TSS to be cacheline-aligned (size is 0x100):
25798- */
25799- unsigned long __cacheline_filler[35];
25800- /*
25801 * .. and then another 0x100 bytes for the emergency kernel stack:
25802 */
25803 unsigned long stack[64];
25804
25805-} __attribute__((packed));
25806+} ____cacheline_aligned;
25807
25808 DECLARE_PER_CPU(struct tss_struct, init_tss);
25809
25810@@ -668,11 +664,36 @@ static inline void __sti_mwait(unsigned
25811
25812 extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
25813
25814-extern int force_mwait;
25815-
25816 extern void select_idle_routine(const struct cpuinfo_x86 *c);
25817
25818 extern unsigned long boot_option_idle_override;
25819+extern unsigned long idle_halt;
25820+extern unsigned long idle_nomwait;
25821+
25822+#ifndef CONFIG_XEN
25823+/*
25824+ * on systems with caches, caches must be flashed as the absolute
25825+ * last instruction before going into a suspended halt. Otherwise,
25826+ * dirty data can linger in the cache and become stale on resume,
25827+ * leading to strange errors.
25828+ *
25829+ * perform a variety of operations to guarantee that the compiler
25830+ * will not reorder instructions. wbinvd itself is serializing
25831+ * so the processor will not reorder.
25832+ *
25833+ * Systems without cache can just go into halt.
25834+ */
25835+static inline void wbinvd_halt(void)
25836+{
25837+ mb();
25838+ /* check for clflush to determine if wbinvd is legal */
25839+ if (cpu_has_clflush)
25840+ asm volatile("cli; wbinvd; 1: hlt; jmp 1b" : : : "memory");
25841+ else
25842+ while (1)
25843+ halt();
25844+}
25845+#endif
25846
25847 extern void enable_sep_cpu(void);
25848 extern int sysenter_setup(void);
25849--- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/segment.h 2009-03-16 16:38:05.000000000 +0100
25850+++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/segment.h 2009-06-04 10:21:39.000000000 +0200
25851@@ -1,6 +1,15 @@
25852 #ifndef _ASM_X86_SEGMENT_H_
25853 #define _ASM_X86_SEGMENT_H_
25854
25855+/* Constructor for a conventional segment GDT (or LDT) entry */
25856+/* This is a macro so it can be used in initializers */
25857+#define GDT_ENTRY(flags, base, limit) \
25858+ ((((base) & 0xff000000ULL) << (56-24)) | \
25859+ (((flags) & 0x0000f0ffULL) << 40) | \
25860+ (((limit) & 0x000f0000ULL) << (48-16)) | \
25861+ (((base) & 0x00ffffffULL) << 16) | \
25862+ (((limit) & 0x0000ffffULL)))
25863+
25864 /* Simple and small GDT entries for booting only */
25865
25866 #define GDT_ENTRY_BOOT_CS 2
25867@@ -61,18 +70,14 @@
25868 #define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
25869
25870 #define GDT_ENTRY_DEFAULT_USER_CS 14
25871-#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS * 8 + 3)
25872
25873 #define GDT_ENTRY_DEFAULT_USER_DS 15
25874-#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS * 8 + 3)
25875
25876 #define GDT_ENTRY_KERNEL_BASE 12
25877
25878 #define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE + 0)
25879-#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8)
25880
25881 #define GDT_ENTRY_KERNEL_DS (GDT_ENTRY_KERNEL_BASE + 1)
25882-#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8)
25883
25884 #define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE + 4)
25885 #define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE + 5)
25886@@ -143,10 +148,11 @@
25887 #else
25888 #include <asm/cache.h>
25889
25890-#define __KERNEL_CS 0x10
25891-#define __KERNEL_DS 0x18
25892+#define GDT_ENTRY_KERNEL32_CS 1
25893+#define GDT_ENTRY_KERNEL_CS 2
25894+#define GDT_ENTRY_KERNEL_DS 3
25895
25896-#define __KERNEL32_CS 0x08
25897+#define __KERNEL32_CS (GDT_ENTRY_KERNEL32_CS * 8)
25898
25899 /*
25900 * we cannot use the same code segment descriptor for user and kernel
25901@@ -154,10 +160,10 @@
25902 * The segment offset needs to contain a RPL. Grr. -AK
25903 * GDT layout to get 64bit syscall right (sysret hardcodes gdt offsets)
25904 */
25905-
25906-#define __USER32_CS 0x23 /* 4*8+3 */
25907-#define __USER_DS 0x2b /* 5*8+3 */
25908-#define __USER_CS 0x33 /* 6*8+3 */
25909+#define GDT_ENTRY_DEFAULT_USER32_CS 4
25910+#define GDT_ENTRY_DEFAULT_USER_DS 5
25911+#define GDT_ENTRY_DEFAULT_USER_CS 6
25912+#define __USER32_CS (GDT_ENTRY_DEFAULT_USER32_CS * 8 + 3)
25913 #define __USER32_DS __USER_DS
25914
25915 #define GDT_ENTRY_TSS 8 /* needs two entries */
25916@@ -179,6 +185,11 @@
25917
25918 #endif
25919
25920+#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8)
25921+#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8)
25922+#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS* 8 + 3)
25923+#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS* 8 + 3)
25924+
25925 /* User mode is privilege level 3 */
25926 #define USER_RPL 0x3
25927 /* LDT segment has TI set, GDT has it cleared */
25928--- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/smp.h 2009-03-16 16:38:05.000000000 +0100
25929+++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/smp.h 2009-06-04 10:21:39.000000000 +0200
25930@@ -25,23 +25,16 @@ extern cpumask_t cpu_initialized;
25931 extern void (*mtrr_hook)(void);
25932 extern void zap_low_mappings(void);
25933
25934+extern int __cpuinit get_local_pda(int cpu);
25935+
25936 extern int smp_num_siblings;
25937 extern unsigned int num_processors;
25938 extern cpumask_t cpu_initialized;
25939
25940-#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
25941-extern u16 x86_cpu_to_apicid_init[];
25942-extern u16 x86_bios_cpu_apicid_init[];
25943-extern void *x86_cpu_to_apicid_early_ptr;
25944-extern void *x86_bios_cpu_apicid_early_ptr;
25945-#else
25946-#define x86_cpu_to_apicid_early_ptr NULL
25947-#define x86_bios_cpu_apicid_early_ptr NULL
25948-#endif
25949-
25950 DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
25951 DECLARE_PER_CPU(cpumask_t, cpu_core_map);
25952 DECLARE_PER_CPU(u16, cpu_llc_id);
25953+
25954 DECLARE_PER_CPU(u16, x86_cpu_to_apicid);
25955 DECLARE_PER_CPU(u16, x86_bios_cpu_apicid);
25956
25957@@ -63,9 +56,9 @@ struct smp_ops {
25958
25959 void (*smp_send_stop)(void);
25960 void (*smp_send_reschedule)(int cpu);
25961- int (*smp_call_function_mask)(cpumask_t mask,
25962- void (*func)(void *info), void *info,
25963- int wait);
25964+
25965+ void (*send_call_func_ipi)(cpumask_t mask);
25966+ void (*send_call_func_single_ipi)(int cpu);
25967 };
25968
25969 /* Globals due to paravirt */
25970@@ -106,11 +99,14 @@ static inline void smp_send_reschedule(i
25971 smp_ops.smp_send_reschedule(cpu);
25972 }
25973
25974-static inline int smp_call_function_mask(cpumask_t mask,
25975- void (*func) (void *info), void *info,
25976- int wait)
25977+static inline void arch_send_call_function_single_ipi(int cpu)
25978 {
25979- return smp_ops.smp_call_function_mask(mask, func, info, wait);
25980+ smp_ops.send_call_func_single_ipi(cpu);
25981+}
25982+
25983+static inline void arch_send_call_function_ipi(cpumask_t mask)
25984+{
25985+ smp_ops.send_call_func_ipi(mask);
25986 }
25987
25988 void native_smp_prepare_boot_cpu(void);
25989@@ -122,23 +118,19 @@ int native_cpu_up(unsigned int cpunum);
25990
25991 void xen_smp_send_stop(void);
25992 void xen_smp_send_reschedule(int cpu);
25993-int xen_smp_call_function_mask(cpumask_t mask,
25994- void (*func) (void *info), void *info,
25995- int wait);
25996+void xen_send_call_func_ipi(cpumask_t mask);
25997+void xen_send_call_func_single_ipi(int cpu);
25998
25999 #define smp_send_stop xen_smp_send_stop
26000 #define smp_send_reschedule xen_smp_send_reschedule
26001-#define smp_call_function_mask xen_smp_call_function_mask
26002-
26003-extern void prefill_possible_map(void);
26004+#define arch_send_call_function_single_ipi xen_send_call_func_single_ipi
26005+#define arch_send_call_function_ipi xen_send_call_func_ipi
26006
26007 #endif /* CONFIG_XEN */
26008
26009 extern int __cpu_disable(void);
26010 extern void __cpu_die(unsigned int cpu);
26011
26012-extern void prefill_possible_map(void);
26013-
26014 void smp_store_cpu_info(int id);
26015 #define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
26016
26017@@ -149,6 +141,14 @@ static inline int num_booting_cpus(void)
26018 }
26019 #endif /* CONFIG_SMP */
26020
26021+#if defined(CONFIG_SMP) && (defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_XEN))
26022+extern void prefill_possible_map(void);
26023+#else
26024+static inline void prefill_possible_map(void)
26025+{
26026+}
26027+#endif
26028+
26029 extern unsigned disabled_cpus __cpuinitdata;
26030
26031 #ifdef CONFIG_X86_32_SMP
26032@@ -216,12 +216,8 @@ static inline int hard_smp_processor_id(
26033 #endif /* CONFIG_X86_LOCAL_APIC */
26034
26035 #ifdef CONFIG_HOTPLUG_CPU
26036-extern void cpu_exit_clear(void);
26037 extern void cpu_uninit(void);
26038 #endif
26039
26040-extern void smp_alloc_memory(void);
26041-extern void lock_ipi_call_lock(void);
26042-extern void unlock_ipi_call_lock(void);
26043 #endif /* __ASSEMBLY__ */
26044 #endif
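
The smp.h hunk above drops the smp_call_function_mask hook from struct smp_ops and replaces it with two IPI callbacks; arch_send_call_function_ipi() and arch_send_call_function_single_ipi() become thin wrappers that dispatch through the ops table (the Xen build points them at xen_send_call_func_ipi()/xen_send_call_func_single_ipi() via the defines). A minimal user-space sketch of that indirection, assuming only standard C; the demo_* handlers and the cpumask_t stand-in are invented for illustration:

#include <stdio.h>

typedef unsigned long cpumask_t;	/* stand-in for the kernel's cpumask_t */

struct smp_ops {
	void (*send_call_func_ipi)(cpumask_t mask);
	void (*send_call_func_single_ipi)(int cpu);
};

static void demo_send_ipi(cpumask_t mask)	{ printf("IPI to mask %#lx\n", mask); }
static void demo_send_single_ipi(int cpu)	{ printf("IPI to CPU %d\n", cpu); }

static struct smp_ops smp_ops = { demo_send_ipi, demo_send_single_ipi };

static inline void arch_send_call_function_ipi(cpumask_t mask)
{
	smp_ops.send_call_func_ipi(mask);	/* mirrors the wrapper in the hunk */
}

static inline void arch_send_call_function_single_ipi(int cpu)
{
	smp_ops.send_call_func_single_ipi(cpu);
}

int main(void)
{
	arch_send_call_function_ipi(0x6);	/* CPUs 1 and 2 */
	arch_send_call_function_single_ipi(3);
	return 0;
}
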
26045--- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/spinlock.h 2009-03-16 16:38:05.000000000 +0100
26046+++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/spinlock.h 2009-06-04 11:09:05.000000000 +0200
26047@@ -38,6 +38,11 @@
26048 # define UNLOCK_LOCK_PREFIX
26049 #endif
26050
26051+/*
26052+ * Xen versions prior to 3.2.x have a race condition with HYPERVISOR_poll().
26053+ */
26054+#if CONFIG_XEN_COMPAT >= 0x030200
26055+
26056 int xen_spinlock_init(unsigned int cpu);
26057 void xen_spinlock_cleanup(unsigned int cpu);
26058 extern int xen_spin_wait(raw_spinlock_t *, unsigned int token);
26059@@ -65,14 +70,14 @@ extern void xen_spin_kick(raw_spinlock_t
26060 */
26061 #if (NR_CPUS < 256)
26062 #define TICKET_SHIFT 8
26063-#define __raw_spin_lock_preamble \
26064+#define __ticket_spin_lock_preamble \
26065 asm(LOCK_PREFIX "xaddw %w0, %2\n\t" \
26066 "cmpb %h0, %b0\n\t" \
26067 "sete %1" \
26068 : "=&Q" (token), "=qm" (free), "+m" (lock->slock) \
26069 : "0" (0x0100) \
26070 : "memory", "cc")
26071-#define __raw_spin_lock_body \
26072+#define __ticket_spin_lock_body \
26073 asm("1:\t" \
26074 "cmpb %h0, %b0\n\t" \
26075 "je 2f\n\t" \
26076@@ -88,7 +93,7 @@ extern void xen_spin_kick(raw_spinlock_t
26077 : "memory", "cc")
26078
26079
26080-static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock)
26081+static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock)
26082 {
26083 int tmp, new;
26084
26085@@ -107,7 +112,7 @@ static __always_inline int __raw_spin_tr
26086 return tmp;
26087 }
26088
26089-static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock)
26090+static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock)
26091 {
26092 unsigned int token;
26093 unsigned char kick;
26094@@ -124,7 +129,7 @@ static __always_inline void __raw_spin_u
26095 }
26096 #else
26097 #define TICKET_SHIFT 16
26098-#define __raw_spin_lock_preamble \
26099+#define __ticket_spin_lock_preamble \
26100 do { \
26101 unsigned int tmp; \
26102 asm(LOCK_PREFIX "xaddl %0, %2\n\t" \
26103@@ -136,7 +141,7 @@ static __always_inline void __raw_spin_u
26104 : "0" (0x00010000) \
26105 : "memory", "cc"); \
26106 } while (0)
26107-#define __raw_spin_lock_body \
26108+#define __ticket_spin_lock_body \
26109 do { \
26110 unsigned int tmp; \
26111 asm("shldl $16, %0, %2\n" \
26112@@ -155,7 +160,7 @@ static __always_inline void __raw_spin_u
26113 : "memory", "cc"); \
26114 } while (0)
26115
26116-static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock)
26117+static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock)
26118 {
26119 int tmp;
26120 int new;
26121@@ -177,7 +182,7 @@ static __always_inline int __raw_spin_tr
26122 return tmp;
26123 }
26124
26125-static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock)
26126+static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock)
26127 {
26128 unsigned int token, tmp;
26129 bool kick;
26130@@ -195,49 +200,161 @@ static __always_inline void __raw_spin_u
26131 }
26132 #endif
26133
26134-static inline int __raw_spin_is_locked(raw_spinlock_t *lock)
26135+static inline int __ticket_spin_is_locked(raw_spinlock_t *lock)
26136 {
26137 int tmp = ACCESS_ONCE(lock->slock);
26138
26139 return !!(((tmp >> TICKET_SHIFT) ^ tmp) & ((1 << TICKET_SHIFT) - 1));
26140 }
26141
26142-static inline int __raw_spin_is_contended(raw_spinlock_t *lock)
26143+static inline int __ticket_spin_is_contended(raw_spinlock_t *lock)
26144 {
26145 int tmp = ACCESS_ONCE(lock->slock);
26146
26147 return (((tmp >> TICKET_SHIFT) - tmp) & ((1 << TICKET_SHIFT) - 1)) > 1;
26148 }
26149
26150-static __always_inline void __raw_spin_lock(raw_spinlock_t *lock)
26151+static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock)
26152 {
26153 unsigned int token, count;
26154 bool free;
26155
26156- __raw_spin_lock_preamble;
26157+ __ticket_spin_lock_preamble;
26158 if (unlikely(!free))
26159 token = xen_spin_adjust(lock, token);
26160 do {
26161 count = 1 << 10;
26162- __raw_spin_lock_body;
26163+ __ticket_spin_lock_body;
26164 } while (unlikely(!count) && !xen_spin_wait(lock, token));
26165 }
26166
26167-static __always_inline void __raw_spin_lock_flags(raw_spinlock_t *lock,
26168- unsigned long flags)
26169+static __always_inline void __ticket_spin_lock_flags(raw_spinlock_t *lock,
26170+ unsigned long flags)
26171 {
26172 unsigned int token, count;
26173 bool free;
26174
26175- __raw_spin_lock_preamble;
26176+ __ticket_spin_lock_preamble;
26177 if (unlikely(!free))
26178 token = xen_spin_adjust(lock, token);
26179 do {
26180 count = 1 << 10;
26181- __raw_spin_lock_body;
26182+ __ticket_spin_lock_body;
26183 } while (unlikely(!count) && !xen_spin_wait_flags(lock, &token, flags));
26184 }
26185
26186+#define __raw_spin(n) __ticket_spin_##n
26187+
26188+#else /* CONFIG_XEN_COMPAT < 0x030200 */
26189+/*
26190+ * Define virtualization-friendly old-style lock byte lock, for use in
26191+ * pv_lock_ops if desired.
26192+ *
26193+ * This differs from the pre-2.6.24 spinlock by always using xchgb
26194+ * rather than decb to take the lock; this allows it to use a
26195+ * zero-initialized lock structure. It also maintains a 1-byte
26196+ * contention counter, so that we can implement
26197+ * __byte_spin_is_contended.
26198+ */
26199+struct __byte_spinlock {
26200+ u8 lock;
26201+#if NR_CPUS < 256
26202+ u8 spinners;
26203+#else
26204+#error NR_CPUS >= 256 support not implemented
26205+#endif
26206+};
26207+
26208+static inline int xen_spinlock_init(unsigned int cpu) { return 0; }
26209+static inline void xen_spinlock_cleanup(unsigned int cpu) {}
26210+
26211+static inline int __byte_spin_is_locked(raw_spinlock_t *lock)
26212+{
26213+ struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
26214+ return bl->lock != 0;
26215+}
26216+
26217+static inline int __byte_spin_is_contended(raw_spinlock_t *lock)
26218+{
26219+ struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
26220+ return bl->spinners != 0;
26221+}
26222+
26223+static inline void __byte_spin_lock(raw_spinlock_t *lock)
26224+{
26225+ struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
26226+ s8 val = 1;
26227+
26228+ asm("1: xchgb %1, %0\n"
26229+ " test %1,%1\n"
26230+ " jz 3f\n"
26231+ " " LOCK_PREFIX "incb %2\n"
26232+ "2: rep;nop\n"
26233+ " cmpb $1, %0\n"
26234+ " je 2b\n"
26235+ " " LOCK_PREFIX "decb %2\n"
26236+ " jmp 1b\n"
26237+ "3:"
26238+ : "+m" (bl->lock), "+q" (val), "+m" (bl->spinners): : "memory");
26239+}
26240+
26241+#define __byte_spin_lock_flags(lock, flags) __byte_spin_lock(lock)
26242+
26243+static inline int __byte_spin_trylock(raw_spinlock_t *lock)
26244+{
26245+ struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
26246+ u8 old = 1;
26247+
26248+ asm("xchgb %1,%0"
26249+ : "+m" (bl->lock), "+q" (old) : : "memory");
26250+
26251+ return old == 0;
26252+}
26253+
26254+static inline void __byte_spin_unlock(raw_spinlock_t *lock)
26255+{
26256+ struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
26257+ smp_wmb();
26258+ bl->lock = 0;
26259+}
26260+
26261+#define __raw_spin(n) __byte_spin_##n
26262+
26263+#endif /* CONFIG_XEN_COMPAT */
26264+
26265+static inline int __raw_spin_is_locked(raw_spinlock_t *lock)
26266+{
26267+ return __raw_spin(is_locked)(lock);
26268+}
26269+
26270+static inline int __raw_spin_is_contended(raw_spinlock_t *lock)
26271+{
26272+ return __raw_spin(is_contended)(lock);
26273+}
26274+
26275+static __always_inline void __raw_spin_lock(raw_spinlock_t *lock)
26276+{
26277+ __raw_spin(lock)(lock);
26278+}
26279+
26280+static __always_inline void __raw_spin_lock_flags(raw_spinlock_t *lock,
26281+ unsigned long flags)
26282+{
26283+ __raw_spin(lock_flags)(lock, flags);
26284+}
26285+
26286+static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock)
26287+{
26288+ return __raw_spin(trylock)(lock);
26289+}
26290+
26291+static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock)
26292+{
26293+ __raw_spin(unlock)(lock);
26294+}
26295+
26296+#undef __raw_spin
26297+
26298 static inline void __raw_spin_unlock_wait(raw_spinlock_t *lock)
26299 {
26300 while (__raw_spin_is_locked(lock))
00e5a55c
BS
26301--- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/system.h 2009-03-16 16:38:05.000000000 +0100
26302+++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/system.h 2009-06-04 10:21:39.000000000 +0200
cc90b958
BS
26303@@ -137,7 +137,7 @@ __asm__ __volatile__ ("movw %%dx,%1\n\t"
26304 #define set_base(ldt, base) _set_base(((char *)&(ldt)) , (base))
26305 #define set_limit(ldt, limit) _set_limit(((char *)&(ldt)) , ((limit)-1))
26306
26307-extern void load_gs_index(unsigned);
26308+extern void xen_load_gs_index(unsigned);
26309
26310 /*
26311 * Load a segment. Fall back on loading the zero
26312@@ -154,14 +154,14 @@ extern void load_gs_index(unsigned);
26313 "jmp 2b\n" \
26314 ".previous\n" \
26315 _ASM_EXTABLE(1b,3b) \
26316- : :"r" (value), "r" (0))
26317+ : :"r" (value), "r" (0) : "memory")
26318
26319
26320 /*
26321 * Save a segment register away
26322 */
26323 #define savesegment(seg, value) \
26324- asm volatile("mov %%" #seg ",%0":"=rm" (value))
26325+ asm("mov %%" #seg ",%0":"=r" (value) : : "memory")
26326
26327 static inline unsigned long get_limit(unsigned long segment)
26328 {
26329@@ -269,6 +269,7 @@ static inline void xen_wbinvd(void)
26330 #ifdef CONFIG_X86_64
26331 #define read_cr8() (xen_read_cr8())
26332 #define write_cr8(x) (xen_write_cr8(x))
26333+#define load_gs_index xen_load_gs_index
26334 #endif
26335
26336 /* Clear the 'TS' bit */
26337@@ -287,13 +288,12 @@ static inline void clflush(volatile void
26338 void disable_hlt(void);
26339 void enable_hlt(void);
26340
26341-extern int es7000_plat;
26342 void cpu_idle_wait(void);
26343
26344 extern unsigned long arch_align_stack(unsigned long sp);
26345 extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
26346
26347-void default_idle(void);
26348+void xen_idle(void);
26349
26350 /*
26351 * Force strict CPU ordering.
26352--- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/xor_64.h 2009-03-16 16:38:05.000000000 +0100
26353+++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/xor_64.h 2009-06-04 10:21:39.000000000 +0200
26354@@ -1,3 +1,6 @@
26355+#ifndef ASM_X86__XOR_64_H
26356+#define ASM_X86__XOR_64_H
26357+
26358 /*
26359 * x86-64 changes / gcc fixes from Andi Kleen.
26360 * Copyright 2002 Andi Kleen, SuSE Labs.
26361@@ -330,3 +333,5 @@ do { \
26362 We may also be able to load into the L1 only depending on how the cpu
26363 deals with a load to a line that is being prefetched. */
26364 #define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse)
26365+
26366+#endif /* ASM_X86__XOR_64_H */
26367--- sle11-2009-06-04.orig/include/asm-x86/mach-xen/irq_vectors.h 2009-03-16 16:33:40.000000000 +0100
26368+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
26369@@ -1,126 +0,0 @@
26370-/*
26371- * This file should contain #defines for all of the interrupt vector
26372- * numbers used by this architecture.
26373- *
26374- * In addition, there are some standard defines:
26375- *
26376- * FIRST_EXTERNAL_VECTOR:
26377- * The first free place for external interrupts
26378- *
26379- * SYSCALL_VECTOR:
26380- * The IRQ vector a syscall makes the user to kernel transition
26381- * under.
26382- *
26383- * TIMER_IRQ:
26384- * The IRQ number the timer interrupt comes in at.
26385- *
26386- * NR_IRQS:
26387- * The total number of interrupt vectors (including all the
26388- * architecture specific interrupts) needed.
26389- *
26390- */
26391-#ifndef _ASM_IRQ_VECTORS_H
26392-#define _ASM_IRQ_VECTORS_H
26393-
26394-/*
26395- * IDT vectors usable for external interrupt sources start
26396- * at 0x20:
26397- */
26398-#define FIRST_EXTERNAL_VECTOR 0x20
26399-
26400-#define SYSCALL_VECTOR 0x80
26401-
26402-/*
26403- * Vectors 0x20-0x2f are used for ISA interrupts.
26404- */
26405-
26406-#if 0
26407-/*
26408- * Special IRQ vectors used by the SMP architecture, 0xf0-0xff
26409- *
26410- * some of the following vectors are 'rare', they are merged
26411- * into a single vector (CALL_FUNCTION_VECTOR) to save vector space.
26412- * TLB, reschedule and local APIC vectors are performance-critical.
26413- *
26414- * Vectors 0xf0-0xfa are free (reserved for future Linux use).
26415- */
26416-#define SPURIOUS_APIC_VECTOR 0xff
26417-#define ERROR_APIC_VECTOR 0xfe
26418-#define INVALIDATE_TLB_VECTOR 0xfd
26419-#define RESCHEDULE_VECTOR 0xfc
26420-#define CALL_FUNCTION_VECTOR 0xfb
26421-
26422-#define THERMAL_APIC_VECTOR 0xf0
26423-/*
26424- * Local APIC timer IRQ vector is on a different priority level,
26425- * to work around the 'lost local interrupt if more than 2 IRQ
26426- * sources per level' errata.
26427- */
26428-#define LOCAL_TIMER_VECTOR 0xef
26429-#endif
26430-
26431-#define SPURIOUS_APIC_VECTOR 0xff
26432-#define ERROR_APIC_VECTOR 0xfe
26433-
26434-/*
26435- * First APIC vector available to drivers: (vectors 0x30-0xee)
26436- * we start at 0x31 to spread out vectors evenly between priority
26437- * levels. (0x80 is the syscall vector)
26438- */
26439-#define FIRST_DEVICE_VECTOR 0x31
26440-#define FIRST_SYSTEM_VECTOR 0xef
26441-
26442-/*
26443- * 16 8259A IRQ's, 208 potential APIC interrupt sources.
26444- * Right now the APIC is mostly only used for SMP.
26445- * 256 vectors is an architectural limit. (we can have
26446- * more than 256 devices theoretically, but they will
26447- * have to use shared interrupts)
26448- * Since vectors 0x00-0x1f are used/reserved for the CPU,
26449- * the usable vector space is 0x20-0xff (224 vectors)
26450- */
26451-
26452-#define RESCHEDULE_VECTOR 0
26453-#define CALL_FUNCTION_VECTOR 1
26454-#define SPIN_UNLOCK_VECTOR 2
26455-#define NR_IPIS 3
26456-
26457-/*
26458- * The maximum number of vectors supported by i386 processors
26459- * is limited to 256. For processors other than i386, NR_VECTORS
26460- * should be changed accordingly.
26461- */
26462-#define NR_VECTORS 256
26463-
26464-#define FPU_IRQ 13
26465-
26466-#define FIRST_VM86_IRQ 3
26467-#define LAST_VM86_IRQ 15
26468-#define invalid_vm86_irq(irq) ((irq) < 3 || (irq) > 15)
26469-
26470-/*
26471- * The flat IRQ space is divided into two regions:
26472- * 1. A one-to-one mapping of real physical IRQs. This space is only used
26473- * if we have physical device-access privilege. This region is at the
26474- * start of the IRQ space so that existing device drivers do not need
26475- * to be modified to translate physical IRQ numbers into our IRQ space.
26476- * 3. A dynamic mapping of inter-domain and Xen-sourced virtual IRQs. These
26477- * are bound using the provided bind/unbind functions.
26478- */
26479-
26480-#define PIRQ_BASE 0
26481-#if !defined(MAX_IO_APICS)
26482-# define NR_PIRQS (NR_VECTORS + 32 * NR_CPUS)
26483-#elif NR_CPUS < MAX_IO_APICS
26484-# define NR_PIRQS (NR_VECTORS + 32 * NR_CPUS)
26485-#else
26486-# define NR_PIRQS (NR_VECTORS + 32 * MAX_IO_APICS)
26487-#endif
26488-
26489-#define DYNIRQ_BASE (PIRQ_BASE + NR_PIRQS)
26490-#define NR_DYNIRQS 256
26491-
26492-#define NR_IRQS (NR_PIRQS + NR_DYNIRQS)
26493-#define NR_IRQ_VECTORS NR_IRQS
26494-
26495-#endif /* _ASM_IRQ_VECTORS_H */
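
The removed irq_vectors.h above also documented how the Xen flat IRQ space was sized: physical IRQs come first (NR_PIRQS, which is NR_VECTORS plus 32 slots per CPU or per IO-APIC, whichever bound applies), followed by NR_DYNIRQS dynamically bound event-channel IRQs. A small worked example of that arithmetic, simplified to the case where MAX_IO_APICS is defined, with NR_CPUS and MAX_IO_APICS values invented purely for illustration:

#include <stdio.h>

#define NR_VECTORS	256
#define NR_CPUS		8	/* example value */
#define MAX_IO_APICS	64	/* example value */
#define NR_DYNIRQS	256

#define PIRQ_BASE	0
#if NR_CPUS < MAX_IO_APICS
# define NR_PIRQS	(NR_VECTORS + 32 * NR_CPUS)
#else
# define NR_PIRQS	(NR_VECTORS + 32 * MAX_IO_APICS)
#endif
#define DYNIRQ_BASE	(PIRQ_BASE + NR_PIRQS)
#define NR_IRQS		(NR_PIRQS + NR_DYNIRQS)

int main(void)
{
	printf("NR_PIRQS    = %d\n", NR_PIRQS);		/* 256 + 32 * 8 = 512 */
	printf("DYNIRQ_BASE = %d\n", DYNIRQ_BASE);	/* 512 */
	printf("NR_IRQS     = %d\n", NR_IRQS);		/* 512 + 256 = 768 */
	return 0;
}
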
26496--- sle11-2009-06-04.orig/include/asm-x86/mach-xen/setup_arch_post.h 2009-06-04 11:08:07.000000000 +0200
26497+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
26498@@ -1,63 +0,0 @@
26499-/**
26500- * machine_specific_* - Hooks for machine specific setup.
26501- *
26502- * Description:
26503- * This is included late in kernel/setup.c so that it can make
26504- * use of all of the static functions.
26505- **/
26506-
26507-#include <xen/interface/callback.h>
26508-
26509-extern void hypervisor_callback(void);
26510-extern void failsafe_callback(void);
26511-extern void nmi(void);
26512-
26513-static void __init machine_specific_arch_setup(void)
26514-{
26515- int ret;
26516- static struct callback_register __initdata event = {
26517- .type = CALLBACKTYPE_event,
26518- .address = (unsigned long) hypervisor_callback,
26519- };
26520- static struct callback_register __initdata failsafe = {
26521- .type = CALLBACKTYPE_failsafe,
26522- .address = (unsigned long)failsafe_callback,
26523- };
26524- static struct callback_register __initdata syscall = {
26525- .type = CALLBACKTYPE_syscall,
26526- .address = (unsigned long)system_call,
26527- };
26528-#ifdef CONFIG_X86_LOCAL_APIC
26529- static struct callback_register __initdata nmi_cb = {
26530- .type = CALLBACKTYPE_nmi,
26531- .address = (unsigned long)nmi,
26532- };
26533-#endif
26534-
26535- ret = HYPERVISOR_callback_op(CALLBACKOP_register, &event);
26536- if (ret == 0)
26537- ret = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe);
26538- if (ret == 0)
26539- ret = HYPERVISOR_callback_op(CALLBACKOP_register, &syscall);
26540-#if CONFIG_XEN_COMPAT <= 0x030002
26541- if (ret == -ENOSYS)
26542- ret = HYPERVISOR_set_callbacks(
26543- event.address,
26544- failsafe.address,
26545- syscall.address);
26546-#endif
26547- BUG_ON(ret);
26548-
26549-#ifdef CONFIG_X86_LOCAL_APIC
26550- ret = HYPERVISOR_callback_op(CALLBACKOP_register, &nmi_cb);
26551-#if CONFIG_XEN_COMPAT <= 0x030002
26552- if (ret == -ENOSYS) {
26553- static struct xennmi_callback __initdata cb = {
26554- .handler_address = (unsigned long)nmi
26555- };
26556-
26557- HYPERVISOR_nmi_op(XENNMI_register_callback, &cb);
26558- }
26559-#endif
26560-#endif
26561-}
26562--- sle11-2009-06-04.orig/include/asm-x86/mach-xen/setup_arch_pre.h 2009-06-04 11:08:07.000000000 +0200
26563+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
26564@@ -1,5 +0,0 @@
26565-/* Hook to call BIOS initialisation function */
26566-
26567-#define ARCH_SETUP machine_specific_arch_setup();
26568-
26569-static void __init machine_specific_arch_setup(void);
26570--- sle11-2009-06-04.orig/include/asm-x86/traps.h 2009-06-04 11:08:07.000000000 +0200
26571+++ sle11-2009-06-04/include/asm-x86/traps.h 2009-06-04 10:21:39.000000000 +0200
26572@@ -23,6 +23,9 @@ asmlinkage void spurious_interrupt_bug(v
26573 #ifdef CONFIG_X86_MCE
26574 asmlinkage void machine_check(void);
26575 #endif /* CONFIG_X86_MCE */
26576+#ifdef CONFIG_X86_XEN
26577+asmlinkage void fixup_4gb_segment(void);
26578+#endif
26579
26580 void do_divide_error(struct pt_regs *, long);
26581 void do_overflow(struct pt_regs *, long);
26582@@ -48,6 +51,9 @@ void math_error(void __user *);
26583 void do_coprocessor_error(struct pt_regs *, long);
26584 void do_simd_coprocessor_error(struct pt_regs *, long);
26585 void do_spurious_interrupt_bug(struct pt_regs *, long);
26586+#ifdef CONFIG_XEN
26587+void do_fixup_4gb_segment(struct pt_regs *, long);
26588+#endif
26589 unsigned long patch_espfix_desc(unsigned long, unsigned long);
26590 asmlinkage void math_emulate(long);
26591
26592--- sle11-2009-06-04.orig/include/asm-x86/xen/interface_64.h 2009-06-04 11:08:07.000000000 +0200
26593+++ sle11-2009-06-04/include/asm-x86/xen/interface_64.h 2009-06-04 10:21:39.000000000 +0200
26594@@ -136,7 +136,7 @@ struct cpu_user_regs {
26595 uint16_t fs, _pad5[3]; /* Non-zero => takes precedence over fs_base. */
26596 uint16_t gs, _pad6[3]; /* Non-zero => takes precedence over gs_base_usr. */
26597 };
26598-DEFINE_GUEST_HANDLE_STRUCT(cpu_user_regs);
26599+DEFINE_XEN_GUEST_HANDLE_STRUCT(cpu_user_regs);
26600
26601 #undef __DECL_REG
26602
26603--- sle11-2009-06-04.orig/include/linux/page-flags.h 2009-03-16 16:38:05.000000000 +0100
26604+++ sle11-2009-06-04/include/linux/page-flags.h 2009-06-04 10:21:39.000000000 +0200
26605@@ -110,9 +110,11 @@ enum pageflags {
26606 /* Filesystems */
26607 PG_checked = PG_owner_priv_1,
26608
26609+#ifdef CONFIG_PARAVIRT_XEN
26610 /* XEN */
26611 PG_pinned = PG_owner_priv_1,
26612 PG_savepinned = PG_dirty,
26613+#endif
26614
26615 /* SLOB */
26616 PG_slob_page = PG_active,
26617@@ -187,8 +189,12 @@ PAGEFLAG(LRU, lru) __CLEARPAGEFLAG(LRU,
26618 PAGEFLAG(Active, active) __CLEARPAGEFLAG(Active, active)
26619 __PAGEFLAG(Slab, slab)
26620 PAGEFLAG(Checked, checked) /* Used by some filesystems */
26621+#if defined(CONFIG_XEN) || defined(CONFIG_PARAVIRT_XEN)
26622 PAGEFLAG(Pinned, pinned) TESTSCFLAG(Pinned, pinned) /* Xen */
26623+#endif
26624+#ifdef CONFIG_PARAVIRT_XEN
26625 PAGEFLAG(SavePinned, savepinned); /* Xen */
26626+#endif
26627 PAGEFLAG(Reserved, reserved) __CLEARPAGEFLAG(Reserved, reserved)
26628 PAGEFLAG(Private, private) __CLEARPAGEFLAG(Private, private)
26629 __SETPAGEFLAG(Private, private)
26630--- sle11-2009-06-04.orig/include/xen/interface/memory.h 2009-03-16 16:38:05.000000000 +0100
26631+++ sle11-2009-06-04/include/xen/interface/memory.h 2009-06-04 10:21:39.000000000 +0200
26632@@ -82,6 +82,7 @@ struct xen_memory_reservation {
26633 domid_t domid;
26634
26635 };
26636+DEFINE_XEN_GUEST_HANDLE_STRUCT(xen_memory_reservation);
26637 typedef struct xen_memory_reservation xen_memory_reservation_t;
26638 DEFINE_XEN_GUEST_HANDLE(xen_memory_reservation_t);
26639
26640@@ -167,11 +168,7 @@ struct xen_machphys_mfn_list {
26641 * any large discontiguities in the machine address space, 2MB gaps in
26642 * the machphys table will be represented by an MFN base of zero.
26643 */
26644-#ifndef CONFIG_PARAVIRT_XEN
26645 XEN_GUEST_HANDLE(xen_pfn_t) extent_start;
26646-#else
26647- ulong extent_start;
26648-#endif
26649
26650 /*
26651 * Number of extents written to the above array. This will be smaller
26652@@ -179,6 +176,7 @@ struct xen_machphys_mfn_list {
26653 */
26654 unsigned int nr_extents;
26655 };
26656+DEFINE_XEN_GUEST_HANDLE_STRUCT(xen_machphys_mfn_list);
26657 typedef struct xen_machphys_mfn_list xen_machphys_mfn_list_t;
26658 DEFINE_XEN_GUEST_HANDLE(xen_machphys_mfn_list_t);
26659
26660@@ -218,6 +216,7 @@ struct xen_add_to_physmap {
26661 /* GPFN where the source mapping page should appear. */
26662 xen_pfn_t gpfn;
26663 };
26664+DEFINE_XEN_GUEST_HANDLE_STRUCT(xen_add_to_physmap);
26665 typedef struct xen_add_to_physmap xen_add_to_physmap_t;
26666 DEFINE_XEN_GUEST_HANDLE(xen_add_to_physmap_t);
26667
26668@@ -250,21 +249,13 @@ struct xen_translate_gpfn_list {
26669 xen_ulong_t nr_gpfns;
26670
26671 /* List of GPFNs to translate. */
26672-#ifndef CONFIG_PARAVIRT_XEN
26673 XEN_GUEST_HANDLE(xen_pfn_t) gpfn_list;
26674-#else
26675- ulong gpfn_list;
26676-#endif
26677
26678 /*
26679 * Output list to contain MFN translations. May be the same as the input
26680 * list (in which case each input GPFN is overwritten with the output MFN).
26681 */
26682-#ifndef CONFIG_PARAVIRT_XEN
26683 XEN_GUEST_HANDLE(xen_pfn_t) mfn_list;
26684-#else
26685- ulong mfn_list;
26686-#endif
26687 };
26688 DEFINE_XEN_GUEST_HANDLE_STRUCT(xen_translate_gpfn_list);
26689 typedef struct xen_translate_gpfn_list xen_translate_gpfn_list_t;
26690--- sle11-2009-06-04.orig/kernel/hrtimer.c 2009-06-04 11:08:07.000000000 +0200
26691+++ sle11-2009-06-04/kernel/hrtimer.c 2009-06-04 10:21:39.000000000 +0200
26692@@ -1084,7 +1084,7 @@ ktime_t hrtimer_get_remaining(const stru
26693 }
26694 EXPORT_SYMBOL_GPL(hrtimer_get_remaining);
26695
26696-#ifdef CONFIG_NO_HZ
26697+#if defined(CONFIG_NO_HZ) || defined(CONFIG_NO_IDLE_HZ)
26698 /**
26699 * hrtimer_get_next_event - get the time until next expiry event
26700 *
26701--- sle11-2009-06-04.orig/kernel/kexec.c 2009-02-17 12:38:20.000000000 +0100
26702+++ sle11-2009-06-04/kernel/kexec.c 2009-06-04 10:21:39.000000000 +0200
26703@@ -54,7 +54,7 @@ int dump_after_notifier;
26704 unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
26705 u32
26706 #if defined(CONFIG_XEN) && defined(CONFIG_X86)
26707-__attribute__((__section__(".bss.page_aligned"), __aligned__(PAGE_SIZE)))
26708+__page_aligned_bss
26709 #endif
26710 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
26711 size_t vmcoreinfo_size;
26712--- sle11-2009-06-04.orig/kernel/timer.c 2009-06-04 11:08:07.000000000 +0200
26713+++ sle11-2009-06-04/kernel/timer.c 2009-06-04 10:21:39.000000000 +0200
26714@@ -884,7 +884,7 @@ static inline void __run_timers(struct t
26715 spin_unlock_irq(&base->lock);
26716 }
26717
26718-#ifdef CONFIG_NO_HZ
26719+#if defined(CONFIG_NO_HZ) || defined(CONFIG_NO_IDLE_HZ)
26720 /*
26721 * Find out when the next timer event is due to happen. This
26722 * is used on S/390 to stop all activity when a cpus is idle.
26723--- sle11-2009-06-04.orig/lib/swiotlb-xen.c 2009-03-16 16:38:05.000000000 +0100
26724+++ sle11-2009-06-04/lib/swiotlb-xen.c 2009-06-04 10:21:39.000000000 +0200
26725@@ -750,7 +750,7 @@ swiotlb_sync_sg_for_device(struct device
26726 }
26727
26728 int
26729-swiotlb_dma_mapping_error(dma_addr_t dma_addr)
26730+swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr)
26731 {
26732 return (dma_addr == virt_to_bus(io_tlb_overflow_buffer));
26733 }
26734--- sle11-2009-06-04.orig/mm/mprotect.c 2009-03-04 11:28:34.000000000 +0100
26735+++ sle11-2009-06-04/mm/mprotect.c 2009-06-04 10:21:39.000000000 +0200
26736@@ -92,8 +92,6 @@ static inline void change_pmd_range(stru
26737 next = pmd_addr_end(addr, end);
26738 if (pmd_none_or_clear_bad(pmd))
26739 continue;
26740- if (arch_change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable))
26741- continue;
26742 change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable);
26743 } while (pmd++, addr = next, addr != end);
26744 }