src/patches/suse-2.6.27.25/patches.xen/xen3-patch-2.6.27
1 From: Linux Kernel Mailing List <linux-kernel@vger.kernel.org>
2 Subject: [PATCH] Linux: Update to 2.6.27
3 Patch-mainline: 2.6.27
4
5 This patch contains the differences between Linux 2.6.26 and 2.6.27.
6
7 Acked-by: Jeff Mahoney <jeffm@suse.com>
8 Automatically created from "patches.kernel.org/patch-2.6.27" by xen-port-patches.py
9
10 --- sle11-2009-06-04.orig/arch/x86/Kconfig 2009-03-16 16:38:05.000000000 +0100
11 +++ sle11-2009-06-04/arch/x86/Kconfig 2009-06-04 10:21:39.000000000 +0200
12 @@ -594,7 +594,7 @@ config CALGARY_IOMMU_ENABLED_BY_DEFAULT
13 config AMD_IOMMU
14 bool "AMD IOMMU support"
15 select SWIOTLB
16 - depends on X86_64 && PCI && ACPI
17 + depends on X86_64 && PCI && ACPI && !X86_64_XEN
18 help
19 With this option you can enable support for AMD IOMMU hardware in
20 your system. An IOMMU is a hardware component which provides
21 @@ -629,8 +629,10 @@ config MAXSMP
22
23 config NR_CPUS
24 int "Maximum number of CPUs (2-4096)"
25 + range 2 32 if XEN
26 range 2 4096
27 depends on SMP
28 + default "32" if MAXSMP && XEN
29 default "4096" if MAXSMP
30 default "32" if X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000
31 default "16" if X86_64_XEN
32 @@ -1227,7 +1229,7 @@ config MTRR
33 config MTRR_SANITIZER
34 bool
35 prompt "MTRR cleanup support"
36 - depends on MTRR
37 + depends on MTRR && !XEN
38 help
39 Convert MTRR layout from continuous to discrete, so X drivers can
40 add writeback entries.
41 --- sle11-2009-06-04.orig/arch/x86/Kconfig.debug 2009-03-16 16:33:40.000000000 +0100
42 +++ sle11-2009-06-04/arch/x86/Kconfig.debug 2009-06-04 10:21:39.000000000 +0200
43 @@ -25,6 +25,7 @@ config STRICT_DEVMEM
44 config X86_VERBOSE_BOOTUP
45 bool "Enable verbose x86 bootup info messages"
46 default y
47 + depends on !XEN
48 help
49 Enables the informational output from the decompression stage
50 (e.g. bzImage) of the boot. If you disable this you will still
51 @@ -179,7 +180,7 @@ config MMIOTRACE_HOOKS
52
53 config MMIOTRACE
54 bool "Memory mapped IO tracing"
55 - depends on DEBUG_KERNEL && PCI
56 + depends on DEBUG_KERNEL && PCI && !XEN
57 select TRACING
58 select MMIOTRACE_HOOKS
59 help
60 --- sle11-2009-06-04.orig/arch/x86/Makefile 2009-02-16 16:18:36.000000000 +0100
61 +++ sle11-2009-06-04/arch/x86/Makefile 2009-06-04 10:21:39.000000000 +0200
62 @@ -116,8 +116,8 @@ mflags-$(CONFIG_X86_VOYAGER) := -Iinclud
63 mcore-$(CONFIG_X86_VOYAGER) := arch/x86/mach-voyager/
64
65 # Xen subarch support
66 -mflags-$(CONFIG_X86_XEN) := -Iinclude/asm-x86/mach-xen
67 -mcore-$(CONFIG_X86_XEN) := arch/x86/mach-xen/
68 +mflags-$(CONFIG_XEN) := -Iinclude/asm-x86/mach-xen
69 +mcore-$(CONFIG_XEN) := arch/x86/mach-xen/
70
71 # generic subarchitecture
72 mflags-$(CONFIG_X86_GENERICARCH):= -Iinclude/asm-x86/mach-generic
73 @@ -128,8 +128,10 @@ mcore-$(CONFIG_X86_GENERICARCH) := arch/
74 mflags-y += -Iinclude/asm-x86/mach-default
75
76 # 64 bit does not support subarch support - clear sub arch variables
77 +ifneq ($(CONFIG_XEN),y)
78 fcore-$(CONFIG_X86_64) :=
79 mcore-$(CONFIG_X86_64) :=
80 +endif
81
82 KBUILD_CFLAGS += $(mflags-y)
83 KBUILD_AFLAGS += $(mflags-y)
84 --- sle11-2009-06-04.orig/arch/x86/ia32/ia32entry-xen.S 2009-03-16 16:38:05.000000000 +0100
85 +++ sle11-2009-06-04/arch/x86/ia32/ia32entry-xen.S 2009-06-04 10:21:39.000000000 +0200
86 @@ -15,6 +15,16 @@
87 #include <asm/irqflags.h>
88 #include <linux/linkage.h>
89
90 +/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
91 +#include <linux/elf-em.h>
92 +#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE)
93 +#define __AUDIT_ARCH_LE 0x40000000
94 +
95 +#ifndef CONFIG_AUDITSYSCALL
96 +#define sysexit_audit int_ret_from_sys_call
97 +#define sysretl_audit int_ret_from_sys_call
98 +#endif
99 +
100 #define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8)
101
102 .macro IA32_ARG_FIXUP noebp=0
103 @@ -37,6 +47,11 @@
104 movq %rax,R8(%rsp)
105 .endm
106
107 + /*
108 + * Reload arg registers from stack in case ptrace changed them.
109 + * We don't reload %eax because syscall_trace_enter() returned
110 + * the value it wants us to use in the table lookup.
111 + */
112 .macro LOAD_ARGS32 offset
113 movl \offset(%rsp),%r11d
114 movl \offset+8(%rsp),%r10d
115 @@ -46,7 +61,6 @@
116 movl \offset+48(%rsp),%edx
117 movl \offset+56(%rsp),%esi
118 movl \offset+64(%rsp),%edi
119 - movl \offset+72(%rsp),%eax
120 .endm
121
122 .macro CFI_STARTPROC32 simple
123 @@ -61,6 +75,19 @@
124 CFI_UNDEFINED r15
125 .endm
126
127 +#ifdef CONFIG_PARAVIRT
128 +ENTRY(native_usergs_sysret32)
129 + swapgs
130 + sysretl
131 +ENDPROC(native_usergs_sysret32)
132 +
133 +ENTRY(native_irq_enable_sysexit)
134 + swapgs
135 + sti
136 + sysexit
137 +ENDPROC(native_irq_enable_sysexit)
138 +#endif
139 +
140 /*
141 * 32bit SYSENTER instruction entry.
142 *
143 @@ -98,7 +125,7 @@ ENTRY(ia32_sysenter_target)
144 CFI_RESTORE rcx
145 movl %ebp,%ebp /* zero extension */
146 movl %eax,%eax
147 - movl 48-THREAD_SIZE+threadinfo_sysenter_return(%rsp),%r10d
148 + movl 48-THREAD_SIZE+TI_sysenter_return(%rsp),%r10d
149 movl $__USER32_DS,40(%rsp)
150 movq %rbp,32(%rsp)
151 movl $__USER32_CS,16(%rsp)
152 @@ -113,19 +140,79 @@ ENTRY(ia32_sysenter_target)
153 .quad 1b,ia32_badarg
154 .previous
155 GET_THREAD_INFO(%r10)
156 - orl $TS_COMPAT,threadinfo_status(%r10)
157 - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
158 + orl $TS_COMPAT,TI_status(%r10)
159 + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
160 jnz sysenter_tracesys
161 -sysenter_do_call:
162 cmpl $(IA32_NR_syscalls-1),%eax
163 ja ia32_badsys
164 +sysenter_do_call:
165 IA32_ARG_FIXUP 1
166 +sysenter_dispatch:
167 call *ia32_sys_call_table(,%rax,8)
168 movq %rax,RAX-ARGOFFSET(%rsp)
169 + GET_THREAD_INFO(%r10)
170 + DISABLE_INTERRUPTS(CLBR_NONE)
171 + TRACE_IRQS_OFF
172 + testl $_TIF_ALLWORK_MASK,TI_flags(%r10)
173 + jnz sysexit_audit
174 + jmp int_ret_from_sys_call
175 +
176 +#ifdef CONFIG_AUDITSYSCALL
177 + .macro auditsys_entry_common
178 + movl %esi,%r9d /* 6th arg: 4th syscall arg */
179 + movl %edx,%r8d /* 5th arg: 3rd syscall arg */
180 + /* (already in %ecx) 4th arg: 2nd syscall arg */
181 + movl %ebx,%edx /* 3rd arg: 1st syscall arg */
182 + movl %eax,%esi /* 2nd arg: syscall number */
183 + movl $AUDIT_ARCH_I386,%edi /* 1st arg: audit arch */
184 + call audit_syscall_entry
185 + movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall number */
186 + cmpl $(IA32_NR_syscalls-1),%eax
187 + ja ia32_badsys
188 + movl %ebx,%edi /* reload 1st syscall arg */
189 + movl RCX-ARGOFFSET(%rsp),%esi /* reload 2nd syscall arg */
190 + movl RDX-ARGOFFSET(%rsp),%edx /* reload 3rd syscall arg */
191 + movl RSI-ARGOFFSET(%rsp),%ecx /* reload 4th syscall arg */
192 + movl RDI-ARGOFFSET(%rsp),%r8d /* reload 5th syscall arg */
193 + .endm
194 +
195 + .macro auditsys_exit exit,ebpsave=RBP
196 + testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
197 + jnz int_ret_from_sys_call
198 + TRACE_IRQS_ON
199 + ENABLE_INTERRUPTS(CLBR_NONE)
200 + movl %eax,%esi /* second arg, syscall return value */
201 + cmpl $0,%eax /* is it < 0? */
202 + setl %al /* 1 if so, 0 if not */
203 + movzbl %al,%edi /* zero-extend that into %edi */
204 + inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
205 + call audit_syscall_exit
206 + GET_THREAD_INFO(%r10)
207 + movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall return value */
208 + movl \ebpsave-ARGOFFSET(%rsp),%ebp /* reload user register value */
209 + movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
210 + DISABLE_INTERRUPTS(CLBR_NONE)
211 + TRACE_IRQS_OFF
212 + testl %edi,TI_flags(%r10)
213 + jnz int_with_check
214 jmp int_ret_from_sys_call
215 + .endm
216 +
217 +sysenter_auditsys:
218 + auditsys_entry_common
219 + movl %ebp,%r9d /* reload 6th syscall arg */
220 + jmp sysenter_dispatch
221 +
222 +sysexit_audit:
223 + auditsys_exit sysexit_from_sys_call
224 +#endif
225
226 sysenter_tracesys:
227 xchgl %r9d,%ebp
228 +#ifdef CONFIG_AUDITSYSCALL
229 + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
230 + jz sysenter_auditsys
231 +#endif
232 SAVE_REST
233 CLEAR_RREGS
234 movq %r9,R9(%rsp)
235 @@ -186,18 +273,38 @@ ENTRY(ia32_cstar_target)
236 .quad 1b,ia32_badarg
237 .previous
238 GET_THREAD_INFO(%r10)
239 - orl $TS_COMPAT,threadinfo_status(%r10)
240 - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
241 + orl $TS_COMPAT,TI_status(%r10)
242 + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
243 jnz cstar_tracesys
244 cstar_do_call:
245 cmpl $IA32_NR_syscalls-1,%eax
246 ja ia32_badsys
247 IA32_ARG_FIXUP 1
248 +cstar_dispatch:
249 call *ia32_sys_call_table(,%rax,8)
250 movq %rax,RAX-ARGOFFSET(%rsp)
251 + GET_THREAD_INFO(%r10)
252 + DISABLE_INTERRUPTS(CLBR_NONE)
253 + testl $_TIF_ALLWORK_MASK,TI_flags(%r10)
254 + jnz sysretl_audit
255 jmp int_ret_from_sys_call
256
257 -cstar_tracesys:
258 +#ifdef CONFIG_AUDITSYSCALL
259 +cstar_auditsys:
260 + movl %r9d,R9-ARGOFFSET(%rsp) /* register to be clobbered by call */
261 + auditsys_entry_common
262 + movl R9-ARGOFFSET(%rsp),%r9d /* reload 6th syscall arg */
263 + jmp cstar_dispatch
264 +
265 +sysretl_audit:
266 + auditsys_exit sysretl_from_sys_call, RCX /* user %ebp in RCX slot */
267 +#endif
268 +
269 +cstar_tracesys:
270 +#ifdef CONFIG_AUDITSYSCALL
271 + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
272 + jz cstar_auditsys
273 +#endif
274 xchgl %r9d,%ebp
275 SAVE_REST
276 CLEAR_RREGS
277 @@ -263,8 +370,8 @@ ENTRY(ia32_syscall)
278 this could be a problem. */
279 SAVE_ARGS 0,0,1
280 GET_THREAD_INFO(%r10)
281 - orl $TS_COMPAT,threadinfo_status(%r10)
282 - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
283 + orl $TS_COMPAT,TI_status(%r10)
284 + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
285 jnz ia32_tracesys
286 ia32_do_syscall:
287 cmpl $(IA32_NR_syscalls-1),%eax
288 @@ -309,13 +416,11 @@ quiet_ni_syscall:
289 PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn, %rdi
290 PTREGSCALL stub32_sigreturn, sys32_sigreturn, %rdi
291 PTREGSCALL stub32_sigaltstack, sys32_sigaltstack, %rdx
292 - PTREGSCALL stub32_sigsuspend, sys32_sigsuspend, %rcx
293 PTREGSCALL stub32_execve, sys32_execve, %rcx
294 PTREGSCALL stub32_fork, sys_fork, %rdi
295 PTREGSCALL stub32_clone, sys32_clone, %rdx
296 PTREGSCALL stub32_vfork, sys_vfork, %rdi
297 PTREGSCALL stub32_iopl, sys_iopl, %rsi
298 - PTREGSCALL stub32_rt_sigsuspend, sys_rt_sigsuspend, %rdx
299
300 ENTRY(ia32_ptregs_common)
301 popq %r11
302 @@ -415,7 +520,7 @@ ia32_sys_call_table:
303 .quad sys_ssetmask
304 .quad sys_setreuid16 /* 70 */
305 .quad sys_setregid16
306 - .quad stub32_sigsuspend
307 + .quad sys32_sigsuspend
308 .quad compat_sys_sigpending
309 .quad sys_sethostname
310 .quad compat_sys_setrlimit /* 75 */
311 @@ -522,7 +627,7 @@ ia32_sys_call_table:
312 .quad sys32_rt_sigpending
313 .quad compat_sys_rt_sigtimedwait
314 .quad sys32_rt_sigqueueinfo
315 - .quad stub32_rt_sigsuspend
316 + .quad sys_rt_sigsuspend
317 .quad sys32_pread /* 180 */
318 .quad sys32_pwrite
319 .quad sys_chown16
320 @@ -670,4 +775,10 @@ ia32_sys_call_table:
321 .quad sys32_fallocate
322 .quad compat_sys_timerfd_settime /* 325 */
323 .quad compat_sys_timerfd_gettime
324 + .quad compat_sys_signalfd4
325 + .quad sys_eventfd2
326 + .quad sys_epoll_create1
327 + .quad sys_dup3 /* 330 */
328 + .quad sys_pipe2
329 + .quad sys_inotify_init1
330 ia32_syscall_end:
331 --- sle11-2009-06-04.orig/arch/x86/kernel/Makefile 2009-03-16 16:38:05.000000000 +0100
332 +++ sle11-2009-06-04/arch/x86/kernel/Makefile 2009-06-04 10:21:39.000000000 +0200
333 @@ -120,9 +120,10 @@ ifeq ($(CONFIG_X86_64),y)
334
335 obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o
336
337 - obj-$(CONFIG_XEN) += nmi_64.o
338 + obj-$(CONFIG_XEN) += nmi.o
339 time_64-$(CONFIG_XEN) += time_32.o
340 endif
341
342 -disabled-obj-$(CONFIG_XEN) := crash.o early-quirks.o hpet.o i8253.o i8259_$(BITS).o \
343 - pci-swiotlb_64.o reboot.o smpboot.o tlb_$(BITS).o tsc_$(BITS).o tsc_sync.o vsmp_64.o
344 +disabled-obj-$(CONFIG_XEN) := %_uv.o crash.o early-quirks.o hpet.o i8253.o \
345 + i8259.o irqinit_$(BITS).o pci-swiotlb_64.o reboot.o smpboot.o \
346 + tlb_$(BITS).o tsc.o tsc_sync.o vsmp_64.o
347 --- sle11-2009-06-04.orig/arch/x86/kernel/acpi/boot.c 2009-03-16 16:38:05.000000000 +0100
348 +++ sle11-2009-06-04/arch/x86/kernel/acpi/boot.c 2009-06-04 10:21:39.000000000 +0200
349 @@ -951,7 +951,9 @@ void __init mp_register_ioapic(int id, u
350 mp_ioapics[idx].mp_flags = MPC_APIC_USABLE;
351 mp_ioapics[idx].mp_apicaddr = address;
352
353 +#ifndef CONFIG_XEN
354 set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
355 +#endif
356 mp_ioapics[idx].mp_apicid = uniq_ioapic_id(id);
357 #ifdef CONFIG_X86_32
358 mp_ioapics[idx].mp_apicver = io_apic_get_version(idx);
359 @@ -1108,7 +1110,7 @@ int mp_register_gsi(u32 gsi, int trigger
360 {
361 int ioapic;
362 int ioapic_pin;
363 -#ifdef CONFIG_X86_32
364 +#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
365 #define MAX_GSI_NUM 4096
366 #define IRQ_COMPRESSION_START 64
367
368 @@ -1156,7 +1158,7 @@ int mp_register_gsi(u32 gsi, int trigger
369 if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) {
370 pr_debug(KERN_DEBUG "Pin %d-%d already programmed\n",
371 mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
372 -#ifdef CONFIG_X86_32
373 +#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
374 return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
375 #else
376 return gsi;
377 @@ -1164,7 +1166,7 @@ int mp_register_gsi(u32 gsi, int trigger
378 }
379
380 set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed);
381 -#ifdef CONFIG_X86_32
382 +#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
383 /*
384 * For GSI >= 64, use IRQ compression
385 */
386 --- sle11-2009-06-04.orig/arch/x86/kernel/acpi/sleep-xen.c 2009-03-16 16:38:05.000000000 +0100
387 +++ sle11-2009-06-04/arch/x86/kernel/acpi/sleep-xen.c 2009-06-04 10:21:39.000000000 +0200
388 @@ -9,6 +9,7 @@
389 #include <linux/bootmem.h>
390 #include <linux/dmi.h>
391 #include <linux/cpumask.h>
392 +#include <asm/segment.h>
393
394 #include "realmode/wakeup.h"
395 #include "sleep.h"
396 @@ -20,7 +21,7 @@ unsigned long acpi_realmode_flags;
397 /* address in low memory of the wakeup routine. */
398 static unsigned long acpi_realmode;
399
400 -#ifdef CONFIG_64BIT
401 +#if defined(CONFIG_SMP) && defined(CONFIG_64BIT)
402 static char temp_stack[10240];
403 #endif
404 #endif
405 @@ -54,18 +55,27 @@ int acpi_save_state_mem(void)
406 header->video_mode = saved_video_mode;
407
408 header->wakeup_jmp_seg = acpi_wakeup_address >> 4;
409 +
410 + /*
411 + * Set up the wakeup GDT. We set these up as Big Real Mode,
412 + * that is, with limits set to 4 GB. At least the Lenovo
413 + * Thinkpad X61 is known to need this for the video BIOS
414 + * initialization quirk to work; this is likely to also
415 + * be the case for other laptops or integrated video devices.
416 + */
417 +
418 /* GDT[0]: GDT self-pointer */
419 header->wakeup_gdt[0] =
420 (u64)(sizeof(header->wakeup_gdt) - 1) +
421 ((u64)(acpi_wakeup_address +
422 ((char *)&header->wakeup_gdt - (char *)acpi_realmode))
423 << 16);
424 - /* GDT[1]: real-mode-like code segment */
425 - header->wakeup_gdt[1] = (0x009bULL << 40) +
426 - ((u64)acpi_wakeup_address << 16) + 0xffff;
427 - /* GDT[2]: real-mode-like data segment */
428 - header->wakeup_gdt[2] = (0x0093ULL << 40) +
429 - ((u64)acpi_wakeup_address << 16) + 0xffff;
430 + /* GDT[1]: big real mode-like code segment */
431 + header->wakeup_gdt[1] =
432 + GDT_ENTRY(0x809b, acpi_wakeup_address, 0xfffff);
433 + /* GDT[2]: big real mode-like data segment */
434 + header->wakeup_gdt[2] =
435 + GDT_ENTRY(0x8093, acpi_wakeup_address, 0xfffff);
436
437 #ifndef CONFIG_64BIT
438 store_gdt((struct desc_ptr *)&header->pmode_gdt);
439 @@ -79,7 +89,7 @@ int acpi_save_state_mem(void)
440 #endif /* !CONFIG_64BIT */
441
442 header->pmode_cr0 = read_cr0();
443 - header->pmode_cr4 = read_cr4();
444 + header->pmode_cr4 = read_cr4_safe();
445 header->realmode_flags = acpi_realmode_flags;
446 header->real_magic = 0x12345678;
447
448 @@ -89,7 +99,9 @@ int acpi_save_state_mem(void)
449 saved_magic = 0x12345678;
450 #else /* CONFIG_64BIT */
451 header->trampoline_segment = setup_trampoline() >> 4;
452 - init_rsp = (unsigned long)temp_stack + 4096;
453 +#ifdef CONFIG_SMP
454 + stack_start.sp = temp_stack + 4096;
455 +#endif
456 initial_code = (unsigned long)wakeup_long64;
457 saved_magic = 0x123456789abcdef0;
458 #endif /* CONFIG_64BIT */
459 @@ -145,6 +157,12 @@ static int __init acpi_sleep_setup(char
460 acpi_realmode_flags |= 2;
461 if (strncmp(str, "s3_beep", 7) == 0)
462 acpi_realmode_flags |= 4;
463 +#ifdef CONFIG_HIBERNATION
464 + if (strncmp(str, "s4_nohwsig", 10) == 0)
465 + acpi_no_s4_hw_signature();
466 +#endif
467 + if (strncmp(str, "old_ordering", 12) == 0)
468 + acpi_old_suspend_ordering();
469 str = strchr(str, ',');
470 if (str != NULL)
471 str += strspn(str, ", \t");
472 --- sle11-2009-06-04.orig/arch/x86/kernel/apic_32-xen.c 2009-03-16 16:33:40.000000000 +0100
473 +++ sle11-2009-06-04/arch/x86/kernel/apic_32-xen.c 2009-06-04 10:21:39.000000000 +0200
474 @@ -59,7 +59,10 @@ static cpumask_t timer_bcast_ipi;
475 /*
476 * Debug level, exported for io_apic.c
477 */
478 -int apic_verbosity;
479 +unsigned int apic_verbosity;
480 +
481 +/* Have we found an MP table */
482 +int smp_found_config;
483
484 #ifndef CONFIG_XEN
485 static int modern_apic(void)
486 --- sle11-2009-06-04.orig/arch/x86/kernel/apic_64-xen.c 2009-03-16 16:33:40.000000000 +0100
487 +++ sle11-2009-06-04/arch/x86/kernel/apic_64-xen.c 2009-06-04 10:21:39.000000000 +0200
488 @@ -39,7 +39,10 @@ int disable_apic;
489 /*
490 * Debug level, exported for io_apic.c
491 */
492 -int apic_verbosity;
493 +unsigned int apic_verbosity;
494 +
495 +/* Have we found an MP table */
496 +int smp_found_config;
497
498 /*
499 * The guts of the apic timer interrupt
500 --- sle11-2009-06-04.orig/arch/x86/kernel/asm-offsets_64.c 2008-11-25 12:35:54.000000000 +0100
501 +++ sle11-2009-06-04/arch/x86/kernel/asm-offsets_64.c 2009-06-04 10:21:39.000000000 +0200
502 @@ -138,7 +138,7 @@ int main(void)
503
504 BLANK();
505 DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
506 -#ifdef CONFIG_XEN
507 +#ifdef CONFIG_PARAVIRT_XEN
508 BLANK();
509 OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
510 OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
511 --- sle11-2009-06-04.orig/arch/x86/kernel/cpu/amd_64.c 2009-06-04 11:08:07.000000000 +0200
512 +++ sle11-2009-06-04/arch/x86/kernel/cpu/amd_64.c 2009-06-04 10:21:39.000000000 +0200
513 @@ -193,6 +193,7 @@ static void __cpuinit init_amd(struct cp
514 fam10h_check_enable_mmcfg();
515 }
516
517 +#ifndef CONFIG_XEN
518 if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
519 unsigned long long tseg;
520
521 @@ -211,6 +212,7 @@ static void __cpuinit init_amd(struct cp
522 set_memory_4k((unsigned long)__va(tseg), 1);
523 }
524 }
525 +#endif
526 }
527
528 static struct cpu_dev amd_cpu_dev __cpuinitdata = {
529 --- sle11-2009-06-04.orig/arch/x86/kernel/cpu/bugs_64.c 2009-06-04 11:08:07.000000000 +0200
530 +++ sle11-2009-06-04/arch/x86/kernel/cpu/bugs_64.c 2009-06-04 10:21:39.000000000 +0200
531 @@ -20,6 +20,7 @@ void __init check_bugs(void)
532 #endif
533 alternative_instructions();
534
535 +#ifndef CONFIG_XEN
536 /*
537 * Make sure the first 2MB area is not mapped by huge pages
538 * There are typically fixed size MTRRs in there and overlapping
539 @@ -30,4 +31,5 @@ void __init check_bugs(void)
540 */
541 if (!direct_gbpages)
542 set_memory_4k((unsigned long)__va(0), 1);
543 +#endif
544 }
545 --- sle11-2009-06-04.orig/arch/x86/kernel/cpu/common-xen.c 2009-03-16 16:38:05.000000000 +0100
546 +++ sle11-2009-06-04/arch/x86/kernel/cpu/common-xen.c 2009-06-04 10:21:39.000000000 +0200
547 @@ -13,6 +13,7 @@
548 #include <asm/mtrr.h>
549 #include <asm/mce.h>
550 #include <asm/pat.h>
551 +#include <asm/asm.h>
552 #ifdef CONFIG_X86_LOCAL_APIC
553 #include <asm/mpspec.h>
554 #include <asm/apic.h>
555 @@ -341,11 +342,24 @@ static void __init early_cpu_detect(void
556
557 get_cpu_vendor(c, 1);
558
559 + early_get_cap(c);
560 +
561 if (c->x86_vendor != X86_VENDOR_UNKNOWN &&
562 cpu_devs[c->x86_vendor]->c_early_init)
563 cpu_devs[c->x86_vendor]->c_early_init(c);
564 +}
565
566 - early_get_cap(c);
567 +/*
568 + * The NOPL instruction is supposed to exist on all CPUs with
569 + * family >= 6; unfortunately, that's not true in practice because
570 + * of early VIA chips and (more importantly) broken virtualizers that
571 + * are not easy to detect. In the latter case it doesn't even *fail*
572 + * reliably, so probing for it doesn't even work. Disable it completely
573 + * unless we can find a reliable way to detect all the broken cases.
574 + */
575 +static void __cpuinit detect_nopl(struct cpuinfo_x86 *c)
576 +{
577 + clear_cpu_cap(c, X86_FEATURE_NOPL);
578 }
579
580 static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
581 @@ -402,8 +416,8 @@ static void __cpuinit generic_identify(s
582 }
583
584 init_scattered_cpuid_features(c);
585 + detect_nopl(c);
586 }
587 -
588 }
589
590 static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
591 @@ -434,7 +448,7 @@ __setup("serialnumber", x86_serial_nr_se
592 /*
593 * This does the hard work of actually picking apart the CPU stuff...
594 */
595 -void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
596 +static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
597 {
598 int i;
599
600 @@ -448,6 +462,8 @@ void __cpuinit identify_cpu(struct cpuin
601 c->x86_max_cores = 1;
602 c->x86_clflush_size = 32;
603 memset(&c->x86_capability, 0, sizeof c->x86_capability);
604 + if (boot_cpu_has(X86_FEATURE_SYSCALL32))
605 + set_cpu_cap(c, X86_FEATURE_SYSCALL32);
606
607 if (!have_cpuid_p()) {
608 /*
609 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
610 +++ sle11-2009-06-04/arch/x86/kernel/cpu/common_64-xen.c 2009-06-04 10:21:39.000000000 +0200
611 @@ -0,0 +1,771 @@
612 +#include <linux/init.h>
613 +#include <linux/kernel.h>
614 +#include <linux/sched.h>
615 +#include <linux/string.h>
616 +#include <linux/bootmem.h>
617 +#include <linux/bitops.h>
618 +#include <linux/module.h>
619 +#include <linux/kgdb.h>
620 +#include <linux/topology.h>
621 +#include <linux/delay.h>
622 +#include <linux/smp.h>
623 +#include <linux/percpu.h>
624 +#include <asm/i387.h>
625 +#include <asm/msr.h>
626 +#include <asm/io.h>
627 +#include <asm/linkage.h>
628 +#include <asm/mmu_context.h>
629 +#include <asm/mtrr.h>
630 +#include <asm/mce.h>
631 +#include <asm/pat.h>
632 +#include <asm/asm.h>
633 +#include <asm/numa.h>
634 +#ifdef CONFIG_X86_LOCAL_APIC
635 +#include <asm/mpspec.h>
636 +#include <asm/apic.h>
637 +#include <mach_apic.h>
638 +#elif defined(CONFIG_XEN)
639 +#include <mach_apic.h>
640 +#endif
641 +#include <asm/pda.h>
642 +#include <asm/pgtable.h>
643 +#include <asm/processor.h>
644 +#include <asm/desc.h>
645 +#include <asm/atomic.h>
646 +#include <asm/proto.h>
647 +#include <asm/sections.h>
648 +#include <asm/setup.h>
649 +#include <asm/genapic.h>
650 +
651 +#include "cpu.h"
652 +
653 +/* We need valid kernel segments for data and code in long mode too
654 + * IRET will check the segment types kkeil 2000/10/28
655 + * Also sysret mandates a special GDT layout
656 + */
657 +/* The TLS descriptors are currently at a different place compared to i386.
658 + Hopefully nobody expects them at a fixed place (Wine?) */
659 +DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
660 + [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
661 + [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },
662 + [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },
663 + [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },
664 + [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },
665 + [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } },
666 +} };
667 +EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
668 +
669 +__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
670 +
671 +/* Current gdt points %fs at the "master" per-cpu area: after this,
672 + * it's on the real one. */
673 +void switch_to_new_gdt(void)
674 +{
675 +#ifndef CONFIG_XEN
676 + struct desc_ptr gdt_descr;
677 +
678 + gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
679 + gdt_descr.size = GDT_SIZE - 1;
680 + load_gdt(&gdt_descr);
681 +#else
682 + void *va, *gdt_addr = get_cpu_gdt_table(smp_processor_id());
683 + unsigned long frames[16];
684 + unsigned int f = 0;
685 +
686 + for (va = gdt_addr; va < gdt_addr + GDT_SIZE; va += PAGE_SIZE) {
687 + frames[f++] = virt_to_mfn(va);
688 + make_page_readonly(va, XENFEAT_writable_descriptor_tables);
689 + }
690 + if (HYPERVISOR_set_gdt(frames, GDT_SIZE / sizeof(struct desc_struct)))
691 + BUG();
692 +#endif
693 +}
694 +
695 +struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
696 +
697 +static void __cpuinit default_init(struct cpuinfo_x86 *c)
698 +{
699 + display_cacheinfo(c);
700 +}
701 +
702 +static struct cpu_dev __cpuinitdata default_cpu = {
703 + .c_init = default_init,
704 + .c_vendor = "Unknown",
705 +};
706 +static struct cpu_dev *this_cpu __cpuinitdata = &default_cpu;
707 +
708 +int __cpuinit get_model_name(struct cpuinfo_x86 *c)
709 +{
710 + unsigned int *v;
711 +
712 + if (c->extended_cpuid_level < 0x80000004)
713 + return 0;
714 +
715 + v = (unsigned int *) c->x86_model_id;
716 + cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
717 + cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
718 + cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
719 + c->x86_model_id[48] = 0;
720 + return 1;
721 +}
722 +
723 +
724 +void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
725 +{
726 + unsigned int n, dummy, ebx, ecx, edx;
727 +
728 + n = c->extended_cpuid_level;
729 +
730 + if (n >= 0x80000005) {
731 + cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
732 + printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), "
733 + "D cache %dK (%d bytes/line)\n",
734 + edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
735 + c->x86_cache_size = (ecx>>24) + (edx>>24);
736 + /* On K8 L1 TLB is inclusive, so don't count it */
737 + c->x86_tlbsize = 0;
738 + }
739 +
740 + if (n >= 0x80000006) {
741 + cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
742 + ecx = cpuid_ecx(0x80000006);
743 + c->x86_cache_size = ecx >> 16;
744 + c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
745 +
746 + printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
747 + c->x86_cache_size, ecx & 0xFF);
748 + }
749 +}
750 +
751 +void __cpuinit detect_ht(struct cpuinfo_x86 *c)
752 +{
753 +#ifdef CONFIG_SMP
754 + u32 eax, ebx, ecx, edx;
755 + int index_msb, core_bits;
756 +
757 + cpuid(1, &eax, &ebx, &ecx, &edx);
758 +
759 +
760 + if (!cpu_has(c, X86_FEATURE_HT))
761 + return;
762 + if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
763 + goto out;
764 +
765 + smp_num_siblings = (ebx & 0xff0000) >> 16;
766 +
767 + if (smp_num_siblings == 1) {
768 + printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
769 + } else if (smp_num_siblings > 1) {
770 +
771 + if (smp_num_siblings > NR_CPUS) {
772 + printk(KERN_WARNING "CPU: Unsupported number of "
773 + "siblings %d", smp_num_siblings);
774 + smp_num_siblings = 1;
775 + return;
776 + }
777 +
778 + index_msb = get_count_order(smp_num_siblings);
779 + c->phys_proc_id = phys_pkg_id(index_msb);
780 +
781 + smp_num_siblings = smp_num_siblings / c->x86_max_cores;
782 +
783 + index_msb = get_count_order(smp_num_siblings);
784 +
785 + core_bits = get_count_order(c->x86_max_cores);
786 +
787 + c->cpu_core_id = phys_pkg_id(index_msb) &
788 + ((1 << core_bits) - 1);
789 + }
790 +out:
791 + if ((c->x86_max_cores * smp_num_siblings) > 1) {
792 + printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
793 + c->phys_proc_id);
794 + printk(KERN_INFO "CPU: Processor Core ID: %d\n",
795 + c->cpu_core_id);
796 + }
797 +
798 +#endif
799 +}
800 +
801 +static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
802 +{
803 + char *v = c->x86_vendor_id;
804 + int i;
805 + static int printed;
806 +
807 + for (i = 0; i < X86_VENDOR_NUM; i++) {
808 + if (cpu_devs[i]) {
809 + if (!strcmp(v, cpu_devs[i]->c_ident[0]) ||
810 + (cpu_devs[i]->c_ident[1] &&
811 + !strcmp(v, cpu_devs[i]->c_ident[1]))) {
812 + c->x86_vendor = i;
813 + this_cpu = cpu_devs[i];
814 + return;
815 + }
816 + }
817 + }
818 + if (!printed) {
819 + printed++;
820 + printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n");
821 + printk(KERN_ERR "CPU: Your system may be unstable.\n");
822 + }
823 + c->x86_vendor = X86_VENDOR_UNKNOWN;
824 +}
825 +
826 +static void __init early_cpu_support_print(void)
827 +{
828 + int i,j;
829 + struct cpu_dev *cpu_devx;
830 +
831 + printk("KERNEL supported cpus:\n");
832 + for (i = 0; i < X86_VENDOR_NUM; i++) {
833 + cpu_devx = cpu_devs[i];
834 + if (!cpu_devx)
835 + continue;
836 + for (j = 0; j < 2; j++) {
837 + if (!cpu_devx->c_ident[j])
838 + continue;
839 + printk(" %s %s\n", cpu_devx->c_vendor,
840 + cpu_devx->c_ident[j]);
841 + }
842 + }
843 +}
844 +
845 +/*
846 + * The NOPL instruction is supposed to exist on all CPUs with
847 + * family >= 6, unfortunately, that's not true in practice because
848 + * of early VIA chips and (more importantly) broken virtualizers that
849 + * are not easy to detect. Hence, probe for it based on first
850 + * principles.
851 + *
852 + * Note: no 64-bit chip is known to lack these, but put the code here
853 + * for consistency with 32 bits, and to make it utterly trivial to
854 + * diagnose the problem should it ever surface.
855 + */
856 +static void __cpuinit detect_nopl(struct cpuinfo_x86 *c)
857 +{
858 + const u32 nopl_signature = 0x888c53b1; /* Random number */
859 + u32 has_nopl = nopl_signature;
860 +
861 + clear_cpu_cap(c, X86_FEATURE_NOPL);
862 + if (c->x86 >= 6) {
863 + asm volatile("\n"
864 + "1: .byte 0x0f,0x1f,0xc0\n" /* nopl %eax */
865 + "2:\n"
866 + " .section .fixup,\"ax\"\n"
867 + "3: xor %0,%0\n"
868 + " jmp 2b\n"
869 + " .previous\n"
870 + _ASM_EXTABLE(1b,3b)
871 + : "+a" (has_nopl));
872 +
873 + if (has_nopl == nopl_signature)
874 + set_cpu_cap(c, X86_FEATURE_NOPL);
875 + }
876 +}
877 +
878 +static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c);
879 +
880 +void __init early_cpu_init(void)
881 +{
882 + struct cpu_vendor_dev *cvdev;
883 +
884 + for (cvdev = __x86cpuvendor_start ;
885 + cvdev < __x86cpuvendor_end ;
886 + cvdev++)
887 + cpu_devs[cvdev->vendor] = cvdev->cpu_dev;
888 + early_cpu_support_print();
889 + early_identify_cpu(&boot_cpu_data);
890 +}
891 +
892 +/* Do some early cpuid on the boot CPU to get some parameter that are
893 + needed before check_bugs. Everything advanced is in identify_cpu
894 + below. */
895 +static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
896 +{
897 + u32 tfms, xlvl;
898 +
899 + c->loops_per_jiffy = loops_per_jiffy;
900 + c->x86_cache_size = -1;
901 + c->x86_vendor = X86_VENDOR_UNKNOWN;
902 + c->x86_model = c->x86_mask = 0; /* So far unknown... */
903 + c->x86_vendor_id[0] = '\0'; /* Unset */
904 + c->x86_model_id[0] = '\0'; /* Unset */
905 + c->x86_clflush_size = 64;
906 + c->x86_cache_alignment = c->x86_clflush_size;
907 + c->x86_max_cores = 1;
908 + c->x86_coreid_bits = 0;
909 + c->extended_cpuid_level = 0;
910 + memset(&c->x86_capability, 0, sizeof c->x86_capability);
911 +
912 + /* Get vendor name */
913 + cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
914 + (unsigned int *)&c->x86_vendor_id[0],
915 + (unsigned int *)&c->x86_vendor_id[8],
916 + (unsigned int *)&c->x86_vendor_id[4]);
917 +
918 + get_cpu_vendor(c);
919 +
920 + /* Initialize the standard set of capabilities */
921 + /* Note that the vendor-specific code below might override */
922 +
923 + /* Intel-defined flags: level 0x00000001 */
924 + if (c->cpuid_level >= 0x00000001) {
925 + __u32 misc;
926 + cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4],
927 + &c->x86_capability[0]);
928 + c->x86 = (tfms >> 8) & 0xf;
929 + c->x86_model = (tfms >> 4) & 0xf;
930 + c->x86_mask = tfms & 0xf;
931 + if (c->x86 == 0xf)
932 + c->x86 += (tfms >> 20) & 0xff;
933 + if (c->x86 >= 0x6)
934 + c->x86_model += ((tfms >> 16) & 0xF) << 4;
935 + if (test_cpu_cap(c, X86_FEATURE_CLFLSH))
936 + c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
937 + } else {
938 + /* Have CPUID level 0 only - unheard of */
939 + c->x86 = 4;
940 + }
941 +
942 + c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xff;
943 +#ifdef CONFIG_SMP
944 + c->phys_proc_id = c->initial_apicid;
945 +#endif
946 + /* AMD-defined flags: level 0x80000001 */
947 + xlvl = cpuid_eax(0x80000000);
948 + c->extended_cpuid_level = xlvl;
949 + if ((xlvl & 0xffff0000) == 0x80000000) {
950 + if (xlvl >= 0x80000001) {
951 + c->x86_capability[1] = cpuid_edx(0x80000001);
952 + c->x86_capability[6] = cpuid_ecx(0x80000001);
953 + }
954 + if (xlvl >= 0x80000004)
955 + get_model_name(c); /* Default name */
956 + }
957 +
958 + /* Transmeta-defined flags: level 0x80860001 */
959 + xlvl = cpuid_eax(0x80860000);
960 + if ((xlvl & 0xffff0000) == 0x80860000) {
961 + /* Don't set x86_cpuid_level here for now to not confuse. */
962 + if (xlvl >= 0x80860001)
963 + c->x86_capability[2] = cpuid_edx(0x80860001);
964 + }
965 +
966 + if (c->extended_cpuid_level >= 0x80000007)
967 + c->x86_power = cpuid_edx(0x80000007);
968 +
969 + if (c->extended_cpuid_level >= 0x80000008) {
970 + u32 eax = cpuid_eax(0x80000008);
971 +
972 + c->x86_virt_bits = (eax >> 8) & 0xff;
973 + c->x86_phys_bits = eax & 0xff;
974 + }
975 +
976 + detect_nopl(c);
977 +
978 + if (c->x86_vendor != X86_VENDOR_UNKNOWN &&
979 + cpu_devs[c->x86_vendor]->c_early_init)
980 + cpu_devs[c->x86_vendor]->c_early_init(c);
981 +
982 + validate_pat_support(c);
983 +}
984 +
985 +/*
986 + * This does the hard work of actually picking apart the CPU stuff...
987 + */
988 +static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
989 +{
990 + int i;
991 +
992 + early_identify_cpu(c);
993 +
994 + init_scattered_cpuid_features(c);
995 +
996 + c->apicid = phys_pkg_id(0);
997 +
998 + /*
999 + * Vendor-specific initialization. In this section we
1000 + * canonicalize the feature flags, meaning if there are
1001 + * features a certain CPU supports which CPUID doesn't
1002 + * tell us, CPUID claiming incorrect flags, or other bugs,
1003 + * we handle them here.
1004 + *
1005 + * At the end of this section, c->x86_capability better
1006 + * indicate the features this CPU genuinely supports!
1007 + */
1008 + if (this_cpu->c_init)
1009 + this_cpu->c_init(c);
1010 +
1011 + detect_ht(c);
1012 +
1013 + /*
1014 + * On SMP, boot_cpu_data holds the common feature set between
1015 + * all CPUs; so make sure that we indicate which features are
1016 + * common between the CPUs. The first time this routine gets
1017 + * executed, c == &boot_cpu_data.
1018 + */
1019 + if (c != &boot_cpu_data) {
1020 + /* AND the already accumulated flags with these */
1021 + for (i = 0; i < NCAPINTS; i++)
1022 + boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
1023 + }
1024 +
1025 + /* Clear all flags overriden by options */
1026 + for (i = 0; i < NCAPINTS; i++)
1027 + c->x86_capability[i] &= ~cleared_cpu_caps[i];
1028 +
1029 +#ifdef CONFIG_X86_MCE
1030 + mcheck_init(c);
1031 +#endif
1032 + select_idle_routine(c);
1033 +
1034 +#ifdef CONFIG_NUMA
1035 + numa_add_cpu(smp_processor_id());
1036 +#endif
1037 +
1038 +}
1039 +
1040 +void __cpuinit identify_boot_cpu(void)
1041 +{
1042 + identify_cpu(&boot_cpu_data);
1043 +}
1044 +
1045 +void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
1046 +{
1047 + BUG_ON(c == &boot_cpu_data);
1048 + identify_cpu(c);
1049 + mtrr_ap_init();
1050 +}
1051 +
1052 +static __init int setup_noclflush(char *arg)
1053 +{
1054 + setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
1055 + return 1;
1056 +}
1057 +__setup("noclflush", setup_noclflush);
1058 +
1059 +void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
1060 +{
1061 + if (c->x86_model_id[0])
1062 + printk(KERN_CONT "%s", c->x86_model_id);
1063 +
1064 + if (c->x86_mask || c->cpuid_level >= 0)
1065 + printk(KERN_CONT " stepping %02x\n", c->x86_mask);
1066 + else
1067 + printk(KERN_CONT "\n");
1068 +}
1069 +
1070 +static __init int setup_disablecpuid(char *arg)
1071 +{
1072 + int bit;
1073 + if (get_option(&arg, &bit) && bit < NCAPINTS*32)
1074 + setup_clear_cpu_cap(bit);
1075 + else
1076 + return 0;
1077 + return 1;
1078 +}
1079 +__setup("clearcpuid=", setup_disablecpuid);
1080 +
1081 +cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
1082 +
1083 +struct x8664_pda **_cpu_pda __read_mostly;
1084 +EXPORT_SYMBOL(_cpu_pda);
1085 +
1086 +#ifndef CONFIG_X86_NO_IDT
1087 +struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
1088 +#endif
1089 +
1090 +char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss;
1091 +
1092 +unsigned long __supported_pte_mask __read_mostly = ~0UL;
1093 +EXPORT_SYMBOL_GPL(__supported_pte_mask);
1094 +
1095 +static int do_not_nx __cpuinitdata;
1096 +
1097 +/* noexec=on|off
1098 +Control non executable mappings for 64bit processes.
1099 +
1100 +on Enable(default)
1101 +off Disable
1102 +*/
1103 +static int __init nonx_setup(char *str)
1104 +{
1105 + if (!str)
1106 + return -EINVAL;
1107 + if (!strncmp(str, "on", 2)) {
1108 + __supported_pte_mask |= _PAGE_NX;
1109 + do_not_nx = 0;
1110 + } else if (!strncmp(str, "off", 3)) {
1111 + do_not_nx = 1;
1112 + __supported_pte_mask &= ~_PAGE_NX;
1113 + }
1114 + return 0;
1115 +}
1116 +early_param("noexec", nonx_setup);
1117 +
1118 +int force_personality32;
1119 +
1120 +/* noexec32=on|off
1121 +Control non executable heap for 32bit processes.
1122 +To control the stack too use noexec=off
1123 +
1124 +on PROT_READ does not imply PROT_EXEC for 32bit processes (default)
1125 +off PROT_READ implies PROT_EXEC
1126 +*/
1127 +static int __init nonx32_setup(char *str)
1128 +{
1129 + if (!strcmp(str, "on"))
1130 + force_personality32 &= ~READ_IMPLIES_EXEC;
1131 + else if (!strcmp(str, "off"))
1132 + force_personality32 |= READ_IMPLIES_EXEC;
1133 + return 1;
1134 +}
1135 +__setup("noexec32=", nonx32_setup);
1136 +
1137 +static void __init_refok switch_pt(int cpu)
1138 +{
1139 +#ifdef CONFIG_XEN
1140 + if (cpu == 0)
1141 + xen_init_pt();
1142 + xen_pt_switch(__pa_symbol(init_level4_pgt));
1143 + xen_new_user_pt(__pa_symbol(__user_pgd(init_level4_pgt)));
1144 +#endif
1145 +}
1146 +
1147 +void pda_init(int cpu)
1148 +{
1149 + struct x8664_pda *pda = cpu_pda(cpu);
1150 +
1151 + /* Setup up data that may be needed in __get_free_pages early */
1152 + loadsegment(fs, 0);
1153 + loadsegment(gs, 0);
1154 +#ifndef CONFIG_XEN
1155 + /* Memory clobbers used to order PDA accessed */
1156 + mb();
1157 + wrmsrl(MSR_GS_BASE, pda);
1158 + mb();
1159 +#else
1160 + if (HYPERVISOR_set_segment_base(SEGBASE_GS_KERNEL,
1161 + (unsigned long)pda))
1162 + BUG();
1163 +#endif
1164 +
1165 + pda->cpunumber = cpu;
1166 + pda->irqcount = -1;
1167 + pda->kernelstack = (unsigned long)stack_thread_info() -
1168 + PDA_STACKOFFSET + THREAD_SIZE;
1169 + pda->active_mm = &init_mm;
1170 + pda->mmu_state = 0;
1171 +
1172 + if (cpu == 0) {
1173 + /* others are initialized in smpboot.c */
1174 + pda->pcurrent = &init_task;
1175 + pda->irqstackptr = boot_cpu_stack;
1176 + pda->irqstackptr += IRQSTACKSIZE - 64;
1177 + } else {
1178 + if (!pda->irqstackptr) {
1179 + pda->irqstackptr = (char *)
1180 + __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
1181 + if (!pda->irqstackptr)
1182 + panic("cannot allocate irqstack for cpu %d",
1183 + cpu);
1184 + pda->irqstackptr += IRQSTACKSIZE - 64;
1185 + }
1186 +
1187 + if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE)
1188 + pda->nodenumber = cpu_to_node(cpu);
1189 + }
1190 +
1191 + switch_pt(cpu);
1192 +}
1193 +
1194 +#ifndef CONFIG_X86_NO_TSS
1195 +char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
1196 + DEBUG_STKSZ] __page_aligned_bss;
1197 +#endif
1198 +
1199 +extern asmlinkage void ignore_sysret(void);
1200 +
1201 +void __cpuinit syscall_init(void)
1202 +{
1203 +#ifndef CONFIG_XEN
1204 + /*
1205 + * LSTAR and STAR live in a bit strange symbiosis.
1206 + * They both write to the same internal register. STAR allows to
1207 + * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip.
1208 + */
1209 + wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
1210 + wrmsrl(MSR_LSTAR, system_call);
1211 + wrmsrl(MSR_CSTAR, ignore_sysret);
1212 +
1213 + /* Flags to clear on syscall */
1214 + wrmsrl(MSR_SYSCALL_MASK,
1215 + X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
1216 +#endif
1217 +#ifdef CONFIG_IA32_EMULATION
1218 + syscall32_cpu_init();
1219 +#else
1220 + static const struct callback_register __cpuinitconst cstar = {
1221 + .type = CALLBACKTYPE_syscall32,
1222 + .address = (unsigned long)ignore_sysret
1223 + };
1224 +
1225 + if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar))
1226 + printk(KERN_WARN "Unable to register CSTAR callback\n");
1227 +#endif
1228 +}
1229 +
1230 +void __cpuinit check_efer(void)
1231 +{
1232 + unsigned long efer;
1233 +
1234 + rdmsrl(MSR_EFER, efer);
1235 + if (!(efer & EFER_NX) || do_not_nx)
1236 + __supported_pte_mask &= ~_PAGE_NX;
1237 +}
1238 +
1239 +unsigned long kernel_eflags;
1240 +
1241 +#ifndef CONFIG_X86_NO_TSS
1242 +/*
1243 + * Copies of the original ist values from the tss are only accessed during
1244 + * debugging, no special alignment required.
1245 + */
1246 +DEFINE_PER_CPU(struct orig_ist, orig_ist);
1247 +#endif
1248 +
1249 +/*
1250 + * cpu_init() initializes state that is per-CPU. Some data is already
1251 + * initialized (naturally) in the bootstrap process, such as the GDT
1252 + * and IDT. We reload them nevertheless, this function acts as a
1253 + * 'CPU state barrier', nothing should get across.
1254 + * A lot of state is already set up in PDA init.
1255 + */
1256 +void __cpuinit cpu_init(void)
1257 +{
1258 + int cpu = stack_smp_processor_id();
1259 +#ifndef CONFIG_X86_NO_TSS
1260 + struct tss_struct *t = &per_cpu(init_tss, cpu);
1261 + struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
1262 + unsigned long v;
1263 + char *estacks = NULL;
1264 + int i;
1265 +#endif
1266 + struct task_struct *me;
1267 +
1268 + /* CPU 0 is initialised in head64.c */
1269 + if (cpu != 0)
1270 + pda_init(cpu);
1271 +#ifndef CONFIG_X86_NO_TSS
1272 + else
1273 + estacks = boot_exception_stacks;
1274 +#endif
1275 +
1276 + me = current;
1277 +
1278 + if (cpu_test_and_set(cpu, cpu_initialized))
1279 + panic("CPU#%d already initialized!\n", cpu);
1280 +
1281 + printk(KERN_INFO "Initializing CPU#%d\n", cpu);
1282 +
1283 + clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
1284 +
1285 + /*
1286 + * Initialize the per-CPU GDT with the boot GDT,
1287 + * and set up the GDT descriptor:
1288 + */
1289 +
1290 + switch_to_new_gdt();
1291 +#ifndef CONFIG_X86_NO_IDT
1292 + load_idt((const struct desc_ptr *)&idt_descr);
1293 +#endif
1294 +
1295 + memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
1296 + syscall_init();
1297 +
1298 + wrmsrl(MSR_FS_BASE, 0);
1299 + wrmsrl(MSR_KERNEL_GS_BASE, 0);
1300 + barrier();
1301 +
1302 + check_efer();
1303 +
1304 +#ifndef CONFIG_X86_NO_TSS
1305 + /*
1306 + * set up and load the per-CPU TSS
1307 + */
1308 + if (!orig_ist->ist[0]) {
1309 + static const unsigned int order[N_EXCEPTION_STACKS] = {
1310 + [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
1311 + [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
1312 + };
1313 + for (v = 0; v < N_EXCEPTION_STACKS; v++) {
1314 + if (cpu) {
1315 + estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
1316 + if (!estacks)
1317 + panic("Cannot allocate exception "
1318 + "stack %ld %d\n", v, cpu);
1319 + }
1320 + estacks += PAGE_SIZE << order[v];
1321 + orig_ist->ist[v] = t->x86_tss.ist[v] =
1322 + (unsigned long)estacks;
1323 + }
1324 + }
1325 +
1326 + t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
1327 + /*
1328 + * <= is required because the CPU will access up to
1329 + * 8 bits beyond the end of the IO permission bitmap.
1330 + */
1331 + for (i = 0; i <= IO_BITMAP_LONGS; i++)
1332 + t->io_bitmap[i] = ~0UL;
1333 +#endif
1334 +
1335 + atomic_inc(&init_mm.mm_count);
1336 + me->active_mm = &init_mm;
1337 + if (me->mm)
1338 + BUG();
1339 + enter_lazy_tlb(&init_mm, me);
1340 +
1341 + load_sp0(t, &current->thread);
1342 +#ifndef CONFIG_X86_NO_TSS
1343 + set_tss_desc(cpu, t);
1344 + load_TR_desc();
1345 +#endif
1346 + load_LDT(&init_mm.context);
1347 +
1348 +#ifdef CONFIG_KGDB
1349 + /*
1350 + * If the kgdb is connected no debug regs should be altered. This
1351 + * is only applicable when KGDB and a KGDB I/O module are built
1352 + * into the kernel and you are using early debugging with
1353 + * kgdbwait. KGDB will control the kernel HW breakpoint registers.
1354 + */
1355 + if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
1356 + arch_kgdb_ops.correct_hw_break();
1357 + else {
1358 +#endif
1359 + /*
1360 + * Clear all 6 debug registers:
1361 + */
1362 +
1363 + set_debugreg(0UL, 0);
1364 + set_debugreg(0UL, 1);
1365 + set_debugreg(0UL, 2);
1366 + set_debugreg(0UL, 3);
1367 + set_debugreg(0UL, 6);
1368 + set_debugreg(0UL, 7);
1369 +#ifdef CONFIG_KGDB
1370 + /* If the kgdb is connected no debug regs should be altered. */
1371 + }
1372 +#endif
1373 +
1374 + fpu_init();
1375 +
1376 + asm ("pushfq; popq %0" : "=rm" (kernel_eflags));
1377 + if (raw_irqs_disabled())
1378 + kernel_eflags &= ~X86_EFLAGS_IF;
1379 +
1380 + if (is_uv_system())
1381 + uv_cpu_init();
1382 +}
1383 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
1384 +++ sle11-2009-06-04/arch/x86/kernel/e820-xen.c 2009-06-04 10:21:39.000000000 +0200
1385 @@ -0,0 +1,1545 @@
1386 +/*
1387 + * Handle the memory map.
1388 + * The functions here do the job until bootmem takes over.
1389 + *
1390 + * Getting sanitize_e820_map() in sync with i386 version by applying change:
1391 + * - Provisions for empty E820 memory regions (reported by certain BIOSes).
1392 + * Alex Achenbach <xela@slit.de>, December 2002.
1393 + * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
1394 + *
1395 + */
1396 +#include <linux/kernel.h>
1397 +#include <linux/types.h>
1398 +#include <linux/init.h>
1399 +#include <linux/bootmem.h>
1400 +#include <linux/ioport.h>
1401 +#include <linux/string.h>
1402 +#include <linux/kexec.h>
1403 +#include <linux/module.h>
1404 +#include <linux/mm.h>
1405 +#include <linux/pfn.h>
1406 +#include <linux/suspend.h>
1407 +#include <linux/firmware-map.h>
1408 +
1409 +#include <asm/pgtable.h>
1410 +#include <asm/page.h>
1411 +#include <asm/e820.h>
1412 +#include <asm/proto.h>
1413 +#include <asm/setup.h>
1414 +#include <xen/interface/memory.h>
1415 +
1416 +/*
1417 + * The e820 map is the map that gets modified e.g. with command line parameters
1418 + * and that is also registered with modifications in the kernel resource tree
1419 + * with the iomem_resource as parent.
1420 + *
1421 + * The e820_saved is directly saved after the BIOS-provided memory map is
1422 + * copied. It doesn't get modified afterwards. It's registered for the
1423 + * /sys/firmware/memmap interface.
1424 + *
1425 + * That memory map is not modified and is used as base for kexec. The kexec'd
1426 + * kernel should get the same memory map as the firmware provides. Then the
1427 + * user can e.g. boot the original kernel with mem=1G while still booting the
1428 + * next kernel with full memory.
1429 + */
1430 +struct e820map e820;
1431 +#ifndef CONFIG_XEN
1432 +struct e820map e820_saved;
1433 +#else
1434 +static struct e820map machine_e820;
1435 +#define e820_saved machine_e820
1436 +#endif
1437 +
1438 +/* For PCI or other memory-mapped resources */
1439 +unsigned long pci_mem_start = 0xaeedbabe;
1440 +#ifdef CONFIG_PCI
1441 +EXPORT_SYMBOL(pci_mem_start);
1442 +#endif
1443 +
1444 +/*
1445 + * This function checks if any part of the range <start,end> is mapped
1446 + * with type.
1447 + */
1448 +int
1449 +e820_any_mapped(u64 start, u64 end, unsigned type)
1450 +{
1451 + int i;
1452 +
1453 +#ifndef CONFIG_XEN
1454 + for (i = 0; i < e820.nr_map; i++) {
1455 + struct e820entry *ei = &e820.map[i];
1456 +#else
1457 + if (!is_initial_xendomain())
1458 + return 0;
1459 + for (i = 0; i < machine_e820.nr_map; ++i) {
1460 + const struct e820entry *ei = &machine_e820.map[i];
1461 +#endif
1462 +
1463 + if (type && ei->type != type)
1464 + continue;
1465 + if (ei->addr >= end || ei->addr + ei->size <= start)
1466 + continue;
1467 + return 1;
1468 + }
1469 + return 0;
1470 +}
1471 +EXPORT_SYMBOL_GPL(e820_any_mapped);
1472 +
1473 +/*
1474 + * This function checks if the entire range <start,end> is mapped with type.
1475 + *
1476 + * Note: this function only works correct if the e820 table is sorted and
1477 + * not-overlapping, which is the case
1478 + */
1479 +int __init e820_all_mapped(u64 start, u64 end, unsigned type)
1480 +{
1481 + int i;
1482 +
1483 +#ifndef CONFIG_XEN
1484 + for (i = 0; i < e820.nr_map; i++) {
1485 + struct e820entry *ei = &e820.map[i];
1486 +#else
1487 + if (!is_initial_xendomain())
1488 + return 0;
1489 + for (i = 0; i < machine_e820.nr_map; ++i) {
1490 + const struct e820entry *ei = &machine_e820.map[i];
1491 +#endif
1492 +
1493 + if (type && ei->type != type)
1494 + continue;
1495 + /* is the region (part) in overlap with the current region ?*/
1496 + if (ei->addr >= end || ei->addr + ei->size <= start)
1497 + continue;
1498 +
1499 + /* if the region is at the beginning of <start,end> we move
1500 + * start to the end of the region since it's ok until there
1501 + */
1502 + if (ei->addr <= start)
1503 + start = ei->addr + ei->size;
1504 + /*
1505 + * if start is now at or beyond end, we're done, full
1506 + * coverage
1507 + */
1508 + if (start >= end)
1509 + return 1;
1510 + }
1511 + return 0;
1512 +}
1513 +
1514 +/*
1515 + * Add a memory region to the kernel e820 map.
1516 + */
1517 +void __init e820_add_region(u64 start, u64 size, int type)
1518 +{
1519 + int x = e820.nr_map;
1520 +
1521 + if (x == ARRAY_SIZE(e820.map)) {
1522 + printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
1523 + return;
1524 + }
1525 +
1526 + e820.map[x].addr = start;
1527 + e820.map[x].size = size;
1528 + e820.map[x].type = type;
1529 + e820.nr_map++;
1530 +}
1531 +
1532 +void __init e820_print_map(char *who)
1533 +{
1534 + int i;
1535 +
1536 + for (i = 0; i < e820.nr_map; i++) {
1537 + printk(KERN_INFO " %s: %016Lx - %016Lx ", who,
1538 + (unsigned long long) e820.map[i].addr,
1539 + (unsigned long long)
1540 + (e820.map[i].addr + e820.map[i].size));
1541 + switch (e820.map[i].type) {
1542 + case E820_RAM:
1543 + case E820_RESERVED_KERN:
1544 + printk(KERN_CONT "(usable)\n");
1545 + break;
1546 + case E820_RESERVED:
1547 + printk(KERN_CONT "(reserved)\n");
1548 + break;
1549 + case E820_ACPI:
1550 + printk(KERN_CONT "(ACPI data)\n");
1551 + break;
1552 + case E820_NVS:
1553 + printk(KERN_CONT "(ACPI NVS)\n");
1554 + break;
1555 + default:
1556 + printk(KERN_CONT "type %u\n", e820.map[i].type);
1557 + break;
1558 + }
1559 + }
1560 +}
1561 +
1562 +/*
1563 + * Sanitize the BIOS e820 map.
1564 + *
1565 + * Some e820 responses include overlapping entries. The following
1566 + * replaces the original e820 map with a new one, removing overlaps,
1567 + * and resolving conflicting memory types in favor of highest
1568 + * numbered type.
1569 + *
1570 + * The input parameter biosmap points to an array of 'struct
1571 + * e820entry' which on entry has elements in the range [0, *pnr_map)
1572 + * valid, and which has space for up to max_nr_map entries.
1573 + * On return, the resulting sanitized e820 map entries will be in
1574 + * overwritten in the same location, starting at biosmap.
1575 + *
1576 + * The integer pointed to by pnr_map must be valid on entry (the
1577 + * current number of valid entries located at biosmap) and will
1578 + * be updated on return, with the new number of valid entries
1579 + * (something no more than max_nr_map.)
1580 + *
1581 + * The return value from sanitize_e820_map() is zero if it
1582 + * successfully 'sanitized' the map entries passed in, and is -1
1583 + * if it did nothing, which can happen if either of (1) it was
1584 + * only passed one map entry, or (2) any of the input map entries
1585 + * were invalid (start + size < start, meaning that the size was
1586 + * so big the described memory range wrapped around through zero.)
1587 + *
1588 + * Visually we're performing the following
1589 + * (1,2,3,4 = memory types)...
1590 + *
1591 + * Sample memory map (w/overlaps):
1592 + * ____22__________________
1593 + * ______________________4_
1594 + * ____1111________________
1595 + * _44_____________________
1596 + * 11111111________________
1597 + * ____________________33__
1598 + * ___________44___________
1599 + * __________33333_________
1600 + * ______________22________
1601 + * ___________________2222_
1602 + * _________111111111______
1603 + * _____________________11_
1604 + * _________________4______
1605 + *
1606 + * Sanitized equivalent (no overlap):
1607 + * 1_______________________
1608 + * _44_____________________
1609 + * ___1____________________
1610 + * ____22__________________
1611 + * ______11________________
1612 + * _________1______________
1613 + * __________3_____________
1614 + * ___________44___________
1615 + * _____________33_________
1616 + * _______________2________
1617 + * ________________1_______
1618 + * _________________4______
1619 + * ___________________2____
1620 + * ____________________33__
1621 + * ______________________4_
1622 + */
1623 +
1624 +int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
1625 + int *pnr_map)
1626 +{
1627 + struct change_member {
1628 + struct e820entry *pbios; /* pointer to original bios entry */
1629 + unsigned long long addr; /* address for this change point */
1630 + };
1631 + static struct change_member change_point_list[2*E820_X_MAX] __initdata;
1632 + static struct change_member *change_point[2*E820_X_MAX] __initdata;
1633 + static struct e820entry *overlap_list[E820_X_MAX] __initdata;
1634 + static struct e820entry new_bios[E820_X_MAX] __initdata;
1635 + struct change_member *change_tmp;
1636 + unsigned long current_type, last_type;
1637 + unsigned long long last_addr;
1638 + int chgidx, still_changing;
1639 + int overlap_entries;
1640 + int new_bios_entry;
1641 + int old_nr, new_nr, chg_nr;
1642 + int i;
1643 +
1644 + /* if there's only one memory region, don't bother */
1645 +#ifdef CONFIG_XEN
1646 + if (*pnr_map == 1)
1647 + return 0;
1648 +#endif
1649 + if (*pnr_map < 2)
1650 + return -1;
1651 +
1652 + old_nr = *pnr_map;
1653 + BUG_ON(old_nr > max_nr_map);
1654 +
1655 + /* bail out if we find any unreasonable addresses in bios map */
1656 + for (i = 0; i < old_nr; i++)
1657 + if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
1658 + return -1;
1659 +
1660 + /* create pointers for initial change-point information (for sorting) */
1661 + for (i = 0; i < 2 * old_nr; i++)
1662 + change_point[i] = &change_point_list[i];
1663 +
1664 + /* record all known change-points (starting and ending addresses),
1665 + omitting those that are for empty memory regions */
1666 + chgidx = 0;
1667 + for (i = 0; i < old_nr; i++) {
1668 + if (biosmap[i].size != 0) {
1669 + change_point[chgidx]->addr = biosmap[i].addr;
1670 + change_point[chgidx++]->pbios = &biosmap[i];
1671 + change_point[chgidx]->addr = biosmap[i].addr +
1672 + biosmap[i].size;
1673 + change_point[chgidx++]->pbios = &biosmap[i];
1674 + }
1675 + }
1676 + chg_nr = chgidx;
1677 +
1678 + /* sort change-point list by memory addresses (low -> high) */
1679 + still_changing = 1;
1680 + while (still_changing) {
1681 + still_changing = 0;
1682 + for (i = 1; i < chg_nr; i++) {
1683 + unsigned long long curaddr, lastaddr;
1684 + unsigned long long curpbaddr, lastpbaddr;
1685 +
1686 + curaddr = change_point[i]->addr;
1687 + lastaddr = change_point[i - 1]->addr;
1688 + curpbaddr = change_point[i]->pbios->addr;
1689 + lastpbaddr = change_point[i - 1]->pbios->addr;
1690 +
1691 + /*
1692 + * swap entries, when:
1693 + *
1694 + * curaddr > lastaddr or
1695 + * curaddr == lastaddr and curaddr == curpbaddr and
1696 + * lastaddr != lastpbaddr
1697 + */
1698 + if (curaddr < lastaddr ||
1699 + (curaddr == lastaddr && curaddr == curpbaddr &&
1700 + lastaddr != lastpbaddr)) {
1701 + change_tmp = change_point[i];
1702 + change_point[i] = change_point[i-1];
1703 + change_point[i-1] = change_tmp;
1704 + still_changing = 1;
1705 + }
1706 + }
1707 + }
1708 +
1709 + /* create a new bios memory map, removing overlaps */
1710 + overlap_entries = 0; /* number of entries in the overlap table */
1711 + new_bios_entry = 0; /* index for creating new bios map entries */
1712 + last_type = 0; /* start with undefined memory type */
1713 + last_addr = 0; /* start with 0 as last starting address */
1714 +
1715 + /* loop through change-points, determining affect on the new bios map */
1716 + for (chgidx = 0; chgidx < chg_nr; chgidx++) {
1717 + /* keep track of all overlapping bios entries */
1718 + if (change_point[chgidx]->addr ==
1719 + change_point[chgidx]->pbios->addr) {
1720 + /*
1721 + * add map entry to overlap list (> 1 entry
1722 + * implies an overlap)
1723 + */
1724 + overlap_list[overlap_entries++] =
1725 + change_point[chgidx]->pbios;
1726 + } else {
1727 + /*
1728 + * remove entry from list (order independent,
1729 + * so swap with last)
1730 + */
1731 + for (i = 0; i < overlap_entries; i++) {
1732 + if (overlap_list[i] ==
1733 + change_point[chgidx]->pbios)
1734 + overlap_list[i] =
1735 + overlap_list[overlap_entries-1];
1736 + }
1737 + overlap_entries--;
1738 + }
1739 + /*
1740 + * if there are overlapping entries, decide which
1741 + * "type" to use (larger value takes precedence --
1742 + * 1=usable, 2,3,4,4+=unusable)
1743 + */
1744 + current_type = 0;
1745 + for (i = 0; i < overlap_entries; i++)
1746 + if (overlap_list[i]->type > current_type)
1747 + current_type = overlap_list[i]->type;
1748 + /*
1749 + * continue building up new bios map based on this
1750 + * information
1751 + */
1752 + if (current_type != last_type) {
1753 + if (last_type != 0) {
1754 + new_bios[new_bios_entry].size =
1755 + change_point[chgidx]->addr - last_addr;
1756 + /*
1757 + * move forward only if the new size
1758 + * was non-zero
1759 + */
1760 + if (new_bios[new_bios_entry].size != 0)
1761 + /*
1762 + * no more space left for new
1763 + * bios entries ?
1764 + */
1765 + if (++new_bios_entry >= max_nr_map)
1766 + break;
1767 + }
1768 + if (current_type != 0) {
1769 + new_bios[new_bios_entry].addr =
1770 + change_point[chgidx]->addr;
1771 + new_bios[new_bios_entry].type = current_type;
1772 + last_addr = change_point[chgidx]->addr;
1773 + }
1774 + last_type = current_type;
1775 + }
1776 + }
1777 + /* retain count for new bios entries */
1778 + new_nr = new_bios_entry;
1779 +
1780 + /* copy new bios mapping into original location */
1781 + memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry));
1782 + *pnr_map = new_nr;
1783 +
1784 + return 0;
1785 +}
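
Editor's aside, not part of the patch: the hunk above flattens overlapping BIOS entries by walking sorted change points, and the tie-break it applies is simply "the numerically larger e820 type wins" (1 = usable RAM, higher values are flavours of unusable). The throwaway userspace C program below — all names invented for illustration — shows that precedence rule on a two-entry toy map:

#include <stdio.h>

/* Toy illustration (not kernel code): the effective type at any address is
 * the largest type value among all entries covering it, which is the
 * precedence rule sanitize_e820_map() applies when flattening overlaps. */
struct toy_entry { unsigned long long start, end; unsigned type; };

static unsigned effective_type(const struct toy_entry *map, int n,
                               unsigned long long addr)
{
	unsigned t = 0;
	int i;

	for (i = 0; i < n; i++)
		if (addr >= map[i].start && addr < map[i].end &&
		    map[i].type > t)
			t = map[i].type;
	return t;			/* 0 means "no entry covers addr" */
}

int main(void)
{
	/* Overlapping sample: RAM (1) from 0-1M, reserved (2) from 640K-1M */
	struct toy_entry map[] = {
		{ 0x00000, 0x100000, 1 },
		{ 0xa0000, 0x100000, 2 },
	};
	unsigned long long probes[] = { 0x50000, 0xa0000, 0xf0000 };
	int i;

	for (i = 0; i < 3; i++)
		printf("0x%llx -> type %u\n", probes[i],
		       effective_type(map, 2, probes[i]));
	return 0;			/* prints 1, 2, 2: reserved wins */
}
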
1786 +
1787 +static int __init __append_e820_map(struct e820entry *biosmap, int nr_map)
1788 +{
1789 + while (nr_map) {
1790 + u64 start = biosmap->addr;
1791 + u64 size = biosmap->size;
1792 + u64 end = start + size;
1793 + u32 type = biosmap->type;
1794 +
1795 + /* Overflow in 64 bits? Ignore the memory map. */
1796 + if (start > end)
1797 + return -1;
1798 +
1799 + e820_add_region(start, size, type);
1800 +
1801 + biosmap++;
1802 + nr_map--;
1803 + }
1804 + return 0;
1805 +}
1806 +
1807 +/*
1808 + * Copy the BIOS e820 map into a safe place.
1809 + *
1810 + * Sanity-check it while we're at it..
1811 + *
1812 + * If we're lucky and live on a modern system, the setup code
1813 + * will have given us a memory map that we can use to properly
1814 + * set up memory. If we aren't, we'll fake a memory map.
1815 + */
1816 +static int __init append_e820_map(struct e820entry *biosmap, int nr_map)
1817 +{
1818 +#ifndef CONFIG_XEN
1819 + /* Only one memory region (or negative)? Ignore it */
1820 + if (nr_map < 2)
1821 + return -1;
1822 +#else
1823 + BUG_ON(nr_map < 1);
1824 +#endif
1825 +
1826 + return __append_e820_map(biosmap, nr_map);
1827 +}
1828 +
1829 +static u64 __init e820_update_range_map(struct e820map *e820x, u64 start,
1830 + u64 size, unsigned old_type,
1831 + unsigned new_type)
1832 +{
1833 + unsigned int i, x;
1834 + u64 real_updated_size = 0;
1835 +
1836 + BUG_ON(old_type == new_type);
1837 +
1838 + if (size > (ULLONG_MAX - start))
1839 + size = ULLONG_MAX - start;
1840 +
1841 + for (i = 0; i < e820x->nr_map; i++) {
1842 + struct e820entry *ei = &e820x->map[i];
1843 + u64 final_start, final_end;
1844 + if (ei->type != old_type)
1845 + continue;
1846 + /* totally covered? */
1847 + if (ei->addr >= start &&
1848 + (ei->addr + ei->size) <= (start + size)) {
1849 + ei->type = new_type;
1850 + real_updated_size += ei->size;
1851 + continue;
1852 + }
1853 + /* partially covered */
1854 + final_start = max(start, ei->addr);
1855 + final_end = min(start + size, ei->addr + ei->size);
1856 + if (final_start >= final_end)
1857 + continue;
1858 +
1859 + x = e820x->nr_map;
1860 + if (x == ARRAY_SIZE(e820x->map)) {
1861 + printk(KERN_ERR "Too many memory map entries!\n");
1862 + break;
1863 + }
1864 + e820x->map[x].addr = final_start;
1865 + e820x->map[x].size = final_end - final_start;
1866 + e820x->map[x].type = new_type;
1867 + e820x->nr_map++;
1868 +
1869 + real_updated_size += final_end - final_start;
1870 +
1871 + if (ei->addr < final_start)
1872 + continue;
1873 + ei->addr = final_end;
1874 + ei->size -= final_end - final_start;
1875 + }
1876 + return real_updated_size;
1877 +}
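
A short editorial note on the clamp near the top of e820_update_range_map() above: start + size can wrap past 2^64, so size is first limited to ULLONG_MAX - start. The small standalone program below (illustration only, not patch code) shows the wraparound that the clamp avoids:

#include <stdio.h>
#include <limits.h>

int main(void)
{
	unsigned long long start = 0xfffffffff0000000ULL;
	unsigned long long size  = 0x20000000ULL;	/* overshoots 2^64 */

	printf("naive end  : %#llx (wrapped)\n", start + size);

	if (size > ULLONG_MAX - start)
		size = ULLONG_MAX - start;
	printf("clamped end: %#llx\n", start + size);
	return 0;
}
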
1878 +
1879 +u64 __init e820_update_range(u64 start, u64 size, unsigned old_type,
1880 + unsigned new_type)
1881 +{
1882 + return e820_update_range_map(&e820, start, size, old_type, new_type);
1883 +}
1884 +
1885 +static u64 __init e820_update_range_saved(u64 start, u64 size,
1886 + unsigned old_type, unsigned new_type)
1887 +{
1888 +#ifdef CONFIG_XEN
1889 + if (is_initial_xendomain())
1890 + return e820_update_range_map(&machine_e820,
1891 + phys_to_machine(start), size,
1892 + old_type, new_type);
1893 +#endif
1894 + return e820_update_range_map(&e820_saved, start, size, old_type,
1895 + new_type);
1896 +}
1897 +
1898 +/* make e820 not cover the range */
1899 +u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type,
1900 + int checktype)
1901 +{
1902 + int i;
1903 + u64 real_removed_size = 0;
1904 +
1905 + if (size > (ULLONG_MAX - start))
1906 + size = ULLONG_MAX - start;
1907 +
1908 + for (i = 0; i < e820.nr_map; i++) {
1909 + struct e820entry *ei = &e820.map[i];
1910 + u64 final_start, final_end;
1911 +
1912 + if (checktype && ei->type != old_type)
1913 + continue;
1914 + /* totally covered? */
1915 + if (ei->addr >= start &&
1916 + (ei->addr + ei->size) <= (start + size)) {
1917 + real_removed_size += ei->size;
1918 + memset(ei, 0, sizeof(struct e820entry));
1919 + continue;
1920 + }
1921 + /* partially covered */
1922 + final_start = max(start, ei->addr);
1923 + final_end = min(start + size, ei->addr + ei->size);
1924 + if (final_start >= final_end)
1925 + continue;
1926 + real_removed_size += final_end - final_start;
1927 +
1928 + ei->size -= final_end - final_start;
1929 + if (ei->addr < final_start)
1930 + continue;
1931 + ei->addr = final_end;
1932 + }
1933 + return real_removed_size;
1934 +}
1935 +
1936 +void __init update_e820(void)
1937 +{
1938 + int nr_map;
1939 +
1940 + nr_map = e820.nr_map;
1941 + if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr_map))
1942 + return;
1943 + e820.nr_map = nr_map;
1944 + printk(KERN_INFO "modified physical RAM map:\n");
1945 + e820_print_map("modified");
1946 +}
1947 +static void __init update_e820_saved(void)
1948 +{
1949 + int nr_map;
1950 +
1951 + nr_map = e820_saved.nr_map;
1952 + if (sanitize_e820_map(e820_saved.map, ARRAY_SIZE(e820_saved.map), &nr_map))
1953 + return;
1954 + e820_saved.nr_map = nr_map;
1955 +}
1956 +
1957 +#ifdef CONFIG_XEN
1958 +#define e820 machine_e820
1959 +#endif
1960 +
1961 +#define MAX_GAP_END 0x100000000ull
1962 +/*
1963 + * Search for a gap in the e820 memory space from start_addr to end_addr.
1964 + */
1965 +__init int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize,
1966 + unsigned long start_addr, unsigned long long end_addr)
1967 +{
1968 + unsigned long long last;
1969 + int i = e820.nr_map;
1970 + int found = 0;
1971 +
1972 + last = (end_addr && end_addr < MAX_GAP_END) ? end_addr : MAX_GAP_END;
1973 +#ifdef CONFIG_X86_64
1974 + if (start_addr >= MAX_GAP_END)
1975 + last = end_addr ?: (1UL << boot_cpu_data.x86_phys_bits);
1976 +#endif
1977 +
1978 + while (--i >= 0) {
1979 + unsigned long long start = e820.map[i].addr;
1980 + unsigned long long end = start + e820.map[i].size;
1981 +
1982 + if (end < start_addr)
1983 + continue;
1984 +
1985 + /*
1986 + * Since "last" is at most 4GB, we know we'll
1987 + * fit in 32 bits if this condition is true
1988 + */
1989 + if (last > end) {
1990 + unsigned long gap = last - end;
1991 +
1992 + if (gap >= *gapsize) {
1993 + *gapsize = gap;
1994 + *gapstart = end;
1995 + found = 1;
1996 + }
1997 + }
1998 + if (start < last)
1999 + last = start;
2000 + }
2001 + return found;
2002 +}
2003 +
2004 +/*
2005 + * Search for the biggest gap in the low 32 bits of the e820
2006 + * memory space. We pass this space to PCI to assign MMIO resources
2007 + * for hotplug or unconfigured devices in.
2008 + * Hopefully the BIOS left enough space.
2009 + */
2010 +__init void e820_setup_gap(void)
2011 +{
2012 + unsigned long gapstart, gapsize, round;
2013 + int found;
2014 +
2015 + gapstart = 0x10000000;
2016 + gapsize = 0x400000;
2017 + found = e820_search_gap(&gapstart, &gapsize, 0, MAX_GAP_END);
2018 +
2019 +#ifdef CONFIG_X86_64
2020 + if (!found) {
2021 + printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit "
2022 + "address range\n"
2023 + KERN_ERR "PCI: Unassigned devices with 32bit resource "
2024 + "registers may break!\n");
2025 + found = e820_search_gap(&gapstart, &gapsize, MAX_GAP_END, 0);
2026 + BUG_ON(!found);
2027 + }
2028 +#endif
2029 +
2030 + /*
2031 + * See how much we want to round up: start off with
2032 + * rounding to the next 1MB area.
2033 + */
2034 + round = 0x100000;
2035 + while ((gapsize >> 4) > round)
2036 + round += round;
2037 + /* Fun with two's complement */
2038 + pci_mem_start = (gapstart + round) & -round;
2039 +
2040 + printk(KERN_INFO
2041 + "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
2042 + pci_mem_start, gapstart, gapsize);
2043 +}
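
Editorial illustration, not part of the patch: the "fun with two's complement" step above relies on round being a power of two, so that & -round masks the sum back down to a round boundary, landing strictly above gapstart. A minimal sketch of that arithmetic, with made-up sample values:

#include <stdio.h>

/* (x + round) & -round, round a power of two: next multiple of 'round'
 * strictly above x (x + round itself when x is already aligned). */
static unsigned long round_past(unsigned long x, unsigned long round)
{
	return (x + round) & -round;
}

int main(void)
{
	printf("%#lx -> %#lx\n", 0xcfe12345UL, round_past(0xcfe12345UL, 0x100000UL));
	printf("%#lx -> %#lx\n", 0xd0000000UL, round_past(0xd0000000UL, 0x100000UL));
	return 0;
}
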
2044 +
2045 +#undef e820
2046 +
2047 +#ifndef CONFIG_XEN
2048 +/**
2049 + * Because of the size limitation of struct boot_params, only the first
2050 + * 128 E820 memory entries are passed to the kernel via
2051 + * boot_params.e820_map; the rest are passed via the SETUP_E820_EXT node
2052 + * of the linked list of struct setup_data, which is parsed here.
2053 + */
2054 +void __init parse_e820_ext(struct setup_data *sdata, unsigned long pa_data)
2055 +{
2056 + u32 map_len;
2057 + int entries;
2058 + struct e820entry *extmap;
2059 +
2060 + entries = sdata->len / sizeof(struct e820entry);
2061 + map_len = sdata->len + sizeof(struct setup_data);
2062 + if (map_len > PAGE_SIZE)
2063 + sdata = early_ioremap(pa_data, map_len);
2064 + extmap = (struct e820entry *)(sdata->data);
2065 + __append_e820_map(extmap, entries);
2066 + sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
2067 + if (map_len > PAGE_SIZE)
2068 + early_iounmap(sdata, map_len);
2069 + printk(KERN_INFO "extended physical RAM map:\n");
2070 + e820_print_map("extended");
2071 +}
2072 +
2073 +#if defined(CONFIG_X86_64) || \
2074 + (defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION))
2075 +/**
2076 + * Find the ranges of physical addresses that do not correspond to
2077 + * e820 RAM areas and mark the corresponding pages as nosave for
2078 + * hibernation (32 bit) or software suspend and suspend to RAM (64 bit).
2079 + *
2080 + * This function requires the e820 map to be sorted and without any
2081 + * overlapping entries and assumes the first e820 area to be RAM.
2082 + */
2083 +void __init e820_mark_nosave_regions(unsigned long limit_pfn)
2084 +{
2085 + int i;
2086 + unsigned long pfn;
2087 +
2088 + pfn = PFN_DOWN(e820.map[0].addr + e820.map[0].size);
2089 + for (i = 1; i < e820.nr_map; i++) {
2090 + struct e820entry *ei = &e820.map[i];
2091 +
2092 + if (pfn < PFN_UP(ei->addr))
2093 + register_nosave_region(pfn, PFN_UP(ei->addr));
2094 +
2095 + pfn = PFN_DOWN(ei->addr + ei->size);
2096 + if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN)
2097 + register_nosave_region(PFN_UP(ei->addr), pfn);
2098 +
2099 + if (pfn >= limit_pfn)
2100 + break;
2101 + }
2102 +}
2103 +#endif
2104 +#endif
2105 +
2106 +/*
2107 + * Early reserved memory areas.
2108 + */
2109 +#define MAX_EARLY_RES 20
2110 +
2111 +struct early_res {
2112 + u64 start, end;
2113 + char name[16];
2114 + char overlap_ok;
2115 +};
2116 +static struct early_res early_res[MAX_EARLY_RES] __initdata = {
2117 +#ifndef CONFIG_XEN
2118 + { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */
2119 +#if defined(CONFIG_X86_64) && defined(CONFIG_X86_TRAMPOLINE)
2120 + { TRAMPOLINE_BASE, TRAMPOLINE_BASE + 2 * PAGE_SIZE, "TRAMPOLINE" },
2121 +#endif
2122 +#if defined(CONFIG_X86_32) && defined(CONFIG_SMP)
2123 + /*
2124 + * But first pinch a few for the stack/trampoline stuff
2125 + * FIXME: Don't need the extra page at 4K, but need to fix
2126 + * trampoline before removing it. (see the GDT stuff)
2127 + */
2128 + { PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE" },
2129 + /*
2130 + * Has to be in very low memory so we can execute
2131 + * real-mode AP code.
2132 + */
2133 + { TRAMPOLINE_BASE, TRAMPOLINE_BASE + PAGE_SIZE, "TRAMPOLINE" },
2134 +#endif
2135 +#endif
2136 + {}
2137 +};
2138 +
2139 +static int __init find_overlapped_early(u64 start, u64 end)
2140 +{
2141 + int i;
2142 + struct early_res *r;
2143 +
2144 + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
2145 + r = &early_res[i];
2146 + if (end > r->start && start < r->end)
2147 + break;
2148 + }
2149 +
2150 + return i;
2151 +}
2152 +
2153 +/*
2154 + * Drop the i-th range from the early reservation map,
2155 + * by copying any higher ranges down one over it, and
2156 + * clearing what had been the last slot.
2157 + */
2158 +static void __init drop_range(int i)
2159 +{
2160 + int j;
2161 +
2162 + for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++)
2163 + ;
2164 +
2165 + memmove(&early_res[i], &early_res[i + 1],
2166 + (j - 1 - i) * sizeof(struct early_res));
2167 +
2168 + early_res[j - 1].end = 0;
2169 +}
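
As an editorial aside (nothing here is patch content): drop_range() above is the classic "delete from a packed array" idiom — slide the tail down one slot with memmove() and clear what used to be the last live slot. A toy standalone version using plain integers, where 0 plays the role of the end == 0 "unused" marker:

#include <stdio.h>
#include <string.h>

#define NSLOTS 6

int main(void)
{
	int res[NSLOTS] = { 10, 20, 30, 40, 0, 0 };	/* 0 == unused slot */
	int i = 1, j, used;

	for (used = 0; used < NSLOTS && res[used]; used++)
		;					/* count live slots */

	memmove(&res[i], &res[i + 1], (used - 1 - i) * sizeof(res[0]));
	res[used - 1] = 0;				/* clear old last slot */

	for (j = 0; j < NSLOTS; j++)
		printf("%d ", res[j]);
	printf("\n");					/* 10 30 40 0 0 0 */
	return 0;
}
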
2170 +
2171 +/*
2172 + * Split any existing ranges that:
2173 + * 1) are marked 'overlap_ok', and
2174 + * 2) overlap with the stated range [start, end)
2175 + * into whatever portion (if any) of the existing range is entirely
2176 + * below or entirely above the stated range. Drop the portion
2177 + * of the existing range that overlaps with the stated range,
2178 + * which will allow the caller of this routine to then add that
2179 + * stated range without conflicting with any existing range.
2180 + */
2181 +static void __init drop_overlaps_that_are_ok(u64 start, u64 end)
2182 +{
2183 + int i;
2184 + struct early_res *r;
2185 + u64 lower_start, lower_end;
2186 + u64 upper_start, upper_end;
2187 + char name[16];
2188 +
2189 + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
2190 + r = &early_res[i];
2191 +
2192 + /* Continue past non-overlapping ranges */
2193 + if (end <= r->start || start >= r->end)
2194 + continue;
2195 +
2196 + /*
2197 + * Leave non-ok overlaps as is; let caller
2198 + * panic "Overlapping early reservations"
2199 + * when it hits this overlap.
2200 + */
2201 + if (!r->overlap_ok)
2202 + return;
2203 +
2204 + /*
2205 + * We have an ok overlap. We will drop it from the early
2206 + * reservation map, and add back in any non-overlapping
2207 + * portions (lower or upper) as separate, overlap_ok,
2208 + * non-overlapping ranges.
2209 + */
2210 +
2211 + /* 1. Note any non-overlapping (lower or upper) ranges. */
2212 + strncpy(name, r->name, sizeof(name) - 1);
2213 +
2214 + lower_start = lower_end = 0;
2215 + upper_start = upper_end = 0;
2216 + if (r->start < start) {
2217 + lower_start = r->start;
2218 + lower_end = start;
2219 + }
2220 + if (r->end > end) {
2221 + upper_start = end;
2222 + upper_end = r->end;
2223 + }
2224 +
2225 + /* 2. Drop the original ok overlapping range */
2226 + drop_range(i);
2227 +
2228 + i--; /* resume for-loop on copied down entry */
2229 +
2230 + /* 3. Add back in any non-overlapping ranges. */
2231 + if (lower_end)
2232 + reserve_early_overlap_ok(lower_start, lower_end, name);
2233 + if (upper_end)
2234 + reserve_early_overlap_ok(upper_start, upper_end, name);
2235 + }
2236 +}
2237 +
2238 +static void __init __reserve_early(u64 start, u64 end, char *name,
2239 + int overlap_ok)
2240 +{
2241 + int i;
2242 + struct early_res *r;
2243 +
2244 + i = find_overlapped_early(start, end);
2245 + if (i >= MAX_EARLY_RES)
2246 + panic("Too many early reservations");
2247 + r = &early_res[i];
2248 + if (r->end)
2249 + panic("Overlapping early reservations "
2250 + "%llx-%llx %s to %llx-%llx %s\n",
2251 + start, end - 1, name?name:"", r->start,
2252 + r->end - 1, r->name);
2253 + r->start = start;
2254 + r->end = end;
2255 + r->overlap_ok = overlap_ok;
2256 + if (name)
2257 + strncpy(r->name, name, sizeof(r->name) - 1);
2258 +}
2259 +
2260 +/*
2261 + * A few early reservations come here.
2262 + *
2263 + * The 'overlap_ok' in the name of this routine does -not- mean it
2264 + * is ok for these reservations to overlap an earlier reservation.
2265 + * Rather it means that it is ok for subsequent reservations to
2266 + * overlap this one.
2267 + *
2268 + * Use this entry point to reserve early ranges when you are doing
2269 + * so out of "Paranoia", reserving perhaps more memory than you need,
2270 + * just in case, and don't mind a subsequent overlapping reservation
2271 + * that is known to be needed.
2272 + *
2273 + * The drop_overlaps_that_are_ok() call here isn't really needed.
2274 + * It would be needed if we had two colliding 'overlap_ok'
2275 + * reservations, so that the second such would not panic on the
2276 + * overlap with the first. We don't have any such as of this
2277 + * writing, but might as well tolerate such if it happens in
2278 + * the future.
2279 + */
2280 +void __init reserve_early_overlap_ok(u64 start, u64 end, char *name)
2281 +{
2282 + drop_overlaps_that_are_ok(start, end);
2283 + __reserve_early(start, end, name, 1);
2284 +}
2285 +
2286 +/*
2287 + * Most early reservations come here.
2288 + *
2289 + * We first have drop_overlaps_that_are_ok() drop any pre-existing
2290 + * 'overlap_ok' ranges, so that we can then reserve this memory
2291 + * range without risk of panic'ing on an overlapping overlap_ok
2292 + * early reservation.
2293 + */
2294 +void __init reserve_early(u64 start, u64 end, char *name)
2295 +{
2296 + drop_overlaps_that_are_ok(start, end);
2297 + __reserve_early(start, end, name, 0);
2298 +}
2299 +
2300 +void __init free_early(u64 start, u64 end)
2301 +{
2302 + struct early_res *r;
2303 + int i;
2304 +
2305 + i = find_overlapped_early(start, end);
2306 + r = &early_res[i];
2307 + if (i >= MAX_EARLY_RES || r->end != end || r->start != start)
2308 + panic("free_early on not reserved area: %llx-%llx!",
2309 + start, end - 1);
2310 +
2311 + drop_range(i);
2312 +}
2313 +
2314 +void __init early_res_to_bootmem(u64 start, u64 end)
2315 +{
2316 + int i, count;
2317 + u64 final_start, final_end;
2318 +
2319 + count = 0;
2320 + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++)
2321 + count++;
2322 +
2323 + printk(KERN_INFO "(%d early reservations) ==> bootmem [%010llx - %010llx]\n",
2324 + count, start, end);
2325 + for (i = 0; i < count; i++) {
2326 + struct early_res *r = &early_res[i];
2327 + printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i,
2328 + r->start, r->end, r->name);
2329 + final_start = max(start, r->start);
2330 + final_end = min(end, r->end);
2331 + if (final_start >= final_end) {
2332 + printk(KERN_CONT "\n");
2333 + continue;
2334 + }
2335 + printk(KERN_CONT " ==> [%010llx - %010llx]\n",
2336 + final_start, final_end);
2337 + reserve_bootmem_generic(final_start, final_end - final_start,
2338 + BOOTMEM_DEFAULT);
2339 + }
2340 +}
2341 +
2342 +/* Check for already reserved areas */
2343 +static inline int __init bad_addr(u64 *addrp, u64 size, u64 align)
2344 +{
2345 + int i;
2346 + u64 addr = *addrp;
2347 + int changed = 0;
2348 + struct early_res *r;
2349 +again:
2350 + i = find_overlapped_early(addr, addr + size);
2351 + r = &early_res[i];
2352 + if (i < MAX_EARLY_RES && r->end) {
2353 + *addrp = addr = round_up(r->end, align);
2354 + changed = 1;
2355 + goto again;
2356 + }
2357 + return changed;
2358 +}
2359 +
2360 +/* Check for already reserved areas */
2361 +static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align)
2362 +{
2363 + int i;
2364 + u64 addr = *addrp, last;
2365 + u64 size = *sizep;
2366 + int changed = 0;
2367 +again:
2368 + last = addr + size;
2369 + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
2370 + struct early_res *r = &early_res[i];
2371 + if (last > r->start && addr < r->start) {
2372 + size = r->start - addr;
2373 + changed = 1;
2374 + goto again;
2375 + }
2376 + if (last > r->end && addr < r->end) {
2377 + addr = round_up(r->end, align);
2378 + size = last - addr;
2379 + changed = 1;
2380 + goto again;
2381 + }
2382 + if (last <= r->end && addr >= r->start) {
2383 + (*sizep)++;
2384 + return 0;
2385 + }
2386 + }
2387 + if (changed) {
2388 + *addrp = addr;
2389 + *sizep = size;
2390 + }
2391 + return changed;
2392 +}
2393 +
2394 +/*
2395 + * Find a free area with specified alignment in a specific range.
2396 + */
2397 +u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align)
2398 +{
2399 + int i;
2400 +
2401 + for (i = 0; i < e820.nr_map; i++) {
2402 + struct e820entry *ei = &e820.map[i];
2403 + u64 addr, last;
2404 + u64 ei_last;
2405 +
2406 + if (ei->type != E820_RAM)
2407 + continue;
2408 + addr = round_up(ei->addr, align);
2409 + ei_last = ei->addr + ei->size;
2410 + if (addr < start)
2411 + addr = round_up(start, align);
2412 + if (addr >= ei_last)
2413 + continue;
2414 + while (bad_addr(&addr, size, align) && addr+size <= ei_last)
2415 + ;
2416 + last = addr + size;
2417 + if (last > ei_last)
2418 + continue;
2419 + if (last > end)
2420 + continue;
2421 + return addr;
2422 + }
2423 + return -1ULL;
2424 +}
2425 +
2426 +/*
2427 + * Find next free range after *start
2428 + */
2429 +u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align)
2430 +{
2431 + int i;
2432 +
2433 + for (i = 0; i < e820.nr_map; i++) {
2434 + struct e820entry *ei = &e820.map[i];
2435 + u64 addr, last;
2436 + u64 ei_last;
2437 +
2438 + if (ei->type != E820_RAM)
2439 + continue;
2440 + addr = round_up(ei->addr, align);
2441 + ei_last = ei->addr + ei->size;
2442 + if (addr < start)
2443 + addr = round_up(start, align);
2444 + if (addr >= ei_last)
2445 + continue;
2446 + *sizep = ei_last - addr;
2447 + while (bad_addr_size(&addr, sizep, align) &&
2448 + addr + *sizep <= ei_last)
2449 + ;
2450 + last = addr + *sizep;
2451 + if (last > ei_last)
2452 + continue;
2453 + return addr;
2454 + }
2455 +
2456 + return -1ULL;
2457 +}
2458 +
2459 +/*
2460 + * Pre-allocate 4K and reserve it in the e820 map.
2461 + */
2462 +u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align)
2463 +{
2464 + u64 size = 0;
2465 + u64 addr;
2466 + u64 start;
2467 +#ifdef CONFIG_XEN
2468 + unsigned int order = get_order(sizet);
2469 +
2470 + if (is_initial_xendomain()) {
2471 + sizet = PAGE_SIZE << order;
2472 + if (align < PAGE_SIZE)
2473 + align = PAGE_SIZE;
2474 + }
2475 +#endif
2476 + for (start = startt; ; start += size) {
2477 + start = find_e820_area_size(start, &size, align);
2478 + if (!(start + 1))
2479 + return 0;
2480 + if (size >= sizet)
2481 + break;
2482 + }
2483 +
2484 +#ifdef CONFIG_X86_32
2485 + if (start >= MAXMEM)
2486 + return 0;
2487 + if (start + size > MAXMEM)
2488 + size = MAXMEM - start;
2489 +#endif
2490 +#ifdef CONFIG_XEN
2491 + if ((start >> PAGE_SHIFT) >= xen_start_info->nr_pages)
2492 + return 0;
2493 + if (PFN_UP(start + size) > xen_start_info->nr_pages)
2494 + size = ((u64)xen_start_info->nr_pages << PAGE_SHIFT) - start;
2495 +#endif
2496 +
2497 + addr = round_down(start + size - sizet, align);
2498 + if (addr < start)
2499 + return 0;
2500 +#ifdef CONFIG_XEN
2501 + if (is_initial_xendomain()) {
2502 + int rc;
2503 + unsigned long max_initmap_pfn;
2504 +
2505 + max_initmap_pfn = ALIGN(PFN_UP(__pa(xen_start_info->pt_base))
2506 + + xen_start_info->nr_pt_frames
2507 + + 1 + (1 << (19 - PAGE_SHIFT)),
2508 + 1UL << (22 - PAGE_SHIFT));
2509 +#ifdef CONFIG_X86_32
2510 + if ((addr >> PAGE_SHIFT)
2511 + < max(max_initmap_pfn, max_pfn_mapped))
2512 + rc = xen_create_contiguous_region((unsigned long)
2513 + __va(addr),
2514 + order, 32);
2515 +#else
2516 + if ((addr >> PAGE_SHIFT) < max_pfn_mapped)
2517 + rc = xen_create_contiguous_region((unsigned long)
2518 + __va(addr),
2519 + order, 32);
2520 + else if ((addr >> PAGE_SHIFT) < max_initmap_pfn)
2521 + rc = xen_create_contiguous_region(__START_KERNEL_map
2522 + + addr,
2523 + order, 32);
2524 +#endif
2525 + else
2526 + rc = early_create_contiguous_region(addr >> PAGE_SHIFT,
2527 + order, 32);
2528 + if (rc)
2529 + return 0;
2530 + }
2531 +#endif
2532 + e820_update_range(addr, sizet, E820_RAM, E820_RESERVED);
2533 + e820_update_range_saved(addr, sizet, E820_RAM, E820_RESERVED);
2534 + printk(KERN_INFO "update e820 for early_reserve_e820\n");
2535 + update_e820();
2536 + update_e820_saved();
2537 +
2538 + return addr;
2539 +}
2540 +
2541 +#ifdef CONFIG_X86_32
2542 +# ifdef CONFIG_X86_PAE
2543 +# define MAX_ARCH_PFN (1ULL<<(40-PAGE_SHIFT))
2544 +# else
2545 +# define MAX_ARCH_PFN (1ULL<<(32-PAGE_SHIFT))
2546 +# endif
2547 +#else /* CONFIG_X86_32 */
2548 +# define MAX_ARCH_PFN MAXMEM>>PAGE_SHIFT
2549 +#endif
2550 +
2551 +/*
2552 + * Find the highest page frame number we have available
2553 + */
2554 +static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
2555 +{
2556 + int i;
2557 + unsigned long last_pfn = 0;
2558 + unsigned long max_arch_pfn = MAX_ARCH_PFN;
2559 +
2560 + for (i = 0; i < e820.nr_map; i++) {
2561 + struct e820entry *ei = &e820.map[i];
2562 + unsigned long start_pfn;
2563 + unsigned long end_pfn;
2564 +
2565 + if (ei->type != type)
2566 + continue;
2567 +
2568 + start_pfn = ei->addr >> PAGE_SHIFT;
2569 + end_pfn = (ei->addr + ei->size) >> PAGE_SHIFT;
2570 +
2571 + if (start_pfn >= limit_pfn)
2572 + continue;
2573 + if (end_pfn > limit_pfn) {
2574 + last_pfn = limit_pfn;
2575 + break;
2576 + }
2577 + if (end_pfn > last_pfn)
2578 + last_pfn = end_pfn;
2579 + }
2580 +
2581 + if (last_pfn > max_arch_pfn)
2582 + last_pfn = max_arch_pfn;
2583 +
2584 + printk(KERN_INFO "last_pfn = %#lx max_arch_pfn = %#lx\n",
2585 + last_pfn, max_arch_pfn);
2586 + return last_pfn;
2587 +}
2588 +unsigned long __init e820_end_of_ram_pfn(void)
2589 +{
2590 + return e820_end_pfn(MAX_ARCH_PFN, E820_RAM);
2591 +}
2592 +
2593 +unsigned long __init e820_end_of_low_ram_pfn(void)
2594 +{
2595 + return e820_end_pfn(1UL<<(32 - PAGE_SHIFT), E820_RAM);
2596 +}
2597 +/*
2598 + * Finds an active region in the address range from start_pfn to last_pfn and
2599 + * returns its range in ei_startpfn and ei_endpfn for the e820 entry.
2600 + */
2601 +int __init e820_find_active_region(const struct e820entry *ei,
2602 + unsigned long start_pfn,
2603 + unsigned long last_pfn,
2604 + unsigned long *ei_startpfn,
2605 + unsigned long *ei_endpfn)
2606 +{
2607 + u64 align = PAGE_SIZE;
2608 +
2609 + *ei_startpfn = round_up(ei->addr, align) >> PAGE_SHIFT;
2610 + *ei_endpfn = round_down(ei->addr + ei->size, align) >> PAGE_SHIFT;
2611 +
2612 + /* Skip map entries smaller than a page */
2613 + if (*ei_startpfn >= *ei_endpfn)
2614 + return 0;
2615 +
2616 + /* Skip if map is outside the node */
2617 + if (ei->type != E820_RAM || *ei_endpfn <= start_pfn ||
2618 + *ei_startpfn >= last_pfn)
2619 + return 0;
2620 +
2621 + /* Check for overlaps */
2622 + if (*ei_startpfn < start_pfn)
2623 + *ei_startpfn = start_pfn;
2624 + if (*ei_endpfn > last_pfn)
2625 + *ei_endpfn = last_pfn;
2626 +
2627 + return 1;
2628 +}
2629 +
2630 +/* Walk the e820 map and register active regions within a node */
2631 +void __init e820_register_active_regions(int nid, unsigned long start_pfn,
2632 + unsigned long last_pfn)
2633 +{
2634 + unsigned long ei_startpfn;
2635 + unsigned long ei_endpfn;
2636 + int i;
2637 +
2638 + for (i = 0; i < e820.nr_map; i++)
2639 + if (e820_find_active_region(&e820.map[i],
2640 + start_pfn, last_pfn,
2641 + &ei_startpfn, &ei_endpfn))
2642 + add_active_range(nid, ei_startpfn, ei_endpfn);
2643 +}
2644 +
2645 +/*
2646 + * Find the hole size (in bytes) in the memory range.
2647 + * @start: starting address of the memory range to scan
2648 + * @end: ending address of the memory range to scan
2649 + */
2650 +u64 __init e820_hole_size(u64 start, u64 end)
2651 +{
2652 + unsigned long start_pfn = start >> PAGE_SHIFT;
2653 + unsigned long last_pfn = end >> PAGE_SHIFT;
2654 + unsigned long ei_startpfn, ei_endpfn, ram = 0;
2655 + int i;
2656 +
2657 + for (i = 0; i < e820.nr_map; i++) {
2658 + if (e820_find_active_region(&e820.map[i],
2659 + start_pfn, last_pfn,
2660 + &ei_startpfn, &ei_endpfn))
2661 + ram += ei_endpfn - ei_startpfn;
2662 + }
2663 + return end - start - ((u64)ram << PAGE_SHIFT);
2664 +}
2665 +
2666 +static void early_panic(char *msg)
2667 +{
2668 + early_printk(msg);
2669 + panic(msg);
2670 +}
2671 +
2672 +static int userdef __initdata;
2673 +
2674 +/* "mem=nopentium" disables the 4MB page tables; a plain "mem=<size>" caps usable RAM. */
2675 +static int __init parse_memopt(char *p)
2676 +{
2677 + u64 mem_size, current_end;
2678 + unsigned int i;
2679 +
2680 + if (!p)
2681 + return -EINVAL;
2682 +
2683 +#ifdef CONFIG_X86_32
2684 + if (!strcmp(p, "nopentium")) {
2685 + setup_clear_cpu_cap(X86_FEATURE_PSE);
2686 + return 0;
2687 + }
2688 +#endif
2689 +
2690 + userdef = 1;
2691 + mem_size = memparse(p, &p);
2692 + e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
2693 +
2694 + i = e820.nr_map - 1;
2695 + current_end = e820.map[i].addr + e820.map[i].size;
2696 + if (current_end < mem_size) {
2697 + /*
2698 + * The e820 map ends before our requested size so
2699 + * extend the final entry to the requested address.
2700 + */
2701 + if (e820.map[i].type == E820_RAM)
2702 + e820.map[i].size = mem_size - e820.map[i].addr;
2703 + else
2704 + e820_add_region(current_end, mem_size - current_end, E820_RAM);
2705 + }
2706 +
2707 + return 0;
2708 +}
2709 +early_param("mem", parse_memopt);
2710 +
2711 +#ifndef CONFIG_XEN
2712 +static int __init parse_memmap_opt(char *p)
2713 +{
2714 + char *oldp;
2715 + u64 start_at, mem_size;
2716 +
2717 + if (!p)
2718 + return -EINVAL;
2719 +
2720 + if (!strncmp(p, "exactmap", 8)) {
2721 +#ifdef CONFIG_CRASH_DUMP
2722 + /*
2723 + * If we are doing a crash dump, we still need to know
2724 + * the real mem size before original memory map is
2725 + * reset.
2726 + */
2727 + saved_max_pfn = e820_end_of_ram_pfn();
2728 +#endif
2729 + e820.nr_map = 0;
2730 + userdef = 1;
2731 + return 0;
2732 + }
2733 +
2734 + oldp = p;
2735 + mem_size = memparse(p, &p);
2736 + if (p == oldp)
2737 + return -EINVAL;
2738 +
2739 + userdef = 1;
2740 + if (*p == '@') {
2741 + start_at = memparse(p+1, &p);
2742 + e820_add_region(start_at, mem_size, E820_RAM);
2743 + } else if (*p == '#') {
2744 + start_at = memparse(p+1, &p);
2745 + e820_add_region(start_at, mem_size, E820_ACPI);
2746 + } else if (*p == '$') {
2747 + start_at = memparse(p+1, &p);
2748 + e820_add_region(start_at, mem_size, E820_RESERVED);
2749 + } else
2750 + e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
2751 +
2752 + return *p == '\0' ? 0 : -EINVAL;
2753 +}
2754 +early_param("memmap", parse_memmap_opt);
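
For readers unfamiliar with the boot option handled just above, here is an editorial sketch (not kernel code; my_memparse is a simplified stand-in for the kernel's memparse()) of how the memmap= syntax decodes: <size>@<start> adds RAM, <size>#<start> adds ACPI data, <size>$<start> adds a reserved range, and a bare <size> truncates RAM above that limit:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Simplified memparse(): number with optional K/M/G suffix. */
static unsigned long long my_memparse(const char *p, char **endp)
{
	unsigned long long v = strtoull(p, endp, 0);

	switch (**endp) {
	case 'G': case 'g':
		v <<= 10;		/* fall through */
	case 'M': case 'm':
		v <<= 10;		/* fall through */
	case 'K': case 'k':
		v <<= 10;
		(*endp)++;
		break;
	}
	return v;
}

static void parse_memmap_arg(const char *arg)
{
	char *p;
	unsigned long long start;
	unsigned long long size = my_memparse(arg, &p);

	switch (*p) {
	case '@':
		start = my_memparse(p + 1, &p);
		printf("add RAM      %#llx - %#llx\n", start, start + size);
		break;
	case '#':
		start = my_memparse(p + 1, &p);
		printf("add ACPI     %#llx - %#llx\n", start, start + size);
		break;
	case '$':
		start = my_memparse(p + 1, &p);
		printf("add reserved %#llx - %#llx\n", start, start + size);
		break;
	default:
		printf("truncate RAM above %#llx\n", size);
	}
}

int main(void)
{
	parse_memmap_arg("64M@1G");	/* 64 MiB of RAM at 1 GiB */
	parse_memmap_arg("4K$0x9f000");	/* reserve one page */
	parse_memmap_arg("512M");	/* cap usable memory */
	return 0;
}
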
2755 +
2756 +void __init finish_e820_parsing(void)
2757 +{
2758 + if (userdef) {
2759 + int nr = e820.nr_map;
2760 +
2761 + if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr) < 0)
2762 + early_panic("Invalid user supplied memory map");
2763 + e820.nr_map = nr;
2764 +
2765 + printk(KERN_INFO "user-defined physical RAM map:\n");
2766 + e820_print_map("user");
2767 + }
2768 +}
2769 +#endif
2770 +
2771 +static inline const char *e820_type_to_string(int e820_type)
2772 +{
2773 + switch (e820_type) {
2774 + case E820_RESERVED_KERN:
2775 + case E820_RAM: return "System RAM";
2776 + case E820_ACPI: return "ACPI Tables";
2777 + case E820_NVS: return "ACPI Non-volatile Storage";
2778 + default: return "reserved";
2779 + }
2780 +}
2781 +
2782 +#ifdef CONFIG_XEN
2783 +#define e820 machine_e820
2784 +#endif
2785 +
2786 +/*
2787 + * Mark e820 reserved areas as busy for the resource manager.
2788 + */
2789 +void __init e820_reserve_resources(void)
2790 +{
2791 + int i;
2792 + struct resource *res;
2793 + u64 end;
2794 +
2795 + res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map);
2796 + for (i = 0; i < e820.nr_map; i++) {
2797 + end = e820.map[i].addr + e820.map[i].size - 1;
2798 +#ifndef CONFIG_RESOURCES_64BIT
2799 + if (end > 0x100000000ULL) {
2800 + res++;
2801 + continue;
2802 + }
2803 +#endif
2804 + res->name = e820_type_to_string(e820.map[i].type);
2805 + res->start = e820.map[i].addr;
2806 + res->end = end;
2807 +
2808 + res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
2809 + insert_resource(&iomem_resource, res);
2810 + res++;
2811 + }
2812 +
2813 + for (i = 0; i < e820_saved.nr_map; i++) {
2814 + struct e820entry *entry = &e820_saved.map[i];
2815 + firmware_map_add_early(entry->addr,
2816 + entry->addr + entry->size - 1,
2817 + e820_type_to_string(entry->type));
2818 + }
2819 +}
2820 +
2821 +#undef e820
2822 +
2823 +#ifndef CONFIG_XEN
2824 +char *__init default_machine_specific_memory_setup(void)
2825 +{
2826 + char *who = "BIOS-e820";
2827 + int new_nr;
2828 + /*
2829 + * Try to copy the BIOS-supplied E820-map.
2830 + *
2831 + * Otherwise fake a memory map; one section from 0k->640k,
2832 + * the next section from 1mb->appropriate_mem_k
2833 + */
2834 + new_nr = boot_params.e820_entries;
2835 + sanitize_e820_map(boot_params.e820_map,
2836 + ARRAY_SIZE(boot_params.e820_map),
2837 + &new_nr);
2838 + boot_params.e820_entries = new_nr;
2839 + if (append_e820_map(boot_params.e820_map, boot_params.e820_entries)
2840 + < 0) {
2841 + u64 mem_size;
2842 +
2843 + /* compare results from other methods and take the greater */
2844 + if (boot_params.alt_mem_k
2845 + < boot_params.screen_info.ext_mem_k) {
2846 + mem_size = boot_params.screen_info.ext_mem_k;
2847 + who = "BIOS-88";
2848 + } else {
2849 + mem_size = boot_params.alt_mem_k;
2850 + who = "BIOS-e801";
2851 + }
2852 +
2853 + e820.nr_map = 0;
2854 + e820_add_region(0, LOWMEMSIZE(), E820_RAM);
2855 + e820_add_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
2856 + }
2857 +
2858 + /* In case someone cares... */
2859 + return who;
2860 +}
2861 +
2862 +char *__init __attribute__((weak)) machine_specific_memory_setup(void)
2863 +{
2864 + if (x86_quirks->arch_memory_setup) {
2865 + char *who = x86_quirks->arch_memory_setup();
2866 +
2867 + if (who)
2868 + return who;
2869 + }
2870 + return default_machine_specific_memory_setup();
2871 +}
2872 +#endif
2873 +
2874 +char * __init memory_setup(void)
2875 +{
2876 + int rc, nr_map;
2877 + struct xen_memory_map memmap;
2878 + /*
2879 + * This is rather large for a stack variable but this early in
2880 + * the boot process we know we have plenty of slack space.
2881 + */
2882 + struct e820entry map[E820MAX];
2883 +
2884 + memmap.nr_entries = E820MAX;
2885 + set_xen_guest_handle(memmap.buffer, map);
2886 +
2887 + rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
2888 + if (rc == -ENOSYS) {
2889 + memmap.nr_entries = 1;
2890 + map[0].addr = 0ULL;
2891 + map[0].size = PFN_PHYS((unsigned long long)xen_start_info->nr_pages);
2892 + /* 8MB slack (to balance backend allocations). */
2893 + map[0].size += 8ULL << 20;
2894 + map[0].type = E820_RAM;
2895 + rc = 0;
2896 + }
2897 + BUG_ON(rc);
2898 +
2899 + nr_map = memmap.nr_entries;
2900 + sanitize_e820_map(map, ARRAY_SIZE(map), &nr_map);
2901 +
2902 + if (append_e820_map(map, nr_map) < 0)
2903 + BUG();
2904 +
2905 +#ifdef CONFIG_XEN
2906 + if (is_initial_xendomain()) {
2907 + memmap.nr_entries = E820MAX;
2908 + set_xen_guest_handle(memmap.buffer, machine_e820.map);
2909 +
2910 + if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap))
2911 + BUG();
2912 + machine_e820.nr_map = memmap.nr_entries;
2913 + }
2914 +#endif
2915 +
2916 + return "Xen";
2917 +}
2918 +
2919 +void __init setup_memory_map(void)
2920 +{
2921 + char *who;
2922 +
2923 + who = memory_setup();
2924 +#ifdef CONFIG_XEN
2925 + if (!is_initial_xendomain())
2926 +#endif
2927 + memcpy(&e820_saved, &e820, sizeof(struct e820map));
2928 + printk(KERN_INFO "Xen-provided physical RAM map:\n");
2929 + e820_print_map(who);
2930 +}
2931 --- sle11-2009-06-04.orig/arch/x86/kernel/e820_32-xen.c 2009-03-16 16:38:05.000000000 +0100
2932 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
2933 @@ -1,873 +0,0 @@
2934 -#include <linux/kernel.h>
2935 -#include <linux/types.h>
2936 -#include <linux/init.h>
2937 -#include <linux/bootmem.h>
2938 -#include <linux/ioport.h>
2939 -#include <linux/string.h>
2940 -#include <linux/kexec.h>
2941 -#include <linux/module.h>
2942 -#include <linux/mm.h>
2943 -#include <linux/pfn.h>
2944 -#include <linux/uaccess.h>
2945 -#include <linux/suspend.h>
2946 -
2947 -#include <asm/pgtable.h>
2948 -#include <asm/page.h>
2949 -#include <asm/e820.h>
2950 -#include <asm/setup.h>
2951 -#include <xen/interface/memory.h>
2952 -
2953 -struct e820map e820;
2954 -struct change_member {
2955 - struct e820entry *pbios; /* pointer to original bios entry */
2956 - unsigned long long addr; /* address for this change point */
2957 -};
2958 -static struct change_member change_point_list[2*E820MAX] __initdata;
2959 -static struct change_member *change_point[2*E820MAX] __initdata;
2960 -static struct e820entry *overlap_list[E820MAX] __initdata;
2961 -static struct e820entry new_bios[E820MAX] __initdata;
2962 -/* For PCI or other memory-mapped resources */
2963 -unsigned long pci_mem_start = 0x10000000;
2964 -#ifdef CONFIG_PCI
2965 -EXPORT_SYMBOL(pci_mem_start);
2966 -#endif
2967 -extern int user_defined_memmap;
2968 -
2969 -static struct resource system_rom_resource = {
2970 - .name = "System ROM",
2971 - .start = 0xf0000,
2972 - .end = 0xfffff,
2973 - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
2974 -};
2975 -
2976 -static struct resource extension_rom_resource = {
2977 - .name = "Extension ROM",
2978 - .start = 0xe0000,
2979 - .end = 0xeffff,
2980 - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
2981 -};
2982 -
2983 -static struct resource adapter_rom_resources[] = { {
2984 - .name = "Adapter ROM",
2985 - .start = 0xc8000,
2986 - .end = 0,
2987 - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
2988 -}, {
2989 - .name = "Adapter ROM",
2990 - .start = 0,
2991 - .end = 0,
2992 - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
2993 -}, {
2994 - .name = "Adapter ROM",
2995 - .start = 0,
2996 - .end = 0,
2997 - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
2998 -}, {
2999 - .name = "Adapter ROM",
3000 - .start = 0,
3001 - .end = 0,
3002 - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
3003 -}, {
3004 - .name = "Adapter ROM",
3005 - .start = 0,
3006 - .end = 0,
3007 - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
3008 -}, {
3009 - .name = "Adapter ROM",
3010 - .start = 0,
3011 - .end = 0,
3012 - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
3013 -} };
3014 -
3015 -static struct resource video_rom_resource = {
3016 - .name = "Video ROM",
3017 - .start = 0xc0000,
3018 - .end = 0xc7fff,
3019 - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
3020 -};
3021 -
3022 -#define ROMSIGNATURE 0xaa55
3023 -
3024 -static int __init romsignature(const unsigned char *rom)
3025 -{
3026 - const unsigned short * const ptr = (const unsigned short *)rom;
3027 - unsigned short sig;
3028 -
3029 - return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE;
3030 -}
3031 -
3032 -static int __init romchecksum(const unsigned char *rom, unsigned long length)
3033 -{
3034 - unsigned char sum, c;
3035 -
3036 - for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--)
3037 - sum += c;
3038 - return !length && !sum;
3039 -}
3040 -
3041 -static void __init probe_roms(void)
3042 -{
3043 - const unsigned char *rom;
3044 - unsigned long start, length, upper;
3045 - unsigned char c;
3046 - int i;
3047 -
3048 -#ifdef CONFIG_XEN
3049 - /* Nothing to do if not running in dom0. */
3050 - if (!is_initial_xendomain())
3051 - return;
3052 -#endif
3053 -
3054 - /* video rom */
3055 - upper = adapter_rom_resources[0].start;
3056 - for (start = video_rom_resource.start; start < upper; start += 2048) {
3057 - rom = isa_bus_to_virt(start);
3058 - if (!romsignature(rom))
3059 - continue;
3060 -
3061 - video_rom_resource.start = start;
3062 -
3063 - if (probe_kernel_address(rom + 2, c) != 0)
3064 - continue;
3065 -
3066 - /* 0 < length <= 0x7f * 512, historically */
3067 - length = c * 512;
3068 -
3069 - /* if checksum okay, trust length byte */
3070 - if (length && romchecksum(rom, length))
3071 - video_rom_resource.end = start + length - 1;
3072 -
3073 - request_resource(&iomem_resource, &video_rom_resource);
3074 - break;
3075 - }
3076 -
3077 - start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
3078 - if (start < upper)
3079 - start = upper;
3080 -
3081 - /* system rom */
3082 - request_resource(&iomem_resource, &system_rom_resource);
3083 - upper = system_rom_resource.start;
3084 -
3085 - /* check for extension rom (ignore length byte!) */
3086 - rom = isa_bus_to_virt((unsigned long)extension_rom_resource.start);
3087 - if (romsignature(rom)) {
3088 - length = extension_rom_resource.end - extension_rom_resource.start + 1;
3089 - if (romchecksum(rom, length)) {
3090 - request_resource(&iomem_resource, &extension_rom_resource);
3091 - upper = extension_rom_resource.start;
3092 - }
3093 - }
3094 -
3095 - /* check for adapter roms on 2k boundaries */
3096 - for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) {
3097 - rom = isa_bus_to_virt(start);
3098 - if (!romsignature(rom))
3099 - continue;
3100 -
3101 - if (probe_kernel_address(rom + 2, c) != 0)
3102 - continue;
3103 -
3104 - /* 0 < length <= 0x7f * 512, historically */
3105 - length = c * 512;
3106 -
3107 - /* but accept any length that fits if checksum okay */
3108 - if (!length || start + length > upper || !romchecksum(rom, length))
3109 - continue;
3110 -
3111 - adapter_rom_resources[i].start = start;
3112 - adapter_rom_resources[i].end = start + length - 1;
3113 - request_resource(&iomem_resource, &adapter_rom_resources[i]);
3114 -
3115 - start = adapter_rom_resources[i++].end & ~2047UL;
3116 - }
3117 -}
3118 -
3119 -#ifdef CONFIG_XEN
3120 -static struct e820map machine_e820;
3121 -#define e820 machine_e820
3122 -#endif
3123 -
3124 -/*
3125 - * Request address space for all standard RAM and ROM resources
3126 - * and also for regions reported as reserved by the e820.
3127 - */
3128 -void __init init_iomem_resources(struct resource *code_resource,
3129 - struct resource *data_resource,
3130 - struct resource *bss_resource)
3131 -{
3132 - int i;
3133 -
3134 - probe_roms();
3135 - for (i = 0; i < e820.nr_map; i++) {
3136 - struct resource *res;
3137 -#ifndef CONFIG_RESOURCES_64BIT
3138 - if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL)
3139 - continue;
3140 -#endif
3141 - res = kzalloc(sizeof(struct resource), GFP_ATOMIC);
3142 - switch (e820.map[i].type) {
3143 - case E820_RAM: res->name = "System RAM"; break;
3144 - case E820_ACPI: res->name = "ACPI Tables"; break;
3145 - case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
3146 - default: res->name = "reserved";
3147 - }
3148 - res->start = e820.map[i].addr;
3149 - res->end = res->start + e820.map[i].size - 1;
3150 - res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
3151 - if (request_resource(&iomem_resource, res)) {
3152 - kfree(res);
3153 - continue;
3154 - }
3155 - if (e820.map[i].type == E820_RAM) {
3156 - /*
3157 - * We don't know which RAM region contains kernel data,
3158 - * so we try it repeatedly and let the resource manager
3159 - * test it.
3160 - */
3161 -#ifndef CONFIG_XEN
3162 - request_resource(res, code_resource);
3163 - request_resource(res, data_resource);
3164 - request_resource(res, bss_resource);
3165 -#endif
3166 -#ifdef CONFIG_KEXEC
3167 - if (crashk_res.start != crashk_res.end)
3168 - request_resource(res, &crashk_res);
3169 -#ifdef CONFIG_XEN
3170 - xen_machine_kexec_register_resources(res);
3171 -#endif
3172 -#endif
3173 - }
3174 - }
3175 -}
3176 -
3177 -#undef e820
3178 -
3179 -#if defined(CONFIG_PM) && defined(CONFIG_HIBERNATION)
3180 -/**
3181 - * e820_mark_nosave_regions - Find the ranges of physical addresses that do not
3182 - * correspond to e820 RAM areas and mark the corresponding pages as nosave for
3183 - * hibernation.
3184 - *
3185 - * This function requires the e820 map to be sorted and without any
3186 - * overlapping entries and assumes the first e820 area to be RAM.
3187 - */
3188 -void __init e820_mark_nosave_regions(void)
3189 -{
3190 - int i;
3191 - unsigned long pfn;
3192 -
3193 - pfn = PFN_DOWN(e820.map[0].addr + e820.map[0].size);
3194 - for (i = 1; i < e820.nr_map; i++) {
3195 - struct e820entry *ei = &e820.map[i];
3196 -
3197 - if (pfn < PFN_UP(ei->addr))
3198 - register_nosave_region(pfn, PFN_UP(ei->addr));
3199 -
3200 - pfn = PFN_DOWN(ei->addr + ei->size);
3201 - if (ei->type != E820_RAM)
3202 - register_nosave_region(PFN_UP(ei->addr), pfn);
3203 -
3204 - if (pfn >= max_low_pfn)
3205 - break;
3206 - }
3207 -}
3208 -#endif
3209 -
3210 -void __init add_memory_region(unsigned long long start,
3211 - unsigned long long size, int type)
3212 -{
3213 - int x;
3214 -
3215 - x = e820.nr_map;
3216 -
3217 - if (x == E820MAX) {
3218 - printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
3219 - return;
3220 - }
3221 -
3222 - e820.map[x].addr = start;
3223 - e820.map[x].size = size;
3224 - e820.map[x].type = type;
3225 - e820.nr_map++;
3226 -} /* add_memory_region */
3227 -
3228 -/*
3229 - * Sanitize the BIOS e820 map.
3230 - *
3231 - * Some e820 responses include overlapping entries. The following
3232 - * replaces the original e820 map with a new one, removing overlaps.
3233 - *
3234 - */
3235 -int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
3236 -{
3237 - struct change_member *change_tmp;
3238 - unsigned long current_type, last_type;
3239 - unsigned long long last_addr;
3240 - int chgidx, still_changing;
3241 - int overlap_entries;
3242 - int new_bios_entry;
3243 - int old_nr, new_nr, chg_nr;
3244 - int i;
3245 -
3246 - /*
3247 - Visually we're performing the following (1,2,3,4 = memory types)...
3248 -
3249 - Sample memory map (w/overlaps):
3250 - ____22__________________
3251 - ______________________4_
3252 - ____1111________________
3253 - _44_____________________
3254 - 11111111________________
3255 - ____________________33__
3256 - ___________44___________
3257 - __________33333_________
3258 - ______________22________
3259 - ___________________2222_
3260 - _________111111111______
3261 - _____________________11_
3262 - _________________4______
3263 -
3264 - Sanitized equivalent (no overlap):
3265 - 1_______________________
3266 - _44_____________________
3267 - ___1____________________
3268 - ____22__________________
3269 - ______11________________
3270 - _________1______________
3271 - __________3_____________
3272 - ___________44___________
3273 - _____________33_________
3274 - _______________2________
3275 - ________________1_______
3276 - _________________4______
3277 - ___________________2____
3278 - ____________________33__
3279 - ______________________4_
3280 - */
3281 - /* if there's only one memory region, don't bother */
3282 - if (*pnr_map < 2) {
3283 - return -1;
3284 - }
3285 -
3286 - old_nr = *pnr_map;
3287 -
3288 - /* bail out if we find any unreasonable addresses in bios map */
3289 - for (i=0; i<old_nr; i++)
3290 - if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) {
3291 - return -1;
3292 - }
3293 -
3294 - /* create pointers for initial change-point information (for sorting) */
3295 - for (i=0; i < 2*old_nr; i++)
3296 - change_point[i] = &change_point_list[i];
3297 -
3298 - /* record all known change-points (starting and ending addresses),
3299 - omitting those that are for empty memory regions */
3300 - chgidx = 0;
3301 - for (i=0; i < old_nr; i++) {
3302 - if (biosmap[i].size != 0) {
3303 - change_point[chgidx]->addr = biosmap[i].addr;
3304 - change_point[chgidx++]->pbios = &biosmap[i];
3305 - change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
3306 - change_point[chgidx++]->pbios = &biosmap[i];
3307 - }
3308 - }
3309 - chg_nr = chgidx; /* true number of change-points */
3310 -
3311 - /* sort change-point list by memory addresses (low -> high) */
3312 - still_changing = 1;
3313 - while (still_changing) {
3314 - still_changing = 0;
3315 - for (i=1; i < chg_nr; i++) {
3316 - /* if <current_addr> > <last_addr>, swap */
3317 - /* or, if current=<start_addr> & last=<end_addr>, swap */
3318 - if ((change_point[i]->addr < change_point[i-1]->addr) ||
3319 - ((change_point[i]->addr == change_point[i-1]->addr) &&
3320 - (change_point[i]->addr == change_point[i]->pbios->addr) &&
3321 - (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
3322 - )
3323 - {
3324 - change_tmp = change_point[i];
3325 - change_point[i] = change_point[i-1];
3326 - change_point[i-1] = change_tmp;
3327 - still_changing=1;
3328 - }
3329 - }
3330 - }
3331 -
3332 - /* create a new bios memory map, removing overlaps */
3333 - overlap_entries=0; /* number of entries in the overlap table */
3334 - new_bios_entry=0; /* index for creating new bios map entries */
3335 - last_type = 0; /* start with undefined memory type */
3336 - last_addr = 0; /* start with 0 as last starting address */
3337 - /* loop through change-points, determining affect on the new bios map */
3338 - for (chgidx=0; chgidx < chg_nr; chgidx++)
3339 - {
3340 - /* keep track of all overlapping bios entries */
3341 - if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
3342 - {
3343 - /* add map entry to overlap list (> 1 entry implies an overlap) */
3344 - overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
3345 - }
3346 - else
3347 - {
3348 - /* remove entry from list (order independent, so swap with last) */
3349 - for (i=0; i<overlap_entries; i++)
3350 - {
3351 - if (overlap_list[i] == change_point[chgidx]->pbios)
3352 - overlap_list[i] = overlap_list[overlap_entries-1];
3353 - }
3354 - overlap_entries--;
3355 - }
3356 - /* if there are overlapping entries, decide which "type" to use */
3357 - /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
3358 - current_type = 0;
3359 - for (i=0; i<overlap_entries; i++)
3360 - if (overlap_list[i]->type > current_type)
3361 - current_type = overlap_list[i]->type;
3362 - /* continue building up new bios map based on this information */
3363 - if (current_type != last_type) {
3364 - if (last_type != 0) {
3365 - new_bios[new_bios_entry].size =
3366 - change_point[chgidx]->addr - last_addr;
3367 - /* move forward only if the new size was non-zero */
3368 - if (new_bios[new_bios_entry].size != 0)
3369 - if (++new_bios_entry >= E820MAX)
3370 - break; /* no more space left for new bios entries */
3371 - }
3372 - if (current_type != 0) {
3373 - new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
3374 - new_bios[new_bios_entry].type = current_type;
3375 - last_addr=change_point[chgidx]->addr;
3376 - }
3377 - last_type = current_type;
3378 - }
3379 - }
3380 - new_nr = new_bios_entry; /* retain count for new bios entries */
3381 -
3382 - /* copy new bios mapping into original location */
3383 - memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
3384 - *pnr_map = new_nr;
3385 -
3386 - return 0;
3387 -}
3388 -
3389 -/*
3390 - * Copy the BIOS e820 map into a safe place.
3391 - *
3392 - * Sanity-check it while we're at it..
3393 - *
3394 - * If we're lucky and live on a modern system, the setup code
3395 - * will have given us a memory map that we can use to properly
3396 - * set up memory. If we aren't, we'll fake a memory map.
3397 - *
3398 - * We check to see that the memory map contains at least 2 elements
3399 - * before we'll use it, because the detection code in setup.S may
3400 - * not be perfect and most every PC known to man has two memory
3401 - * regions: one from 0 to 640k, and one from 1mb up. (The IBM
3402 - * thinkpad 560x, for example, does not cooperate with the memory
3403 - * detection code.)
3404 - */
3405 -int __init copy_e820_map(struct e820entry *biosmap, int nr_map)
3406 -{
3407 -#ifndef CONFIG_XEN
3408 - /* Only one memory region (or negative)? Ignore it */
3409 - if (nr_map < 2)
3410 - return -1;
3411 -#else
3412 - BUG_ON(nr_map < 1);
3413 -#endif
3414 -
3415 - do {
3416 - u64 start = biosmap->addr;
3417 - u64 size = biosmap->size;
3418 - u64 end = start + size;
3419 - u32 type = biosmap->type;
3420 -
3421 - /* Overflow in 64 bits? Ignore the memory map. */
3422 - if (start > end)
3423 - return -1;
3424 -
3425 - add_memory_region(start, size, type);
3426 - } while (biosmap++, --nr_map);
3427 -
3428 -#ifdef CONFIG_XEN
3429 - if (is_initial_xendomain()) {
3430 - struct xen_memory_map memmap;
3431 -
3432 - memmap.nr_entries = E820MAX;
3433 - set_xen_guest_handle(memmap.buffer, machine_e820.map);
3434 -
3435 - if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap))
3436 - BUG();
3437 - machine_e820.nr_map = memmap.nr_entries;
3438 - } else
3439 - machine_e820 = e820;
3440 -#endif
3441 -
3442 - return 0;
3443 -}
3444 -
3445 -/*
3446 - * Find the highest page frame number we have available
3447 - */
3448 -void __init propagate_e820_map(void)
3449 -{
3450 - int i;
3451 -
3452 - max_pfn = 0;
3453 -
3454 - for (i = 0; i < e820.nr_map; i++) {
3455 - unsigned long start, end;
3456 - /* RAM? */
3457 - if (e820.map[i].type != E820_RAM)
3458 - continue;
3459 - start = PFN_UP(e820.map[i].addr);
3460 - end = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
3461 - if (start >= end)
3462 - continue;
3463 - if (end > max_pfn)
3464 - max_pfn = end;
3465 - memory_present(0, start, end);
3466 - }
3467 -}
3468 -
3469 -/*
3470 - * Register fully available low RAM pages with the bootmem allocator.
3471 - */
3472 -void __init register_bootmem_low_pages(unsigned long max_low_pfn)
3473 -{
3474 - int i;
3475 -
3476 - for (i = 0; i < e820.nr_map; i++) {
3477 - unsigned long curr_pfn, last_pfn, size;
3478 - /*
3479 - * Reserve usable low memory
3480 - */
3481 - if (e820.map[i].type != E820_RAM)
3482 - continue;
3483 - /*
3484 - * We are rounding up the start address of usable memory:
3485 - */
3486 - curr_pfn = PFN_UP(e820.map[i].addr);
3487 - if (curr_pfn >= max_low_pfn)
3488 - continue;
3489 - /*
3490 - * ... and at the end of the usable range downwards:
3491 - */
3492 - last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
3493 -
3494 -#ifdef CONFIG_XEN
3495 - /*
3496 - * Truncate to the number of actual pages currently
3497 - * present.
3498 - */
3499 - if (last_pfn > xen_start_info->nr_pages)
3500 - last_pfn = xen_start_info->nr_pages;
3501 -#endif
3502 -
3503 - if (last_pfn > max_low_pfn)
3504 - last_pfn = max_low_pfn;
3505 -
3506 - /*
3507 - * .. finally, did all the rounding and playing
3508 - * around just make the area go away?
3509 - */
3510 - if (last_pfn <= curr_pfn)
3511 - continue;
3512 -
3513 - size = last_pfn - curr_pfn;
3514 - free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size));
3515 - }
3516 -}
3517 -
3518 -void __init e820_register_memory(void)
3519 -{
3520 - unsigned long gapstart, gapsize, round;
3521 - unsigned long long last;
3522 - int i;
3523 -
3524 -#ifdef CONFIG_XEN
3525 - if (is_initial_xendomain()) {
3526 - struct xen_memory_map memmap;
3527 -
3528 - memmap.nr_entries = E820MAX;
3529 - set_xen_guest_handle(memmap.buffer, machine_e820.map);
3530 -
3531 - if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap))
3532 - BUG();
3533 - machine_e820.nr_map = memmap.nr_entries;
3534 - }
3535 - else
3536 - machine_e820 = e820;
3537 -#define e820 machine_e820
3538 -#endif
3539 -
3540 - /*
3541 - * Search for the biggest gap in the low 32 bits of the e820
3542 - * memory space.
3543 - */
3544 - last = 0x100000000ull;
3545 - gapstart = 0x10000000;
3546 - gapsize = 0x400000;
3547 - i = e820.nr_map;
3548 - while (--i >= 0) {
3549 - unsigned long long start = e820.map[i].addr;
3550 - unsigned long long end = start + e820.map[i].size;
3551 -
3552 - /*
3553 - * Since "last" is at most 4GB, we know we'll
3554 - * fit in 32 bits if this condition is true
3555 - */
3556 - if (last > end) {
3557 - unsigned long gap = last - end;
3558 -
3559 - if (gap > gapsize) {
3560 - gapsize = gap;
3561 - gapstart = end;
3562 - }
3563 - }
3564 - if (start < last)
3565 - last = start;
3566 - }
3567 -#undef e820
3568 -
3569 - /*
3570 - * See how much we want to round up: start off with
3571 - * rounding to the next 1MB area.
3572 - */
3573 - round = 0x100000;
3574 - while ((gapsize >> 4) > round)
3575 - round += round;
3576 - /* Fun with two's complement */
3577 - pci_mem_start = (gapstart + round) & -round;
3578 -
3579 - printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n",
3580 - pci_mem_start, gapstart, gapsize);
3581 -}
3582 -
3583 -void __init print_memory_map(char *who)
3584 -{
3585 - int i;
3586 -
3587 - for (i = 0; i < e820.nr_map; i++) {
3588 - printk(" %s: %016Lx - %016Lx ", who,
3589 - e820.map[i].addr,
3590 - e820.map[i].addr + e820.map[i].size);
3591 - switch (e820.map[i].type) {
3592 - case E820_RAM: printk("(usable)\n");
3593 - break;
3594 - case E820_RESERVED:
3595 - printk("(reserved)\n");
3596 - break;
3597 - case E820_ACPI:
3598 - printk("(ACPI data)\n");
3599 - break;
3600 - case E820_NVS:
3601 - printk("(ACPI NVS)\n");
3602 - break;
3603 - default: printk("type %u\n", e820.map[i].type);
3604 - break;
3605 - }
3606 - }
3607 -}
3608 -
3609 -void __init limit_regions(unsigned long long size)
3610 -{
3611 - unsigned long long current_addr = 0;
3612 - int i;
3613 -
3614 - print_memory_map("limit_regions start");
3615 - for (i = 0; i < e820.nr_map; i++) {
3616 - current_addr = e820.map[i].addr + e820.map[i].size;
3617 - if (current_addr < size)
3618 - continue;
3619 -
3620 - if (e820.map[i].type != E820_RAM)
3621 - continue;
3622 -
3623 - if (e820.map[i].addr >= size) {
3624 - /*
3625 - * This region starts past the end of the
3626 - * requested size, skip it completely.
3627 - */
3628 - e820.nr_map = i;
3629 - } else {
3630 - e820.nr_map = i + 1;
3631 - e820.map[i].size -= current_addr - size;
3632 - }
3633 - print_memory_map("limit_regions endfor");
3634 - return;
3635 - }
3636 -#ifdef CONFIG_XEN
3637 - if (current_addr < size) {
3638 - /*
3639 - * The e820 map finished before our requested size so
3640 - * extend the final entry to the requested address.
3641 - */
3642 - --i;
3643 - if (e820.map[i].type == E820_RAM)
3644 - e820.map[i].size -= current_addr - size;
3645 - else
3646 - add_memory_region(current_addr, size - current_addr, E820_RAM);
3647 - }
3648 -#endif
3649 - print_memory_map("limit_regions endfunc");
3650 -}
3651 -
3652 -/*
3653 - * This function checks if any part of the range <start,end> is mapped
3654 - * with type.
3655 - */
3656 -int
3657 -e820_any_mapped(u64 start, u64 end, unsigned type)
3658 -{
3659 - int i;
3660 -
3661 -#ifndef CONFIG_XEN
3662 - for (i = 0; i < e820.nr_map; i++) {
3663 - const struct e820entry *ei = &e820.map[i];
3664 -#else
3665 - if (!is_initial_xendomain())
3666 - return 0;
3667 - for (i = 0; i < machine_e820.nr_map; ++i) {
3668 - const struct e820entry *ei = &machine_e820.map[i];
3669 -#endif
3670 -
3671 - if (type && ei->type != type)
3672 - continue;
3673 - if (ei->addr >= end || ei->addr + ei->size <= start)
3674 - continue;
3675 - return 1;
3676 - }
3677 - return 0;
3678 -}
3679 -EXPORT_SYMBOL_GPL(e820_any_mapped);
3680 -
3681 - /*
3682 - * This function checks if the entire range <start,end> is mapped with type.
3683 - *
3684 - * Note: this function only works correct if the e820 table is sorted and
3685 - * not-overlapping, which is the case
3686 - */
3687 -int __init
3688 -e820_all_mapped(unsigned long s, unsigned long e, unsigned type)
3689 -{
3690 - u64 start = s;
3691 - u64 end = e;
3692 - int i;
3693 -
3694 -#ifndef CONFIG_XEN
3695 - for (i = 0; i < e820.nr_map; i++) {
3696 - struct e820entry *ei = &e820.map[i];
3697 -#else
3698 - if (!is_initial_xendomain())
3699 - return 0;
3700 - for (i = 0; i < machine_e820.nr_map; ++i) {
3701 - const struct e820entry *ei = &machine_e820.map[i];
3702 -#endif
3703 -
3704 - if (type && ei->type != type)
3705 - continue;
3706 - /* is the region (part) in overlap with the current region ?*/
3707 - if (ei->addr >= end || ei->addr + ei->size <= start)
3708 - continue;
3709 - /* if the region is at the beginning of <start,end> we move
3710 - * start to the end of the region since it's ok until there
3711 - */
3712 - if (ei->addr <= start)
3713 - start = ei->addr + ei->size;
3714 - /* if start is now at or beyond end, we're done, full
3715 - * coverage */
3716 - if (start >= end)
3717 - return 1; /* we're done */
3718 - }
3719 - return 0;
3720 -}
3721 -
3722 -static int __init parse_memmap(char *arg)
3723 -{
3724 - if (!arg)
3725 - return -EINVAL;
3726 -
3727 - if (strcmp(arg, "exactmap") == 0) {
3728 -#ifdef CONFIG_CRASH_DUMP
3729 - /* If we are doing a crash dump, we
3730 - * still need to know the real mem
3731 - * size before original memory map is
3732 - * reset.
3733 - */
3734 - propagate_e820_map();
3735 - saved_max_pfn = max_pfn;
3736 -#endif
3737 - e820.nr_map = 0;
3738 - user_defined_memmap = 1;
3739 - } else {
3740 - /* If the user specifies memory size, we
3741 - * limit the BIOS-provided memory map to
3742 - * that size. exactmap can be used to specify
3743 - * the exact map. mem=number can be used to
3744 - * trim the existing memory map.
3745 - */
3746 - unsigned long long start_at, mem_size;
3747 -
3748 - mem_size = memparse(arg, &arg);
3749 - if (*arg == '@') {
3750 - start_at = memparse(arg+1, &arg);
3751 - add_memory_region(start_at, mem_size, E820_RAM);
3752 - } else if (*arg == '#') {
3753 - start_at = memparse(arg+1, &arg);
3754 - add_memory_region(start_at, mem_size, E820_ACPI);
3755 - } else if (*arg == '$') {
3756 - start_at = memparse(arg+1, &arg);
3757 - add_memory_region(start_at, mem_size, E820_RESERVED);
3758 - } else {
3759 - limit_regions(mem_size);
3760 - user_defined_memmap = 1;
3761 - }
3762 - }
3763 - return 0;
3764 -}
3765 -early_param("memmap", parse_memmap);
3766 -
3767 -#ifndef CONFIG_XEN
3768 -void __init update_memory_range(u64 start, u64 size, unsigned old_type,
3769 - unsigned new_type)
3770 -{
3771 - int i;
3772 -
3773 - BUG_ON(old_type == new_type);
3774 -
3775 - for (i = 0; i < e820.nr_map; i++) {
3776 - struct e820entry *ei = &e820.map[i];
3777 - u64 final_start, final_end;
3778 - if (ei->type != old_type)
3779 - continue;
3780 - /* totally covered? */
3781 - if (ei->addr >= start && ei->size <= size) {
3782 - ei->type = new_type;
3783 - continue;
3784 - }
3785 - /* partially covered */
3786 - final_start = max(start, ei->addr);
3787 - final_end = min(start + size, ei->addr + ei->size);
3788 - if (final_start >= final_end)
3789 - continue;
3790 - add_memory_region(final_start, final_end - final_start,
3791 - new_type);
3792 - }
3793 -}
3794 -
3795 -void __init update_e820(void)
3796 -{
3797 - u8 nr_map;
3798 -
3799 - nr_map = e820.nr_map;
3800 - if (sanitize_e820_map(e820.map, &nr_map))
3801 - return;
3802 - e820.nr_map = nr_map;
3803 - printk(KERN_INFO "modified physical RAM map:\n");
3804 - print_memory_map("modified");
3805 -}
3806 -#endif
3807 --- sle11-2009-06-04.orig/arch/x86/kernel/e820_64-xen.c 2009-03-16 16:38:05.000000000 +0100
3808 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
3809 @@ -1,1045 +0,0 @@
3810 -/*
3811 - * Handle the memory map.
3812 - * The functions here do the job until bootmem takes over.
3813 - *
3814 - * Getting sanitize_e820_map() in sync with i386 version by applying change:
3815 - * - Provisions for empty E820 memory regions (reported by certain BIOSes).
3816 - * Alex Achenbach <xela@slit.de>, December 2002.
3817 - * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
3818 - *
3819 - */
3820 -#include <linux/kernel.h>
3821 -#include <linux/types.h>
3822 -#include <linux/init.h>
3823 -#include <linux/bootmem.h>
3824 -#include <linux/ioport.h>
3825 -#include <linux/string.h>
3826 -#include <linux/kexec.h>
3827 -#include <linux/module.h>
3828 -#include <linux/mm.h>
3829 -#include <linux/suspend.h>
3830 -#include <linux/pfn.h>
3831 -
3832 -#include <asm/pgtable.h>
3833 -#include <asm/page.h>
3834 -#include <asm/e820.h>
3835 -#include <asm/proto.h>
3836 -#include <asm/setup.h>
3837 -#include <asm/sections.h>
3838 -#include <asm/kdebug.h>
3839 -#include <xen/interface/memory.h>
3840 -
3841 -struct e820map e820 __initdata;
3842 -#ifdef CONFIG_XEN
3843 -struct e820map machine_e820;
3844 -#endif
3845 -
3846 -/*
3847 - * PFN of last memory page.
3848 - */
3849 -unsigned long end_pfn;
3850 -
3851 -/*
3852 - * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
3853 - * The direct mapping extends to max_pfn_mapped, so that we can directly access
3854 - * apertures, ACPI and other tables without having to play with fixmaps.
3855 - */
3856 -unsigned long max_pfn_mapped;
3857 -
3858 -/*
3859 - * Last pfn which the user wants to use.
3860 - */
3861 -static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT;
3862 -
3863 -/*
3864 - * Early reserved memory areas.
3865 - */
3866 -#define MAX_EARLY_RES 20
3867 -
3868 -struct early_res {
3869 - unsigned long start, end;
3870 - char name[16];
3871 -};
3872 -static struct early_res early_res[MAX_EARLY_RES] __initdata = {
3873 -#ifndef CONFIG_XEN
3874 - { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */
3875 -#ifdef CONFIG_X86_TRAMPOLINE
3876 - { TRAMPOLINE_BASE, TRAMPOLINE_BASE + 2 * PAGE_SIZE, "TRAMPOLINE" },
3877 -#endif
3878 -#endif
3879 - {}
3880 -};
3881 -
3882 -void __init reserve_early(unsigned long start, unsigned long end, char *name)
3883 -{
3884 - int i;
3885 - struct early_res *r;
3886 - for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
3887 - r = &early_res[i];
3888 - if (end > r->start && start < r->end)
3889 - panic("Overlapping early reservations %lx-%lx %s to %lx-%lx %s\n",
3890 - start, end - 1, name?name:"", r->start, r->end - 1, r->name);
3891 - }
3892 - if (i >= MAX_EARLY_RES)
3893 - panic("Too many early reservations");
3894 - r = &early_res[i];
3895 - r->start = start;
3896 - r->end = end;
3897 - if (name)
3898 - strncpy(r->name, name, sizeof(r->name) - 1);
3899 -}
3900 -
3901 -void __init free_early(unsigned long start, unsigned long end)
3902 -{
3903 - struct early_res *r;
3904 - int i, j;
3905 -
3906 - for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
3907 - r = &early_res[i];
3908 - if (start == r->start && end == r->end)
3909 - break;
3910 - }
3911 - if (i >= MAX_EARLY_RES || !early_res[i].end)
3912 - panic("free_early on not reserved area: %lx-%lx!", start, end);
3913 -
3914 - for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++)
3915 - ;
3916 -
3917 - memmove(&early_res[i], &early_res[i + 1],
3918 - (j - 1 - i) * sizeof(struct early_res));
3919 -
3920 - early_res[j - 1].end = 0;
3921 -}
3922 -
3923 -void __init early_res_to_bootmem(unsigned long start, unsigned long end)
3924 -{
3925 - int i;
3926 - unsigned long final_start, final_end;
3927 - for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
3928 - struct early_res *r = &early_res[i];
3929 - final_start = max(start, r->start);
3930 - final_end = min(end, r->end);
3931 - if (final_start >= final_end)
3932 - continue;
3933 - printk(KERN_INFO " early res: %d [%lx-%lx] %s\n", i,
3934 - final_start, final_end - 1, r->name);
3935 - reserve_bootmem_generic(final_start, final_end - final_start);
3936 - }
3937 -}
3938 -
3939 -/* Check for already reserved areas */
3940 -static inline int __init
3941 -bad_addr(unsigned long *addrp, unsigned long size, unsigned long align)
3942 -{
3943 - int i;
3944 - unsigned long addr = *addrp, last;
3945 - int changed = 0;
3946 -again:
3947 - last = addr + size;
3948 - for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
3949 - struct early_res *r = &early_res[i];
3950 - if (last >= r->start && addr < r->end) {
3951 - *addrp = addr = round_up(r->end, align);
3952 - changed = 1;
3953 - goto again;
3954 - }
3955 - }
3956 - return changed;
3957 -}
3958 -
3959 -/* Check for already reserved areas */
3960 -static inline int __init
3961 -bad_addr_size(unsigned long *addrp, unsigned long *sizep, unsigned long align)
3962 -{
3963 - int i;
3964 - unsigned long addr = *addrp, last;
3965 - unsigned long size = *sizep;
3966 - int changed = 0;
3967 -again:
3968 - last = addr + size;
3969 - for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
3970 - struct early_res *r = &early_res[i];
3971 - if (last > r->start && addr < r->start) {
3972 - size = r->start - addr;
3973 - changed = 1;
3974 - goto again;
3975 - }
3976 - if (last > r->end && addr < r->end) {
3977 - addr = round_up(r->end, align);
3978 - size = last - addr;
3979 - changed = 1;
3980 - goto again;
3981 - }
3982 - if (last <= r->end && addr >= r->start) {
3983 - (*sizep)++;
3984 - return 0;
3985 - }
3986 - }
3987 - if (changed) {
3988 - *addrp = addr;
3989 - *sizep = size;
3990 - }
3991 - return changed;
3992 -}
3993 -/*
3994 - * This function checks if any part of the range <start,end> is mapped
3995 - * with type.
3996 - */
3997 -int
3998 -e820_any_mapped(unsigned long start, unsigned long end, unsigned type)
3999 -{
4000 - int i;
4001 -
4002 -#ifndef CONFIG_XEN
4003 - for (i = 0; i < e820.nr_map; i++) {
4004 - struct e820entry *ei = &e820.map[i];
4005 -#else
4006 - if (!is_initial_xendomain())
4007 - return 0;
4008 - for (i = 0; i < machine_e820.nr_map; i++) {
4009 - const struct e820entry *ei = &machine_e820.map[i];
4010 -#endif
4011 -
4012 - if (type && ei->type != type)
4013 - continue;
4014 - if (ei->addr >= end || ei->addr + ei->size <= start)
4015 - continue;
4016 - return 1;
4017 - }
4018 - return 0;
4019 -}
4020 -EXPORT_SYMBOL_GPL(e820_any_mapped);
4021 -
4022 -/*
4023 - * This function checks if the entire range <start,end> is mapped with type.
4024 - *
4025 - * Note: this function only works correct if the e820 table is sorted and
4026 - * not-overlapping, which is the case
4027 - */
4028 -int __init e820_all_mapped(unsigned long start, unsigned long end,
4029 - unsigned type)
4030 -{
4031 - int i;
4032 -
4033 -#ifndef CONFIG_XEN
4034 - for (i = 0; i < e820.nr_map; i++) {
4035 - struct e820entry *ei = &e820.map[i];
4036 -#else
4037 - if (!is_initial_xendomain())
4038 - return 0;
4039 - for (i = 0; i < machine_e820.nr_map; i++) {
4040 - const struct e820entry *ei = &machine_e820.map[i];
4041 -#endif
4042 -
4043 - if (type && ei->type != type)
4044 - continue;
4045 - /* is the region (part) in overlap with the current region ?*/
4046 - if (ei->addr >= end || ei->addr + ei->size <= start)
4047 - continue;
4048 -
4049 - /* if the region is at the beginning of <start,end> we move
4050 - * start to the end of the region since it's ok until there
4051 - */
4052 - if (ei->addr <= start)
4053 - start = ei->addr + ei->size;
4054 - /*
4055 - * if start is now at or beyond end, we're done, full
4056 - * coverage
4057 - */
4058 - if (start >= end)
4059 - return 1;
4060 - }
4061 - return 0;
4062 -}
4063 -
4064 -/*
4065 - * Find a free area with specified alignment in a specific range.
4066 - */
4067 -unsigned long __init find_e820_area(unsigned long start, unsigned long end,
4068 - unsigned long size, unsigned long align)
4069 -{
4070 - int i;
4071 -
4072 - for (i = 0; i < e820.nr_map; i++) {
4073 - struct e820entry *ei = &e820.map[i];
4074 - unsigned long addr, last;
4075 - unsigned long ei_last;
4076 -
4077 - if (ei->type != E820_RAM)
4078 - continue;
4079 - addr = round_up(ei->addr, align);
4080 - ei_last = ei->addr + ei->size;
4081 - if (addr < start)
4082 - addr = round_up(start, align);
4083 - if (addr >= ei_last)
4084 - continue;
4085 - while (bad_addr(&addr, size, align) && addr+size <= ei_last)
4086 - ;
4087 - last = addr + size;
4088 - if (last > ei_last)
4089 - continue;
4090 - if (last > end)
4091 - continue;
4092 - return addr;
4093 - }
4094 - return -1UL;
4095 -}
4096 -
4097 -/*
4098 - * Find next free range after *start
4099 - */
4100 -unsigned long __init find_e820_area_size(unsigned long start,
4101 - unsigned long *sizep,
4102 - unsigned long align)
4103 -{
4104 - int i;
4105 -
4106 - for (i = 0; i < e820.nr_map; i++) {
4107 - struct e820entry *ei = &e820.map[i];
4108 - unsigned long addr, last;
4109 - unsigned long ei_last;
4110 -
4111 - if (ei->type != E820_RAM)
4112 - continue;
4113 - addr = round_up(ei->addr, align);
4114 - ei_last = ei->addr + ei->size;
4115 - if (addr < start)
4116 - addr = round_up(start, align);
4117 - if (addr >= ei_last)
4118 - continue;
4119 - *sizep = ei_last - addr;
4120 - while (bad_addr_size(&addr, sizep, align) &&
4121 - addr + *sizep <= ei_last)
4122 - ;
4123 - last = addr + *sizep;
4124 - if (last > ei_last)
4125 - continue;
4126 - return addr;
4127 - }
4128 - return -1UL;
4129 -
4130 -}
4131 -/*
4132 - * Find the highest page frame number we have available
4133 - */
4134 -unsigned long __init e820_end_of_ram(void)
4135 -{
4136 - unsigned long end_pfn;
4137 -
4138 - end_pfn = find_max_pfn_with_active_regions();
4139 -
4140 - if (end_pfn > max_pfn_mapped)
4141 - max_pfn_mapped = end_pfn;
4142 - if (max_pfn_mapped > MAXMEM>>PAGE_SHIFT)
4143 - max_pfn_mapped = MAXMEM>>PAGE_SHIFT;
4144 - if (end_pfn > end_user_pfn)
4145 - end_pfn = end_user_pfn;
4146 - if (end_pfn > max_pfn_mapped)
4147 - end_pfn = max_pfn_mapped;
4148 -
4149 - printk(KERN_INFO "max_pfn_mapped = %lu\n", max_pfn_mapped);
4150 - return end_pfn;
4151 -}
4152 -
4153 -/*
4154 - * Mark e820 reserved areas as busy for the resource manager.
4155 - */
4156 -void __init e820_reserve_resources(struct e820entry *e820, int nr_map)
4157 -{
4158 - int i;
4159 - struct resource *res;
4160 -
4161 - res = alloc_bootmem_low(sizeof(struct resource) * nr_map);
4162 - for (i = 0; i < nr_map; i++) {
4163 - switch (e820[i].type) {
4164 - case E820_RAM: res->name = "System RAM"; break;
4165 - case E820_ACPI: res->name = "ACPI Tables"; break;
4166 - case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
4167 - default: res->name = "reserved";
4168 - }
4169 - res->start = e820[i].addr;
4170 - res->end = res->start + e820[i].size - 1;
4171 - res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
4172 - insert_resource(&iomem_resource, res);
4173 - res++;
4174 - }
4175 -}
4176 -
4177 -#ifndef CONFIG_XEN
4178 -/*
4179 - * Find the ranges of physical addresses that do not correspond to
4180 - * e820 RAM areas and mark the corresponding pages as nosave for software
4181 - * suspend and suspend to RAM.
4182 - *
4183 - * This function requires the e820 map to be sorted and without any
4184 - * overlapping entries and assumes the first e820 area to be RAM.
4185 - */
4186 -void __init e820_mark_nosave_regions(void)
4187 -{
4188 - int i;
4189 - unsigned long paddr;
4190 -
4191 - paddr = round_down(e820.map[0].addr + e820.map[0].size, PAGE_SIZE);
4192 - for (i = 1; i < e820.nr_map; i++) {
4193 - struct e820entry *ei = &e820.map[i];
4194 -
4195 - if (paddr < ei->addr)
4196 - register_nosave_region(PFN_DOWN(paddr),
4197 - PFN_UP(ei->addr));
4198 -
4199 - paddr = round_down(ei->addr + ei->size, PAGE_SIZE);
4200 - if (ei->type != E820_RAM)
4201 - register_nosave_region(PFN_UP(ei->addr),
4202 - PFN_DOWN(paddr));
4203 -
4204 - if (paddr >= (end_pfn << PAGE_SHIFT))
4205 - break;
4206 - }
4207 -}
4208 -#endif
4209 -
4210 -/*
4211 - * Finds an active region in the address range from start_pfn to end_pfn and
4212 - * returns its range in ei_startpfn and ei_endpfn for the e820 entry.
4213 - */
4214 -static int __init e820_find_active_region(const struct e820entry *ei,
4215 - unsigned long start_pfn,
4216 - unsigned long end_pfn,
4217 - unsigned long *ei_startpfn,
4218 - unsigned long *ei_endpfn)
4219 -{
4220 - *ei_startpfn = round_up(ei->addr, PAGE_SIZE) >> PAGE_SHIFT;
4221 - *ei_endpfn = round_down(ei->addr + ei->size, PAGE_SIZE) >> PAGE_SHIFT;
4222 -
4223 - /* Skip map entries smaller than a page */
4224 - if (*ei_startpfn >= *ei_endpfn)
4225 - return 0;
4226 -
4227 - /* Check if max_pfn_mapped should be updated */
4228 - if (ei->type != E820_RAM && *ei_endpfn > max_pfn_mapped)
4229 - max_pfn_mapped = *ei_endpfn;
4230 -
4231 - /* Skip if map is outside the node */
4232 - if (ei->type != E820_RAM || *ei_endpfn <= start_pfn ||
4233 - *ei_startpfn >= end_pfn)
4234 - return 0;
4235 -
4236 - /* Check for overlaps */
4237 - if (*ei_startpfn < start_pfn)
4238 - *ei_startpfn = start_pfn;
4239 - if (*ei_endpfn > end_pfn)
4240 - *ei_endpfn = end_pfn;
4241 -
4242 - /* Obey end_user_pfn to save on memmap */
4243 - if (*ei_startpfn >= end_user_pfn)
4244 - return 0;
4245 - if (*ei_endpfn > end_user_pfn)
4246 - *ei_endpfn = end_user_pfn;
4247 -
4248 - return 1;
4249 -}
4250 -
4251 -/* Walk the e820 map and register active regions within a node */
4252 -void __init
4253 -e820_register_active_regions(int nid, unsigned long start_pfn,
4254 - unsigned long end_pfn)
4255 -{
4256 - unsigned long ei_startpfn;
4257 - unsigned long ei_endpfn;
4258 - int i;
4259 -
4260 - for (i = 0; i < e820.nr_map; i++)
4261 - if (e820_find_active_region(&e820.map[i],
4262 - start_pfn, end_pfn,
4263 - &ei_startpfn, &ei_endpfn))
4264 - add_active_range(nid, ei_startpfn, ei_endpfn);
4265 -}
4266 -
4267 -/*
4268 - * Add a memory region to the kernel e820 map.
4269 - */
4270 -void __init add_memory_region(unsigned long start, unsigned long size, int type)
4271 -{
4272 - int x = e820.nr_map;
4273 -
4274 - if (x == E820MAX) {
4275 - printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
4276 - return;
4277 - }
4278 -
4279 - e820.map[x].addr = start;
4280 - e820.map[x].size = size;
4281 - e820.map[x].type = type;
4282 - e820.nr_map++;
4283 -}
4284 -
4285 -/*
4286 - * Find the hole size (in bytes) in the memory range.
4287 - * @start: starting address of the memory range to scan
4288 - * @end: ending address of the memory range to scan
4289 - */
4290 -unsigned long __init e820_hole_size(unsigned long start, unsigned long end)
4291 -{
4292 - unsigned long start_pfn = start >> PAGE_SHIFT;
4293 - unsigned long end_pfn = end >> PAGE_SHIFT;
4294 - unsigned long ei_startpfn, ei_endpfn, ram = 0;
4295 - int i;
4296 -
4297 - for (i = 0; i < e820.nr_map; i++) {
4298 - if (e820_find_active_region(&e820.map[i],
4299 - start_pfn, end_pfn,
4300 - &ei_startpfn, &ei_endpfn))
4301 - ram += ei_endpfn - ei_startpfn;
4302 - }
4303 - return end - start - (ram << PAGE_SHIFT);
4304 -}
4305 -
4306 -static void __init e820_print_map(char *who)
4307 -{
4308 - int i;
4309 -
4310 - for (i = 0; i < e820.nr_map; i++) {
4311 - printk(KERN_INFO " %s: %016Lx - %016Lx ", who,
4312 - (unsigned long long) e820.map[i].addr,
4313 - (unsigned long long)
4314 - (e820.map[i].addr + e820.map[i].size));
4315 - switch (e820.map[i].type) {
4316 - case E820_RAM:
4317 - printk(KERN_CONT "(usable)\n");
4318 - break;
4319 - case E820_RESERVED:
4320 - printk(KERN_CONT "(reserved)\n");
4321 - break;
4322 - case E820_ACPI:
4323 - printk(KERN_CONT "(ACPI data)\n");
4324 - break;
4325 - case E820_NVS:
4326 - printk(KERN_CONT "(ACPI NVS)\n");
4327 - break;
4328 - default:
4329 - printk(KERN_CONT "type %u\n", e820.map[i].type);
4330 - break;
4331 - }
4332 - }
4333 -}
4334 -
4335 -/*
4336 - * Sanitize the BIOS e820 map.
4337 - *
4338 - * Some e820 responses include overlapping entries. The following
4339 - * replaces the original e820 map with a new one, removing overlaps.
4340 - *
4341 - */
4342 -static int __init sanitize_e820_map(struct e820entry *biosmap, char *pnr_map)
4343 -{
4344 - struct change_member {
4345 - struct e820entry *pbios; /* pointer to original bios entry */
4346 - unsigned long long addr; /* address for this change point */
4347 - };
4348 - static struct change_member change_point_list[2*E820MAX] __initdata;
4349 - static struct change_member *change_point[2*E820MAX] __initdata;
4350 - static struct e820entry *overlap_list[E820MAX] __initdata;
4351 - static struct e820entry new_bios[E820MAX] __initdata;
4352 - struct change_member *change_tmp;
4353 - unsigned long current_type, last_type;
4354 - unsigned long long last_addr;
4355 - int chgidx, still_changing;
4356 - int overlap_entries;
4357 - int new_bios_entry;
4358 - int old_nr, new_nr, chg_nr;
4359 - int i;
4360 -
4361 - /*
4362 - Visually we're performing the following
4363 - (1,2,3,4 = memory types)...
4364 -
4365 - Sample memory map (w/overlaps):
4366 - ____22__________________
4367 - ______________________4_
4368 - ____1111________________
4369 - _44_____________________
4370 - 11111111________________
4371 - ____________________33__
4372 - ___________44___________
4373 - __________33333_________
4374 - ______________22________
4375 - ___________________2222_
4376 - _________111111111______
4377 - _____________________11_
4378 - _________________4______
4379 -
4380 - Sanitized equivalent (no overlap):
4381 - 1_______________________
4382 - _44_____________________
4383 - ___1____________________
4384 - ____22__________________
4385 - ______11________________
4386 - _________1______________
4387 - __________3_____________
4388 - ___________44___________
4389 - _____________33_________
4390 - _______________2________
4391 - ________________1_______
4392 - _________________4______
4393 - ___________________2____
4394 - ____________________33__
4395 - ______________________4_
4396 - */
4397 -
4398 - /* if there's only one memory region, don't bother */
4399 - if (*pnr_map < 2)
4400 - return -1;
4401 -
4402 - old_nr = *pnr_map;
4403 -
4404 - /* bail out if we find any unreasonable addresses in bios map */
4405 - for (i = 0; i < old_nr; i++)
4406 - if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
4407 - return -1;
4408 -
4409 - /* create pointers for initial change-point information (for sorting) */
4410 - for (i = 0; i < 2 * old_nr; i++)
4411 - change_point[i] = &change_point_list[i];
4412 -
4413 - /* record all known change-points (starting and ending addresses),
4414 - omitting those that are for empty memory regions */
4415 - chgidx = 0;
4416 - for (i = 0; i < old_nr; i++) {
4417 - if (biosmap[i].size != 0) {
4418 - change_point[chgidx]->addr = biosmap[i].addr;
4419 - change_point[chgidx++]->pbios = &biosmap[i];
4420 - change_point[chgidx]->addr = biosmap[i].addr +
4421 - biosmap[i].size;
4422 - change_point[chgidx++]->pbios = &biosmap[i];
4423 - }
4424 - }
4425 - chg_nr = chgidx;
4426 -
4427 - /* sort change-point list by memory addresses (low -> high) */
4428 - still_changing = 1;
4429 - while (still_changing) {
4430 - still_changing = 0;
4431 - for (i = 1; i < chg_nr; i++) {
4432 - unsigned long long curaddr, lastaddr;
4433 - unsigned long long curpbaddr, lastpbaddr;
4434 -
4435 - curaddr = change_point[i]->addr;
4436 - lastaddr = change_point[i - 1]->addr;
4437 - curpbaddr = change_point[i]->pbios->addr;
4438 - lastpbaddr = change_point[i - 1]->pbios->addr;
4439 -
4440 - /*
4441 - * swap entries, when:
4442 - *
4443 - * curaddr > lastaddr or
4444 - * curaddr == lastaddr and curaddr == curpbaddr and
4445 - * lastaddr != lastpbaddr
4446 - */
4447 - if (curaddr < lastaddr ||
4448 - (curaddr == lastaddr && curaddr == curpbaddr &&
4449 - lastaddr != lastpbaddr)) {
4450 - change_tmp = change_point[i];
4451 - change_point[i] = change_point[i-1];
4452 - change_point[i-1] = change_tmp;
4453 - still_changing = 1;
4454 - }
4455 - }
4456 - }
4457 -
4458 - /* create a new bios memory map, removing overlaps */
4459 - overlap_entries = 0; /* number of entries in the overlap table */
4460 - new_bios_entry = 0; /* index for creating new bios map entries */
4461 - last_type = 0; /* start with undefined memory type */
4462 - last_addr = 0; /* start with 0 as last starting address */
4463 -
4464 - /* loop through change-points, determining affect on the new bios map */
4465 - for (chgidx = 0; chgidx < chg_nr; chgidx++) {
4466 - /* keep track of all overlapping bios entries */
4467 - if (change_point[chgidx]->addr ==
4468 - change_point[chgidx]->pbios->addr) {
4469 - /*
4470 - * add map entry to overlap list (> 1 entry
4471 - * implies an overlap)
4472 - */
4473 - overlap_list[overlap_entries++] =
4474 - change_point[chgidx]->pbios;
4475 - } else {
4476 - /*
4477 - * remove entry from list (order independent,
4478 - * so swap with last)
4479 - */
4480 - for (i = 0; i < overlap_entries; i++) {
4481 - if (overlap_list[i] ==
4482 - change_point[chgidx]->pbios)
4483 - overlap_list[i] =
4484 - overlap_list[overlap_entries-1];
4485 - }
4486 - overlap_entries--;
4487 - }
4488 - /*
4489 - * if there are overlapping entries, decide which
4490 - * "type" to use (larger value takes precedence --
4491 - * 1=usable, 2,3,4,4+=unusable)
4492 - */
4493 - current_type = 0;
4494 - for (i = 0; i < overlap_entries; i++)
4495 - if (overlap_list[i]->type > current_type)
4496 - current_type = overlap_list[i]->type;
4497 - /*
4498 - * continue building up new bios map based on this
4499 - * information
4500 - */
4501 - if (current_type != last_type) {
4502 - if (last_type != 0) {
4503 - new_bios[new_bios_entry].size =
4504 - change_point[chgidx]->addr - last_addr;
4505 - /*
4506 - * move forward only if the new size
4507 - * was non-zero
4508 - */
4509 - if (new_bios[new_bios_entry].size != 0)
4510 - /*
4511 - * no more space left for new
4512 - * bios entries ?
4513 - */
4514 - if (++new_bios_entry >= E820MAX)
4515 - break;
4516 - }
4517 - if (current_type != 0) {
4518 - new_bios[new_bios_entry].addr =
4519 - change_point[chgidx]->addr;
4520 - new_bios[new_bios_entry].type = current_type;
4521 - last_addr = change_point[chgidx]->addr;
4522 - }
4523 - last_type = current_type;
4524 - }
4525 - }
4526 - /* retain count for new bios entries */
4527 - new_nr = new_bios_entry;
4528 -
4529 - /* copy new bios mapping into original location */
4530 - memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry));
4531 - *pnr_map = new_nr;
4532 -
4533 - return 0;
4534 -}
4535 -
4536 -/*
4537 - * Copy the BIOS e820 map into a safe place.
4538 - *
4539 - * Sanity-check it while we're at it..
4540 - *
4541 - * If we're lucky and live on a modern system, the setup code
4542 - * will have given us a memory map that we can use to properly
4543 - * set up memory. If we aren't, we'll fake a memory map.
4544 - */
4545 -static int __init copy_e820_map(struct e820entry *biosmap, int nr_map)
4546 -{
4547 -#ifndef CONFIG_XEN
4548 - /* Only one memory region (or negative)? Ignore it */
4549 - if (nr_map < 2)
4550 - return -1;
4551 -#else
4552 - BUG_ON(nr_map < 1);
4553 -#endif
4554 -
4555 - do {
4556 - u64 start = biosmap->addr;
4557 - u64 size = biosmap->size;
4558 - u64 end = start + size;
4559 - u32 type = biosmap->type;
4560 -
4561 - /* Overflow in 64 bits? Ignore the memory map. */
4562 - if (start > end)
4563 - return -1;
4564 -
4565 - add_memory_region(start, size, type);
4566 - } while (biosmap++, --nr_map);
4567 -
4568 -#ifdef CONFIG_XEN
4569 - if (is_initial_xendomain()) {
4570 - struct xen_memory_map memmap;
4571 -
4572 - memmap.nr_entries = E820MAX;
4573 - set_xen_guest_handle(memmap.buffer, machine_e820.map);
4574 -
4575 - if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap))
4576 - BUG();
4577 - machine_e820.nr_map = memmap.nr_entries;
4578 - } else
4579 - machine_e820 = e820;
4580 -#endif
4581 -
4582 - return 0;
4583 -}
4584 -
4585 -static void early_panic(char *msg)
4586 -{
4587 - early_printk(msg);
4588 - panic(msg);
4589 -}
4590 -
4591 -/* We're not void only for x86 32-bit compat */
4592 -char * __init machine_specific_memory_setup(void)
4593 -{
4594 -#ifndef CONFIG_XEN
4595 - char *who = "BIOS-e820";
4596 - /*
4597 - * Try to copy the BIOS-supplied E820-map.
4598 - *
4599 - * Otherwise fake a memory map; one section from 0k->640k,
4600 - * the next section from 1mb->appropriate_mem_k
4601 - */
4602 - sanitize_e820_map(boot_params.e820_map, &boot_params.e820_entries);
4603 - if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0)
4604 - early_panic("Cannot find a valid memory map");
4605 -#else /* CONFIG_XEN */
4606 - char *who = "Xen";
4607 - int rc;
4608 - struct xen_memory_map memmap;
4609 - /*
4610 - * This is rather large for a stack variable but this early in
4611 - * the boot process we know we have plenty slack space.
4612 - */
4613 - struct e820entry map[E820MAX];
4614 -
4615 - memmap.nr_entries = E820MAX;
4616 - set_xen_guest_handle(memmap.buffer, map);
4617 -
4618 - rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
4619 - if ( rc == -ENOSYS ) {
4620 - memmap.nr_entries = 1;
4621 - map[0].addr = 0ULL;
4622 - map[0].size = xen_start_info->nr_pages << PAGE_SHIFT;
4623 - /* 8MB slack (to balance backend allocations). */
4624 - map[0].size += 8 << 20;
4625 - map[0].type = E820_RAM;
4626 - rc = 0;
4627 - }
4628 - BUG_ON(rc);
4629 -
4630 - sanitize_e820_map(map, (char *)&memmap.nr_entries);
4631 -
4632 - if (copy_e820_map(map, (char)memmap.nr_entries) < 0)
4633 - early_panic("Cannot find a valid memory map");
4634 -#endif
4635 - printk(KERN_INFO "BIOS-provided physical RAM map:\n");
4636 - e820_print_map(who);
4637 -
4638 - /* In case someone cares... */
4639 - return who;
4640 -}
4641 -
4642 -static int __init parse_memopt(char *p)
4643 -{
4644 - int i;
4645 - unsigned long current_end;
4646 - unsigned long end;
4647 -
4648 - if (!p)
4649 - return -EINVAL;
4650 - end_user_pfn = memparse(p, &p);
4651 - end_user_pfn >>= PAGE_SHIFT;
4652 -
4653 - end = end_user_pfn<<PAGE_SHIFT;
4654 - i = e820.nr_map-1;
4655 - current_end = e820.map[i].addr + e820.map[i].size;
4656 -
4657 - if (current_end < end) {
4658 - /*
4659 - * The e820 map ends before our requested size so
4660 - * extend the final entry to the requested address.
4661 - */
4662 - if (e820.map[i].type == E820_RAM)
4663 - e820.map[i].size = end - e820.map[i].addr;
4664 - else
4665 - add_memory_region(current_end, end - current_end, E820_RAM);
4666 - }
4667 -
4668 - return 0;
4669 -}
4670 -early_param("mem", parse_memopt);
4671 -
4672 -static int userdef __initdata;
4673 -
4674 -static int __init parse_memmap_opt(char *p)
4675 -{
4676 - char *oldp;
4677 - unsigned long long start_at, mem_size;
4678 -
4679 - if (!strcmp(p, "exactmap")) {
4680 -#ifdef CONFIG_CRASH_DUMP
4681 - /*
4682 - * If we are doing a crash dump, we still need to know
4683 - * the real mem size before original memory map is
4684 - * reset.
4685 - */
4686 - e820_register_active_regions(0, 0, -1UL);
4687 - saved_max_pfn = e820_end_of_ram();
4688 - remove_all_active_ranges();
4689 -#endif
4690 - max_pfn_mapped = 0;
4691 - e820.nr_map = 0;
4692 - userdef = 1;
4693 - return 0;
4694 - }
4695 -
4696 - oldp = p;
4697 - mem_size = memparse(p, &p);
4698 - if (p == oldp)
4699 - return -EINVAL;
4700 -
4701 - userdef = 1;
4702 - if (*p == '@') {
4703 - start_at = memparse(p+1, &p);
4704 - add_memory_region(start_at, mem_size, E820_RAM);
4705 - } else if (*p == '#') {
4706 - start_at = memparse(p+1, &p);
4707 - add_memory_region(start_at, mem_size, E820_ACPI);
4708 - } else if (*p == '$') {
4709 - start_at = memparse(p+1, &p);
4710 - add_memory_region(start_at, mem_size, E820_RESERVED);
4711 - } else {
4712 - end_user_pfn = (mem_size >> PAGE_SHIFT);
4713 - }
4714 - return *p == '\0' ? 0 : -EINVAL;
4715 -}
4716 -early_param("memmap", parse_memmap_opt);
4717 -
4718 -void __init finish_e820_parsing(void)
4719 -{
4720 - if (userdef) {
4721 - char nr = e820.nr_map;
4722 -
4723 - if (sanitize_e820_map(e820.map, &nr) < 0)
4724 - early_panic("Invalid user supplied memory map");
4725 - e820.nr_map = nr;
4726 -
4727 - printk(KERN_INFO "user-defined physical RAM map:\n");
4728 - e820_print_map("user");
4729 - }
4730 -}
4731 -
4732 -#ifndef CONFIG_XEN
4733 -void __init update_memory_range(u64 start, u64 size, unsigned old_type,
4734 - unsigned new_type)
4735 -{
4736 - int i;
4737 -
4738 - BUG_ON(old_type == new_type);
4739 -
4740 - for (i = 0; i < e820.nr_map; i++) {
4741 - struct e820entry *ei = &e820.map[i];
4742 - u64 final_start, final_end;
4743 - if (ei->type != old_type)
4744 - continue;
4745 - /* totally covered? */
4746 - if (ei->addr >= start && ei->size <= size) {
4747 - ei->type = new_type;
4748 - continue;
4749 - }
4750 - /* partially covered */
4751 - final_start = max(start, ei->addr);
4752 - final_end = min(start + size, ei->addr + ei->size);
4753 - if (final_start >= final_end)
4754 - continue;
4755 - add_memory_region(final_start, final_end - final_start,
4756 - new_type);
4757 - }
4758 -}
4759 -
4760 -void __init update_e820(void)
4761 -{
4762 - u8 nr_map;
4763 -
4764 - nr_map = e820.nr_map;
4765 - if (sanitize_e820_map(e820.map, &nr_map))
4766 - return;
4767 - e820.nr_map = nr_map;
4768 - printk(KERN_INFO "modified physical RAM map:\n");
4769 - e820_print_map("modified");
4770 -}
4771 -#endif
4772 -
4773 -unsigned long pci_mem_start = 0xaeedbabe;
4774 -EXPORT_SYMBOL(pci_mem_start);
4775 -
4776 -/*
4777 - * Search for the biggest gap in the low 32 bits of the e820
4778 - * memory space. We pass this space to PCI to assign MMIO resources
4779 - * for hotplug or unconfigured devices in.
4780 - * Hopefully the BIOS let enough space left.
4781 - */
4782 -__init void e820_setup_gap(struct e820entry *e820, int nr_map)
4783 -{
4784 - unsigned long gapstart, gapsize, round;
4785 - unsigned long last;
4786 - int i;
4787 - int found = 0;
4788 -
4789 - last = 0x100000000ull;
4790 - gapstart = 0x10000000;
4791 - gapsize = 0x400000;
4792 - i = nr_map;
4793 - while (--i >= 0) {
4794 - unsigned long long start = e820[i].addr;
4795 - unsigned long long end = start + e820[i].size;
4796 -
4797 - /*
4798 - * Since "last" is at most 4GB, we know we'll
4799 - * fit in 32 bits if this condition is true
4800 - */
4801 - if (last > end) {
4802 - unsigned long gap = last - end;
4803 -
4804 - if (gap > gapsize) {
4805 - gapsize = gap;
4806 - gapstart = end;
4807 - found = 1;
4808 - }
4809 - }
4810 - if (start < last)
4811 - last = start;
4812 - }
4813 -
4814 - if (!found) {
4815 - gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024;
4816 - printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit "
4817 - "address range\n"
4818 - KERN_ERR "PCI: Unassigned devices with 32bit resource "
4819 - "registers may break!\n");
4820 - }
4821 -
4822 - /*
4823 - * See how much we want to round up: start off with
4824 - * rounding to the next 1MB area.
4825 - */
4826 - round = 0x100000;
4827 - while ((gapsize >> 4) > round)
4828 - round += round;
4829 - /* Fun with two's complement */
4830 - pci_mem_start = (gapstart + round) & -round;
4831 -
4832 - printk(KERN_INFO
4833 - "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
4834 - pci_mem_start, gapstart, gapsize);
4835 -}
4836 -
4837 -int __init arch_get_ram_range(int slot, u64 *addr, u64 *size)
4838 -{
4839 - int i;
4840 -
4841 - if (slot < 0 || slot >= e820.nr_map)
4842 - return -1;
4843 - for (i = slot; i < e820.nr_map; i++) {
4844 - if (e820.map[i].type != E820_RAM)
4845 - continue;
4846 - break;
4847 - }
4848 - if (i == e820.nr_map || e820.map[i].addr > (max_pfn << PAGE_SHIFT))
4849 - return -1;
4850 - *addr = e820.map[i].addr;
4851 - *size = min_t(u64, e820.map[i].size + e820.map[i].addr,
4852 - max_pfn << PAGE_SHIFT) - *addr;
4853 - return i + 1;
4854 -}
4855 --- sle11-2009-06-04.orig/arch/x86/kernel/early_printk-xen.c 2009-03-16 16:38:05.000000000 +0100
4856 +++ sle11-2009-06-04/arch/x86/kernel/early_printk-xen.c 2009-06-04 10:21:39.000000000 +0200
4857 @@ -225,7 +225,7 @@ static struct console simnow_console = {
4858 static struct console *early_console = &early_vga_console;
4859 static int early_console_initialized;
4860
4861 -void early_printk(const char *fmt, ...)
4862 +asmlinkage void early_printk(const char *fmt, ...)
4863 {
4864 char buf[512];
4865 int n;
4866 --- sle11-2009-06-04.orig/arch/x86/kernel/entry_32-xen.S 2009-03-16 16:38:05.000000000 +0100
4867 +++ sle11-2009-06-04/arch/x86/kernel/entry_32-xen.S 2009-06-04 10:21:39.000000000 +0200
4868 @@ -51,15 +51,26 @@
4869 #include <asm/percpu.h>
4870 #include <asm/dwarf2.h>
4871 #include <asm/processor-flags.h>
4872 -#include "irq_vectors.h"
4873 +#include <asm/ftrace.h>
4874 +#include <asm/irq_vectors.h>
4875 #include <xen/interface/xen.h>
4876
4877 +/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
4878 +#include <linux/elf-em.h>
4879 +#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE)
4880 +#define __AUDIT_ARCH_LE 0x40000000
4881 +
4882 +#ifndef CONFIG_AUDITSYSCALL
4883 +#define sysenter_audit syscall_trace_entry
4884 +#define sysexit_audit syscall_exit_work
4885 +#endif
4886 +
4887 /*
4888 * We use macros for low-level operations which need to be overridden
4889 * for paravirtualization. The following will never clobber any registers:
4890 * INTERRUPT_RETURN (aka. "iret")
4891 * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
4892 - * ENABLE_INTERRUPTS_SYSCALL_RET (aka "sti; sysexit").
4893 + * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
4894 *
4895 * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
4896 * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
4897 @@ -277,11 +288,6 @@ END(resume_kernel)
4898 #endif
4899 CFI_ENDPROC
4900
4901 - .macro test_tif ti_reg # system call tracing in operation / emulation
4902 - /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
4903 - testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(\ti_reg)
4904 - .endm
4905 -
4906 /* SYSENTER_RETURN points to after the "sysenter" instruction in
4907 the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */
4908
4909 @@ -338,8 +344,9 @@ sysenter_past_esp:
4910 .previous
4911
4912 GET_THREAD_INFO(%ebp)
4913 - test_tif %ebp
4914 - jnz syscall_trace_entry
4915 + testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
4916 + jnz sysenter_audit
4917 +sysenter_do_call:
4918 cmpl $(nr_syscalls), %eax
4919 jae syscall_badsys
4920 call *sys_call_table(,%eax,4)
4921 @@ -349,14 +356,54 @@ sysenter_past_esp:
4922 TRACE_IRQS_OFF
4923 movl TI_flags(%ebp), %ecx
4924 testw $_TIF_ALLWORK_MASK, %cx
4925 - jne syscall_exit_work
4926 + jne sysexit_audit
4927 +sysenter_exit:
4928 /* if something modifies registers it must also disable sysexit */
4929 movl PT_EIP(%esp), %edx
4930 movl PT_OLDESP(%esp), %ecx
4931 xorl %ebp,%ebp
4932 TRACE_IRQS_ON
4933 1: mov PT_FS(%esp), %fs
4934 - ENABLE_INTERRUPTS_SYSCALL_RET
4935 + ENABLE_INTERRUPTS_SYSEXIT
4936 +
4937 +#ifdef CONFIG_AUDITSYSCALL
4938 +sysenter_audit:
4939 + testw $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
4940 + jnz syscall_trace_entry
4941 + addl $4,%esp
4942 + CFI_ADJUST_CFA_OFFSET -4
4943 + /* %esi already in 8(%esp) 6th arg: 4th syscall arg */
4944 + /* %edx already in 4(%esp) 5th arg: 3rd syscall arg */
4945 + /* %ecx already in 0(%esp) 4th arg: 2nd syscall arg */
4946 + movl %ebx,%ecx /* 3rd arg: 1st syscall arg */
4947 + movl %eax,%edx /* 2nd arg: syscall number */
4948 + movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */
4949 + call audit_syscall_entry
4950 + pushl %ebx
4951 + CFI_ADJUST_CFA_OFFSET 4
4952 + movl PT_EAX(%esp),%eax /* reload syscall number */
4953 + jmp sysenter_do_call
4954 +
4955 +sysexit_audit:
4956 + testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx
4957 + jne syscall_exit_work
4958 + TRACE_IRQS_ON
4959 + ENABLE_INTERRUPTS(CLBR_ANY)
4960 + movl %eax,%edx /* second arg, syscall return value */
4961 + cmpl $0,%eax /* is it < 0? */
4962 + setl %al /* 1 if so, 0 if not */
4963 + movzbl %al,%eax /* zero-extend that */
4964 + inc %eax /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
4965 + call audit_syscall_exit
4966 + DISABLE_INTERRUPTS(CLBR_ANY)
4967 + TRACE_IRQS_OFF
4968 + movl TI_flags(%ebp), %ecx
4969 + testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx
4970 + jne syscall_exit_work
4971 + movl PT_EAX(%esp),%eax /* reload syscall return value */
4972 + jmp sysenter_exit
4973 +#endif
4974 +
4975 CFI_ENDPROC
4976 .pushsection .fixup,"ax"
4977 2: movl $0,PT_FS(%esp)
4978 @@ -400,7 +447,7 @@ ENTRY(system_call)
4979 CFI_ADJUST_CFA_OFFSET 4
4980 SAVE_ALL
4981 GET_THREAD_INFO(%ebp)
4982 - test_tif %ebp
4983 + testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
4984 jnz syscall_trace_entry
4985 cmpl $(nr_syscalls), %eax
4986 jae syscall_badsys
4987 @@ -413,10 +460,6 @@ syscall_exit:
4988 # setting need_resched or sigpending
4989 # between sampling and the iret
4990 TRACE_IRQS_OFF
4991 - testl $X86_EFLAGS_TF,PT_EFLAGS(%esp) # If tracing set singlestep flag on exit
4992 - jz no_singlestep
4993 - orl $_TIF_SINGLESTEP,TI_flags(%ebp)
4994 -no_singlestep:
4995 movl TI_flags(%ebp), %ecx
4996 testw $_TIF_ALLWORK_MASK, %cx # current->work
4997 jne syscall_exit_work
4998 @@ -588,12 +631,8 @@ END(work_pending)
4999 syscall_trace_entry:
5000 movl $-ENOSYS,PT_EAX(%esp)
5001 movl %esp, %eax
5002 - xorl %edx,%edx
5003 - call do_syscall_trace
5004 - cmpl $0, %eax
5005 - jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU,
5006 - # so must skip actual syscall
5007 - movl PT_ORIG_EAX(%esp), %eax
5008 + call syscall_trace_enter
5009 + /* What it returned is what we'll actually use. */
5010 cmpl $(nr_syscalls), %eax
5011 jnae syscall_call
5012 jmp syscall_exit
5013 @@ -602,14 +641,13 @@ END(syscall_trace_entry)
5014 # perform syscall exit tracing
5015 ALIGN
5016 syscall_exit_work:
5017 - testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl
5018 + testb $_TIF_WORK_SYSCALL_EXIT, %cl
5019 jz work_pending
5020 TRACE_IRQS_ON
5021 - ENABLE_INTERRUPTS(CLBR_ANY) # could let do_syscall_trace() call
5022 + ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call
5023 # schedule() instead
5024 movl %esp, %eax
5025 - movl $1, %edx
5026 - call do_syscall_trace
5027 + call syscall_trace_leave
5028 jmp resume_userspace
5029 END(syscall_exit_work)
5030 CFI_ENDPROC
5031 @@ -1113,10 +1151,10 @@ ENTRY(native_iret)
5032 .previous
5033 END(native_iret)
5034
5035 -ENTRY(native_irq_enable_syscall_ret)
5036 +ENTRY(native_irq_enable_sysexit)
5037 sti
5038 sysexit
5039 -END(native_irq_enable_syscall_ret)
5040 +END(native_irq_enable_sysexit)
5041 #endif
5042
5043 KPROBE_ENTRY(int3)
5044 @@ -1265,6 +1303,77 @@ ENTRY(kernel_thread_helper)
5045 CFI_ENDPROC
5046 ENDPROC(kernel_thread_helper)
5047
5048 +#ifdef CONFIG_FTRACE
5049 +#ifdef CONFIG_DYNAMIC_FTRACE
5050 +
5051 +ENTRY(mcount)
5052 + pushl %eax
5053 + pushl %ecx
5054 + pushl %edx
5055 + movl 0xc(%esp), %eax
5056 + subl $MCOUNT_INSN_SIZE, %eax
5057 +
5058 +.globl mcount_call
5059 +mcount_call:
5060 + call ftrace_stub
5061 +
5062 + popl %edx
5063 + popl %ecx
5064 + popl %eax
5065 +
5066 + ret
5067 +END(mcount)
5068 +
5069 +ENTRY(ftrace_caller)
5070 + pushl %eax
5071 + pushl %ecx
5072 + pushl %edx
5073 + movl 0xc(%esp), %eax
5074 + movl 0x4(%ebp), %edx
5075 + subl $MCOUNT_INSN_SIZE, %eax
5076 +
5077 +.globl ftrace_call
5078 +ftrace_call:
5079 + call ftrace_stub
5080 +
5081 + popl %edx
5082 + popl %ecx
5083 + popl %eax
5084 +
5085 +.globl ftrace_stub
5086 +ftrace_stub:
5087 + ret
5088 +END(ftrace_caller)
5089 +
5090 +#else /* ! CONFIG_DYNAMIC_FTRACE */
5091 +
5092 +ENTRY(mcount)
5093 + cmpl $ftrace_stub, ftrace_trace_function
5094 + jnz trace
5095 +.globl ftrace_stub
5096 +ftrace_stub:
5097 + ret
5098 +
5099 + /* taken from glibc */
5100 +trace:
5101 + pushl %eax
5102 + pushl %ecx
5103 + pushl %edx
5104 + movl 0xc(%esp), %eax
5105 + movl 0x4(%ebp), %edx
5106 + subl $MCOUNT_INSN_SIZE, %eax
5107 +
5108 + call *ftrace_trace_function
5109 +
5110 + popl %edx
5111 + popl %ecx
5112 + popl %eax
5113 +
5114 + jmp ftrace_stub
5115 +END(mcount)
5116 +#endif /* CONFIG_DYNAMIC_FTRACE */
5117 +#endif /* CONFIG_FTRACE */
5118 +
5119 #include <asm/alternative-asm.h>
5120
5121 # pv syscall call handler stub
5122 @@ -1290,7 +1399,7 @@ ENTRY(ia32pv_cstar_target)
5123 .previous
5124 SAVE_ALL
5125 GET_THREAD_INFO(%ebp)
5126 - test_tif %ebp
5127 + testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
5128 jnz cstar_trace_entry
5129 cmpl $nr_syscalls,%eax
5130 jae cstar_badsys
5131 @@ -1324,29 +1433,21 @@ cstar_trace_entry:
5132 btl %eax,cstar_special
5133 jc .Lcstar_trace_special
5134 1: movl %esp,%eax
5135 - xorl %edx,%edx
5136 LOCK_PREFIX
5137 orl $_TIF_CSTAR,TI_flags(%ebp)
5138 - call do_syscall_trace
5139 + call syscall_trace_enter
5140 LOCK_PREFIX
5141 andl $~_TIF_CSTAR,TI_flags(%ebp)
5142 - testl %eax,%eax
5143 - jne .Lcstar_resume # ret != 0 -> running under PTRACE_SYSEMU,
5144 - # so must skip actual syscall
5145 - movl PT_ORIG_EAX(%esp),%eax
5146 + /* What it returned is what we'll actually use. */
5147 cmpl $nr_syscalls,%eax
5148 jb .Lcstar_call
5149 jmp .Lcstar_exit
5150 .Lcstar_trace_special:
5151 movl PT_ECX(%esp),%ecx
5152 movl %esp,%eax
5153 - xorl %edx,%edx
5154 movl %ecx,PT_EBP(%esp) # put user EBP back in place
5155 - call do_syscall_trace
5156 - testl %eax,%eax
5157 - jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU,
5158 - # so must skip actual syscall
5159 - movl PT_ORIG_EAX(%esp),%eax
5160 + call syscall_trace_enter
5161 + /* What it returned is what we'll actually use. */
5162 cmpl $nr_syscalls,%eax
5163 jb syscall_call
5164 jmp syscall_exit
5165 --- sle11-2009-06-04.orig/arch/x86/kernel/entry_64.S 2009-06-04 00:00:00.000000000 +0200
5166 +++ sle11-2009-06-04/arch/x86/kernel/entry_64.S 2009-06-04 10:21:39.000000000 +0200
5167 @@ -1409,7 +1409,7 @@ ENTRY(arch_unwind_init_running)
5168 ENDPROC(arch_unwind_init_running)
5169 #endif
5170
5171 -#ifdef CONFIG_XEN
5172 +#ifdef CONFIG_PARAVIRT_XEN
5173 ENTRY(xen_hypervisor_callback)
5174 zeroentry xen_do_hypervisor_callback
5175 END(xen_hypervisor_callback)
5176 @@ -1507,7 +1507,7 @@ ENTRY(xen_failsafe_callback)
5177 CFI_ENDPROC
5178 END(xen_failsafe_callback)
5179
5180 -#endif /* CONFIG_XEN */
5181 +#endif /* CONFIG_PARAVIRT_XEN */
5182
5183 #ifdef CONFIG_KDB
5184
5185 --- sle11-2009-06-04.orig/arch/x86/kernel/entry_64-xen.S 2009-03-16 16:38:05.000000000 +0100
5186 +++ sle11-2009-06-04/arch/x86/kernel/entry_64-xen.S 2009-06-04 10:21:39.000000000 +0200
5187 @@ -53,19 +53,130 @@
5188 #include <asm/hw_irq.h>
5189 #include <asm/page.h>
5190 #include <asm/irqflags.h>
5191 +#include <asm/ftrace.h>
5192 #include <asm/errno.h>
5193 #include <xen/interface/xen.h>
5194 #include <xen/interface/features.h>
5195
5196 +/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
5197 +#include <linux/elf-em.h>
5198 +#define AUDIT_ARCH_X86_64 (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
5199 +#define __AUDIT_ARCH_64BIT 0x80000000
5200 +#define __AUDIT_ARCH_LE 0x40000000
5201 +
5202 .code64
5203
5204 +#ifdef CONFIG_FTRACE
5205 +#ifdef CONFIG_DYNAMIC_FTRACE
5206 +ENTRY(mcount)
5207 +
5208 + subq $0x38, %rsp
5209 + movq %rax, (%rsp)
5210 + movq %rcx, 8(%rsp)
5211 + movq %rdx, 16(%rsp)
5212 + movq %rsi, 24(%rsp)
5213 + movq %rdi, 32(%rsp)
5214 + movq %r8, 40(%rsp)
5215 + movq %r9, 48(%rsp)
5216 +
5217 + movq 0x38(%rsp), %rdi
5218 + subq $MCOUNT_INSN_SIZE, %rdi
5219 +
5220 +.globl mcount_call
5221 +mcount_call:
5222 + call ftrace_stub
5223 +
5224 + movq 48(%rsp), %r9
5225 + movq 40(%rsp), %r8
5226 + movq 32(%rsp), %rdi
5227 + movq 24(%rsp), %rsi
5228 + movq 16(%rsp), %rdx
5229 + movq 8(%rsp), %rcx
5230 + movq (%rsp), %rax
5231 + addq $0x38, %rsp
5232 +
5233 + retq
5234 +END(mcount)
5235 +
5236 +ENTRY(ftrace_caller)
5237 +
5238 + /* taken from glibc */
5239 + subq $0x38, %rsp
5240 + movq %rax, (%rsp)
5241 + movq %rcx, 8(%rsp)
5242 + movq %rdx, 16(%rsp)
5243 + movq %rsi, 24(%rsp)
5244 + movq %rdi, 32(%rsp)
5245 + movq %r8, 40(%rsp)
5246 + movq %r9, 48(%rsp)
5247 +
5248 + movq 0x38(%rsp), %rdi
5249 + movq 8(%rbp), %rsi
5250 + subq $MCOUNT_INSN_SIZE, %rdi
5251 +
5252 +.globl ftrace_call
5253 +ftrace_call:
5254 + call ftrace_stub
5255 +
5256 + movq 48(%rsp), %r9
5257 + movq 40(%rsp), %r8
5258 + movq 32(%rsp), %rdi
5259 + movq 24(%rsp), %rsi
5260 + movq 16(%rsp), %rdx
5261 + movq 8(%rsp), %rcx
5262 + movq (%rsp), %rax
5263 + addq $0x38, %rsp
5264 +
5265 +.globl ftrace_stub
5266 +ftrace_stub:
5267 + retq
5268 +END(ftrace_caller)
5269 +
5270 +#else /* ! CONFIG_DYNAMIC_FTRACE */
5271 +ENTRY(mcount)
5272 + cmpq $ftrace_stub, ftrace_trace_function
5273 + jnz trace
5274 +.globl ftrace_stub
5275 +ftrace_stub:
5276 + retq
5277 +
5278 +trace:
5279 + /* taken from glibc */
5280 + subq $0x38, %rsp
5281 + movq %rax, (%rsp)
5282 + movq %rcx, 8(%rsp)
5283 + movq %rdx, 16(%rsp)
5284 + movq %rsi, 24(%rsp)
5285 + movq %rdi, 32(%rsp)
5286 + movq %r8, 40(%rsp)
5287 + movq %r9, 48(%rsp)
5288 +
5289 + movq 0x38(%rsp), %rdi
5290 + movq 8(%rbp), %rsi
5291 + subq $MCOUNT_INSN_SIZE, %rdi
5292 +
5293 + call *ftrace_trace_function
5294 +
5295 + movq 48(%rsp), %r9
5296 + movq 40(%rsp), %r8
5297 + movq 32(%rsp), %rdi
5298 + movq 24(%rsp), %rsi
5299 + movq 16(%rsp), %rdx
5300 + movq 8(%rsp), %rcx
5301 + movq (%rsp), %rax
5302 + addq $0x38, %rsp
5303 +
5304 + jmp ftrace_stub
5305 +END(mcount)
5306 +#endif /* CONFIG_DYNAMIC_FTRACE */
5307 +#endif /* CONFIG_FTRACE */
5308 +
5309 #ifndef CONFIG_PREEMPT
5310 #define retint_kernel retint_restore_args
5311 #endif
5312
5313 #ifdef CONFIG_PARAVIRT
5314 -ENTRY(native_irq_enable_syscall_ret)
5315 - movq %gs:pda_oldrsp,%rsp
5316 +ENTRY(native_usergs_sysret64)
5317 swapgs
5318 sysretq
5319 #endif /* CONFIG_PARAVIRT */
5320 @@ -102,7 +213,7 @@ NMI_MASK = 0x80000000
5321 .macro FAKE_STACK_FRAME child_rip
5322 /* push in order ss, rsp, eflags, cs, rip */
5323 xorl %eax, %eax
5324 - pushq %rax /* ss */
5325 + pushq $__KERNEL_DS /* ss */
5326 CFI_ADJUST_CFA_OFFSET 8
5327 /*CFI_REL_OFFSET ss,0*/
5328 pushq %rax /* rsp */
5329 @@ -197,13 +308,13 @@ ENTRY(ret_from_fork)
5330 CFI_ADJUST_CFA_OFFSET -4
5331 call schedule_tail
5332 GET_THREAD_INFO(%rcx)
5333 - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx)
5334 + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
5335 jnz rff_trace
5336 rff_action:
5337 RESTORE_REST
5338 testl $3,CS-ARGOFFSET(%rsp) # from kernel_thread?
5339 je int_ret_from_sys_call
5340 - testl $_TIF_IA32,threadinfo_flags(%rcx)
5341 + testl $_TIF_IA32,TI_flags(%rcx)
5342 jnz int_ret_from_sys_call
5343 RESTORE_TOP_OF_STACK %rdi,ARGOFFSET
5344 jmp ret_from_sys_call
5345 @@ -265,8 +376,9 @@ ENTRY(system_call)
5346 SAVE_ARGS -8,0
5347 movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
5348 GET_THREAD_INFO(%rcx)
5349 - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx)
5350 + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx)
5351 jnz tracesys
5352 +system_call_fastpath:
5353 cmpq $__NR_syscall_max,%rax
5354 ja badsys
5355 movq %r10,%rcx
5356 @@ -284,7 +396,7 @@ sysret_check:
5357 GET_THREAD_INFO(%rcx)
5358 DISABLE_INTERRUPTS(CLBR_NONE)
5359 TRACE_IRQS_OFF
5360 - movl threadinfo_flags(%rcx),%edx
5361 + movl TI_flags(%rcx),%edx
5362 andl %edi,%edx
5363 jnz sysret_careful
5364 CFI_REMEMBER_STATE
5365 @@ -315,16 +427,16 @@ sysret_careful:
5366 sysret_signal:
5367 TRACE_IRQS_ON
5368 ENABLE_INTERRUPTS(CLBR_NONE)
5369 - testl $_TIF_DO_NOTIFY_MASK,%edx
5370 - jz 1f
5371 -
5372 - /* Really a signal */
5373 +#ifdef CONFIG_AUDITSYSCALL
5374 + bt $TIF_SYSCALL_AUDIT,%edx
5375 + jc sysret_audit
5376 +#endif
5377 /* edx: work flags (arg3) */
5378 leaq do_notify_resume(%rip),%rax
5379 leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1
5380 xorl %esi,%esi # oldset -> arg2
5381 call ptregscall_common
5382 -1: movl $_TIF_NEED_RESCHED,%edi
5383 + movl $_TIF_WORK_MASK,%edi
5384 /* Use IRET because user could have changed frame. This
5385 works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
5386 DISABLE_INTERRUPTS(CLBR_NONE)
5387 @@ -335,14 +447,56 @@ badsys:
5388 movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
5389 jmp ret_from_sys_call
5390
5391 +#ifdef CONFIG_AUDITSYSCALL
5392 + /*
5393 + * Fast path for syscall audit without full syscall trace.
5394 + * We just call audit_syscall_entry() directly, and then
5395 + * jump back to the normal fast path.
5396 + */
5397 +auditsys:
5398 + movq %r10,%r9 /* 6th arg: 4th syscall arg */
5399 + movq %rdx,%r8 /* 5th arg: 3rd syscall arg */
5400 + movq %rsi,%rcx /* 4th arg: 2nd syscall arg */
5401 + movq %rdi,%rdx /* 3rd arg: 1st syscall arg */
5402 + movq %rax,%rsi /* 2nd arg: syscall number */
5403 + movl $AUDIT_ARCH_X86_64,%edi /* 1st arg: audit arch */
5404 + call audit_syscall_entry
5405 + LOAD_ARGS 0 /* reload call-clobbered registers */
5406 + jmp system_call_fastpath
5407 +
5408 + /*
5409 + * Return fast path for syscall audit. Call audit_syscall_exit()
5410 + * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
5411 + * masked off.
5412 + */
5413 +sysret_audit:
5414 + movq %rax,%rsi /* second arg, syscall return value */
5415 + cmpq $0,%rax /* is it < 0? */
5416 + setl %al /* 1 if so, 0 if not */
5417 + movzbl %al,%edi /* zero-extend that into %edi */
5418 + inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
5419 + call audit_syscall_exit
5420 + movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
5421 + jmp sysret_check
5422 +#endif /* CONFIG_AUDITSYSCALL */
5423 +
5424 /* Do syscall tracing */
5425 tracesys:
5426 +#ifdef CONFIG_AUDITSYSCALL
5427 + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
5428 + jz auditsys
5429 +#endif
5430 SAVE_REST
5431 movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
5432 FIXUP_TOP_OF_STACK %rdi
5433 movq %rsp,%rdi
5434 call syscall_trace_enter
5435 - LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */
5436 + /*
5437 + * Reload arg registers from stack in case ptrace changed them.
5438 + * We don't reload %rax because syscall_trace_enter() returned
5439 + * the value it wants us to use in the table lookup.
5440 + */
5441 + LOAD_ARGS ARGOFFSET, 1
5442 RESTORE_REST
5443 cmpq $__NR_syscall_max,%rax
5444 ja int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */
5445 @@ -356,6 +510,7 @@ tracesys:
5446 * Has correct top of stack, but partial stack frame.
5447 */
5448 .globl int_ret_from_sys_call
5449 + .globl int_with_check
5450 int_ret_from_sys_call:
5451 DISABLE_INTERRUPTS(CLBR_NONE)
5452 TRACE_IRQS_OFF
5453 @@ -370,10 +525,10 @@ int_ret_from_sys_call:
5454 int_with_check:
5455 LOCKDEP_SYS_EXIT_IRQ
5456 GET_THREAD_INFO(%rcx)
5457 - movl threadinfo_flags(%rcx),%edx
5458 + movl TI_flags(%rcx),%edx
5459 andl %edi,%edx
5460 jnz int_careful
5461 - andl $~TS_COMPAT,threadinfo_status(%rcx)
5462 + andl $~TS_COMPAT,TI_status(%rcx)
5463 jmp retint_restore_args
5464
5465 /* Either reschedule or signal or syscall exit tracking needed. */
5466 @@ -399,7 +554,7 @@ int_very_careful:
5467 ENABLE_INTERRUPTS(CLBR_NONE)
5468 SAVE_REST
5469 /* Check for syscall exit trace */
5470 - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx
5471 + testl $_TIF_WORK_SYSCALL_EXIT,%edx
5472 jz int_signal
5473 pushq %rdi
5474 CFI_ADJUST_CFA_OFFSET 8
5475 @@ -407,7 +562,7 @@ int_very_careful:
5476 call syscall_trace_leave
5477 popq %rdi
5478 CFI_ADJUST_CFA_OFFSET -8
5479 - andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi
5480 + andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
5481 jmp int_restore_rest
5482
5483 int_signal:
5484 @@ -416,7 +571,7 @@ int_signal:
5485 movq %rsp,%rdi # &ptregs -> arg1
5486 xorl %esi,%esi # oldset -> arg2
5487 call do_notify_resume
5488 -1: movl $_TIF_NEED_RESCHED,%edi
5489 +1: movl $_TIF_WORK_MASK,%edi
5490 int_restore_rest:
5491 RESTORE_REST
5492 DISABLE_INTERRUPTS(CLBR_NONE)
5493 @@ -443,7 +598,6 @@ END(\label)
5494 PTREGSCALL stub_clone, sys_clone, %r8
5495 PTREGSCALL stub_fork, sys_fork, %rdi
5496 PTREGSCALL stub_vfork, sys_vfork, %rdi
5497 - PTREGSCALL stub_rt_sigsuspend, sys_rt_sigsuspend, %rdx
5498 PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx
5499 PTREGSCALL stub_iopl, sys_iopl, %rsi
5500
5501 @@ -517,10 +671,12 @@ END(stub_rt_sigreturn)
5502 *
5503 */
5504
5505 -retint_check:
5506 +retint_with_reschedule:
5507 CFI_DEFAULT_STACK adj=1
5508 + movl $_TIF_WORK_MASK,%edi
5509 +retint_check:
5510 LOCKDEP_SYS_EXIT_IRQ
5511 - movl threadinfo_flags(%rcx),%edx
5512 + movl TI_flags(%rcx),%edx
5513 andl %edi,%edx
5514 CFI_REMEMBER_STATE
5515 jnz retint_careful
5516 @@ -565,17 +721,16 @@ retint_signal:
5517 RESTORE_REST
5518 DISABLE_INTERRUPTS(CLBR_NONE)
5519 TRACE_IRQS_OFF
5520 - movl $_TIF_NEED_RESCHED,%edi
5521 GET_THREAD_INFO(%rcx)
5522 - jmp retint_check
5523 + jmp retint_with_reschedule
5524
5525 #ifdef CONFIG_PREEMPT
5526 /* Returning to kernel space. Check if we need preemption */
5527 /* rcx: threadinfo. interrupts off. */
5528 ENTRY(retint_kernel)
5529 - cmpl $0,threadinfo_preempt_count(%rcx)
5530 + cmpl $0,TI_preempt_count(%rcx)
5531 jnz retint_restore_args
5532 - bt $TIF_NEED_RESCHED,threadinfo_flags(%rcx)
5533 + bt $TIF_NEED_RESCHED,TI_flags(%rcx)
5534 jnc retint_restore_args
5535 bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */
5536 jnc retint_restore_args
5537 @@ -630,6 +785,9 @@ END(invalidate_interrupt\num)
5538 ENTRY(call_function_interrupt)
5539 apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt
5540 END(call_function_interrupt)
5541 +ENTRY(call_function_single_interrupt)
5542 + apicinterrupt CALL_FUNCTION_SINGLE_VECTOR,smp_call_function_single_interrupt
5543 +END(call_function_single_interrupt)
5544 ENTRY(irq_move_cleanup_interrupt)
5545 apicinterrupt IRQ_MOVE_CLEANUP_VECTOR,smp_irq_move_cleanup_interrupt
5546 END(irq_move_cleanup_interrupt)
5547 @@ -639,6 +797,10 @@ ENTRY(apic_timer_interrupt)
5548 apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt
5549 END(apic_timer_interrupt)
5550
5551 +ENTRY(uv_bau_message_intr1)
5552 + apicinterrupt 220,uv_bau_message_interrupt
5553 +END(uv_bau_message_intr1)
5554 +
5555 ENTRY(error_interrupt)
5556 apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt
5557 END(error_interrupt)
5558 @@ -752,7 +914,7 @@ paranoid_restore\trace:
5559 jmp irq_return
5560 paranoid_userspace\trace:
5561 GET_THREAD_INFO(%rcx)
5562 - movl threadinfo_flags(%rcx),%ebx
5563 + movl TI_flags(%rcx),%ebx
5564 andl $_TIF_WORK_MASK,%ebx
5565 jz paranoid_swapgs\trace
5566 movq %rsp,%rdi /* &pt_regs */
5567 @@ -849,7 +1011,7 @@ error_exit:
5568 testb $3,CS-ARGOFFSET(%rsp)
5569 jz retint_kernel
5570 LOCKDEP_SYS_EXIT_IRQ
5571 - movl threadinfo_flags(%rcx),%edx
5572 + movl TI_flags(%rcx),%edx
5573 movl $_TIF_WORK_MASK,%edi
5574 andl %edi,%edx
5575 jnz retint_careful
5576 @@ -871,11 +1033,11 @@ error_kernelspace:
5577 iret run with kernel gs again, so don't set the user space flag.
5578 B stepping K8s sometimes report an truncated RIP for IRET
5579 exceptions returning to compat mode. Check for these here too. */
5580 - leaq irq_return(%rip),%rbp
5581 - cmpq %rbp,RIP(%rsp)
5582 + leaq irq_return(%rip),%rcx
5583 + cmpq %rcx,RIP(%rsp)
5584 je error_swapgs
5585 - movl %ebp,%ebp /* zero extend */
5586 - cmpq %rbp,RIP(%rsp)
5587 + movl %ecx,%ecx /* zero extend */
5588 + cmpq %rcx,RIP(%rsp)
5589 je error_swapgs
5590 cmpq $gs_change,RIP(%rsp)
5591 je error_swapgs
5592 @@ -1121,6 +1283,7 @@ END(device_not_available)
5593 /* runs on exception stack */
5594 KPROBE_ENTRY(debug)
5595 /* INTR_FRAME
5596 + PARAVIRT_ADJUST_EXCEPTION_FRAME
5597 pushq $0
5598 CFI_ADJUST_CFA_OFFSET 8 */
5599 zeroentry do_debug
5600 @@ -1148,6 +1311,7 @@ END(do_nmi_callback)
5601
5602 KPROBE_ENTRY(int3)
5603 /* INTR_FRAME
5604 + PARAVIRT_ADJUST_EXCEPTION_FRAME
5605 pushq $0
5606 CFI_ADJUST_CFA_OFFSET 8 */
5607 zeroentry do_int3
5608 @@ -1171,14 +1335,11 @@ ENTRY(coprocessor_segment_overrun)
5609 zeroentry do_coprocessor_segment_overrun
5610 END(coprocessor_segment_overrun)
5611
5612 -ENTRY(reserved)
5613 - zeroentry do_reserved
5614 -END(reserved)
5615 -
5616 #if 0
5617 /* runs on exception stack */
5618 ENTRY(double_fault)
5619 XCPT_FRAME
5620 + PARAVIRT_ADJUST_EXCEPTION_FRAME
5621 paranoidentry do_double_fault
5622 jmp paranoid_exit1
5623 CFI_ENDPROC
5624 @@ -1196,6 +1357,7 @@ END(segment_not_present)
5625 /* runs on exception stack */
5626 ENTRY(stack_segment)
5627 /* XCPT_FRAME
5628 + PARAVIRT_ADJUST_EXCEPTION_FRAME
5629 paranoidentry do_stack_segment */
5630 errorentry do_stack_segment
5631 /* jmp paranoid_exit1
5632 @@ -1222,6 +1384,7 @@ END(spurious_interrupt_bug)
5633 /* runs on exception stack */
5634 ENTRY(machine_check)
5635 INTR_FRAME
5636 + PARAVIRT_ADJUST_EXCEPTION_FRAME
5637 pushq $0
5638 CFI_ADJUST_CFA_OFFSET 8
5639 paranoidentry do_machine_check
5640 --- sle11-2009-06-04.orig/arch/x86/kernel/fixup.c 2009-03-16 16:33:40.000000000 +0100
5641 +++ sle11-2009-06-04/arch/x86/kernel/fixup.c 2009-06-04 10:21:39.000000000 +0200
5642 @@ -33,6 +33,7 @@
5643 #include <linux/kernel.h>
5644 #include <linux/delay.h>
5645 #include <linux/version.h>
5646 +#include <asm/traps.h>
5647
5648 #define DP(_f, _args...) printk(KERN_ALERT " " _f "\n" , ## _args )
5649
5650 --- sle11-2009-06-04.orig/arch/x86/kernel/genapic_64-xen.c 2009-03-16 16:38:05.000000000 +0100
5651 +++ sle11-2009-06-04/arch/x86/kernel/genapic_64-xen.c 2009-06-04 10:21:39.000000000 +0200
5652 @@ -58,7 +58,7 @@ void __init setup_apic_routing(void)
5653 else
5654 #endif
5655
5656 - if (num_possible_cpus() <= 8)
5657 + if (max_physical_apicid < 8)
5658 genapic = &apic_flat;
5659 else
5660 genapic = &apic_physflat;
5661 @@ -121,4 +121,5 @@ int is_uv_system(void)
5662 {
5663 return uv_system_type != UV_NONE;
5664 }
5665 +EXPORT_SYMBOL_GPL(is_uv_system);
5666 #endif
5667 --- sle11-2009-06-04.orig/arch/x86/kernel/genapic_xen_64.c 2009-03-16 16:38:05.000000000 +0100
5668 +++ sle11-2009-06-04/arch/x86/kernel/genapic_xen_64.c 2009-06-04 10:21:39.000000000 +0200
5669 @@ -43,7 +43,7 @@ void xen_send_IPI_shortcut(unsigned int
5670 __send_IPI_one(smp_processor_id(), vector);
5671 break;
5672 case APIC_DEST_ALLBUT:
5673 - for (cpu = 0; cpu < NR_CPUS; ++cpu) {
5674 + for_each_possible_cpu(cpu) {
5675 if (cpu == smp_processor_id())
5676 continue;
5677 if (cpu_isset(cpu, cpu_online_map)) {
5678 @@ -52,7 +52,7 @@ void xen_send_IPI_shortcut(unsigned int
5679 }
5680 break;
5681 case APIC_DEST_ALLINC:
5682 - for (cpu = 0; cpu < NR_CPUS; ++cpu) {
5683 + for_each_possible_cpu(cpu) {
5684 if (cpu_isset(cpu, cpu_online_map)) {
5685 __send_IPI_one(cpu, vector);
5686 }
5687 @@ -81,8 +81,6 @@ static cpumask_t xen_vector_allocation_d
5688 */
5689 static void xen_init_apic_ldr(void)
5690 {
5691 - Dprintk("%s\n", __FUNCTION__);
5692 - return;
5693 }
5694
5695 static void xen_send_IPI_allbutself(int vector)
5696 @@ -92,14 +90,12 @@ static void xen_send_IPI_allbutself(int
5697 * we get an APIC send error if we try to broadcast.
5698 * thus we have to avoid sending IPIs in this case.
5699 */
5700 - Dprintk("%s\n", __FUNCTION__);
5701 if (num_online_cpus() > 1)
5702 xen_send_IPI_shortcut(APIC_DEST_ALLBUT, vector, APIC_DEST_LOGICAL);
5703 }
5704
5705 static void xen_send_IPI_all(int vector)
5706 {
5707 - Dprintk("%s\n", __FUNCTION__);
5708 xen_send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL);
5709 }
5710
5711 @@ -109,11 +105,10 @@ static void xen_send_IPI_mask(cpumask_t
5712 unsigned int cpu;
5713 unsigned long flags;
5714
5715 - Dprintk("%s\n", __FUNCTION__);
5716 local_irq_save(flags);
5717 WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]);
5718
5719 - for (cpu = 0; cpu < NR_CPUS; ++cpu) {
5720 + for_each_possible_cpu(cpu) {
5721 if (cpu_isset(cpu, cpumask)) {
5722 __send_IPI_one(cpu, vector);
5723 }
5724 @@ -125,14 +120,12 @@ static void xen_send_IPI_mask(cpumask_t
5725 static int xen_apic_id_registered(void)
5726 {
5727 /* better be set */
5728 - Dprintk("%s\n", __FUNCTION__);
5729 return physid_isset(smp_processor_id(), phys_cpu_present_map);
5730 }
5731 #endif
5732
5733 static unsigned int xen_cpu_mask_to_apicid(cpumask_t cpumask)
5734 {
5735 - Dprintk("%s\n", __FUNCTION__);
5736 return cpus_addr(cpumask)[0] & APIC_ALL_CPUS;
5737 }
5738
5739 @@ -140,7 +133,6 @@ static unsigned int phys_pkg_id(int inde
5740 {
5741 u32 ebx;
5742
5743 - Dprintk("%s\n", __FUNCTION__);
5744 ebx = cpuid_ebx(1);
5745 return ((ebx >> 24) & 0xFF) >> index_msb;
5746 }
5747 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
5748 +++ sle11-2009-06-04/arch/x86/kernel/head-xen.c 2009-06-04 10:21:39.000000000 +0200
5749 @@ -0,0 +1,57 @@
5750 +#include <linux/kernel.h>
5751 +#include <linux/init.h>
5752 +
5753 +#include <asm/setup.h>
5754 +#include <asm/bios_ebda.h>
5755 +
5756 +#define BIOS_LOWMEM_KILOBYTES 0x413
5757 +
5758 +/*
5759 + * The BIOS places the EBDA/XBDA at the top of conventional
5760 + * memory, and usually decreases the reported amount of
5761 + * conventional memory (int 0x12) too. This also contains a
5762 + * workaround for Dell systems that neglect to reserve EBDA.
5763 + * The same workaround also avoids a problem with the AMD768MPX
5764 + * chipset: reserve a page before VGA to prevent PCI prefetch
5765 + * into it (errata #56). Usually the page is reserved anyways,
5766 + * unless you have no PS/2 mouse plugged in.
5767 + */
5768 +void __init reserve_ebda_region(void)
5769 +{
5770 +#ifndef CONFIG_XEN
5771 + unsigned int lowmem, ebda_addr;
5772 +
5773 + /* To determine the position of the EBDA and the */
5774 + /* end of conventional memory, we need to look at */
5775 + /* the BIOS data area. In a paravirtual environment */
5776 + /* that area is absent. We'll just have to assume */
5777 + /* that the paravirt case can handle memory setup */
5778 + /* correctly, without our help. */
5779 + if (paravirt_enabled())
5780 + return;
5781 +
5782 + /* end of low (conventional) memory */
5783 + lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
5784 + lowmem <<= 10;
5785 +
5786 + /* start of EBDA area */
5787 + ebda_addr = get_bios_ebda();
5788 +
5789 + /* Fixup: bios puts an EBDA in the top 64K segment */
5790 + /* of conventional memory, but does not adjust lowmem. */
5791 + if ((lowmem - ebda_addr) <= 0x10000)
5792 + lowmem = ebda_addr;
5793 +
5794 + /* Fixup: bios does not report an EBDA at all. */
5795 + /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
5796 + if ((ebda_addr == 0) && (lowmem >= 0x9f000))
5797 + lowmem = 0x9f000;
5798 +
5799 + /* Paranoia: should never happen, but... */
5800 + if ((lowmem == 0) || (lowmem >= 0x100000))
5801 + lowmem = 0x9f000;
5802 +
5803 + /* reserve all memory between lowmem and the 1MB mark */
5804 + reserve_early_overlap_ok(lowmem, 0x100000, "BIOS reserved");
5805 +#endif
5806 +}
5807 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
5808 +++ sle11-2009-06-04/arch/x86/kernel/head32-xen.c 2009-06-04 10:21:39.000000000 +0200
5809 @@ -0,0 +1,57 @@
5810 +/*
5811 + * linux/arch/i386/kernel/head32.c -- prepare to run common code
5812 + *
5813 + * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
5814 + * Copyright (C) 2007 Eric Biederman <ebiederm@xmission.com>
5815 + */
5816 +
5817 +#include <linux/init.h>
5818 +#include <linux/start_kernel.h>
5819 +
5820 +#include <asm/setup.h>
5821 +#include <asm/sections.h>
5822 +#include <asm/e820.h>
5823 +#include <asm/bios_ebda.h>
5824 +
5825 +void __init i386_start_kernel(void)
5826 +{
5827 + reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
5828 +
5829 +#ifndef CONFIG_XEN
5830 +#ifdef CONFIG_BLK_DEV_INITRD
5831 + /* Reserve INITRD */
5832 + if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
5833 + u64 ramdisk_image = boot_params.hdr.ramdisk_image;
5834 + u64 ramdisk_size = boot_params.hdr.ramdisk_size;
5835 + u64 ramdisk_end = ramdisk_image + ramdisk_size;
5836 + reserve_early(ramdisk_image, ramdisk_end, "RAMDISK");
5837 + }
5838 +#endif
5839 + reserve_early(init_pg_tables_start, init_pg_tables_end,
5840 + "INIT_PG_TABLE");
5841 +#else
5842 + reserve_early(ALIGN(__pa_symbol(&_end), PAGE_SIZE),
5843 + __pa(xen_start_info->pt_base)
5844 + + (xen_start_info->nr_pt_frames << PAGE_SHIFT),
5845 + "Xen provided");
5846 +
5847 + {
5848 + int max_cmdline;
5849 +
5850 + if ((max_cmdline = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE)
5851 + max_cmdline = COMMAND_LINE_SIZE;
5852 + memcpy(boot_command_line, xen_start_info->cmd_line, max_cmdline);
5853 + boot_command_line[max_cmdline-1] = '\0';
5854 + }
5855 +#endif
5856 +
5857 + reserve_ebda_region();
5858 +
5859 + /*
5860 + * At this point everything still needed from the boot loader
5861 + * or BIOS or kernel text should be early reserved or marked not
5862 + * RAM in e820. All other memory is free game.
5863 + */
5864 +
5865 + start_kernel();
5866 +}
5867 --- sle11-2009-06-04.orig/arch/x86/kernel/head64-xen.c 2009-03-16 16:38:05.000000000 +0100
5868 +++ sle11-2009-06-04/arch/x86/kernel/head64-xen.c 2009-06-04 10:21:39.000000000 +0200
5869 @@ -32,7 +32,26 @@
5870 #include <asm/e820.h>
5871 #include <asm/bios_ebda.h>
5872
5873 -unsigned long start_pfn;
5874 +/* boot cpu pda */
5875 +static struct x8664_pda _boot_cpu_pda __read_mostly;
5876 +
5877 +#ifdef CONFIG_SMP
5878 +/*
5879 + * We install an empty cpu_pda pointer table to indicate to early users
5880 + * (numa_set_node) that the cpu_pda pointer table for cpus other than
5881 + * the boot cpu is not yet setup.
5882 + */
5883 +static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata;
5884 +#else
5885 +static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly;
5886 +#endif
5887 +
5888 +void __init x86_64_init_pda(void)
5889 +{
5890 + _cpu_pda = __cpu_pda;
5891 + cpu_pda(0) = &_boot_cpu_pda;
5892 + pda_init(0);
5893 +}
5894
5895 #ifndef CONFIG_XEN
5896 static void __init zap_identity_mappings(void)
5897 @@ -77,83 +96,10 @@ EXPORT_SYMBOL(machine_to_phys_mapping);
5898 unsigned int machine_to_phys_order;
5899 EXPORT_SYMBOL(machine_to_phys_order);
5900
5901 -#define BIOS_LOWMEM_KILOBYTES 0x413
5902 -
5903 -/*
5904 - * The BIOS places the EBDA/XBDA at the top of conventional
5905 - * memory, and usually decreases the reported amount of
5906 - * conventional memory (int 0x12) too. This also contains a
5907 - * workaround for Dell systems that neglect to reserve EBDA.
5908 - * The same workaround also avoids a problem with the AMD768MPX
5909 - * chipset: reserve a page before VGA to prevent PCI prefetch
5910 - * into it (errata #56). Usually the page is reserved anyways,
5911 - * unless you have no PS/2 mouse plugged in.
5912 - */
5913 -static void __init reserve_ebda_region(void)
5914 -{
5915 -#ifndef CONFIG_XEN
5916 - unsigned int lowmem, ebda_addr;
5917 -
5918 - /* To determine the position of the EBDA and the */
5919 - /* end of conventional memory, we need to look at */
5920 - /* the BIOS data area. In a paravirtual environment */
5921 - /* that area is absent. We'll just have to assume */
5922 - /* that the paravirt case can handle memory setup */
5923 - /* correctly, without our help. */
5924 - if (paravirt_enabled())
5925 - return;
5926 -
5927 - /* end of low (conventional) memory */
5928 - lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
5929 - lowmem <<= 10;
5930 -
5931 - /* start of EBDA area */
5932 - ebda_addr = get_bios_ebda();
5933 -
5934 - /* Fixup: bios puts an EBDA in the top 64K segment */
5935 - /* of conventional memory, but does not adjust lowmem. */
5936 - if ((lowmem - ebda_addr) <= 0x10000)
5937 - lowmem = ebda_addr;
5938 -
5939 - /* Fixup: bios does not report an EBDA at all. */
5940 - /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
5941 - if ((ebda_addr == 0) && (lowmem >= 0x9f000))
5942 - lowmem = 0x9f000;
5943 -
5944 - /* Paranoia: should never happen, but... */
5945 - if ((lowmem == 0) || (lowmem >= 0x100000))
5946 - lowmem = 0x9f000;
5947 -
5948 - /* reserve all memory between lowmem and the 1MB mark */
5949 - reserve_early(lowmem, 0x100000, "BIOS reserved");
5950 -#endif
5951 -}
5952 -
5953 -static void __init reserve_setup_data(void)
5954 -{
5955 -#ifndef CONFIG_XEN
5956 - struct setup_data *data;
5957 - unsigned long pa_data;
5958 - char buf[32];
5959 -
5960 - if (boot_params.hdr.version < 0x0209)
5961 - return;
5962 - pa_data = boot_params.hdr.setup_data;
5963 - while (pa_data) {
5964 - data = early_ioremap(pa_data, sizeof(*data));
5965 - sprintf(buf, "setup data %x", data->type);
5966 - reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf);
5967 - pa_data = data->next;
5968 - early_iounmap(data, sizeof(*data));
5969 - }
5970 -#endif
5971 -}
5972 -
5973 void __init x86_64_start_kernel(char * real_mode_data)
5974 {
5975 struct xen_machphys_mapping mapping;
5976 unsigned long machine_to_phys_nr_ents;
5977 - int i;
5978
5979 /*
5980 * Build-time sanity checks on the kernel image and module
5981 @@ -167,6 +113,7 @@ void __init x86_64_start_kernel(char * r
5982 BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
5983 BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
5984 (__START_KERNEL & PGDIR_MASK)));
5985 + BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);
5986
5987 xen_setup_features();
5988
5989 @@ -174,8 +121,6 @@ void __init x86_64_start_kernel(char * r
5990 if (!xen_feature(XENFEAT_auto_translated_physmap))
5991 phys_to_machine_mapping =
5992 (unsigned long *)xen_start_info->mfn_list;
5993 - start_pfn = (__pa(xen_start_info->pt_base) >> PAGE_SHIFT) +
5994 - xen_start_info->nr_pt_frames;
5995
5996 machine_to_phys_mapping = (unsigned long *)MACH2PHYS_VIRT_START;
5997 machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES;
5998 @@ -208,19 +153,23 @@ void __init x86_64_start_kernel(char * r
5999
6000 early_printk("Kernel alive\n");
6001
6002 - for (i = 0; i < NR_CPUS; i++)
6003 - cpu_pda(i) = &boot_cpu_pda[i];
6004 + x86_64_init_pda();
6005
6006 - pda_init(0);
6007 + early_printk("Kernel really alive\n");
6008 +
6009 + x86_64_start_reservations(real_mode_data);
6010 +}
6011 +
6012 +void __init x86_64_start_reservations(char *real_mode_data)
6013 +{
6014 copy_bootdata(__va(real_mode_data));
6015
6016 reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
6017
6018 reserve_early(round_up(__pa_symbol(&_end), PAGE_SIZE),
6019 - start_pfn << PAGE_SHIFT, "Xen provided");
6020 -
6021 - reserve_ebda_region();
6022 - reserve_setup_data();
6023 + __pa(xen_start_info->pt_base)
6024 + + (xen_start_info->nr_pt_frames << PAGE_SHIFT),
6025 + "Xen provided");
6026
6027 /*
6028 * At this point everything still needed from the boot loader
6029 --- sle11-2009-06-04.orig/arch/x86/kernel/head_64-xen.S 2009-02-16 16:17:21.000000000 +0100
6030 +++ sle11-2009-06-04/arch/x86/kernel/head_64-xen.S 2009-06-04 10:21:39.000000000 +0200
6031 @@ -95,53 +95,6 @@ NEXT_PAGE(hypercall_page)
6032
6033 #undef NEXT_PAGE
6034
6035 - .data
6036 -
6037 - .align 16
6038 - .globl cpu_gdt_descr
6039 -cpu_gdt_descr:
6040 - .word gdt_end-cpu_gdt_table-1
6041 -gdt:
6042 - .quad cpu_gdt_table
6043 -#ifdef CONFIG_SMP
6044 - .rept NR_CPUS-1
6045 - .word 0
6046 - .quad 0
6047 - .endr
6048 -#endif
6049 -
6050 -/* We need valid kernel segments for data and code in long mode too
6051 - * IRET will check the segment types kkeil 2000/10/28
6052 - * Also sysret mandates a special GDT layout
6053 - */
6054 -
6055 - .section .data.page_aligned, "aw"
6056 - .align PAGE_SIZE
6057 -
6058 -/* The TLS descriptors are currently at a different place compared to i386.
6059 - Hopefully nobody expects them at a fixed place (Wine?) */
6060 -
6061 -ENTRY(cpu_gdt_table)
6062 - .quad 0x0000000000000000 /* NULL descriptor */
6063 - .quad 0x00cf9b000000ffff /* __KERNEL32_CS */
6064 - .quad 0x00af9b000000ffff /* __KERNEL_CS */
6065 - .quad 0x00cf93000000ffff /* __KERNEL_DS */
6066 - .quad 0x00cffb000000ffff /* __USER32_CS */
6067 - .quad 0x00cff3000000ffff /* __USER_DS, __USER32_DS */
6068 - .quad 0x00affb000000ffff /* __USER_CS */
6069 - .quad 0x0 /* unused */
6070 - .quad 0,0 /* TSS */
6071 - .quad 0,0 /* LDT */
6072 - .quad 0,0,0 /* three TLS descriptors */
6073 - .quad 0x0000f40000000000 /* node/CPU stored in limit */
6074 -gdt_end:
6075 - /* asm/segment.h:GDT_ENTRIES must match this */
6076 - /* This should be a multiple of the cache line size */
6077 - /* GDTs of other CPUs are now dynamically allocated */
6078 -
6079 - /* zero the remaining page */
6080 - .fill PAGE_SIZE / 8 - GDT_ENTRIES,8,0
6081 -
6082 .section .bss.page_aligned, "aw", @nobits
6083 .align PAGE_SIZE
6084 ENTRY(empty_zero_page)
6085 --- sle11-2009-06-04.orig/arch/x86/kernel/io_apic_32-xen.c 2009-03-16 16:38:05.000000000 +0100
6086 +++ sle11-2009-06-04/arch/x86/kernel/io_apic_32-xen.c 2009-06-04 10:21:39.000000000 +0200
6087 @@ -25,6 +25,7 @@
6088 #include <linux/init.h>
6089 #include <linux/delay.h>
6090 #include <linux/sched.h>
6091 +#include <linux/bootmem.h>
6092 #include <linux/mc146818rtc.h>
6093 #include <linux/compiler.h>
6094 #include <linux/acpi.h>
6095 @@ -75,7 +76,7 @@ static struct { int pin, apic; } ioapic_
6096 static DEFINE_SPINLOCK(ioapic_lock);
6097 static DEFINE_SPINLOCK(vector_lock);
6098
6099 -int timer_over_8254 __initdata = 1;
6100 +int timer_through_8259 __initdata;
6101
6102 /*
6103 * Is the SiS APIC rmw bug present ?
6104 @@ -89,15 +90,21 @@ int sis_apic_bug = -1;
6105 int nr_ioapic_registers[MAX_IO_APICS];
6106
6107 /* I/O APIC entries */
6108 -struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
6109 +struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
6110 int nr_ioapics;
6111
6112 /* MP IRQ source entries */
6113 -struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
6114 +struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
6115
6116 /* # of MP IRQ source entries */
6117 int mp_irq_entries;
6118
6119 +#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
6120 +int mp_bus_id_to_type[MAX_MP_BUSSES];
6121 +#endif
6122 +
6123 +DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
6124 +
6125 static int disable_timer_pin_1 __initdata;
6126
6127 /*
6128 @@ -128,7 +135,7 @@ struct io_apic {
6129 static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
6130 {
6131 return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
6132 - + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK);
6133 + + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK);
6134 }
6135 #endif
6136
6137 @@ -142,7 +149,7 @@ static inline unsigned int io_apic_read(
6138 struct physdev_apic apic_op;
6139 int ret;
6140
6141 - apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
6142 + apic_op.apic_physbase = mp_ioapics[apic].mp_apicaddr;
6143 apic_op.reg = reg;
6144 ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op);
6145 if (ret)
6146 @@ -160,7 +167,7 @@ static inline void io_apic_write(unsigne
6147 #else
6148 struct physdev_apic apic_op;
6149
6150 - apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
6151 + apic_op.apic_physbase = mp_ioapics[apic].mp_apicaddr;
6152 apic_op.reg = reg;
6153 apic_op.value = value;
6154 WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op));
6155 @@ -288,7 +295,7 @@ static void __init replace_pin_at_irq(un
6156 }
6157 }
6158
6159 -static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsigned long disable)
6160 +static void __modify_IO_APIC_irq(unsigned int irq, unsigned long enable, unsigned long disable)
6161 {
6162 struct irq_pin_list *entry = irq_2_pin + irq;
6163 unsigned int pin, reg;
6164 @@ -308,30 +315,32 @@ static void __modify_IO_APIC_irq (unsign
6165 }
6166
6167 /* mask = 1 */
6168 -static void __mask_IO_APIC_irq (unsigned int irq)
6169 +static void __mask_IO_APIC_irq(unsigned int irq)
6170 {
6171 - __modify_IO_APIC_irq(irq, 0x00010000, 0);
6172 + __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED, 0);
6173 }
6174
6175 /* mask = 0 */
6176 -static void __unmask_IO_APIC_irq (unsigned int irq)
6177 +static void __unmask_IO_APIC_irq(unsigned int irq)
6178 {
6179 - __modify_IO_APIC_irq(irq, 0, 0x00010000);
6180 + __modify_IO_APIC_irq(irq, 0, IO_APIC_REDIR_MASKED);
6181 }
6182
6183 /* mask = 1, trigger = 0 */
6184 -static void __mask_and_edge_IO_APIC_irq (unsigned int irq)
6185 +static void __mask_and_edge_IO_APIC_irq(unsigned int irq)
6186 {
6187 - __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000);
6188 + __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED,
6189 + IO_APIC_REDIR_LEVEL_TRIGGER);
6190 }
6191
6192 /* mask = 0, trigger = 1 */
6193 -static void __unmask_and_level_IO_APIC_irq (unsigned int irq)
6194 +static void __unmask_and_level_IO_APIC_irq(unsigned int irq)
6195 {
6196 - __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000);
6197 + __modify_IO_APIC_irq(irq, IO_APIC_REDIR_LEVEL_TRIGGER,
6198 + IO_APIC_REDIR_MASKED);
6199 }
6200
6201 -static void mask_IO_APIC_irq (unsigned int irq)
6202 +static void mask_IO_APIC_irq(unsigned int irq)
6203 {
6204 unsigned long flags;
6205
6206 @@ -340,7 +349,7 @@ static void mask_IO_APIC_irq (unsigned i
6207 spin_unlock_irqrestore(&ioapic_lock, flags);
6208 }
6209
6210 -static void unmask_IO_APIC_irq (unsigned int irq)
6211 +static void unmask_IO_APIC_irq(unsigned int irq)
6212 {
6213 unsigned long flags;
6214
6215 @@ -352,7 +361,7 @@ static void unmask_IO_APIC_irq (unsigned
6216 static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
6217 {
6218 struct IO_APIC_route_entry entry;
6219 -
6220 +
6221 /* Check delivery_mode to be sure we're not clearing an SMI pin */
6222 entry = ioapic_read_entry(apic, pin);
6223 if (entry.delivery_mode == dest_SMI)
6224 @@ -364,7 +373,7 @@ static void clear_IO_APIC_pin(unsigned i
6225 ioapic_mask_entry(apic, pin);
6226 }
6227
6228 -static void clear_IO_APIC (void)
6229 +static void clear_IO_APIC(void)
6230 {
6231 int apic, pin;
6232
6233 @@ -381,7 +390,7 @@ static void set_ioapic_affinity_irq(unsi
6234 struct irq_pin_list *entry = irq_2_pin + irq;
6235 unsigned int apicid_value;
6236 cpumask_t tmp;
6237 -
6238 +
6239 cpus_and(tmp, cpumask, cpu_online_map);
6240 if (cpus_empty(tmp))
6241 tmp = TARGET_CPUS;
6242 @@ -410,7 +419,7 @@ static void set_ioapic_affinity_irq(unsi
6243 # include <linux/kernel_stat.h> /* kstat */
6244 # include <linux/slab.h> /* kmalloc() */
6245 # include <linux/timer.h>
6246 -
6247 +
6248 #define IRQBALANCE_CHECK_ARCH -999
6249 #define MAX_BALANCED_IRQ_INTERVAL (5*HZ)
6250 #define MIN_BALANCED_IRQ_INTERVAL (HZ/2)
6251 @@ -422,14 +431,14 @@ static int physical_balance __read_mostl
6252 static long balanced_irq_interval __read_mostly = MAX_BALANCED_IRQ_INTERVAL;
6253
6254 static struct irq_cpu_info {
6255 - unsigned long * last_irq;
6256 - unsigned long * irq_delta;
6257 + unsigned long *last_irq;
6258 + unsigned long *irq_delta;
6259 unsigned long irq;
6260 } irq_cpu_data[NR_CPUS];
6261
6262 #define CPU_IRQ(cpu) (irq_cpu_data[cpu].irq)
6263 -#define LAST_CPU_IRQ(cpu,irq) (irq_cpu_data[cpu].last_irq[irq])
6264 -#define IRQ_DELTA(cpu,irq) (irq_cpu_data[cpu].irq_delta[irq])
6265 +#define LAST_CPU_IRQ(cpu, irq) (irq_cpu_data[cpu].last_irq[irq])
6266 +#define IRQ_DELTA(cpu, irq) (irq_cpu_data[cpu].irq_delta[irq])
6267
6268 #define IDLE_ENOUGH(cpu,now) \
6269 (idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1))
6270 @@ -468,8 +477,8 @@ inside:
6271 if (cpu == -1)
6272 cpu = NR_CPUS-1;
6273 }
6274 - } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu,allowed_mask) ||
6275 - (search_idle && !IDLE_ENOUGH(cpu,now)));
6276 + } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu, allowed_mask) ||
6277 + (search_idle && !IDLE_ENOUGH(cpu, now)));
6278
6279 return cpu;
6280 }
6281 @@ -479,15 +488,14 @@ static inline void balance_irq(int cpu,
6282 unsigned long now = jiffies;
6283 cpumask_t allowed_mask;
6284 unsigned int new_cpu;
6285 -
6286 +
6287 if (irqbalance_disabled)
6288 - return;
6289 + return;
6290
6291 cpus_and(allowed_mask, cpu_online_map, balance_irq_affinity[irq]);
6292 new_cpu = move(cpu, allowed_mask, now, 1);
6293 - if (cpu != new_cpu) {
6294 + if (cpu != new_cpu)
6295 set_pending_irq(irq, cpumask_of_cpu(new_cpu));
6296 - }
6297 }
6298
6299 static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold)
6300 @@ -499,14 +507,14 @@ static inline void rotate_irqs_among_cpu
6301 if (!irq_desc[j].action)
6302 continue;
6303 /* Is it a significant load ? */
6304 - if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i),j) <
6305 + if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i), j) <
6306 useful_load_threshold)
6307 continue;
6308 balance_irq(i, j);
6309 }
6310 }
6311 balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
6312 - balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
6313 + balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
6314 return;
6315 }
6316
6317 @@ -535,22 +543,22 @@ static void do_irq_balance(void)
6318 /* Is this an active IRQ or balancing disabled ? */
6319 if (!irq_desc[j].action || irq_balancing_disabled(j))
6320 continue;
6321 - if ( package_index == i )
6322 - IRQ_DELTA(package_index,j) = 0;
6323 + if (package_index == i)
6324 + IRQ_DELTA(package_index, j) = 0;
6325 /* Determine the total count per processor per IRQ */
6326 value_now = (unsigned long) kstat_cpu(i).irqs[j];
6327
6328 /* Determine the activity per processor per IRQ */
6329 - delta = value_now - LAST_CPU_IRQ(i,j);
6330 + delta = value_now - LAST_CPU_IRQ(i, j);
6331
6332 /* Update last_cpu_irq[][] for the next time */
6333 - LAST_CPU_IRQ(i,j) = value_now;
6334 + LAST_CPU_IRQ(i, j) = value_now;
6335
6336 /* Ignore IRQs whose rate is less than the clock */
6337 if (delta < useful_load_threshold)
6338 continue;
6339 /* update the load for the processor or package total */
6340 - IRQ_DELTA(package_index,j) += delta;
6341 + IRQ_DELTA(package_index, j) += delta;
6342
6343 /* Keep track of the higher numbered sibling as well */
6344 if (i != package_index)
6345 @@ -576,7 +584,8 @@ static void do_irq_balance(void)
6346 max_cpu_irq = ULONG_MAX;
6347
6348 tryanothercpu:
6349 - /* Look for heaviest loaded processor.
6350 + /*
6351 + * Look for heaviest loaded processor.
6352 * We may come back to get the next heaviest loaded processor.
6353 * Skip processors with trivial loads.
6354 */
6355 @@ -585,7 +594,7 @@ tryanothercpu:
6356 for_each_online_cpu(i) {
6357 if (i != CPU_TO_PACKAGEINDEX(i))
6358 continue;
6359 - if (max_cpu_irq <= CPU_IRQ(i))
6360 + if (max_cpu_irq <= CPU_IRQ(i))
6361 continue;
6362 if (tmp_cpu_irq < CPU_IRQ(i)) {
6363 tmp_cpu_irq = CPU_IRQ(i);
6364 @@ -594,8 +603,9 @@ tryanothercpu:
6365 }
6366
6367 if (tmp_loaded == -1) {
6368 - /* In the case of small number of heavy interrupt sources,
6369 - * loading some of the cpus too much. We use Ingo's original
6370 + /*
6371 + * In the case of small number of heavy interrupt sources,
6372 + * loading some of the cpus too much. We use Ingo's original
6373 * approach to rotate them around.
6374 */
6375 if (!first_attempt && imbalance >= useful_load_threshold) {
6376 @@ -604,13 +614,14 @@ tryanothercpu:
6377 }
6378 goto not_worth_the_effort;
6379 }
6380 -
6381 +
6382 first_attempt = 0; /* heaviest search */
6383 max_cpu_irq = tmp_cpu_irq; /* load */
6384 max_loaded = tmp_loaded; /* processor */
6385 imbalance = (max_cpu_irq - min_cpu_irq) / 2;
6386 -
6387 - /* if imbalance is less than approx 10% of max load, then
6388 +
6389 + /*
6390 + * if imbalance is less than approx 10% of max load, then
6391 * observe diminishing returns action. - quit
6392 */
6393 if (imbalance < (max_cpu_irq >> 3))
6394 @@ -626,26 +637,25 @@ tryanotherirq:
6395 /* Is this an active IRQ? */
6396 if (!irq_desc[j].action)
6397 continue;
6398 - if (imbalance <= IRQ_DELTA(max_loaded,j))
6399 + if (imbalance <= IRQ_DELTA(max_loaded, j))
6400 continue;
6401 /* Try to find the IRQ that is closest to the imbalance
6402 * without going over.
6403 */
6404 - if (move_this_load < IRQ_DELTA(max_loaded,j)) {
6405 - move_this_load = IRQ_DELTA(max_loaded,j);
6406 + if (move_this_load < IRQ_DELTA(max_loaded, j)) {
6407 + move_this_load = IRQ_DELTA(max_loaded, j);
6408 selected_irq = j;
6409 }
6410 }
6411 - if (selected_irq == -1) {
6412 + if (selected_irq == -1)
6413 goto tryanothercpu;
6414 - }
6415
6416 imbalance = move_this_load;
6417 -
6418 +
6419 /* For physical_balance case, we accumulated both load
6420 * values in the one of the siblings cpu_irq[],
6421 * to use the same code for physical and logical processors
6422 - * as much as possible.
6423 + * as much as possible.
6424 *
6425 * NOTE: the cpu_irq[] array holds the sum of the load for
6426 * sibling A and sibling B in the slot for the lowest numbered
6427 @@ -674,11 +684,11 @@ tryanotherirq:
6428 /* mark for change destination */
6429 set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded));
6430
6431 - /* Since we made a change, come back sooner to
6432 + /* Since we made a change, come back sooner to
6433 * check for more variation.
6434 */
6435 balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
6436 - balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
6437 + balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
6438 return;
6439 }
6440 goto tryanotherirq;
6441 @@ -689,7 +699,7 @@ not_worth_the_effort:
6442 * upward
6443 */
6444 balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL,
6445 - balanced_irq_interval + BALANCED_IRQ_MORE_DELTA);
6446 + balanced_irq_interval + BALANCED_IRQ_MORE_DELTA);
6447 return;
6448 }
6449
6450 @@ -728,13 +738,13 @@ static int __init balanced_irq_init(void
6451 cpumask_t tmp;
6452
6453 cpus_shift_right(tmp, cpu_online_map, 2);
6454 - c = &boot_cpu_data;
6455 + c = &boot_cpu_data;
6456 /* When not overwritten by the command line ask subarchitecture. */
6457 if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH)
6458 irqbalance_disabled = NO_BALANCE_IRQ;
6459 if (irqbalance_disabled)
6460 return 0;
6461 -
6462 +
6463 /* disable irqbalance completely if there is only one processor online */
6464 if (num_online_cpus() < 2) {
6465 irqbalance_disabled = 1;
6466 @@ -748,16 +758,14 @@ static int __init balanced_irq_init(void
6467 physical_balance = 1;
6468
6469 for_each_online_cpu(i) {
6470 - irq_cpu_data[i].irq_delta = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
6471 - irq_cpu_data[i].last_irq = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
6472 + irq_cpu_data[i].irq_delta = kzalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
6473 + irq_cpu_data[i].last_irq = kzalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
6474 if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) {
6475 printk(KERN_ERR "balanced_irq_init: out of memory");
6476 goto failed;
6477 }
6478 - memset(irq_cpu_data[i].irq_delta,0,sizeof(unsigned long) * NR_IRQS);
6479 - memset(irq_cpu_data[i].last_irq,0,sizeof(unsigned long) * NR_IRQS);
6480 }
6481 -
6482 +
6483 printk(KERN_INFO "Starting balanced_irq\n");
6484 if (!IS_ERR(kthread_run(balanced_irq, NULL, "kirqd")))
6485 return 0;
6486 @@ -799,7 +807,7 @@ void send_IPI_self(int vector)
6487 /*
6488 * Send the IPI. The write to APIC_ICR fires this off.
6489 */
6490 - apic_write_around(APIC_ICR, cfg);
6491 + apic_write(APIC_ICR, cfg);
6492 #endif
6493 }
6494 #endif /* !CONFIG_SMP */
6495 @@ -853,10 +861,10 @@ static int find_irq_entry(int apic, int
6496 int i;
6497
6498 for (i = 0; i < mp_irq_entries; i++)
6499 - if (mp_irqs[i].mpc_irqtype == type &&
6500 - (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
6501 - mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
6502 - mp_irqs[i].mpc_dstirq == pin)
6503 + if (mp_irqs[i].mp_irqtype == type &&
6504 + (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid ||
6505 + mp_irqs[i].mp_dstapic == MP_APIC_ALL) &&
6506 + mp_irqs[i].mp_dstirq == pin)
6507 return i;
6508
6509 return -1;
6510 @@ -871,13 +879,13 @@ static int __init find_isa_irq_pin(int i
6511 int i;
6512
6513 for (i = 0; i < mp_irq_entries; i++) {
6514 - int lbus = mp_irqs[i].mpc_srcbus;
6515 + int lbus = mp_irqs[i].mp_srcbus;
6516
6517 if (test_bit(lbus, mp_bus_not_pci) &&
6518 - (mp_irqs[i].mpc_irqtype == type) &&
6519 - (mp_irqs[i].mpc_srcbusirq == irq))
6520 + (mp_irqs[i].mp_irqtype == type) &&
6521 + (mp_irqs[i].mp_srcbusirq == irq))
6522
6523 - return mp_irqs[i].mpc_dstirq;
6524 + return mp_irqs[i].mp_dstirq;
6525 }
6526 return -1;
6527 }
6528 @@ -887,17 +895,17 @@ static int __init find_isa_irq_apic(int
6529 int i;
6530
6531 for (i = 0; i < mp_irq_entries; i++) {
6532 - int lbus = mp_irqs[i].mpc_srcbus;
6533 + int lbus = mp_irqs[i].mp_srcbus;
6534
6535 if (test_bit(lbus, mp_bus_not_pci) &&
6536 - (mp_irqs[i].mpc_irqtype == type) &&
6537 - (mp_irqs[i].mpc_srcbusirq == irq))
6538 + (mp_irqs[i].mp_irqtype == type) &&
6539 + (mp_irqs[i].mp_srcbusirq == irq))
6540 break;
6541 }
6542 if (i < mp_irq_entries) {
6543 int apic;
6544 - for(apic = 0; apic < nr_ioapics; apic++) {
6545 - if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic)
6546 + for (apic = 0; apic < nr_ioapics; apic++) {
6547 + if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic)
6548 return apic;
6549 }
6550 }
6551 @@ -918,28 +926,28 @@ int IO_APIC_get_PCI_irq_vector(int bus,
6552
6553 apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, "
6554 "slot:%d, pin:%d.\n", bus, slot, pin);
6555 - if (mp_bus_id_to_pci_bus[bus] == -1) {
6556 + if (test_bit(bus, mp_bus_not_pci)) {
6557 printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
6558 return -1;
6559 }
6560 for (i = 0; i < mp_irq_entries; i++) {
6561 - int lbus = mp_irqs[i].mpc_srcbus;
6562 + int lbus = mp_irqs[i].mp_srcbus;
6563
6564 for (apic = 0; apic < nr_ioapics; apic++)
6565 - if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
6566 - mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
6567 + if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic ||
6568 + mp_irqs[i].mp_dstapic == MP_APIC_ALL)
6569 break;
6570
6571 if (!test_bit(lbus, mp_bus_not_pci) &&
6572 - !mp_irqs[i].mpc_irqtype &&
6573 + !mp_irqs[i].mp_irqtype &&
6574 (bus == lbus) &&
6575 - (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
6576 - int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
6577 + (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) {
6578 + int irq = pin_2_irq(i, apic, mp_irqs[i].mp_dstirq);
6579
6580 if (!(apic || IO_APIC_IRQ(irq)))
6581 continue;
6582
6583 - if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
6584 + if (pin == (mp_irqs[i].mp_srcbusirq & 3))
6585 return irq;
6586 /*
6587 * Use the first all-but-pin matching entry as a
6588 @@ -954,7 +962,7 @@ int IO_APIC_get_PCI_irq_vector(int bus,
6589 EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
6590
6591 /*
6592 - * This function currently is only a helper for the i386 smp boot process where
6593 + * This function currently is only a helper for the i386 smp boot process where
6594 * we need to reprogram the ioredtbls to cater for the cpus which have come online
6595 * so mask in all cases should simply be TARGET_CPUS
6596 */
6597 @@ -1008,7 +1016,7 @@ static int EISA_ELCR(unsigned int irq)
6598 * EISA conforming in the MP table, that means its trigger type must
6599 * be read in from the ELCR */
6600
6601 -#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq))
6602 +#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mp_srcbusirq))
6603 #define default_EISA_polarity(idx) default_ISA_polarity(idx)
6604
6605 /* PCI interrupts are always polarity one level triggered,
6606 @@ -1025,118 +1033,115 @@ static int EISA_ELCR(unsigned int irq)
6607
6608 static int MPBIOS_polarity(int idx)
6609 {
6610 - int bus = mp_irqs[idx].mpc_srcbus;
6611 + int bus = mp_irqs[idx].mp_srcbus;
6612 int polarity;
6613
6614 /*
6615 * Determine IRQ line polarity (high active or low active):
6616 */
6617 - switch (mp_irqs[idx].mpc_irqflag & 3)
6618 + switch (mp_irqs[idx].mp_irqflag & 3) {
6619 + case 0: /* conforms, ie. bus-type dependent polarity */
6620 {
6621 - case 0: /* conforms, ie. bus-type dependent polarity */
6622 - {
6623 - polarity = test_bit(bus, mp_bus_not_pci)?
6624 - default_ISA_polarity(idx):
6625 - default_PCI_polarity(idx);
6626 - break;
6627 - }
6628 - case 1: /* high active */
6629 - {
6630 - polarity = 0;
6631 - break;
6632 - }
6633 - case 2: /* reserved */
6634 - {
6635 - printk(KERN_WARNING "broken BIOS!!\n");
6636 - polarity = 1;
6637 - break;
6638 - }
6639 - case 3: /* low active */
6640 - {
6641 - polarity = 1;
6642 - break;
6643 - }
6644 - default: /* invalid */
6645 - {
6646 - printk(KERN_WARNING "broken BIOS!!\n");
6647 - polarity = 1;
6648 - break;
6649 - }
6650 + polarity = test_bit(bus, mp_bus_not_pci)?
6651 + default_ISA_polarity(idx):
6652 + default_PCI_polarity(idx);
6653 + break;
6654 + }
6655 + case 1: /* high active */
6656 + {
6657 + polarity = 0;
6658 + break;
6659 + }
6660 + case 2: /* reserved */
6661 + {
6662 + printk(KERN_WARNING "broken BIOS!!\n");
6663 + polarity = 1;
6664 + break;
6665 + }
6666 + case 3: /* low active */
6667 + {
6668 + polarity = 1;
6669 + break;
6670 + }
6671 + default: /* invalid */
6672 + {
6673 + printk(KERN_WARNING "broken BIOS!!\n");
6674 + polarity = 1;
6675 + break;
6676 + }
6677 }
6678 return polarity;
6679 }
6680
6681 static int MPBIOS_trigger(int idx)
6682 {
6683 - int bus = mp_irqs[idx].mpc_srcbus;
6684 + int bus = mp_irqs[idx].mp_srcbus;
6685 int trigger;
6686
6687 /*
6688 * Determine IRQ trigger mode (edge or level sensitive):
6689 */
6690 - switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
6691 + switch ((mp_irqs[idx].mp_irqflag>>2) & 3) {
6692 + case 0: /* conforms, ie. bus-type dependent */
6693 {
6694 - case 0: /* conforms, ie. bus-type dependent */
6695 - {
6696 - trigger = test_bit(bus, mp_bus_not_pci)?
6697 - default_ISA_trigger(idx):
6698 - default_PCI_trigger(idx);
6699 + trigger = test_bit(bus, mp_bus_not_pci)?
6700 + default_ISA_trigger(idx):
6701 + default_PCI_trigger(idx);
6702 #if defined(CONFIG_EISA) || defined(CONFIG_MCA)
6703 - switch (mp_bus_id_to_type[bus])
6704 - {
6705 - case MP_BUS_ISA: /* ISA pin */
6706 - {
6707 - /* set before the switch */
6708 - break;
6709 - }
6710 - case MP_BUS_EISA: /* EISA pin */
6711 - {
6712 - trigger = default_EISA_trigger(idx);
6713 - break;
6714 - }
6715 - case MP_BUS_PCI: /* PCI pin */
6716 - {
6717 - /* set before the switch */
6718 - break;
6719 - }
6720 - case MP_BUS_MCA: /* MCA pin */
6721 - {
6722 - trigger = default_MCA_trigger(idx);
6723 - break;
6724 - }
6725 - default:
6726 - {
6727 - printk(KERN_WARNING "broken BIOS!!\n");
6728 - trigger = 1;
6729 - break;
6730 - }
6731 - }
6732 -#endif
6733 + switch (mp_bus_id_to_type[bus]) {
6734 + case MP_BUS_ISA: /* ISA pin */
6735 + {
6736 + /* set before the switch */
6737 break;
6738 }
6739 - case 1: /* edge */
6740 + case MP_BUS_EISA: /* EISA pin */
6741 {
6742 - trigger = 0;
6743 + trigger = default_EISA_trigger(idx);
6744 break;
6745 }
6746 - case 2: /* reserved */
6747 + case MP_BUS_PCI: /* PCI pin */
6748 {
6749 - printk(KERN_WARNING "broken BIOS!!\n");
6750 - trigger = 1;
6751 + /* set before the switch */
6752 break;
6753 }
6754 - case 3: /* level */
6755 + case MP_BUS_MCA: /* MCA pin */
6756 {
6757 - trigger = 1;
6758 + trigger = default_MCA_trigger(idx);
6759 break;
6760 }
6761 - default: /* invalid */
6762 + default:
6763 {
6764 printk(KERN_WARNING "broken BIOS!!\n");
6765 - trigger = 0;
6766 + trigger = 1;
6767 break;
6768 }
6769 }
6770 +#endif
6771 + break;
6772 + }
6773 + case 1: /* edge */
6774 + {
6775 + trigger = 0;
6776 + break;
6777 + }
6778 + case 2: /* reserved */
6779 + {
6780 + printk(KERN_WARNING "broken BIOS!!\n");
6781 + trigger = 1;
6782 + break;
6783 + }
6784 + case 3: /* level */
6785 + {
6786 + trigger = 1;
6787 + break;
6788 + }
6789 + default: /* invalid */
6790 + {
6791 + printk(KERN_WARNING "broken BIOS!!\n");
6792 + trigger = 0;
6793 + break;
6794 + }
6795 + }
6796 return trigger;
6797 }
6798
6799 @@ -1153,16 +1158,16 @@ static inline int irq_trigger(int idx)
6800 static int pin_2_irq(int idx, int apic, int pin)
6801 {
6802 int irq, i;
6803 - int bus = mp_irqs[idx].mpc_srcbus;
6804 + int bus = mp_irqs[idx].mp_srcbus;
6805
6806 /*
6807 * Debugging check, we are in big trouble if this message pops up!
6808 */
6809 - if (mp_irqs[idx].mpc_dstirq != pin)
6810 + if (mp_irqs[idx].mp_dstirq != pin)
6811 printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
6812
6813 if (test_bit(bus, mp_bus_not_pci))
6814 - irq = mp_irqs[idx].mpc_srcbusirq;
6815 + irq = mp_irqs[idx].mp_srcbusirq;
6816 else {
6817 /*
6818 * PCI IRQs are mapped in order
6819 @@ -1204,8 +1209,8 @@ static inline int IO_APIC_irq_trigger(in
6820
6821 for (apic = 0; apic < nr_ioapics; apic++) {
6822 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
6823 - idx = find_irq_entry(apic,pin,mp_INT);
6824 - if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin)))
6825 + idx = find_irq_entry(apic, pin, mp_INT);
6826 + if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin)))
6827 return irq_trigger(idx);
6828 }
6829 }
6830 @@ -1291,25 +1296,25 @@ static void __init setup_IO_APIC_irqs(vo
6831 /*
6832 * add it to the IO-APIC irq-routing table:
6833 */
6834 - memset(&entry,0,sizeof(entry));
6835 + memset(&entry, 0, sizeof(entry));
6836
6837 entry.delivery_mode = INT_DELIVERY_MODE;
6838 entry.dest_mode = INT_DEST_MODE;
6839 entry.mask = 0; /* enable IRQ */
6840 - entry.dest.logical.logical_dest =
6841 + entry.dest.logical.logical_dest =
6842 cpu_mask_to_apicid(TARGET_CPUS);
6843
6844 - idx = find_irq_entry(apic,pin,mp_INT);
6845 + idx = find_irq_entry(apic, pin, mp_INT);
6846 if (idx == -1) {
6847 if (first_notcon) {
6848 apic_printk(APIC_VERBOSE, KERN_DEBUG
6849 " IO-APIC (apicid-pin) %d-%d",
6850 - mp_ioapics[apic].mpc_apicid,
6851 + mp_ioapics[apic].mp_apicid,
6852 pin);
6853 first_notcon = 0;
6854 } else
6855 apic_printk(APIC_VERBOSE, ", %d-%d",
6856 - mp_ioapics[apic].mpc_apicid, pin);
6857 + mp_ioapics[apic].mp_apicid, pin);
6858 continue;
6859 }
6860
6861 @@ -1343,7 +1348,7 @@ static void __init setup_IO_APIC_irqs(vo
6862 vector = assign_irq_vector(irq);
6863 entry.vector = vector;
6864 ioapic_register_intr(irq, vector, IOAPIC_AUTO);
6865 -
6866 +
6867 if (!apic && (irq < 16))
6868 disable_8259A_irq(irq);
6869 }
6870 @@ -1355,27 +1360,23 @@ static void __init setup_IO_APIC_irqs(vo
6871 apic_printk(APIC_VERBOSE, " not connected.\n");
6872 }
6873
6874 +#ifndef CONFIG_XEN
6875 /*
6876 - * Set up the 8259A-master output pin:
6877 + * Set up the timer pin, possibly with the 8259A-master behind.
6878 */
6879 -#ifndef CONFIG_XEN
6880 -static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
6881 +static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
6882 + int vector)
6883 {
6884 struct IO_APIC_route_entry entry;
6885
6886 - memset(&entry,0,sizeof(entry));
6887 -
6888 - disable_8259A_irq(0);
6889 -
6890 - /* mask LVT0 */
6891 - apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
6892 + memset(&entry, 0, sizeof(entry));
6893
6894 /*
6895 * We use logical delivery to get the timer IRQ
6896 * to the first CPU.
6897 */
6898 entry.dest_mode = INT_DEST_MODE;
6899 - entry.mask = 0; /* unmask IRQ now */
6900 + entry.mask = 1; /* mask IRQ now */
6901 entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
6902 entry.delivery_mode = INT_DELIVERY_MODE;
6903 entry.polarity = 0;
6904 @@ -1384,17 +1385,14 @@ static void __init setup_ExtINT_IRQ0_pin
6905
6906 /*
6907 * The timer IRQ doesn't have to know that behind the
6908 - * scene we have a 8259A-master in AEOI mode ...
6909 + * scene we may have a 8259A-master in AEOI mode ...
6910 */
6911 - irq_desc[0].chip = &ioapic_chip;
6912 - set_irq_handler(0, handle_edge_irq);
6913 + ioapic_register_intr(0, vector, IOAPIC_EDGE);
6914
6915 /*
6916 * Add it to the IO-APIC irq-routing table:
6917 */
6918 ioapic_write_entry(apic, pin, entry);
6919 -
6920 - enable_8259A_irq(0);
6921 }
6922
6923 void __init print_IO_APIC(void)
6924 @@ -1409,10 +1407,10 @@ void __init print_IO_APIC(void)
6925 if (apic_verbosity == APIC_QUIET)
6926 return;
6927
6928 - printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
6929 + printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
6930 for (i = 0; i < nr_ioapics; i++)
6931 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
6932 - mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
6933 + mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]);
6934
6935 /*
6936 * We are a bit conservative about what we expect. We have to
6937 @@ -1431,7 +1429,7 @@ void __init print_IO_APIC(void)
6938 reg_03.raw = io_apic_read(apic, 3);
6939 spin_unlock_irqrestore(&ioapic_lock, flags);
6940
6941 - printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
6942 + printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid);
6943 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
6944 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
6945 printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
6946 @@ -1512,7 +1510,7 @@ void __init print_IO_APIC(void)
6947 return;
6948 }
6949
6950 -static void print_APIC_bitfield (int base)
6951 +static void print_APIC_bitfield(int base)
6952 {
6953 unsigned int v;
6954 int i, j;
6955 @@ -1533,7 +1531,7 @@ static void print_APIC_bitfield (int bas
6956 }
6957 }
6958
6959 -void /*__init*/ print_local_APIC(void * dummy)
6960 +void /*__init*/ print_local_APIC(void *dummy)
6961 {
6962 unsigned int v, ver, maxlvt;
6963
6964 @@ -1542,6 +1540,7 @@ void /*__init*/ print_local_APIC(void *
6965
6966 printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
6967 smp_processor_id(), hard_smp_processor_id());
6968 + v = apic_read(APIC_ID);
6969 printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v,
6970 GET_APIC_ID(read_apic_id()));
6971 v = apic_read(APIC_LVR);
6972 @@ -1616,9 +1615,9 @@ void /*__init*/ print_local_APIC(void *
6973 printk("\n");
6974 }
6975
6976 -void print_all_local_APICs (void)
6977 +void print_all_local_APICs(void)
6978 {
6979 - on_each_cpu(print_local_APIC, NULL, 1, 1);
6980 + on_each_cpu(print_local_APIC, NULL, 1);
6981 }
6982
6983 void /*__init*/ print_PIC(void)
6984 @@ -1639,11 +1638,11 @@ void /*__init*/ print_PIC(void)
6985 v = inb(0xa0) << 8 | inb(0x20);
6986 printk(KERN_DEBUG "... PIC IRR: %04x\n", v);
6987
6988 - outb(0x0b,0xa0);
6989 - outb(0x0b,0x20);
6990 + outb(0x0b, 0xa0);
6991 + outb(0x0b, 0x20);
6992 v = inb(0xa0) << 8 | inb(0x20);
6993 - outb(0x0a,0xa0);
6994 - outb(0x0a,0x20);
6995 + outb(0x0a, 0xa0);
6996 + outb(0x0a, 0x20);
6997
6998 spin_unlock_irqrestore(&i8259A_lock, flags);
6999
7000 @@ -1652,6 +1651,8 @@ void /*__init*/ print_PIC(void)
7001 v = inb(0x4d1) << 8 | inb(0x4d0);
7002 printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
7003 }
7004 +#else
7005 +void __init print_IO_APIC(void) {}
7006 #endif /* !CONFIG_XEN */
7007
7008 static void __init enable_IO_APIC(void)
7009 @@ -1681,7 +1682,7 @@ static void __init enable_IO_APIC(void)
7010 nr_ioapic_registers[apic] = reg_01.bits.entries+1;
7011 }
7012 #ifndef CONFIG_XEN
7013 - for(apic = 0; apic < nr_ioapics; apic++) {
7014 + for (apic = 0; apic < nr_ioapics; apic++) {
7015 int pin;
7016 /* See if any of the pins is in ExtINT mode */
7017 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
7018 @@ -1774,7 +1775,7 @@ void disable_IO_APIC(void)
7019 * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
7020 */
7021
7022 -#if !defined(CONFIG_XEN) && !defined(CONFIG_X86_NUMAQ)
7023 +#ifndef CONFIG_XEN
7024 static void __init setup_ioapic_ids_from_mpc(void)
7025 {
7026 union IO_APIC_reg_00 reg_00;
7027 @@ -1784,6 +1785,11 @@ static void __init setup_ioapic_ids_from
7028 unsigned char old_id;
7029 unsigned long flags;
7030
7031 +#ifdef CONFIG_X86_NUMAQ
7032 + if (found_numaq)
7033 + return;
7034 +#endif
7035 +
7036 /*
7037 * Don't check I/O APIC IDs for xAPIC systems. They have
7038 * no meaning without the serial APIC bus.
7039 @@ -1806,15 +1812,15 @@ static void __init setup_ioapic_ids_from
7040 spin_lock_irqsave(&ioapic_lock, flags);
7041 reg_00.raw = io_apic_read(apic, 0);
7042 spin_unlock_irqrestore(&ioapic_lock, flags);
7043 -
7044 - old_id = mp_ioapics[apic].mpc_apicid;
7045
7046 - if (mp_ioapics[apic].mpc_apicid >= get_physical_broadcast()) {
7047 + old_id = mp_ioapics[apic].mp_apicid;
7048 +
7049 + if (mp_ioapics[apic].mp_apicid >= get_physical_broadcast()) {
7050 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
7051 - apic, mp_ioapics[apic].mpc_apicid);
7052 + apic, mp_ioapics[apic].mp_apicid);
7053 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
7054 reg_00.bits.ID);
7055 - mp_ioapics[apic].mpc_apicid = reg_00.bits.ID;
7056 + mp_ioapics[apic].mp_apicid = reg_00.bits.ID;
7057 }
7058
7059 /*
7060 @@ -1823,9 +1829,9 @@ static void __init setup_ioapic_ids_from
7061 * 'stuck on smp_invalidate_needed IPI wait' messages.
7062 */
7063 if (check_apicid_used(phys_id_present_map,
7064 - mp_ioapics[apic].mpc_apicid)) {
7065 + mp_ioapics[apic].mp_apicid)) {
7066 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
7067 - apic, mp_ioapics[apic].mpc_apicid);
7068 + apic, mp_ioapics[apic].mp_apicid);
7069 for (i = 0; i < get_physical_broadcast(); i++)
7070 if (!physid_isset(i, phys_id_present_map))
7071 break;
7072 @@ -1834,13 +1840,13 @@ static void __init setup_ioapic_ids_from
7073 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
7074 i);
7075 physid_set(i, phys_id_present_map);
7076 - mp_ioapics[apic].mpc_apicid = i;
7077 + mp_ioapics[apic].mp_apicid = i;
7078 } else {
7079 physid_mask_t tmp;
7080 - tmp = apicid_to_cpu_present(mp_ioapics[apic].mpc_apicid);
7081 + tmp = apicid_to_cpu_present(mp_ioapics[apic].mp_apicid);
7082 apic_printk(APIC_VERBOSE, "Setting %d in the "
7083 "phys_id_present_map\n",
7084 - mp_ioapics[apic].mpc_apicid);
7085 + mp_ioapics[apic].mp_apicid);
7086 physids_or(phys_id_present_map, phys_id_present_map, tmp);
7087 }
7088
7089 @@ -1849,21 +1855,21 @@ static void __init setup_ioapic_ids_from
7090 * We need to adjust the IRQ routing table
7091 * if the ID changed.
7092 */
7093 - if (old_id != mp_ioapics[apic].mpc_apicid)
7094 + if (old_id != mp_ioapics[apic].mp_apicid)
7095 for (i = 0; i < mp_irq_entries; i++)
7096 - if (mp_irqs[i].mpc_dstapic == old_id)
7097 - mp_irqs[i].mpc_dstapic
7098 - = mp_ioapics[apic].mpc_apicid;
7099 + if (mp_irqs[i].mp_dstapic == old_id)
7100 + mp_irqs[i].mp_dstapic
7101 + = mp_ioapics[apic].mp_apicid;
7102
7103 /*
7104 * Read the right value from the MPC table and
7105 * write it into the ID register.
7106 - */
7107 + */
7108 apic_printk(APIC_VERBOSE, KERN_INFO
7109 "...changing IO-APIC physical APIC ID to %d ...",
7110 - mp_ioapics[apic].mpc_apicid);
7111 + mp_ioapics[apic].mp_apicid);
7112
7113 - reg_00.bits.ID = mp_ioapics[apic].mpc_apicid;
7114 + reg_00.bits.ID = mp_ioapics[apic].mp_apicid;
7115 spin_lock_irqsave(&ioapic_lock, flags);
7116 io_apic_write(apic, 0, reg_00.raw);
7117 spin_unlock_irqrestore(&ioapic_lock, flags);
7118 @@ -1874,17 +1880,13 @@ static void __init setup_ioapic_ids_from
7119 spin_lock_irqsave(&ioapic_lock, flags);
7120 reg_00.raw = io_apic_read(apic, 0);
7121 spin_unlock_irqrestore(&ioapic_lock, flags);
7122 - if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid)
7123 + if (reg_00.bits.ID != mp_ioapics[apic].mp_apicid)
7124 printk("could not set ID!\n");
7125 else
7126 apic_printk(APIC_VERBOSE, " ok.\n");
7127 }
7128 }
7129 -#else
7130 -static void __init setup_ioapic_ids_from_mpc(void) { }
7131 -#endif
7132
7133 -#ifndef CONFIG_XEN
7134 int no_timer_check __initdata;
7135
7136 static int __init notimercheck(char *s)
7137 @@ -2077,45 +2079,53 @@ static inline void init_IO_APIC_traps(vo
7138 * The local APIC irq-chip implementation:
7139 */
7140
7141 -static void ack_apic(unsigned int irq)
7142 +static void ack_lapic_irq(unsigned int irq)
7143 {
7144 ack_APIC_irq();
7145 }
7146
7147 -static void mask_lapic_irq (unsigned int irq)
7148 +static void mask_lapic_irq(unsigned int irq)
7149 {
7150 unsigned long v;
7151
7152 v = apic_read(APIC_LVT0);
7153 - apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
7154 + apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
7155 }
7156
7157 -static void unmask_lapic_irq (unsigned int irq)
7158 +static void unmask_lapic_irq(unsigned int irq)
7159 {
7160 unsigned long v;
7161
7162 v = apic_read(APIC_LVT0);
7163 - apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED);
7164 + apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
7165 }
7166
7167 static struct irq_chip lapic_chip __read_mostly = {
7168 - .name = "local-APIC-edge",
7169 + .name = "local-APIC",
7170 .mask = mask_lapic_irq,
7171 .unmask = unmask_lapic_irq,
7172 - .eoi = ack_apic,
7173 + .ack = ack_lapic_irq,
7174 };
7175
7176 +static void lapic_register_intr(int irq, int vector)
7177 +{
7178 + irq_desc[irq].status &= ~IRQ_LEVEL;
7179 + set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
7180 + "edge");
7181 + set_intr_gate(vector, interrupt[irq]);
7182 +}
7183 +
7184 static void __init setup_nmi(void)
7185 {
7186 /*
7187 - * Dirty trick to enable the NMI watchdog ...
7188 + * Dirty trick to enable the NMI watchdog ...
7189 * We put the 8259A master into AEOI mode and
7190 * unmask on all local APICs LVT0 as NMI.
7191 *
7192 * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
7193 * is from Maciej W. Rozycki - so we do not have to EOI from
7194 * the NMI handler or the timer interrupt.
7195 - */
7196 + */
7197 apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
7198
7199 enable_NMI_through_LVT0();
7200 @@ -2191,11 +2201,16 @@ static inline void __init unlock_ExtINT_
7201 static inline void __init check_timer(void)
7202 {
7203 int apic1, pin1, apic2, pin2;
7204 + int no_pin1 = 0;
7205 int vector;
7206 + unsigned int ver;
7207 unsigned long flags;
7208
7209 local_irq_save(flags);
7210
7211 + ver = apic_read(APIC_LVR);
7212 + ver = GET_APIC_VERSION(ver);
7213 +
7214 /*
7215 * get/set the timer IRQ vector:
7216 */
7217 @@ -2204,34 +2219,54 @@ static inline void __init check_timer(vo
7218 set_intr_gate(vector, interrupt[0]);
7219
7220 /*
7221 - * Subtle, code in do_timer_interrupt() expects an AEOI
7222 - * mode for the 8259A whenever interrupts are routed
7223 - * through I/O APICs. Also IRQ0 has to be enabled in
7224 - * the 8259A which implies the virtual wire has to be
7225 - * disabled in the local APIC.
7226 + * As IRQ0 is to be enabled in the 8259A, the virtual
7227 + * wire has to be disabled in the local APIC. Also
7228 + * timer interrupts need to be acknowledged manually in
7229 + * the 8259A for the i82489DX when using the NMI
7230 + * watchdog as that APIC treats NMIs as level-triggered.
7231 + * The AEOI mode will finish them in the 8259A
7232 + * automatically.
7233 */
7234 - apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
7235 + apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
7236 init_8259A(1);
7237 - timer_ack = 1;
7238 - if (timer_over_8254 > 0)
7239 - enable_8259A_irq(0);
7240 + timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
7241
7242 pin1 = find_isa_irq_pin(0, mp_INT);
7243 apic1 = find_isa_irq_apic(0, mp_INT);
7244 pin2 = ioapic_i8259.pin;
7245 apic2 = ioapic_i8259.apic;
7246
7247 - printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
7248 - vector, apic1, pin1, apic2, pin2);
7249 + apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X "
7250 + "apic1=%d pin1=%d apic2=%d pin2=%d\n",
7251 + vector, apic1, pin1, apic2, pin2);
7252 +
7253 + /*
7254 + * Some BIOS writers are clueless and report the ExtINTA
7255 + * I/O APIC input from the cascaded 8259A as the timer
7256 + * interrupt input. So just in case, if only one pin
7257 + * was found above, try it both directly and through the
7258 + * 8259A.
7259 + */
7260 + if (pin1 == -1) {
7261 + pin1 = pin2;
7262 + apic1 = apic2;
7263 + no_pin1 = 1;
7264 + } else if (pin2 == -1) {
7265 + pin2 = pin1;
7266 + apic2 = apic1;
7267 + }
7268
7269 if (pin1 != -1) {
7270 /*
7271 * Ok, does IRQ0 through the IOAPIC work?
7272 */
7273 + if (no_pin1) {
7274 + add_pin_to_irq(0, apic1, pin1);
7275 + setup_timer_IRQ0_pin(apic1, pin1, vector);
7276 + }
7277 unmask_IO_APIC_irq(0);
7278 if (timer_irq_works()) {
7279 if (nmi_watchdog == NMI_IO_APIC) {
7280 - disable_8259A_irq(0);
7281 setup_nmi();
7282 enable_8259A_irq(0);
7283 }
7284 @@ -2240,71 +2275,77 @@ static inline void __init check_timer(vo
7285 goto out;
7286 }
7287 clear_IO_APIC_pin(apic1, pin1);
7288 - printk(KERN_ERR "..MP-BIOS bug: 8254 timer not connected to "
7289 - "IO-APIC\n");
7290 - }
7291 -
7292 - printk(KERN_INFO "...trying to set up timer (IRQ0) through the 8259A ... ");
7293 - if (pin2 != -1) {
7294 - printk("\n..... (found pin %d) ...", pin2);
7295 + if (!no_pin1)
7296 + apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
7297 + "8254 timer not connected to IO-APIC\n");
7298 +
7299 + apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer "
7300 + "(IRQ0) through the 8259A ...\n");
7301 + apic_printk(APIC_QUIET, KERN_INFO
7302 + "..... (found apic %d pin %d) ...\n", apic2, pin2);
7303 /*
7304 * legacy devices should be connected to IO APIC #0
7305 */
7306 - setup_ExtINT_IRQ0_pin(apic2, pin2, vector);
7307 + replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
7308 + setup_timer_IRQ0_pin(apic2, pin2, vector);
7309 + unmask_IO_APIC_irq(0);
7310 + enable_8259A_irq(0);
7311 if (timer_irq_works()) {
7312 - printk("works.\n");
7313 - if (pin1 != -1)
7314 - replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
7315 - else
7316 - add_pin_to_irq(0, apic2, pin2);
7317 + apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
7318 + timer_through_8259 = 1;
7319 if (nmi_watchdog == NMI_IO_APIC) {
7320 + disable_8259A_irq(0);
7321 setup_nmi();
7322 + enable_8259A_irq(0);
7323 }
7324 goto out;
7325 }
7326 /*
7327 * Cleanup, just in case ...
7328 */
7329 + disable_8259A_irq(0);
7330 clear_IO_APIC_pin(apic2, pin2);
7331 + apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
7332 }
7333 - printk(" failed.\n");
7334
7335 if (nmi_watchdog == NMI_IO_APIC) {
7336 - printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
7337 - nmi_watchdog = 0;
7338 + apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work "
7339 + "through the IO-APIC - disabling NMI Watchdog!\n");
7340 + nmi_watchdog = NMI_NONE;
7341 }
7342 + timer_ack = 0;
7343
7344 - printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
7345 + apic_printk(APIC_QUIET, KERN_INFO
7346 + "...trying to set up timer as Virtual Wire IRQ...\n");
7347
7348 - disable_8259A_irq(0);
7349 - set_irq_chip_and_handler_name(0, &lapic_chip, handle_fasteoi_irq,
7350 - "fasteoi");
7351 - apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
7352 + lapic_register_intr(0, vector);
7353 + apic_write(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
7354 enable_8259A_irq(0);
7355
7356 if (timer_irq_works()) {
7357 - printk(" works.\n");
7358 + apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
7359 goto out;
7360 }
7361 - apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
7362 - printk(" failed.\n");
7363 + disable_8259A_irq(0);
7364 + apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
7365 + apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
7366
7367 - printk(KERN_INFO "...trying to set up timer as ExtINT IRQ...");
7368 + apic_printk(APIC_QUIET, KERN_INFO
7369 + "...trying to set up timer as ExtINT IRQ...\n");
7370
7371 - timer_ack = 0;
7372 init_8259A(0);
7373 make_8259A_irq(0);
7374 - apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
7375 + apic_write(APIC_LVT0, APIC_DM_EXTINT);
7376
7377 unlock_ExtINT_logic();
7378
7379 if (timer_irq_works()) {
7380 - printk(" works.\n");
7381 + apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
7382 goto out;
7383 }
7384 - printk(" failed :(.\n");
7385 + apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
7386 panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
7387 - "report. Then try booting with the 'noapic' option");
7388 + "report. Then try booting with the 'noapic' option.\n");
7389 out:
7390 local_irq_restore(flags);
7391 }
7392 @@ -2314,11 +2355,21 @@ int timer_uses_ioapic_pin_0 = 0;
7393 #endif
7394
7395 /*
7396 - *
7397 - * IRQ's that are handled by the PIC in the MPS IOAPIC case.
7398 - * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ.
7399 - * Linux doesn't really care, as it's not actually used
7400 - * for any interrupt handling anyway.
7401 + * Traditionally ISA IRQ2 is the cascade IRQ, and is not available
7402 + * to devices. However there may be an I/O APIC pin available for
7403 + * this interrupt regardless. The pin may be left unconnected, but
7404 + * typically it will be reused as an ExtINT cascade interrupt for
7405 + * the master 8259A. In the MPS case such a pin will normally be
7406 + * reported as an ExtINT interrupt in the MP table. With ACPI
7407 + * there is no provision for ExtINT interrupts, and in the absence
7408 + * of an override it would be treated as an ordinary ISA I/O APIC
7409 + * interrupt, that is edge-triggered and unmasked by default. We
7410 + * used to do this, but it caused problems on some systems because
7411 + * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using
7412 + * the same ExtINT cascade interrupt to drive the local APIC of the
7413 + * bootstrap processor. Therefore we refrain from routing IRQ2 to
7414 + * the I/O APIC in all cases now. No actual device should request
7415 + * it anyway. --macro
7416 */
7417 #define PIC_IRQS (1 << PIC_CASCADE_IR)
7418
7419 @@ -2328,25 +2379,22 @@ void __init setup_IO_APIC(void)
7420 int i;
7421
7422 /* Reserve all the system vectors. */
7423 - for (i = FIRST_SYSTEM_VECTOR; i < NR_VECTORS; i++)
7424 + for (i = first_system_vector; i < NR_VECTORS; i++)
7425 set_bit(i, used_vectors);
7426 #endif
7427
7428 enable_IO_APIC();
7429
7430 - if (acpi_ioapic)
7431 - io_apic_irqs = ~0; /* all IRQs go through IOAPIC */
7432 - else
7433 - io_apic_irqs = ~PIC_IRQS;
7434 + io_apic_irqs = ~PIC_IRQS;
7435
7436 printk("ENABLING IO-APIC IRQs\n");
7437
7438 +#ifndef CONFIG_XEN
7439 /*
7440 * Set up IO-APIC IRQ routing.
7441 */
7442 if (!acpi_ioapic)
7443 setup_ioapic_ids_from_mpc();
7444 -#ifndef CONFIG_XEN
7445 sync_Arb_IDs();
7446 #endif
7447 setup_IO_APIC_irqs();
7448 @@ -2356,28 +2404,14 @@ void __init setup_IO_APIC(void)
7449 print_IO_APIC();
7450 }
7451
7452 -static int __init setup_disable_8254_timer(char *s)
7453 -{
7454 - timer_over_8254 = -1;
7455 - return 1;
7456 -}
7457 -static int __init setup_enable_8254_timer(char *s)
7458 -{
7459 - timer_over_8254 = 2;
7460 - return 1;
7461 -}
7462 -
7463 -__setup("disable_8254_timer", setup_disable_8254_timer);
7464 -__setup("enable_8254_timer", setup_enable_8254_timer);
7465 -
7466 /*
7467 * Called after all the initialization is done. If we didnt find any
7468 * APIC bugs then we can allow the modify fast path
7469 */
7470 -
7471 +
7472 static int __init io_apic_bug_finalize(void)
7473 {
7474 - if(sis_apic_bug == -1)
7475 + if (sis_apic_bug == -1)
7476 sis_apic_bug = 0;
7477 if (is_initial_xendomain()) {
7478 struct xen_platform_op op = { .cmd = XENPF_platform_quirk };
7479 @@ -2396,17 +2430,17 @@ struct sysfs_ioapic_data {
7480 struct sys_device dev;
7481 struct IO_APIC_route_entry entry[0];
7482 };
7483 -static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS];
7484 +static struct sysfs_ioapic_data *mp_ioapic_data[MAX_IO_APICS];
7485
7486 static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
7487 {
7488 struct IO_APIC_route_entry *entry;
7489 struct sysfs_ioapic_data *data;
7490 int i;
7491 -
7492 +
7493 data = container_of(dev, struct sysfs_ioapic_data, dev);
7494 entry = data->entry;
7495 - for (i = 0; i < nr_ioapic_registers[dev->id]; i ++)
7496 + for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
7497 entry[i] = ioapic_read_entry(dev->id, i);
7498
7499 return 0;
7500 @@ -2419,18 +2453,18 @@ static int ioapic_resume(struct sys_devi
7501 unsigned long flags;
7502 union IO_APIC_reg_00 reg_00;
7503 int i;
7504 -
7505 +
7506 data = container_of(dev, struct sysfs_ioapic_data, dev);
7507 entry = data->entry;
7508
7509 spin_lock_irqsave(&ioapic_lock, flags);
7510 reg_00.raw = io_apic_read(dev->id, 0);
7511 - if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
7512 - reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
7513 + if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) {
7514 + reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid;
7515 io_apic_write(dev->id, 0, reg_00.raw);
7516 }
7517 spin_unlock_irqrestore(&ioapic_lock, flags);
7518 - for (i = 0; i < nr_ioapic_registers[dev->id]; i ++)
7519 + for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
7520 ioapic_write_entry(dev->id, i, entry[i]);
7521
7522 return 0;
7523 @@ -2444,24 +2478,23 @@ static struct sysdev_class ioapic_sysdev
7524
7525 static int __init ioapic_init_sysfs(void)
7526 {
7527 - struct sys_device * dev;
7528 + struct sys_device *dev;
7529 int i, size, error = 0;
7530
7531 error = sysdev_class_register(&ioapic_sysdev_class);
7532 if (error)
7533 return error;
7534
7535 - for (i = 0; i < nr_ioapics; i++ ) {
7536 - size = sizeof(struct sys_device) + nr_ioapic_registers[i]
7537 + for (i = 0; i < nr_ioapics; i++) {
7538 + size = sizeof(struct sys_device) + nr_ioapic_registers[i]
7539 * sizeof(struct IO_APIC_route_entry);
7540 - mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL);
7541 + mp_ioapic_data[i] = kzalloc(size, GFP_KERNEL);
7542 if (!mp_ioapic_data[i]) {
7543 printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
7544 continue;
7545 }
7546 - memset(mp_ioapic_data[i], 0, size);
7547 dev = &mp_ioapic_data[i]->dev;
7548 - dev->id = i;
7549 + dev->id = i;
7550 dev->cls = &ioapic_sysdev_class;
7551 error = sysdev_register(dev);
7552 if (error) {
7553 @@ -2538,7 +2571,7 @@ static int msi_compose_msg(struct pci_de
7554 msg->address_lo =
7555 MSI_ADDR_BASE_LO |
7556 ((INT_DEST_MODE == 0) ?
7557 - MSI_ADDR_DEST_MODE_PHYSICAL:
7558 +MSI_ADDR_DEST_MODE_PHYSICAL:
7559 MSI_ADDR_DEST_MODE_LOGICAL) |
7560 ((INT_DELIVERY_MODE != dest_LowestPrio) ?
7561 MSI_ADDR_REDIRECTION_CPU:
7562 @@ -2549,7 +2582,7 @@ static int msi_compose_msg(struct pci_de
7563 MSI_DATA_TRIGGER_EDGE |
7564 MSI_DATA_LEVEL_ASSERT |
7565 ((INT_DELIVERY_MODE != dest_LowestPrio) ?
7566 - MSI_DATA_DELIVERY_FIXED:
7567 +MSI_DATA_DELIVERY_FIXED:
7568 MSI_DATA_DELIVERY_LOWPRI) |
7569 MSI_DATA_VECTOR(vector);
7570 }
7571 @@ -2720,12 +2753,12 @@ int arch_setup_ht_irq(unsigned int irq,
7572 #endif /* CONFIG_HT_IRQ */
7573
7574 /* --------------------------------------------------------------------------
7575 - ACPI-based IOAPIC Configuration
7576 + ACPI-based IOAPIC Configuration
7577 -------------------------------------------------------------------------- */
7578
7579 #ifdef CONFIG_ACPI
7580
7581 -int __init io_apic_get_unique_id (int ioapic, int apic_id)
7582 +int __init io_apic_get_unique_id(int ioapic, int apic_id)
7583 {
7584 #ifndef CONFIG_XEN
7585 union IO_APIC_reg_00 reg_00;
7586 @@ -2735,10 +2768,10 @@ int __init io_apic_get_unique_id (int io
7587 int i = 0;
7588
7589 /*
7590 - * The P4 platform supports up to 256 APIC IDs on two separate APIC
7591 - * buses (one for LAPICs, one for IOAPICs), where predecessors only
7592 + * The P4 platform supports up to 256 APIC IDs on two separate APIC
7593 + * buses (one for LAPICs, one for IOAPICs), where predecessors only
7594 * supports up to 16 on one shared APIC bus.
7595 - *
7596 + *
7597 * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
7598 * advantage of new APIC bus architecture.
7599 */
7600 @@ -2757,7 +2790,7 @@ int __init io_apic_get_unique_id (int io
7601 }
7602
7603 /*
7604 - * Every APIC in a system must have a unique ID or we get lots of nice
7605 + * Every APIC in a system must have a unique ID or we get lots of nice
7606 * 'stuck on smp_invalidate_needed IPI wait' messages.
7607 */
7608 if (check_apicid_used(apic_id_map, apic_id)) {
7609 @@ -2774,7 +2807,7 @@ int __init io_apic_get_unique_id (int io
7610 "trying %d\n", ioapic, apic_id, i);
7611
7612 apic_id = i;
7613 - }
7614 + }
7615
7616 tmp = apicid_to_cpu_present(apic_id);
7617 physids_or(apic_id_map, apic_id_map, tmp);
7618 @@ -2802,7 +2835,7 @@ int __init io_apic_get_unique_id (int io
7619 }
7620
7621
7622 -int __init io_apic_get_version (int ioapic)
7623 +int __init io_apic_get_version(int ioapic)
7624 {
7625 union IO_APIC_reg_01 reg_01;
7626 unsigned long flags;
7627 @@ -2815,7 +2848,7 @@ int __init io_apic_get_version (int ioap
7628 }
7629
7630
7631 -int __init io_apic_get_redir_entries (int ioapic)
7632 +int __init io_apic_get_redir_entries(int ioapic)
7633 {
7634 union IO_APIC_reg_01 reg_01;
7635 unsigned long flags;
7636 @@ -2828,7 +2861,7 @@ int __init io_apic_get_redir_entries (in
7637 }
7638
7639
7640 -int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low)
7641 +int io_apic_set_pci_routing(int ioapic, int pin, int irq, int edge_level, int active_high_low)
7642 {
7643 struct IO_APIC_route_entry entry;
7644
7645 @@ -2844,7 +2877,7 @@ int io_apic_set_pci_routing (int ioapic,
7646 * corresponding device driver registers for this IRQ.
7647 */
7648
7649 - memset(&entry,0,sizeof(entry));
7650 + memset(&entry, 0, sizeof(entry));
7651
7652 entry.delivery_mode = INT_DELIVERY_MODE;
7653 entry.dest_mode = INT_DEST_MODE;
7654 @@ -2863,7 +2896,7 @@ int io_apic_set_pci_routing (int ioapic,
7655
7656 apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry "
7657 "(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic,
7658 - mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq,
7659 + mp_ioapics[ioapic].mp_apicid, pin, entry.vector, irq,
7660 edge_level, active_high_low);
7661
7662 ioapic_register_intr(irq, entry.vector, edge_level);
7663 @@ -2884,8 +2917,8 @@ int acpi_get_override_irq(int bus_irq, i
7664 return -1;
7665
7666 for (i = 0; i < mp_irq_entries; i++)
7667 - if (mp_irqs[i].mpc_irqtype == mp_INT &&
7668 - mp_irqs[i].mpc_srcbusirq == bus_irq)
7669 + if (mp_irqs[i].mp_irqtype == mp_INT &&
7670 + mp_irqs[i].mp_srcbusirq == bus_irq)
7671 break;
7672 if (i >= mp_irq_entries)
7673 return -1;
7674 @@ -2918,3 +2951,35 @@ static int __init parse_noapic(char *arg
7675 return 0;
7676 }
7677 early_param("noapic", parse_noapic);
7678 +
7679 +#ifndef CONFIG_XEN
7680 +void __init ioapic_init_mappings(void)
7681 +{
7682 + unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
7683 + int i;
7684 +
7685 + for (i = 0; i < nr_ioapics; i++) {
7686 + if (smp_found_config) {
7687 + ioapic_phys = mp_ioapics[i].mp_apicaddr;
7688 + if (!ioapic_phys) {
7689 + printk(KERN_ERR
7690 + "WARNING: bogus zero IO-APIC "
7691 + "address found in MPTABLE, "
7692 + "disabling IO/APIC support!\n");
7693 + smp_found_config = 0;
7694 + skip_ioapic_setup = 1;
7695 + goto fake_ioapic_page;
7696 + }
7697 + } else {
7698 +fake_ioapic_page:
7699 + ioapic_phys = (unsigned long)
7700 + alloc_bootmem_pages(PAGE_SIZE);
7701 + ioapic_phys = __pa(ioapic_phys);
7702 + }
7703 + set_fixmap_nocache(idx, ioapic_phys);
7704 + printk(KERN_DEBUG "mapped IOAPIC to %08lx (%08lx)\n",
7705 + __fix_to_virt(idx), ioapic_phys);
7706 + idx++;
7707 + }
7708 +}
7709 +#endif
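
The ioapic_init_mappings() hunk just above falls back to a freshly allocated page whenever the MP table reports a zero I/O APIC address, so the fixmap slot always maps something sane. A minimal userspace sketch of that control flow, with the kernel helpers (alloc_bootmem_pages, set_fixmap_nocache) replaced by stand-ins and the addresses invented for illustration:

/* Sketch only: mirrors the fallback logic of the new ioapic_init_mappings(),
 * not the kernel code itself. */
#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE 4096

static int smp_found_config = 1;
static int skip_ioapic_setup;
static unsigned long mp_apicaddr[2] = { 0xfec00000UL, 0 };  /* second entry bogus */

int main(void)
{
        int nr_ioapics = 2, i;

        for (i = 0; i < nr_ioapics; i++) {
                unsigned long ioapic_phys;

                if (smp_found_config) {
                        ioapic_phys = mp_apicaddr[i];
                        if (!ioapic_phys) {
                                printf("bogus zero IO-APIC address, disabling IO-APIC support\n");
                                smp_found_config = 0;
                                skip_ioapic_setup = 1;
                                goto fake_ioapic_page;
                        }
                } else {
fake_ioapic_page:
                        /* stand-in for alloc_bootmem_pages() + __pa() */
                        ioapic_phys = (unsigned long)aligned_alloc(PAGE_SIZE, PAGE_SIZE);
                }
                printf("IOAPIC %d at %#lx%s\n", i, ioapic_phys,
                       skip_ioapic_setup ? " (dummy page)" : "");
        }
        return 0;
}

The goto into the else branch mirrors the kernel function: once one bogus entry is seen, smp_found_config is cleared and every remaining I/O APIC gets a dummy page as well.
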
7710 --- sle11-2009-06-04.orig/arch/x86/kernel/io_apic_64-xen.c 2009-03-16 16:38:05.000000000 +0100
7711 +++ sle11-2009-06-04/arch/x86/kernel/io_apic_64-xen.c 2009-06-04 10:21:39.000000000 +0200
7712 @@ -45,6 +45,7 @@
7713 #include <asm/proto.h>
7714 #include <asm/acpi.h>
7715 #include <asm/dma.h>
7716 +#include <asm/i8259.h>
7717 #include <asm/nmi.h>
7718 #include <asm/msidef.h>
7719 #include <asm/hypertransport.h>
7720 @@ -63,10 +64,16 @@ struct irq_cfg {
7721 };
7722
7723 /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
7724 -struct irq_cfg irq_cfg[NR_IRQS] __read_mostly;
7725 +static struct irq_cfg irq_cfg[NR_IRQS] __read_mostly;
7726
7727 static int assign_irq_vector(int irq, cpumask_t mask);
7728
7729 +#ifndef CONFIG_XEN
7730 +int first_system_vector = 0xfe;
7731 +
7732 +char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE};
7733 +#endif
7734 +
7735 #define __apicdebuginit __init
7736
7737 int sis_apic_bug; /* not actually supported, dummy for compile */
7738 @@ -89,14 +96,14 @@ unsigned long io_apic_irqs;
7739
7740 #define clear_IO_APIC() ((void)0)
7741 #else
7742 -int timer_over_8254 __initdata = 1;
7743 +int timer_through_8259 __initdata;
7744
7745 /* Where if anywhere is the i8259 connect in external int mode */
7746 static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
7747 #endif
7748
7749 static DEFINE_SPINLOCK(ioapic_lock);
7750 -DEFINE_SPINLOCK(vector_lock);
7751 +static DEFINE_SPINLOCK(vector_lock);
7752
7753 /*
7754 * # of IRQ routing registers
7755 @@ -104,15 +111,17 @@ DEFINE_SPINLOCK(vector_lock);
7756 int nr_ioapic_registers[MAX_IO_APICS];
7757
7758 /* I/O APIC entries */
7759 -struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
7760 +struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
7761 int nr_ioapics;
7762
7763 /* MP IRQ source entries */
7764 -struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
7765 +struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
7766
7767 /* # of MP IRQ source entries */
7768 int mp_irq_entries;
7769
7770 +DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
7771 +
7772 /*
7773 * Rough estimation of how many shared IRQs there are, can
7774 * be changed anytime.
7775 @@ -141,7 +150,7 @@ struct io_apic {
7776 static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
7777 {
7778 return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
7779 - + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK);
7780 + + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK);
7781 }
7782 #endif
7783
7784 @@ -155,7 +164,7 @@ static inline unsigned int io_apic_read(
7785 struct physdev_apic apic_op;
7786 int ret;
7787
7788 - apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
7789 + apic_op.apic_physbase = mp_ioapics[apic].mp_apicaddr;
7790 apic_op.reg = reg;
7791 ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op);
7792 if (ret)
7793 @@ -173,7 +182,7 @@ static inline void io_apic_write(unsigne
7794 #else
7795 struct physdev_apic apic_op;
7796
7797 - apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
7798 + apic_op.apic_physbase = mp_ioapics[apic].mp_apicaddr;
7799 apic_op.reg = reg;
7800 apic_op.value = value;
7801 WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op));
7802 @@ -209,7 +218,7 @@ static bool io_apic_level_ack_pending(un
7803 break;
7804 reg = io_apic_read(entry->apic, 0x10 + pin*2);
7805 /* Is the remote IRR bit set? */
7806 - if ((reg >> 14) & 1) {
7807 + if (reg & IO_APIC_REDIR_REMOTE_IRR) {
7808 spin_unlock_irqrestore(&ioapic_lock, flags);
7809 return true;
7810 }
7811 @@ -308,7 +317,7 @@ static void __target_IO_APIC_irq(unsigne
7812 break;
7813 io_apic_write(apic, 0x11 + pin*2, dest);
7814 reg = io_apic_read(apic, 0x10 + pin*2);
7815 - reg &= ~0x000000ff;
7816 + reg &= ~IO_APIC_REDIR_VECTOR_MASK;
7817 reg |= vector;
7818 io_apic_modify(apic, reg);
7819 if (!entry->next)
7820 @@ -372,6 +381,26 @@ static void add_pin_to_irq(unsigned int
7821 }
7822
7823 #ifndef CONFIG_XEN
7824 +/*
7825 + * Reroute an IRQ to a different pin.
7826 + */
7827 +static void __init replace_pin_at_irq(unsigned int irq,
7828 + int oldapic, int oldpin,
7829 + int newapic, int newpin)
7830 +{
7831 + struct irq_pin_list *entry = irq_2_pin + irq;
7832 +
7833 + while (1) {
7834 + if (entry->apic == oldapic && entry->pin == oldpin) {
7835 + entry->apic = newapic;
7836 + entry->pin = newpin;
7837 + }
7838 + if (!entry->next)
7839 + break;
7840 + entry = irq_2_pin + entry->next;
7841 + }
7842 +}
7843 +
7844 #define __DO_ACTION(R, ACTION, FINAL) \
7845 \
7846 { \
7847 @@ -399,10 +428,11 @@ static void add_pin_to_irq(unsigned int
7848 static void name##_IO_APIC_irq (unsigned int irq) \
7849 __DO_ACTION(R, ACTION, FINAL)
7850
7851 -DO_ACTION( __mask, 0, |= 0x00010000, io_apic_sync(entry->apic) )
7852 - /* mask = 1 */
7853 -DO_ACTION( __unmask, 0, &= 0xfffeffff, )
7854 - /* mask = 0 */
7855 +/* mask = 1 */
7856 +DO_ACTION(__mask, 0, |= IO_APIC_REDIR_MASKED, io_apic_sync(entry->apic))
7857 +
7858 +/* mask = 0 */
7859 +DO_ACTION(__unmask, 0, &= ~IO_APIC_REDIR_MASKED, )
7860
7861 static void mask_IO_APIC_irq (unsigned int irq)
7862 {
7863 @@ -465,22 +495,6 @@ static int __init disable_timer_pin_setu
7864 }
7865 __setup("disable_timer_pin_1", disable_timer_pin_setup);
7866
7867 -#ifndef CONFIG_XEN
7868 -static int __init setup_disable_8254_timer(char *s)
7869 -{
7870 - timer_over_8254 = -1;
7871 - return 1;
7872 -}
7873 -static int __init setup_enable_8254_timer(char *s)
7874 -{
7875 - timer_over_8254 = 2;
7876 - return 1;
7877 -}
7878 -
7879 -__setup("disable_8254_timer", setup_disable_8254_timer);
7880 -__setup("enable_8254_timer", setup_enable_8254_timer);
7881 -#endif /* !CONFIG_XEN */
7882 -
7883
7884 /*
7885 * Find the IRQ entry number of a certain pin.
7886 @@ -490,10 +504,10 @@ static int find_irq_entry(int apic, int
7887 int i;
7888
7889 for (i = 0; i < mp_irq_entries; i++)
7890 - if (mp_irqs[i].mpc_irqtype == type &&
7891 - (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
7892 - mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
7893 - mp_irqs[i].mpc_dstirq == pin)
7894 + if (mp_irqs[i].mp_irqtype == type &&
7895 + (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid ||
7896 + mp_irqs[i].mp_dstapic == MP_APIC_ALL) &&
7897 + mp_irqs[i].mp_dstirq == pin)
7898 return i;
7899
7900 return -1;
7901 @@ -508,13 +522,13 @@ static int __init find_isa_irq_pin(int i
7902 int i;
7903
7904 for (i = 0; i < mp_irq_entries; i++) {
7905 - int lbus = mp_irqs[i].mpc_srcbus;
7906 + int lbus = mp_irqs[i].mp_srcbus;
7907
7908 if (test_bit(lbus, mp_bus_not_pci) &&
7909 - (mp_irqs[i].mpc_irqtype == type) &&
7910 - (mp_irqs[i].mpc_srcbusirq == irq))
7911 + (mp_irqs[i].mp_irqtype == type) &&
7912 + (mp_irqs[i].mp_srcbusirq == irq))
7913
7914 - return mp_irqs[i].mpc_dstirq;
7915 + return mp_irqs[i].mp_dstirq;
7916 }
7917 return -1;
7918 }
7919 @@ -524,17 +538,17 @@ static int __init find_isa_irq_apic(int
7920 int i;
7921
7922 for (i = 0; i < mp_irq_entries; i++) {
7923 - int lbus = mp_irqs[i].mpc_srcbus;
7924 + int lbus = mp_irqs[i].mp_srcbus;
7925
7926 if (test_bit(lbus, mp_bus_not_pci) &&
7927 - (mp_irqs[i].mpc_irqtype == type) &&
7928 - (mp_irqs[i].mpc_srcbusirq == irq))
7929 + (mp_irqs[i].mp_irqtype == type) &&
7930 + (mp_irqs[i].mp_srcbusirq == irq))
7931 break;
7932 }
7933 if (i < mp_irq_entries) {
7934 int apic;
7935 for(apic = 0; apic < nr_ioapics; apic++) {
7936 - if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic)
7937 + if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic)
7938 return apic;
7939 }
7940 }
7941 @@ -555,28 +569,28 @@ int IO_APIC_get_PCI_irq_vector(int bus,
7942
7943 apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
7944 bus, slot, pin);
7945 - if (mp_bus_id_to_pci_bus[bus] == -1) {
7946 + if (test_bit(bus, mp_bus_not_pci)) {
7947 apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
7948 return -1;
7949 }
7950 for (i = 0; i < mp_irq_entries; i++) {
7951 - int lbus = mp_irqs[i].mpc_srcbus;
7952 + int lbus = mp_irqs[i].mp_srcbus;
7953
7954 for (apic = 0; apic < nr_ioapics; apic++)
7955 - if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
7956 - mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
7957 + if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic ||
7958 + mp_irqs[i].mp_dstapic == MP_APIC_ALL)
7959 break;
7960
7961 if (!test_bit(lbus, mp_bus_not_pci) &&
7962 - !mp_irqs[i].mpc_irqtype &&
7963 + !mp_irqs[i].mp_irqtype &&
7964 (bus == lbus) &&
7965 - (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
7966 - int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
7967 + (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) {
7968 + int irq = pin_2_irq(i,apic,mp_irqs[i].mp_dstirq);
7969
7970 if (!(apic || IO_APIC_IRQ(irq)))
7971 continue;
7972
7973 - if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
7974 + if (pin == (mp_irqs[i].mp_srcbusirq & 3))
7975 return irq;
7976 /*
7977 * Use the first all-but-pin matching entry as a
7978 @@ -604,13 +618,13 @@ int IO_APIC_get_PCI_irq_vector(int bus,
7979
7980 static int MPBIOS_polarity(int idx)
7981 {
7982 - int bus = mp_irqs[idx].mpc_srcbus;
7983 + int bus = mp_irqs[idx].mp_srcbus;
7984 int polarity;
7985
7986 /*
7987 * Determine IRQ line polarity (high active or low active):
7988 */
7989 - switch (mp_irqs[idx].mpc_irqflag & 3)
7990 + switch (mp_irqs[idx].mp_irqflag & 3)
7991 {
7992 case 0: /* conforms, ie. bus-type dependent polarity */
7993 if (test_bit(bus, mp_bus_not_pci))
7994 @@ -646,13 +660,13 @@ static int MPBIOS_polarity(int idx)
7995
7996 static int MPBIOS_trigger(int idx)
7997 {
7998 - int bus = mp_irqs[idx].mpc_srcbus;
7999 + int bus = mp_irqs[idx].mp_srcbus;
8000 int trigger;
8001
8002 /*
8003 * Determine IRQ trigger mode (edge or level sensitive):
8004 */
8005 - switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
8006 + switch ((mp_irqs[idx].mp_irqflag>>2) & 3)
8007 {
8008 case 0: /* conforms, ie. bus-type dependent */
8009 if (test_bit(bus, mp_bus_not_pci))
8010 @@ -699,16 +713,16 @@ static inline int irq_trigger(int idx)
8011 static int pin_2_irq(int idx, int apic, int pin)
8012 {
8013 int irq, i;
8014 - int bus = mp_irqs[idx].mpc_srcbus;
8015 + int bus = mp_irqs[idx].mp_srcbus;
8016
8017 /*
8018 * Debugging check, we are in big trouble if this message pops up!
8019 */
8020 - if (mp_irqs[idx].mpc_dstirq != pin)
8021 + if (mp_irqs[idx].mp_dstirq != pin)
8022 printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
8023
8024 if (test_bit(bus, mp_bus_not_pci)) {
8025 - irq = mp_irqs[idx].mpc_srcbusirq;
8026 + irq = mp_irqs[idx].mp_srcbusirq;
8027 } else {
8028 /*
8029 * PCI IRQs are mapped in order
8030 @@ -722,6 +736,19 @@ static int pin_2_irq(int idx, int apic,
8031 return irq;
8032 }
8033
8034 +void lock_vector_lock(void)
8035 +{
8036 + /* Used to the online set of cpus does not change
8037 + * during assign_irq_vector.
8038 + */
8039 + spin_lock(&vector_lock);
8040 +}
8041 +
8042 +void unlock_vector_lock(void)
8043 +{
8044 + spin_unlock(&vector_lock);
8045 +}
8046 +
8047 static int __assign_irq_vector(int irq, cpumask_t mask)
8048 {
8049 struct physdev_irq irq_op;
8050 @@ -773,7 +800,7 @@ static void __clear_irq_vector(int irq)
8051
8052 vector = cfg->vector;
8053 cpus_and(mask, cfg->domain, cpu_online_map);
8054 - for_each_cpu_mask(cpu, mask)
8055 + for_each_cpu_mask_nr(cpu, mask)
8056 per_cpu(vector_irq, cpu)[vector] = -1;
8057
8058 cfg->vector = 0;
8059 @@ -842,7 +869,7 @@ static void setup_IO_APIC_irq(int apic,
8060 apic_printk(APIC_VERBOSE,KERN_DEBUG
8061 "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
8062 "IRQ %d Mode:%i Active:%i)\n",
8063 - apic, mp_ioapics[apic].mpc_apicid, pin, cfg->vector,
8064 + apic, mp_ioapics[apic].mp_apicid, pin, cfg->vector,
8065 irq, trigger, polarity);
8066
8067 /*
8068 @@ -883,10 +910,10 @@ static void __init setup_IO_APIC_irqs(vo
8069 idx = find_irq_entry(apic,pin,mp_INT);
8070 if (idx == -1) {
8071 if (first_notcon) {
8072 - apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mpc_apicid, pin);
8073 + apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mp_apicid, pin);
8074 first_notcon = 0;
8075 } else
8076 - apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mpc_apicid, pin);
8077 + apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mp_apicid, pin);
8078 continue;
8079 }
8080 if (!first_notcon) {
8081 @@ -908,26 +935,21 @@ static void __init setup_IO_APIC_irqs(vo
8082
8083 #ifndef CONFIG_XEN
8084 /*
8085 - * Set up the 8259A-master output pin as broadcast to all
8086 - * CPUs.
8087 + * Set up the timer pin, possibly with the 8259A-master behind.
8088 */
8089 -static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
8090 +static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
8091 + int vector)
8092 {
8093 struct IO_APIC_route_entry entry;
8094
8095 memset(&entry, 0, sizeof(entry));
8096
8097 - disable_8259A_irq(0);
8098 -
8099 - /* mask LVT0 */
8100 - apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
8101 -
8102 /*
8103 * We use logical delivery to get the timer IRQ
8104 * to the first CPU.
8105 */
8106 entry.dest_mode = INT_DEST_MODE;
8107 - entry.mask = 0; /* unmask IRQ now */
8108 + entry.mask = 1; /* mask IRQ now */
8109 entry.dest = cpu_mask_to_apicid(TARGET_CPUS);
8110 entry.delivery_mode = INT_DELIVERY_MODE;
8111 entry.polarity = 0;
8112 @@ -936,7 +958,7 @@ static void __init setup_ExtINT_IRQ0_pin
8113
8114 /*
8115 * The timer IRQ doesn't have to know that behind the
8116 - * scene we have a 8259A-master in AEOI mode ...
8117 + * scene we may have a 8259A-master in AEOI mode ...
8118 */
8119 set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge");
8120
8121 @@ -944,8 +966,6 @@ static void __init setup_ExtINT_IRQ0_pin
8122 * Add it to the IO-APIC irq-routing table:
8123 */
8124 ioapic_write_entry(apic, pin, entry);
8125 -
8126 - enable_8259A_irq(0);
8127 }
8128
8129 void __apicdebuginit print_IO_APIC(void)
8130 @@ -962,7 +982,7 @@ void __apicdebuginit print_IO_APIC(void)
8131 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
8132 for (i = 0; i < nr_ioapics; i++)
8133 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
8134 - mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
8135 + mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]);
8136
8137 /*
8138 * We are a bit conservative about what we expect. We have to
8139 @@ -980,7 +1000,7 @@ void __apicdebuginit print_IO_APIC(void)
8140 spin_unlock_irqrestore(&ioapic_lock, flags);
8141
8142 printk("\n");
8143 - printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
8144 + printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid);
8145 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
8146 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
8147
8148 @@ -1072,6 +1092,7 @@ void __apicdebuginit print_local_APIC(vo
8149
8150 printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
8151 smp_processor_id(), hard_smp_processor_id());
8152 + v = apic_read(APIC_ID);
8153 printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(read_apic_id()));
8154 v = apic_read(APIC_LVR);
8155 printk(KERN_INFO "... APIC VERSION: %08x\n", v);
8156 @@ -1141,7 +1162,7 @@ void __apicdebuginit print_local_APIC(vo
8157
8158 void print_all_local_APICs (void)
8159 {
8160 - on_each_cpu(print_local_APIC, NULL, 1, 1);
8161 + on_each_cpu(print_local_APIC, NULL, 1);
8162 }
8163
8164 void __apicdebuginit print_PIC(void)
8165 @@ -1175,6 +1196,8 @@ void __apicdebuginit print_PIC(void)
8166 v = inb(0x4d1) << 8 | inb(0x4d0);
8167 printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
8168 }
8169 +#else
8170 +void __apicdebuginit print_IO_APIC(void) {}
8171 #endif /* !CONFIG_XEN */
8172
8173 void __init enable_IO_APIC(void)
8174 @@ -1359,12 +1382,10 @@ static unsigned int startup_ioapic_irq(u
8175 static int ioapic_retrigger_irq(unsigned int irq)
8176 {
8177 struct irq_cfg *cfg = &irq_cfg[irq];
8178 - cpumask_t mask;
8179 unsigned long flags;
8180
8181 spin_lock_irqsave(&vector_lock, flags);
8182 - mask = cpumask_of_cpu(first_cpu(cfg->domain));
8183 - send_IPI_mask(mask, cfg->vector);
8184 + send_IPI_mask(cpumask_of_cpu(first_cpu(cfg->domain)), cfg->vector);
8185 spin_unlock_irqrestore(&vector_lock, flags);
8186
8187 return 1;
8188 @@ -1545,7 +1566,7 @@ static inline void init_IO_APIC_traps(vo
8189 }
8190
8191 #ifndef CONFIG_XEN
8192 -static void enable_lapic_irq (unsigned int irq)
8193 +static void unmask_lapic_irq(unsigned int irq)
8194 {
8195 unsigned long v;
8196
8197 @@ -1553,7 +1574,7 @@ static void enable_lapic_irq (unsigned i
8198 apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
8199 }
8200
8201 -static void disable_lapic_irq (unsigned int irq)
8202 +static void mask_lapic_irq(unsigned int irq)
8203 {
8204 unsigned long v;
8205
8206 @@ -1566,19 +1587,20 @@ static void ack_lapic_irq (unsigned int
8207 ack_APIC_irq();
8208 }
8209
8210 -static void end_lapic_irq (unsigned int i) { /* nothing */ }
8211 -
8212 -static struct hw_interrupt_type lapic_irq_type __read_mostly = {
8213 - .name = "local-APIC",
8214 - .typename = "local-APIC-edge",
8215 - .startup = NULL, /* startup_irq() not used for IRQ0 */
8216 - .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */
8217 - .enable = enable_lapic_irq,
8218 - .disable = disable_lapic_irq,
8219 - .ack = ack_lapic_irq,
8220 - .end = end_lapic_irq,
8221 +static struct irq_chip lapic_chip __read_mostly = {
8222 + .name = "local-APIC",
8223 + .mask = mask_lapic_irq,
8224 + .unmask = unmask_lapic_irq,
8225 + .ack = ack_lapic_irq,
8226 };
8227
8228 +static void lapic_register_intr(int irq)
8229 +{
8230 + irq_desc[irq].status &= ~IRQ_LEVEL;
8231 + set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
8232 + "edge");
8233 +}
8234 +
8235 static void __init setup_nmi(void)
8236 {
8237 /*
8238 @@ -1664,6 +1686,7 @@ static inline void __init check_timer(vo
8239 struct irq_cfg *cfg = irq_cfg + 0;
8240 int apic1, pin1, apic2, pin2;
8241 unsigned long flags;
8242 + int no_pin1 = 0;
8243
8244 local_irq_save(flags);
8245
8246 @@ -1674,34 +1697,48 @@ static inline void __init check_timer(vo
8247 assign_irq_vector(0, TARGET_CPUS);
8248
8249 /*
8250 - * Subtle, code in do_timer_interrupt() expects an AEOI
8251 - * mode for the 8259A whenever interrupts are routed
8252 - * through I/O APICs. Also IRQ0 has to be enabled in
8253 - * the 8259A which implies the virtual wire has to be
8254 - * disabled in the local APIC.
8255 + * As IRQ0 is to be enabled in the 8259A, the virtual
8256 + * wire has to be disabled in the local APIC.
8257 */
8258 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
8259 init_8259A(1);
8260 - if (timer_over_8254 > 0)
8261 - enable_8259A_irq(0);
8262
8263 pin1 = find_isa_irq_pin(0, mp_INT);
8264 apic1 = find_isa_irq_apic(0, mp_INT);
8265 pin2 = ioapic_i8259.pin;
8266 apic2 = ioapic_i8259.apic;
8267
8268 - apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
8269 - cfg->vector, apic1, pin1, apic2, pin2);
8270 + apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X "
8271 + "apic1=%d pin1=%d apic2=%d pin2=%d\n",
8272 + cfg->vector, apic1, pin1, apic2, pin2);
8273 +
8274 + /*
8275 + * Some BIOS writers are clueless and report the ExtINTA
8276 + * I/O APIC input from the cascaded 8259A as the timer
8277 + * interrupt input. So just in case, if only one pin
8278 + * was found above, try it both directly and through the
8279 + * 8259A.
8280 + */
8281 + if (pin1 == -1) {
8282 + pin1 = pin2;
8283 + apic1 = apic2;
8284 + no_pin1 = 1;
8285 + } else if (pin2 == -1) {
8286 + pin2 = pin1;
8287 + apic2 = apic1;
8288 + }
8289
8290 if (pin1 != -1) {
8291 /*
8292 * Ok, does IRQ0 through the IOAPIC work?
8293 */
8294 + if (no_pin1) {
8295 + add_pin_to_irq(0, apic1, pin1);
8296 + setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
8297 + }
8298 unmask_IO_APIC_irq(0);
8299 if (!no_timer_check && timer_irq_works()) {
8300 - nmi_watchdog_default();
8301 if (nmi_watchdog == NMI_IO_APIC) {
8302 - disable_8259A_irq(0);
8303 setup_nmi();
8304 enable_8259A_irq(0);
8305 }
8306 @@ -1710,54 +1747,62 @@ static inline void __init check_timer(vo
8307 goto out;
8308 }
8309 clear_IO_APIC_pin(apic1, pin1);
8310 - apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: 8254 timer not "
8311 - "connected to IO-APIC\n");
8312 - }
8313 -
8314 - apic_printk(APIC_VERBOSE,KERN_INFO "...trying to set up timer (IRQ0) "
8315 - "through the 8259A ... ");
8316 - if (pin2 != -1) {
8317 - apic_printk(APIC_VERBOSE,"\n..... (found apic %d pin %d) ...",
8318 - apic2, pin2);
8319 + if (!no_pin1)
8320 + apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
8321 + "8254 timer not connected to IO-APIC\n");
8322 +
8323 + apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer "
8324 + "(IRQ0) through the 8259A ...\n");
8325 + apic_printk(APIC_QUIET, KERN_INFO
8326 + "..... (found apic %d pin %d) ...\n", apic2, pin2);
8327 /*
8328 * legacy devices should be connected to IO APIC #0
8329 */
8330 - setup_ExtINT_IRQ0_pin(apic2, pin2, cfg->vector);
8331 + replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
8332 + setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
8333 + unmask_IO_APIC_irq(0);
8334 + enable_8259A_irq(0);
8335 if (timer_irq_works()) {
8336 - apic_printk(APIC_VERBOSE," works.\n");
8337 - nmi_watchdog_default();
8338 + apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
8339 + timer_through_8259 = 1;
8340 if (nmi_watchdog == NMI_IO_APIC) {
8341 + disable_8259A_irq(0);
8342 setup_nmi();
8343 + enable_8259A_irq(0);
8344 }
8345 goto out;
8346 }
8347 /*
8348 * Cleanup, just in case ...
8349 */
8350 + disable_8259A_irq(0);
8351 clear_IO_APIC_pin(apic2, pin2);
8352 + apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
8353 }
8354 - apic_printk(APIC_VERBOSE," failed.\n");
8355
8356 if (nmi_watchdog == NMI_IO_APIC) {
8357 - printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
8358 - nmi_watchdog = 0;
8359 + apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work "
8360 + "through the IO-APIC - disabling NMI Watchdog!\n");
8361 + nmi_watchdog = NMI_NONE;
8362 }
8363
8364 - apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
8365 + apic_printk(APIC_QUIET, KERN_INFO
8366 + "...trying to set up timer as Virtual Wire IRQ...\n");
8367
8368 - disable_8259A_irq(0);
8369 - irq_desc[0].chip = &lapic_irq_type;
8370 + lapic_register_intr(0);
8371 apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */
8372 enable_8259A_irq(0);
8373
8374 if (timer_irq_works()) {
8375 - apic_printk(APIC_VERBOSE," works.\n");
8376 + apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
8377 goto out;
8378 }
8379 + disable_8259A_irq(0);
8380 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
8381 - apic_printk(APIC_VERBOSE," failed.\n");
8382 + apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
8383
8384 - apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as ExtINT IRQ...");
8385 + apic_printk(APIC_QUIET, KERN_INFO
8386 + "...trying to set up timer as ExtINT IRQ...\n");
8387
8388 init_8259A(0);
8389 make_8259A_irq(0);
8390 @@ -1766,11 +1811,12 @@ static inline void __init check_timer(vo
8391 unlock_ExtINT_logic();
8392
8393 if (timer_irq_works()) {
8394 - apic_printk(APIC_VERBOSE," works.\n");
8395 + apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
8396 goto out;
8397 }
8398 - apic_printk(APIC_VERBOSE," failed :(.\n");
8399 - panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n");
8400 + apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
8401 + panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
8402 + "report. Then try booting with the 'noapic' option.\n");
8403 out:
8404 local_irq_restore(flags);
8405 }
8406 @@ -1788,10 +1834,21 @@ __setup("no_timer_check", notimercheck);
8407
8408 /*
8409 *
8410 - * IRQs that are handled by the PIC in the MPS IOAPIC case.
8411 - * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ.
8412 - * Linux doesn't really care, as it's not actually used
8413 - * for any interrupt handling anyway.
8414 + * Traditionally ISA IRQ2 is the cascade IRQ, and is not available
8415 + * to devices. However there may be an I/O APIC pin available for
8416 + * this interrupt regardless. The pin may be left unconnected, but
8417 + * typically it will be reused as an ExtINT cascade interrupt for
8418 + * the master 8259A. In the MPS case such a pin will normally be
8419 + * reported as an ExtINT interrupt in the MP table. With ACPI
8420 + * there is no provision for ExtINT interrupts, and in the absence
8421 + * of an override it would be treated as an ordinary ISA I/O APIC
8422 + * interrupt, that is edge-triggered and unmasked by default. We
8423 + * used to do this, but it caused problems on some systems because
8424 + * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using
8425 + * the same ExtINT cascade interrupt to drive the local APIC of the
8426 + * bootstrap processor. Therefore we refrain from routing IRQ2 to
8427 + * the I/O APIC in all cases now. No actual device should request
8428 + * it anyway. --macro
8429 */
8430 #define PIC_IRQS (1<<2)
8431
8432 @@ -1799,10 +1856,7 @@ void __init setup_IO_APIC(void)
8433 {
8434 enable_IO_APIC();
8435
8436 - if (acpi_ioapic)
8437 - io_apic_irqs = ~0; /* all IRQs go through IOAPIC */
8438 - else
8439 - io_apic_irqs = ~PIC_IRQS;
8440 + io_apic_irqs = ~PIC_IRQS;
8441
8442 apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
8443
8444 @@ -1851,8 +1905,8 @@ static int ioapic_resume(struct sys_devi
8445
8446 spin_lock_irqsave(&ioapic_lock, flags);
8447 reg_00.raw = io_apic_read(dev->id, 0);
8448 - if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
8449 - reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
8450 + if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) {
8451 + reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid;
8452 io_apic_write(dev->id, 0, reg_00.raw);
8453 }
8454 spin_unlock_irqrestore(&ioapic_lock, flags);
8455 @@ -2254,8 +2308,8 @@ int acpi_get_override_irq(int bus_irq, i
8456 return -1;
8457
8458 for (i = 0; i < mp_irq_entries; i++)
8459 - if (mp_irqs[i].mpc_irqtype == mp_INT &&
8460 - mp_irqs[i].mpc_srcbusirq == bus_irq)
8461 + if (mp_irqs[i].mp_irqtype == mp_INT &&
8462 + mp_irqs[i].mp_srcbusirq == bus_irq)
8463 break;
8464 if (i >= mp_irq_entries)
8465 return -1;
8466 @@ -2349,7 +2403,7 @@ void __init ioapic_init_mappings(void)
8467 ioapic_res = ioapic_setup_resources();
8468 for (i = 0; i < nr_ioapics; i++) {
8469 if (smp_found_config) {
8470 - ioapic_phys = mp_ioapics[i].mpc_apicaddr;
8471 + ioapic_phys = mp_ioapics[i].mp_apicaddr;
8472 } else {
8473 ioapic_phys = (unsigned long)
8474 alloc_bootmem_pages(PAGE_SIZE);
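
Several hunks in this file replace open-coded constants with IO_APIC_REDIR_* masks (0x00010000 for the mask bit, bit 14 for remote IRR, 0x000000ff for the vector field). A small self-contained sketch of how those masks are used when reading and retargeting a redirection entry; the numeric values are taken from the removed lines above, while the sample register value is made up:

/* Illustrative only: same bit manipulation the hunks perform, on a dummy value. */
#include <stdio.h>

#define IO_APIC_REDIR_VECTOR_MASK  0x000FFu
#define IO_APIC_REDIR_REMOTE_IRR   (1u << 14)
#define IO_APIC_REDIR_MASKED       0x10000u

int main(void)
{
        unsigned int reg = 0x00004030u;   /* vector 0x30, remote IRR set */

        printf("vector     : %#x\n", reg & IO_APIC_REDIR_VECTOR_MASK);
        printf("remote IRR : %d\n", !!(reg & IO_APIC_REDIR_REMOTE_IRR));

        reg |= IO_APIC_REDIR_MASKED;                      /* mask the pin */
        reg = (reg & ~IO_APIC_REDIR_VECTOR_MASK) | 0x41;  /* retarget the vector */
        printf("new entry  : %#010x\n", reg);
        return 0;
}

Switching to named masks is purely cosmetic; the register layout itself is unchanged by the patch.
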
8475 --- sle11-2009-06-04.orig/arch/x86/kernel/ipi-xen.c 2009-03-16 16:38:05.000000000 +0100
8476 +++ sle11-2009-06-04/arch/x86/kernel/ipi-xen.c 2009-06-04 10:21:39.000000000 +0200
8477 @@ -8,7 +8,6 @@
8478 #include <linux/kernel_stat.h>
8479 #include <linux/mc146818rtc.h>
8480 #include <linux/cache.h>
8481 -#include <linux/interrupt.h>
8482 #include <linux/cpu.h>
8483 #include <linux/module.h>
8484
8485 @@ -85,7 +84,7 @@ void __send_IPI_shortcut(unsigned int sh
8486 /*
8487 * Send the IPI. The write to APIC_ICR fires this off.
8488 */
8489 - apic_write_around(APIC_ICR, cfg);
8490 + apic_write(APIC_ICR, cfg);
8491 #else
8492 int cpu;
8493
8494 @@ -132,7 +131,7 @@ static inline void __send_IPI_dest_field
8495 * prepare target chip field
8496 */
8497 cfg = __prepare_ICR2(mask);
8498 - apic_write_around(APIC_ICR2, cfg);
8499 + apic_write(APIC_ICR2, cfg);
8500
8501 /*
8502 * program the ICR
8503 @@ -142,7 +141,7 @@ static inline void __send_IPI_dest_field
8504 /*
8505 * Send the IPI. The write to APIC_ICR fires this off.
8506 */
8507 - apic_write_around(APIC_ICR, cfg);
8508 + apic_write(APIC_ICR, cfg);
8509 }
8510 #endif
8511
8512 --- sle11-2009-06-04.orig/arch/x86/kernel/irq_32-xen.c 2009-03-16 16:38:05.000000000 +0100
8513 +++ sle11-2009-06-04/arch/x86/kernel/irq_32-xen.c 2009-06-04 10:21:39.000000000 +0200
8514 @@ -48,6 +48,29 @@ void ack_bad_irq(unsigned int irq)
8515 #endif
8516 }
8517
8518 +#ifdef CONFIG_DEBUG_STACKOVERFLOW
8519 +/* Debugging check for stack overflow: is there less than 1KB free? */
8520 +static int check_stack_overflow(void)
8521 +{
8522 + long sp;
8523 +
8524 + __asm__ __volatile__("andl %%esp,%0" :
8525 + "=r" (sp) : "0" (THREAD_SIZE - 1));
8526 +
8527 + return sp < (sizeof(struct thread_info) + STACK_WARN);
8528 +}
8529 +
8530 +static void print_stack_overflow(void)
8531 +{
8532 + printk(KERN_WARNING "low stack detected by irq handler\n");
8533 + dump_stack();
8534 +}
8535 +
8536 +#else
8537 +static inline int check_stack_overflow(void) { return 0; }
8538 +static inline void print_stack_overflow(void) { }
8539 +#endif
8540 +
8541 #ifdef CONFIG_4KSTACKS
8542 /*
8543 * per-CPU IRQ handling contexts (thread information and stack)
8544 @@ -59,48 +82,26 @@ union irq_ctx {
8545
8546 static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly;
8547 static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly;
8548 -#endif
8549 -
8550 -/*
8551 - * do_IRQ handles all normal device IRQ's (the special
8552 - * SMP cross-CPU interrupts have their own specific
8553 - * handlers).
8554 - */
8555 -unsigned int do_IRQ(struct pt_regs *regs)
8556 -{
8557 - struct pt_regs *old_regs;
8558 - /* high bit used in ret_from_ code */
8559 - int irq = ~regs->orig_ax;
8560 - struct irq_desc *desc = irq_desc + irq;
8561 -#ifdef CONFIG_4KSTACKS
8562 - union irq_ctx *curctx, *irqctx;
8563 - u32 *isp;
8564 -#endif
8565
8566 - if (unlikely((unsigned)irq >= NR_IRQS)) {
8567 - printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
8568 - __func__, irq);
8569 - BUG();
8570 - }
8571 +static char softirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
8572 +static char hardirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
8573
8574 - old_regs = set_irq_regs(regs);
8575 - /*irq_enter();*/
8576 -#ifdef CONFIG_DEBUG_STACKOVERFLOW
8577 - /* Debugging check for stack overflow: is there less than 1KB free? */
8578 - {
8579 - long sp;
8580 -
8581 - __asm__ __volatile__("andl %%esp,%0" :
8582 - "=r" (sp) : "0" (THREAD_SIZE - 1));
8583 - if (unlikely(sp < (sizeof(struct thread_info) + STACK_WARN))) {
8584 - printk("do_IRQ: stack overflow: %ld\n",
8585 - sp - sizeof(struct thread_info));
8586 - dump_stack();
8587 - }
8588 - }
8589 -#endif
8590 +static void call_on_stack(void *func, void *stack)
8591 +{
8592 + asm volatile("xchgl %%ebx,%%esp \n"
8593 + "call *%%edi \n"
8594 + "movl %%ebx,%%esp \n"
8595 + : "=b" (stack)
8596 + : "0" (stack),
8597 + "D"(func)
8598 + : "memory", "cc", "edx", "ecx", "eax");
8599 +}
8600
8601 -#ifdef CONFIG_4KSTACKS
8602 +static inline int
8603 +execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
8604 +{
8605 + union irq_ctx *curctx, *irqctx;
8606 + u32 *isp, arg1, arg2;
8607
8608 curctx = (union irq_ctx *) current_thread_info();
8609 irqctx = hardirq_ctx[smp_processor_id()];
8610 @@ -111,52 +112,39 @@ unsigned int do_IRQ(struct pt_regs *regs
8611 * handler) we can't do that and just have to keep using the
8612 * current stack (which is the irq stack already after all)
8613 */
8614 - if (curctx != irqctx) {
8615 - int arg1, arg2, bx;
8616 -
8617 - /* build the stack frame on the IRQ stack */
8618 - isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
8619 - irqctx->tinfo.task = curctx->tinfo.task;
8620 - irqctx->tinfo.previous_esp = current_stack_pointer;
8621 + if (unlikely(curctx == irqctx))
8622 + return 0;
8623
8624 - /*
8625 - * Copy the softirq bits in preempt_count so that the
8626 - * softirq checks work in the hardirq context.
8627 - */
8628 - irqctx->tinfo.preempt_count =
8629 - (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) |
8630 - (curctx->tinfo.preempt_count & SOFTIRQ_MASK);
8631 -
8632 - asm volatile(
8633 - " xchgl %%ebx,%%esp \n"
8634 - " call *%%edi \n"
8635 - " movl %%ebx,%%esp \n"
8636 - : "=a" (arg1), "=d" (arg2), "=b" (bx)
8637 - : "0" (irq), "1" (desc), "2" (isp),
8638 - "D" (desc->handle_irq)
8639 - : "memory", "cc", "ecx"
8640 - );
8641 - } else
8642 -#endif
8643 - desc->handle_irq(irq, desc);
8644 + /* build the stack frame on the IRQ stack */
8645 + isp = (u32 *) ((char*)irqctx + sizeof(*irqctx));
8646 + irqctx->tinfo.task = curctx->tinfo.task;
8647 + irqctx->tinfo.previous_esp = current_stack_pointer;
8648
8649 - /*irq_exit();*/
8650 - set_irq_regs(old_regs);
8651 + /*
8652 + * Copy the softirq bits in preempt_count so that the
8653 + * softirq checks work in the hardirq context.
8654 + */
8655 + irqctx->tinfo.preempt_count =
8656 + (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) |
8657 + (curctx->tinfo.preempt_count & SOFTIRQ_MASK);
8658 +
8659 + if (unlikely(overflow))
8660 + call_on_stack(print_stack_overflow, isp);
8661 +
8662 + asm volatile("xchgl %%ebx,%%esp \n"
8663 + "call *%%edi \n"
8664 + "movl %%ebx,%%esp \n"
8665 + : "=a" (arg1), "=d" (arg2), "=b" (isp)
8666 + : "0" (irq), "1" (desc), "2" (isp),
8667 + "D" (desc->handle_irq)
8668 + : "memory", "cc", "ecx");
8669 return 1;
8670 }
8671
8672 -#ifdef CONFIG_4KSTACKS
8673 -
8674 -static char softirq_stack[NR_CPUS * THREAD_SIZE]
8675 - __attribute__((__section__(".bss.page_aligned")));
8676 -
8677 -static char hardirq_stack[NR_CPUS * THREAD_SIZE]
8678 - __attribute__((__section__(".bss.page_aligned")));
8679 -
8680 /*
8681 * allocate per-cpu stacks for hardirq and for softirq processing
8682 */
8683 -void irq_ctx_init(int cpu)
8684 +void __cpuinit irq_ctx_init(int cpu)
8685 {
8686 union irq_ctx *irqctx;
8687
8688 @@ -164,25 +152,25 @@ void irq_ctx_init(int cpu)
8689 return;
8690
8691 irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE];
8692 - irqctx->tinfo.task = NULL;
8693 - irqctx->tinfo.exec_domain = NULL;
8694 - irqctx->tinfo.cpu = cpu;
8695 - irqctx->tinfo.preempt_count = HARDIRQ_OFFSET;
8696 - irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
8697 + irqctx->tinfo.task = NULL;
8698 + irqctx->tinfo.exec_domain = NULL;
8699 + irqctx->tinfo.cpu = cpu;
8700 + irqctx->tinfo.preempt_count = HARDIRQ_OFFSET;
8701 + irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
8702
8703 hardirq_ctx[cpu] = irqctx;
8704
8705 irqctx = (union irq_ctx*) &softirq_stack[cpu*THREAD_SIZE];
8706 - irqctx->tinfo.task = NULL;
8707 - irqctx->tinfo.exec_domain = NULL;
8708 - irqctx->tinfo.cpu = cpu;
8709 - irqctx->tinfo.preempt_count = 0;
8710 - irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
8711 + irqctx->tinfo.task = NULL;
8712 + irqctx->tinfo.exec_domain = NULL;
8713 + irqctx->tinfo.cpu = cpu;
8714 + irqctx->tinfo.preempt_count = 0;
8715 + irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
8716
8717 softirq_ctx[cpu] = irqctx;
8718
8719 - printk("CPU %u irqstacks, hard=%p soft=%p\n",
8720 - cpu,hardirq_ctx[cpu],softirq_ctx[cpu]);
8721 + printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n",
8722 + cpu,hardirq_ctx[cpu],softirq_ctx[cpu]);
8723 }
8724
8725 void irq_ctx_exit(int cpu)
8726 @@ -211,25 +199,56 @@ asmlinkage void do_softirq(void)
8727 /* build the stack frame on the softirq stack */
8728 isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
8729
8730 - asm volatile(
8731 - " xchgl %%ebx,%%esp \n"
8732 - " call __do_softirq \n"
8733 - " movl %%ebx,%%esp \n"
8734 - : "=b"(isp)
8735 - : "0"(isp)
8736 - : "memory", "cc", "edx", "ecx", "eax"
8737 - );
8738 + call_on_stack(__do_softirq, isp);
8739 /*
8740 * Shouldnt happen, we returned above if in_interrupt():
8741 - */
8742 + */
8743 WARN_ON_ONCE(softirq_count());
8744 }
8745
8746 local_irq_restore(flags);
8747 }
8748 +
8749 +#else
8750 +static inline int
8751 +execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) { return 0; }
8752 #endif
8753
8754 /*
8755 + * do_IRQ handles all normal device IRQ's (the special
8756 + * SMP cross-CPU interrupts have their own specific
8757 + * handlers).
8758 + */
8759 +unsigned int do_IRQ(struct pt_regs *regs)
8760 +{
8761 + struct pt_regs *old_regs;
8762 + /* high bit used in ret_from_ code */
8763 + int overflow, irq = ~regs->orig_ax;
8764 + struct irq_desc *desc = irq_desc + irq;
8765 +
8766 + if (unlikely((unsigned)irq >= NR_IRQS)) {
8767 + printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
8768 + __func__, irq);
8769 + BUG();
8770 + }
8771 +
8772 + old_regs = set_irq_regs(regs);
8773 + /*irq_enter();*/
8774 +
8775 + overflow = check_stack_overflow();
8776 +
8777 + if (!execute_on_irq_stack(overflow, desc, irq)) {
8778 + if (unlikely(overflow))
8779 + print_stack_overflow();
8780 + desc->handle_irq(irq, desc);
8781 + }
8782 +
8783 + /*irq_exit();*/
8784 + set_irq_regs(old_regs);
8785 + return 1;
8786 +}
8787 +
8788 +/*
8789 * Interrupt statistics:
8790 */
8791
8792 @@ -337,6 +356,42 @@ skip:
8793 return 0;
8794 }
8795
8796 +/*
8797 + * /proc/stat helpers
8798 + */
8799 +u64 arch_irq_stat_cpu(unsigned int cpu)
8800 +{
8801 + u64 sum = nmi_count(cpu);
8802 +
8803 +#ifdef CONFIG_X86_LOCAL_APIC
8804 + sum += per_cpu(irq_stat, cpu).apic_timer_irqs;
8805 +#endif
8806 +#ifdef CONFIG_SMP
8807 + sum += per_cpu(irq_stat, cpu).irq_resched_count;
8808 + sum += per_cpu(irq_stat, cpu).irq_call_count;
8809 +#ifndef CONFIG_XEN
8810 + sum += per_cpu(irq_stat, cpu).irq_tlb_count;
8811 +#endif
8812 +#endif
8813 +#ifdef CONFIG_X86_MCE
8814 + sum += per_cpu(irq_stat, cpu).irq_thermal_count;
8815 +#endif
8816 +#ifdef CONFIG_X86_LOCAL_APIC
8817 + sum += per_cpu(irq_stat, cpu).irq_spurious_count;
8818 +#endif
8819 + return sum;
8820 +}
8821 +
8822 +u64 arch_irq_stat(void)
8823 +{
8824 + u64 sum = atomic_read(&irq_err_count);
8825 +
8826 +#ifdef CONFIG_X86_IO_APIC
8827 + sum += atomic_read(&irq_mis_count);
8828 +#endif
8829 + return sum;
8830 +}
8831 +
8832 #ifdef CONFIG_HOTPLUG_CPU
8833
8834 void fixup_irqs(cpumask_t map)
8835 --- sle11-2009-06-04.orig/arch/x86/kernel/irq_64-xen.c 2009-03-16 16:33:40.000000000 +0100
8836 +++ sle11-2009-06-04/arch/x86/kernel/irq_64-xen.c 2009-06-04 10:21:39.000000000 +0200
8837 @@ -163,6 +163,34 @@ skip:
8838 }
8839
8840 /*
8841 + * /proc/stat helpers
8842 + */
8843 +u64 arch_irq_stat_cpu(unsigned int cpu)
8844 +{
8845 + u64 sum = cpu_pda(cpu)->__nmi_count;
8846 +
8847 + sum += cpu_pda(cpu)->apic_timer_irqs;
8848 +#ifdef CONFIG_SMP
8849 + sum += cpu_pda(cpu)->irq_resched_count;
8850 + sum += cpu_pda(cpu)->irq_call_count;
8851 +#ifndef CONFIG_XEN
8852 + sum += cpu_pda(cpu)->irq_tlb_count;
8853 +#endif
8854 +#endif
8855 +#ifdef CONFIG_X86_MCE
8856 + sum += cpu_pda(cpu)->irq_thermal_count;
8857 + sum += cpu_pda(cpu)->irq_threshold_count;
8858 +#endif
8859 + sum += cpu_pda(cpu)->irq_spurious_count;
8860 + return sum;
8861 +}
8862 +
8863 +u64 arch_irq_stat(void)
8864 +{
8865 + return atomic_read(&irq_err_count);
8866 +}
8867 +
8868 +/*
8869 * do_IRQ handles all normal device IRQ's (the special
8870 * SMP cross-CPU interrupts have their own specific
8871 * handlers).
8872 --- sle11-2009-06-04.orig/arch/x86/kernel/ldt-xen.c 2009-03-16 16:33:40.000000000 +0100
8873 +++ sle11-2009-06-04/arch/x86/kernel/ldt-xen.c 2009-06-04 10:21:39.000000000 +0200
8874 @@ -20,9 +20,9 @@
8875 #include <asm/mmu_context.h>
8876
8877 #ifdef CONFIG_SMP
8878 -static void flush_ldt(void *null)
8879 +static void flush_ldt(void *current_mm)
8880 {
8881 - if (current->active_mm)
8882 + if (current->active_mm == current_mm)
8883 load_LDT(&current->active_mm->context);
8884 }
8885 #endif
8886 @@ -62,8 +62,6 @@ static int alloc_ldt(mm_context_t *pc, i
8887
8888 if (reload) {
8889 #ifdef CONFIG_SMP
8890 - cpumask_t mask;
8891 -
8892 preempt_disable();
8893 #endif
8894 make_pages_readonly(newldt,
8895 @@ -71,9 +69,9 @@ static int alloc_ldt(mm_context_t *pc, i
8896 XENFEAT_writable_descriptor_tables);
8897 load_LDT(pc);
8898 #ifdef CONFIG_SMP
8899 - mask = cpumask_of_cpu(smp_processor_id());
8900 - if (!cpus_equal(current->mm->cpu_vm_mask, mask))
8901 - smp_call_function(flush_ldt, NULL, 1, 1);
8902 + if (!cpus_equal(current->mm->cpu_vm_mask,
8903 + cpumask_of_cpu(smp_processor_id())))
8904 + smp_call_function(flush_ldt, current->mm, 1);
8905 preempt_enable();
8906 #endif
8907 }
8908 --- sle11-2009-06-04.orig/arch/x86/kernel/machine_kexec_32.c 2008-11-25 12:35:53.000000000 +0100
8909 +++ sle11-2009-06-04/arch/x86/kernel/machine_kexec_32.c 2009-06-04 10:21:39.000000000 +0200
8910 @@ -68,6 +68,8 @@ void machine_kexec_setup_load_arg(xen_ke
8911 xki->page_list[PA_PTE_0] = __ma(kexec_pte0);
8912 xki->page_list[PA_PTE_1] = __ma(kexec_pte1);
8913
8914 + if (image->type == KEXEC_TYPE_DEFAULT)
8915 + xki->page_list[PA_SWAP_PAGE] = page_to_phys(image->swap_page);
8916 }
8917
8918 int __init machine_kexec_setup_resources(struct resource *hypervisor,
8919 --- sle11-2009-06-04.orig/arch/x86/kernel/microcode-xen.c 2009-03-16 16:38:05.000000000 +0100
8920 +++ sle11-2009-06-04/arch/x86/kernel/microcode-xen.c 2009-06-04 10:21:39.000000000 +0200
8921 @@ -5,13 +5,14 @@
8922 * 2006 Shaohua Li <shaohua.li@intel.com>
8923 *
8924 * This driver allows to upgrade microcode on Intel processors
8925 - * belonging to IA-32 family - PentiumPro, Pentium II,
8926 + * belonging to IA-32 family - PentiumPro, Pentium II,
8927 * Pentium III, Xeon, Pentium 4, etc.
8928 *
8929 - * Reference: Section 8.10 of Volume III, Intel Pentium 4 Manual,
8930 - * Order Number 245472 or free download from:
8931 - *
8932 - * http://developer.intel.com/design/pentium4/manuals/245472.htm
8933 + * Reference: Section 8.11 of Volume 3a, IA-32 Intel® Architecture
8934 + * Software Developer's Manual
8935 + * Order Number 253668 or free download from:
8936 + *
8937 + * http://developer.intel.com/design/pentium4/manuals/253668.htm
8938 *
8939 * For more information, go to http://www.urbanmyth.org/microcode
8940 *
8941 @@ -26,6 +27,7 @@
8942 #include <linux/kernel.h>
8943 #include <linux/init.h>
8944 #include <linux/sched.h>
8945 +#include <linux/smp_lock.h>
8946 #include <linux/cpumask.h>
8947 #include <linux/module.h>
8948 #include <linux/slab.h>
8949 @@ -86,6 +88,7 @@ static int do_microcode_update (const vo
8950
8951 static int microcode_open (struct inode *unused1, struct file *unused2)
8952 {
8953 + cycle_kernel_lock();
8954 return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
8955 }
8956
8957 @@ -162,7 +165,7 @@ static int request_microcode(void)
8958 c->x86, c->x86_model, c->x86_mask);
8959 error = request_firmware(&firmware, name, &microcode_pdev->dev);
8960 if (error) {
8961 - pr_debug("microcode: ucode data file %s load failed\n", name);
8962 + pr_debug("microcode: data file %s load failed\n", name);
8963 return error;
8964 }
8965
8966 @@ -183,6 +186,9 @@ static int __init microcode_init (void)
8967 {
8968 int error;
8969
8970 + printk(KERN_INFO
8971 + "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@aivazian.fsnet.co.uk>\n");
8972 +
8973 error = microcode_dev_init();
8974 if (error)
8975 return error;
8976 @@ -195,8 +201,6 @@ static int __init microcode_init (void)
8977
8978 request_microcode();
8979
8980 - printk(KERN_INFO
8981 - "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@aivazian.fsnet.co.uk>\n");
8982 return 0;
8983 }
8984
8985 --- sle11-2009-06-04.orig/arch/x86/kernel/mpparse-xen.c 2009-03-16 16:38:05.000000000 +0100
8986 +++ sle11-2009-06-04/arch/x86/kernel/mpparse-xen.c 2009-06-04 10:21:39.000000000 +0200
8987 @@ -25,6 +25,9 @@
8988 #include <asm/proto.h>
8989 #include <asm/acpi.h>
8990 #include <asm/bios_ebda.h>
8991 +#include <asm/e820.h>
8992 +#include <asm/trampoline.h>
8993 +#include <asm/setup.h>
8994
8995 #include <mach_apic.h>
8996 #ifdef CONFIG_X86_32
8997 @@ -32,27 +35,10 @@
8998 #include <mach_mpparse.h>
8999 #endif
9000
9001 -/* Have we found an MP table */
9002 -int smp_found_config;
9003 -
9004 -/*
9005 - * Various Linux-internal data structures created from the
9006 - * MP-table.
9007 - */
9008 -#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
9009 -int mp_bus_id_to_type[MAX_MP_BUSSES];
9010 -#endif
9011 -
9012 -DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
9013 -int mp_bus_id_to_pci_bus[MAX_MP_BUSSES] = {[0 ... MAX_MP_BUSSES - 1] = -1 };
9014 -
9015 -static int mp_current_pci_id;
9016 -
9017 -int pic_mode;
9018 -
9019 -/*
9020 - * Intel MP BIOS table parsing routines:
9021 - */
9022 +static void *_bus_to_virt(unsigned long ma)
9023 +{
9024 + return is_ISA_range(ma, ma) ? isa_bus_to_virt(ma) : bus_to_virt(ma);
9025 +}
9026
9027 /*
9028 * Checksum an MP configuration block.
9029 @@ -68,19 +54,7 @@ static int __init mpf_checksum(unsigned
9030 return sum & 0xFF;
9031 }
9032
9033 -#ifdef CONFIG_X86_NUMAQ
9034 -/*
9035 - * Have to match translation table entries to main table entries by counter
9036 - * hence the mpc_record variable .... can't see a less disgusting way of
9037 - * doing this ....
9038 - */
9039 -
9040 -static int mpc_record;
9041 -static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY]
9042 - __cpuinitdata;
9043 -#endif
9044 -
9045 -static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
9046 +static void __init MP_processor_info(struct mpc_config_processor *m)
9047 {
9048 #ifndef CONFIG_XEN
9049 int apicid;
9050 @@ -90,11 +64,12 @@ static void __cpuinit MP_processor_info(
9051 disabled_cpus++;
9052 return;
9053 }
9054 -#ifdef CONFIG_X86_NUMAQ
9055 - apicid = mpc_apic_id(m, translation_table[mpc_record]);
9056 -#else
9057 - apicid = m->mpc_apicid;
9058 -#endif
9059 +
9060 + if (x86_quirks->mpc_apic_id)
9061 + apicid = x86_quirks->mpc_apic_id(m);
9062 + else
9063 + apicid = m->mpc_apicid;
9064 +
9065 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
9066 bootup_cpu = " (Bootup-CPU)";
9067 boot_cpu_physical_apicid = m->mpc_apicid;
9068 @@ -107,18 +82,17 @@ static void __cpuinit MP_processor_info(
9069 #endif
9070 }
9071
9072 +#ifdef CONFIG_X86_IO_APIC
9073 static void __init MP_bus_info(struct mpc_config_bus *m)
9074 {
9075 char str[7];
9076 -
9077 memcpy(str, m->mpc_bustype, 6);
9078 str[6] = 0;
9079
9080 -#ifdef CONFIG_X86_NUMAQ
9081 - mpc_oem_bus_info(m, str, translation_table[mpc_record]);
9082 -#else
9083 - Dprintk("Bus #%d is %s\n", m->mpc_busid, str);
9084 -#endif
9085 + if (x86_quirks->mpc_oem_bus_info)
9086 + x86_quirks->mpc_oem_bus_info(m, str);
9087 + else
9088 + apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->mpc_busid, str);
9089
9090 #if MAX_MP_BUSSES < 256
9091 if (m->mpc_busid >= MAX_MP_BUSSES) {
9092 @@ -135,12 +109,10 @@ static void __init MP_bus_info(struct mp
9093 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
9094 #endif
9095 } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) {
9096 -#ifdef CONFIG_X86_NUMAQ
9097 - mpc_oem_pci_bus(m, translation_table[mpc_record]);
9098 -#endif
9099 + if (x86_quirks->mpc_oem_pci_bus)
9100 + x86_quirks->mpc_oem_pci_bus(m);
9101 +
9102 clear_bit(m->mpc_busid, mp_bus_not_pci);
9103 - mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
9104 - mp_current_pci_id++;
9105 #if defined(CONFIG_EISA) || defined (CONFIG_MCA)
9106 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
9107 } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) {
9108 @@ -151,6 +123,7 @@ static void __init MP_bus_info(struct mp
9109 } else
9110 printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
9111 }
9112 +#endif
9113
9114 #ifdef CONFIG_X86_IO_APIC
9115
9116 @@ -180,117 +153,111 @@ static void __init MP_ioapic_info(struct
9117 if (bad_ioapic(m->mpc_apicaddr))
9118 return;
9119
9120 - mp_ioapics[nr_ioapics] = *m;
9121 + mp_ioapics[nr_ioapics].mp_apicaddr = m->mpc_apicaddr;
9122 + mp_ioapics[nr_ioapics].mp_apicid = m->mpc_apicid;
9123 + mp_ioapics[nr_ioapics].mp_type = m->mpc_type;
9124 + mp_ioapics[nr_ioapics].mp_apicver = m->mpc_apicver;
9125 + mp_ioapics[nr_ioapics].mp_flags = m->mpc_flags;
9126 nr_ioapics++;
9127 }
9128
9129 -static void __init MP_intsrc_info(struct mpc_config_intsrc *m)
9130 +static void print_MP_intsrc_info(struct mpc_config_intsrc *m)
9131 {
9132 - mp_irqs[mp_irq_entries] = *m;
9133 - Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
9134 + apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
9135 " IRQ %02x, APIC ID %x, APIC INT %02x\n",
9136 m->mpc_irqtype, m->mpc_irqflag & 3,
9137 (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
9138 m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
9139 - if (++mp_irq_entries == MAX_IRQ_SOURCES)
9140 - panic("Max # of irq sources exceeded!!\n");
9141 }
9142
9143 -#endif
9144 -
9145 -static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m)
9146 +static void __init print_mp_irq_info(struct mp_config_intsrc *mp_irq)
9147 {
9148 - Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
9149 - " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
9150 - m->mpc_irqtype, m->mpc_irqflag & 3,
9151 - (m->mpc_irqflag >> 2) & 3, m->mpc_srcbusid,
9152 - m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
9153 + apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
9154 + " IRQ %02x, APIC ID %x, APIC INT %02x\n",
9155 + mp_irq->mp_irqtype, mp_irq->mp_irqflag & 3,
9156 + (mp_irq->mp_irqflag >> 2) & 3, mp_irq->mp_srcbus,
9157 + mp_irq->mp_srcbusirq, mp_irq->mp_dstapic, mp_irq->mp_dstirq);
9158 }
9159
9160 -#ifdef CONFIG_X86_NUMAQ
9161 -static void __init MP_translation_info(struct mpc_config_translation *m)
9162 +static void __init assign_to_mp_irq(struct mpc_config_intsrc *m,
9163 + struct mp_config_intsrc *mp_irq)
9164 {
9165 - printk(KERN_INFO
9166 - "Translation: record %d, type %d, quad %d, global %d, local %d\n",
9167 - mpc_record, m->trans_type, m->trans_quad, m->trans_global,
9168 - m->trans_local);
9169 + mp_irq->mp_dstapic = m->mpc_dstapic;
9170 + mp_irq->mp_type = m->mpc_type;
9171 + mp_irq->mp_irqtype = m->mpc_irqtype;
9172 + mp_irq->mp_irqflag = m->mpc_irqflag;
9173 + mp_irq->mp_srcbus = m->mpc_srcbus;
9174 + mp_irq->mp_srcbusirq = m->mpc_srcbusirq;
9175 + mp_irq->mp_dstirq = m->mpc_dstirq;
9176 +}
9177
9178 - if (mpc_record >= MAX_MPC_ENTRY)
9179 - printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
9180 - else
9181 - translation_table[mpc_record] = m; /* stash this for later */
9182 - if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
9183 - node_set_online(m->trans_quad);
9184 +static void __init assign_to_mpc_intsrc(struct mp_config_intsrc *mp_irq,
9185 + struct mpc_config_intsrc *m)
9186 +{
9187 + m->mpc_dstapic = mp_irq->mp_dstapic;
9188 + m->mpc_type = mp_irq->mp_type;
9189 + m->mpc_irqtype = mp_irq->mp_irqtype;
9190 + m->mpc_irqflag = mp_irq->mp_irqflag;
9191 + m->mpc_srcbus = mp_irq->mp_srcbus;
9192 + m->mpc_srcbusirq = mp_irq->mp_srcbusirq;
9193 + m->mpc_dstirq = mp_irq->mp_dstirq;
9194 }
9195
9196 -/*
9197 - * Read/parse the MPC oem tables
9198 - */
9199 +static int __init mp_irq_mpc_intsrc_cmp(struct mp_config_intsrc *mp_irq,
9200 + struct mpc_config_intsrc *m)
9201 +{
9202 + if (mp_irq->mp_dstapic != m->mpc_dstapic)
9203 + return 1;
9204 + if (mp_irq->mp_type != m->mpc_type)
9205 + return 2;
9206 + if (mp_irq->mp_irqtype != m->mpc_irqtype)
9207 + return 3;
9208 + if (mp_irq->mp_irqflag != m->mpc_irqflag)
9209 + return 4;
9210 + if (mp_irq->mp_srcbus != m->mpc_srcbus)
9211 + return 5;
9212 + if (mp_irq->mp_srcbusirq != m->mpc_srcbusirq)
9213 + return 6;
9214 + if (mp_irq->mp_dstirq != m->mpc_dstirq)
9215 + return 7;
9216
9217 -static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable,
9218 - unsigned short oemsize)
9219 + return 0;
9220 +}
9221 +
9222 +static void __init MP_intsrc_info(struct mpc_config_intsrc *m)
9223 {
9224 - int count = sizeof(*oemtable); /* the header size */
9225 - unsigned char *oemptr = ((unsigned char *)oemtable) + count;
9226 + int i;
9227
9228 - mpc_record = 0;
9229 - printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n",
9230 - oemtable);
9231 - if (memcmp(oemtable->oem_signature, MPC_OEM_SIGNATURE, 4)) {
9232 - printk(KERN_WARNING
9233 - "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
9234 - oemtable->oem_signature[0], oemtable->oem_signature[1],
9235 - oemtable->oem_signature[2], oemtable->oem_signature[3]);
9236 - return;
9237 - }
9238 - if (mpf_checksum((unsigned char *)oemtable, oemtable->oem_length)) {
9239 - printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
9240 - return;
9241 - }
9242 - while (count < oemtable->oem_length) {
9243 - switch (*oemptr) {
9244 - case MP_TRANSLATION:
9245 - {
9246 - struct mpc_config_translation *m =
9247 - (struct mpc_config_translation *)oemptr;
9248 - MP_translation_info(m);
9249 - oemptr += sizeof(*m);
9250 - count += sizeof(*m);
9251 - ++mpc_record;
9252 - break;
9253 - }
9254 - default:
9255 - {
9256 - printk(KERN_WARNING
9257 - "Unrecognised OEM table entry type! - %d\n",
9258 - (int)*oemptr);
9259 - return;
9260 - }
9261 - }
9262 + print_MP_intsrc_info(m);
9263 +
9264 + for (i = 0; i < mp_irq_entries; i++) {
9265 + if (!mp_irq_mpc_intsrc_cmp(&mp_irqs[i], m))
9266 + return;
9267 }
9268 +
9269 + assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]);
9270 + if (++mp_irq_entries == MAX_IRQ_SOURCES)
9271 + panic("Max # of irq sources exceeded!!\n");
9272 }
9273
9274 -static inline void mps_oem_check(struct mp_config_table *mpc, char *oem,
9275 - char *productid)
9276 +#endif
9277 +
9278 +static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m)
9279 {
9280 - if (strncmp(oem, "IBM NUMA", 8))
9281 - printk("Warning! May not be a NUMA-Q system!\n");
9282 - if (mpc->mpc_oemptr)
9283 - smp_read_mpc_oem((struct mp_config_oemtable *)mpc->mpc_oemptr,
9284 - mpc->mpc_oemsize);
9285 + apic_printk(APIC_VERBOSE, "Lint: type %d, pol %d, trig %d, bus %02x,"
9286 + " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
9287 + m->mpc_irqtype, m->mpc_irqflag & 3,
9288 + (m->mpc_irqflag >> 2) & 3, m->mpc_srcbusid,
9289 + m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
9290 }
9291 -#endif /* CONFIG_X86_NUMAQ */
9292
9293 /*
9294 * Read/parse the MPC
9295 */
9296
9297 -static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
9298 +static int __init smp_check_mpc(struct mp_config_table *mpc, char *oem,
9299 + char *str)
9300 {
9301 - char str[16];
9302 - char oem[10];
9303 - int count = sizeof(*mpc);
9304 - unsigned char *mpt = ((unsigned char *)mpc) + count;
9305
9306 if (memcmp(mpc->mpc_signature, MPC_SIGNATURE, 4)) {
9307 printk(KERN_ERR "MPTABLE: bad signature [%c%c%c%c]!\n",
9308 @@ -313,19 +280,41 @@ static int __init smp_read_mpc(struct mp
9309 }
9310 memcpy(oem, mpc->mpc_oem, 8);
9311 oem[8] = 0;
9312 - printk(KERN_INFO "MPTABLE: OEM ID: %s ", oem);
9313 + printk(KERN_INFO "MPTABLE: OEM ID: %s\n", oem);
9314
9315 memcpy(str, mpc->mpc_productid, 12);
9316 str[12] = 0;
9317 - printk("Product ID: %s ", str);
9318
9319 -#ifdef CONFIG_X86_32
9320 - mps_oem_check(mpc, oem, str);
9321 -#endif
9322 - printk(KERN_INFO "MPTABLE: Product ID: %s ", str);
9323 + printk(KERN_INFO "MPTABLE: Product ID: %s\n", str);
9324
9325 printk(KERN_INFO "MPTABLE: APIC at: 0x%X\n", mpc->mpc_lapic);
9326
9327 + return 1;
9328 +}
9329 +
9330 +static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
9331 +{
9332 + char str[16];
9333 + char oem[10];
9334 +
9335 + int count = sizeof(*mpc);
9336 + unsigned char *mpt = ((unsigned char *)mpc) + count;
9337 +
9338 + if (!smp_check_mpc(mpc, oem, str))
9339 + return 0;
9340 +
9341 +#ifdef CONFIG_X86_32
9342 + /*
9343 + * need to make sure summit and es7000's mps_oem_check is safe to be
9344 + * called early via genericarch 's mps_oem_check
9345 + */
9346 + if (early) {
9347 +#ifdef CONFIG_X86_NUMAQ
9348 + numaq_mps_oem_check(mpc, oem, str);
9349 +#endif
9350 + } else
9351 + mps_oem_check(mpc, oem, str);
9352 +#endif
9353 /* save the local APIC address, it might be non-default */
9354 if (!acpi_lapic)
9355 mp_lapic_addr = mpc->mpc_lapic;
9356 @@ -333,12 +322,17 @@ static int __init smp_read_mpc(struct mp
9357 if (early)
9358 return 1;
9359
9360 + if (mpc->mpc_oemptr && x86_quirks->smp_read_mpc_oem) {
9361 + struct mp_config_oemtable *oem_table = (struct mp_config_oemtable *)(unsigned long)mpc->mpc_oemptr;
9362 + x86_quirks->smp_read_mpc_oem(oem_table, mpc->mpc_oemsize);
9363 + }
9364 +
9365 /*
9366 * Now process the configuration blocks.
9367 */
9368 -#ifdef CONFIG_X86_NUMAQ
9369 - mpc_record = 0;
9370 -#endif
9371 + if (x86_quirks->mpc_record)
9372 + *x86_quirks->mpc_record = 0;
9373 +
9374 while (count < mpc->mpc_length) {
9375 switch (*mpt) {
9376 case MP_PROCESSOR:
9377 @@ -356,7 +350,9 @@ static int __init smp_read_mpc(struct mp
9378 {
9379 struct mpc_config_bus *m =
9380 (struct mpc_config_bus *)mpt;
9381 +#ifdef CONFIG_X86_IO_APIC
9382 MP_bus_info(m);
9383 +#endif
9384 mpt += sizeof(*m);
9385 count += sizeof(*m);
9386 break;
9387 @@ -402,10 +398,14 @@ static int __init smp_read_mpc(struct mp
9388 count = mpc->mpc_length;
9389 break;
9390 }
9391 -#ifdef CONFIG_X86_NUMAQ
9392 - ++mpc_record;
9393 -#endif
9394 + if (x86_quirks->mpc_record)
9395 + (*x86_quirks->mpc_record)++;
9396 }
9397 +
9398 +#ifdef CONFIG_X86_GENERICARCH
9399 + generic_bigsmp_probe();
9400 +#endif
9401 +
9402 setup_apic_routing();
9403 if (!num_processors)
9404 printk(KERN_ERR "MPTABLE: no processors registered!\n");
9405 @@ -431,7 +431,7 @@ static void __init construct_default_ioi
9406 intsrc.mpc_type = MP_INTSRC;
9407 intsrc.mpc_irqflag = 0; /* conforming */
9408 intsrc.mpc_srcbus = 0;
9409 - intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
9410 + intsrc.mpc_dstapic = mp_ioapics[0].mp_apicid;
9411
9412 intsrc.mpc_irqtype = mp_INT;
9413
9414 @@ -492,40 +492,11 @@ static void __init construct_default_ioi
9415 MP_intsrc_info(&intsrc);
9416 }
9417
9418 -#endif
9419
9420 -static inline void __init construct_default_ISA_mptable(int mpc_default_type)
9421 +static void __init construct_ioapic_table(int mpc_default_type)
9422 {
9423 - struct mpc_config_processor processor;
9424 - struct mpc_config_bus bus;
9425 -#ifdef CONFIG_X86_IO_APIC
9426 struct mpc_config_ioapic ioapic;
9427 -#endif
9428 - struct mpc_config_lintsrc lintsrc;
9429 - int linttypes[2] = { mp_ExtINT, mp_NMI };
9430 - int i;
9431 -
9432 - /*
9433 - * local APIC has default address
9434 - */
9435 - mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
9436 -
9437 - /*
9438 - * 2 CPUs, numbered 0 & 1.
9439 - */
9440 - processor.mpc_type = MP_PROCESSOR;
9441 - /* Either an integrated APIC or a discrete 82489DX. */
9442 - processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
9443 - processor.mpc_cpuflag = CPU_ENABLED;
9444 - processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
9445 - (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
9446 - processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
9447 - processor.mpc_reserved[0] = 0;
9448 - processor.mpc_reserved[1] = 0;
9449 - for (i = 0; i < 2; i++) {
9450 - processor.mpc_apicid = i;
9451 - MP_processor_info(&processor);
9452 - }
9453 + struct mpc_config_bus bus;
9454
9455 bus.mpc_type = MP_BUS;
9456 bus.mpc_busid = 0;
9457 @@ -554,7 +525,6 @@ static inline void __init construct_defa
9458 MP_bus_info(&bus);
9459 }
9460
9461 -#ifdef CONFIG_X86_IO_APIC
9462 ioapic.mpc_type = MP_IOAPIC;
9463 ioapic.mpc_apicid = 2;
9464 ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
9465 @@ -566,7 +536,42 @@ static inline void __init construct_defa
9466 * We set up most of the low 16 IO-APIC pins according to MPS rules.
9467 */
9468 construct_default_ioirq_mptable(mpc_default_type);
9469 +}
9470 +#else
9471 +static inline void __init construct_ioapic_table(int mpc_default_type) { }
9472 #endif
9473 +
9474 +static inline void __init construct_default_ISA_mptable(int mpc_default_type)
9475 +{
9476 + struct mpc_config_processor processor;
9477 + struct mpc_config_lintsrc lintsrc;
9478 + int linttypes[2] = { mp_ExtINT, mp_NMI };
9479 + int i;
9480 +
9481 + /*
9482 + * local APIC has default address
9483 + */
9484 + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
9485 +
9486 + /*
9487 + * 2 CPUs, numbered 0 & 1.
9488 + */
9489 + processor.mpc_type = MP_PROCESSOR;
9490 + /* Either an integrated APIC or a discrete 82489DX. */
9491 + processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
9492 + processor.mpc_cpuflag = CPU_ENABLED;
9493 + processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
9494 + (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
9495 + processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
9496 + processor.mpc_reserved[0] = 0;
9497 + processor.mpc_reserved[1] = 0;
9498 + for (i = 0; i < 2; i++) {
9499 + processor.mpc_apicid = i;
9500 + MP_processor_info(&processor);
9501 + }
9502 +
9503 + construct_ioapic_table(mpc_default_type);
9504 +
9505 lintsrc.mpc_type = MP_LINTSRC;
9506 lintsrc.mpc_irqflag = 0; /* conforming */
9507 lintsrc.mpc_srcbusid = 0;
9508 @@ -584,10 +589,14 @@ static struct intel_mp_floating *mpf_fou
9509 /*
9510 * Scan the memory blocks for an SMP configuration block.
9511 */
9512 -static void __init __get_smp_config(unsigned early)
9513 +static void __init __get_smp_config(unsigned int early)
9514 {
9515 struct intel_mp_floating *mpf = mpf_found;
9516
9517 + if (x86_quirks->mach_get_smp_config) {
9518 + if (x86_quirks->mach_get_smp_config(early))
9519 + return;
9520 + }
9521 if (acpi_lapic && early)
9522 return;
9523 /*
9524 @@ -604,7 +613,7 @@ static void __init __get_smp_config(unsi
9525
9526 printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n",
9527 mpf->mpf_specification);
9528 -#ifdef CONFIG_X86_32
9529 +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
9530 if (mpf->mpf_feature2 & (1 << 7)) {
9531 printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
9532 pic_mode = 1;
9533 @@ -635,8 +644,10 @@ static void __init __get_smp_config(unsi
9534 * Read the physical hardware table. Anything here will
9535 * override the defaults.
9536 */
9537 - if (!smp_read_mpc(isa_bus_to_virt(mpf->mpf_physptr), early)) {
9538 + if (!smp_read_mpc(_bus_to_virt(mpf->mpf_physptr), early)) {
9539 +#ifdef CONFIG_X86_LOCAL_APIC
9540 smp_found_config = 0;
9541 +#endif
9542 printk(KERN_ERR
9543 "BIOS bug, MP table errors detected!...\n");
9544 printk(KERN_ERR "... disabling SMP support. "
9545 @@ -690,10 +701,11 @@ void __init get_smp_config(void)
9546 static int __init smp_scan_config(unsigned long base, unsigned long length,
9547 unsigned reserve)
9548 {
9549 - unsigned int *bp = isa_bus_to_virt(base);
9550 + unsigned int *bp = _bus_to_virt(base);
9551 struct intel_mp_floating *mpf;
9552
9553 - Dprintk("Scan SMP from %p for %ld bytes.\n", bp, length);
9554 + apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n",
9555 + bp, length);
9556 BUILD_BUG_ON(sizeof(*mpf) != 16);
9557
9558 while (length > 0) {
9559 @@ -703,16 +715,22 @@ static int __init smp_scan_config(unsign
9560 !mpf_checksum((unsigned char *)bp, 16) &&
9561 ((mpf->mpf_specification == 1)
9562 || (mpf->mpf_specification == 4))) {
9563 -
9564 +#ifdef CONFIG_X86_LOCAL_APIC
9565 smp_found_config = 1;
9566 +#endif
9567 mpf_found = mpf;
9568 -#ifdef CONFIG_X86_32
9569 +
9570 #ifndef CONFIG_XEN
9571 printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
9572 mpf, virt_to_phys(mpf));
9573 - reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE,
9574 +
9575 + if (!reserve)
9576 + return 1;
9577 + reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE,
9578 BOOTMEM_DEFAULT);
9579 if (mpf->mpf_physptr) {
9580 + unsigned long size = PAGE_SIZE;
9581 +#ifdef CONFIG_X86_32
9582 /*
9583 * We cannot access to MPC table to compute
9584 * table size yet, as only few megabytes from
9585 @@ -722,27 +740,18 @@ static int __init smp_scan_config(unsign
9586 * PAGE_SIZE from mpg->mpf_physptr yields BUG()
9587 * in reserve_bootmem.
9588 */
9589 - unsigned long size = PAGE_SIZE;
9590 unsigned long end = max_low_pfn * PAGE_SIZE;
9591 if (mpf->mpf_physptr + size > end)
9592 size = end - mpf->mpf_physptr;
9593 - reserve_bootmem(mpf->mpf_physptr, size,
9594 +#endif
9595 + reserve_bootmem_generic(mpf->mpf_physptr, size,
9596 BOOTMEM_DEFAULT);
9597 }
9598 #else
9599 printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
9600 - mpf, ((void *)bp - isa_bus_to_virt(base)) + base);
9601 -#endif
9602 -#elif !defined(CONFIG_XEN)
9603 - if (!reserve)
9604 - return 1;
9605 -
9606 - reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE);
9607 - if (mpf->mpf_physptr)
9608 - reserve_bootmem_generic(mpf->mpf_physptr,
9609 - PAGE_SIZE);
9610 + mpf, ((void *)bp - _bus_to_virt(base)) + base);
9611 #endif
9612 - return 1;
9613 + return 1;
9614 }
9615 bp += 4;
9616 length -= 16;
9617 @@ -750,12 +759,16 @@ static int __init smp_scan_config(unsign
9618 return 0;
9619 }
9620
9621 -static void __init __find_smp_config(unsigned reserve)
9622 +static void __init __find_smp_config(unsigned int reserve)
9623 {
9624 #ifndef CONFIG_XEN
9625 unsigned int address;
9626 #endif
9627
9628 + if (x86_quirks->mach_find_smp_config) {
9629 + if (x86_quirks->mach_find_smp_config(reserve))
9630 + return;
9631 + }
9632 /*
9633 * FIXME: Linux assumes you have 640K of base ram..
9634 * this continues the error...
9635 @@ -802,300 +815,297 @@ void __init find_smp_config(void)
9636 __find_smp_config(1);
9637 }
9638
9639 -/* --------------------------------------------------------------------------
9640 - ACPI-based MP Configuration
9641 - -------------------------------------------------------------------------- */
9642 -
9643 -/*
9644 - * Keep this outside and initialized to 0, for !CONFIG_ACPI builds:
9645 - */
9646 -int es7000_plat;
9647 -
9648 -#ifdef CONFIG_ACPI
9649 +#ifdef CONFIG_X86_IO_APIC
9650 +static u8 __initdata irq_used[MAX_IRQ_SOURCES];
9651
9652 -#ifdef CONFIG_X86_IO_APIC
9653 +static int __init get_MP_intsrc_index(struct mpc_config_intsrc *m)
9654 +{
9655 + int i;
9656
9657 -#define MP_ISA_BUS 0
9658 + if (m->mpc_irqtype != mp_INT)
9659 + return 0;
9660
9661 -extern struct mp_ioapic_routing mp_ioapic_routing[MAX_IO_APICS];
9662 + if (m->mpc_irqflag != 0x0f)
9663 + return 0;
9664
9665 -static int mp_find_ioapic(int gsi)
9666 -{
9667 - int i = 0;
9668 + /* not legacy */
9669
9670 - /* Find the IOAPIC that manages this GSI. */
9671 - for (i = 0; i < nr_ioapics; i++) {
9672 - if ((gsi >= mp_ioapic_routing[i].gsi_base)
9673 - && (gsi <= mp_ioapic_routing[i].gsi_end))
9674 - return i;
9675 + for (i = 0; i < mp_irq_entries; i++) {
9676 + if (mp_irqs[i].mp_irqtype != mp_INT)
9677 + continue;
9678 +
9679 + if (mp_irqs[i].mp_irqflag != 0x0f)
9680 + continue;
9681 +
9682 + if (mp_irqs[i].mp_srcbus != m->mpc_srcbus)
9683 + continue;
9684 + if (mp_irqs[i].mp_srcbusirq != m->mpc_srcbusirq)
9685 + continue;
9686 + if (irq_used[i]) {
9687 + /* already claimed */
9688 + return -2;
9689 + }
9690 + irq_used[i] = 1;
9691 + return i;
9692 }
9693
9694 - printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
9695 + /* not found */
9696 return -1;
9697 }
9698
9699 -static u8 __init uniq_ioapic_id(u8 id)
9700 -{
9701 -#ifdef CONFIG_X86_32
9702 - if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
9703 - !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
9704 - return io_apic_get_unique_id(nr_ioapics, id);
9705 - else
9706 - return id;
9707 -#else
9708 - int i;
9709 - DECLARE_BITMAP(used, 256);
9710 - bitmap_zero(used, 256);
9711 - for (i = 0; i < nr_ioapics; i++) {
9712 - struct mpc_config_ioapic *ia = &mp_ioapics[i];
9713 - __set_bit(ia->mpc_apicid, used);
9714 - }
9715 - if (!test_bit(id, used))
9716 - return id;
9717 - return find_first_zero_bit(used, 256);
9718 +#define SPARE_SLOT_NUM 20
9719 +
9720 +static struct mpc_config_intsrc __initdata *m_spare[SPARE_SLOT_NUM];
9721 #endif
9722 -}
9723
9724 -void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
9725 +static int __init replace_intsrc_all(struct mp_config_table *mpc,
9726 + unsigned long mpc_new_phys,
9727 + unsigned long mpc_new_length)
9728 {
9729 - int idx = 0;
9730 -
9731 - if (bad_ioapic(address))
9732 - return;
9733 +#ifdef CONFIG_X86_IO_APIC
9734 + int i;
9735 + int nr_m_spare = 0;
9736 +#endif
9737
9738 - idx = nr_ioapics;
9739 + int count = sizeof(*mpc);
9740 + unsigned char *mpt = ((unsigned char *)mpc) + count;
9741
9742 - mp_ioapics[idx].mpc_type = MP_IOAPIC;
9743 - mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
9744 - mp_ioapics[idx].mpc_apicaddr = address;
9745 + printk(KERN_INFO "mpc_length %x\n", mpc->mpc_length);
9746 + while (count < mpc->mpc_length) {
9747 + switch (*mpt) {
9748 + case MP_PROCESSOR:
9749 + {
9750 + struct mpc_config_processor *m =
9751 + (struct mpc_config_processor *)mpt;
9752 + mpt += sizeof(*m);
9753 + count += sizeof(*m);
9754 + break;
9755 + }
9756 + case MP_BUS:
9757 + {
9758 + struct mpc_config_bus *m =
9759 + (struct mpc_config_bus *)mpt;
9760 + mpt += sizeof(*m);
9761 + count += sizeof(*m);
9762 + break;
9763 + }
9764 + case MP_IOAPIC:
9765 + {
9766 + mpt += sizeof(struct mpc_config_ioapic);
9767 + count += sizeof(struct mpc_config_ioapic);
9768 + break;
9769 + }
9770 + case MP_INTSRC:
9771 + {
9772 +#ifdef CONFIG_X86_IO_APIC
9773 + struct mpc_config_intsrc *m =
9774 + (struct mpc_config_intsrc *)mpt;
9775
9776 -#ifndef CONFIG_XEN
9777 - set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
9778 + apic_printk(APIC_VERBOSE, "OLD ");
9779 + print_MP_intsrc_info(m);
9780 + i = get_MP_intsrc_index(m);
9781 + if (i > 0) {
9782 + assign_to_mpc_intsrc(&mp_irqs[i], m);
9783 + apic_printk(APIC_VERBOSE, "NEW ");
9784 + print_mp_irq_info(&mp_irqs[i]);
9785 + } else if (!i) {
9786 + /* legacy, do nothing */
9787 + } else if (nr_m_spare < SPARE_SLOT_NUM) {
9788 + /*
9789 + * not found (-1), or duplicated (-2)
9790 + * are invalid entries,
9791 + * we need to use the slot later
9792 + */
9793 + m_spare[nr_m_spare] = m;
9794 + nr_m_spare++;
9795 + }
9796 #endif
9797 - mp_ioapics[idx].mpc_apicid = uniq_ioapic_id(id);
9798 -#ifdef CONFIG_X86_32
9799 - mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
9800 -#else
9801 - mp_ioapics[idx].mpc_apicver = 0;
9802 + mpt += sizeof(struct mpc_config_intsrc);
9803 + count += sizeof(struct mpc_config_intsrc);
9804 + break;
9805 + }
9806 + case MP_LINTSRC:
9807 + {
9808 + struct mpc_config_lintsrc *m =
9809 + (struct mpc_config_lintsrc *)mpt;
9810 + mpt += sizeof(*m);
9811 + count += sizeof(*m);
9812 + break;
9813 + }
9814 + default:
9815 + /* wrong mptable */
9816 + printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n");
9817 + printk(KERN_ERR "type %x\n", *mpt);
9818 + print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16,
9819 + 1, mpc, mpc->mpc_length, 1);
9820 + goto out;
9821 + }
9822 + }
9823 +
9824 +#ifdef CONFIG_X86_IO_APIC
9825 + for (i = 0; i < mp_irq_entries; i++) {
9826 + if (irq_used[i])
9827 + continue;
9828 +
9829 + if (mp_irqs[i].mp_irqtype != mp_INT)
9830 + continue;
9831 +
9832 + if (mp_irqs[i].mp_irqflag != 0x0f)
9833 + continue;
9834 +
9835 + if (nr_m_spare > 0) {
9836 + apic_printk(APIC_VERBOSE, "*NEW* found\n");
9837 + nr_m_spare--;
9838 + assign_to_mpc_intsrc(&mp_irqs[i], m_spare[nr_m_spare]);
9839 + m_spare[nr_m_spare] = NULL;
9840 + } else {
9841 + struct mpc_config_intsrc *m =
9842 + (struct mpc_config_intsrc *)mpt;
9843 + count += sizeof(struct mpc_config_intsrc);
9844 + if (!mpc_new_phys) {
9845 + printk(KERN_INFO "No spare slots, try to append...take your risk, new mpc_length %x\n", count);
9846 + } else {
9847 + if (count <= mpc_new_length)
9848 + printk(KERN_INFO "No spare slots, try to append..., new mpc_length %x\n", count);
9849 + else {
9850 + printk(KERN_ERR "mpc_new_length %lx is too small\n", mpc_new_length);
9851 + goto out;
9852 + }
9853 + }
9854 + assign_to_mpc_intsrc(&mp_irqs[i], m);
9855 + mpc->mpc_length = count;
9856 + mpt += sizeof(struct mpc_config_intsrc);
9857 + }
9858 + print_mp_irq_info(&mp_irqs[i]);
9859 + }
9860 #endif
9861 - /*
9862 - * Build basic GSI lookup table to facilitate gsi->io_apic lookups
9863 - * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
9864 - */
9865 - mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
9866 - mp_ioapic_routing[idx].gsi_base = gsi_base;
9867 - mp_ioapic_routing[idx].gsi_end = gsi_base +
9868 - io_apic_get_redir_entries(idx);
9869 -
9870 - printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
9871 - "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
9872 - mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
9873 - mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end);
9874 +out:
9875 + /* update checksum */
9876 + mpc->mpc_checksum = 0;
9877 + mpc->mpc_checksum -= mpf_checksum((unsigned char *)mpc,
9878 + mpc->mpc_length);
9879
9880 - nr_ioapics++;
9881 + return 0;
9882 }
9883
9884 -void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
9885 -{
9886 - struct mpc_config_intsrc intsrc;
9887 - int ioapic = -1;
9888 - int pin = -1;
9889 -
9890 - /*
9891 - * Convert 'gsi' to 'ioapic.pin'.
9892 - */
9893 - ioapic = mp_find_ioapic(gsi);
9894 - if (ioapic < 0)
9895 - return;
9896 - pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
9897 +static int __initdata enable_update_mptable;
9898
9899 - /*
9900 - * TBD: This check is for faulty timer entries, where the override
9901 - * erroneously sets the trigger to level, resulting in a HUGE
9902 - * increase of timer interrupts!
9903 - */
9904 - if ((bus_irq == 0) && (trigger == 3))
9905 - trigger = 1;
9906 +static int __init update_mptable_setup(char *str)
9907 +{
9908 + enable_update_mptable = 1;
9909 + return 0;
9910 +}
9911 +early_param("update_mptable", update_mptable_setup);
9912
9913 - intsrc.mpc_type = MP_INTSRC;
9914 - intsrc.mpc_irqtype = mp_INT;
9915 - intsrc.mpc_irqflag = (trigger << 2) | polarity;
9916 - intsrc.mpc_srcbus = MP_ISA_BUS;
9917 - intsrc.mpc_srcbusirq = bus_irq; /* IRQ */
9918 - intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */
9919 - intsrc.mpc_dstirq = pin; /* INTIN# */
9920 +static unsigned long __initdata mpc_new_phys;
9921 +static unsigned long mpc_new_length __initdata = 4096;
9922
9923 - MP_intsrc_info(&intsrc);
9924 +/* alloc_mptable or alloc_mptable=4k */
9925 +static int __initdata alloc_mptable;
9926 +static int __init parse_alloc_mptable_opt(char *p)
9927 +{
9928 + enable_update_mptable = 1;
9929 + alloc_mptable = 1;
9930 + if (!p)
9931 + return 0;
9932 + mpc_new_length = memparse(p, &p);
9933 + return 0;
9934 }
9935 +early_param("alloc_mptable", parse_alloc_mptable_opt);
9936
9937 -void __init mp_config_acpi_legacy_irqs(void)
9938 +void __init early_reserve_e820_mpc_new(void)
9939 {
9940 - struct mpc_config_intsrc intsrc;
9941 - int i = 0;
9942 - int ioapic = -1;
9943 -
9944 -#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
9945 - /*
9946 - * Fabricate the legacy ISA bus (bus #31).
9947 - */
9948 - mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
9949 + if (enable_update_mptable && alloc_mptable) {
9950 + u64 startt = 0;
9951 +#ifdef CONFIG_X86_TRAMPOLINE
9952 + startt = TRAMPOLINE_BASE;
9953 #endif
9954 - set_bit(MP_ISA_BUS, mp_bus_not_pci);
9955 - Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
9956 -
9957 - /*
9958 - * Older generations of ES7000 have no legacy identity mappings
9959 - */
9960 - if (es7000_plat == 1)
9961 - return;
9962 -
9963 - /*
9964 - * Locate the IOAPIC that manages the ISA IRQs (0-15).
9965 - */
9966 - ioapic = mp_find_ioapic(0);
9967 - if (ioapic < 0)
9968 - return;
9969 -
9970 - intsrc.mpc_type = MP_INTSRC;
9971 - intsrc.mpc_irqflag = 0; /* Conforming */
9972 - intsrc.mpc_srcbus = MP_ISA_BUS;
9973 -#ifdef CONFIG_X86_IO_APIC
9974 - intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
9975 -#endif
9976 - /*
9977 - * Use the default configuration for the IRQs 0-15. Unless
9978 - * overridden by (MADT) interrupt source override entries.
9979 - */
9980 - for (i = 0; i < 16; i++) {
9981 - int idx;
9982 -
9983 - for (idx = 0; idx < mp_irq_entries; idx++) {
9984 - struct mpc_config_intsrc *irq = mp_irqs + idx;
9985 -
9986 - /* Do we already have a mapping for this ISA IRQ? */
9987 - if (irq->mpc_srcbus == MP_ISA_BUS
9988 - && irq->mpc_srcbusirq == i)
9989 - break;
9990 -
9991 - /* Do we already have a mapping for this IOAPIC pin */
9992 - if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
9993 - (irq->mpc_dstirq == i))
9994 - break;
9995 - }
9996 -
9997 - if (idx != mp_irq_entries) {
9998 - printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
9999 - continue; /* IRQ already used */
10000 - }
10001 -
10002 - intsrc.mpc_irqtype = mp_INT;
10003 - intsrc.mpc_srcbusirq = i; /* Identity mapped */
10004 - intsrc.mpc_dstirq = i;
10005 -
10006 - MP_intsrc_info(&intsrc);
10007 + mpc_new_phys = early_reserve_e820(startt, mpc_new_length, 4);
10008 }
10009 }
10010
10011 -int mp_register_gsi(u32 gsi, int triggering, int polarity)
10012 +static int __init update_mp_table(void)
10013 {
10014 - int ioapic;
10015 - int ioapic_pin;
10016 -#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
10017 -#define MAX_GSI_NUM 4096
10018 -#define IRQ_COMPRESSION_START 64
10019 + char str[16];
10020 + char oem[10];
10021 + struct intel_mp_floating *mpf;
10022 + struct mp_config_table *mpc;
10023 + struct mp_config_table *mpc_new;
10024 +
10025 + if (!enable_update_mptable)
10026 + return 0;
10027 +
10028 + mpf = mpf_found;
10029 + if (!mpf)
10030 + return 0;
10031
10032 - static int pci_irq = IRQ_COMPRESSION_START;
10033 /*
10034 - * Mapping between Global System Interrupts, which
10035 - * represent all possible interrupts, and IRQs
10036 - * assigned to actual devices.
10037 + * Now see if we need to go further.
10038 */
10039 - static int gsi_to_irq[MAX_GSI_NUM];
10040 -#else
10041 + if (mpf->mpf_feature1 != 0)
10042 + return 0;
10043
10044 - if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
10045 - return gsi;
10046 -#endif
10047 + if (!mpf->mpf_physptr)
10048 + return 0;
10049
10050 - /* Don't set up the ACPI SCI because it's already set up */
10051 - if (acpi_gbl_FADT.sci_interrupt == gsi)
10052 - return gsi;
10053 + mpc = _bus_to_virt(mpf->mpf_physptr);
10054
10055 - ioapic = mp_find_ioapic(gsi);
10056 - if (ioapic < 0) {
10057 - printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
10058 - return gsi;
10059 - }
10060 + if (!smp_check_mpc(mpc, oem, str))
10061 + return 0;
10062
10063 - ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
10064 + printk(KERN_INFO "mpf: %lx\n", (long)arbitrary_virt_to_machine(mpf));
10065 + printk(KERN_INFO "mpf_physptr: %x\n", mpf->mpf_physptr);
10066
10067 -#ifndef CONFIG_X86_32
10068 - if (ioapic_renumber_irq)
10069 - gsi = ioapic_renumber_irq(ioapic, gsi);
10070 -#endif
10071 + if (mpc_new_phys && mpc->mpc_length > mpc_new_length) {
10072 + mpc_new_phys = 0;
10073 + printk(KERN_INFO "mpc_new_length is %ld, please use alloc_mptable=8k\n",
10074 + mpc_new_length);
10075 + }
10076 +
10077 + if (!mpc_new_phys) {
10078 + unsigned char old, new;
10079 + /* check if we can change the postion */
10080 + mpc->mpc_checksum = 0;
10081 + old = mpf_checksum((unsigned char *)mpc, mpc->mpc_length);
10082 + mpc->mpc_checksum = 0xff;
10083 + new = mpf_checksum((unsigned char *)mpc, mpc->mpc_length);
10084 + if (old == new) {
10085 + printk(KERN_INFO "mpc is readonly, please try alloc_mptable instead\n");
10086 + return 0;
10087 + }
10088 + printk(KERN_INFO "use in-positon replacing\n");
10089 + } else {
10090 + maddr_t mpc_new_bus;
10091
10092 - /*
10093 - * Avoid pin reprogramming. PRTs typically include entries
10094 - * with redundant pin->gsi mappings (but unique PCI devices);
10095 - * we only program the IOAPIC on the first.
10096 - */
10097 - if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
10098 - printk(KERN_ERR "Invalid reference to IOAPIC pin "
10099 - "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
10100 - ioapic_pin);
10101 - return gsi;
10102 - }
10103 - if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) {
10104 - Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
10105 - mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
10106 -#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
10107 - return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
10108 -#else
10109 - return gsi;
10110 -#endif
10111 + mpc_new_bus = phys_to_machine(mpc_new_phys);
10112 + mpf->mpf_physptr = mpc_new_bus;
10113 + mpc_new = phys_to_virt(mpc_new_phys);
10114 + memcpy(mpc_new, mpc, mpc->mpc_length);
10115 + mpc = mpc_new;
10116 + /* check if we can modify that */
10117 + if (mpc_new_bus - mpf->mpf_physptr) {
10118 + struct intel_mp_floating *mpf_new;
10119 + /* steal 16 bytes from [0, 1k) */
10120 + printk(KERN_INFO "mpf new: %x\n", 0x400 - 16);
10121 + mpf_new = isa_bus_to_virt(0x400 - 16);
10122 + memcpy(mpf_new, mpf, 16);
10123 + mpf = mpf_new;
10124 + mpf->mpf_physptr = mpc_new_bus;
10125 + }
10126 + mpf->mpf_checksum = 0;
10127 + mpf->mpf_checksum -= mpf_checksum((unsigned char *)mpf, 16);
10128 + printk(KERN_INFO "mpf_physptr new: %x\n", mpf->mpf_physptr);
10129 }
10130
10131 - set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed);
10132 -#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
10133 /*
10134 - * For GSI >= 64, use IRQ compression
10135 + * only replace the one with mp_INT and
10136 + * MP_IRQ_TRIGGER_LEVEL|MP_IRQ_POLARITY_LOW,
10137 + * already in mp_irqs , stored by ... and mp_config_acpi_gsi,
10138 + * may need pci=routeirq for all coverage
10139 */
10140 - if ((gsi >= IRQ_COMPRESSION_START)
10141 - && (triggering == ACPI_LEVEL_SENSITIVE)) {
10142 - /*
10143 - * For PCI devices assign IRQs in order, avoiding gaps
10144 - * due to unused I/O APIC pins.
10145 - */
10146 - int irq = gsi;
10147 - if (gsi < MAX_GSI_NUM) {
10148 - /*
10149 - * Retain the VIA chipset work-around (gsi > 15), but
10150 - * avoid a problem where the 8254 timer (IRQ0) is setup
10151 - * via an override (so it's not on pin 0 of the ioapic),
10152 - * and at the same time, the pin 0 interrupt is a PCI
10153 - * type. The gsi > 15 test could cause these two pins
10154 - * to be shared as IRQ0, and they are not shareable.
10155 - * So test for this condition, and if necessary, avoid
10156 - * the pin collision.
10157 - */
10158 - gsi = pci_irq++;
10159 - /*
10160 - * Don't assign IRQ used by ACPI SCI
10161 - */
10162 - if (gsi == acpi_gbl_FADT.sci_interrupt)
10163 - gsi = pci_irq++;
10164 - gsi_to_irq[irq] = gsi;
10165 - } else {
10166 - printk(KERN_ERR "GSI %u is too high\n", gsi);
10167 - return gsi;
10168 - }
10169 - }
10170 -#endif
10171 - io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
10172 - triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
10173 - polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
10174 - return gsi;
10175 + replace_intsrc_all(mpc, mpc_new_phys, mpc_new_length);
10176 +
10177 + return 0;
10178 }
10179
10180 -#endif /* CONFIG_X86_IO_APIC */
10181 -#endif /* CONFIG_ACPI */
10182 +late_initcall(update_mp_table);
10183 --- sle11-2009-06-04.orig/arch/x86/kernel/nmi.c 2009-06-04 11:08:07.000000000 +0200
10184 +++ sle11-2009-06-04/arch/x86/kernel/nmi.c 2009-06-04 10:21:39.000000000 +0200
10185 @@ -27,7 +27,9 @@
10186 #include <linux/kdebug.h>
10187 #include <linux/smp.h>
10188
10189 +#ifndef CONFIG_XEN
10190 #include <asm/i8259.h>
10191 +#endif
10192 #include <asm/io_apic.h>
10193 #include <asm/smp.h>
10194 #include <asm/nmi.h>
10195 @@ -179,8 +181,10 @@ int __init check_nmi_watchdog(void)
10196 kfree(prev_nmi_count);
10197 return 0;
10198 error:
10199 +#ifndef CONFIG_XEN
10200 if (nmi_watchdog == NMI_IO_APIC && !timer_through_8259)
10201 disable_8259A_irq(0);
10202 +#endif
10203 #ifdef CONFIG_X86_32
10204 timer_ack = 0;
10205 #endif
10206 --- sle11-2009-06-04.orig/arch/x86/kernel/pci-dma-xen.c 2009-03-16 16:38:05.000000000 +0100
10207 +++ sle11-2009-06-04/arch/x86/kernel/pci-dma-xen.c 2009-06-04 10:21:39.000000000 +0200
10208 @@ -5,13 +5,13 @@
10209
10210 #include <asm/proto.h>
10211 #include <asm/dma.h>
10212 -#include <asm/gart.h>
10213 +#include <asm/iommu.h>
10214 #include <asm/calgary.h>
10215 +#include <asm/amd_iommu.h>
10216
10217 -int forbid_dac __read_mostly;
10218 -EXPORT_SYMBOL(forbid_dac);
10219 +static int forbid_dac __read_mostly;
10220
10221 -const struct dma_mapping_ops *dma_ops;
10222 +struct dma_mapping_ops *dma_ops;
10223 EXPORT_SYMBOL(dma_ops);
10224
10225 static int iommu_sac_force __read_mostly;
10226 @@ -74,13 +74,17 @@ early_param("dma32_size", parse_dma32_si
10227 void __init dma32_reserve_bootmem(void)
10228 {
10229 unsigned long size, align;
10230 - if (end_pfn <= MAX_DMA32_PFN)
10231 + if (max_pfn <= MAX_DMA32_PFN)
10232 return;
10233
10234 + /*
10235 + * check aperture_64.c allocate_aperture() for reason about
10236 + * using 512M as goal
10237 + */
10238 align = 64ULL<<20;
10239 size = round_up(dma32_bootmem_size, align);
10240 dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align,
10241 - __pa(MAX_DMA_ADDRESS));
10242 + 512ULL<<20);
10243 if (dma32_bootmem_ptr)
10244 dma32_bootmem_size = size;
10245 else
10246 @@ -88,17 +92,14 @@ void __init dma32_reserve_bootmem(void)
10247 }
10248 static void __init dma32_free_bootmem(void)
10249 {
10250 - int node;
10251
10252 - if (end_pfn <= MAX_DMA32_PFN)
10253 + if (max_pfn <= MAX_DMA32_PFN)
10254 return;
10255
10256 if (!dma32_bootmem_ptr)
10257 return;
10258
10259 - for_each_online_node(node)
10260 - free_bootmem_node(NODE_DATA(node), __pa(dma32_bootmem_ptr),
10261 - dma32_bootmem_size);
10262 + free_bootmem(__pa(dma32_bootmem_ptr), dma32_bootmem_size);
10263
10264 dma32_bootmem_ptr = NULL;
10265 dma32_bootmem_size = 0;
10266 @@ -107,7 +108,7 @@ static void __init dma32_free_bootmem(vo
10267 #define dma32_free_bootmem() ((void)0)
10268 #endif
10269
10270 -static const struct dma_mapping_ops swiotlb_dma_ops = {
10271 +static struct dma_mapping_ops swiotlb_dma_ops = {
10272 .mapping_error = swiotlb_dma_mapping_error,
10273 .map_single = swiotlb_map_single_phys,
10274 .unmap_single = swiotlb_unmap_single,
10275 @@ -130,25 +131,31 @@ void __init pci_iommu_alloc(void)
10276 * The order of these functions is important for
10277 * fall-back/fail-over reasons
10278 */
10279 -#ifdef CONFIG_GART_IOMMU
10280 gart_iommu_hole_init();
10281 -#endif
10282
10283 -#ifdef CONFIG_CALGARY_IOMMU
10284 detect_calgary();
10285 -#endif
10286
10287 detect_intel_iommu();
10288
10289 -#ifdef CONFIG_SWIOTLB
10290 + amd_iommu_detect();
10291 +
10292 swiotlb_init();
10293 if (swiotlb) {
10294 printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n");
10295 dma_ops = &swiotlb_dma_ops;
10296 }
10297 -#endif
10298 }
10299
10300 +#ifndef CONFIG_XEN
10301 +unsigned long iommu_num_pages(unsigned long addr, unsigned long len)
10302 +{
10303 + unsigned long size = roundup((addr & ~PAGE_MASK) + len, PAGE_SIZE);
10304 +
10305 + return size >> PAGE_SHIFT;
10306 +}
10307 +EXPORT_SYMBOL(iommu_num_pages);
10308 +#endif
10309 +
10310 /*
10311 * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter
10312 * documentation.
10313 @@ -201,9 +208,7 @@ static __init int iommu_setup(char *p)
10314 swiotlb = 1;
10315 #endif
10316
10317 -#ifdef CONFIG_GART_IOMMU
10318 gart_parse_options(p);
10319 -#endif
10320
10321 #ifdef CONFIG_CALGARY_IOMMU
10322 if (!strncmp(p, "calgary", 7))
10323 @@ -245,136 +250,19 @@ int range_straddles_page_boundary(paddr_
10324 !check_pages_physically_contiguous(pfn, offset, size));
10325 }
10326
10327 -#ifdef CONFIG_X86_32
10328 -int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
10329 - dma_addr_t device_addr, size_t size, int flags)
10330 -{
10331 - void __iomem *mem_base = NULL;
10332 - int pages = size >> PAGE_SHIFT;
10333 - int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long);
10334 -
10335 - if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0)
10336 - goto out;
10337 - if (!size)
10338 - goto out;
10339 - if (dev->dma_mem)
10340 - goto out;
10341 -
10342 - /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */
10343 -
10344 - mem_base = ioremap(bus_addr, size);
10345 - if (!mem_base)
10346 - goto out;
10347 -
10348 - dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
10349 - if (!dev->dma_mem)
10350 - goto out;
10351 - dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
10352 - if (!dev->dma_mem->bitmap)
10353 - goto free1_out;
10354 -
10355 - dev->dma_mem->virt_base = mem_base;
10356 - dev->dma_mem->device_base = device_addr;
10357 - dev->dma_mem->size = pages;
10358 - dev->dma_mem->flags = flags;
10359 -
10360 - if (flags & DMA_MEMORY_MAP)
10361 - return DMA_MEMORY_MAP;
10362 -
10363 - return DMA_MEMORY_IO;
10364 -
10365 - free1_out:
10366 - kfree(dev->dma_mem);
10367 - out:
10368 - if (mem_base)
10369 - iounmap(mem_base);
10370 - return 0;
10371 -}
10372 -EXPORT_SYMBOL(dma_declare_coherent_memory);
10373 -
10374 -void dma_release_declared_memory(struct device *dev)
10375 -{
10376 - struct dma_coherent_mem *mem = dev->dma_mem;
10377 -
10378 - if (!mem)
10379 - return;
10380 - dev->dma_mem = NULL;
10381 - iounmap(mem->virt_base);
10382 - kfree(mem->bitmap);
10383 - kfree(mem);
10384 -}
10385 -EXPORT_SYMBOL(dma_release_declared_memory);
10386 -
10387 -void *dma_mark_declared_memory_occupied(struct device *dev,
10388 - dma_addr_t device_addr, size_t size)
10389 -{
10390 - struct dma_coherent_mem *mem = dev->dma_mem;
10391 - int pos, err;
10392 - int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1);
10393 -
10394 - pages >>= PAGE_SHIFT;
10395 -
10396 - if (!mem)
10397 - return ERR_PTR(-EINVAL);
10398 -
10399 - pos = (device_addr - mem->device_base) >> PAGE_SHIFT;
10400 - err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages));
10401 - if (err != 0)
10402 - return ERR_PTR(err);
10403 - return mem->virt_base + (pos << PAGE_SHIFT);
10404 -}
10405 -EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
10406 -
10407 -static int dma_alloc_from_coherent_mem(struct device *dev, ssize_t size,
10408 - dma_addr_t *dma_handle, void **ret)
10409 -{
10410 - struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
10411 - int order = get_order(size);
10412 -
10413 - if (mem) {
10414 - int page = bitmap_find_free_region(mem->bitmap, mem->size,
10415 - order);
10416 - if (page >= 0) {
10417 - *dma_handle = mem->device_base + (page << PAGE_SHIFT);
10418 - *ret = mem->virt_base + (page << PAGE_SHIFT);
10419 - memset(*ret, 0, size);
10420 - }
10421 - if (mem->flags & DMA_MEMORY_EXCLUSIVE)
10422 - *ret = NULL;
10423 - }
10424 - return (mem != NULL);
10425 -}
10426 -
10427 -static int dma_release_coherent(struct device *dev, int order, void *vaddr)
10428 -{
10429 - struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
10430 -
10431 - if (mem && vaddr >= mem->virt_base && vaddr <
10432 - (mem->virt_base + (mem->size << PAGE_SHIFT))) {
10433 - int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
10434 -
10435 - bitmap_release_region(mem->bitmap, page, order);
10436 - return 1;
10437 - }
10438 - return 0;
10439 -}
10440 -#else
10441 -#define dma_alloc_from_coherent_mem(dev, size, handle, ret) (0)
10442 -#define dma_release_coherent(dev, order, vaddr) (0)
10443 -#endif /* CONFIG_X86_32 */
10444 -
10445 int dma_supported(struct device *dev, u64 mask)
10446 {
10447 + struct dma_mapping_ops *ops = get_dma_ops(dev);
10448 +
10449 #ifdef CONFIG_PCI
10450 if (mask > 0xffffffff && forbid_dac > 0) {
10451 - printk(KERN_INFO "PCI: Disallowing DAC for device %s\n",
10452 - dev->bus_id);
10453 + dev_info(dev, "PCI: Disallowing DAC for device\n");
10454 return 0;
10455 }
10456 #endif
10457
10458 - if (dma_ops->dma_supported)
10459 - return dma_ops->dma_supported(dev, mask);
10460 + if (ops->dma_supported)
10461 + return ops->dma_supported(dev, mask);
10462
10463 /* Copied from i386. Doesn't make much sense, because it will
10464 only work for pci_alloc_coherent.
10465 @@ -395,8 +283,7 @@ int dma_supported(struct device *dev, u6
10466 type. Normally this doesn't make any difference, but gives
10467 more gentle handling of IOMMU overflow. */
10468 if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) {
10469 - printk(KERN_INFO "%s: Force SAC with mask %Lx\n",
10470 - dev->bus_id, mask);
10471 + dev_info(dev, "Force SAC with mask %Lx\n", mask);
10472 return 0;
10473 }
10474
10475 @@ -422,6 +309,9 @@ void *
10476 dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
10477 gfp_t gfp)
10478 {
10479 +#ifndef CONFIG_XEN
10480 + struct dma_mapping_ops *ops = get_dma_ops(dev);
10481 +#endif
10482 void *memory = NULL;
10483 struct page *page;
10484 unsigned long dma_mask = 0;
10485 @@ -431,7 +321,7 @@ dma_alloc_coherent(struct device *dev, s
10486 /* ignore region specifiers */
10487 gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
10488
10489 - if (dma_alloc_from_coherent_mem(dev, size, dma_handle, &memory))
10490 + if (dma_alloc_from_coherent(dev, size, dma_handle, &memory))
10491 return memory;
10492
10493 if (!dev) {
10494 @@ -491,8 +381,8 @@ dma_alloc_coherent(struct device *dev, s
10495 /* Let low level make its own zone decisions */
10496 gfp &= ~(GFP_DMA32|GFP_DMA);
10497
10498 - if (dma_ops->alloc_coherent)
10499 - return dma_ops->alloc_coherent(dev, size,
10500 + if (ops->alloc_coherent)
10501 + return ops->alloc_coherent(dev, size,
10502 dma_handle, gfp);
10503 return NULL;
10504 }
10505 @@ -504,14 +394,14 @@ dma_alloc_coherent(struct device *dev, s
10506 }
10507 }
10508
10509 - if (dma_ops->alloc_coherent) {
10510 + if (ops->alloc_coherent) {
10511 free_pages((unsigned long)memory, order);
10512 gfp &= ~(GFP_DMA|GFP_DMA32);
10513 - return dma_ops->alloc_coherent(dev, size, dma_handle, gfp);
10514 + return ops->alloc_coherent(dev, size, dma_handle, gfp);
10515 }
10516
10517 - if (dma_ops->map_simple) {
10518 - *dma_handle = dma_ops->map_simple(dev, virt_to_bus(memory),
10519 + if (ops->map_simple) {
10520 + *dma_handle = ops->map_simple(dev, virt_to_bus(memory),
10521 size,
10522 PCI_DMA_BIDIRECTIONAL);
10523 if (*dma_handle != bad_dma_address)
10524 @@ -542,13 +432,17 @@ EXPORT_SYMBOL(dma_alloc_coherent);
10525 void dma_free_coherent(struct device *dev, size_t size,
10526 void *vaddr, dma_addr_t bus)
10527 {
10528 +#ifndef CONFIG_XEN
10529 + struct dma_mapping_ops *ops = get_dma_ops(dev);
10530 +#endif
10531 +
10532 int order = get_order(size);
10533 WARN_ON(irqs_disabled()); /* for portability */
10534 - if (dma_release_coherent(dev, order, vaddr))
10535 + if (dma_release_from_coherent(dev, order, vaddr))
10536 return;
10537 #ifndef CONFIG_XEN
10538 - if (dma_ops->unmap_single)
10539 - dma_ops->unmap_single(dev, bus, size, 0);
10540 + if (ops->unmap_single)
10541 + ops->unmap_single(dev, bus, size, 0);
10542 #endif
10543 xen_destroy_contiguous_region((unsigned long)vaddr, order);
10544 free_pages((unsigned long)vaddr, order);
10545 @@ -557,15 +451,13 @@ EXPORT_SYMBOL(dma_free_coherent);
10546
10547 static int __init pci_iommu_init(void)
10548 {
10549 -#ifdef CONFIG_CALGARY_IOMMU
10550 calgary_iommu_init();
10551 -#endif
10552
10553 intel_iommu_init();
10554
10555 -#ifdef CONFIG_GART_IOMMU
10556 + amd_iommu_init();
10557 +
10558 gart_iommu_init();
10559 -#endif
10560
10561 no_iommu_init();
10562 return 0;
10563 --- sle11-2009-06-04.orig/arch/x86/kernel/pci-nommu-xen.c 2009-03-16 16:38:05.000000000 +0100
10564 +++ sle11-2009-06-04/arch/x86/kernel/pci-nommu-xen.c 2009-06-04 10:21:39.000000000 +0200
10565 @@ -79,18 +79,12 @@ gnttab_unmap_single(struct device *dev,
10566 gnttab_dma_unmap_page(dma_addr);
10567 }
10568
10569 -static int nommu_mapping_error(dma_addr_t dma_addr)
10570 -{
10571 - return (dma_addr == bad_dma_address);
10572 -}
10573 -
10574 -static const struct dma_mapping_ops nommu_dma_ops = {
10575 +static struct dma_mapping_ops nommu_dma_ops = {
10576 .map_single = gnttab_map_single,
10577 .unmap_single = gnttab_unmap_single,
10578 .map_sg = gnttab_map_sg,
10579 .unmap_sg = gnttab_unmap_sg,
10580 .dma_supported = swiotlb_dma_supported,
10581 - .mapping_error = nommu_mapping_error
10582 };
10583
10584 void __init no_iommu_init(void)
10585 --- sle11-2009-06-04.orig/arch/x86/kernel/probe_roms_32.c 2009-06-04 11:08:07.000000000 +0200
10586 +++ sle11-2009-06-04/arch/x86/kernel/probe_roms_32.c 2009-06-04 10:21:39.000000000 +0200
10587 @@ -99,6 +99,11 @@ void __init probe_roms(void)
10588 unsigned char c;
10589 int i;
10590
10591 +#ifdef CONFIG_XEN
10592 + if (!is_initial_xendomain())
10593 + return;
10594 +#endif
10595 +
10596 /* video rom */
10597 upper = adapter_rom_resources[0].start;
10598 for (start = video_rom_resource.start; start < upper; start += 2048) {
10599 @@ -131,7 +136,7 @@ void __init probe_roms(void)
10600 upper = system_rom_resource.start;
10601
10602 /* check for extension rom (ignore length byte!) */
10603 - rom = isa_bus_to_virt(extension_rom_resource.start);
10604 + rom = isa_bus_to_virt((unsigned long)extension_rom_resource.start);
10605 if (romsignature(rom)) {
10606 length = extension_rom_resource.end - extension_rom_resource.start + 1;
10607 if (romchecksum(rom, length)) {
10608 --- sle11-2009-06-04.orig/arch/x86/kernel/process-xen.c 2009-03-16 16:38:05.000000000 +0100
10609 +++ sle11-2009-06-04/arch/x86/kernel/process-xen.c 2009-06-04 10:21:39.000000000 +0200
10610 @@ -6,6 +6,13 @@
10611 #include <linux/sched.h>
10612 #include <linux/module.h>
10613 #include <linux/pm.h>
10614 +#include <linux/clockchips.h>
10615 +#include <asm/system.h>
10616 +
10617 +unsigned long idle_halt;
10618 +EXPORT_SYMBOL(idle_halt);
10619 +unsigned long idle_nomwait;
10620 +EXPORT_SYMBOL(idle_nomwait);
10621
10622 struct kmem_cache *task_xstate_cachep;
10623
10624 @@ -45,6 +52,70 @@ void arch_task_cache_init(void)
10625 SLAB_PANIC, NULL);
10626 }
10627
10628 +/*
10629 + * Idle related variables and functions
10630 + */
10631 +unsigned long boot_option_idle_override = 0;
10632 +EXPORT_SYMBOL(boot_option_idle_override);
10633 +
10634 +/*
10635 + * Powermanagement idle function, if any..
10636 + */
10637 +void (*pm_idle)(void);
10638 +EXPORT_SYMBOL(pm_idle);
10639 +
10640 +#ifdef CONFIG_X86_32
10641 +/*
10642 + * This halt magic was a workaround for ancient floppy DMA
10643 + * wreckage. It should be safe to remove.
10644 + */
10645 +static int hlt_counter;
10646 +void disable_hlt(void)
10647 +{
10648 + hlt_counter++;
10649 +}
10650 +EXPORT_SYMBOL(disable_hlt);
10651 +
10652 +void enable_hlt(void)
10653 +{
10654 + hlt_counter--;
10655 +}
10656 +EXPORT_SYMBOL(enable_hlt);
10657 +
10658 +static inline int hlt_use_halt(void)
10659 +{
10660 + return (!hlt_counter && boot_cpu_data.hlt_works_ok);
10661 +}
10662 +#else
10663 +static inline int hlt_use_halt(void)
10664 +{
10665 + return 1;
10666 +}
10667 +#endif
10668 +
10669 +/*
10670 + * We use this if we don't have any better
10671 + * idle routine..
10672 + */
10673 +void xen_idle(void)
10674 +{
10675 + current_thread_info()->status &= ~TS_POLLING;
10676 + /*
10677 + * TS_POLLING-cleared state must be visible before we
10678 + * test NEED_RESCHED:
10679 + */
10680 + smp_mb();
10681 +
10682 + if (!need_resched())
10683 + safe_halt(); /* enables interrupts racelessly */
10684 + else
10685 + local_irq_enable();
10686 + current_thread_info()->status |= TS_POLLING;
10687 +}
10688 +#ifdef CONFIG_APM_MODULE
10689 +EXPORT_SYMBOL(default_idle);
10690 +#endif
10691 +
10692 static void do_nothing(void *unused)
10693 {
10694 }
10695 @@ -61,7 +132,7 @@ void cpu_idle_wait(void)
10696 {
10697 smp_mb();
10698 /* kick all the CPUs so that they exit out of pm_idle */
10699 - smp_call_function(do_nothing, NULL, 0, 1);
10700 + smp_call_function(do_nothing, NULL, 1);
10701 }
10702 EXPORT_SYMBOL_GPL(cpu_idle_wait);
10703
10704 @@ -125,60 +196,175 @@ static void poll_idle(void)
10705 *
10706 * idle=mwait overrides this decision and forces the usage of mwait.
10707 */
10708 +static int __cpuinitdata force_mwait;
10709 +
10710 +#define MWAIT_INFO 0x05
10711 +#define MWAIT_ECX_EXTENDED_INFO 0x01
10712 +#define MWAIT_EDX_C1 0xf0
10713 +
10714 static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
10715 {
10716 + u32 eax, ebx, ecx, edx;
10717 +
10718 if (force_mwait)
10719 return 1;
10720
10721 - if (c->x86_vendor == X86_VENDOR_AMD) {
10722 - switch(c->x86) {
10723 - case 0x10:
10724 - case 0x11:
10725 - return 0;
10726 + if (c->cpuid_level < MWAIT_INFO)
10727 + return 0;
10728 +
10729 + cpuid(MWAIT_INFO, &eax, &ebx, &ecx, &edx);
10730 + /* Check, whether EDX has extended info about MWAIT */
10731 + if (!(ecx & MWAIT_ECX_EXTENDED_INFO))
10732 + return 1;
10733 +
10734 + /*
10735 + * edx enumeratios MONITOR/MWAIT extensions. Check, whether
10736 + * C1 supports MWAIT
10737 + */
10738 + return (edx & MWAIT_EDX_C1);
10739 +}
10740 +
10741 +/*
10742 + * Check for AMD CPUs, which have potentially C1E support
10743 + */
10744 +static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
10745 +{
10746 + if (c->x86_vendor != X86_VENDOR_AMD)
10747 + return 0;
10748 +
10749 + if (c->x86 < 0x0F)
10750 + return 0;
10751 +
10752 + /* Family 0x0f models < rev F do not have C1E */
10753 + if (c->x86 == 0x0f && c->x86_model < 0x40)
10754 + return 0;
10755 +
10756 + return 1;
10757 +}
10758 +
10759 +static cpumask_t c1e_mask = CPU_MASK_NONE;
10760 +static int c1e_detected;
10761 +
10762 +void c1e_remove_cpu(int cpu)
10763 +{
10764 + cpu_clear(cpu, c1e_mask);
10765 +}
10766 +
10767 +/*
10768 + * C1E aware idle routine. We check for C1E active in the interrupt
10769 + * pending message MSR. If we detect C1E, then we handle it the same
10770 + * way as C3 power states (local apic timer and TSC stop)
10771 + */
10772 +static void c1e_idle(void)
10773 +{
10774 + if (need_resched())
10775 + return;
10776 +
10777 + if (!c1e_detected) {
10778 + u32 lo, hi;
10779 +
10780 + rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
10781 + if (lo & K8_INTP_C1E_ACTIVE_MASK) {
10782 + c1e_detected = 1;
10783 + if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
10784 + mark_tsc_unstable("TSC halt in AMD C1E");
10785 + printk(KERN_INFO "System has AMD C1E enabled\n");
10786 + set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E);
10787 }
10788 }
10789 - return 1;
10790 +
10791 + if (c1e_detected) {
10792 + int cpu = smp_processor_id();
10793 +
10794 + if (!cpu_isset(cpu, c1e_mask)) {
10795 + cpu_set(cpu, c1e_mask);
10796 + /*
10797 + * Force broadcast so ACPI can not interfere. Needs
10798 + * to run with interrupts enabled as it uses
10799 + * smp_function_call.
10800 + */
10801 + local_irq_enable();
10802 + clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE,
10803 + &cpu);
10804 + printk(KERN_INFO "Switch to broadcast mode on CPU%d\n",
10805 + cpu);
10806 + local_irq_disable();
10807 + }
10808 + clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
10809 +
10810 + default_idle();
10811 +
10812 + /*
10813 + * The switch back from broadcast mode needs to be
10814 + * called with interrupts disabled.
10815 + */
10816 + local_irq_disable();
10817 + clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
10818 + local_irq_enable();
10819 + } else
10820 + default_idle();
10821 }
10822 #endif
10823
10824 void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
10825 {
10826 #ifndef CONFIG_XEN
10827 - static int selected;
10828 -
10829 - if (selected)
10830 - return;
10831 #ifdef CONFIG_X86_SMP
10832 if (pm_idle == poll_idle && smp_num_siblings > 1) {
10833 printk(KERN_WARNING "WARNING: polling idle and HT enabled,"
10834 " performance may degrade.\n");
10835 }
10836 #endif
10837 + if (pm_idle)
10838 + return;
10839 +
10840 if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) {
10841 /*
10842 - * Skip, if setup has overridden idle.
10843 * One CPU supports mwait => All CPUs supports mwait
10844 */
10845 - if (!pm_idle) {
10846 - printk(KERN_INFO "using mwait in idle threads.\n");
10847 - pm_idle = mwait_idle;
10848 - }
10849 - }
10850 - selected = 1;
10851 + printk(KERN_INFO "using mwait in idle threads.\n");
10852 + pm_idle = mwait_idle;
10853 + } else if (check_c1e_idle(c)) {
10854 + printk(KERN_INFO "using C1E aware idle routine\n");
10855 + pm_idle = c1e_idle;
10856 + } else
10857 + pm_idle = default_idle;
10858 #endif
10859 }
10860
10861 static int __init idle_setup(char *str)
10862 {
10863 + if (!str)
10864 + return -EINVAL;
10865 +
10866 if (!strcmp(str, "poll")) {
10867 printk("using polling idle threads.\n");
10868 pm_idle = poll_idle;
10869 - }
10870 #ifndef CONFIG_XEN
10871 - else if (!strcmp(str, "mwait"))
10872 + } else if (!strcmp(str, "mwait"))
10873 force_mwait = 1;
10874 + else if (!strcmp(str, "halt")) {
10875 + /*
10876 + * When the boot option of idle=halt is added, halt is
10877 + * forced to be used for CPU idle. In such case CPU C2/C3
10878 + * won't be used again.
10879 + * To continue to load the CPU idle driver, don't touch
10880 + * the boot_option_idle_override.
10881 + */
10882 + pm_idle = default_idle;
10883 + idle_halt = 1;
10884 + return 0;
10885 + } else if (!strcmp(str, "nomwait")) {
10886 + /*
10887 + * If the boot option of "idle=nomwait" is added,
10888 + * it means that mwait will be disabled for CPU C2/C3
10889 + * states. In such case it won't touch the variable
10890 + * of boot_option_idle_override.
10891 + */
10892 + idle_nomwait = 1;
10893 + return 0;
10894 #endif
10895 - else
10896 + } else
10897 return -1;
10898
10899 boot_option_idle_override = 1;
10900 --- sle11-2009-06-04.orig/arch/x86/kernel/process_32-xen.c 2009-03-16 16:38:05.000000000 +0100
10901 +++ sle11-2009-06-04/arch/x86/kernel/process_32-xen.c 2009-06-04 10:21:39.000000000 +0200
10902 @@ -59,15 +59,11 @@
10903 #include <asm/tlbflush.h>
10904 #include <asm/cpu.h>
10905 #include <asm/kdebug.h>
10906 +#include <asm/idle.h>
10907
10908 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
10909 asmlinkage void cstar_ret_from_fork(void) __asm__("cstar_ret_from_fork");
10910
10911 -static int hlt_counter;
10912 -
10913 -unsigned long boot_option_idle_override = 0;
10914 -EXPORT_SYMBOL(boot_option_idle_override);
10915 -
10916 DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
10917 EXPORT_PER_CPU_SYMBOL(current_task);
10918
10919 @@ -82,46 +78,27 @@ unsigned long thread_saved_pc(struct tas
10920 return ((unsigned long *)tsk->thread.sp)[3];
10921 }
10922
10923 -/*
10924 - * Powermanagement idle function, if any..
10925 - */
10926 -void (*pm_idle)(void);
10927 -EXPORT_SYMBOL(pm_idle);
10928 +#ifdef CONFIG_HOTPLUG_CPU
10929 +#ifndef CONFIG_XEN
10930 +#include <asm/nmi.h>
10931
10932 -void disable_hlt(void)
10933 +static void cpu_exit_clear(void)
10934 {
10935 - hlt_counter++;
10936 -}
10937 + int cpu = raw_smp_processor_id();
10938
10939 -EXPORT_SYMBOL(disable_hlt);
10940 -
10941 -void enable_hlt(void)
10942 -{
10943 - hlt_counter--;
10944 -}
10945 + idle_task_exit();
10946
10947 -EXPORT_SYMBOL(enable_hlt);
10948 + cpu_uninit();
10949 + irq_ctx_exit(cpu);
10950
10951 -static void xen_idle(void)
10952 -{
10953 - current_thread_info()->status &= ~TS_POLLING;
10954 - /*
10955 - * TS_POLLING-cleared state must be visible before we
10956 - * test NEED_RESCHED:
10957 - */
10958 - smp_mb();
10959 + cpu_clear(cpu, cpu_callout_map);
10960 + cpu_clear(cpu, cpu_callin_map);
10961
10962 - if (!need_resched())
10963 - safe_halt(); /* enables interrupts racelessly */
10964 - else
10965 - local_irq_enable();
10966 - current_thread_info()->status |= TS_POLLING;
10967 + numa_remove_cpu(cpu);
10968 + c1e_remove_cpu(cpu);
10969 }
10970 -#ifdef CONFIG_APM_MODULE
10971 -EXPORT_SYMBOL(default_idle);
10972 #endif
10973
10974 -#ifdef CONFIG_HOTPLUG_CPU
10975 static inline void play_dead(void)
10976 {
10977 idle_task_exit();
10978 @@ -152,13 +129,11 @@ void cpu_idle(void)
10979
10980 /* endless idle loop with no priority at all */
10981 while (1) {
10982 - tick_nohz_stop_sched_tick();
10983 + tick_nohz_stop_sched_tick(1);
10984 while (!need_resched()) {
10985 - void (*idle)(void);
10986
10987 check_pgt_cache();
10988 rmb();
10989 - idle = xen_idle; /* no alternatives */
10990
10991 if (rcu_pending(cpu))
10992 rcu_check_callbacks(cpu, 0);
10993 @@ -168,7 +143,10 @@ void cpu_idle(void)
10994
10995 local_irq_disable();
10996 __get_cpu_var(irq_stat).idle_timestamp = jiffies;
10997 - idle();
10998 + /* Don't trace irqs off for idle */
10999 + stop_critical_timings();
11000 + xen_idle();
11001 + start_critical_timings();
11002 }
11003 tick_nohz_restart_sched_tick();
11004 preempt_enable_no_resched();
11005 --- sle11-2009-06-04.orig/arch/x86/kernel/process_64-xen.c 2009-03-16 16:38:05.000000000 +0100
11006 +++ sle11-2009-06-04/arch/x86/kernel/process_64-xen.c 2009-06-04 10:21:39.000000000 +0200
11007 @@ -65,15 +65,6 @@ asmlinkage extern void ret_from_fork(voi
11008
11009 unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
11010
11011 -unsigned long boot_option_idle_override = 0;
11012 -EXPORT_SYMBOL(boot_option_idle_override);
11013 -
11014 -/*
11015 - * Powermanagement idle function, if any..
11016 - */
11017 -void (*pm_idle)(void);
11018 -EXPORT_SYMBOL(pm_idle);
11019 -
11020 static ATOMIC_NOTIFIER_HEAD(idle_notifier);
11021
11022 void idle_notifier_register(struct notifier_block *n)
11023 @@ -103,25 +94,13 @@ void exit_idle(void)
11024 __exit_idle();
11025 }
11026
11027 -static void xen_idle(void)
11028 -{
11029 - current_thread_info()->status &= ~TS_POLLING;
11030 - /*
11031 - * TS_POLLING-cleared state must be visible before we
11032 - * test NEED_RESCHED:
11033 - */
11034 - smp_mb();
11035 - if (!need_resched())
11036 - safe_halt(); /* enables interrupts racelessly */
11037 - else
11038 - local_irq_enable();
11039 - current_thread_info()->status |= TS_POLLING;
11040 -}
11041 -
11042 #ifdef CONFIG_HOTPLUG_CPU
11043 static inline void play_dead(void)
11044 {
11045 idle_task_exit();
11046 +#ifndef CONFIG_XEN
11047 + c1e_remove_cpu(raw_smp_processor_id());
11048 +#endif
11049 local_irq_disable();
11050 cpu_clear(smp_processor_id(), cpu_initialized);
11051 preempt_enable_no_resched();
11052 @@ -146,12 +125,11 @@ void cpu_idle(void)
11053 current_thread_info()->status |= TS_POLLING;
11054 /* endless idle loop with no priority at all */
11055 while (1) {
11056 - tick_nohz_stop_sched_tick();
11057 + tick_nohz_stop_sched_tick(1);
11058 while (!need_resched()) {
11059 - void (*idle)(void);
11060
11061 rmb();
11062 - idle = xen_idle; /* no alternatives */
11063 +
11064 if (cpu_is_offline(smp_processor_id()))
11065 play_dead();
11066 /*
11067 @@ -161,7 +139,10 @@ void cpu_idle(void)
11068 */
11069 local_irq_disable();
11070 enter_idle();
11071 - idle();
11072 + /* Don't trace irqs off for idle */
11073 + stop_critical_timings();
11074 + xen_idle();
11075 + start_critical_timings();
11076 /* In many cases the interrupt that ended idle
11077 has already called exit_idle. But some idle
11078 loops can be woken up without interrupt. */
11079 @@ -271,7 +252,7 @@ void exit_thread(void)
11080 }
11081 }
11082
11083 -void load_gs_index(unsigned gs)
11084 +void xen_load_gs_index(unsigned gs)
11085 {
11086 WARN_ON(HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, gs));
11087 }
11088 @@ -373,10 +354,10 @@ int copy_thread(int nr, unsigned long cl
11089 p->thread.fs = me->thread.fs;
11090 p->thread.gs = me->thread.gs;
11091
11092 - asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
11093 - asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
11094 - asm("mov %%es,%0" : "=m" (p->thread.es));
11095 - asm("mov %%ds,%0" : "=m" (p->thread.ds));
11096 + savesegment(gs, p->thread.gsindex);
11097 + savesegment(fs, p->thread.fsindex);
11098 + savesegment(es, p->thread.es);
11099 + savesegment(ds, p->thread.ds);
11100
11101 if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
11102 p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
11103 @@ -417,7 +398,9 @@ out:
11104 void
11105 start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
11106 {
11107 - asm volatile("movl %0, %%fs; movl %0, %%es; movl %0, %%ds" :: "r"(0));
11108 + loadsegment(fs, 0);
11109 + loadsegment(es, 0);
11110 + loadsegment(ds, 0);
11111 load_gs_index(0);
11112 regs->ip = new_ip;
11113 regs->sp = new_sp;
11114 @@ -557,8 +540,8 @@ static inline void __switch_to_xtra(stru
11115 struct task_struct *
11116 __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
11117 {
11118 - struct thread_struct *prev = &prev_p->thread,
11119 - *next = &next_p->thread;
11120 + struct thread_struct *prev = &prev_p->thread;
11121 + struct thread_struct *next = &next_p->thread;
11122 int cpu = smp_processor_id();
11123 #ifndef CONFIG_X86_NO_TSS
11124 struct tss_struct *tss = &per_cpu(init_tss, cpu);
11125 @@ -661,12 +644,25 @@ __switch_to(struct task_struct *prev_p,
11126 */
11127 if (unlikely(next->es))
11128 loadsegment(es, next->es);
11129 -
11130 +
11131 if (unlikely(next->ds))
11132 loadsegment(ds, next->ds);
11133
11134 + /*
11135 + * Leave lazy mode, flushing any hypercalls made here.
11136 + * This must be done before restoring TLS segments so
11137 + * the GDT and LDT are properly updated, and must be
11138 + * done before math_state_restore, so the TS bit is up
11139 + * to date.
11140 + */
11141 + arch_leave_lazy_cpu_mode();
11142 +
11143 /*
11144 * Switch FS and GS.
11145 + *
11146 + * Segment register != 0 always requires a reload. Also
11147 + * reload when it has changed. When prev process used 64bit
11148 + * base always reload to avoid an information leak.
11149 */
11150 if (unlikely(next->fsindex))
11151 loadsegment(fs, next->fsindex);
11152 @@ -687,7 +683,8 @@ __switch_to(struct task_struct *prev_p,
11153 write_pda(oldrsp, next->usersp);
11154 write_pda(pcurrent, next_p);
11155 write_pda(kernelstack,
11156 - (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
11157 + (unsigned long)task_stack_page(next_p) +
11158 + THREAD_SIZE - PDA_STACKOFFSET);
11159 #ifdef CONFIG_CC_STACKPROTECTOR
11160 write_pda(stack_canary, next_p->stack_canary);
11161
11162 @@ -848,7 +845,7 @@ long do_arch_prctl(struct task_struct *t
11163 set_32bit_tls(task, FS_TLS, addr);
11164 if (doit) {
11165 load_TLS(&task->thread, cpu);
11166 - asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
11167 + loadsegment(fs, FS_TLS_SEL);
11168 }
11169 task->thread.fsindex = FS_TLS_SEL;
11170 task->thread.fs = 0;
11171 @@ -858,7 +855,7 @@ long do_arch_prctl(struct task_struct *t
11172 if (doit) {
11173 /* set the selector to 0 to not confuse
11174 __switch_to */
11175 - asm volatile("movl %0,%%fs" :: "r" (0));
11176 + loadsegment(fs, 0);
11177 ret = HYPERVISOR_set_segment_base(SEGBASE_FS,
11178 addr);
11179 }
11180 @@ -882,7 +879,7 @@ long do_arch_prctl(struct task_struct *t
11181 if (task->thread.gsindex == GS_TLS_SEL)
11182 base = read_32bit_tls(task, GS_TLS);
11183 else if (doit) {
11184 - asm("movl %%gs,%0" : "=r" (gsindex));
11185 + savesegment(gs, gsindex);
11186 if (gsindex)
11187 rdmsrl(MSR_KERNEL_GS_BASE, base);
11188 else
11189 --- sle11-2009-06-04.orig/arch/x86/kernel/quirks-xen.c 2009-03-16 16:33:40.000000000 +0100
11190 +++ sle11-2009-06-04/arch/x86/kernel/quirks-xen.c 2009-06-04 10:21:39.000000000 +0200
11191 @@ -63,6 +63,7 @@ static enum {
11192 ICH_FORCE_HPET_RESUME,
11193 VT8237_FORCE_HPET_RESUME,
11194 NVIDIA_FORCE_HPET_RESUME,
11195 + ATI_FORCE_HPET_RESUME,
11196 } force_hpet_resume_type;
11197
11198 static void __iomem *rcba_base;
11199 @@ -156,6 +157,8 @@ static void ich_force_enable_hpet(struct
11200
11201 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ESB2_0,
11202 ich_force_enable_hpet);
11203 +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH6_0,
11204 + ich_force_enable_hpet);
11205 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH6_1,
11206 ich_force_enable_hpet);
11207 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_0,
11208 @@ -172,6 +175,12 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_I
11209
11210 static struct pci_dev *cached_dev;
11211
11212 +static void hpet_print_force_info(void)
11213 +{
11214 + printk(KERN_INFO "HPET not enabled in BIOS. "
11215 + "You might try hpet=force boot option\n");
11216 +}
11217 +
11218 static void old_ich_force_hpet_resume(void)
11219 {
11220 u32 val;
11221 @@ -251,8 +260,12 @@ static void old_ich_force_enable_hpet_us
11222 {
11223 if (hpet_force_user)
11224 old_ich_force_enable_hpet(dev);
11225 + else
11226 + hpet_print_force_info();
11227 }
11228
11229 +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ESB_1,
11230 + old_ich_force_enable_hpet_user);
11231 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_0,
11232 old_ich_force_enable_hpet_user);
11233 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_12,
11234 @@ -288,9 +301,14 @@ static void vt8237_force_enable_hpet(str
11235 {
11236 u32 uninitialized_var(val);
11237
11238 - if (!hpet_force_user || hpet_address || force_hpet_address)
11239 + if (hpet_address || force_hpet_address)
11240 return;
11241
11242 + if (!hpet_force_user) {
11243 + hpet_print_force_info();
11244 + return;
11245 + }
11246 +
11247 pci_read_config_dword(dev, 0x68, &val);
11248 /*
11249 * Bit 7 is HPET enable bit.
11250 @@ -328,6 +346,36 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_V
11251 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8237,
11252 vt8237_force_enable_hpet);
11253
11254 +static void ati_force_hpet_resume(void)
11255 +{
11256 + pci_write_config_dword(cached_dev, 0x14, 0xfed00000);
11257 + printk(KERN_DEBUG "Force enabled HPET at resume\n");
11258 +}
11259 +
11260 +static void ati_force_enable_hpet(struct pci_dev *dev)
11261 +{
11262 + u32 uninitialized_var(val);
11263 +
11264 + if (hpet_address || force_hpet_address)
11265 + return;
11266 +
11267 + if (!hpet_force_user) {
11268 + hpet_print_force_info();
11269 + return;
11270 + }
11271 +
11272 + pci_write_config_dword(dev, 0x14, 0xfed00000);
11273 + pci_read_config_dword(dev, 0x14, &val);
11274 + force_hpet_address = val;
11275 + force_hpet_resume_type = ATI_FORCE_HPET_RESUME;
11276 + dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n",
11277 + force_hpet_address);
11278 + cached_dev = dev;
11279 + return;
11280 +}
11281 +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS,
11282 + ati_force_enable_hpet);
11283 +
11284 /*
11285 * Undocumented chipset feature taken from LinuxBIOS.
11286 */
11287 @@ -341,8 +389,13 @@ static void nvidia_force_enable_hpet(str
11288 {
11289 u32 uninitialized_var(val);
11290
11291 - if (!hpet_force_user || hpet_address || force_hpet_address)
11292 + if (hpet_address || force_hpet_address)
11293 + return;
11294 +
11295 + if (!hpet_force_user) {
11296 + hpet_print_force_info();
11297 return;
11298 + }
11299
11300 pci_write_config_dword(dev, 0x44, 0xfed00001);
11301 pci_read_config_dword(dev, 0x44, &val);
11302 @@ -395,6 +448,9 @@ void force_hpet_resume(void)
11303 case NVIDIA_FORCE_HPET_RESUME:
11304 nvidia_force_hpet_resume();
11305 return;
11306 + case ATI_FORCE_HPET_RESUME:
11307 + ati_force_hpet_resume();
11308 + return;
11309 default:
11310 break;
11311 }
11312 --- sle11-2009-06-04.orig/arch/x86/kernel/setup-xen.c 2009-03-16 16:38:05.000000000 +0100
11313 +++ sle11-2009-06-04/arch/x86/kernel/setup-xen.c 2009-06-04 10:21:39.000000000 +0200
11314 @@ -1,141 +1,1131 @@
11315 -#include <linux/kernel.h>
11316 +/*
11317 + * Copyright (C) 1995 Linus Torvalds
11318 + *
11319 + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
11320 + *
11321 + * Memory region support
11322 + * David Parsons <orc@pell.chi.il.us>, July-August 1999
11323 + *
11324 + * Added E820 sanitization routine (removes overlapping memory regions);
11325 + * Brian Moyle <bmoyle@mvista.com>, February 2001
11326 + *
11327 + * Moved CPU detection code to cpu/${cpu}.c
11328 + * Patrick Mochel <mochel@osdl.org>, March 2002
11329 + *
11330 + * Provisions for empty E820 memory regions (reported by certain BIOSes).
11331 + * Alex Achenbach <xela@slit.de>, December 2002.
11332 + *
11333 + */
11334 +
11335 +/*
11336 + * This file handles the architecture-dependent parts of initialization
11337 + */
11338 +
11339 +#include <linux/sched.h>
11340 +#include <linux/mm.h>
11341 +#include <linux/mmzone.h>
11342 +#include <linux/screen_info.h>
11343 +#include <linux/ioport.h>
11344 +#include <linux/acpi.h>
11345 +#include <linux/apm_bios.h>
11346 +#include <linux/initrd.h>
11347 +#include <linux/bootmem.h>
11348 +#include <linux/seq_file.h>
11349 +#include <linux/console.h>
11350 +#include <linux/mca.h>
11351 +#include <linux/root_dev.h>
11352 +#include <linux/highmem.h>
11353 #include <linux/module.h>
11354 +#include <linux/efi.h>
11355 #include <linux/init.h>
11356 -#include <linux/bootmem.h>
11357 +#include <linux/edd.h>
11358 +#include <linux/iscsi_ibft.h>
11359 +#include <linux/nodemask.h>
11360 +#include <linux/kexec.h>
11361 +#include <linux/dmi.h>
11362 +#include <linux/pfn.h>
11363 +#include <linux/pci.h>
11364 +#include <asm/pci-direct.h>
11365 +#include <linux/init_ohci1394_dma.h>
11366 +#include <linux/kvm_para.h>
11367 +
11368 +#include <linux/errno.h>
11369 +#include <linux/kernel.h>
11370 +#include <linux/stddef.h>
11371 +#include <linux/unistd.h>
11372 +#include <linux/ptrace.h>
11373 +#include <linux/slab.h>
11374 +#include <linux/user.h>
11375 +#include <linux/delay.h>
11376 +
11377 +#include <linux/kallsyms.h>
11378 +#include <linux/cpufreq.h>
11379 +#include <linux/dma-mapping.h>
11380 +#include <linux/ctype.h>
11381 +#include <linux/uaccess.h>
11382 +
11383 #include <linux/percpu.h>
11384 -#include <asm/smp.h>
11385 -#include <asm/percpu.h>
11386 +#include <linux/crash_dump.h>
11387 +
11388 +#include <video/edid.h>
11389 +
11390 +#include <asm/mtrr.h>
11391 +#include <asm/apic.h>
11392 +#include <asm/e820.h>
11393 +#include <asm/mpspec.h>
11394 +#include <asm/setup.h>
11395 +#include <asm/arch_hooks.h>
11396 +#include <asm/efi.h>
11397 #include <asm/sections.h>
11398 +#include <asm/dmi.h>
11399 +#include <asm/io_apic.h>
11400 +#include <asm/ist.h>
11401 +#include <asm/vmi.h>
11402 +#include <setup_arch.h>
11403 +#include <asm/bios_ebda.h>
11404 +#include <asm/cacheflush.h>
11405 #include <asm/processor.h>
11406 -#include <asm/setup.h>
11407 +#include <asm/bugs.h>
11408 +
11409 +#include <asm/system.h>
11410 +#include <asm/vsyscall.h>
11411 +#include <asm/smp.h>
11412 +#include <asm/desc.h>
11413 +#include <asm/dma.h>
11414 +#include <asm/iommu.h>
11415 +#include <asm/mmu_context.h>
11416 +#include <asm/proto.h>
11417 +
11418 +#include <mach_apic.h>
11419 +#include <asm/paravirt.h>
11420 +
11421 +#include <asm/percpu.h>
11422 #include <asm/topology.h>
11423 -#include <asm/mpspec.h>
11424 #include <asm/apicdef.h>
11425 +#ifdef CONFIG_X86_64
11426 +#include <asm/numa_64.h>
11427 +#endif
11428 +
11429 +#ifdef CONFIG_XEN
11430 +#include <asm/hypervisor.h>
11431 +#include <xen/interface/kexec.h>
11432 +#include <xen/interface/memory.h>
11433 +#include <xen/interface/nmi.h>
11434 +#include <xen/interface/physdev.h>
11435 +#include <xen/features.h>
11436 +#include <xen/firmware.h>
11437 +#include <xen/xencons.h>
11438 +
11439 +shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
11440 +EXPORT_SYMBOL(HYPERVISOR_shared_info);
11441
11442 -#ifdef CONFIG_X86_LOCAL_APIC
11443 -unsigned int num_processors;
11444 -unsigned disabled_cpus __cpuinitdata;
11445 -/* Processor that is doing the boot up */
11446 -unsigned int boot_cpu_physical_apicid = -1U;
11447 -EXPORT_SYMBOL(boot_cpu_physical_apicid);
11448 +static int xen_panic_event(struct notifier_block *, unsigned long, void *);
11449 +static struct notifier_block xen_panic_block = {
11450 + xen_panic_event, NULL, 0 /* try to go last */
11451 +};
11452
11453 -DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID;
11454 -EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid);
11455 +unsigned long *phys_to_machine_mapping;
11456 +EXPORT_SYMBOL(phys_to_machine_mapping);
11457
11458 -/* Bitmask of physically existing CPUs */
11459 -physid_mask_t phys_cpu_present_map;
11460 +unsigned long *pfn_to_mfn_frame_list_list,
11461 +#ifdef CONFIG_X86_64
11462 + *pfn_to_mfn_frame_list[512];
11463 +#else
11464 + *pfn_to_mfn_frame_list[128];
11465 +#endif
11466 +
11467 +/* Raw start-of-day parameters from the hypervisor. */
11468 +start_info_t *xen_start_info;
11469 +EXPORT_SYMBOL(xen_start_info);
11470 +#endif
11471 +
11472 +#ifndef ARCH_SETUP
11473 +#define ARCH_SETUP
11474 +#endif
11475 +
11476 +#ifndef CONFIG_XEN
11477 +#ifndef CONFIG_DEBUG_BOOT_PARAMS
11478 +struct boot_params __initdata boot_params;
11479 +#else
11480 +struct boot_params boot_params;
11481 +#endif
11482 #endif
11483
11484 -#if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_X86_SMP)
11485 /*
11486 - * Copy data used in early init routines from the initial arrays to the
11487 - * per cpu data areas. These arrays then become expendable and the
11488 - * *_early_ptr's are zeroed indicating that the static arrays are gone.
11489 + * Machine setup..
11490 */
11491 -static void __init setup_per_cpu_maps(void)
11492 +static struct resource data_resource = {
11493 + .name = "Kernel data",
11494 + .start = 0,
11495 + .end = 0,
11496 + .flags = IORESOURCE_BUSY | IORESOURCE_MEM
11497 +};
11498 +
11499 +static struct resource code_resource = {
11500 + .name = "Kernel code",
11501 + .start = 0,
11502 + .end = 0,
11503 + .flags = IORESOURCE_BUSY | IORESOURCE_MEM
11504 +};
11505 +
11506 +static struct resource bss_resource = {
11507 + .name = "Kernel bss",
11508 + .start = 0,
11509 + .end = 0,
11510 + .flags = IORESOURCE_BUSY | IORESOURCE_MEM
11511 +};
11512 +
11513 +
11514 +#ifdef CONFIG_X86_32
11515 +#ifndef CONFIG_XEN
11516 +/* This value is set up by the early boot code to point to the value
11517 + immediately after the boot time page tables. It contains a *physical*
11518 + address, and must not be in the .bss segment! */
11519 +unsigned long init_pg_tables_start __initdata = ~0UL;
11520 +unsigned long init_pg_tables_end __initdata = ~0UL;
11521 +#endif
11522 +
11523 +static struct resource video_ram_resource = {
11524 + .name = "Video RAM area",
11525 + .start = 0xa0000,
11526 + .end = 0xbffff,
11527 + .flags = IORESOURCE_BUSY | IORESOURCE_MEM
11528 +};
11529 +
11530 +/* cpu data as detected by the assembly code in head.S */
11531 +struct cpuinfo_x86 new_cpu_data __cpuinitdata = {0, 0, 0, 0, -1, 1, 0, 0, -1};
11532 +/* common cpu data for all cpus */
11533 +struct cpuinfo_x86 boot_cpu_data __read_mostly = {0, 0, 0, 0, -1, 1, 0, 0, -1};
11534 +EXPORT_SYMBOL(boot_cpu_data);
11535 +#ifndef CONFIG_XEN
11536 +static void set_mca_bus(int x)
11537 +{
11538 +#ifdef CONFIG_MCA
11539 + MCA_bus = x;
11540 +#endif
11541 +}
11542 +
11543 +unsigned int def_to_bigsmp;
11544 +
11545 +/* for MCA, but anyone else can use it if they want */
11546 +unsigned int machine_id;
11547 +unsigned int machine_submodel_id;
11548 +unsigned int BIOS_revision;
11549 +
11550 +struct apm_info apm_info;
11551 +EXPORT_SYMBOL(apm_info);
11552 +#endif
11553 +
11554 +#if defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
11555 +struct ist_info ist_info;
11556 +EXPORT_SYMBOL(ist_info);
11557 +#elif defined(CONFIG_X86_SPEEDSTEP_SMI)
11558 +struct ist_info ist_info;
11559 +#endif
11560 +
11561 +#else
11562 +struct cpuinfo_x86 boot_cpu_data __read_mostly;
11563 +EXPORT_SYMBOL(boot_cpu_data);
11564 +#endif
11565 +
11566 +
11567 +#if !defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64)
11568 +unsigned long mmu_cr4_features;
11569 +#else
11570 +unsigned long mmu_cr4_features = X86_CR4_PAE;
11571 +#endif
11572 +
11573 +/* Boot loader ID as an integer, for the benefit of proc_dointvec */
11574 +int bootloader_type;
11575 +
11576 +/*
11577 + * Early DMI memory
11578 + */
11579 +int dmi_alloc_index;
11580 +char dmi_alloc_data[DMI_MAX_DATA];
11581 +
11582 +/*
11583 + * Setup options
11584 + */
11585 +struct screen_info screen_info;
11586 +EXPORT_SYMBOL(screen_info);
11587 +struct edid_info edid_info;
11588 +EXPORT_SYMBOL_GPL(edid_info);
11589 +
11590 +extern int root_mountflags;
11591 +
11592 +unsigned long saved_video_mode;
11593 +
11594 +#define RAMDISK_IMAGE_START_MASK 0x07FF
11595 +#define RAMDISK_PROMPT_FLAG 0x8000
11596 +#define RAMDISK_LOAD_FLAG 0x4000
11597 +
11598 +static char __initdata command_line[COMMAND_LINE_SIZE];
11599 +
11600 +#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
11601 +struct edd edd;
11602 +#ifdef CONFIG_EDD_MODULE
11603 +EXPORT_SYMBOL(edd);
11604 +#endif
11605 +#ifndef CONFIG_XEN
11606 +/**
11607 + * copy_edd() - Copy the BIOS EDD information
11608 + * from boot_params into a safe place.
11609 + *
11610 + */
11611 +static inline void copy_edd(void)
11612 +{
11613 + memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer,
11614 + sizeof(edd.mbr_signature));
11615 + memcpy(edd.edd_info, boot_params.eddbuf, sizeof(edd.edd_info));
11616 + edd.mbr_signature_nr = boot_params.edd_mbr_sig_buf_entries;
11617 + edd.edd_info_nr = boot_params.eddbuf_entries;
11618 +}
11619 +#endif
11620 +#else
11621 +static inline void copy_edd(void)
11622 +{
11623 +}
11624 +#endif
11625 +
11626 +#ifdef CONFIG_BLK_DEV_INITRD
11627 +
11628 +#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
11629 +
11630 +#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
11631 +static void __init relocate_initrd(void)
11632 +{
11633 +
11634 + u64 ramdisk_image = boot_params.hdr.ramdisk_image;
11635 + u64 ramdisk_size = boot_params.hdr.ramdisk_size;
11636 + u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT;
11637 + u64 ramdisk_here;
11638 + unsigned long slop, clen, mapaddr;
11639 + char *p, *q;
11640 +
11641 + /* We need to move the initrd down into lowmem */
11642 + ramdisk_here = find_e820_area(0, end_of_lowmem, ramdisk_size,
11643 + PAGE_SIZE);
11644 +
11645 + if (ramdisk_here == -1ULL)
11646 + panic("Cannot find place for new RAMDISK of size %lld\n",
11647 + ramdisk_size);
11648 +
11649 + /* Note: this includes all the lowmem currently occupied by
11650 + the initrd, we rely on that fact to keep the data intact. */
11651 + reserve_early(ramdisk_here, ramdisk_here + ramdisk_size,
11652 + "NEW RAMDISK");
11653 + initrd_start = ramdisk_here + PAGE_OFFSET;
11654 + initrd_end = initrd_start + ramdisk_size;
11655 + printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n",
11656 + ramdisk_here, ramdisk_here + ramdisk_size);
11657 +
11658 + q = (char *)initrd_start;
11659 +
11660 + /* Copy any lowmem portion of the initrd */
11661 + if (ramdisk_image < end_of_lowmem) {
11662 + clen = end_of_lowmem - ramdisk_image;
11663 + p = (char *)__va(ramdisk_image);
11664 + memcpy(q, p, clen);
11665 + q += clen;
11666 + ramdisk_image += clen;
11667 + ramdisk_size -= clen;
11668 + }
11669 +
11670 + /* Copy the highmem portion of the initrd */
11671 + while (ramdisk_size) {
11672 + slop = ramdisk_image & ~PAGE_MASK;
11673 + clen = ramdisk_size;
11674 + if (clen > MAX_MAP_CHUNK-slop)
11675 + clen = MAX_MAP_CHUNK-slop;
11676 + mapaddr = ramdisk_image & PAGE_MASK;
11677 + p = early_ioremap(mapaddr, clen+slop);
11678 + memcpy(q, p+slop, clen);
11679 + early_iounmap(p, clen+slop);
11680 + q += clen;
11681 + ramdisk_image += clen;
11682 + ramdisk_size -= clen;
11683 + }
11684 + /* high pages is not converted by early_res_to_bootmem */
11685 + ramdisk_image = boot_params.hdr.ramdisk_image;
11686 + ramdisk_size = boot_params.hdr.ramdisk_size;
11687 + printk(KERN_INFO "Move RAMDISK from %016llx - %016llx to"
11688 + " %08llx - %08llx\n",
11689 + ramdisk_image, ramdisk_image + ramdisk_size - 1,
11690 + ramdisk_here, ramdisk_here + ramdisk_size - 1);
11691 +}
11692 +#endif
11693 +
11694 +static void __init reserve_initrd(void)
11695 {
11696 #ifndef CONFIG_XEN
11697 - int cpu;
11698 + u64 ramdisk_image = boot_params.hdr.ramdisk_image;
11699 + u64 ramdisk_size = boot_params.hdr.ramdisk_size;
11700 + u64 ramdisk_end = ramdisk_image + ramdisk_size;
11701 + u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT;
11702 +
11703 + if (!boot_params.hdr.type_of_loader ||
11704 + !ramdisk_image || !ramdisk_size)
11705 + return; /* No initrd provided by bootloader */
11706 +#else
11707 + unsigned long ramdisk_image = __pa(xen_start_info->mod_start);
11708 + unsigned long ramdisk_size = xen_start_info->mod_len;
11709 + unsigned long ramdisk_end = ramdisk_image + ramdisk_size;
11710 + unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
11711
11712 - for_each_possible_cpu(cpu) {
11713 - per_cpu(x86_cpu_to_apicid, cpu) = x86_cpu_to_apicid_init[cpu];
11714 - per_cpu(x86_bios_cpu_apicid, cpu) =
11715 - x86_bios_cpu_apicid_init[cpu];
11716 -#ifdef CONFIG_NUMA
11717 - per_cpu(x86_cpu_to_node_map, cpu) =
11718 - x86_cpu_to_node_map_init[cpu];
11719 + if (!xen_start_info->mod_start || !ramdisk_size)
11720 + return; /* No initrd provided by bootloader */
11721 #endif
11722 +
11723 + initrd_start = 0;
11724 +
11725 + if (ramdisk_size >= (end_of_lowmem>>1)) {
11726 + free_early(ramdisk_image, ramdisk_end);
11727 + printk(KERN_ERR "initrd too large to handle, "
11728 + "disabling initrd\n");
11729 + return;
11730 }
11731
11732 - /* indicate the early static arrays will soon be gone */
11733 - x86_cpu_to_apicid_early_ptr = NULL;
11734 - x86_bios_cpu_apicid_early_ptr = NULL;
11735 -#ifdef CONFIG_NUMA
11736 - x86_cpu_to_node_map_early_ptr = NULL;
11737 + printk(KERN_INFO "RAMDISK: %08lx - %08lx\n", ramdisk_image,
11738 + ramdisk_end);
11739 +
11740 +
11741 + if (ramdisk_end <= end_of_lowmem) {
11742 + /* All in lowmem, easy case */
11743 + /*
11744 + * don't need to reserve again, already reserved early
11745 + * in i386_start_kernel
11746 + */
11747 + initrd_start = ramdisk_image + PAGE_OFFSET;
11748 + initrd_end = initrd_start + ramdisk_size;
11749 +#ifdef CONFIG_X86_64_XEN
11750 + initrd_below_start_ok = 1;
11751 #endif
11752 + return;
11753 + }
11754 +
11755 +#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
11756 + relocate_initrd();
11757 +#else
11758 + printk(KERN_ERR "initrd extends beyond end of memory "
11759 + "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
11760 + ramdisk_end, end_of_lowmem);
11761 + initrd_start = 0;
11762 #endif
11763 + free_early(ramdisk_image, ramdisk_end);
11764 }
11765 +#else
11766 +static void __init reserve_initrd(void)
11767 +{
11768 +}
11769 +#endif /* CONFIG_BLK_DEV_INITRD */
11770 +
11771 +static void __init parse_setup_data(void)
11772 +{
11773 +#ifndef CONFIG_XEN
11774 + struct setup_data *data;
11775 + u64 pa_data;
11776
11777 -#ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP
11778 -cpumask_t *cpumask_of_cpu_map __read_mostly;
11779 -EXPORT_SYMBOL(cpumask_of_cpu_map);
11780 + if (boot_params.hdr.version < 0x0209)
11781 + return;
11782 + pa_data = boot_params.hdr.setup_data;
11783 + while (pa_data) {
11784 + data = early_ioremap(pa_data, PAGE_SIZE);
11785 + switch (data->type) {
11786 + case SETUP_E820_EXT:
11787 + parse_e820_ext(data, pa_data);
11788 + break;
11789 + default:
11790 + break;
11791 + }
11792 + pa_data = data->next;
11793 + early_iounmap(data, PAGE_SIZE);
11794 + }
11795 +#endif
11796 +}
11797
11798 -/* requires nr_cpu_ids to be initialized */
11799 -static void __init setup_cpumask_of_cpu(void)
11800 +static void __init e820_reserve_setup_data(void)
11801 {
11802 - int i;
11803 +#ifndef CONFIG_XEN
11804 + struct setup_data *data;
11805 + u64 pa_data;
11806 + int found = 0;
11807 +
11808 + if (boot_params.hdr.version < 0x0209)
11809 + return;
11810 + pa_data = boot_params.hdr.setup_data;
11811 + while (pa_data) {
11812 + data = early_ioremap(pa_data, sizeof(*data));
11813 + e820_update_range(pa_data, sizeof(*data)+data->len,
11814 + E820_RAM, E820_RESERVED_KERN);
11815 + found = 1;
11816 + pa_data = data->next;
11817 + early_iounmap(data, sizeof(*data));
11818 + }
11819 + if (!found)
11820 + return;
11821
11822 - /* alloc_bootmem zeroes memory */
11823 - cpumask_of_cpu_map = alloc_bootmem_low(sizeof(cpumask_t) * nr_cpu_ids);
11824 - for (i = 0; i < nr_cpu_ids; i++)
11825 - cpu_set(i, cpumask_of_cpu_map[i]);
11826 + sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
11827 + memcpy(&e820_saved, &e820, sizeof(struct e820map));
11828 + printk(KERN_INFO "extended physical RAM map:\n");
11829 + e820_print_map("reserve setup_data");
11830 +#endif
11831 }
11832 -#else
11833 -static inline void setup_cpumask_of_cpu(void) { }
11834 +
11835 +static void __init reserve_early_setup_data(void)
11836 +{
11837 +#ifndef CONFIG_XEN
11838 + struct setup_data *data;
11839 + u64 pa_data;
11840 + char buf[32];
11841 +
11842 + if (boot_params.hdr.version < 0x0209)
11843 + return;
11844 + pa_data = boot_params.hdr.setup_data;
11845 + while (pa_data) {
11846 + data = early_ioremap(pa_data, sizeof(*data));
11847 + sprintf(buf, "setup data %x", data->type);
11848 + reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf);
11849 + pa_data = data->next;
11850 + early_iounmap(data, sizeof(*data));
11851 + }
11852 #endif
11853 +}
11854
11855 -#ifdef CONFIG_X86_32
11856 /*
11857 - * Great future not-so-futuristic plan: make i386 and x86_64 do it
11858 - * the same way
11859 + * --------- Crashkernel reservation ------------------------------
11860 + */
11861 +
11862 +#ifdef CONFIG_KEXEC
11863 +
11864 +#ifndef CONFIG_XEN
11865 +/**
11866 + * Reserve @size bytes of crashkernel memory at any suitable offset.
11867 + *
11868 + * @size: Size of the crashkernel memory to reserve.
11869 + * Returns the base address on success, and -1ULL on failure.
11870 + */
11871 +unsigned long long __init find_and_reserve_crashkernel(unsigned long long size)
11872 +{
11873 + const unsigned long long alignment = 16<<20; /* 16M */
11874 + unsigned long long start = 0LL;
11875 +
11876 + while (1) {
11877 + int ret;
11878 +
11879 + start = find_e820_area(start, ULONG_MAX, size, alignment);
11880 + if (start == -1ULL)
11881 + return start;
11882 +
11883 + /* try to reserve it */
11884 + ret = reserve_bootmem_generic(start, size, BOOTMEM_EXCLUSIVE);
11885 + if (ret >= 0)
11886 + return start;
11887 +
11888 + start += alignment;
11889 + }
11890 +}
11891 +
11892 +static inline unsigned long long get_total_mem(void)
11893 +{
11894 + unsigned long long total;
11895 +
11896 + total = max_low_pfn - min_low_pfn;
11897 +#ifdef CONFIG_HIGHMEM
11898 + total += highend_pfn - highstart_pfn;
11899 +#endif
11900 +
11901 + return total << PAGE_SHIFT;
11902 +}
11903 +
11904 +static void __init reserve_crashkernel(void)
11905 +{
11906 + unsigned long long total_mem;
11907 + unsigned long long crash_size, crash_base;
11908 + int ret;
11909 +
11910 + total_mem = get_total_mem();
11911 +
11912 + ret = parse_crashkernel(boot_command_line, total_mem,
11913 + &crash_size, &crash_base);
11914 + if (ret != 0 || crash_size <= 0)
11915 + return;
11916 +
11917 + /* 0 means: find the address automatically */
11918 + if (crash_base <= 0) {
11919 + crash_base = find_and_reserve_crashkernel(crash_size);
11920 + if (crash_base == -1ULL) {
11921 + pr_info("crashkernel reservation failed. "
11922 + "No suitable area found.\n");
11923 + return;
11924 + }
11925 + } else {
11926 + ret = reserve_bootmem_generic(crash_base, crash_size,
11927 + BOOTMEM_EXCLUSIVE);
11928 + if (ret < 0) {
11929 + pr_info("crashkernel reservation failed - "
11930 + "memory is in use\n");
11931 + return;
11932 + }
11933 + }
11934 +
11935 + printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
11936 + "for crashkernel (System RAM: %ldMB)\n",
11937 + (unsigned long)(crash_size >> 20),
11938 + (unsigned long)(crash_base >> 20),
11939 + (unsigned long)(total_mem >> 20));
11940 +
11941 + crashk_res.start = crash_base;
11942 + crashk_res.end = crash_base + crash_size - 1;
11943 + insert_resource(&iomem_resource, &crashk_res);
11944 +}
11945 +#else
11946 +#define reserve_crashkernel xen_machine_kexec_setup_resources
11947 +#endif
11948 +#else
11949 +static void __init reserve_crashkernel(void)
11950 +{
11951 +}
11952 +#endif
11953 +
11954 +static struct resource standard_io_resources[] = {
11955 + { .name = "dma1", .start = 0x00, .end = 0x1f,
11956 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11957 + { .name = "pic1", .start = 0x20, .end = 0x21,
11958 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11959 + { .name = "timer0", .start = 0x40, .end = 0x43,
11960 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11961 + { .name = "timer1", .start = 0x50, .end = 0x53,
11962 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11963 + { .name = "keyboard", .start = 0x60, .end = 0x60,
11964 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11965 + { .name = "keyboard", .start = 0x64, .end = 0x64,
11966 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11967 + { .name = "dma page reg", .start = 0x80, .end = 0x8f,
11968 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11969 + { .name = "pic2", .start = 0xa0, .end = 0xa1,
11970 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11971 + { .name = "dma2", .start = 0xc0, .end = 0xdf,
11972 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11973 + { .name = "fpu", .start = 0xf0, .end = 0xff,
11974 + .flags = IORESOURCE_BUSY | IORESOURCE_IO }
11975 +};
11976 +
11977 +static void __init reserve_standard_io_resources(void)
11978 +{
11979 + int i;
11980 +
11981 + /* Nothing to do if not running in dom0. */
11982 + if (!is_initial_xendomain())
11983 + return;
11984 +
11985 + /* request I/O space for devices used on all i[345]86 PCs */
11986 + for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
11987 + request_resource(&ioport_resource, &standard_io_resources[i]);
11988 +
11989 +}
11990 +
11991 +#ifdef CONFIG_PROC_VMCORE
11992 +/* elfcorehdr= specifies the location of elf core header
11993 + * stored by the crashed kernel. This option will be passed
11994 + * by kexec loader to the capture kernel.
11995 */
11996 -unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
11997 -EXPORT_SYMBOL(__per_cpu_offset);
11998 +static int __init setup_elfcorehdr(char *arg)
11999 +{
12000 + char *end;
12001 + if (!arg)
12002 + return -EINVAL;
12003 + elfcorehdr_addr = memparse(arg, &end);
12004 + return end > arg ? 0 : -EINVAL;
12005 +}
12006 +early_param("elfcorehdr", setup_elfcorehdr);
12007 #endif
12008
12009 +static struct x86_quirks default_x86_quirks __initdata;
12010 +
12011 +struct x86_quirks *x86_quirks __initdata = &default_x86_quirks;
12012 +
12013 +/*
12014 + * Determine if we were loaded by an EFI loader. If so, then we have also been
12015 + * passed the efi memmap, systab, etc., so we should use these data structures
12016 + * for initialization. Note, the efi init code path is determined by the
12017 + * global efi_enabled. This allows the same kernel image to be used on existing
12018 + * systems (with a traditional BIOS) as well as on EFI systems.
12019 + */
12020 /*
12021 - * Great future plan:
12022 - * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
12023 - * Always point %gs to its beginning
12024 + * setup_arch - architecture-specific boot-time initializations
12025 + *
12026 + * Note: On x86_64, fixmaps are ready for use even before this is called.
12027 */
12028 -void __init setup_per_cpu_areas(void)
12029 +
12030 +void __init setup_arch(char **cmdline_p)
12031 {
12032 - int i, highest_cpu = 0;
12033 - unsigned long size;
12034 +#ifdef CONFIG_XEN
12035 + unsigned int i;
12036 + unsigned long p2m_pages;
12037 + struct physdev_set_iopl set_iopl;
12038
12039 -#ifdef CONFIG_HOTPLUG_CPU
12040 - prefill_possible_map();
12041 +#ifdef CONFIG_X86_32
12042 + /* Force a quick death if the kernel panics (not domain 0). */
12043 + extern int panic_timeout;
12044 + if (!panic_timeout && !is_initial_xendomain())
12045 + panic_timeout = 1;
12046 #endif
12047
12048 - /* Copy section for each CPU (we discard the original) */
12049 - size = PERCPU_ENOUGH_ROOM;
12050 - printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n",
12051 - size);
12052 -
12053 - for_each_possible_cpu(i) {
12054 - char *ptr;
12055 -#ifndef CONFIG_NEED_MULTIPLE_NODES
12056 - ptr = alloc_bootmem_pages(size);
12057 -#else
12058 - int node = early_cpu_to_node(i);
12059 - if (!node_online(node) || !NODE_DATA(node)) {
12060 - ptr = alloc_bootmem_pages(size);
12061 - printk(KERN_INFO
12062 - "cpu %d has no node or node-local memory\n", i);
12063 - }
12064 - else
12065 - ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
12066 + /* Register a call for panic conditions. */
12067 + atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
12068 +
12069 + WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
12070 + VMASST_TYPE_writable_pagetables));
12071 +#ifdef CONFIG_X86_32
12072 + WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
12073 + VMASST_TYPE_4gb_segments));
12074 +#endif
12075 +#endif /* CONFIG_XEN */
12076 +
12077 +#ifdef CONFIG_X86_32
12078 + memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
12079 + visws_early_detect();
12080 + pre_setup_arch_hook();
12081 +#else
12082 + printk(KERN_INFO "Command line: %s\n", boot_command_line);
12083 +#endif
12084 +
12085 + early_cpu_init();
12086 + early_ioremap_init();
12087 +
12088 +#ifndef CONFIG_XEN
12089 + ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
12090 + screen_info = boot_params.screen_info;
12091 + edid_info = boot_params.edid_info;
12092 +#ifdef CONFIG_X86_32
12093 + apm_info.bios = boot_params.apm_bios_info;
12094 + ist_info = boot_params.ist_info;
12095 + if (boot_params.sys_desc_table.length != 0) {
12096 + set_mca_bus(boot_params.sys_desc_table.table[3] & 0x2);
12097 + machine_id = boot_params.sys_desc_table.table[0];
12098 + machine_submodel_id = boot_params.sys_desc_table.table[1];
12099 + BIOS_revision = boot_params.sys_desc_table.table[2];
12100 + }
12101 +#endif
12102 + saved_video_mode = boot_params.hdr.vid_mode;
12103 + bootloader_type = boot_params.hdr.type_of_loader;
12104 +
12105 +#ifdef CONFIG_BLK_DEV_RAM
12106 + rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
12107 + rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
12108 + rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
12109 +#endif
12110 +#ifdef CONFIG_EFI
12111 + if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
12112 +#ifdef CONFIG_X86_32
12113 + "EL32",
12114 +#else
12115 + "EL64",
12116 #endif
12117 - if (!ptr)
12118 - panic("Cannot allocate cpu data for CPU %d\n", i);
12119 + 4)) {
12120 + efi_enabled = 1;
12121 + efi_reserve_early();
12122 + }
12123 +#endif
12124 +#else /* CONFIG_XEN */
12125 +#ifdef CONFIG_X86_32
12126 + /* This must be initialized to UNNAMED_MAJOR for ipconfig to work
12127 + properly. Setting ROOT_DEV to default to /dev/ram0 breaks initrd.
12128 + */
12129 + ROOT_DEV = MKDEV(UNNAMED_MAJOR,0);
12130 +#else
12131 + ROOT_DEV = MKDEV(RAMDISK_MAJOR,0);
12132 +#endif
12133 + if (is_initial_xendomain()) {
12134 + const struct dom0_vga_console_info *info =
12135 + (void *)((char *)xen_start_info +
12136 + xen_start_info->console.dom0.info_off);
12137 +
12138 + dom0_init_screen_info(info,
12139 + xen_start_info->console.dom0.info_size);
12140 + xen_start_info->console.domU.mfn = 0;
12141 + xen_start_info->console.domU.evtchn = 0;
12142 + } else
12143 + screen_info.orig_video_isVGA = 0;
12144 + copy_edid();
12145 +#endif /* CONFIG_XEN */
12146 +
12147 + ARCH_SETUP
12148 +
12149 + setup_memory_map();
12150 + parse_setup_data();
12151 + /* update the e820_saved too */
12152 + e820_reserve_setup_data();
12153 +
12154 + copy_edd();
12155 +
12156 +#ifndef CONFIG_XEN
12157 + if (!boot_params.hdr.root_flags)
12158 + root_mountflags &= ~MS_RDONLY;
12159 +#endif
12160 + init_mm.start_code = (unsigned long) _text;
12161 + init_mm.end_code = (unsigned long) _etext;
12162 + init_mm.end_data = (unsigned long) _edata;
12163 +#ifdef CONFIG_X86_32
12164 +#ifndef CONFIG_XEN
12165 + init_mm.brk = init_pg_tables_end + PAGE_OFFSET;
12166 +#else
12167 + init_mm.brk = (PFN_UP(__pa(xen_start_info->pt_base)) +
12168 + xen_start_info->nr_pt_frames) << PAGE_SHIFT;
12169 +#endif
12170 +#else
12171 + init_mm.brk = (unsigned long) &_end;
12172 +#endif
12173 +
12174 + code_resource.start = virt_to_phys(_text);
12175 + code_resource.end = virt_to_phys(_etext)-1;
12176 + data_resource.start = virt_to_phys(_etext);
12177 + data_resource.end = virt_to_phys(_edata)-1;
12178 + bss_resource.start = virt_to_phys(&__bss_start);
12179 + bss_resource.end = virt_to_phys(&__bss_stop)-1;
12180 +
12181 + strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
12182 + *cmdline_p = command_line;
12183 +
12184 + parse_early_param();
12185 +
12186 #ifdef CONFIG_X86_64
12187 - cpu_pda(i)->data_offset = ptr - __per_cpu_start;
12188 + check_efer();
12189 +#endif
12190 +
12191 +#if defined(CONFIG_VMI) && defined(CONFIG_X86_32)
12192 + /*
12193 + * Must be before kernel pagetables are setup
12194 + * or fixmap area is touched.
12195 + */
12196 + vmi_init();
12197 +#endif
12198 +
12199 + /* after early param, so could get panic from serial */
12200 + reserve_early_setup_data();
12201 +
12202 + if (acpi_mps_check()) {
12203 +#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN)
12204 + disable_apic = 1;
12205 +#endif
12206 + setup_clear_cpu_cap(X86_FEATURE_APIC);
12207 + }
12208 +
12209 +#ifdef CONFIG_PCI
12210 + if (pci_early_dump_regs)
12211 + early_dump_pci_devices();
12212 +#endif
12213 +
12214 + finish_e820_parsing();
12215 +
12216 +#ifdef CONFIG_X86_32
12217 + probe_roms();
12218 +#endif
12219 +
12220 +#ifndef CONFIG_XEN
12221 + /* after parse_early_param, so could debug it */
12222 + insert_resource(&iomem_resource, &code_resource);
12223 + insert_resource(&iomem_resource, &data_resource);
12224 + insert_resource(&iomem_resource, &bss_resource);
12225 +
12226 + if (efi_enabled)
12227 + efi_init();
12228 +
12229 +#ifdef CONFIG_X86_32
12230 + if (ppro_with_ram_bug()) {
12231 + e820_update_range(0x70000000ULL, 0x40000ULL, E820_RAM,
12232 + E820_RESERVED);
12233 + sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
12234 + printk(KERN_INFO "fixed physical RAM map:\n");
12235 + e820_print_map("bad_ppro");
12236 + }
12237 #else
12238 - __per_cpu_offset[i] = ptr - __per_cpu_start;
12239 + early_gart_iommu_check();
12240 #endif
12241 - memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
12242 +#endif /* CONFIG_XEN */
12243
12244 - highest_cpu = i;
12245 + /*
12246 + * partially used pages are not usable - thus
12247 + * we are rounding upwards:
12248 + */
12249 + max_pfn = e820_end_of_ram_pfn();
12250 +
12251 + /* preallocate 4k for mptable mpc */
12252 + early_reserve_e820_mpc_new();
12253 + /* update e820 for memory not covered by WB MTRRs */
12254 + mtrr_bp_init();
12255 +#ifndef CONFIG_XEN
12256 + if (mtrr_trim_uncached_memory(max_pfn))
12257 + max_pfn = e820_end_of_ram_pfn();
12258 +#endif
12259 +
12260 +#ifdef CONFIG_X86_32
12261 + /* max_low_pfn get updated here */
12262 + find_low_pfn_range();
12263 +#else
12264 + num_physpages = max_pfn;
12265 + max_mapnr = max_pfn;
12266 +
12267 +
12268 + /* How many end-of-memory variables you have, grandma! */
12269 + /* need this before calling reserve_initrd */
12270 + if (max_pfn > (1UL<<(32 - PAGE_SHIFT)))
12271 + max_low_pfn = e820_end_of_low_ram_pfn();
12272 + else
12273 + max_low_pfn = max_pfn;
12274 +
12275 + high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
12276 +#endif
12277 +
12278 + /* max_pfn_mapped is updated here */
12279 + max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
12280 + max_pfn_mapped = max_low_pfn_mapped;
12281 +
12282 +#ifdef CONFIG_X86_64
12283 + if (max_pfn > max_low_pfn) {
12284 + max_pfn_mapped = init_memory_mapping(1UL<<32,
12285 + max_pfn<<PAGE_SHIFT);
12286 + /* can we preseve max_low_pfn ?*/
12287 + max_low_pfn = max_pfn;
12288 }
12289 +#endif
12290
12291 - nr_cpu_ids = highest_cpu + 1;
12292 - printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d\n", NR_CPUS, nr_cpu_ids);
12293 + /*
12294 + * NOTE: On x86-32, only from this point on, fixmaps are ready for use.
12295 + */
12296
12297 - /* Setup percpu data maps */
12298 - setup_per_cpu_maps();
12299 +#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
12300 + if (init_ohci1394_dma_early)
12301 + init_ohci1394_dma_on_all_controllers();
12302 +#endif
12303
12304 - /* Setup cpumask_of_cpu map */
12305 - setup_cpumask_of_cpu();
12306 -}
12307 + reserve_initrd();
12308 +
12309 +#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
12310 + vsmp_init();
12311 +#endif
12312 +
12313 + if (is_initial_xendomain())
12314 + dmi_scan_machine();
12315 +
12316 + io_delay_init();
12317 +
12318 +#ifdef CONFIG_ACPI
12319 + if (!is_initial_xendomain()) {
12320 + printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
12321 + disable_acpi();
12322 + }
12323 +#endif
12324 +
12325 + /*
12326 + * Parse the ACPI tables for possible boot-time SMP configuration.
12327 + */
12328 + acpi_boot_table_init();
12329 +
12330 +#ifdef CONFIG_ACPI_NUMA
12331 + /*
12332 + * Parse SRAT to discover nodes.
12333 + */
12334 + acpi_numa_init();
12335 +#endif
12336 +
12337 + initmem_init(0, max_pfn);
12338
12339 +#ifdef CONFIG_ACPI_SLEEP
12340 + /*
12341 + * Reserve low memory region for sleep support.
12342 + */
12343 + acpi_reserve_bootmem();
12344 #endif
12345 +#ifdef CONFIG_X86_FIND_SMP_CONFIG
12346 + /*
12347 + * Find and reserve possible boot-time SMP configuration:
12348 + */
12349 + find_smp_config();
12350 +#endif
12351 + reserve_crashkernel();
12352 +
12353 +#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
12354 + /*
12355 + * dma32_reserve_bootmem() allocates bootmem which may conflict
12356 + * with the crashkernel command line, so do that after
12357 + * reserve_crashkernel()
12358 + */
12359 + dma32_reserve_bootmem();
12360 +#endif
12361 +
12362 + reserve_ibft_region();
12363 +
12364 +#ifdef CONFIG_KVM_CLOCK
12365 + kvmclock_init();
12366 +#endif
12367 +
12368 + xen_pagetable_setup_start(swapper_pg_dir);
12369 + paging_init();
12370 + xen_pagetable_setup_done(swapper_pg_dir);
12371 + paravirt_post_allocator_init();
12372 +
12373 +#ifdef CONFIG_X86_64
12374 + map_vsyscall();
12375 +#endif
12376 +
12377 +#ifdef CONFIG_XEN
12378 + p2m_pages = max_pfn;
12379 + if (xen_start_info->nr_pages > max_pfn) {
12380 + /*
12381 + * the max_pfn was shrunk (probably by mem= or highmem=
12382 + * kernel parameter); shrink reservation with the HV
12383 + */
12384 + struct xen_memory_reservation reservation = {
12385 + .address_bits = 0,
12386 + .extent_order = 0,
12387 + .domid = DOMID_SELF
12388 + };
12389 + unsigned int difference;
12390 + int ret;
12391 +
12392 + difference = xen_start_info->nr_pages - max_pfn;
12393 +
12394 + set_xen_guest_handle(reservation.extent_start,
12395 + ((unsigned long *)xen_start_info->mfn_list) + max_pfn);
12396 + reservation.nr_extents = difference;
12397 + ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
12398 + &reservation);
12399 + BUG_ON(ret != difference);
12400 + }
12401 + else if (max_pfn > xen_start_info->nr_pages)
12402 + p2m_pages = xen_start_info->nr_pages;
12403 +
12404 + if (!xen_feature(XENFEAT_auto_translated_physmap)) {
12405 + unsigned long i, j;
12406 + unsigned int k, fpp;
12407 +
12408 + /* Make sure we have a large enough P->M table. */
12409 + phys_to_machine_mapping = alloc_bootmem_pages(
12410 + max_pfn * sizeof(unsigned long));
12411 + memset(phys_to_machine_mapping, ~0,
12412 + max_pfn * sizeof(unsigned long));
12413 + memcpy(phys_to_machine_mapping,
12414 + (unsigned long *)xen_start_info->mfn_list,
12415 + p2m_pages * sizeof(unsigned long));
12416 + free_bootmem(
12417 + __pa(xen_start_info->mfn_list),
12418 + PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
12419 + sizeof(unsigned long))));
12420 +
12421 + /*
12422 + * Initialise the list of the frames that specify the list of
12423 + * frames that make up the p2m table. Used by save/restore.
12424 + */
12425 + pfn_to_mfn_frame_list_list = alloc_bootmem_pages(PAGE_SIZE);
12426 +
12427 + fpp = PAGE_SIZE/sizeof(unsigned long);
12428 + for (i = j = 0, k = -1; i < max_pfn; i += fpp, j++) {
12429 + if (j == fpp)
12430 + j = 0;
12431 + if (j == 0) {
12432 + k++;
12433 + BUG_ON(k>=ARRAY_SIZE(pfn_to_mfn_frame_list));
12434 + pfn_to_mfn_frame_list[k] =
12435 + alloc_bootmem_pages(PAGE_SIZE);
12436 + pfn_to_mfn_frame_list_list[k] =
12437 + virt_to_mfn(pfn_to_mfn_frame_list[k]);
12438 + }
12439 + pfn_to_mfn_frame_list[k][j] =
12440 + virt_to_mfn(&phys_to_machine_mapping[i]);
12441 + }
12442 + HYPERVISOR_shared_info->arch.max_pfn = max_pfn;
12443 + HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
12444 + virt_to_mfn(pfn_to_mfn_frame_list_list);
12445 + }
12446 +
12447 + /* Mark all ISA DMA channels in-use - using them wouldn't work. */
12448 + for (i = 0; i < MAX_DMA_CHANNELS; ++i)
12449 + if (i != 4 && request_dma(i, "xen") != 0)
12450 + BUG();
12451 +#endif /* CONFIG_XEN */
12452 +
12453 +#ifdef CONFIG_X86_GENERICARCH
12454 + generic_apic_probe();
12455 +#endif
12456 +
12457 +#ifndef CONFIG_XEN
12458 + early_quirks();
12459 +#endif
12460 +
12461 + /*
12462 + * Read APIC and some other early information from ACPI tables.
12463 + */
12464 + acpi_boot_init();
12465 +
12466 +#if defined(CONFIG_X86_MPPARSE) || defined(CONFIG_X86_VISWS)
12467 + /*
12468 + * get boot-time SMP configuration:
12469 + */
12470 + if (smp_found_config)
12471 + get_smp_config();
12472 +#endif
12473 +
12474 + prefill_possible_map();
12475 +#ifdef CONFIG_X86_64
12476 + init_cpu_to_node();
12477 +#endif
12478 +
12479 +#ifndef CONFIG_XEN
12480 + init_apic_mappings();
12481 + ioapic_init_mappings();
12482 +
12483 + kvm_guest_init();
12484 +
12485 + e820_reserve_resources();
12486 + e820_mark_nosave_regions(max_low_pfn);
12487 +#else
12488 + if (is_initial_xendomain())
12489 + e820_reserve_resources();
12490 +#endif
12491 +
12492 +#ifdef CONFIG_X86_32
12493 + request_resource(&iomem_resource, &video_ram_resource);
12494 +#endif
12495 + reserve_standard_io_resources();
12496 +
12497 +#ifndef CONFIG_XEN
12498 + e820_setup_gap();
12499 +
12500 +#ifdef CONFIG_VT
12501 +#if defined(CONFIG_VGA_CONSOLE)
12502 + if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
12503 + conswitchp = &vga_con;
12504 +#elif defined(CONFIG_DUMMY_CONSOLE)
12505 + conswitchp = &dummy_con;
12506 +#endif
12507 +#endif
12508 +#else /* CONFIG_XEN */
12509 + if (is_initial_xendomain())
12510 + e820_setup_gap();
12511 +
12512 + set_iopl.iopl = 1;
12513 + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
12514 +
12515 +#ifdef CONFIG_VT
12516 +#ifdef CONFIG_DUMMY_CONSOLE
12517 + conswitchp = &dummy_con;
12518 +#endif
12519 +#ifdef CONFIG_VGA_CONSOLE
12520 + if (is_initial_xendomain())
12521 + conswitchp = &vga_con;
12522 +#endif
12523 +#endif
12524 +#endif /* CONFIG_XEN */
12525 +}
12526 +
12527 +#ifdef CONFIG_XEN
12528 +static int
12529 +xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
12530 +{
12531 + HYPERVISOR_shutdown(SHUTDOWN_crash);
12532 + /* we're never actually going to get here... */
12533 + return NOTIFY_DONE;
12534 +}
12535 +#endif /* CONFIG_XEN */
12536 --- sle11-2009-06-04.orig/arch/x86/kernel/setup64-xen.c 2009-03-16 16:38:05.000000000 +0100
12537 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
12538 @@ -1,370 +0,0 @@
12539 -/*
12540 - * X86-64 specific CPU setup.
12541 - * Copyright (C) 1995 Linus Torvalds
12542 - * Copyright 2001, 2002, 2003 SuSE Labs / Andi Kleen.
12543 - * See setup.c for older changelog.
12544 - *
12545 - * Jun Nakajima <jun.nakajima@intel.com>
12546 - * Modified for Xen
12547 - *
12548 - */
12549 -#include <linux/init.h>
12550 -#include <linux/kernel.h>
12551 -#include <linux/sched.h>
12552 -#include <linux/string.h>
12553 -#include <linux/bootmem.h>
12554 -#include <linux/bitops.h>
12555 -#include <linux/module.h>
12556 -#include <linux/kgdb.h>
12557 -#include <asm/pda.h>
12558 -#include <asm/pgtable.h>
12559 -#include <asm/processor.h>
12560 -#include <asm/desc.h>
12561 -#include <asm/atomic.h>
12562 -#include <asm/mmu_context.h>
12563 -#include <asm/smp.h>
12564 -#include <asm/i387.h>
12565 -#include <asm/percpu.h>
12566 -#include <asm/proto.h>
12567 -#include <asm/sections.h>
12568 -#include <asm/setup.h>
12569 -#include <asm/genapic.h>
12570 -#ifdef CONFIG_XEN
12571 -#include <asm/hypervisor.h>
12572 -#endif
12573 -
12574 -#ifndef CONFIG_DEBUG_BOOT_PARAMS
12575 -struct boot_params __initdata boot_params;
12576 -#else
12577 -struct boot_params boot_params;
12578 -#endif
12579 -
12580 -cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
12581 -
12582 -struct x8664_pda *_cpu_pda[NR_CPUS] __read_mostly;
12583 -EXPORT_SYMBOL(_cpu_pda);
12584 -struct x8664_pda boot_cpu_pda[NR_CPUS] __cacheline_aligned;
12585 -
12586 -#ifndef CONFIG_X86_NO_IDT
12587 -struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
12588 -#endif
12589 -
12590 -char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned")));
12591 -
12592 -unsigned long __supported_pte_mask __read_mostly = ~0UL;
12593 -EXPORT_SYMBOL(__supported_pte_mask);
12594 -
12595 -static int do_not_nx __cpuinitdata = 0;
12596 -
12597 -/* noexec=on|off
12598 -Control non executable mappings for 64bit processes.
12599 -
12600 -on Enable(default)
12601 -off Disable
12602 -*/
12603 -static int __init nonx_setup(char *str)
12604 -{
12605 - if (!str)
12606 - return -EINVAL;
12607 - if (!strncmp(str, "on", 2)) {
12608 - __supported_pte_mask |= _PAGE_NX;
12609 - do_not_nx = 0;
12610 - } else if (!strncmp(str, "off", 3)) {
12611 - do_not_nx = 1;
12612 - __supported_pte_mask &= ~_PAGE_NX;
12613 - }
12614 - return 0;
12615 -}
12616 -early_param("noexec", nonx_setup);
12617 -
12618 -int force_personality32 = 0;
12619 -
12620 -/* noexec32=on|off
12621 -Control non executable heap for 32bit processes.
12622 -To control the stack too use noexec=off
12623 -
12624 -on PROT_READ does not imply PROT_EXEC for 32bit processes (default)
12625 -off PROT_READ implies PROT_EXEC
12626 -*/
12627 -static int __init nonx32_setup(char *str)
12628 -{
12629 - if (!strcmp(str, "on"))
12630 - force_personality32 &= ~READ_IMPLIES_EXEC;
12631 - else if (!strcmp(str, "off"))
12632 - force_personality32 |= READ_IMPLIES_EXEC;
12633 - return 1;
12634 -}
12635 -__setup("noexec32=", nonx32_setup);
12636 -
12637 -#ifdef CONFIG_XEN
12638 -static void __init_refok switch_pt(int cpu)
12639 -{
12640 - if (cpu == 0)
12641 - xen_init_pt();
12642 - xen_pt_switch(__pa_symbol(init_level4_pgt));
12643 - xen_new_user_pt(__pa_symbol(__user_pgd(init_level4_pgt)));
12644 -}
12645 -#define switch_pt() switch_pt(cpu)
12646 -
12647 -static void __cpuinit cpu_gdt_init(const struct desc_ptr *gdt_descr)
12648 -{
12649 - unsigned long frames[16];
12650 - unsigned long va;
12651 - int f;
12652 -
12653 - for (va = gdt_descr->address, f = 0;
12654 - va < gdt_descr->address + gdt_descr->size;
12655 - va += PAGE_SIZE, f++) {
12656 - frames[f] = virt_to_mfn(va);
12657 - make_page_readonly(
12658 - (void *)va, XENFEAT_writable_descriptor_tables);
12659 - }
12660 - if (HYPERVISOR_set_gdt(frames, (gdt_descr->size + 1) /
12661 - sizeof (struct desc_struct)))
12662 - BUG();
12663 -}
12664 -#else
12665 -static void switch_pt(void)
12666 -{
12667 - asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt)));
12668 -}
12669 -
12670 -static void __cpuinit cpu_gdt_init(const struct desc_ptr *gdt_descr)
12671 -{
12672 - load_gdt(gdt_descr);
12673 - load_idt(idt_descr);
12674 -}
12675 -#endif
12676 -
12677 -void pda_init(int cpu)
12678 -{
12679 - struct x8664_pda *pda = cpu_pda(cpu);
12680 -
12681 - /* Setup up data that may be needed in __get_free_pages early */
12682 - asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0));
12683 -#ifndef CONFIG_XEN
12684 - /* Memory clobbers used to order PDA accessed */
12685 - mb();
12686 - wrmsrl(MSR_GS_BASE, pda);
12687 - mb();
12688 -#else
12689 - if (HYPERVISOR_set_segment_base(SEGBASE_GS_KERNEL,
12690 - (unsigned long)pda))
12691 - BUG();
12692 -#endif
12693 - pda->cpunumber = cpu;
12694 - pda->irqcount = -1;
12695 - pda->kernelstack =
12696 - (unsigned long)stack_thread_info() - PDA_STACKOFFSET + THREAD_SIZE;
12697 - pda->active_mm = &init_mm;
12698 - pda->mmu_state = 0;
12699 -
12700 - if (cpu == 0) {
12701 - /* others are initialized in smpboot.c */
12702 - pda->pcurrent = &init_task;
12703 - pda->irqstackptr = boot_cpu_stack;
12704 - } else {
12705 - pda->irqstackptr = (char *)
12706 - __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
12707 - if (!pda->irqstackptr)
12708 - panic("cannot allocate irqstack for cpu %d", cpu);
12709 - }
12710 -
12711 - switch_pt();
12712 -
12713 - pda->irqstackptr += IRQSTACKSIZE-64;
12714 -}
12715 -
12716 -#ifndef CONFIG_X86_NO_TSS
12717 -char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]
12718 -__attribute__((section(".bss.page_aligned")));
12719 -#endif
12720 -
12721 -extern asmlinkage void ignore_sysret(void);
12722 -
12723 -/* May not be marked __init: used by software suspend */
12724 -void syscall_init(void)
12725 -{
12726 -#ifndef CONFIG_XEN
12727 - /*
12728 - * LSTAR and STAR live in a bit strange symbiosis.
12729 - * They both write to the same internal register. STAR allows to set CS/DS
12730 - * but only a 32bit target. LSTAR sets the 64bit rip.
12731 - */
12732 - wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
12733 - wrmsrl(MSR_LSTAR, system_call);
12734 - wrmsrl(MSR_CSTAR, ignore_sysret);
12735 -
12736 - /* Flags to clear on syscall */
12737 - wrmsrl(MSR_SYSCALL_MASK,
12738 - X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
12739 -#endif
12740 -#ifdef CONFIG_IA32_EMULATION
12741 - syscall32_cpu_init ();
12742 -#else
12743 - {
12744 - static const struct callback_register cstar = {
12745 - .type = CALLBACKTYPE_syscall32,
12746 - .address = (unsigned long)ignore_sysret
12747 - };
12748 - if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar))
12749 - printk(KERN_WARN "Unable to register CSTAR callback\n");
12750 - }
12751 -#endif
12752 -}
12753 -
12754 -void __cpuinit check_efer(void)
12755 -{
12756 - unsigned long efer;
12757 -
12758 - rdmsrl(MSR_EFER, efer);
12759 - if (!(efer & EFER_NX) || do_not_nx) {
12760 - __supported_pte_mask &= ~_PAGE_NX;
12761 - }
12762 -}
12763 -
12764 -unsigned long kernel_eflags;
12765 -
12766 -#ifndef CONFIG_X86_NO_TSS
12767 -/*
12768 - * Copies of the original ist values from the tss are only accessed during
12769 - * debugging, no special alignment required.
12770 - */
12771 -DEFINE_PER_CPU(struct orig_ist, orig_ist);
12772 -#endif
12773 -
12774 -/*
12775 - * cpu_init() initializes state that is per-CPU. Some data is already
12776 - * initialized (naturally) in the bootstrap process, such as the GDT
12777 - * and IDT. We reload them nevertheless, this function acts as a
12778 - * 'CPU state barrier', nothing should get across.
12779 - * A lot of state is already set up in PDA init.
12780 - */
12781 -void __cpuinit cpu_init (void)
12782 -{
12783 - int cpu = stack_smp_processor_id();
12784 -#ifndef CONFIG_X86_NO_TSS
12785 - struct tss_struct *t = &per_cpu(init_tss, cpu);
12786 - struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
12787 - unsigned long v;
12788 - char *estacks = NULL;
12789 - unsigned i;
12790 -#endif
12791 - struct task_struct *me;
12792 -
12793 - /* CPU 0 is initialised in head64.c */
12794 - if (cpu != 0) {
12795 - pda_init(cpu);
12796 - }
12797 -#ifndef CONFIG_X86_NO_TSS
12798 - else
12799 - estacks = boot_exception_stacks;
12800 -#endif
12801 -
12802 - me = current;
12803 -
12804 - if (cpu_test_and_set(cpu, cpu_initialized))
12805 - panic("CPU#%d already initialized!\n", cpu);
12806 -
12807 - printk("Initializing CPU#%d\n", cpu);
12808 -
12809 - clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
12810 -
12811 - /*
12812 - * Initialize the per-CPU GDT with the boot GDT,
12813 - * and set up the GDT descriptor:
12814 - */
12815 -#ifndef CONFIG_XEN
12816 - if (cpu)
12817 - memcpy(get_cpu_gdt_table(cpu), cpu_gdt_table, GDT_SIZE);
12818 -#endif
12819 -
12820 - cpu_gdt_descr[cpu].size = GDT_SIZE;
12821 - cpu_gdt_init(&cpu_gdt_descr[cpu]);
12822 -
12823 - memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
12824 - syscall_init();
12825 -
12826 - wrmsrl(MSR_FS_BASE, 0);
12827 - wrmsrl(MSR_KERNEL_GS_BASE, 0);
12828 - barrier();
12829 -
12830 - check_efer();
12831 -
12832 -#ifndef CONFIG_X86_NO_TSS
12833 - /*
12834 - * set up and load the per-CPU TSS
12835 - */
12836 - for (v = 0; v < N_EXCEPTION_STACKS; v++) {
12837 - static const unsigned int order[N_EXCEPTION_STACKS] = {
12838 - [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
12839 - [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
12840 - };
12841 - if (cpu) {
12842 - estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
12843 - if (!estacks)
12844 - panic("Cannot allocate exception stack %ld %d\n",
12845 - v, cpu);
12846 - }
12847 - estacks += PAGE_SIZE << order[v];
12848 - orig_ist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks;
12849 - }
12850 -
12851 - t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
12852 - /*
12853 - * <= is required because the CPU will access up to
12854 - * 8 bits beyond the end of the IO permission bitmap.
12855 - */
12856 - for (i = 0; i <= IO_BITMAP_LONGS; i++)
12857 - t->io_bitmap[i] = ~0UL;
12858 -#endif
12859 -
12860 - atomic_inc(&init_mm.mm_count);
12861 - me->active_mm = &init_mm;
12862 - if (me->mm)
12863 - BUG();
12864 - enter_lazy_tlb(&init_mm, me);
12865 -
12866 -#ifndef CONFIG_X86_NO_TSS
12867 - set_tss_desc(cpu, t);
12868 -#endif
12869 -#ifndef CONFIG_XEN
12870 - load_TR_desc();
12871 -#endif
12872 - load_LDT(&init_mm.context);
12873 -
12874 -#ifdef CONFIG_KGDB
12875 - /*
12876 - * If the kgdb is connected no debug regs should be altered. This
12877 - * is only applicable when KGDB and a KGDB I/O module are built
12878 - * into the kernel and you are using early debugging with
12879 - * kgdbwait. KGDB will control the kernel HW breakpoint registers.
12880 - */
12881 - if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
12882 - arch_kgdb_ops.correct_hw_break();
12883 - else {
12884 -#endif
12885 - /*
12886 - * Clear all 6 debug registers:
12887 - */
12888 -
12889 - set_debugreg(0UL, 0);
12890 - set_debugreg(0UL, 1);
12891 - set_debugreg(0UL, 2);
12892 - set_debugreg(0UL, 3);
12893 - set_debugreg(0UL, 6);
12894 - set_debugreg(0UL, 7);
12895 -#ifdef CONFIG_KGDB
12896 - /* If the kgdb is connected no debug regs should be altered. */
12897 - }
12898 -#endif
12899 -
12900 - fpu_init();
12901 -
12902 - asm ("pushfq; popq %0" : "=rm" (kernel_eflags));
12903 - if (raw_irqs_disabled())
12904 - kernel_eflags &= ~X86_EFLAGS_IF;
12905 -
12906 - if (is_uv_system())
12907 - uv_cpu_init();
12908 -}
12909 --- sle11-2009-06-04.orig/arch/x86/kernel/setup_32-xen.c 2009-03-16 16:38:05.000000000 +0100
12910 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
12911 @@ -1,1151 +0,0 @@
12912 -/*
12913 - * Copyright (C) 1995 Linus Torvalds
12914 - *
12915 - * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
12916 - *
12917 - * Memory region support
12918 - * David Parsons <orc@pell.chi.il.us>, July-August 1999
12919 - *
12920 - * Added E820 sanitization routine (removes overlapping memory regions);
12921 - * Brian Moyle <bmoyle@mvista.com>, February 2001
12922 - *
12923 - * Moved CPU detection code to cpu/${cpu}.c
12924 - * Patrick Mochel <mochel@osdl.org>, March 2002
12925 - *
12926 - * Provisions for empty E820 memory regions (reported by certain BIOSes).
12927 - * Alex Achenbach <xela@slit.de>, December 2002.
12928 - *
12929 - */
12930 -
12931 -/*
12932 - * This file handles the architecture-dependent parts of initialization
12933 - */
12934 -
12935 -#include <linux/sched.h>
12936 -#include <linux/mm.h>
12937 -#include <linux/mmzone.h>
12938 -#include <linux/screen_info.h>
12939 -#include <linux/ioport.h>
12940 -#include <linux/acpi.h>
12941 -#include <linux/apm_bios.h>
12942 -#include <linux/initrd.h>
12943 -#include <linux/bootmem.h>
12944 -#include <linux/seq_file.h>
12945 -#include <linux/console.h>
12946 -#include <linux/mca.h>
12947 -#include <linux/root_dev.h>
12948 -#include <linux/highmem.h>
12949 -#include <linux/module.h>
12950 -#include <linux/efi.h>
12951 -#include <linux/init.h>
12952 -#include <linux/edd.h>
12953 -#include <linux/iscsi_ibft.h>
12954 -#include <linux/nodemask.h>
12955 -#include <linux/kernel.h>
12956 -#include <linux/percpu.h>
12957 -#include <linux/notifier.h>
12958 -#include <linux/kexec.h>
12959 -#include <linux/crash_dump.h>
12960 -#include <linux/dmi.h>
12961 -#include <linux/pfn.h>
12962 -#include <linux/pci.h>
12963 -#include <linux/init_ohci1394_dma.h>
12964 -#include <linux/kvm_para.h>
12965 -
12966 -#include <video/edid.h>
12967 -
12968 -#include <asm/mtrr.h>
12969 -#include <asm/apic.h>
12970 -#include <asm/e820.h>
12971 -#include <asm/mpspec.h>
12972 -#include <asm/mmzone.h>
12973 -#include <asm/setup.h>
12974 -#include <asm/arch_hooks.h>
12975 -#include <asm/sections.h>
12976 -#include <asm/io_apic.h>
12977 -#include <asm/ist.h>
12978 -#include <asm/io.h>
12979 -#include <asm/hypervisor.h>
12980 -#include <xen/interface/physdev.h>
12981 -#include <xen/interface/memory.h>
12982 -#include <xen/features.h>
12983 -#include <xen/firmware.h>
12984 -#include <xen/xencons.h>
12985 -#include <setup_arch.h>
12986 -#include <asm/bios_ebda.h>
12987 -#include <asm/cacheflush.h>
12988 -#include <asm/processor.h>
12989 -
12990 -#ifdef CONFIG_XEN
12991 -#include <xen/interface/kexec.h>
12992 -#endif
12993 -
12994 -static int xen_panic_event(struct notifier_block *, unsigned long, void *);
12995 -static struct notifier_block xen_panic_block = {
12996 - xen_panic_event, NULL, 0 /* try to go last */
12997 -};
12998 -
12999 -/*
13000 - * Machine setup..
13001 - */
13002 -static struct resource data_resource = {
13003 - .name = "Kernel data",
13004 - .start = 0,
13005 - .end = 0,
13006 - .flags = IORESOURCE_BUSY | IORESOURCE_MEM
13007 -};
13008 -
13009 -static struct resource code_resource = {
13010 - .name = "Kernel code",
13011 - .start = 0,
13012 - .end = 0,
13013 - .flags = IORESOURCE_BUSY | IORESOURCE_MEM
13014 -};
13015 -
13016 -static struct resource bss_resource = {
13017 - .name = "Kernel bss",
13018 - .start = 0,
13019 - .end = 0,
13020 - .flags = IORESOURCE_BUSY | IORESOURCE_MEM
13021 -};
13022 -
13023 -static struct resource video_ram_resource = {
13024 - .name = "Video RAM area",
13025 - .start = 0xa0000,
13026 - .end = 0xbffff,
13027 - .flags = IORESOURCE_BUSY | IORESOURCE_MEM
13028 -};
13029 -
13030 -static struct resource standard_io_resources[] = { {
13031 - .name = "dma1",
13032 - .start = 0x0000,
13033 - .end = 0x001f,
13034 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
13035 -}, {
13036 - .name = "pic1",
13037 - .start = 0x0020,
13038 - .end = 0x0021,
13039 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
13040 -}, {
13041 - .name = "timer0",
13042 - .start = 0x0040,
13043 - .end = 0x0043,
13044 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
13045 -}, {
13046 - .name = "timer1",
13047 - .start = 0x0050,
13048 - .end = 0x0053,
13049 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
13050 -}, {
13051 - .name = "keyboard",
13052 - .start = 0x0060,
13053 - .end = 0x0060,
13054 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
13055 -}, {
13056 - .name = "keyboard",
13057 - .start = 0x0064,
13058 - .end = 0x0064,
13059 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
13060 -}, {
13061 - .name = "dma page reg",
13062 - .start = 0x0080,
13063 - .end = 0x008f,
13064 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
13065 -}, {
13066 - .name = "pic2",
13067 - .start = 0x00a0,
13068 - .end = 0x00a1,
13069 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
13070 -}, {
13071 - .name = "dma2",
13072 - .start = 0x00c0,
13073 - .end = 0x00df,
13074 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
13075 -}, {
13076 - .name = "fpu",
13077 - .start = 0x00f0,
13078 - .end = 0x00ff,
13079 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
13080 -} };
13081 -
13082 -/* cpu data as detected by the assembly code in head.S */
13083 -struct cpuinfo_x86 new_cpu_data __cpuinitdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
13084 -/* common cpu data for all cpus */
13085 -struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
13086 -EXPORT_SYMBOL(boot_cpu_data);
13087 -
13088 -unsigned int def_to_bigsmp;
13089 -
13090 -#ifndef CONFIG_X86_PAE
13091 -unsigned long mmu_cr4_features;
13092 -#else
13093 -unsigned long mmu_cr4_features = X86_CR4_PAE;
13094 -#endif
13095 -
13096 -/* for MCA, but anyone else can use it if they want */
13097 -unsigned int machine_id;
13098 -unsigned int machine_submodel_id;
13099 -unsigned int BIOS_revision;
13100 -
13101 -/* Boot loader ID as an integer, for the benefit of proc_dointvec */
13102 -int bootloader_type;
13103 -
13104 -/* user-defined highmem size */
13105 -static unsigned int highmem_pages = -1;
13106 -
13107 -/*
13108 - * Setup options
13109 - */
13110 -struct screen_info screen_info;
13111 -EXPORT_SYMBOL(screen_info);
13112 -struct apm_info apm_info;
13113 -EXPORT_SYMBOL(apm_info);
13114 -struct edid_info edid_info;
13115 -EXPORT_SYMBOL_GPL(edid_info);
13116 -#ifndef CONFIG_XEN
13117 -#define copy_edid() (edid_info = boot_params.edid_info)
13118 -#endif
13119 -struct ist_info ist_info;
13120 -#if defined(CONFIG_X86_SPEEDSTEP_SMI) || \
13121 - defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
13122 -EXPORT_SYMBOL(ist_info);
13123 -#endif
13124 -
13125 -extern void early_cpu_init(void);
13126 -extern int root_mountflags;
13127 -
13128 -unsigned long saved_video_mode;
13129 -
13130 -#define RAMDISK_IMAGE_START_MASK 0x07FF
13131 -#define RAMDISK_PROMPT_FLAG 0x8000
13132 -#define RAMDISK_LOAD_FLAG 0x4000
13133 -
13134 -static char __initdata command_line[COMMAND_LINE_SIZE];
13135 -
13136 -#ifndef CONFIG_DEBUG_BOOT_PARAMS
13137 -struct boot_params __initdata boot_params;
13138 -#else
13139 -struct boot_params boot_params;
13140 -#endif
13141 -
13142 -/*
13143 - * Point at the empty zero page to start with. We map the real shared_info
13144 - * page as soon as fixmap is up and running.
13145 - */
13146 -shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
13147 -EXPORT_SYMBOL(HYPERVISOR_shared_info);
13148 -
13149 -unsigned long *phys_to_machine_mapping;
13150 -unsigned long *pfn_to_mfn_frame_list_list, *pfn_to_mfn_frame_list[16];
13151 -EXPORT_SYMBOL(phys_to_machine_mapping);
13152 -
13153 -/* Raw start-of-day parameters from the hypervisor. */
13154 -start_info_t *xen_start_info;
13155 -EXPORT_SYMBOL(xen_start_info);
13156 -
13157 -#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
13158 -struct edd edd;
13159 -#ifdef CONFIG_EDD_MODULE
13160 -EXPORT_SYMBOL(edd);
13161 -#endif
13162 -#ifndef CONFIG_XEN
13163 -/**
13164 - * copy_edd() - Copy the BIOS EDD information
13165 - * from boot_params into a safe place.
13166 - *
13167 - */
13168 -static inline void copy_edd(void)
13169 -{
13170 - memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer,
13171 - sizeof(edd.mbr_signature));
13172 - memcpy(edd.edd_info, boot_params.eddbuf, sizeof(edd.edd_info));
13173 - edd.mbr_signature_nr = boot_params.edd_mbr_sig_buf_entries;
13174 - edd.edd_info_nr = boot_params.eddbuf_entries;
13175 -}
13176 -#endif
13177 -#else
13178 -static inline void copy_edd(void)
13179 -{
13180 -}
13181 -#endif
13182 -
13183 -int __initdata user_defined_memmap;
13184 -
13185 -/*
13186 - * "mem=nopentium" disables the 4MB page tables.
13187 - * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM
13188 - * to <mem>, overriding the bios size.
13189 - * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from
13190 - * <start> to <start>+<mem>, overriding the bios size.
13191 - *
13192 - * HPA tells me bootloaders need to parse mem=, so no new
13193 - * option should be mem= [also see Documentation/i386/boot.txt]
13194 - */
13195 -static int __init parse_mem(char *arg)
13196 -{
13197 - if (!arg)
13198 - return -EINVAL;
13199 -
13200 - if (strcmp(arg, "nopentium") == 0) {
13201 - setup_clear_cpu_cap(X86_FEATURE_PSE);
13202 - } else {
13203 - /* If the user specifies memory size, we
13204 - * limit the BIOS-provided memory map to
13205 - * that size. exactmap can be used to specify
13206 - * the exact map. mem=number can be used to
13207 - * trim the existing memory map.
13208 - */
13209 - unsigned long long mem_size;
13210 -
13211 - mem_size = memparse(arg, &arg);
13212 - limit_regions(mem_size);
13213 - user_defined_memmap = 1;
13214 - }
13215 - return 0;
13216 -}
13217 -early_param("mem", parse_mem);
13218 -
13219 -#ifdef CONFIG_PROC_VMCORE
13220 -/* elfcorehdr= specifies the location of elf core header
13221 - * stored by the crashed kernel.
13222 - */
13223 -static int __init parse_elfcorehdr(char *arg)
13224 -{
13225 - if (!arg)
13226 - return -EINVAL;
13227 -
13228 - elfcorehdr_addr = memparse(arg, &arg);
13229 - return 0;
13230 -}
13231 -early_param("elfcorehdr", parse_elfcorehdr);
13232 -#endif /* CONFIG_PROC_VMCORE */
13233 -
13234 -/*
13235 - * highmem=size forces highmem to be exactly 'size' bytes.
13236 - * This works even on boxes that have no highmem otherwise.
13237 - * This also works to reduce highmem size on bigger boxes.
13238 - */
13239 -static int __init parse_highmem(char *arg)
13240 -{
13241 - if (!arg)
13242 - return -EINVAL;
13243 -
13244 - highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT;
13245 - return 0;
13246 -}
13247 -early_param("highmem", parse_highmem);
13248 -
13249 -/*
13250 - * vmalloc=size forces the vmalloc area to be exactly 'size'
13251 - * bytes. This can be used to increase (or decrease) the
13252 - * vmalloc area - the default is 128m.
13253 - */
13254 -static int __init parse_vmalloc(char *arg)
13255 -{
13256 - if (!arg)
13257 - return -EINVAL;
13258 -
13259 - __VMALLOC_RESERVE = memparse(arg, &arg);
13260 - return 0;
13261 -}
13262 -early_param("vmalloc", parse_vmalloc);
13263 -
13264 -#ifndef CONFIG_XEN
13265 -/*
13266 - * reservetop=size reserves a hole at the top of the kernel address space which
13267 - * a hypervisor can load into later. Needed for dynamically loaded hypervisors,
13268 - * so relocating the fixmap can be done before paging initialization.
13269 - */
13270 -static int __init parse_reservetop(char *arg)
13271 -{
13272 - unsigned long address;
13273 -
13274 - if (!arg)
13275 - return -EINVAL;
13276 -
13277 - address = memparse(arg, &arg);
13278 - reserve_top_address(address);
13279 - return 0;
13280 -}
13281 -early_param("reservetop", parse_reservetop);
13282 -#endif
13283 -
13284 -/*
13285 - * Determine low and high memory ranges:
13286 - */
13287 -unsigned long __init find_max_low_pfn(void)
13288 -{
13289 - unsigned long max_low_pfn;
13290 -
13291 - max_low_pfn = max_pfn;
13292 - if (max_low_pfn > MAXMEM_PFN) {
13293 - if (highmem_pages == -1)
13294 - highmem_pages = max_pfn - MAXMEM_PFN;
13295 - if (highmem_pages + MAXMEM_PFN < max_pfn)
13296 - max_pfn = MAXMEM_PFN + highmem_pages;
13297 - if (highmem_pages + MAXMEM_PFN > max_pfn) {
13298 - printk("only %luMB highmem pages available, ignoring highmem size of %uMB.\n", pages_to_mb(max_pfn - MAXMEM_PFN), pages_to_mb(highmem_pages));
13299 - highmem_pages = 0;
13300 - }
13301 - max_low_pfn = MAXMEM_PFN;
13302 -#ifndef CONFIG_HIGHMEM
13303 - /* Maximum memory usable is what is directly addressable */
13304 - printk(KERN_WARNING "Warning only %ldMB will be used.\n",
13305 - MAXMEM>>20);
13306 - if (max_pfn > MAX_NONPAE_PFN)
13307 - printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
13308 - else
13309 - printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
13310 - max_pfn = MAXMEM_PFN;
13311 -#else /* !CONFIG_HIGHMEM */
13312 -#ifndef CONFIG_HIGHMEM64G
13313 - if (max_pfn > MAX_NONPAE_PFN) {
13314 - max_pfn = MAX_NONPAE_PFN;
13315 - printk(KERN_WARNING "Warning only 4GB will be used.\n");
13316 - printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
13317 - }
13318 -#endif /* !CONFIG_HIGHMEM64G */
13319 -#endif /* !CONFIG_HIGHMEM */
13320 - } else {
13321 - if (highmem_pages == -1)
13322 - highmem_pages = 0;
13323 -#ifdef CONFIG_HIGHMEM
13324 - if (highmem_pages >= max_pfn) {
13325 - printk(KERN_ERR "highmem size specified (%uMB) is bigger than pages available (%luMB)!.\n", pages_to_mb(highmem_pages), pages_to_mb(max_pfn));
13326 - highmem_pages = 0;
13327 - }
13328 - if (highmem_pages) {
13329 - if (max_low_pfn-highmem_pages < 64*1024*1024/PAGE_SIZE){
13330 - printk(KERN_ERR "highmem size %uMB results in smaller than 64MB lowmem, ignoring it.\n", pages_to_mb(highmem_pages));
13331 - highmem_pages = 0;
13332 - }
13333 - max_low_pfn -= highmem_pages;
13334 - }
13335 -#else
13336 - if (highmem_pages)
13337 - printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n");
13338 -#endif
13339 - }
13340 - return max_low_pfn;
13341 -}
13342 -
13343 -#ifndef CONFIG_XEN
13344 -#define BIOS_LOWMEM_KILOBYTES 0x413
13345 -
13346 -/*
13347 - * The BIOS places the EBDA/XBDA at the top of conventional
13348 - * memory, and usually decreases the reported amount of
13349 - * conventional memory (int 0x12) too. This also contains a
13350 - * workaround for Dell systems that neglect to reserve EBDA.
13351 - * The same workaround also avoids a problem with the AMD768MPX
13352 - * chipset: reserve a page before VGA to prevent PCI prefetch
13353 - * into it (errata #56). Usually the page is reserved anyways,
13354 - * unless you have no PS/2 mouse plugged in.
13355 - */
13356 -static void __init reserve_ebda_region(void)
13357 -{
13358 - unsigned int lowmem, ebda_addr;
13359 -
13360 - /* To determine the position of the EBDA and the */
13361 - /* end of conventional memory, we need to look at */
13362 - /* the BIOS data area. In a paravirtual environment */
13363 - /* that area is absent. We'll just have to assume */
13364 - /* that the paravirt case can handle memory setup */
13365 - /* correctly, without our help. */
13366 - if (paravirt_enabled())
13367 - return;
13368 -
13369 - /* end of low (conventional) memory */
13370 - lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
13371 - lowmem <<= 10;
13372 -
13373 - /* start of EBDA area */
13374 - ebda_addr = get_bios_ebda();
13375 -
13376 - /* Fixup: bios puts an EBDA in the top 64K segment */
13377 - /* of conventional memory, but does not adjust lowmem. */
13378 - if ((lowmem - ebda_addr) <= 0x10000)
13379 - lowmem = ebda_addr;
13380 -
13381 - /* Fixup: bios does not report an EBDA at all. */
13382 - /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
13383 - if ((ebda_addr == 0) && (lowmem >= 0x9f000))
13384 - lowmem = 0x9f000;
13385 -
13386 - /* Paranoia: should never happen, but... */
13387 - if ((lowmem == 0) || (lowmem >= 0x100000))
13388 - lowmem = 0x9f000;
13389 -
13390 - /* reserve all memory between lowmem and the 1MB mark */
13391 - reserve_bootmem(lowmem, 0x100000 - lowmem, BOOTMEM_DEFAULT);
13392 -}
13393 -#endif
13394 -
13395 -#ifndef CONFIG_NEED_MULTIPLE_NODES
13396 -static void __init setup_bootmem_allocator(void);
13397 -static unsigned long __init setup_memory(void)
13398 -{
13399 - /*
13400 - * partially used pages are not usable - thus
13401 - * we are rounding upwards:
13402 - */
13403 - min_low_pfn = PFN_UP(__pa(xen_start_info->pt_base)) +
13404 - xen_start_info->nr_pt_frames;
13405 -
13406 - max_low_pfn = find_max_low_pfn();
13407 -
13408 -#ifdef CONFIG_HIGHMEM
13409 - highstart_pfn = highend_pfn = max_pfn;
13410 - if (max_pfn > max_low_pfn) {
13411 - highstart_pfn = max_low_pfn;
13412 - }
13413 - printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
13414 - pages_to_mb(highend_pfn - highstart_pfn));
13415 - num_physpages = highend_pfn;
13416 - high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
13417 -#else
13418 - num_physpages = max_low_pfn;
13419 - high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
13420 -#endif
13421 -#ifdef CONFIG_FLATMEM
13422 - max_mapnr = num_physpages;
13423 -#endif
13424 - printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
13425 - pages_to_mb(max_low_pfn));
13426 -
13427 - setup_bootmem_allocator();
13428 -
13429 - return max_low_pfn;
13430 -}
13431 -
13432 -static void __init zone_sizes_init(void)
13433 -{
13434 - unsigned long max_zone_pfns[MAX_NR_ZONES];
13435 - memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
13436 - max_zone_pfns[ZONE_DMA] =
13437 - virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
13438 - max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
13439 -#ifdef CONFIG_HIGHMEM
13440 - max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
13441 - add_active_range(0, 0, highend_pfn);
13442 -#else
13443 - add_active_range(0, 0, max_low_pfn);
13444 -#endif
13445 -
13446 - free_area_init_nodes(max_zone_pfns);
13447 -}
13448 -#else
13449 -extern unsigned long __init setup_memory(void);
13450 -extern void zone_sizes_init(void);
13451 -#endif /* !CONFIG_NEED_MULTIPLE_NODES */
13452 -
13453 -static inline unsigned long long get_total_mem(void)
13454 -{
13455 - unsigned long long total;
13456 -
13457 - total = max_low_pfn - min_low_pfn;
13458 -#ifdef CONFIG_HIGHMEM
13459 - total += highend_pfn - highstart_pfn;
13460 -#endif
13461 -
13462 - return total << PAGE_SHIFT;
13463 -}
13464 -
13465 -#ifdef CONFIG_KEXEC
13466 -#ifndef CONFIG_XEN
13467 -static void __init reserve_crashkernel(void)
13468 -{
13469 - unsigned long long total_mem;
13470 - unsigned long long crash_size, crash_base;
13471 - int ret;
13472 -
13473 - total_mem = get_total_mem();
13474 -
13475 - ret = parse_crashkernel(boot_command_line, total_mem,
13476 - &crash_size, &crash_base);
13477 - if (ret == 0 && crash_size > 0) {
13478 - if (crash_base > 0) {
13479 - printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
13480 - "for crashkernel (System RAM: %ldMB)\n",
13481 - (unsigned long)(crash_size >> 20),
13482 - (unsigned long)(crash_base >> 20),
13483 - (unsigned long)(total_mem >> 20));
13484 -
13485 - if (reserve_bootmem(crash_base, crash_size,
13486 - BOOTMEM_EXCLUSIVE) < 0) {
13487 - printk(KERN_INFO "crashkernel reservation "
13488 - "failed - memory is in use\n");
13489 - return;
13490 - }
13491 -
13492 - crashk_res.start = crash_base;
13493 - crashk_res.end = crash_base + crash_size - 1;
13494 - } else
13495 - printk(KERN_INFO "crashkernel reservation failed - "
13496 - "you have to specify a base address\n");
13497 - }
13498 -}
13499 -#else
13500 -#define reserve_crashkernel xen_machine_kexec_setup_resources
13501 -#endif
13502 -#else
13503 -static inline void __init reserve_crashkernel(void)
13504 -{}
13505 -#endif
13506 -
13507 -#ifdef CONFIG_BLK_DEV_INITRD
13508 -
13509 -static bool do_relocate_initrd = false;
13510 -
13511 -static void __init reserve_initrd(void)
13512 -{
13513 - unsigned long ramdisk_image = __pa(xen_start_info->mod_start);
13514 - unsigned long ramdisk_size = xen_start_info->mod_len;
13515 - unsigned long ramdisk_end = ramdisk_image + ramdisk_size;
13516 - unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
13517 - unsigned long ramdisk_here;
13518 -
13519 - initrd_start = 0;
13520 -
13521 - if (!xen_start_info->mod_start || !ramdisk_size)
13522 - return; /* No initrd provided by bootloader */
13523 -
13524 - if (ramdisk_end < ramdisk_image) {
13525 - printk(KERN_ERR "initrd wraps around end of memory, "
13526 - "disabling initrd\n");
13527 - return;
13528 - }
13529 - if (ramdisk_size >= end_of_lowmem/2) {
13530 - printk(KERN_ERR "initrd too large to handle, "
13531 - "disabling initrd\n");
13532 - return;
13533 - }
13534 - if (ramdisk_end <= end_of_lowmem) {
13535 - /* All in lowmem, easy case */
13536 - reserve_bootmem(ramdisk_image, ramdisk_size, BOOTMEM_DEFAULT);
13537 - initrd_start = ramdisk_image + PAGE_OFFSET;
13538 - initrd_end = initrd_start+ramdisk_size;
13539 - return;
13540 - }
13541 -
13542 - /* We need to move the initrd down into lowmem */
13543 - ramdisk_here = (end_of_lowmem - ramdisk_size) & PAGE_MASK;
13544 -
13545 - /* Note: this includes all the lowmem currently occupied by
13546 - the initrd, we rely on that fact to keep the data intact. */
13547 - reserve_bootmem(ramdisk_here, ramdisk_size, BOOTMEM_DEFAULT);
13548 - initrd_start = ramdisk_here + PAGE_OFFSET;
13549 - initrd_end = initrd_start + ramdisk_size;
13550 -
13551 - do_relocate_initrd = true;
13552 -}
13553 -
13554 -#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
13555 -
13556 -static void __init relocate_initrd(void)
13557 -{
13558 - unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
13559 - unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
13560 - unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
13561 - unsigned long ramdisk_here;
13562 - unsigned long slop, clen, mapaddr;
13563 - char *p, *q;
13564 -
13565 - if (!do_relocate_initrd)
13566 - return;
13567 -
13568 - ramdisk_here = initrd_start - PAGE_OFFSET;
13569 -
13570 - q = (char *)initrd_start;
13571 -
13572 - /* Copy any lowmem portion of the initrd */
13573 - if (ramdisk_image < end_of_lowmem) {
13574 - clen = end_of_lowmem - ramdisk_image;
13575 - p = (char *)__va(ramdisk_image);
13576 - memcpy(q, p, clen);
13577 - q += clen;
13578 - ramdisk_image += clen;
13579 - ramdisk_size -= clen;
13580 - }
13581 -
13582 - /* Copy the highmem portion of the initrd */
13583 - while (ramdisk_size) {
13584 - slop = ramdisk_image & ~PAGE_MASK;
13585 - clen = ramdisk_size;
13586 - if (clen > MAX_MAP_CHUNK-slop)
13587 - clen = MAX_MAP_CHUNK-slop;
13588 - mapaddr = ramdisk_image & PAGE_MASK;
13589 - p = early_ioremap(mapaddr, clen+slop);
13590 - memcpy(q, p+slop, clen);
13591 - early_iounmap(p, clen+slop);
13592 - q += clen;
13593 - ramdisk_image += clen;
13594 - ramdisk_size -= clen;
13595 - }
13596 -}
13597 -
13598 -#endif /* CONFIG_BLK_DEV_INITRD */
13599 -
13600 -void __init setup_bootmem_allocator(void)
13601 -{
13602 - unsigned long bootmap_size;
13603 - /*
13604 - * Initialize the boot-time allocator (with low memory only):
13605 - */
13606 - bootmap_size = init_bootmem(min_low_pfn, max_low_pfn);
13607 -
13608 - register_bootmem_low_pages(max_low_pfn);
13609 -
13610 - /*
13611 - * Reserve the bootmem bitmap itself as well. We do this in two
13612 - * steps (first step was init_bootmem()) because this catches
13613 - * the (very unlikely) case of us accidentally initializing the
13614 - * bootmem allocator with an invalid RAM area.
13615 - */
13616 - reserve_bootmem(__pa_symbol(_text), (PFN_PHYS(min_low_pfn) +
13617 - bootmap_size + PAGE_SIZE-1) - __pa_symbol(_text),
13618 - BOOTMEM_DEFAULT);
13619 -
13620 -#ifndef CONFIG_XEN
13621 - /*
13622 - * reserve physical page 0 - it's a special BIOS page on many boxes,
13623 - * enabling clean reboots, SMP operation, laptop functions.
13624 - */
13625 - reserve_bootmem(0, PAGE_SIZE, BOOTMEM_DEFAULT);
13626 -
13627 - /* reserve EBDA region */
13628 - reserve_ebda_region();
13629 -
13630 -#ifdef CONFIG_SMP
13631 - /*
13632 - * But first pinch a few for the stack/trampoline stuff
13633 - * FIXME: Don't need the extra page at 4K, but need to fix
13634 - * trampoline before removing it. (see the GDT stuff)
13635 - */
13636 - reserve_bootmem(PAGE_SIZE, PAGE_SIZE, BOOTMEM_DEFAULT);
13637 -#endif
13638 -#ifdef CONFIG_ACPI_SLEEP
13639 - /*
13640 - * Reserve low memory region for sleep support.
13641 - */
13642 - acpi_reserve_bootmem();
13643 -#endif
13644 -#endif /* !CONFIG_XEN */
13645 -
13646 -#ifdef CONFIG_BLK_DEV_INITRD
13647 - reserve_initrd();
13648 -#endif
13649 - numa_kva_reserve();
13650 - reserve_crashkernel();
13651 -
13652 - reserve_ibft_region();
13653 -}
13654 -
13655 -/*
13656 - * The node 0 pgdat is initialized before all of these because
13657 - * it's needed for bootmem. node>0 pgdats have their virtual
13658 - * space allocated before the pagetables are in place to access
13659 - * them, so they can't be cleared then.
13660 - *
13661 - * This should all compile down to nothing when NUMA is off.
13662 - */
13663 -static void __init remapped_pgdat_init(void)
13664 -{
13665 - int nid;
13666 -
13667 - for_each_online_node(nid) {
13668 - if (nid != 0)
13669 - memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
13670 - }
13671 -}
13672 -
13673 -#ifdef CONFIG_MCA
13674 -static void set_mca_bus(int x)
13675 -{
13676 - MCA_bus = x;
13677 -}
13678 -#else
13679 -static void set_mca_bus(int x) { }
13680 -#endif
13681 -
13682 -/* Overridden in paravirt.c if CONFIG_PARAVIRT */
13683 -char * __init __attribute__((weak)) memory_setup(void)
13684 -{
13685 - return machine_specific_memory_setup();
13686 -}
13687 -
13688 -#ifdef CONFIG_NUMA
13689 -/*
13690 - * In the golden day, when everything among i386 and x86_64 will be
13691 - * integrated, this will not live here
13692 - */
13693 -void *x86_cpu_to_node_map_early_ptr;
13694 -int x86_cpu_to_node_map_init[NR_CPUS] = {
13695 - [0 ... NR_CPUS-1] = NUMA_NO_NODE
13696 -};
13697 -DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE;
13698 -#endif
13699 -
13700 -/*
13701 - * Determine if we were loaded by an EFI loader. If so, then we have also been
13702 - * passed the efi memmap, systab, etc., so we should use these data structures
13703 - * for initialization. Note, the efi init code path is determined by the
13704 - * global efi_enabled. This allows the same kernel image to be used on existing
13705 - * systems (with a traditional BIOS) as well as on EFI systems.
13706 - */
13707 -void __init setup_arch(char **cmdline_p)
13708 -{
13709 - int i, j, k, fpp;
13710 - struct physdev_set_iopl set_iopl;
13711 - unsigned long max_low_pfn;
13712 - unsigned long p2m_pages;
13713 -
13714 - /* Force a quick death if the kernel panics (not domain 0). */
13715 - extern int panic_timeout;
13716 - if (!panic_timeout && !is_initial_xendomain())
13717 - panic_timeout = 1;
13718 -
13719 - /* Register a call for panic conditions. */
13720 - atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
13721 -
13722 - WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
13723 - VMASST_TYPE_4gb_segments));
13724 - WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
13725 - VMASST_TYPE_writable_pagetables));
13726 -
13727 - memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
13728 - pre_setup_arch_hook();
13729 - early_cpu_init();
13730 - early_ioremap_init();
13731 -#ifdef CONFIG_SMP
13732 - prefill_possible_map();
13733 -#endif
13734 -
13735 -#ifdef CONFIG_EFI
13736 - if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
13737 - "EL32", 4))
13738 - efi_enabled = 1;
13739 -#endif
13740 -
13741 - /* This must be initialized to UNNAMED_MAJOR for ipconfig to work
13742 - properly. Setting ROOT_DEV to default to /dev/ram0 breaks initrd.
13743 - */
13744 - ROOT_DEV = MKDEV(UNNAMED_MAJOR,0);
13745 - screen_info = boot_params.screen_info;
13746 - copy_edid();
13747 - apm_info.bios = boot_params.apm_bios_info;
13748 - ist_info = boot_params.ist_info;
13749 - saved_video_mode = boot_params.hdr.vid_mode;
13750 - if( boot_params.sys_desc_table.length != 0 ) {
13751 - set_mca_bus(boot_params.sys_desc_table.table[3] & 0x2);
13752 - machine_id = boot_params.sys_desc_table.table[0];
13753 - machine_submodel_id = boot_params.sys_desc_table.table[1];
13754 - BIOS_revision = boot_params.sys_desc_table.table[2];
13755 - }
13756 - bootloader_type = boot_params.hdr.type_of_loader;
13757 -
13758 - if (is_initial_xendomain()) {
13759 - const struct dom0_vga_console_info *info =
13760 - (void *)((char *)xen_start_info +
13761 - xen_start_info->console.dom0.info_off);
13762 -
13763 - dom0_init_screen_info(info,
13764 - xen_start_info->console.dom0.info_size);
13765 - xen_start_info->console.domU.mfn = 0;
13766 - xen_start_info->console.domU.evtchn = 0;
13767 - } else
13768 - screen_info.orig_video_isVGA = 0;
13769 -
13770 -#ifdef CONFIG_BLK_DEV_RAM
13771 - rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
13772 - rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
13773 - rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
13774 -#endif
13775 -
13776 - ARCH_SETUP
13777 -
13778 - printk(KERN_INFO "BIOS-provided physical RAM map:\n");
13779 - print_memory_map(memory_setup());
13780 -
13781 - copy_edd();
13782 -
13783 - if (!boot_params.hdr.root_flags)
13784 - root_mountflags &= ~MS_RDONLY;
13785 - init_mm.start_code = (unsigned long) _text;
13786 - init_mm.end_code = (unsigned long) _etext;
13787 - init_mm.end_data = (unsigned long) _edata;
13788 - init_mm.brk = (PFN_UP(__pa(xen_start_info->pt_base)) +
13789 - xen_start_info->nr_pt_frames) << PAGE_SHIFT;
13790 -
13791 - code_resource.start = virt_to_phys(_text);
13792 - code_resource.end = virt_to_phys(_etext)-1;
13793 - data_resource.start = virt_to_phys(_etext);
13794 - data_resource.end = virt_to_phys(_edata)-1;
13795 - bss_resource.start = virt_to_phys(&__bss_start);
13796 - bss_resource.end = virt_to_phys(&__bss_stop)-1;
13797 -
13798 - if ((i = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE)
13799 - i = COMMAND_LINE_SIZE;
13800 - memcpy(boot_command_line, xen_start_info->cmd_line, i);
13801 - boot_command_line[i - 1] = '\0';
13802 - parse_early_param();
13803 -
13804 - if (user_defined_memmap) {
13805 - printk(KERN_INFO "user-defined physical RAM map:\n");
13806 - print_memory_map("user");
13807 - }
13808 -
13809 - strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
13810 - *cmdline_p = command_line;
13811 -
13812 - if (efi_enabled)
13813 - efi_init();
13814 -
13815 - /* update e820 for memory not covered by WB MTRRs */
13816 - propagate_e820_map();
13817 - mtrr_bp_init();
13818 -#ifndef CONFIG_XEN
13819 - if (mtrr_trim_uncached_memory(max_pfn))
13820 - propagate_e820_map();
13821 -#endif
13822 -
13823 - max_low_pfn = setup_memory();
13824 -
13825 -#ifdef CONFIG_KVM_CLOCK
13826 - kvmclock_init();
13827 -#endif
13828 -
13829 -#ifdef CONFIG_VMI
13830 - /*
13831 - * Must be after max_low_pfn is determined, and before kernel
13832 - * pagetables are setup.
13833 - */
13834 - vmi_init();
13835 -#endif
13836 - kvm_guest_init();
13837 -
13838 - /*
13839 - * NOTE: before this point _nobody_ is allowed to allocate
13840 - * any memory using the bootmem allocator. Although the
13841 - * allocator is now initialised only the first 8Mb of the kernel
13842 - * virtual address space has been mapped. All allocations before
13843 - * paging_init() has completed must use the alloc_bootmem_low_pages()
13844 - * variant (which allocates DMA'able memory) and care must be taken
13845 - * not to exceed the 8Mb limit.
13846 - */
13847 -
13848 -#ifdef CONFIG_SMP
13849 - smp_alloc_memory(); /* AP processor realmode stacks in low memory*/
13850 -#endif
13851 - paging_init();
13852 -
13853 - /*
13854 - * NOTE: On x86-32, only from this point on, fixmaps are ready for use.
13855 - */
13856 -
13857 -#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
13858 - if (init_ohci1394_dma_early)
13859 - init_ohci1394_dma_on_all_controllers();
13860 -#endif
13861 -
13862 - remapped_pgdat_init();
13863 - sparse_init();
13864 - zone_sizes_init();
13865 -
13866 -#ifdef CONFIG_X86_FIND_SMP_CONFIG
13867 - /*
13868 - * Find and reserve possible boot-time SMP configuration:
13869 - */
13870 - find_smp_config();
13871 -#endif
13872 -
13873 - p2m_pages = max_pfn;
13874 - if (xen_start_info->nr_pages > max_pfn) {
13875 - /*
13876 - * the max_pfn was shrunk (probably by mem= or highmem=
13877 - * kernel parameter); shrink reservation with the HV
13878 - */
13879 - struct xen_memory_reservation reservation = {
13880 - .address_bits = 0,
13881 - .extent_order = 0,
13882 - .domid = DOMID_SELF
13883 - };
13884 - unsigned int difference;
13885 - int ret;
13886 -
13887 - difference = xen_start_info->nr_pages - max_pfn;
13888 -
13889 - set_xen_guest_handle(reservation.extent_start,
13890 - ((unsigned long *)xen_start_info->mfn_list) + max_pfn);
13891 - reservation.nr_extents = difference;
13892 - ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
13893 - &reservation);
13894 - BUG_ON (ret != difference);
13895 - }
13896 - else if (max_pfn > xen_start_info->nr_pages)
13897 - p2m_pages = xen_start_info->nr_pages;
13898 -
13899 - /* Make sure we have a correctly sized P->M table. */
13900 - if (!xen_feature(XENFEAT_auto_translated_physmap)) {
13901 - phys_to_machine_mapping = alloc_bootmem_low_pages(
13902 - max_pfn * sizeof(unsigned long));
13903 - memset(phys_to_machine_mapping, ~0,
13904 - max_pfn * sizeof(unsigned long));
13905 - memcpy(phys_to_machine_mapping,
13906 - (unsigned long *)xen_start_info->mfn_list,
13907 - p2m_pages * sizeof(unsigned long));
13908 - free_bootmem(
13909 - __pa(xen_start_info->mfn_list),
13910 - PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
13911 - sizeof(unsigned long))));
13912 -
13913 - /*
13914 - * Initialise the list of the frames that specify the list of
13915 - * frames that make up the p2m table. Used by save/restore
13916 - */
13917 - pfn_to_mfn_frame_list_list = alloc_bootmem_low_pages(PAGE_SIZE);
13918 -
13919 - fpp = PAGE_SIZE/sizeof(unsigned long);
13920 - for (i=0, j=0, k=-1; i< max_pfn; i+=fpp, j++) {
13921 - if ((j % fpp) == 0) {
13922 - k++;
13923 - BUG_ON(k>=16);
13924 - pfn_to_mfn_frame_list[k] =
13925 - alloc_bootmem_low_pages(PAGE_SIZE);
13926 - pfn_to_mfn_frame_list_list[k] =
13927 - virt_to_mfn(pfn_to_mfn_frame_list[k]);
13928 - j=0;
13929 - }
13930 - pfn_to_mfn_frame_list[k][j] =
13931 - virt_to_mfn(&phys_to_machine_mapping[i]);
13932 - }
13933 - HYPERVISOR_shared_info->arch.max_pfn = max_pfn;
13934 - HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
13935 - virt_to_mfn(pfn_to_mfn_frame_list_list);
13936 - }
13937 -
13938 - /* Mark all ISA DMA channels in-use - using them wouldn't work. */
13939 - for (i = 0; i < MAX_DMA_CHANNELS; ++i)
13940 - if (i != 4 && request_dma(i, "xen") != 0)
13941 - BUG();
13942 -
13943 - /*
13944 - * NOTE: at this point the bootmem allocator is fully available.
13945 - */
13946 -
13947 -#ifdef CONFIG_BLK_DEV_INITRD
13948 - relocate_initrd();
13949 -#endif
13950 -
13951 - paravirt_post_allocator_init();
13952 -
13953 - if (is_initial_xendomain())
13954 - dmi_scan_machine();
13955 -
13956 - io_delay_init();
13957 -
13958 -#if defined(CONFIG_X86_SMP) && !defined(CONFIG_XEN)
13959 - /*
13960 - * setup to use the early static init tables during kernel startup
13961 - * X86_SMP will exclude sub-arches that don't deal well with it.
13962 - */
13963 - x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init;
13964 - x86_bios_cpu_apicid_early_ptr = (void *)x86_bios_cpu_apicid_init;
13965 -#ifdef CONFIG_NUMA
13966 - x86_cpu_to_node_map_early_ptr = (void *)x86_cpu_to_node_map_init;
13967 -#endif
13968 -#endif
13969 -
13970 -#ifdef CONFIG_X86_GENERICARCH
13971 - generic_apic_probe();
13972 -#endif
13973 -
13974 - set_iopl.iopl = 1;
13975 - WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
13976 -
13977 -#ifdef CONFIG_ACPI
13978 - if (!is_initial_xendomain()) {
13979 - printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
13980 - acpi_disabled = 1;
13981 - acpi_ht = 0;
13982 - }
13983 -
13984 - /*
13985 - * Parse the ACPI tables for possible boot-time SMP configuration.
13986 - */
13987 - acpi_boot_table_init();
13988 -#endif
13989 -
13990 -#ifndef CONFIG_XEN
13991 - early_quirks();
13992 -#endif
13993 -
13994 -#ifdef CONFIG_ACPI
13995 - acpi_boot_init();
13996 -
13997 -#if defined(CONFIG_SMP) && defined(CONFIG_X86_PC)
13998 - if (def_to_bigsmp)
13999 - printk(KERN_WARNING "More than 8 CPUs detected and "
14000 - "CONFIG_X86_PC cannot handle it.\nUse "
14001 - "CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n");
14002 -#endif
14003 -#endif
14004 -#ifdef CONFIG_X86_LOCAL_APIC
14005 - if (smp_found_config)
14006 - get_smp_config();
14007 -#endif
14008 -
14009 - e820_register_memory();
14010 - e820_mark_nosave_regions();
14011 -
14012 - if (is_initial_xendomain()) {
14013 -#ifdef CONFIG_VT
14014 -#if defined(CONFIG_VGA_CONSOLE)
14015 - if (!efi_enabled ||
14016 - (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
14017 - conswitchp = &vga_con;
14018 -#elif defined(CONFIG_DUMMY_CONSOLE)
14019 - conswitchp = &dummy_con;
14020 -#endif
14021 -#endif
14022 - } else {
14023 -#if defined(CONFIG_VT) && defined(CONFIG_DUMMY_CONSOLE)
14024 - conswitchp = &dummy_con;
14025 -#endif
14026 - }
14027 -}
14028 -
14029 -static int
14030 -xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
14031 -{
14032 - HYPERVISOR_shutdown(SHUTDOWN_crash);
14033 - /* we're never actually going to get here... */
14034 - return NOTIFY_DONE;
14035 -}
14036 -
14037 -/*
14038 - * Request address space for all standard resources
14039 - *
14040 - * This is called just before pcibios_init(), which is also a
14041 - * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
14042 - */
14043 -static int __init request_standard_resources(void)
14044 -{
14045 - int i;
14046 -
14047 - /* Nothing to do if not running in dom0. */
14048 - if (!is_initial_xendomain())
14049 - return 0;
14050 -
14051 - printk(KERN_INFO "Setting up standard PCI resources\n");
14052 - init_iomem_resources(&code_resource, &data_resource, &bss_resource);
14053 -
14054 - request_resource(&iomem_resource, &video_ram_resource);
14055 -
14056 - /* request I/O space for devices used on all i[345]86 PCs */
14057 - for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
14058 - request_resource(&ioport_resource, &standard_io_resources[i]);
14059 - return 0;
14060 -}
14061 -
14062 -subsys_initcall(request_standard_resources);
14063 --- sle11-2009-06-04.orig/arch/x86/kernel/setup_64-xen.c 2009-03-16 16:38:05.000000000 +0100
14064 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
14065 @@ -1,1433 +0,0 @@
14066 -/*
14067 - * Copyright (C) 1995 Linus Torvalds
14068 - */
14069 -
14070 -/*
14071 - * This file handles the architecture-dependent parts of initialization
14072 - */
14073 -
14074 -#include <linux/errno.h>
14075 -#include <linux/sched.h>
14076 -#include <linux/kernel.h>
14077 -#include <linux/mm.h>
14078 -#include <linux/stddef.h>
14079 -#include <linux/unistd.h>
14080 -#include <linux/ptrace.h>
14081 -#include <linux/slab.h>
14082 -#include <linux/user.h>
14083 -#include <linux/screen_info.h>
14084 -#include <linux/ioport.h>
14085 -#include <linux/delay.h>
14086 -#include <linux/init.h>
14087 -#include <linux/initrd.h>
14088 -#include <linux/highmem.h>
14089 -#include <linux/bootmem.h>
14090 -#include <linux/module.h>
14091 -#include <asm/processor.h>
14092 -#include <linux/console.h>
14093 -#include <linux/seq_file.h>
14094 -#include <linux/crash_dump.h>
14095 -#include <linux/root_dev.h>
14096 -#include <linux/pci.h>
14097 -#include <asm/pci-direct.h>
14098 -#include <linux/efi.h>
14099 -#include <linux/acpi.h>
14100 -#include <linux/kallsyms.h>
14101 -#include <linux/edd.h>
14102 -#include <linux/iscsi_ibft.h>
14103 -#include <linux/mmzone.h>
14104 -#include <linux/kexec.h>
14105 -#include <linux/cpufreq.h>
14106 -#include <linux/dmi.h>
14107 -#include <linux/dma-mapping.h>
14108 -#include <linux/ctype.h>
14109 -#include <linux/sort.h>
14110 -#include <linux/uaccess.h>
14111 -#include <linux/init_ohci1394_dma.h>
14112 -#include <linux/kvm_para.h>
14113 -
14114 -#include <asm/mtrr.h>
14115 -#include <asm/uaccess.h>
14116 -#include <asm/system.h>
14117 -#include <asm/vsyscall.h>
14118 -#include <asm/io.h>
14119 -#include <asm/smp.h>
14120 -#include <asm/msr.h>
14121 -#include <asm/desc.h>
14122 -#include <video/edid.h>
14123 -#include <asm/e820.h>
14124 -#include <asm/dma.h>
14125 -#include <asm/gart.h>
14126 -#include <asm/mpspec.h>
14127 -#include <asm/mmu_context.h>
14128 -#include <asm/proto.h>
14129 -#include <asm/setup.h>
14130 -#include <asm/numa.h>
14131 -#include <asm/sections.h>
14132 -#include <asm/dmi.h>
14133 -#include <asm/cacheflush.h>
14134 -#include <asm/mce.h>
14135 -#include <asm/ds.h>
14136 -#include <asm/topology.h>
14137 -#include <asm/pat.h>
14138 -
14139 -#include <mach_apic.h>
14140 -#ifdef CONFIG_XEN
14141 -#include <linux/percpu.h>
14142 -#include <xen/interface/physdev.h>
14143 -#include "setup_arch_pre.h"
14144 -#include <asm/hypervisor.h>
14145 -#include <xen/interface/nmi.h>
14146 -#include <xen/features.h>
14147 -#include <xen/firmware.h>
14148 -#include <xen/xencons.h>
14149 -#define PFN_UP(x) (((x) + PAGE_SIZE-1) >> PAGE_SHIFT)
14150 -#define PFN_PHYS(x) ((x) << PAGE_SHIFT)
14151 -#include <asm/mach-xen/setup_arch_post.h>
14152 -#include <xen/interface/memory.h>
14153 -
14154 -#ifdef CONFIG_XEN
14155 -#include <xen/interface/kexec.h>
14156 -#endif
14157 -
14158 -extern unsigned long start_pfn;
14159 -extern struct edid_info edid_info;
14160 -
14161 -shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
14162 -EXPORT_SYMBOL(HYPERVISOR_shared_info);
14163 -
14164 -static int xen_panic_event(struct notifier_block *, unsigned long, void *);
14165 -static struct notifier_block xen_panic_block = {
14166 - xen_panic_event, NULL, 0 /* try to go last */
14167 -};
14168 -
14169 -unsigned long *phys_to_machine_mapping;
14170 -unsigned long *pfn_to_mfn_frame_list_list, *pfn_to_mfn_frame_list[512];
14171 -
14172 -EXPORT_SYMBOL(phys_to_machine_mapping);
14173 -
14174 -DEFINE_PER_CPU(multicall_entry_t, multicall_list[8]);
14175 -DEFINE_PER_CPU(int, nr_multicall_ents);
14176 -
14177 -/* Raw start-of-day parameters from the hypervisor. */
14178 -start_info_t *xen_start_info;
14179 -EXPORT_SYMBOL(xen_start_info);
14180 -#endif
14181 -
14182 -/*
14183 - * Machine setup..
14184 - */
14185 -
14186 -struct cpuinfo_x86 boot_cpu_data __read_mostly;
14187 -EXPORT_SYMBOL(boot_cpu_data);
14188 -
14189 -__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
14190 -
14191 -unsigned long mmu_cr4_features;
14192 -
14193 -/* Boot loader ID as an integer, for the benefit of proc_dointvec */
14194 -int bootloader_type;
14195 -
14196 -unsigned long saved_video_mode;
14197 -
14198 -int force_mwait __cpuinitdata;
14199 -
14200 -/*
14201 - * Early DMI memory
14202 - */
14203 -int dmi_alloc_index;
14204 -char dmi_alloc_data[DMI_MAX_DATA];
14205 -
14206 -/*
14207 - * Setup options
14208 - */
14209 -struct screen_info screen_info;
14210 -EXPORT_SYMBOL(screen_info);
14211 -struct sys_desc_table_struct {
14212 - unsigned short length;
14213 - unsigned char table[0];
14214 -};
14215 -
14216 -struct edid_info edid_info;
14217 -EXPORT_SYMBOL_GPL(edid_info);
14218 -
14219 -extern int root_mountflags;
14220 -
14221 -char __initdata command_line[COMMAND_LINE_SIZE];
14222 -
14223 -static struct resource standard_io_resources[] = {
14224 - { .name = "dma1", .start = 0x00, .end = 0x1f,
14225 - .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14226 - { .name = "pic1", .start = 0x20, .end = 0x21,
14227 - .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14228 - { .name = "timer0", .start = 0x40, .end = 0x43,
14229 - .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14230 - { .name = "timer1", .start = 0x50, .end = 0x53,
14231 - .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14232 - { .name = "keyboard", .start = 0x60, .end = 0x60,
14233 - .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14234 - { .name = "keyboard", .start = 0x64, .end = 0x64,
14235 - .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14236 - { .name = "dma page reg", .start = 0x80, .end = 0x8f,
14237 - .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14238 - { .name = "pic2", .start = 0xa0, .end = 0xa1,
14239 - .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14240 - { .name = "dma2", .start = 0xc0, .end = 0xdf,
14241 - .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14242 - { .name = "fpu", .start = 0xf0, .end = 0xff,
14243 - .flags = IORESOURCE_BUSY | IORESOURCE_IO }
14244 -};
14245 -
14246 -#define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM)
14247 -
14248 -static struct resource data_resource = {
14249 - .name = "Kernel data",
14250 - .start = 0,
14251 - .end = 0,
14252 - .flags = IORESOURCE_RAM,
14253 -};
14254 -static struct resource code_resource = {
14255 - .name = "Kernel code",
14256 - .start = 0,
14257 - .end = 0,
14258 - .flags = IORESOURCE_RAM,
14259 -};
14260 -static struct resource bss_resource = {
14261 - .name = "Kernel bss",
14262 - .start = 0,
14263 - .end = 0,
14264 - .flags = IORESOURCE_RAM,
14265 -};
14266 -
14267 -static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c);
14268 -
14269 -#ifdef CONFIG_PROC_VMCORE
14270 -/* elfcorehdr= specifies the location of elf core header
14271 - * stored by the crashed kernel. This option will be passed
14272 - * by kexec loader to the capture kernel.
14273 - */
14274 -static int __init setup_elfcorehdr(char *arg)
14275 -{
14276 - char *end;
14277 - if (!arg)
14278 - return -EINVAL;
14279 - elfcorehdr_addr = memparse(arg, &end);
14280 - return end > arg ? 0 : -EINVAL;
14281 -}
14282 -early_param("elfcorehdr", setup_elfcorehdr);
14283 -#endif
14284 -
14285 -#ifndef CONFIG_NUMA
14286 -static void __init
14287 -contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
14288 -{
14289 - unsigned long bootmap_size, bootmap;
14290 -
14291 - bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
14292 - bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size,
14293 - PAGE_SIZE);
14294 - if (bootmap == -1L)
14295 - panic("Cannot find bootmem map of size %ld\n", bootmap_size);
14296 - bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
14297 - e820_register_active_regions(0, start_pfn, end_pfn);
14298 -#ifdef CONFIG_XEN
14299 - free_bootmem_with_active_regions(0, xen_start_info->nr_pages);
14300 - early_res_to_bootmem(0, xen_start_info->nr_pages<<PAGE_SHIFT);
14301 -#else
14302 - free_bootmem_with_active_regions(0, end_pfn);
14303 - early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
14304 -#endif
14305 - reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
14306 -}
14307 -#endif
14308 -
14309 -#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
14310 -struct edd edd;
14311 -#ifdef CONFIG_EDD_MODULE
14312 -EXPORT_SYMBOL(edd);
14313 -#endif
14314 -#ifndef CONFIG_XEN
14315 -/**
14316 - * copy_edd() - Copy the BIOS EDD information
14317 - * from boot_params into a safe place.
14318 - *
14319 - */
14320 -static inline void copy_edd(void)
14321 -{
14322 - memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer,
14323 - sizeof(edd.mbr_signature));
14324 - memcpy(edd.edd_info, boot_params.eddbuf, sizeof(edd.edd_info));
14325 - edd.mbr_signature_nr = boot_params.edd_mbr_sig_buf_entries;
14326 - edd.edd_info_nr = boot_params.eddbuf_entries;
14327 -}
14328 -#endif
14329 -#else
14330 -static inline void copy_edd(void)
14331 -{
14332 -}
14333 -#endif
14334 -
14335 -#ifdef CONFIG_KEXEC
14336 -#ifndef CONFIG_XEN
14337 -static void __init reserve_crashkernel(void)
14338 -{
14339 - unsigned long long total_mem;
14340 - unsigned long long crash_size, crash_base;
14341 - int ret;
14342 -
14343 - total_mem = ((unsigned long long)max_low_pfn - min_low_pfn) << PAGE_SHIFT;
14344 -
14345 - ret = parse_crashkernel(boot_command_line, total_mem,
14346 - &crash_size, &crash_base);
14347 - if (ret == 0 && crash_size) {
14348 - if (crash_base <= 0) {
14349 - printk(KERN_INFO "crashkernel reservation failed - "
14350 - "you have to specify a base address\n");
14351 - return;
14352 - }
14353 -
14354 - if (reserve_bootmem(crash_base, crash_size,
14355 - BOOTMEM_EXCLUSIVE) < 0) {
14356 - printk(KERN_INFO "crashkernel reservation failed - "
14357 - "memory is in use\n");
14358 - return;
14359 - }
14360 -
14361 - printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
14362 - "for crashkernel (System RAM: %ldMB)\n",
14363 - (unsigned long)(crash_size >> 20),
14364 - (unsigned long)(crash_base >> 20),
14365 - (unsigned long)(total_mem >> 20));
14366 - crashk_res.start = crash_base;
14367 - crashk_res.end = crash_base + crash_size - 1;
14368 - insert_resource(&iomem_resource, &crashk_res);
14369 - }
14370 -}
14371 -#else
14372 -#define reserve_crashkernel xen_machine_kexec_setup_resources
14373 -#endif
14374 -#else
14375 -static inline void __init reserve_crashkernel(void)
14376 -{}
14377 -#endif
14378 -
14379 -/* Overridden in paravirt.c if CONFIG_PARAVIRT */
14380 -void __attribute__((weak)) __init memory_setup(void)
14381 -{
14382 - machine_specific_memory_setup();
14383 -}
14384 -
14385 -static void __init parse_setup_data(void)
14386 -{
14387 - struct setup_data *data;
14388 - unsigned long pa_data;
14389 -
14390 - if (boot_params.hdr.version < 0x0209)
14391 - return;
14392 - pa_data = boot_params.hdr.setup_data;
14393 - while (pa_data) {
14394 - data = early_ioremap(pa_data, PAGE_SIZE);
14395 - switch (data->type) {
14396 - default:
14397 - break;
14398 - }
14399 -#ifndef CONFIG_DEBUG_BOOT_PARAMS
14400 - free_early(pa_data, pa_data+sizeof(*data)+data->len);
14401 -#endif
14402 - pa_data = data->next;
14403 - early_iounmap(data, PAGE_SIZE);
14404 - }
14405 -}
14406 -
14407 -#ifdef CONFIG_PCI_MMCONFIG
14408 -extern void __cpuinit fam10h_check_enable_mmcfg(void);
14409 -extern void __init check_enable_amd_mmconf_dmi(void);
14410 -#else
14411 -void __cpuinit fam10h_check_enable_mmcfg(void)
14412 -{
14413 -}
14414 -void __init check_enable_amd_mmconf_dmi(void)
14415 -{
14416 -}
14417 -#endif
14418 -
14419 -/*
14420 - * setup_arch - architecture-specific boot-time initializations
14421 - *
14422 - * Note: On x86_64, fixmaps are ready for use even before this is called.
14423 - */
14424 -void __init setup_arch(char **cmdline_p)
14425 -{
14426 - unsigned i;
14427 -
14428 -#ifdef CONFIG_XEN
14429 - extern struct e820map machine_e820;
14430 -
14431 - printk(KERN_INFO "Command line: %s\n", boot_command_line);
14432 -
14433 - /* Register a call for panic conditions. */
14434 - atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
14435 -
14436 - WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
14437 - VMASST_TYPE_writable_pagetables));
14438 -
14439 - early_ioremap_init();
14440 -
14441 - ROOT_DEV = MKDEV(RAMDISK_MAJOR,0);
14442 - screen_info = boot_params.screen_info;
14443 -
14444 - if (is_initial_xendomain()) {
14445 - const struct dom0_vga_console_info *info =
14446 - (void *)((char *)xen_start_info +
14447 - xen_start_info->console.dom0.info_off);
14448 -
14449 - dom0_init_screen_info(info,
14450 - xen_start_info->console.dom0.info_size);
14451 - xen_start_info->console.domU.mfn = 0;
14452 - xen_start_info->console.domU.evtchn = 0;
14453 - } else
14454 - screen_info.orig_video_isVGA = 0;
14455 -
14456 - copy_edid();
14457 -#else
14458 - printk(KERN_INFO "Command line: %s\n", boot_command_line);
14459 -
14460 - ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
14461 - screen_info = boot_params.screen_info;
14462 - edid_info = boot_params.edid_info;
14463 -#endif /* !CONFIG_XEN */
14464 - saved_video_mode = boot_params.hdr.vid_mode;
14465 - bootloader_type = boot_params.hdr.type_of_loader;
14466 -
14467 -#ifdef CONFIG_BLK_DEV_RAM
14468 - rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
14469 - rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
14470 - rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
14471 -#endif
14472 -#ifdef CONFIG_EFI
14473 - if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
14474 - "EL64", 4))
14475 - efi_enabled = 1;
14476 -#endif
14477 -
14478 - ARCH_SETUP
14479 -
14480 - memory_setup();
14481 - copy_edd();
14482 -
14483 - if (!boot_params.hdr.root_flags)
14484 - root_mountflags &= ~MS_RDONLY;
14485 - init_mm.start_code = (unsigned long) &_text;
14486 - init_mm.end_code = (unsigned long) &_etext;
14487 - init_mm.end_data = (unsigned long) &_edata;
14488 - init_mm.brk = (unsigned long) &_end;
14489 -
14490 - code_resource.start = virt_to_phys(&_text);
14491 - code_resource.end = virt_to_phys(&_etext)-1;
14492 - data_resource.start = virt_to_phys(&_etext);
14493 - data_resource.end = virt_to_phys(&_edata)-1;
14494 - bss_resource.start = virt_to_phys(&__bss_start);
14495 - bss_resource.end = virt_to_phys(&__bss_stop)-1;
14496 -
14497 - early_identify_cpu(&boot_cpu_data);
14498 -
14499 - strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
14500 - *cmdline_p = command_line;
14501 -
14502 - parse_setup_data();
14503 -
14504 - parse_early_param();
14505 -
14506 -#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
14507 - if (init_ohci1394_dma_early)
14508 - init_ohci1394_dma_on_all_controllers();
14509 -#endif
14510 -
14511 - finish_e820_parsing();
14512 -
14513 -#ifndef CONFIG_XEN
14514 - /* after parse_early_param, so could debug it */
14515 - insert_resource(&iomem_resource, &code_resource);
14516 - insert_resource(&iomem_resource, &data_resource);
14517 - insert_resource(&iomem_resource, &bss_resource);
14518 -#endif
14519 -
14520 - early_gart_iommu_check();
14521 -
14522 - e820_register_active_regions(0, 0, -1UL);
14523 - /*
14524 - * partially used pages are not usable - thus
14525 - * we are rounding upwards:
14526 - */
14527 - end_pfn = e820_end_of_ram();
14528 - /* update e820 for memory not covered by WB MTRRs */
14529 - mtrr_bp_init();
14530 -#ifndef CONFIG_XEN
14531 - if (mtrr_trim_uncached_memory(end_pfn)) {
14532 - e820_register_active_regions(0, 0, -1UL);
14533 - end_pfn = e820_end_of_ram();
14534 - }
14535 -#endif
14536 -
14537 - num_physpages = end_pfn;
14538 - max_mapnr = end_pfn;
14539 -
14540 - check_efer();
14541 -
14542 - max_pfn_mapped = init_memory_mapping(0, (max_pfn_mapped << PAGE_SHIFT));
14543 - if (efi_enabled)
14544 - efi_init();
14545 -
14546 -#ifndef CONFIG_XEN
14547 - vsmp_init();
14548 -#endif
14549 -
14550 - if (is_initial_xendomain())
14551 - dmi_scan_machine();
14552 -
14553 - io_delay_init();
14554 -
14555 -#ifdef CONFIG_KVM_CLOCK
14556 - kvmclock_init();
14557 -#endif
14558 -
14559 -#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
14560 - /* setup to use the early static init tables during kernel startup */
14561 - x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init;
14562 - x86_bios_cpu_apicid_early_ptr = (void *)x86_bios_cpu_apicid_init;
14563 -#ifdef CONFIG_NUMA
14564 - x86_cpu_to_node_map_early_ptr = (void *)x86_cpu_to_node_map_init;
14565 -#endif
14566 -#endif
14567 -
14568 - /* How many end-of-memory variables you have, grandma! */
14569 - max_low_pfn = end_pfn;
14570 - max_pfn = end_pfn;
14571 - high_memory = (void *)__va(end_pfn * PAGE_SIZE - 1) + 1;
14572 -
14573 - /* Remove active ranges so rediscovery with NUMA-awareness happens */
14574 - remove_all_active_ranges();
14575 -
14576 -#ifdef CONFIG_ACPI_NUMA
14577 - /*
14578 - * Parse SRAT to discover nodes.
14579 - */
14580 - acpi_numa_init();
14581 -#endif
14582 -
14583 -#ifdef CONFIG_NUMA
14584 - numa_initmem_init(0, end_pfn);
14585 -#else
14586 - contig_initmem_init(0, end_pfn);
14587 -#endif
14588 -
14589 -#ifndef CONFIG_XEN
14590 - dma32_reserve_bootmem();
14591 -
14592 -#ifdef CONFIG_ACPI_SLEEP
14593 - /*
14594 - * Reserve low memory region for sleep support.
14595 - */
14596 - acpi_reserve_bootmem();
14597 -#endif
14598 -
14599 - if (efi_enabled)
14600 - efi_reserve_bootmem();
14601 -#endif
14602 -
14603 -#ifdef CONFIG_BLK_DEV_INITRD
14604 -#ifdef CONFIG_XEN
14605 - if (xen_start_info->mod_start) {
14606 - unsigned long ramdisk_image = __pa(xen_start_info->mod_start);
14607 - unsigned long ramdisk_size = xen_start_info->mod_len;
14608 -#else
14609 - if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
14610 - unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
14611 - unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
14612 -#endif
14613 - unsigned long ramdisk_end = ramdisk_image + ramdisk_size;
14614 - unsigned long end_of_mem = end_pfn << PAGE_SHIFT;
14615 -
14616 - if (ramdisk_end <= end_of_mem) {
14617 - /*
14618 - * don't need to reserve again, already reserved early
14619 - * in x86_64_start_kernel, and early_res_to_bootmem
14620 - * convert that to reserved in bootmem
14621 - */
14622 - initrd_start = ramdisk_image + PAGE_OFFSET;
14623 - initrd_end = initrd_start+ramdisk_size;
14624 -#ifdef CONFIG_XEN
14625 - initrd_below_start_ok = 1;
14626 -#endif
14627 - } else {
14628 - free_bootmem(ramdisk_image, ramdisk_size);
14629 - printk(KERN_ERR "initrd extends beyond end of memory "
14630 - "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
14631 - ramdisk_end, end_of_mem);
14632 - initrd_start = 0;
14633 - }
14634 - }
14635 -#endif
14636 - reserve_crashkernel();
14637 -
14638 - reserve_ibft_region();
14639 -
14640 - paging_init();
14641 - map_vsyscall();
14642 -#ifdef CONFIG_X86_LOCAL_APIC
14643 - /*
14644 - * Find and reserve possible boot-time SMP configuration:
14645 - */
14646 - find_smp_config();
14647 -#endif
14648 -#ifdef CONFIG_XEN
14649 - {
14650 - int i, j, k, fpp;
14651 - unsigned long p2m_pages;
14652 -
14653 - p2m_pages = end_pfn;
14654 - if (xen_start_info->nr_pages > end_pfn) {
14655 - /*
14656 - * the end_pfn was shrunk (probably by mem= or highmem=
14657 - * kernel parameter); shrink reservation with the HV
14658 - */
14659 - struct xen_memory_reservation reservation = {
14660 - .address_bits = 0,
14661 - .extent_order = 0,
14662 - .domid = DOMID_SELF
14663 - };
14664 - unsigned int difference;
14665 - int ret;
14666 -
14667 - difference = xen_start_info->nr_pages - end_pfn;
14668 -
14669 - set_xen_guest_handle(reservation.extent_start,
14670 - ((unsigned long *)xen_start_info->mfn_list) + end_pfn);
14671 - reservation.nr_extents = difference;
14672 - ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
14673 - &reservation);
14674 - BUG_ON (ret != difference);
14675 - }
14676 - else if (end_pfn > xen_start_info->nr_pages)
14677 - p2m_pages = xen_start_info->nr_pages;
14678 -
14679 - if (!xen_feature(XENFEAT_auto_translated_physmap)) {
14680 - /* Make sure we have a large enough P->M table. */
14681 - phys_to_machine_mapping = alloc_bootmem_pages(
14682 - end_pfn * sizeof(unsigned long));
14683 - memset(phys_to_machine_mapping, ~0,
14684 - end_pfn * sizeof(unsigned long));
14685 - memcpy(phys_to_machine_mapping,
14686 - (unsigned long *)xen_start_info->mfn_list,
14687 - p2m_pages * sizeof(unsigned long));
14688 - free_bootmem(
14689 - __pa(xen_start_info->mfn_list),
14690 - PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
14691 - sizeof(unsigned long))));
14692 -
14693 - /*
14694 - * Initialise the list of the frames that specify the
14695 - * list of frames that make up the p2m table. Used by
14696 - * save/restore.
14697 - */
14698 - pfn_to_mfn_frame_list_list = alloc_bootmem_pages(PAGE_SIZE);
14699 -
14700 - fpp = PAGE_SIZE/sizeof(unsigned long);
14701 - for (i=0, j=0, k=-1; i< end_pfn; i+=fpp, j++) {
14702 - if ((j % fpp) == 0) {
14703 - k++;
14704 - BUG_ON(k>=fpp);
14705 - pfn_to_mfn_frame_list[k] =
14706 - alloc_bootmem_pages(PAGE_SIZE);
14707 - pfn_to_mfn_frame_list_list[k] =
14708 - virt_to_mfn(pfn_to_mfn_frame_list[k]);
14709 - j=0;
14710 - }
14711 - pfn_to_mfn_frame_list[k][j] =
14712 - virt_to_mfn(&phys_to_machine_mapping[i]);
14713 - }
14714 - HYPERVISOR_shared_info->arch.max_pfn = end_pfn;
14715 - HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
14716 - virt_to_mfn(pfn_to_mfn_frame_list_list);
14717 - }
14718 -
14719 - /* Mark all ISA DMA channels in-use - using them wouldn't work. */
14720 - for (i = 0; i < MAX_DMA_CHANNELS; ++i)
14721 - if (i != 4 && request_dma(i, "xen") != 0)
14722 - BUG();
14723 - }
14724 -
14725 -#ifdef CONFIG_ACPI
14726 - if (!is_initial_xendomain()) {
14727 - acpi_disabled = 1;
14728 - acpi_ht = 0;
14729 - }
14730 -#endif
14731 -#endif
14732 -
14733 -#ifndef CONFIG_XEN
14734 - early_quirks();
14735 -#endif
14736 -
14737 -#ifdef CONFIG_ACPI
14738 - /*
14739 - * Initialize the ACPI boot-time table parser (gets the RSDP and SDT).
14740 - * Call this early for SRAT node setup.
14741 - */
14742 - acpi_boot_table_init();
14743 -
14744 - /*
14745 - * Read APIC and some other early information from ACPI tables.
14746 - */
14747 - acpi_boot_init();
14748 -#endif
14749 -
14750 - init_cpu_to_node();
14751 -
14752 -#ifdef CONFIG_X86_LOCAL_APIC
14753 - /*
14754 - * get boot-time SMP configuration:
14755 - */
14756 - if (smp_found_config)
14757 - get_smp_config();
14758 -#ifndef CONFIG_XEN
14759 - init_apic_mappings();
14760 - ioapic_init_mappings();
14761 -#endif
14762 -#endif
14763 -#if defined(CONFIG_XEN) && defined(CONFIG_SMP) && !defined(CONFIG_HOTPLUG_CPU)
14764 - prefill_possible_map();
14765 -#endif
14766 -
14767 - kvm_guest_init();
14768 -
14769 - /*
14770 - * We trust e820 completely. No explicit ROM probing in memory.
14771 - */
14772 -#ifdef CONFIG_XEN
14773 - if (is_initial_xendomain())
14774 - e820_reserve_resources(machine_e820.map, machine_e820.nr_map);
14775 -#else
14776 - e820_reserve_resources(e820.map, e820.nr_map);
14777 - e820_mark_nosave_regions();
14778 -#endif
14779 -
14780 - /* request I/O space for devices used on all i[345]86 PCs */
14781 - for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
14782 - request_resource(&ioport_resource, &standard_io_resources[i]);
14783 -
14784 -#ifdef CONFIG_XEN
14785 - if (is_initial_xendomain())
14786 - e820_setup_gap(machine_e820.map, machine_e820.nr_map);
14787 -#else
14788 - e820_setup_gap(e820.map, e820.nr_map);
14789 -#endif
14790 -
14791 -#ifdef CONFIG_XEN
14792 - {
14793 - struct physdev_set_iopl set_iopl;
14794 -
14795 - set_iopl.iopl = 1;
14796 - WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
14797 -
14798 - if (is_initial_xendomain()) {
14799 -#ifdef CONFIG_VT
14800 -#if defined(CONFIG_VGA_CONSOLE)
14801 - conswitchp = &vga_con;
14802 -#elif defined(CONFIG_DUMMY_CONSOLE)
14803 - conswitchp = &dummy_con;
14804 -#endif
14805 -#endif
14806 - } else {
14807 -#if defined(CONFIG_VT) && defined(CONFIG_DUMMY_CONSOLE)
14808 - conswitchp = &dummy_con;
14809 -#endif
14810 - }
14811 - }
14812 -#else /* CONFIG_XEN */
14813 -
14814 -#ifdef CONFIG_VT
14815 -#if defined(CONFIG_VGA_CONSOLE)
14816 - if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
14817 - conswitchp = &vga_con;
14818 -#elif defined(CONFIG_DUMMY_CONSOLE)
14819 - conswitchp = &dummy_con;
14820 -#endif
14821 -#endif
14822 -
14823 -#endif /* !CONFIG_XEN */
14824 -
14825 - /* do this before identify_cpu for boot cpu */
14826 - check_enable_amd_mmconf_dmi();
14827 -}
14828 -
14829 -#ifdef CONFIG_XEN
14830 -static int
14831 -xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
14832 -{
14833 - HYPERVISOR_shutdown(SHUTDOWN_crash);
14834 - /* we're never actually going to get here... */
14835 - return NOTIFY_DONE;
14836 -}
14837 -#endif /* !CONFIG_XEN */
14838 -
14839 -
14840 -static int __cpuinit get_model_name(struct cpuinfo_x86 *c)
14841 -{
14842 - unsigned int *v;
14843 -
14844 - if (c->extended_cpuid_level < 0x80000004)
14845 - return 0;
14846 -
14847 - v = (unsigned int *) c->x86_model_id;
14848 - cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
14849 - cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
14850 - cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
14851 - c->x86_model_id[48] = 0;
14852 - return 1;
14853 -}
14854 -
14855 -
14856 -static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
14857 -{
14858 - unsigned int n, dummy, eax, ebx, ecx, edx;
14859 -
14860 - n = c->extended_cpuid_level;
14861 -
14862 - if (n >= 0x80000005) {
14863 - cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
14864 - printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), "
14865 - "D cache %dK (%d bytes/line)\n",
14866 - edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
14867 - c->x86_cache_size = (ecx>>24) + (edx>>24);
14868 - /* On K8 L1 TLB is inclusive, so don't count it */
14869 - c->x86_tlbsize = 0;
14870 - }
14871 -
14872 - if (n >= 0x80000006) {
14873 - cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
14874 - ecx = cpuid_ecx(0x80000006);
14875 - c->x86_cache_size = ecx >> 16;
14876 - c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
14877 -
14878 - printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
14879 - c->x86_cache_size, ecx & 0xFF);
14880 - }
14881 - if (n >= 0x80000008) {
14882 - cpuid(0x80000008, &eax, &dummy, &dummy, &dummy);
14883 - c->x86_virt_bits = (eax >> 8) & 0xff;
14884 - c->x86_phys_bits = eax & 0xff;
14885 - }
14886 -}
14887 -
14888 -#ifdef CONFIG_NUMA
14889 -static int __cpuinit nearby_node(int apicid)
14890 -{
14891 - int i, node;
14892 -
14893 - for (i = apicid - 1; i >= 0; i--) {
14894 - node = apicid_to_node[i];
14895 - if (node != NUMA_NO_NODE && node_online(node))
14896 - return node;
14897 - }
14898 - for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
14899 - node = apicid_to_node[i];
14900 - if (node != NUMA_NO_NODE && node_online(node))
14901 - return node;
14902 - }
14903 - return first_node(node_online_map); /* Shouldn't happen */
14904 -}
14905 -#endif
14906 -
14907 -/*
14908 - * On a AMD dual core setup the lower bits of the APIC id distingush the cores.
14909 - * Assumes number of cores is a power of two.
14910 - */
14911 -static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
14912 -{
14913 -#ifdef CONFIG_SMP
14914 - unsigned bits;
14915 -#ifdef CONFIG_NUMA
14916 - int cpu = smp_processor_id();
14917 - int node = 0;
14918 - unsigned apicid = hard_smp_processor_id();
14919 -#endif
14920 - bits = c->x86_coreid_bits;
14921 -
14922 - /* Low order bits define the core id (index of core in socket) */
14923 - c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
14924 - /* Convert the initial APIC ID into the socket ID */
14925 - c->phys_proc_id = c->initial_apicid >> bits;
14926 -
14927 -#ifdef CONFIG_NUMA
14928 - node = c->phys_proc_id;
14929 - if (apicid_to_node[apicid] != NUMA_NO_NODE)
14930 - node = apicid_to_node[apicid];
14931 - if (!node_online(node)) {
14932 - /* Two possibilities here:
14933 - - The CPU is missing memory and no node was created.
14934 - In that case try picking one from a nearby CPU
14935 - - The APIC IDs differ from the HyperTransport node IDs
14936 - which the K8 northbridge parsing fills in.
14937 - Assume they are all increased by a constant offset,
14938 - but in the same order as the HT nodeids.
14939 - If that doesn't result in a usable node fall back to the
14940 - path for the previous case. */
14941 -
14942 - int ht_nodeid = c->initial_apicid;
14943 -
14944 - if (ht_nodeid >= 0 &&
14945 - apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
14946 - node = apicid_to_node[ht_nodeid];
14947 - /* Pick a nearby node */
14948 - if (!node_online(node))
14949 - node = nearby_node(apicid);
14950 - }
14951 - numa_set_node(cpu, node);
14952 -
14953 - printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
14954 -#endif
14955 -#endif
14956 -}
14957 -
14958 -static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c)
14959 -{
14960 -#ifdef CONFIG_SMP
14961 - unsigned bits, ecx;
14962 -
14963 - /* Multi core CPU? */
14964 - if (c->extended_cpuid_level < 0x80000008)
14965 - return;
14966 -
14967 - ecx = cpuid_ecx(0x80000008);
14968 -
14969 - c->x86_max_cores = (ecx & 0xff) + 1;
14970 -
14971 - /* CPU telling us the core id bits shift? */
14972 - bits = (ecx >> 12) & 0xF;
14973 -
14974 - /* Otherwise recompute */
14975 - if (bits == 0) {
14976 - while ((1 << bits) < c->x86_max_cores)
14977 - bits++;
14978 - }
14979 -
14980 - c->x86_coreid_bits = bits;
14981 -
14982 -#endif
14983 -}
14984 -
14985 -#define ENABLE_C1E_MASK 0x18000000
14986 -#define CPUID_PROCESSOR_SIGNATURE 1
14987 -#define CPUID_XFAM 0x0ff00000
14988 -#define CPUID_XFAM_K8 0x00000000
14989 -#define CPUID_XFAM_10H 0x00100000
14990 -#define CPUID_XFAM_11H 0x00200000
14991 -#define CPUID_XMOD 0x000f0000
14992 -#define CPUID_XMOD_REV_F 0x00040000
14993 -
14994 -#ifndef CONFIG_XEN
14995 -/* AMD systems with C1E don't have a working lAPIC timer. Check for that. */
14996 -static __cpuinit int amd_apic_timer_broken(void)
14997 -{
14998 - u32 lo, hi, eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
14999 -
15000 - switch (eax & CPUID_XFAM) {
15001 - case CPUID_XFAM_K8:
15002 - if ((eax & CPUID_XMOD) < CPUID_XMOD_REV_F)
15003 - break;
15004 - case CPUID_XFAM_10H:
15005 - case CPUID_XFAM_11H:
15006 - rdmsr(MSR_K8_ENABLE_C1E, lo, hi);
15007 - if (lo & ENABLE_C1E_MASK)
15008 - return 1;
15009 - break;
15010 - default:
15011 - /* err on the side of caution */
15012 - return 1;
15013 - }
15014 - return 0;
15015 -}
15016 -#endif
15017 -
15018 -static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
15019 -{
15020 - early_init_amd_mc(c);
15021 -
15022 - /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
15023 - if (c->x86_power & (1<<8))
15024 - set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
15025 -}
15026 -
15027 -static void __cpuinit init_amd(struct cpuinfo_x86 *c)
15028 -{
15029 - unsigned level;
15030 -
15031 -#ifdef CONFIG_SMP
15032 - unsigned long value;
15033 -
15034 - /*
15035 - * Disable TLB flush filter by setting HWCR.FFDIS on K8
15036 - * bit 6 of msr C001_0015
15037 - *
15038 - * Errata 63 for SH-B3 steppings
15039 - * Errata 122 for all steppings (F+ have it disabled by default)
15040 - */
15041 - if (c->x86 == 15) {
15042 - rdmsrl(MSR_K8_HWCR, value);
15043 - value |= 1 << 6;
15044 - wrmsrl(MSR_K8_HWCR, value);
15045 - }
15046 -#endif
15047 -
15048 - /* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
15049 - 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
15050 - clear_cpu_cap(c, 0*32+31);
15051 -
15052 - /* On C+ stepping K8 rep microcode works well for copy/memset */
15053 - level = cpuid_eax(1);
15054 - if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) ||
15055 - level >= 0x0f58))
15056 - set_cpu_cap(c, X86_FEATURE_REP_GOOD);
15057 - if (c->x86 == 0x10 || c->x86 == 0x11)
15058 - set_cpu_cap(c, X86_FEATURE_REP_GOOD);
15059 -
15060 - /* Enable workaround for FXSAVE leak */
15061 - if (c->x86 >= 6)
15062 - set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
15063 -
15064 - level = get_model_name(c);
15065 - if (!level) {
15066 - switch (c->x86) {
15067 - case 15:
15068 - /* Should distinguish Models here, but this is only
15069 - a fallback anyways. */
15070 - strcpy(c->x86_model_id, "Hammer");
15071 - break;
15072 - }
15073 - }
15074 - display_cacheinfo(c);
15075 -
15076 - /* Multi core CPU? */
15077 - if (c->extended_cpuid_level >= 0x80000008)
15078 - amd_detect_cmp(c);
15079 -
15080 - if (c->extended_cpuid_level >= 0x80000006 &&
15081 - (cpuid_edx(0x80000006) & 0xf000))
15082 - num_cache_leaves = 4;
15083 - else
15084 - num_cache_leaves = 3;
15085 -
15086 - if (c->x86 == 0xf || c->x86 == 0x10 || c->x86 == 0x11)
15087 - set_cpu_cap(c, X86_FEATURE_K8);
15088 -
15089 - /* MFENCE stops RDTSC speculation */
15090 - set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
15091 -
15092 - if (c->x86 == 0x10)
15093 - fam10h_check_enable_mmcfg();
15094 -
15095 -#ifndef CONFIG_XEN
15096 - if (amd_apic_timer_broken())
15097 - disable_apic_timer = 1;
15098 -
15099 - if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
15100 - unsigned long long tseg;
15101 -
15102 - /*
15103 - * Split up direct mapping around the TSEG SMM area.
15104 - * Don't do it for gbpages because there seems very little
15105 - * benefit in doing so.
15106 - */
15107 - if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg) &&
15108 - (tseg >> PMD_SHIFT) < (max_pfn_mapped >> (PMD_SHIFT-PAGE_SHIFT)))
15109 - set_memory_4k((unsigned long)__va(tseg), 1);
15110 - }
15111 -#endif
15112 -}
15113 -
15114 -void __cpuinit detect_ht(struct cpuinfo_x86 *c)
15115 -{
15116 -#ifdef CONFIG_SMP
15117 - u32 eax, ebx, ecx, edx;
15118 - int index_msb, core_bits;
15119 -
15120 - cpuid(1, &eax, &ebx, &ecx, &edx);
15121 -
15122 -
15123 - if (!cpu_has(c, X86_FEATURE_HT))
15124 - return;
15125 - if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
15126 - goto out;
15127 -
15128 - smp_num_siblings = (ebx & 0xff0000) >> 16;
15129 -
15130 - if (smp_num_siblings == 1) {
15131 - printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
15132 - } else if (smp_num_siblings > 1) {
15133 -
15134 - if (smp_num_siblings > NR_CPUS) {
15135 - printk(KERN_WARNING "CPU: Unsupported number of "
15136 - "siblings %d", smp_num_siblings);
15137 - smp_num_siblings = 1;
15138 - return;
15139 - }
15140 -
15141 - index_msb = get_count_order(smp_num_siblings);
15142 - c->phys_proc_id = phys_pkg_id(index_msb);
15143 -
15144 - smp_num_siblings = smp_num_siblings / c->x86_max_cores;
15145 -
15146 - index_msb = get_count_order(smp_num_siblings);
15147 -
15148 - core_bits = get_count_order(c->x86_max_cores);
15149 -
15150 - c->cpu_core_id = phys_pkg_id(index_msb) &
15151 - ((1 << core_bits) - 1);
15152 - }
15153 -out:
15154 - if ((c->x86_max_cores * smp_num_siblings) > 1) {
15155 - printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
15156 - c->phys_proc_id);
15157 - printk(KERN_INFO "CPU: Processor Core ID: %d\n",
15158 - c->cpu_core_id);
15159 - }
15160 -
15161 -#endif
15162 -}
15163 -
15164 -/*
15165 - * find out the number of processor cores on the die
15166 - */
15167 -static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
15168 -{
15169 - unsigned int eax, t;
15170 -
15171 - if (c->cpuid_level < 4)
15172 - return 1;
15173 -
15174 - cpuid_count(4, 0, &eax, &t, &t, &t);
15175 -
15176 - if (eax & 0x1f)
15177 - return ((eax >> 26) + 1);
15178 - else
15179 - return 1;
15180 -}
15181 -
15182 -static void __cpuinit srat_detect_node(void)
15183 -{
15184 -#ifdef CONFIG_NUMA
15185 - unsigned node;
15186 - int cpu = smp_processor_id();
15187 - int apicid = hard_smp_processor_id();
15188 -
15189 - /* Don't do the funky fallback heuristics the AMD version employs
15190 - for now. */
15191 - node = apicid_to_node[apicid];
15192 - if (node == NUMA_NO_NODE || !node_online(node))
15193 - node = first_node(node_online_map);
15194 - numa_set_node(cpu, node);
15195 -
15196 - printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
15197 -#endif
15198 -}
15199 -
15200 -static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
15201 -{
15202 - if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
15203 - (c->x86 == 0x6 && c->x86_model >= 0x0e))
15204 - set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
15205 -}
15206 -
15207 -static void __cpuinit init_intel(struct cpuinfo_x86 *c)
15208 -{
15209 - /* Cache sizes */
15210 - unsigned n;
15211 -
15212 - init_intel_cacheinfo(c);
15213 - if (c->cpuid_level > 9) {
15214 - unsigned eax = cpuid_eax(10);
15215 - /* Check for version and the number of counters */
15216 - if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
15217 - set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
15218 - }
15219 -
15220 - if (cpu_has_ds) {
15221 - unsigned int l1, l2;
15222 - rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
15223 - if (!(l1 & (1<<11)))
15224 - set_cpu_cap(c, X86_FEATURE_BTS);
15225 - if (!(l1 & (1<<12)))
15226 - set_cpu_cap(c, X86_FEATURE_PEBS);
15227 - }
15228 -
15229 -
15230 - if (cpu_has_bts)
15231 - ds_init_intel(c);
15232 -
15233 - n = c->extended_cpuid_level;
15234 - if (n >= 0x80000008) {
15235 - unsigned eax = cpuid_eax(0x80000008);
15236 - c->x86_virt_bits = (eax >> 8) & 0xff;
15237 - c->x86_phys_bits = eax & 0xff;
15238 - /* CPUID workaround for Intel 0F34 CPU */
15239 - if (c->x86_vendor == X86_VENDOR_INTEL &&
15240 - c->x86 == 0xF && c->x86_model == 0x3 &&
15241 - c->x86_mask == 0x4)
15242 - c->x86_phys_bits = 36;
15243 - }
15244 -
15245 - if (c->x86 == 15)
15246 - c->x86_cache_alignment = c->x86_clflush_size * 2;
15247 - if (c->x86 == 6)
15248 - set_cpu_cap(c, X86_FEATURE_REP_GOOD);
15249 - set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
15250 - c->x86_max_cores = intel_num_cpu_cores(c);
15251 -
15252 - srat_detect_node();
15253 -}
15254 -
15255 -static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
15256 -{
15257 - if (c->x86 == 0x6 && c->x86_model >= 0xf)
15258 - set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
15259 -}
15260 -
15261 -static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
15262 -{
15263 - /* Cache sizes */
15264 - unsigned n;
15265 -
15266 - n = c->extended_cpuid_level;
15267 - if (n >= 0x80000008) {
15268 - unsigned eax = cpuid_eax(0x80000008);
15269 - c->x86_virt_bits = (eax >> 8) & 0xff;
15270 - c->x86_phys_bits = eax & 0xff;
15271 - }
15272 -
15273 - if (c->x86 == 0x6 && c->x86_model >= 0xf) {
15274 - c->x86_cache_alignment = c->x86_clflush_size * 2;
15275 - set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
15276 - set_cpu_cap(c, X86_FEATURE_REP_GOOD);
15277 - }
15278 - set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
15279 -}
15280 -
15281 -static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
15282 -{
15283 - char *v = c->x86_vendor_id;
15284 -
15285 - if (!strcmp(v, "AuthenticAMD"))
15286 - c->x86_vendor = X86_VENDOR_AMD;
15287 - else if (!strcmp(v, "GenuineIntel"))
15288 - c->x86_vendor = X86_VENDOR_INTEL;
15289 - else if (!strcmp(v, "CentaurHauls"))
15290 - c->x86_vendor = X86_VENDOR_CENTAUR;
15291 - else
15292 - c->x86_vendor = X86_VENDOR_UNKNOWN;
15293 -}
15294 -
15295 -/* Do some early cpuid on the boot CPU to get some parameter that are
15296 - needed before check_bugs. Everything advanced is in identify_cpu
15297 - below. */
15298 -static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
15299 -{
15300 - u32 tfms, xlvl;
15301 -
15302 - c->loops_per_jiffy = loops_per_jiffy;
15303 - c->x86_cache_size = -1;
15304 - c->x86_vendor = X86_VENDOR_UNKNOWN;
15305 - c->x86_model = c->x86_mask = 0; /* So far unknown... */
15306 - c->x86_vendor_id[0] = '\0'; /* Unset */
15307 - c->x86_model_id[0] = '\0'; /* Unset */
15308 - c->x86_clflush_size = 64;
15309 - c->x86_cache_alignment = c->x86_clflush_size;
15310 - c->x86_max_cores = 1;
15311 - c->x86_coreid_bits = 0;
15312 - c->extended_cpuid_level = 0;
15313 - memset(&c->x86_capability, 0, sizeof c->x86_capability);
15314 -
15315 - /* Get vendor name */
15316 - cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
15317 - (unsigned int *)&c->x86_vendor_id[0],
15318 - (unsigned int *)&c->x86_vendor_id[8],
15319 - (unsigned int *)&c->x86_vendor_id[4]);
15320 -
15321 - get_cpu_vendor(c);
15322 -
15323 - /* Initialize the standard set of capabilities */
15324 - /* Note that the vendor-specific code below might override */
15325 -
15326 - /* Intel-defined flags: level 0x00000001 */
15327 - if (c->cpuid_level >= 0x00000001) {
15328 - __u32 misc;
15329 - cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4],
15330 - &c->x86_capability[0]);
15331 - c->x86 = (tfms >> 8) & 0xf;
15332 - c->x86_model = (tfms >> 4) & 0xf;
15333 - c->x86_mask = tfms & 0xf;
15334 - if (c->x86 == 0xf)
15335 - c->x86 += (tfms >> 20) & 0xff;
15336 - if (c->x86 >= 0x6)
15337 - c->x86_model += ((tfms >> 16) & 0xF) << 4;
15338 - if (test_cpu_cap(c, X86_FEATURE_CLFLSH))
15339 - c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
15340 - } else {
15341 - /* Have CPUID level 0 only - unheard of */
15342 - c->x86 = 4;
15343 - }
15344 -
15345 - c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xff;
15346 -#ifdef CONFIG_SMP
15347 - c->phys_proc_id = c->initial_apicid;
15348 -#endif
15349 - /* AMD-defined flags: level 0x80000001 */
15350 - xlvl = cpuid_eax(0x80000000);
15351 - c->extended_cpuid_level = xlvl;
15352 - if ((xlvl & 0xffff0000) == 0x80000000) {
15353 - if (xlvl >= 0x80000001) {
15354 - c->x86_capability[1] = cpuid_edx(0x80000001);
15355 - c->x86_capability[6] = cpuid_ecx(0x80000001);
15356 - }
15357 - if (xlvl >= 0x80000004)
15358 - get_model_name(c); /* Default name */
15359 - }
15360 -
15361 - /* Transmeta-defined flags: level 0x80860001 */
15362 - xlvl = cpuid_eax(0x80860000);
15363 - if ((xlvl & 0xffff0000) == 0x80860000) {
15364 - /* Don't set x86_cpuid_level here for now to not confuse. */
15365 - if (xlvl >= 0x80860001)
15366 - c->x86_capability[2] = cpuid_edx(0x80860001);
15367 - }
15368 -
15369 - c->extended_cpuid_level = cpuid_eax(0x80000000);
15370 - if (c->extended_cpuid_level >= 0x80000007)
15371 - c->x86_power = cpuid_edx(0x80000007);
15372 -
15373 - switch (c->x86_vendor) {
15374 - case X86_VENDOR_AMD:
15375 - early_init_amd(c);
15376 - break;
15377 - case X86_VENDOR_INTEL:
15378 - early_init_intel(c);
15379 - break;
15380 - case X86_VENDOR_CENTAUR:
15381 - early_init_centaur(c);
15382 - break;
15383 - }
15384 -
15385 - validate_pat_support(c);
15386 -}
15387 -
15388 -/*
15389 - * This does the hard work of actually picking apart the CPU stuff...
15390 - */
15391 -void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
15392 -{
15393 - int i;
15394 -
15395 - early_identify_cpu(c);
15396 -
15397 - init_scattered_cpuid_features(c);
15398 -
15399 - c->apicid = phys_pkg_id(0);
15400 -
15401 - /*
15402 - * Vendor-specific initialization. In this section we
15403 - * canonicalize the feature flags, meaning if there are
15404 - * features a certain CPU supports which CPUID doesn't
15405 - * tell us, CPUID claiming incorrect flags, or other bugs,
15406 - * we handle them here.
15407 - *
15408 - * At the end of this section, c->x86_capability better
15409 - * indicate the features this CPU genuinely supports!
15410 - */
15411 - switch (c->x86_vendor) {
15412 - case X86_VENDOR_AMD:
15413 - init_amd(c);
15414 - break;
15415 -
15416 - case X86_VENDOR_INTEL:
15417 - init_intel(c);
15418 - break;
15419 -
15420 - case X86_VENDOR_CENTAUR:
15421 - init_centaur(c);
15422 - break;
15423 -
15424 - case X86_VENDOR_UNKNOWN:
15425 - default:
15426 - display_cacheinfo(c);
15427 - break;
15428 - }
15429 -
15430 - detect_ht(c);
15431 -
15432 - /*
15433 - * On SMP, boot_cpu_data holds the common feature set between
15434 - * all CPUs; so make sure that we indicate which features are
15435 - * common between the CPUs. The first time this routine gets
15436 - * executed, c == &boot_cpu_data.
15437 - */
15438 - if (c != &boot_cpu_data) {
15439 - /* AND the already accumulated flags with these */
15440 - for (i = 0; i < NCAPINTS; i++)
15441 - boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
15442 - }
15443 -
15444 - /* Clear all flags overriden by options */
15445 - for (i = 0; i < NCAPINTS; i++)
15446 - c->x86_capability[i] &= ~cleared_cpu_caps[i];
15447 -
15448 -#ifdef CONFIG_X86_MCE
15449 - mcheck_init(c);
15450 -#endif
15451 - select_idle_routine(c);
15452 -
15453 -#ifdef CONFIG_NUMA
15454 - numa_add_cpu(smp_processor_id());
15455 -#endif
15456 -
15457 -}
15458 -
15459 -void __cpuinit identify_boot_cpu(void)
15460 -{
15461 - identify_cpu(&boot_cpu_data);
15462 -}
15463 -
15464 -void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
15465 -{
15466 - BUG_ON(c == &boot_cpu_data);
15467 - identify_cpu(c);
15468 - mtrr_ap_init();
15469 -}
15470 -
15471 -static __init int setup_noclflush(char *arg)
15472 -{
15473 - setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
15474 - return 1;
15475 -}
15476 -__setup("noclflush", setup_noclflush);
15477 -
15478 -void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
15479 -{
15480 - if (c->x86_model_id[0])
15481 - printk(KERN_CONT "%s", c->x86_model_id);
15482 -
15483 - if (c->x86_mask || c->cpuid_level >= 0)
15484 - printk(KERN_CONT " stepping %02x\n", c->x86_mask);
15485 - else
15486 - printk(KERN_CONT "\n");
15487 -}
15488 -
15489 -static __init int setup_disablecpuid(char *arg)
15490 -{
15491 - int bit;
15492 - if (get_option(&arg, &bit) && bit < NCAPINTS*32)
15493 - setup_clear_cpu_cap(bit);
15494 - else
15495 - return 0;
15496 - return 1;
15497 -}
15498 -__setup("clearcpuid=", setup_disablecpuid);
15499 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
15500 +++ sle11-2009-06-04/arch/x86/kernel/setup_percpu-xen.c 2009-06-04 10:21:39.000000000 +0200
15501 @@ -0,0 +1,385 @@
15502 +#include <linux/kernel.h>
15503 +#include <linux/module.h>
15504 +#include <linux/init.h>
15505 +#include <linux/bootmem.h>
15506 +#include <linux/percpu.h>
15507 +#include <linux/kexec.h>
15508 +#include <linux/crash_dump.h>
15509 +#include <asm/smp.h>
15510 +#include <asm/percpu.h>
15511 +#include <asm/sections.h>
15512 +#include <asm/processor.h>
15513 +#include <asm/setup.h>
15514 +#include <asm/topology.h>
15515 +#include <asm/mpspec.h>
15516 +#include <asm/apicdef.h>
15517 +#include <asm/highmem.h>
15518 +
15519 +#ifdef CONFIG_X86_LOCAL_APIC
15520 +unsigned int num_processors;
15521 +unsigned disabled_cpus __cpuinitdata;
15522 +/* Processor that is doing the boot up */
15523 +unsigned int boot_cpu_physical_apicid = -1U;
15524 +unsigned int max_physical_apicid;
15525 +EXPORT_SYMBOL(boot_cpu_physical_apicid);
15526 +
15527 +/* Bitmask of physically existing CPUs */
15528 +physid_mask_t phys_cpu_present_map;
15529 +#endif
15530 +
15531 +/* map cpu index to physical APIC ID */
15532 +#ifndef CONFIG_XEN
15533 +DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID);
15534 +DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID);
15535 +EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
15536 +EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
15537 +#else
15538 +DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID;
15539 +EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid);
15540 +#endif
15541 +
15542 +#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
15543 +#define X86_64_NUMA 1
15544 +
15545 +/* map cpu index to node index */
15546 +DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
15547 +EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
15548 +
15549 +/* which logical CPUs are on which nodes */
15550 +cpumask_t *node_to_cpumask_map;
15551 +EXPORT_SYMBOL(node_to_cpumask_map);
15552 +
15553 +/* setup node_to_cpumask_map */
15554 +static void __init setup_node_to_cpumask_map(void);
15555 +
15556 +#else
15557 +static inline void setup_node_to_cpumask_map(void) { }
15558 +#endif
15559 +
15560 +#if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_X86_SMP)
15561 +/*
15562 + * Copy data used in early init routines from the initial arrays to the
15563 + * per cpu data areas. These arrays then become expendable and the
15564 + * *_early_ptr's are zeroed indicating that the static arrays are gone.
15565 + */
15566 +static void __init setup_per_cpu_maps(void)
15567 +{
15568 +#ifndef CONFIG_XEN
15569 + int cpu;
15570 +
15571 + for_each_possible_cpu(cpu) {
15572 + per_cpu(x86_cpu_to_apicid, cpu) =
15573 + early_per_cpu_map(x86_cpu_to_apicid, cpu);
15574 + per_cpu(x86_bios_cpu_apicid, cpu) =
15575 + early_per_cpu_map(x86_bios_cpu_apicid, cpu);
15576 +#ifdef X86_64_NUMA
15577 + per_cpu(x86_cpu_to_node_map, cpu) =
15578 + early_per_cpu_map(x86_cpu_to_node_map, cpu);
15579 +#endif
15580 + }
15581 +
15582 + /* indicate the early static arrays will soon be gone */
15583 + early_per_cpu_ptr(x86_cpu_to_apicid) = NULL;
15584 + early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
15585 +#ifdef X86_64_NUMA
15586 + early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
15587 +#endif
15588 +#endif
15589 +}
15590 +
15591 +#ifdef CONFIG_X86_32
15592 +/*
15593 + * Great future not-so-futuristic plan: make i386 and x86_64 do it
15594 + * the same way
15595 + */
15596 +unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
15597 +EXPORT_SYMBOL(__per_cpu_offset);
15598 +static inline void setup_cpu_pda_map(void) { }
15599 +
15600 +#elif !defined(CONFIG_SMP)
15601 +static inline void setup_cpu_pda_map(void) { }
15602 +
15603 +#else /* CONFIG_SMP && CONFIG_X86_64 */
15604 +
15605 +/*
15606 + * Allocate cpu_pda pointer table and array via alloc_bootmem.
15607 + */
15608 +static void __init setup_cpu_pda_map(void)
15609 +{
15610 + char *pda;
15611 + struct x8664_pda **new_cpu_pda;
15612 + unsigned long size;
15613 + int cpu;
15614 +
15615 + size = roundup(sizeof(struct x8664_pda), cache_line_size());
15616 +
15617 + /* allocate cpu_pda array and pointer table */
15618 + {
15619 + unsigned long tsize = nr_cpu_ids * sizeof(void *);
15620 + unsigned long asize = size * (nr_cpu_ids - 1);
15621 +
15622 + tsize = roundup(tsize, cache_line_size());
15623 + new_cpu_pda = alloc_bootmem(tsize + asize);
15624 + pda = (char *)new_cpu_pda + tsize;
15625 + }
15626 +
15627 + /* initialize pointer table to static pda's */
15628 + for_each_possible_cpu(cpu) {
15629 + if (cpu == 0) {
15630 + /* leave boot cpu pda in place */
15631 + new_cpu_pda[0] = cpu_pda(0);
15632 + continue;
15633 + }
15634 + new_cpu_pda[cpu] = (struct x8664_pda *)pda;
15635 + new_cpu_pda[cpu]->in_bootmem = 1;
15636 + pda += size;
15637 + }
15638 +
15639 + /* point to new pointer table */
15640 + _cpu_pda = new_cpu_pda;
15641 +}
15642 +#endif
15643 +
15644 +/*
15645 + * Great future plan:
15646 + * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
15647 + * Always point %gs to its beginning
15648 + */
15649 +void __init setup_per_cpu_areas(void)
15650 +{
15651 + ssize_t size = PERCPU_ENOUGH_ROOM;
15652 + char *ptr;
15653 + int cpu;
15654 +
15655 + /* Setup cpu_pda map */
15656 + setup_cpu_pda_map();
15657 +
15658 + /* Copy section for each CPU (we discard the original) */
15659 + size = PERCPU_ENOUGH_ROOM;
15660 + printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n",
15661 + size);
15662 +
15663 + for_each_possible_cpu(cpu) {
15664 +#ifndef CONFIG_NEED_MULTIPLE_NODES
15665 + ptr = alloc_bootmem_pages(size);
15666 +#else
15667 + int node = early_cpu_to_node(cpu);
15668 + if (!node_online(node) || !NODE_DATA(node)) {
15669 + ptr = alloc_bootmem_pages(size);
15670 + printk(KERN_INFO
15671 + "cpu %d has no node %d or node-local memory\n",
15672 + cpu, node);
15673 + }
15674 + else
15675 + ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
15676 +#endif
15677 + per_cpu_offset(cpu) = ptr - __per_cpu_start;
15678 + memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
15679 +
15680 + }
15681 +
15682 + printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n",
15683 + NR_CPUS, nr_cpu_ids, nr_node_ids);
15684 +
15685 + /* Setup percpu data maps */
15686 + setup_per_cpu_maps();
15687 +
15688 + /* Setup node to cpumask map */
15689 + setup_node_to_cpumask_map();
15690 +}
15691 +
15692 +#endif
15693 +
15694 +#ifdef X86_64_NUMA
15695 +
15696 +/*
15697 + * Allocate node_to_cpumask_map based on number of available nodes
15698 + * Requires node_possible_map to be valid.
15699 + *
15700 + * Note: node_to_cpumask() is not valid until after this is done.
15701 + */
15702 +static void __init setup_node_to_cpumask_map(void)
15703 +{
15704 + unsigned int node, num = 0;
15705 + cpumask_t *map;
15706 +
15707 + /* setup nr_node_ids if not done yet */
15708 + if (nr_node_ids == MAX_NUMNODES) {
15709 + for_each_node_mask(node, node_possible_map)
15710 + num = node;
15711 + nr_node_ids = num + 1;
15712 + }
15713 +
15714 + /* allocate the map */
15715 + map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t));
15716 +
15717 + pr_debug(KERN_DEBUG "Node to cpumask map at %p for %d nodes\n",
15718 + map, nr_node_ids);
15719 +
15720 + /* node_to_cpumask() will now work */
15721 + node_to_cpumask_map = map;
15722 +}
15723 +
15724 +void __cpuinit numa_set_node(int cpu, int node)
15725 +{
15726 + int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
15727 +
15728 + if (cpu_pda(cpu) && node != NUMA_NO_NODE)
15729 + cpu_pda(cpu)->nodenumber = node;
15730 +
15731 + if (cpu_to_node_map)
15732 + cpu_to_node_map[cpu] = node;
15733 +
15734 + else if (per_cpu_offset(cpu))
15735 + per_cpu(x86_cpu_to_node_map, cpu) = node;
15736 +
15737 + else
15738 + pr_debug("Setting node for non-present cpu %d\n", cpu);
15739 +}
15740 +
15741 +void __cpuinit numa_clear_node(int cpu)
15742 +{
15743 + numa_set_node(cpu, NUMA_NO_NODE);
15744 +}
15745 +
15746 +#ifndef CONFIG_DEBUG_PER_CPU_MAPS
15747 +
15748 +void __cpuinit numa_add_cpu(int cpu)
15749 +{
15750 + cpu_set(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
15751 +}
15752 +
15753 +void __cpuinit numa_remove_cpu(int cpu)
15754 +{
15755 + cpu_clear(cpu, node_to_cpumask_map[cpu_to_node(cpu)]);
15756 +}
15757 +
15758 +#else /* CONFIG_DEBUG_PER_CPU_MAPS */
15759 +
15760 +/*
15761 + * --------- debug versions of the numa functions ---------
15762 + */
15763 +static void __cpuinit numa_set_cpumask(int cpu, int enable)
15764 +{
15765 + int node = cpu_to_node(cpu);
15766 + cpumask_t *mask;
15767 + char buf[64];
15768 +
15769 + if (node_to_cpumask_map == NULL) {
15770 + printk(KERN_ERR "node_to_cpumask_map NULL\n");
15771 + dump_stack();
15772 + return;
15773 + }
15774 +
15775 + mask = &node_to_cpumask_map[node];
15776 + if (enable)
15777 + cpu_set(cpu, *mask);
15778 + else
15779 + cpu_clear(cpu, *mask);
15780 +
15781 + cpulist_scnprintf(buf, sizeof(buf), *mask);
15782 + printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
15783 + enable? "numa_add_cpu":"numa_remove_cpu", cpu, node, buf);
15784 + }
15785 +
15786 +void __cpuinit numa_add_cpu(int cpu)
15787 +{
15788 + numa_set_cpumask(cpu, 1);
15789 +}
15790 +
15791 +void __cpuinit numa_remove_cpu(int cpu)
15792 +{
15793 + numa_set_cpumask(cpu, 0);
15794 +}
15795 +
15796 +int cpu_to_node(int cpu)
15797 +{
15798 + if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
15799 + printk(KERN_WARNING
15800 + "cpu_to_node(%d): usage too early!\n", cpu);
15801 + dump_stack();
15802 + return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
15803 + }
15804 + return per_cpu(x86_cpu_to_node_map, cpu);
15805 +}
15806 +EXPORT_SYMBOL(cpu_to_node);
15807 +
15808 +/*
15809 + * Same function as cpu_to_node() but used if called before the
15810 + * per_cpu areas are setup.
15811 + */
15812 +int early_cpu_to_node(int cpu)
15813 +{
15814 + if (early_per_cpu_ptr(x86_cpu_to_node_map))
15815 + return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
15816 +
15817 + if (!per_cpu_offset(cpu)) {
15818 + printk(KERN_WARNING
15819 + "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
15820 + dump_stack();
15821 + return NUMA_NO_NODE;
15822 + }
15823 + return per_cpu(x86_cpu_to_node_map, cpu);
15824 +}
15825 +
15826 +
15827 +/* empty cpumask */
15828 +static const cpumask_t cpu_mask_none;
15829 +
15830 +/*
15831 + * Returns a pointer to the bitmask of CPUs on Node 'node'.
15832 + */
15833 +const cpumask_t *_node_to_cpumask_ptr(int node)
15834 +{
15835 + if (node_to_cpumask_map == NULL) {
15836 + printk(KERN_WARNING
15837 + "_node_to_cpumask_ptr(%d): no node_to_cpumask_map!\n",
15838 + node);
15839 + dump_stack();
15840 + return (const cpumask_t *)&cpu_online_map;
15841 + }
15842 + if (node >= nr_node_ids) {
15843 + printk(KERN_WARNING
15844 + "_node_to_cpumask_ptr(%d): node > nr_node_ids(%d)\n",
15845 + node, nr_node_ids);
15846 + dump_stack();
15847 + return &cpu_mask_none;
15848 + }
15849 + return &node_to_cpumask_map[node];
15850 +}
15851 +EXPORT_SYMBOL(_node_to_cpumask_ptr);
15852 +
15853 +/*
15854 + * Returns a bitmask of CPUs on Node 'node'.
15855 + *
15856 + * Side note: this function creates the returned cpumask on the stack
15857 + * so with a high NR_CPUS count, excessive stack space is used. The
15858 + * node_to_cpumask_ptr function should be used whenever possible.
15859 + */
15860 +cpumask_t node_to_cpumask(int node)
15861 +{
15862 + if (node_to_cpumask_map == NULL) {
15863 + printk(KERN_WARNING
15864 + "node_to_cpumask(%d): no node_to_cpumask_map!\n", node);
15865 + dump_stack();
15866 + return cpu_online_map;
15867 + }
15868 + if (node >= nr_node_ids) {
15869 + printk(KERN_WARNING
15870 + "node_to_cpumask(%d): node > nr_node_ids(%d)\n",
15871 + node, nr_node_ids);
15872 + dump_stack();
15873 + return cpu_mask_none;
15874 + }
15875 + return node_to_cpumask_map[node];
15876 +}
15877 +EXPORT_SYMBOL(node_to_cpumask);
15878 +
15879 +/*
15880 + * --------- end of debug versions of the numa functions ---------
15881 + */
15882 +
15883 +#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
15884 +
15885 +#endif /* X86_64_NUMA */
15886 +
15887 --- sle11-2009-06-04.orig/arch/x86/kernel/smp-xen.c 2009-03-16 16:38:05.000000000 +0100
15888 +++ sle11-2009-06-04/arch/x86/kernel/smp-xen.c 2009-06-04 10:21:39.000000000 +0200
15889 @@ -121,132 +121,14 @@ void xen_smp_send_reschedule(int cpu)
15890 send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
15891 }
15892
15893 -/*
15894 - * Structure and data for smp_call_function(). This is designed to minimise
15895 - * static memory requirements. It also looks cleaner.
15896 - */
15897 -static DEFINE_SPINLOCK(call_lock);
15898 -
15899 -struct call_data_struct {
15900 - void (*func) (void *info);
15901 - void *info;
15902 - atomic_t started;
15903 - atomic_t finished;
15904 - int wait;
15905 -};
15906 -
15907 -void lock_ipi_call_lock(void)
15908 +void xen_send_call_func_single_ipi(int cpu)
15909 {
15910 - spin_lock_irq(&call_lock);
15911 + send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNC_SINGLE_VECTOR);
15912 }
15913
15914 -void unlock_ipi_call_lock(void)
15915 +void xen_send_call_func_ipi(cpumask_t mask)
15916 {
15917 - spin_unlock_irq(&call_lock);
15918 -}
15919 -
15920 -static struct call_data_struct *call_data;
15921 -
15922 -static void __smp_call_function(void (*func) (void *info), void *info,
15923 - int nonatomic, int wait)
15924 -{
15925 - struct call_data_struct data;
15926 - int cpus = num_online_cpus() - 1;
15927 -
15928 - if (!cpus)
15929 - return;
15930 -
15931 - data.func = func;
15932 - data.info = info;
15933 - atomic_set(&data.started, 0);
15934 - data.wait = wait;
15935 - if (wait)
15936 - atomic_set(&data.finished, 0);
15937 -
15938 - call_data = &data;
15939 - mb();
15940 -
15941 - /* Send a message to all other CPUs and wait for them to respond */
15942 - send_IPI_allbutself(CALL_FUNCTION_VECTOR);
15943 -
15944 - /* Wait for response */
15945 - while (atomic_read(&data.started) != cpus)
15946 - cpu_relax();
15947 -
15948 - if (wait)
15949 - while (atomic_read(&data.finished) != cpus)
15950 - cpu_relax();
15951 -}
15952 -
15953 -
15954 -/**
15955 - * smp_call_function_mask(): Run a function on a set of other CPUs.
15956 - * @mask: The set of cpus to run on. Must not include the current cpu.
15957 - * @func: The function to run. This must be fast and non-blocking.
15958 - * @info: An arbitrary pointer to pass to the function.
15959 - * @wait: If true, wait (atomically) until function has completed on other CPUs.
15960 - *
15961 - * Returns 0 on success, else a negative status code.
15962 - *
15963 - * If @wait is true, then returns once @func has returned; otherwise
15964 - * it returns just before the target cpu calls @func.
15965 - *
15966 - * You must not call this function with disabled interrupts or from a
15967 - * hardware interrupt handler or from a bottom half handler.
15968 - */
15969 -int
15970 -xen_smp_call_function_mask(cpumask_t mask,
15971 - void (*func)(void *), void *info,
15972 - int wait)
15973 -{
15974 - struct call_data_struct data;
15975 - cpumask_t allbutself;
15976 - int cpus;
15977 -
15978 - /* Can deadlock when called with interrupts disabled */
15979 - WARN_ON(irqs_disabled());
15980 -
15981 - /* Holding any lock stops cpus from going down. */
15982 - spin_lock(&call_lock);
15983 -
15984 - allbutself = cpu_online_map;
15985 - cpu_clear(smp_processor_id(), allbutself);
15986 -
15987 - cpus_and(mask, mask, allbutself);
15988 - cpus = cpus_weight(mask);
15989 -
15990 - if (!cpus) {
15991 - spin_unlock(&call_lock);
15992 - return 0;
15993 - }
15994 -
15995 - data.func = func;
15996 - data.info = info;
15997 - atomic_set(&data.started, 0);
15998 - data.wait = wait;
15999 - if (wait)
16000 - atomic_set(&data.finished, 0);
16001 -
16002 - call_data = &data;
16003 - wmb();
16004 -
16005 - /* Send a message to other CPUs */
16006 - if (cpus_equal(mask, allbutself) &&
16007 - cpus_equal(cpu_online_map, cpu_callout_map))
16008 - send_IPI_allbutself(CALL_FUNCTION_VECTOR);
16009 - else
16010 - send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
16011 -
16012 - /* Wait for response */
16013 - while (atomic_read(&data.started) != cpus)
16014 - cpu_relax();
16015 -
16016 - if (wait)
16017 - while (atomic_read(&data.finished) != cpus)
16018 - cpu_relax();
16019 - spin_unlock(&call_lock);
16020 -
16021 - return 0;
16022 + send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
16023 }
16024
16025 static void stop_this_cpu(void *dummy)
16026 @@ -268,15 +150,10 @@ static void stop_this_cpu(void *dummy)
16027
16028 void xen_smp_send_stop(void)
16029 {
16030 - int nolock;
16031 unsigned long flags;
16032
16033 - /* Don't deadlock on the call lock in panic */
16034 - nolock = !spin_trylock(&call_lock);
16035 + smp_call_function(stop_this_cpu, NULL, 0);
16036 local_irq_save(flags);
16037 - __smp_call_function(stop_this_cpu, NULL, 0, 0);
16038 - if (!nolock)
16039 - spin_unlock(&call_lock);
16040 disable_all_local_evtchn();
16041 local_irq_restore(flags);
16042 }
16043 @@ -298,21 +175,8 @@ irqreturn_t smp_reschedule_interrupt(int
16044
16045 irqreturn_t smp_call_function_interrupt(int irq, void *dev_id)
16046 {
16047 - void (*func) (void *info) = call_data->func;
16048 - void *info = call_data->info;
16049 - int wait = call_data->wait;
16050 -
16051 - /*
16052 - * Notify initiating CPU that I've grabbed the data and am
16053 - * about to execute the function
16054 - */
16055 - mb();
16056 - atomic_inc(&call_data->started);
16057 - /*
16058 - * At this point the info structure may be out of scope unless wait==1
16059 - */
16060 irq_enter();
16061 - (*func)(info);
16062 + generic_smp_call_function_interrupt();
16063 #ifdef CONFIG_X86_32
16064 __get_cpu_var(irq_stat).irq_call_count++;
16065 #else
16066 @@ -320,10 +184,19 @@ irqreturn_t smp_call_function_interrupt(
16067 #endif
16068 irq_exit();
16069
16070 - if (wait) {
16071 - mb();
16072 - atomic_inc(&call_data->finished);
16073 - }
16074 + return IRQ_HANDLED;
16075 +}
16076 +
16077 +irqreturn_t smp_call_function_single_interrupt(int irq, void *dev_id)
16078 +{
16079 + irq_enter();
16080 + generic_smp_call_function_single_interrupt();
16081 +#ifdef CONFIG_X86_32
16082 + __get_cpu_var(irq_stat).irq_call_count++;
16083 +#else
16084 + add_pda(irq_call_count, 1);
16085 +#endif
16086 + irq_exit();
16087
16088 return IRQ_HANDLED;
16089 }
16090 --- sle11-2009-06-04.orig/arch/x86/kernel/time_32-xen.c 2009-03-24 10:13:09.000000000 +0100
16091 +++ sle11-2009-06-04/arch/x86/kernel/time_32-xen.c 2009-06-04 10:21:39.000000000 +0200
16092 @@ -468,7 +468,7 @@ irqreturn_t timer_interrupt(int irq, voi
16093
16094 /* Keep nmi watchdog up to date */
16095 #ifdef __i386__
16096 - per_cpu(irq_stat, smp_processor_id()).irq0_irqs++;
16097 + x86_add_percpu(irq_stat.irq0_irqs, 1);
16098 #else
16099 add_pda(irq0_irqs, 1);
16100 #endif
16101 @@ -746,9 +746,7 @@ void __init time_init(void)
16102
16103 update_wallclock();
16104
16105 -#ifndef CONFIG_X86_64
16106 use_tsc_delay();
16107 -#endif
16108
16109 /* Cannot request_irq() until kmem is initialised. */
16110 late_time_init = setup_cpu0_timer_irq;
16111 @@ -805,7 +803,8 @@ static void stop_hz_timer(void)
16112
16113 /* Leave ourselves in tick mode if rcu or softirq or timer pending. */
16114 if (rcu_needs_cpu(cpu) || local_softirq_pending() ||
16115 - (j = next_timer_interrupt(), time_before_eq(j, jiffies))) {
16116 + (j = get_next_timer_interrupt(jiffies),
16117 + time_before_eq(j, jiffies))) {
16118 cpu_clear(cpu, nohz_cpu_mask);
16119 j = jiffies + 1;
16120 }
16121 --- sle11-2009-06-04.orig/arch/x86/kernel/traps_32-xen.c 2009-03-16 16:38:05.000000000 +0100
16122 +++ sle11-2009-06-04/arch/x86/kernel/traps_32-xen.c 2009-06-04 10:21:39.000000000 +0200
16123 @@ -1,5 +1,6 @@
16124 /*
16125 * Copyright (C) 1991, 1992 Linus Torvalds
16126 + * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
16127 *
16128 * Pentium III FXSR, SSE support
16129 * Gareth Hughes <gareth@valinux.com>, May 2000
16130 @@ -57,11 +58,10 @@
16131 #include <asm/nmi.h>
16132 #include <asm/smp.h>
16133 #include <asm/io.h>
16134 +#include <asm/traps.h>
16135
16136 #include "mach_traps.h"
16137
16138 -int panic_on_unrecovered_nmi;
16139 -
16140 #ifndef CONFIG_XEN
16141 DECLARE_BITMAP(used_vectors, NR_VECTORS);
16142 EXPORT_SYMBOL_GPL(used_vectors);
16143 @@ -82,43 +82,22 @@ gate_desc idt_table[256]
16144 __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, };
16145 #endif
16146
16147 -asmlinkage void divide_error(void);
16148 -asmlinkage void debug(void);
16149 -asmlinkage void nmi(void);
16150 -asmlinkage void int3(void);
16151 -asmlinkage void overflow(void);
16152 -asmlinkage void bounds(void);
16153 -asmlinkage void invalid_op(void);
16154 -asmlinkage void device_not_available(void);
16155 -asmlinkage void coprocessor_segment_overrun(void);
16156 -asmlinkage void invalid_TSS(void);
16157 -asmlinkage void segment_not_present(void);
16158 -asmlinkage void stack_segment(void);
16159 -asmlinkage void general_protection(void);
16160 -asmlinkage void page_fault(void);
16161 -asmlinkage void coprocessor_error(void);
16162 -asmlinkage void simd_coprocessor_error(void);
16163 -asmlinkage void alignment_check(void);
16164 -#ifndef CONFIG_XEN
16165 -asmlinkage void spurious_interrupt_bug(void);
16166 -#else
16167 -asmlinkage void fixup_4gb_segment(void);
16168 -#endif
16169 -asmlinkage void machine_check(void);
16170 -
16171 +int panic_on_unrecovered_nmi;
16172 int kstack_depth_to_print = 24;
16173 static unsigned int code_bytes = 64;
16174 +static int ignore_nmis;
16175 +static int die_counter;
16176
16177 void printk_address(unsigned long address, int reliable)
16178 {
16179 #ifdef CONFIG_KALLSYMS
16180 - char namebuf[KSYM_NAME_LEN];
16181 unsigned long offset = 0;
16182 unsigned long symsize;
16183 const char *symname;
16184 - char reliab[4] = "";
16185 - char *delim = ":";
16186 char *modname;
16187 + char *delim = ":";
16188 + char namebuf[KSYM_NAME_LEN];
16189 + char reliab[4] = "";
16190
16191 symname = kallsyms_lookup(address, &symsize, &offset,
16192 &modname, namebuf);
16193 @@ -138,22 +117,23 @@ void printk_address(unsigned long addres
16194 #endif
16195 }
16196
16197 -static inline int valid_stack_ptr(struct thread_info *tinfo, void *p, unsigned size)
16198 +static inline int valid_stack_ptr(struct thread_info *tinfo,
16199 + void *p, unsigned int size)
16200 {
16201 - return p > (void *)tinfo &&
16202 - p <= (void *)tinfo + THREAD_SIZE - size;
16203 + void *t = tinfo;
16204 + return p > t && p <= t + THREAD_SIZE - size;
16205 }
16206
16207 /* The form of the top of the frame on the stack */
16208 struct stack_frame {
16209 - struct stack_frame *next_frame;
16210 - unsigned long return_address;
16211 + struct stack_frame *next_frame;
16212 + unsigned long return_address;
16213 };
16214
16215 static inline unsigned long
16216 print_context_stack(struct thread_info *tinfo,
16217 - unsigned long *stack, unsigned long bp,
16218 - const struct stacktrace_ops *ops, void *data)
16219 + unsigned long *stack, unsigned long bp,
16220 + const struct stacktrace_ops *ops, void *data)
16221 {
16222 struct stack_frame *frame = (struct stack_frame *)bp;
16223
16224 @@ -175,8 +155,6 @@ print_context_stack(struct thread_info *
16225 return bp;
16226 }
16227
16228 -#define MSG(msg) ops->warning(data, msg)
16229 -
16230 void dump_trace(struct task_struct *task, struct pt_regs *regs,
16231 unsigned long *stack, unsigned long bp,
16232 const struct stacktrace_ops *ops, void *data)
16233 @@ -186,7 +164,6 @@ void dump_trace(struct task_struct *task
16234
16235 if (!stack) {
16236 unsigned long dummy;
16237 -
16238 stack = &dummy;
16239 if (task != current)
16240 stack = (unsigned long *)task->thread.sp;
16241 @@ -204,7 +181,7 @@ void dump_trace(struct task_struct *task
16242 }
16243 #endif
16244
16245 - while (1) {
16246 + for (;;) {
16247 struct thread_info *context;
16248
16249 context = (struct thread_info *)
16250 @@ -256,15 +233,15 @@ static void print_trace_address(void *da
16251 }
16252
16253 static const struct stacktrace_ops print_trace_ops = {
16254 - .warning = print_trace_warning,
16255 - .warning_symbol = print_trace_warning_symbol,
16256 - .stack = print_trace_stack,
16257 - .address = print_trace_address,
16258 + .warning = print_trace_warning,
16259 + .warning_symbol = print_trace_warning_symbol,
16260 + .stack = print_trace_stack,
16261 + .address = print_trace_address,
16262 };
16263
16264 static void
16265 show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
16266 - unsigned long *stack, unsigned long bp, char *log_lvl)
16267 + unsigned long *stack, unsigned long bp, char *log_lvl)
16268 {
16269 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
16270 printk("%s =======================\n", log_lvl);
16271 @@ -359,15 +336,14 @@ void show_registers(struct pt_regs *regs
16272 printk(KERN_EMERG "Code: ");
16273
16274 ip = (u8 *)regs->ip - code_prologue;
16275 - if (ip < (u8 *)PAGE_OFFSET ||
16276 - probe_kernel_address(ip, c)) {
16277 + if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
16278 /* try starting at EIP */
16279 ip = (u8 *)regs->ip;
16280 code_len = code_len - code_prologue + 1;
16281 }
16282 for (i = 0; i < code_len; i++, ip++) {
16283 if (ip < (u8 *)PAGE_OFFSET ||
16284 - probe_kernel_address(ip, c)) {
16285 + probe_kernel_address(ip, c)) {
16286 printk(" Bad EIP value.");
16287 break;
16288 }
16289 @@ -392,7 +368,53 @@ int is_valid_bugaddr(unsigned long ip)
16290 return ud2 == 0x0b0f;
16291 }
16292
16293 -static int die_counter;
16294 +static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
16295 +static int die_owner = -1;
16296 +static unsigned int die_nest_count;
16297 +
16298 +unsigned __kprobes long oops_begin(void)
16299 +{
16300 + unsigned long flags;
16301 +
16302 + oops_enter();
16303 +
16304 + if (die_owner != raw_smp_processor_id()) {
16305 + console_verbose();
16306 + raw_local_irq_save(flags);
16307 + __raw_spin_lock(&die_lock);
16308 + die_owner = smp_processor_id();
16309 + die_nest_count = 0;
16310 + bust_spinlocks(1);
16311 + } else {
16312 + raw_local_irq_save(flags);
16313 + }
16314 + die_nest_count++;
16315 + return flags;
16316 +}
16317 +
16318 +void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
16319 +{
16320 + bust_spinlocks(0);
16321 + die_owner = -1;
16322 + add_taint(TAINT_DIE);
16323 + __raw_spin_unlock(&die_lock);
16324 + raw_local_irq_restore(flags);
16325 +
16326 + if (!regs)
16327 + return;
16328 +
16329 + if (kexec_should_crash(current))
16330 + crash_kexec(regs);
16331 +
16332 + if (in_interrupt())
16333 + panic("Fatal exception in interrupt");
16334 +
16335 + if (panic_on_oops)
16336 + panic("Fatal exception");
16337 +
16338 + oops_exit();
16339 + do_exit(signr);
16340 +}
16341
16342 int __kprobes __die(const char *str, struct pt_regs *regs, long err)
16343 {
16344 @@ -410,26 +432,22 @@ int __kprobes __die(const char *str, str
16345 printk("DEBUG_PAGEALLOC");
16346 #endif
16347 printk("\n");
16348 -
16349 if (notify_die(DIE_OOPS, str, regs, err,
16350 - current->thread.trap_no, SIGSEGV) != NOTIFY_STOP) {
16351 + current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
16352 + return 1;
16353
16354 - show_registers(regs);
16355 - /* Executive summary in case the oops scrolled away */
16356 - sp = (unsigned long) (&regs->sp);
16357 - savesegment(ss, ss);
16358 - if (user_mode(regs)) {
16359 - sp = regs->sp;
16360 - ss = regs->ss & 0xffff;
16361 - }
16362 - printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
16363 - print_symbol("%s", regs->ip);
16364 - printk(" SS:ESP %04x:%08lx\n", ss, sp);
16365 -
16366 - return 0;
16367 - }
16368 -
16369 - return 1;
16370 + show_registers(regs);
16371 + /* Executive summary in case the oops scrolled away */
16372 + sp = (unsigned long) (&regs->sp);
16373 + savesegment(ss, ss);
16374 + if (user_mode(regs)) {
16375 + sp = regs->sp;
16376 + ss = regs->ss & 0xffff;
16377 + }
16378 + printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
16379 + print_symbol("%s", regs->ip);
16380 + printk(" SS:ESP %04x:%08lx\n", ss, sp);
16381 + return 0;
16382 }
16383
16384 /*
16385 @@ -438,31 +456,9 @@ int __kprobes __die(const char *str, str
16386 */
16387 void die(const char *str, struct pt_regs *regs, long err)
16388 {
16389 - static struct {
16390 - raw_spinlock_t lock;
16391 - u32 lock_owner;
16392 - int lock_owner_depth;
16393 - } die = {
16394 - .lock = __RAW_SPIN_LOCK_UNLOCKED,
16395 - .lock_owner = -1,
16396 - .lock_owner_depth = 0
16397 - };
16398 - unsigned long flags;
16399 -
16400 - oops_enter();
16401 + unsigned long flags = oops_begin();
16402
16403 - if (die.lock_owner != raw_smp_processor_id()) {
16404 - console_verbose();
16405 - raw_local_irq_save(flags);
16406 - __raw_spin_lock(&die.lock);
16407 - die.lock_owner = smp_processor_id();
16408 - die.lock_owner_depth = 0;
16409 - bust_spinlocks(1);
16410 - } else {
16411 - raw_local_irq_save(flags);
16412 - }
16413 -
16414 - if (++die.lock_owner_depth < 3) {
16415 + if (die_nest_count < 3) {
16416 report_bug(regs->ip, regs);
16417
16418 if (__die(str, regs, err))
16419 @@ -471,26 +467,7 @@ void die(const char *str, struct pt_regs
16420 printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
16421 }
16422
16423 - bust_spinlocks(0);
16424 - die.lock_owner = -1;
16425 - add_taint(TAINT_DIE);
16426 - __raw_spin_unlock(&die.lock);
16427 - raw_local_irq_restore(flags);
16428 -
16429 - if (!regs)
16430 - return;
16431 -
16432 - if (kexec_should_crash(current))
16433 - crash_kexec(regs);
16434 -
16435 - if (in_interrupt())
16436 - panic("Fatal exception in interrupt");
16437 -
16438 - if (panic_on_oops)
16439 - panic("Fatal exception");
16440 -
16441 - oops_exit();
16442 - do_exit(SIGSEGV);
16443 + oops_end(flags, regs, SIGSEGV);
16444 }
16445
16446 static inline void
16447 @@ -554,7 +531,7 @@ void do_##name(struct pt_regs *regs, lon
16448 { \
16449 trace_hardirqs_fixup(); \
16450 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
16451 - == NOTIFY_STOP) \
16452 + == NOTIFY_STOP) \
16453 return; \
16454 do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \
16455 }
16456 @@ -570,7 +547,7 @@ void do_##name(struct pt_regs *regs, lon
16457 info.si_code = sicode; \
16458 info.si_addr = (void __user *)siaddr; \
16459 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
16460 - == NOTIFY_STOP) \
16461 + == NOTIFY_STOP) \
16462 return; \
16463 do_trap(trapnr, signr, str, 0, regs, error_code, &info); \
16464 }
16465 @@ -579,7 +556,7 @@ void do_##name(struct pt_regs *regs, lon
16466 void do_##name(struct pt_regs *regs, long error_code) \
16467 { \
16468 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
16469 - == NOTIFY_STOP) \
16470 + == NOTIFY_STOP) \
16471 return; \
16472 do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \
16473 }
16474 @@ -594,28 +571,29 @@ void do_##name(struct pt_regs *regs, lon
16475 info.si_addr = (void __user *)siaddr; \
16476 trace_hardirqs_fixup(); \
16477 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
16478 - == NOTIFY_STOP) \
16479 + == NOTIFY_STOP) \
16480 return; \
16481 do_trap(trapnr, signr, str, 1, regs, error_code, &info); \
16482 }
16483
16484 -DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
16485 +DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
16486 #ifndef CONFIG_KPROBES
16487 DO_VM86_ERROR(3, SIGTRAP, "int3", int3)
16488 #endif
16489 DO_VM86_ERROR(4, SIGSEGV, "overflow", overflow)
16490 DO_VM86_ERROR(5, SIGSEGV, "bounds", bounds)
16491 -DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0)
16492 -DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
16493 +DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0)
16494 +DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
16495 DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
16496 -DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
16497 -DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
16498 +DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
16499 +DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
16500 DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0)
16501 DO_ERROR_INFO(32, SIGILL, "iret exception", iret_error, ILL_BADSTK, 0, 1)
16502
16503 -void __kprobes do_general_protection(struct pt_regs * regs,
16504 - long error_code)
16505 +void __kprobes
16506 +do_general_protection(struct pt_regs *regs, long error_code)
16507 {
16508 + struct task_struct *tsk;
16509 struct thread_struct *thread;
16510
16511 thread = &current->thread;
16512 @@ -623,23 +601,24 @@ void __kprobes do_general_protection(str
16513 if (regs->flags & X86_VM_MASK)
16514 goto gp_in_vm86;
16515
16516 + tsk = current;
16517 if (!user_mode(regs))
16518 goto gp_in_kernel;
16519
16520 - current->thread.error_code = error_code;
16521 - current->thread.trap_no = 13;
16522 + tsk->thread.error_code = error_code;
16523 + tsk->thread.trap_no = 13;
16524
16525 - if (show_unhandled_signals && unhandled_signal(current, SIGSEGV) &&
16526 - printk_ratelimit()) {
16527 + if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
16528 + printk_ratelimit()) {
16529 printk(KERN_INFO
16530 - "%s[%d] general protection ip:%lx sp:%lx error:%lx",
16531 - current->comm, task_pid_nr(current),
16532 - regs->ip, regs->sp, error_code);
16533 + "%s[%d] general protection ip:%lx sp:%lx error:%lx",
16534 + tsk->comm, task_pid_nr(tsk),
16535 + regs->ip, regs->sp, error_code);
16536 print_vma_addr(" in ", regs->ip);
16537 printk("\n");
16538 }
16539
16540 - force_sig(SIGSEGV, current);
16541 + force_sig(SIGSEGV, tsk);
16542 return;
16543
16544 gp_in_vm86:
16545 @@ -648,14 +627,15 @@ gp_in_vm86:
16546 return;
16547
16548 gp_in_kernel:
16549 - if (!fixup_exception(regs)) {
16550 - current->thread.error_code = error_code;
16551 - current->thread.trap_no = 13;
16552 - if (notify_die(DIE_GPF, "general protection fault", regs,
16553 + if (fixup_exception(regs))
16554 + return;
16555 +
16556 + tsk->thread.error_code = error_code;
16557 + tsk->thread.trap_no = 13;
16558 + if (notify_die(DIE_GPF, "general protection fault", regs,
16559 error_code, 13, SIGSEGV) == NOTIFY_STOP)
16560 - return;
16561 - die("general protection fault", regs, error_code);
16562 - }
16563 + return;
16564 + die("general protection fault", regs, error_code);
16565 }
16566
16567 static notrace __kprobes void
16568 @@ -722,9 +702,9 @@ unknown_nmi_error(unsigned char reason,
16569
16570 static DEFINE_SPINLOCK(nmi_print_lock);
16571
16572 -void notrace __kprobes die_nmi(struct pt_regs *regs, const char *msg)
16573 +void notrace __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic)
16574 {
16575 - if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 2, SIGINT) == NOTIFY_STOP)
16576 + if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
16577 return;
16578
16579 spin_lock(&nmi_print_lock);
16580 @@ -733,10 +713,12 @@ void notrace __kprobes die_nmi(struct pt
16581 * to get a message out:
16582 */
16583 bust_spinlocks(1);
16584 - printk(KERN_EMERG "%s", msg);
16585 + printk(KERN_EMERG "%s", str);
16586 printk(" on CPU%d, ip %08lx, registers:\n",
16587 smp_processor_id(), regs->ip);
16588 show_registers(regs);
16589 + if (do_panic)
16590 + panic("Non maskable interrupt");
16591 console_silent();
16592 spin_unlock(&nmi_print_lock);
16593 bust_spinlocks(0);
16594 @@ -756,14 +738,17 @@ void notrace __kprobes die_nmi(struct pt
16595 static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
16596 {
16597 unsigned char reason = 0;
16598 + int cpu;
16599
16600 - /* Only the BSP gets external NMIs from the system: */
16601 - if (!smp_processor_id())
16602 + cpu = smp_processor_id();
16603 +
16604 + /* Only the BSP gets external NMIs from the system. */
16605 + if (!cpu)
16606 reason = get_nmi_reason();
16607
16608 if (!(reason & 0xc0)) {
16609 if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
16610 - == NOTIFY_STOP)
16611 + == NOTIFY_STOP)
16612 return;
16613 #ifdef CONFIG_X86_LOCAL_APIC
16614 /*
16615 @@ -772,7 +757,7 @@ static notrace __kprobes void default_do
16616 */
16617 if (nmi_watchdog_tick(regs, reason))
16618 return;
16619 - if (!do_nmi_callback(regs, smp_processor_id()))
16620 + if (!do_nmi_callback(regs, cpu))
16621 unknown_nmi_error(reason, regs);
16622 #else
16623 unknown_nmi_error(reason, regs);
16624 @@ -782,6 +767,8 @@ static notrace __kprobes void default_do
16625 }
16626 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
16627 return;
16628 +
16629 + /* AK: following checks seem to be broken on modern chipsets. FIXME */
16630 if (reason & 0x80)
16631 mem_parity_error(reason, regs);
16632 if (reason & 0x40)
16633 @@ -793,8 +780,6 @@ static notrace __kprobes void default_do
16634 reassert_nmi();
16635 }
16636
16637 -static int ignore_nmis;
16638 -
16639 notrace __kprobes void do_nmi(struct pt_regs *regs, long error_code)
16640 {
16641 int cpu;
16642 @@ -879,7 +864,7 @@ void __kprobes do_debug(struct pt_regs *
16643 tsk->thread.debugctlmsr = 0;
16644
16645 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
16646 - SIGTRAP) == NOTIFY_STOP)
16647 + SIGTRAP) == NOTIFY_STOP)
16648 return;
16649 /* It's safe to allow irq's after DR6 has been saved */
16650 if (regs->flags & X86_EFLAGS_IF)
16651 @@ -940,9 +925,8 @@ clear_TF_reenable:
16652 void math_error(void __user *ip)
16653 {
16654 struct task_struct *task;
16655 - unsigned short cwd;
16656 - unsigned short swd;
16657 siginfo_t info;
16658 + unsigned short cwd, swd;
16659
16660 /*
16661 * Save the info for the exception handler and clear the error.
16662 @@ -961,7 +945,7 @@ void math_error(void __user *ip)
16663 * C1 reg you need in case of a stack fault, 0x040 is the stack
16664 * fault bit. We should only be taking one exception at a time,
16665 * so if this combination doesn't produce any single exception,
16666 - * then we have a bad program that isn't syncronizing its FPU usage
16667 + * then we have a bad program that isn't synchronizing its FPU usage
16668 * and it will suffer the consequences since we won't be able to
16669 * fully reproduce the context of the exception
16670 */
16671 @@ -970,7 +954,7 @@ void math_error(void __user *ip)
16672 switch (swd & ~cwd & 0x3f) {
16673 case 0x000: /* No unmasked exception */
16674 return;
16675 - default: /* Multiple exceptions */
16676 + default: /* Multiple exceptions */
16677 break;
16678 case 0x001: /* Invalid Op */
16679 /*
16680 @@ -1006,8 +990,8 @@ void do_coprocessor_error(struct pt_regs
16681 static void simd_math_error(void __user *ip)
16682 {
16683 struct task_struct *task;
16684 - unsigned short mxcsr;
16685 siginfo_t info;
16686 + unsigned short mxcsr;
16687
16688 /*
16689 * Save the info for the exception handler and clear the error.
16690 @@ -1084,7 +1068,7 @@ void do_spurious_interrupt_bug(struct pt
16691
16692 unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp)
16693 {
16694 - struct desc_struct *gdt = __get_cpu_var(gdt_page).gdt;
16695 + struct desc_struct *gdt = get_cpu_gdt_table(smp_processor_id());
16696 unsigned long base = (kesp - uesp) & -THREAD_SIZE;
16697 unsigned long new_kesp = kesp - base;
16698 unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT;
16699 --- sle11-2009-06-04.orig/arch/x86/kernel/traps_64-xen.c 2009-03-16 16:38:05.000000000 +0100
16700 +++ sle11-2009-06-04/arch/x86/kernel/traps_64-xen.c 2009-06-04 10:21:39.000000000 +0200
16701 @@ -10,73 +10,56 @@
16702 * 'Traps.c' handles hardware traps and faults after we have saved some
16703 * state in 'entry.S'.
16704 */
16705 -#include <linux/sched.h>
16706 +#include <linux/moduleparam.h>
16707 +#include <linux/interrupt.h>
16708 +#include <linux/kallsyms.h>
16709 +#include <linux/spinlock.h>
16710 +#include <linux/kprobes.h>
16711 +#include <linux/uaccess.h>
16712 +#include <linux/utsname.h>
16713 +#include <linux/kdebug.h>
16714 #include <linux/kernel.h>
16715 +#include <linux/module.h>
16716 +#include <linux/ptrace.h>
16717 #include <linux/string.h>
16718 +#include <linux/unwind.h>
16719 +#include <linux/delay.h>
16720 #include <linux/errno.h>
16721 -#include <linux/ptrace.h>
16722 +#include <linux/kexec.h>
16723 +#include <linux/sched.h>
16724 #include <linux/timer.h>
16725 -#include <linux/mm.h>
16726 #include <linux/init.h>
16727 -#include <linux/delay.h>
16728 -#include <linux/spinlock.h>
16729 -#include <linux/interrupt.h>
16730 -#include <linux/kallsyms.h>
16731 -#include <linux/module.h>
16732 -#include <linux/moduleparam.h>
16733 -#include <linux/nmi.h>
16734 -#include <linux/kprobes.h>
16735 -#include <linux/kexec.h>
16736 -#include <linux/unwind.h>
16737 -#include <linux/uaccess.h>
16738 #include <linux/bug.h>
16739 -#include <linux/kdebug.h>
16740 -#include <linux/utsname.h>
16741 -
16742 -#include <mach_traps.h>
16743 +#include <linux/nmi.h>
16744 +#include <linux/mm.h>
16745
16746 #if defined(CONFIG_EDAC)
16747 #include <linux/edac.h>
16748 #endif
16749
16750 -#include <asm/system.h>
16751 -#include <asm/io.h>
16752 -#include <asm/atomic.h>
16753 +#include <asm/stacktrace.h>
16754 +#include <asm/processor.h>
16755 #include <asm/debugreg.h>
16756 +#include <asm/atomic.h>
16757 +#include <asm/system.h>
16758 +#include <asm/unwind.h>
16759 #include <asm/desc.h>
16760 #include <asm/i387.h>
16761 -#include <asm/processor.h>
16762 -#include <asm/unwind.h>
16763 +#include <asm/nmi.h>
16764 #include <asm/smp.h>
16765 +#include <asm/io.h>
16766 #include <asm/pgalloc.h>
16767 -#include <asm/pda.h>
16768 #include <asm/proto.h>
16769 -#include <asm/nmi.h>
16770 -#include <asm/stacktrace.h>
16771 +#include <asm/pda.h>
16772 +#include <asm/traps.h>
16773
16774 -asmlinkage void divide_error(void);
16775 -asmlinkage void debug(void);
16776 -asmlinkage void nmi(void);
16777 -asmlinkage void int3(void);
16778 -asmlinkage void overflow(void);
16779 -asmlinkage void bounds(void);
16780 -asmlinkage void invalid_op(void);
16781 -asmlinkage void device_not_available(void);
16782 -asmlinkage void double_fault(void);
16783 -asmlinkage void coprocessor_segment_overrun(void);
16784 -asmlinkage void invalid_TSS(void);
16785 -asmlinkage void segment_not_present(void);
16786 -asmlinkage void stack_segment(void);
16787 -asmlinkage void general_protection(void);
16788 -asmlinkage void page_fault(void);
16789 -asmlinkage void coprocessor_error(void);
16790 -asmlinkage void simd_coprocessor_error(void);
16791 -asmlinkage void reserved(void);
16792 -asmlinkage void alignment_check(void);
16793 -asmlinkage void machine_check(void);
16794 -asmlinkage void spurious_interrupt_bug(void);
16795 +#include <mach_traps.h>
16796
16797 +int panic_on_unrecovered_nmi;
16798 +int kstack_depth_to_print = 12;
16799 static unsigned int code_bytes = 64;
16800 +static int ignore_nmis;
16801 +static int die_counter;
16802
16803 static inline void conditional_sti(struct pt_regs *regs)
16804 {
16805 @@ -100,34 +83,9 @@ static inline void preempt_conditional_c
16806 dec_preempt_count();
16807 }
16808
16809 -int kstack_depth_to_print = 12;
16810 -
16811 void printk_address(unsigned long address, int reliable)
16812 {
16813 -#ifdef CONFIG_KALLSYMS
16814 - unsigned long offset = 0, symsize;
16815 - const char *symname;
16816 - char *modname;
16817 - char *delim = ":";
16818 - char namebuf[KSYM_NAME_LEN];
16819 - char reliab[4] = "";
16820 -
16821 - symname = kallsyms_lookup(address, &symsize, &offset,
16822 - &modname, namebuf);
16823 - if (!symname) {
16824 - printk(" [<%016lx>]\n", address);
16825 - return;
16826 - }
16827 - if (!reliable)
16828 - strcpy(reliab, "? ");
16829 -
16830 - if (!modname)
16831 - modname = delim = "";
16832 - printk(" [<%016lx>] %s%s%s%s%s+0x%lx/0x%lx\n",
16833 - address, reliab, delim, modname, delim, symname, offset, symsize);
16834 -#else
16835 - printk(" [<%016lx>]\n", address);
16836 -#endif
16837 + printk(" [<%016lx>] %s%pS\n", address, reliable ? "": "? ", (void *) address);
16838 }
16839
16840 static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
16841 @@ -206,8 +164,6 @@ static unsigned long *in_exception_stack
16842 return NULL;
16843 }
16844
16845 -#define MSG(txt) ops->warning(data, txt)
16846 -
16847 /*
16848 * x86-64 can have up to three kernel stacks:
16849 * process stack
16850 @@ -234,11 +190,11 @@ struct stack_frame {
16851 unsigned long return_address;
16852 };
16853
16854 -
16855 -static inline unsigned long print_context_stack(struct thread_info *tinfo,
16856 - unsigned long *stack, unsigned long bp,
16857 - const struct stacktrace_ops *ops, void *data,
16858 - unsigned long *end)
16859 +static inline unsigned long
16860 +print_context_stack(struct thread_info *tinfo,
16861 + unsigned long *stack, unsigned long bp,
16862 + const struct stacktrace_ops *ops, void *data,
16863 + unsigned long *end)
16864 {
16865 struct stack_frame *frame = (struct stack_frame *)bp;
16866
16867 @@ -260,7 +216,7 @@ static inline unsigned long print_contex
16868 return bp;
16869 }
16870
16871 -void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
16872 +void dump_trace(struct task_struct *task, struct pt_regs *regs,
16873 unsigned long *stack, unsigned long bp,
16874 const struct stacktrace_ops *ops, void *data)
16875 {
16876 @@ -269,36 +225,34 @@ void dump_trace(struct task_struct *tsk,
16877 unsigned used = 0;
16878 struct thread_info *tinfo;
16879
16880 - if (!tsk)
16881 - tsk = current;
16882 - tinfo = task_thread_info(tsk);
16883 + if (!task)
16884 + task = current;
16885
16886 if (!stack) {
16887 unsigned long dummy;
16888 stack = &dummy;
16889 - if (tsk && tsk != current)
16890 - stack = (unsigned long *)tsk->thread.sp;
16891 + if (task && task != current)
16892 + stack = (unsigned long *)task->thread.sp;
16893 }
16894
16895 #ifdef CONFIG_FRAME_POINTER
16896 if (!bp) {
16897 - if (tsk == current) {
16898 + if (task == current) {
16899 /* Grab bp right from our regs */
16900 - asm("movq %%rbp, %0" : "=r" (bp):);
16901 + asm("movq %%rbp, %0" : "=r" (bp) :);
16902 } else {
16903 /* bp is the last reg pushed by switch_to */
16904 - bp = *(unsigned long *) tsk->thread.sp;
16905 + bp = *(unsigned long *) task->thread.sp;
16906 }
16907 }
16908 #endif
16909
16910 -
16911 -
16912 /*
16913 * Print function call entries in all stacks, starting at the
16914 * current stack address. If the stacks consist of nested
16915 * exceptions
16916 */
16917 + tinfo = task_thread_info(task);
16918 for (;;) {
16919 char *id;
16920 unsigned long *estack_end;
16921 @@ -383,18 +337,24 @@ static const struct stacktrace_ops print
16922 .address = print_trace_address,
16923 };
16924
16925 -void
16926 -show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack,
16927 - unsigned long bp)
16928 +static void
16929 +show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
16930 + unsigned long *stack, unsigned long bp, char *log_lvl)
16931 {
16932 printk("\nCall Trace:\n");
16933 - dump_trace(tsk, regs, stack, bp, &print_trace_ops, NULL);
16934 + dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
16935 printk("\n");
16936 }
16937
16938 +void show_trace(struct task_struct *task, struct pt_regs *regs,
16939 + unsigned long *stack, unsigned long bp)
16940 +{
16941 + show_trace_log_lvl(task, regs, stack, bp, "");
16942 +}
16943 +
16944 static void
16945 -_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *sp,
16946 - unsigned long bp)
16947 +show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
16948 + unsigned long *sp, unsigned long bp, char *log_lvl)
16949 {
16950 unsigned long *stack;
16951 int i;
16952 @@ -406,14 +366,14 @@ _show_stack(struct task_struct *tsk, str
16953 // back trace for this cpu.
16954
16955 if (sp == NULL) {
16956 - if (tsk)
16957 - sp = (unsigned long *)tsk->thread.sp;
16958 + if (task)
16959 + sp = (unsigned long *)task->thread.sp;
16960 else
16961 sp = (unsigned long *)&sp;
16962 }
16963
16964 stack = sp;
16965 - for(i=0; i < kstack_depth_to_print; i++) {
16966 + for (i = 0; i < kstack_depth_to_print; i++) {
16967 if (stack >= irqstack && stack <= irqstack_end) {
16968 if (stack == irqstack_end) {
16969 stack = (unsigned long *) (irqstack_end[-1]);
16970 @@ -428,12 +388,12 @@ _show_stack(struct task_struct *tsk, str
16971 printk(" %016lx", *stack++);
16972 touch_nmi_watchdog();
16973 }
16974 - show_trace(tsk, regs, sp, bp);
16975 + show_trace_log_lvl(task, regs, sp, bp, log_lvl);
16976 }
16977
16978 -void show_stack(struct task_struct *tsk, unsigned long * sp)
16979 +void show_stack(struct task_struct *task, unsigned long *sp)
16980 {
16981 - _show_stack(tsk, NULL, sp, 0);
16982 + show_stack_log_lvl(task, NULL, sp, 0, "");
16983 }
16984
16985 /*
16986 @@ -441,8 +401,8 @@ void show_stack(struct task_struct *tsk,
16987 */
16988 void dump_stack(void)
16989 {
16990 - unsigned long dummy;
16991 unsigned long bp = 0;
16992 + unsigned long stack;
16993
16994 #ifdef CONFIG_FRAME_POINTER
16995 if (!bp)
16996 @@ -454,7 +414,7 @@ void dump_stack(void)
16997 init_utsname()->release,
16998 (int)strcspn(init_utsname()->version, " "),
16999 init_utsname()->version);
17000 - show_trace(NULL, NULL, &dummy, bp);
17001 + show_trace(NULL, NULL, &stack, bp);
17002 }
17003
17004 EXPORT_SYMBOL(dump_stack);
17005 @@ -465,12 +425,8 @@ void show_registers(struct pt_regs *regs
17006 unsigned long sp;
17007 const int cpu = smp_processor_id();
17008 struct task_struct *cur = cpu_pda(cpu)->pcurrent;
17009 - u8 *ip;
17010 - unsigned int code_prologue = code_bytes * 43 / 64;
17011 - unsigned int code_len = code_bytes;
17012
17013 sp = regs->sp;
17014 - ip = (u8 *) regs->ip - code_prologue;
17015 printk("CPU %d ", cpu);
17016 __show_regs(regs);
17017 printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
17018 @@ -481,15 +437,22 @@ void show_registers(struct pt_regs *regs
17019 * time of the fault..
17020 */
17021 if (!user_mode(regs)) {
17022 + unsigned int code_prologue = code_bytes * 43 / 64;
17023 + unsigned int code_len = code_bytes;
17024 unsigned char c;
17025 + u8 *ip;
17026 +
17027 printk("Stack: ");
17028 - _show_stack(NULL, regs, (unsigned long *)sp, regs->bp);
17029 + show_stack_log_lvl(NULL, regs, (unsigned long *)sp,
17030 + regs->bp, "");
17031 printk("\n");
17032
17033 printk(KERN_EMERG "Code: ");
17034 +
17035 + ip = (u8 *)regs->ip - code_prologue;
17036 if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
17037 /* try starting at RIP */
17038 - ip = (u8 *) regs->ip;
17039 + ip = (u8 *)regs->ip;
17040 code_len = code_len - code_prologue + 1;
17041 }
17042 for (i = 0; i < code_len; i++, ip++) {
17043 @@ -505,7 +468,7 @@ void show_registers(struct pt_regs *regs
17044 }
17045 }
17046 printk("\n");
17047 -}
17048 +}
17049
17050 int is_valid_bugaddr(unsigned long ip)
17051 {
17052 @@ -545,7 +508,7 @@ unsigned __kprobes long oops_begin(void)
17053 }
17054
17055 void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
17056 -{
17057 +{
17058 die_owner = -1;
17059 bust_spinlocks(0);
17060 die_nest_count--;
17061 @@ -563,10 +526,9 @@ void __kprobes oops_end(unsigned long fl
17062 do_exit(signr);
17063 }
17064
17065 -int __kprobes __die(const char * str, struct pt_regs * regs, long err)
17066 +int __kprobes __die(const char *str, struct pt_regs *regs, long err)
17067 {
17068 - static int die_counter;
17069 - printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter);
17070 + printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff, ++die_counter);
17071 #ifdef CONFIG_PREEMPT
17072 printk("PREEMPT ");
17073 #endif
17074 @@ -577,8 +539,10 @@ int __kprobes __die(const char * str, st
17075 printk("DEBUG_PAGEALLOC");
17076 #endif
17077 printk("\n");
17078 - if (notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
17079 + if (notify_die(DIE_OOPS, str, regs, err,
17080 + current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
17081 return 1;
17082 +
17083 show_registers(regs);
17084 add_taint(TAINT_DIE);
17085 /* Executive summary in case the oops scrolled away */
17086 @@ -590,7 +554,7 @@ int __kprobes __die(const char * str, st
17087 return 0;
17088 }
17089
17090 -void die(const char * str, struct pt_regs * regs, long err)
17091 +void die(const char *str, struct pt_regs *regs, long err)
17092 {
17093 unsigned long flags = oops_begin();
17094
17095 @@ -608,8 +572,7 @@ die_nmi(char *str, struct pt_regs *regs,
17096 {
17097 unsigned long flags;
17098
17099 - if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) ==
17100 - NOTIFY_STOP)
17101 + if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
17102 return;
17103
17104 flags = oops_begin();
17105 @@ -617,7 +580,9 @@ die_nmi(char *str, struct pt_regs *regs,
17106 * We are in trouble anyway, lets at least try
17107 * to get a message out.
17108 */
17109 - printk(str, smp_processor_id());
17110 + printk(KERN_EMERG "%s", str);
17111 + printk(" on CPU%d, ip %08lx, registers:\n",
17112 + smp_processor_id(), regs->ip);
17113 show_registers(regs);
17114 if (kexec_should_crash(current))
17115 crash_kexec(regs);
17116 @@ -630,44 +595,44 @@ die_nmi(char *str, struct pt_regs *regs,
17117 }
17118 #endif
17119
17120 -static void __kprobes do_trap(int trapnr, int signr, char *str,
17121 - struct pt_regs * regs, long error_code,
17122 - siginfo_t *info)
17123 +static void __kprobes
17124 +do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
17125 + long error_code, siginfo_t *info)
17126 {
17127 struct task_struct *tsk = current;
17128
17129 - if (user_mode(regs)) {
17130 - /*
17131 - * We want error_code and trap_no set for userspace
17132 - * faults and kernelspace faults which result in
17133 - * die(), but not kernelspace faults which are fixed
17134 - * up. die() gives the process no chance to handle
17135 - * the signal and notice the kernel fault information,
17136 - * so that won't result in polluting the information
17137 - * about previously queued, but not yet delivered,
17138 - * faults. See also do_general_protection below.
17139 - */
17140 - tsk->thread.error_code = error_code;
17141 - tsk->thread.trap_no = trapnr;
17142 + if (!user_mode(regs))
17143 + goto kernel_trap;
17144
17145 - if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
17146 - printk_ratelimit()) {
17147 - printk(KERN_INFO
17148 - "%s[%d] trap %s ip:%lx sp:%lx error:%lx",
17149 - tsk->comm, tsk->pid, str,
17150 - regs->ip, regs->sp, error_code);
17151 - print_vma_addr(" in ", regs->ip);
17152 - printk("\n");
17153 - }
17154 + /*
17155 + * We want error_code and trap_no set for userspace faults and
17156 + * kernelspace faults which result in die(), but not
17157 + * kernelspace faults which are fixed up. die() gives the
17158 + * process no chance to handle the signal and notice the
17159 + * kernel fault information, so that won't result in polluting
17160 + * the information about previously queued, but not yet
17161 + * delivered, faults. See also do_general_protection below.
17162 + */
17163 + tsk->thread.error_code = error_code;
17164 + tsk->thread.trap_no = trapnr;
17165
17166 - if (info)
17167 - force_sig_info(signr, info, tsk);
17168 - else
17169 - force_sig(signr, tsk);
17170 - return;
17171 + if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
17172 + printk_ratelimit()) {
17173 + printk(KERN_INFO
17174 + "%s[%d] trap %s ip:%lx sp:%lx error:%lx",
17175 + tsk->comm, tsk->pid, str,
17176 + regs->ip, regs->sp, error_code);
17177 + print_vma_addr(" in ", regs->ip);
17178 + printk("\n");
17179 }
17180
17181 + if (info)
17182 + force_sig_info(signr, info, tsk);
17183 + else
17184 + force_sig(signr, tsk);
17185 + return;
17186
17187 +kernel_trap:
17188 if (!fixup_exception(regs)) {
17189 tsk->thread.error_code = error_code;
17190 tsk->thread.trap_no = trapnr;
17191 @@ -677,41 +642,39 @@ static void __kprobes do_trap(int trapnr
17192 }
17193
17194 #define DO_ERROR(trapnr, signr, str, name) \
17195 -asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
17196 -{ \
17197 - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
17198 - == NOTIFY_STOP) \
17199 - return; \
17200 +asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
17201 +{ \
17202 + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
17203 + == NOTIFY_STOP) \
17204 + return; \
17205 conditional_sti(regs); \
17206 - do_trap(trapnr, signr, str, regs, error_code, NULL); \
17207 + do_trap(trapnr, signr, str, regs, error_code, NULL); \
17208 }
17209
17210 -#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
17211 -asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
17212 -{ \
17213 - siginfo_t info; \
17214 - info.si_signo = signr; \
17215 - info.si_errno = 0; \
17216 - info.si_code = sicode; \
17217 - info.si_addr = (void __user *)siaddr; \
17218 - trace_hardirqs_fixup(); \
17219 - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
17220 - == NOTIFY_STOP) \
17221 - return; \
17222 +#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
17223 +asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
17224 +{ \
17225 + siginfo_t info; \
17226 + info.si_signo = signr; \
17227 + info.si_errno = 0; \
17228 + info.si_code = sicode; \
17229 + info.si_addr = (void __user *)siaddr; \
17230 + trace_hardirqs_fixup(); \
17231 + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
17232 + == NOTIFY_STOP) \
17233 + return; \
17234 conditional_sti(regs); \
17235 - do_trap(trapnr, signr, str, regs, error_code, &info); \
17236 + do_trap(trapnr, signr, str, regs, error_code, &info); \
17237 }
17238
17239 -DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
17240 -DO_ERROR( 4, SIGSEGV, "overflow", overflow)
17241 -DO_ERROR( 5, SIGSEGV, "bounds", bounds)
17242 -DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip)
17243 -DO_ERROR( 7, SIGSEGV, "device not available", device_not_available)
17244 -DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
17245 +DO_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
17246 +DO_ERROR(4, SIGSEGV, "overflow", overflow)
17247 +DO_ERROR(5, SIGSEGV, "bounds", bounds)
17248 +DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip)
17249 +DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
17250 DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
17251 -DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
17252 +DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
17253 DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
17254 -DO_ERROR(18, SIGSEGV, "reserved", reserved)
17255
17256 /* Runs on IST stack */
17257 asmlinkage void do_stack_segment(struct pt_regs *regs, long error_code)
17258 @@ -741,31 +704,34 @@ asmlinkage void do_double_fault(struct p
17259 die(str, regs, error_code);
17260 }
17261
17262 -asmlinkage void __kprobes do_general_protection(struct pt_regs * regs,
17263 - long error_code)
17264 +asmlinkage void __kprobes
17265 +do_general_protection(struct pt_regs *regs, long error_code)
17266 {
17267 - struct task_struct *tsk = current;
17268 + struct task_struct *tsk;
17269
17270 conditional_sti(regs);
17271
17272 - if (user_mode(regs)) {
17273 - tsk->thread.error_code = error_code;
17274 - tsk->thread.trap_no = 13;
17275 + tsk = current;
17276 + if (!user_mode(regs))
17277 + goto gp_in_kernel;
17278
17279 - if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
17280 - printk_ratelimit()) {
17281 - printk(KERN_INFO
17282 - "%s[%d] general protection ip:%lx sp:%lx error:%lx",
17283 - tsk->comm, tsk->pid,
17284 - regs->ip, regs->sp, error_code);
17285 - print_vma_addr(" in ", regs->ip);
17286 - printk("\n");
17287 - }
17288 + tsk->thread.error_code = error_code;
17289 + tsk->thread.trap_no = 13;
17290
17291 - force_sig(SIGSEGV, tsk);
17292 - return;
17293 - }
17294 + if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
17295 + printk_ratelimit()) {
17296 + printk(KERN_INFO
17297 + "%s[%d] general protection ip:%lx sp:%lx error:%lx",
17298 + tsk->comm, tsk->pid,
17299 + regs->ip, regs->sp, error_code);
17300 + print_vma_addr(" in ", regs->ip);
17301 + printk("\n");
17302 + }
17303
17304 + force_sig(SIGSEGV, tsk);
17305 + return;
17306 +
17307 +gp_in_kernel:
17308 if (fixup_exception(regs))
17309 return;
17310
17311 @@ -778,14 +744,14 @@ asmlinkage void __kprobes do_general_pro
17312 }
17313
17314 static notrace __kprobes void
17315 -mem_parity_error(unsigned char reason, struct pt_regs * regs)
17316 +mem_parity_error(unsigned char reason, struct pt_regs *regs)
17317 {
17318 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
17319 reason);
17320 printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
17321
17322 #if defined(CONFIG_EDAC)
17323 - if(edac_handler_set()) {
17324 + if (edac_handler_set()) {
17325 edac_atomic_assert_error();
17326 return;
17327 }
17328 @@ -801,7 +767,7 @@ mem_parity_error(unsigned char reason, s
17329 }
17330
17331 static notrace __kprobes void
17332 -io_check_error(unsigned char reason, struct pt_regs * regs)
17333 +io_check_error(unsigned char reason, struct pt_regs *regs)
17334 {
17335 printk("NMI: IOCK error (debug interrupt?)\n");
17336 show_registers(regs);
17337 @@ -827,14 +793,14 @@ unknown_nmi_error(unsigned char reason,
17338
17339 /* Runs on IST stack. This code must keep interrupts off all the time.
17340 Nested NMIs are prevented by the CPU. */
17341 -asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs)
17342 +asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs)
17343 {
17344 unsigned char reason = 0;
17345 int cpu;
17346
17347 cpu = smp_processor_id();
17348
17349 - /* Only the BSP gets external NMIs from the system. */
17350 + /* Only the BSP gets external NMIs from the system. */
17351 if (!cpu)
17352 reason = get_nmi_reason();
17353
17354 @@ -847,33 +813,58 @@ asmlinkage notrace __kprobes void defau
17355 * Ok, so this is none of the documented NMI sources,
17356 * so it must be the NMI watchdog.
17357 */
17358 - if (nmi_watchdog_tick(regs,reason))
17359 + if (nmi_watchdog_tick(regs, reason))
17360 return;
17361 #endif
17362 - if (!do_nmi_callback(regs,cpu))
17363 + if (!do_nmi_callback(regs, cpu))
17364 unknown_nmi_error(reason, regs);
17365
17366 return;
17367 }
17368 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
17369 - return;
17370 + return;
17371
17372 /* AK: following checks seem to be broken on modern chipsets. FIXME */
17373 -
17374 if (reason & 0x80)
17375 mem_parity_error(reason, regs);
17376 if (reason & 0x40)
17377 io_check_error(reason, regs);
17378 }
17379
17380 +asmlinkage notrace __kprobes void
17381 +do_nmi(struct pt_regs *regs, long error_code)
17382 +{
17383 + nmi_enter();
17384 +
17385 + add_pda(__nmi_count, 1);
17386 +
17387 + if (!ignore_nmis)
17388 + default_do_nmi(regs);
17389 +
17390 + nmi_exit();
17391 +}
17392 +
17393 +void stop_nmi(void)
17394 +{
17395 + acpi_nmi_disable();
17396 + ignore_nmis++;
17397 +}
17398 +
17399 +void restart_nmi(void)
17400 +{
17401 + ignore_nmis--;
17402 + acpi_nmi_enable();
17403 +}
17404 +
17405 /* runs on IST stack. */
17406 asmlinkage void __kprobes do_int3(struct pt_regs * regs, long error_code)
17407 {
17408 trace_hardirqs_fixup();
17409
17410 - if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) {
17411 + if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
17412 + == NOTIFY_STOP)
17413 return;
17414 - }
17415 +
17416 preempt_conditional_sti(regs);
17417 do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
17418 preempt_conditional_cli(regs);
17419 @@ -904,8 +895,8 @@ asmlinkage __kprobes struct pt_regs *syn
17420 asmlinkage void __kprobes do_debug(struct pt_regs * regs,
17421 unsigned long error_code)
17422 {
17423 - unsigned long condition;
17424 struct task_struct *tsk = current;
17425 + unsigned long condition;
17426 siginfo_t info;
17427
17428 trace_hardirqs_fixup();
17429 @@ -926,21 +917,19 @@ asmlinkage void __kprobes do_debug(struc
17430
17431 /* Mask out spurious debug traps due to lazy DR7 setting */
17432 if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
17433 - if (!tsk->thread.debugreg7) {
17434 + if (!tsk->thread.debugreg7)
17435 goto clear_dr7;
17436 - }
17437 }
17438
17439 tsk->thread.debugreg6 = condition;
17440
17441 -
17442 /*
17443 * Single-stepping through TF: make sure we ignore any events in
17444 * kernel space (but re-enable TF when returning to user mode).
17445 */
17446 if (condition & DR_STEP) {
17447 - if (!user_mode(regs))
17448 - goto clear_TF_reenable;
17449 + if (!user_mode(regs))
17450 + goto clear_TF_reenable;
17451 }
17452
17453 /* Ok, finally something we can handle */
17454 @@ -953,7 +942,7 @@ asmlinkage void __kprobes do_debug(struc
17455 force_sig_info(SIGTRAP, &info, tsk);
17456
17457 clear_dr7:
17458 - set_debugreg(0UL, 7);
17459 + set_debugreg(0, 7);
17460 preempt_conditional_cli(regs);
17461 return;
17462
17463 @@ -961,6 +950,7 @@ clear_TF_reenable:
17464 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
17465 regs->flags &= ~X86_EFLAGS_TF;
17466 preempt_conditional_cli(regs);
17467 + return;
17468 }
17469
17470 static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
17471 @@ -983,7 +973,7 @@ static int kernel_math_error(struct pt_r
17472 asmlinkage void do_coprocessor_error(struct pt_regs *regs)
17473 {
17474 void __user *ip = (void __user *)(regs->ip);
17475 - struct task_struct * task;
17476 + struct task_struct *task;
17477 siginfo_t info;
17478 unsigned short cwd, swd;
17479
17480 @@ -1016,30 +1006,30 @@ asmlinkage void do_coprocessor_error(str
17481 cwd = get_fpu_cwd(task);
17482 swd = get_fpu_swd(task);
17483 switch (swd & ~cwd & 0x3f) {
17484 - case 0x000:
17485 - default:
17486 - break;
17487 - case 0x001: /* Invalid Op */
17488 - /*
17489 - * swd & 0x240 == 0x040: Stack Underflow
17490 - * swd & 0x240 == 0x240: Stack Overflow
17491 - * User must clear the SF bit (0x40) if set
17492 - */
17493 - info.si_code = FPE_FLTINV;
17494 - break;
17495 - case 0x002: /* Denormalize */
17496 - case 0x010: /* Underflow */
17497 - info.si_code = FPE_FLTUND;
17498 - break;
17499 - case 0x004: /* Zero Divide */
17500 - info.si_code = FPE_FLTDIV;
17501 - break;
17502 - case 0x008: /* Overflow */
17503 - info.si_code = FPE_FLTOVF;
17504 - break;
17505 - case 0x020: /* Precision */
17506 - info.si_code = FPE_FLTRES;
17507 - break;
17508 + case 0x000: /* No unmasked exception */
17509 + default: /* Multiple exceptions */
17510 + break;
17511 + case 0x001: /* Invalid Op */
17512 + /*
17513 + * swd & 0x240 == 0x040: Stack Underflow
17514 + * swd & 0x240 == 0x240: Stack Overflow
17515 + * User must clear the SF bit (0x40) if set
17516 + */
17517 + info.si_code = FPE_FLTINV;
17518 + break;
17519 + case 0x002: /* Denormalize */
17520 + case 0x010: /* Underflow */
17521 + info.si_code = FPE_FLTUND;
17522 + break;
17523 + case 0x004: /* Zero Divide */
17524 + info.si_code = FPE_FLTDIV;
17525 + break;
17526 + case 0x008: /* Overflow */
17527 + info.si_code = FPE_FLTOVF;
17528 + break;
17529 + case 0x020: /* Precision */
17530 + info.si_code = FPE_FLTRES;
17531 + break;
17532 }
17533 force_sig_info(SIGFPE, &info, task);
17534 }
17535 @@ -1052,7 +1042,7 @@ asmlinkage void bad_intr(void)
17536 asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
17537 {
17538 void __user *ip = (void __user *)(regs->ip);
17539 - struct task_struct * task;
17540 + struct task_struct *task;
17541 siginfo_t info;
17542 unsigned short mxcsr;
17543
17544 @@ -1080,25 +1070,25 @@ asmlinkage void do_simd_coprocessor_erro
17545 */
17546 mxcsr = get_fpu_mxcsr(task);
17547 switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
17548 - case 0x000:
17549 - default:
17550 - break;
17551 - case 0x001: /* Invalid Op */
17552 - info.si_code = FPE_FLTINV;
17553 - break;
17554 - case 0x002: /* Denormalize */
17555 - case 0x010: /* Underflow */
17556 - info.si_code = FPE_FLTUND;
17557 - break;
17558 - case 0x004: /* Zero Divide */
17559 - info.si_code = FPE_FLTDIV;
17560 - break;
17561 - case 0x008: /* Overflow */
17562 - info.si_code = FPE_FLTOVF;
17563 - break;
17564 - case 0x020: /* Precision */
17565 - info.si_code = FPE_FLTRES;
17566 - break;
17567 + case 0x000:
17568 + default:
17569 + break;
17570 + case 0x001: /* Invalid Op */
17571 + info.si_code = FPE_FLTINV;
17572 + break;
17573 + case 0x002: /* Denormalize */
17574 + case 0x010: /* Underflow */
17575 + info.si_code = FPE_FLTUND;
17576 + break;
17577 + case 0x004: /* Zero Divide */
17578 + info.si_code = FPE_FLTDIV;
17579 + break;
17580 + case 0x008: /* Overflow */
17581 + info.si_code = FPE_FLTOVF;
17582 + break;
17583 + case 0x020: /* Precision */
17584 + info.si_code = FPE_FLTRES;
17585 + break;
17586 }
17587 force_sig_info(SIGFPE, &info, task);
17588 }
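
For the SSE path both the pending flags (bits 0-5) and their mask bits (bits 7-12) live in MXCSR, which is why the switch key above is computed as ~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f). A small sketch of just that extraction, with an illustrative sample value:

#include <stdio.h>

/* The SSE variant keeps both the pending flags (bits 0-5) and the mask
 * bits (bits 7-12) in MXCSR, so the set of unmasked, pending exceptions
 * is derived from that one register.  Purely illustrative arithmetic. */
static unsigned int mxcsr_unmasked_exceptions(unsigned int mxcsr)
{
        unsigned int pending = mxcsr & 0x3f;            /* IE DE ZE OE UE PE */
        unsigned int masked  = (mxcsr & 0x1f80) >> 7;   /* IM DM ZM OM UM PM */

        return ~masked & pending;
}

int main(void)
{
        /* ZE pending, ZM clear, every other exception masked */
        printf("unmasked = %#x\n", mxcsr_unmasked_exceptions(0x1d84));
        return 0;
}
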
17589 @@ -1118,7 +1108,7 @@ asmlinkage void __attribute__((weak)) mc
17590 }
17591
17592 /*
17593 - * 'math_state_restore()' saves the current math information in the
17594 + * 'math_state_restore()' saves the current math information in the
17595 * old math state array, and gets the new ones from the current task
17596 *
17597 * Careful.. There are problems with IBM-designed IRQ13 behaviour.
17598 @@ -1145,7 +1135,14 @@ asmlinkage void math_state_restore(void)
17599
17600 /* clts(); */ /* 'clts' is done for us by Xen during virtual trap. */
17601
17602 - restore_fpu_checking(&me->thread.xstate->fxsave);
17603 + /*
17604 + * Paranoid restore. send a SIGSEGV if we fail to restore the state.
17605 + */
17606 + if (unlikely(restore_fpu_checking(&me->thread.xstate->fxsave))) {
17607 + stts();
17608 + force_sig(SIGSEGV, me);
17609 + return;
17610 + }
17611 task_thread_info(me)->status |= TS_USEDFPU;
17612 me->fpu_counter++;
17613 }
17614 @@ -1190,13 +1187,12 @@ void __init trap_init(void)
17615 ret = HYPERVISOR_set_trap_table(trap_table);
17616 if (ret)
17617 printk("HYPERVISOR_set_trap_table failed: error %d\n", ret);
17618 -
17619 /*
17620 * initialize the per thread extended state:
17621 */
17622 - init_thread_xstate();
17623 + init_thread_xstate();
17624 /*
17625 - * Should be a barrier for any external CPU state.
17626 + * Should be a barrier for any external CPU state:
17627 */
17628 cpu_init();
17629 }
17630 @@ -1212,27 +1208,25 @@ void __cpuinit smp_trap_init(trap_info_t
17631 }
17632 }
17633
17634 -
17635 static int __init oops_setup(char *s)
17636 -{
17637 +{
17638 if (!s)
17639 return -EINVAL;
17640 if (!strcmp(s, "panic"))
17641 panic_on_oops = 1;
17642 return 0;
17643 -}
17644 +}
17645 early_param("oops", oops_setup);
17646
17647 static int __init kstack_setup(char *s)
17648 {
17649 if (!s)
17650 return -EINVAL;
17651 - kstack_depth_to_print = simple_strtoul(s,NULL,0);
17652 + kstack_depth_to_print = simple_strtoul(s, NULL, 0);
17653 return 0;
17654 }
17655 early_param("kstack", kstack_setup);
17656
17657 -
17658 static int __init code_bytes_setup(char *s)
17659 {
17660 code_bytes = simple_strtoul(s, NULL, 0);
17661 --- sle11-2009-06-04.orig/arch/x86/kernel/vsyscall_64-xen.c 2009-03-16 16:38:05.000000000 +0100
17662 +++ sle11-2009-06-04/arch/x86/kernel/vsyscall_64-xen.c 2009-06-04 10:21:39.000000000 +0200
17663 @@ -42,7 +42,8 @@
17664 #include <asm/topology.h>
17665 #include <asm/vgtod.h>
17666
17667 -#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
17668 +#define __vsyscall(nr) \
17669 + __attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace
17670 #define __syscall_clobber "r11","cx","memory"
17671
17672 /*
17673 @@ -264,10 +265,7 @@ static void __cpuinit vsyscall_set_cpu(i
17674 d |= cpu;
17675 d |= (node & 0xf) << 12;
17676 d |= (node >> 4) << 48;
17677 - if (HYPERVISOR_update_descriptor(virt_to_machine(get_cpu_gdt_table(cpu)
17678 - + GDT_ENTRY_PER_CPU),
17679 - d))
17680 - BUG();
17681 + write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
17682 }
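
Both the replaced HYPERVISOR_update_descriptor() call and the new write_gdt_entry() install a per-CPU GDT entry whose segment limit encodes the CPU number (limit bits 0-11) and NUMA node (limit bits 12-19, split across the descriptor's two limit fields) so that user-space vgetcpu() can recover them with LSL. A host-side sketch of the packing and unpacking (helper names and sample numbers are illustrative, not kernel API):

#include <stdint.h>
#include <stdio.h>

/* Sketch of hiding the CPU number in segment-limit bits 0-11 and the node
 * in bits 12-19 (the top four limit bits sit in descriptor bits 48-51),
 * plus the matching recovery from the LSL result. */
static uint64_t encode_cpu_node(uint64_t d, unsigned int cpu, unsigned int node)
{
        d |= cpu;                               /* limit bits 0-11  */
        d |= (uint64_t)(node & 0xf) << 12;      /* limit bits 12-15 */
        d |= (uint64_t)(node >> 4) << 48;       /* limit bits 16-19 */
        return d;
}

static void decode_limit(uint64_t d, unsigned int *cpu, unsigned int *node)
{
        unsigned int limit = (d & 0xffff) | (((d >> 48) & 0xf) << 16);

        *cpu  = limit & 0xfff;
        *node = limit >> 12;
}

int main(void)
{
        unsigned int cpu, node;

        decode_limit(encode_cpu_node(0, 5, 19), &cpu, &node);
        printf("cpu=%u node=%u\n", cpu, node);
        return 0;
}
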
17683
17684 static void __cpuinit cpu_vsyscall_init(void *arg)
17685 @@ -281,7 +279,7 @@ cpu_vsyscall_notifier(struct notifier_bl
17686 {
17687 long cpu = (long)arg;
17688 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
17689 - smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 0, 1);
17690 + smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1);
17691 return NOTIFY_DONE;
17692 }
17693
17694 @@ -311,7 +309,7 @@ static int __init vsyscall_init(void)
17695 #ifdef CONFIG_SYSCTL
17696 register_sysctl_table(kernel_root_table2);
17697 #endif
17698 - on_each_cpu(cpu_vsyscall_init, NULL, 0, 1);
17699 + on_each_cpu(cpu_vsyscall_init, NULL, 1);
17700 hotcpu_notifier(cpu_vsyscall_notifier, 0);
17701 return 0;
17702 }
17703 --- sle11-2009-06-04.orig/arch/x86/mach-xen/setup.c 2009-03-16 16:33:40.000000000 +0100
17704 +++ sle11-2009-06-04/arch/x86/mach-xen/setup.c 2009-06-04 10:21:39.000000000 +0200
17705 @@ -17,6 +17,8 @@
17706 #include <xen/interface/callback.h>
17707 #include <xen/interface/memory.h>
17708
17709 +#ifdef CONFIG_X86_32
17710 +
17711 #ifdef CONFIG_HOTPLUG_CPU
17712 #define DEFAULT_SEND_IPI (1)
17713 #else
17714 @@ -44,51 +46,6 @@ static int __init print_ipi_mode(void)
17715
17716 late_initcall(print_ipi_mode);
17717
17718 -/**
17719 - * machine_specific_memory_setup - Hook for machine specific memory setup.
17720 - *
17721 - * Description:
17722 - * This is included late in kernel/setup.c so that it can make
17723 - * use of all of the static functions.
17724 - **/
17725 -
17726 -char * __init machine_specific_memory_setup(void)
17727 -{
17728 - int rc;
17729 - struct xen_memory_map memmap;
17730 - /*
17731 - * This is rather large for a stack variable but this early in
17732 - * the boot process we know we have plenty slack space.
17733 - */
17734 - struct e820entry map[E820MAX];
17735 -
17736 - memmap.nr_entries = E820MAX;
17737 - set_xen_guest_handle(memmap.buffer, map);
17738 -
17739 - rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
17740 - if ( rc == -ENOSYS ) {
17741 - memmap.nr_entries = 1;
17742 - map[0].addr = 0ULL;
17743 - map[0].size = PFN_PHYS((unsigned long long)xen_start_info->nr_pages);
17744 - /* 8MB slack (to balance backend allocations). */
17745 - map[0].size += 8ULL << 20;
17746 - map[0].type = E820_RAM;
17747 - rc = 0;
17748 - }
17749 - BUG_ON(rc);
17750 -
17751 - sanitize_e820_map(map, (char *)&memmap.nr_entries);
17752 -
17753 - BUG_ON(copy_e820_map(map, (char)memmap.nr_entries) < 0);
17754 -
17755 - return "Xen";
17756 -}
17757 -
17758 -
17759 -extern void hypervisor_callback(void);
17760 -extern void failsafe_callback(void);
17761 -extern void nmi(void);
17762 -
17763 unsigned long *machine_to_phys_mapping = (void *)MACH2PHYS_VIRT_START;
17764 EXPORT_SYMBOL(machine_to_phys_mapping);
17765 unsigned int machine_to_phys_order;
17766 @@ -121,33 +78,66 @@ void __init pre_setup_arch_hook(void)
17767 (unsigned long *)xen_start_info->mfn_list;
17768 }
17769
17770 +#endif /* CONFIG_X86_32 */
17771 +
17772 +extern void hypervisor_callback(void);
17773 +extern void failsafe_callback(void);
17774 +extern void nmi(void);
17775 +
17776 +#ifdef CONFIG_X86_64
17777 +#include <asm/proto.h>
17778 +#define CALLBACK_ADDR(fn) ((unsigned long)(fn))
17779 +#else
17780 +#define CALLBACK_ADDR(fn) { __KERNEL_CS, (unsigned long)(fn) }
17781 +#endif
17782 +
17783 void __init machine_specific_arch_setup(void)
17784 {
17785 int ret;
17786 static struct callback_register __initdata event = {
17787 .type = CALLBACKTYPE_event,
17788 - .address = { __KERNEL_CS, (unsigned long)hypervisor_callback },
17789 + .address = CALLBACK_ADDR(hypervisor_callback)
17790 };
17791 static struct callback_register __initdata failsafe = {
17792 .type = CALLBACKTYPE_failsafe,
17793 - .address = { __KERNEL_CS, (unsigned long)failsafe_callback },
17794 + .address = CALLBACK_ADDR(failsafe_callback)
17795 + };
17796 +#ifdef CONFIG_X86_64
17797 + static struct callback_register __initdata syscall = {
17798 + .type = CALLBACKTYPE_syscall,
17799 + .address = CALLBACK_ADDR(system_call)
17800 };
17801 +#endif
17802 +#if defined(CONFIG_X86_LOCAL_APIC) || defined(CONFIG_X86_32)
17803 static struct callback_register __initdata nmi_cb = {
17804 .type = CALLBACKTYPE_nmi,
17805 - .address = { __KERNEL_CS, (unsigned long)nmi },
17806 + .address = CALLBACK_ADDR(nmi)
17807 };
17808 +#endif
17809
17810 ret = HYPERVISOR_callback_op(CALLBACKOP_register, &event);
17811 if (ret == 0)
17812 ret = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe);
17813 +#ifdef CONFIG_X86_64
17814 + if (ret == 0)
17815 + ret = HYPERVISOR_callback_op(CALLBACKOP_register, &syscall);
17816 +#endif
17817 #if CONFIG_XEN_COMPAT <= 0x030002
17818 +#ifdef CONFIG_X86_32
17819 if (ret == -ENOSYS)
17820 ret = HYPERVISOR_set_callbacks(
17821 event.address.cs, event.address.eip,
17822 failsafe.address.cs, failsafe.address.eip);
17823 +#else
17824 + ret = HYPERVISOR_set_callbacks(
17825 + event.address,
17826 + failsafe.address,
17827 + syscall.address);
17828 +#endif
17829 #endif
17830 BUG_ON(ret);
17831
17832 +#if defined(CONFIG_X86_LOCAL_APIC) || defined(CONFIG_X86_32)
17833 ret = HYPERVISOR_callback_op(CALLBACKOP_register, &nmi_cb);
17834 #if CONFIG_XEN_COMPAT <= 0x030002
17835 if (ret == -ENOSYS) {
17836 @@ -158,15 +148,43 @@ void __init machine_specific_arch_setup(
17837 HYPERVISOR_nmi_op(XENNMI_register_callback, &cb);
17838 }
17839 #endif
17840 +#endif
17841
17842 +#ifdef CONFIG_X86_32
17843 /* Do an early initialization of the fixmap area */
17844 {
17845 extern pte_t swapper_pg_fixmap[PTRS_PER_PTE];
17846 unsigned long addr = __fix_to_virt(FIX_EARLYCON_MEM_BASE);
17847 pud_t *pud = pud_offset(swapper_pg_dir + pgd_index(addr), addr);
17848 pmd_t *pmd = pmd_offset(pud, addr);
17849 + unsigned int i;
17850
17851 make_lowmem_page_readonly(swapper_pg_fixmap, XENFEAT_writable_page_tables);
17852 set_pmd(pmd, __pmd(__pa_symbol(swapper_pg_fixmap) | _PAGE_TABLE));
17853 +
17854 +#define __FIXADDR_TOP (-PAGE_SIZE)
17855 +#define FIX_BUG_ON(fix) BUILD_BUG_ON(pmd_index(__fix_to_virt(FIX_##fix)) \
17856 + != pmd_index(__fix_to_virt(FIX_EARLYCON_MEM_BASE)))
17857 + FIX_BUG_ON(SHARED_INFO);
17858 + FIX_BUG_ON(ISAMAP_BEGIN);
17859 + FIX_BUG_ON(ISAMAP_END);
17860 +#undef __FIXADDR_TOP
17861 + BUG_ON(pte_index(hypervisor_virt_start));
17862 +
17863 + /* Switch to the real shared_info page, and clear the
17864 + * dummy page. */
17865 + set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
17866 + HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
17867 + memset(empty_zero_page, 0, sizeof(empty_zero_page));
17868 +
17869 + /* Setup mapping of lower 1st MB */
17870 + for (i = 0; i < NR_FIX_ISAMAPS; i++)
17871 + if (is_initial_xendomain())
17872 + set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE);
17873 + else
17874 + __set_fixmap(FIX_ISAMAP_BEGIN - i,
17875 + virt_to_machine(empty_zero_page),
17876 + PAGE_KERNEL_RO);
17877 }
17878 +#endif
17879 }
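
The CALLBACK_ADDR() macro introduced above papers over the two ways Xen expresses a callback entry point: a {selector, offset} pair on 32-bit and a bare linear address on 64-bit, which is also why the HYPERVISOR_set_callbacks() fallback takes different argument lists in the two branches. A compile-time sketch of the same idea (the TEST_64BIT switch, the placeholder selector and the type name are assumptions for the example, not Xen's headers):

#include <stdio.h>

/* Minimal model of the CALLBACK_ADDR() abstraction: a {selector, offset}
 * pair on 32-bit, a bare address on 64-bit. */
#define TEST_64BIT 1

#if TEST_64BIT
typedef unsigned long callback_addr_t;
#define CALLBACK_ADDR(fn)       ((callback_addr_t)(fn))
#else
typedef struct { unsigned short cs; unsigned long eip; } callback_addr_t;
#define PLACEHOLDER_KERNEL_CS   0x61    /* not the real __KERNEL_CS */
#define CALLBACK_ADDR(fn)       { PLACEHOLDER_KERNEL_CS, (unsigned long)(fn) }
#endif

static void hypervisor_callback(void) { }

int main(void)
{
        callback_addr_t ev = CALLBACK_ADDR(hypervisor_callback);

#if TEST_64BIT
        printf("event callback at %#lx\n", (unsigned long)ev);
#else
        printf("event callback at %#x:%#lx\n", ev.cs, ev.eip);
#endif
        return 0;
}
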
17880 --- sle11-2009-06-04.orig/arch/x86/mm/fault-xen.c 2009-03-16 16:38:05.000000000 +0100
17881 +++ sle11-2009-06-04/arch/x86/mm/fault-xen.c 2009-06-04 10:21:39.000000000 +0200
17882 @@ -10,6 +10,7 @@
17883 #include <linux/string.h>
17884 #include <linux/types.h>
17885 #include <linux/ptrace.h>
17886 +#include <linux/mmiotrace.h>
17887 #include <linux/mman.h>
17888 #include <linux/mm.h>
17889 #include <linux/smp.h>
17890 @@ -49,17 +50,23 @@
17891 #define PF_RSVD (1<<3)
17892 #define PF_INSTR (1<<4)
17893
17894 +static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
17895 +{
17896 +#ifdef CONFIG_MMIOTRACE_HOOKS
17897 + if (unlikely(is_kmmio_active()))
17898 + if (kmmio_handler(regs, addr) == 1)
17899 + return -1;
17900 +#endif
17901 + return 0;
17902 +}
17903 +
17904 static inline int notify_page_fault(struct pt_regs *regs)
17905 {
17906 #ifdef CONFIG_KPROBES
17907 int ret = 0;
17908
17909 /* kprobe_running() needs smp_processor_id() */
17910 -#ifdef CONFIG_X86_32
17911 if (!user_mode_vm(regs)) {
17912 -#else
17913 - if (!user_mode(regs)) {
17914 -#endif
17915 preempt_disable();
17916 if (kprobe_running() && kprobe_fault_handler(regs, 14))
17917 ret = 1;
17918 @@ -409,11 +416,7 @@ static void show_fault_oops(struct pt_re
17919 printk(KERN_CONT "NULL pointer dereference");
17920 else
17921 printk(KERN_CONT "paging request");
17922 -#ifdef CONFIG_X86_32
17923 - printk(KERN_CONT " at %08lx\n", address);
17924 -#else
17925 - printk(KERN_CONT " at %016lx\n", address);
17926 -#endif
17927 + printk(KERN_CONT " at %p\n", (void *) address);
17928 printk(KERN_ALERT "IP:");
17929 printk_address(regs->ip, 1);
17930 dump_pagetable(address);
17931 @@ -628,6 +631,8 @@ void __kprobes do_page_fault(struct pt_r
17932
17933 if (notify_page_fault(regs))
17934 return;
17935 + if (unlikely(kmmio_fault(regs, address)))
17936 + return;
17937
17938 /*
17939 * We fault-in kernel-space virtual memory on-demand. The
17940 @@ -831,14 +836,10 @@ bad_area_nosemaphore:
17941 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
17942 printk_ratelimit()) {
17943 printk(
17944 -#ifdef CONFIG_X86_32
17945 - "%s%s[%d]: segfault at %lx ip %08lx sp %08lx error %lx",
17946 -#else
17947 - "%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx",
17948 -#endif
17949 + "%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
17950 task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
17951 - tsk->comm, task_pid_nr(tsk), address, regs->ip,
17952 - regs->sp, error_code);
17953 + tsk->comm, task_pid_nr(tsk), address,
17954 + (void *) regs->ip, (void *) regs->sp, error_code);
17955 print_vma_addr(" in ", regs->ip);
17956 printk("\n");
17957 }
17958 @@ -946,81 +947,45 @@ LIST_HEAD(pgd_list);
17959 void vmalloc_sync_all(void)
17960 {
17961 #ifdef CONFIG_X86_32
17962 - /*
17963 - * Note that races in the updates of insync and start aren't
17964 - * problematic: insync can only get set bits added, and updates to
17965 - * start are only improving performance (without affecting correctness
17966 - * if undone).
17967 - * XEN: To work on PAE, we need to iterate over PMDs rather than PGDs.
17968 - * This change works just fine with 2-level paging too.
17969 - */
17970 -#define sync_index(a) ((a) >> PMD_SHIFT)
17971 - static DECLARE_BITMAP(insync, PTRS_PER_PGD*PTRS_PER_PMD);
17972 - static unsigned long start = TASK_SIZE;
17973 - unsigned long address;
17974 + unsigned long address = VMALLOC_START & PGDIR_MASK;
17975
17976 if (SHARED_KERNEL_PMD)
17977 return;
17978
17979 BUILD_BUG_ON(TASK_SIZE & ~PMD_MASK);
17980 - for (address = start;
17981 - address < hypervisor_virt_start;
17982 - address += PMD_SIZE) {
17983 - if (!test_bit(sync_index(address), insync)) {
17984 - unsigned long flags;
17985 - struct page *page;
17986 -
17987 - spin_lock_irqsave(&pgd_lock, flags);
17988 - /* XEN: failure path assumes non-empty pgd_list. */
17989 - if (unlikely(list_empty(&pgd_list))) {
17990 - spin_unlock_irqrestore(&pgd_lock, flags);
17991 - return;
17992 - }
17993 - list_for_each_entry(page, &pgd_list, lru) {
17994 - if (!vmalloc_sync_one(page_address(page),
17995 - address))
17996 - break;
17997 - }
17998 - spin_unlock_irqrestore(&pgd_lock, flags);
17999 - if (!page)
18000 - set_bit(sync_index(address), insync);
18001 + for (; address < hypervisor_virt_start; address += PMD_SIZE) {
18002 + unsigned long flags;
18003 + struct page *page;
18004 +
18005 + spin_lock_irqsave(&pgd_lock, flags);
18006 + list_for_each_entry(page, &pgd_list, lru) {
18007 + if (!vmalloc_sync_one(page_address(page),
18008 + address))
18009 + break;
18010 }
18011 - if (address == start && test_bit(sync_index(address), insync))
18012 - start = address + PMD_SIZE;
18013 + spin_unlock_irqrestore(&pgd_lock, flags);
18014 }
18015 #else /* CONFIG_X86_64 */
18016 - /*
18017 - * Note that races in the updates of insync and start aren't
18018 - * problematic: insync can only get set bits added, and updates to
18019 - * start are only improving performance (without affecting correctness
18020 - * if undone).
18021 - */
18022 - static DECLARE_BITMAP(insync, PTRS_PER_PGD);
18023 - static unsigned long start = VMALLOC_START & PGDIR_MASK;
18024 + unsigned long start = VMALLOC_START & PGDIR_MASK;
18025 unsigned long address;
18026
18027 for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
18028 - if (!test_bit(pgd_index(address), insync)) {
18029 - const pgd_t *pgd_ref = pgd_offset_k(address);
18030 - unsigned long flags;
18031 - struct page *page;
18032 -
18033 - if (pgd_none(*pgd_ref))
18034 - continue;
18035 - spin_lock_irqsave(&pgd_lock, flags);
18036 - list_for_each_entry(page, &pgd_list, lru) {
18037 - pgd_t *pgd;
18038 - pgd = (pgd_t *)page_address(page) + pgd_index(address);
18039 - if (pgd_none(*pgd))
18040 - set_pgd(pgd, *pgd_ref);
18041 - else
18042 - BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
18043 - }
18044 - spin_unlock_irqrestore(&pgd_lock, flags);
18045 - set_bit(pgd_index(address), insync);
18046 + const pgd_t *pgd_ref = pgd_offset_k(address);
18047 + unsigned long flags;
18048 + struct page *page;
18049 +
18050 + if (pgd_none(*pgd_ref))
18051 + continue;
18052 + spin_lock_irqsave(&pgd_lock, flags);
18053 + list_for_each_entry(page, &pgd_list, lru) {
18054 + pgd_t *pgd;
18055 + pgd = (pgd_t *)page_address(page) + pgd_index(address);
18056 + if (pgd_none(*pgd))
18057 + set_pgd(pgd, *pgd_ref);
18058 + else
18059 + BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
18060 }
18061 - if (address == start)
18062 - start = address + PGDIR_SIZE;
18063 + spin_unlock_irqrestore(&pgd_lock, flags);
18064 }
18065 #endif
18066 }
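
The rewritten vmalloc_sync_all() drops the insync bitmap and the cached start address and instead walks every entry on pgd_list for each address, copying the reference entry from init_mm into any pgd that does not have it yet. A toy userspace model of that synchronisation step (plain arrays stand in for the pgds and the pgd list; all names are illustrative):

#include <stddef.h>
#include <stdio.h>

/* Every populated slot of the reference table is copied into each per-task
 * table that does not have it yet, as in the 64-bit loop above. */
#define SLOTS 8
#define TASKS 3

static unsigned long reference[SLOTS] = { 0, 0x1000, 0, 0x3000, 0, 0, 0, 0x7000 };
static unsigned long task_pgd[TASKS][SLOTS];

static void vmalloc_sync_all_model(void)
{
        for (size_t slot = 0; slot < SLOTS; slot++) {
                if (!reference[slot])                   /* pgd_none(*pgd_ref) */
                        continue;
                for (size_t t = 0; t < TASKS; t++)
                        if (!task_pgd[t][slot])         /* pgd_none(*pgd) */
                                task_pgd[t][slot] = reference[slot];
        }
}

int main(void)
{
        vmalloc_sync_all_model();
        printf("task 0, slot 3 -> %#lx\n", task_pgd[0][3]);
        return 0;
}
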
18067 --- sle11-2009-06-04.orig/arch/x86/mm/hypervisor.c 2009-05-14 11:18:39.000000000 +0200
18068 +++ sle11-2009-06-04/arch/x86/mm/hypervisor.c 2009-06-04 10:21:39.000000000 +0200
18069 @@ -709,6 +709,72 @@ void xen_destroy_contiguous_region(unsig
18070 }
18071 EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region);
18072
18073 +int __init early_create_contiguous_region(unsigned long pfn,
18074 + unsigned int order,
18075 + unsigned int address_bits)
18076 +{
18077 + unsigned long *in_frames = discontig_frames, out_frame = pfn;
18078 + unsigned int i;
18079 + int rc, success;
18080 + struct xen_memory_exchange exchange = {
18081 + .in = {
18082 + .nr_extents = 1UL << order,
18083 + .extent_order = 0,
18084 + .domid = DOMID_SELF
18085 + },
18086 + .out = {
18087 + .nr_extents = 1,
18088 + .extent_order = order,
18089 + .address_bits = address_bits,
18090 + .domid = DOMID_SELF
18091 + }
18092 + };
18093 +
18094 + if (xen_feature(XENFEAT_auto_translated_physmap))
18095 + return 0;
18096 +
18097 + if (unlikely(order > MAX_CONTIG_ORDER))
18098 + return -ENOMEM;
18099 +
18100 + for (i = 0; i < (1U << order); ++i) {
18101 + in_frames[i] = pfn_to_mfn(pfn + i);
18102 + set_phys_to_machine(pfn + i, INVALID_P2M_ENTRY);
18103 + }
18104 +
18105 + set_xen_guest_handle(exchange.in.extent_start, in_frames);
18106 + set_xen_guest_handle(exchange.out.extent_start, &out_frame);
18107 +
18108 + rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
18109 + success = (exchange.nr_exchanged == (1UL << order));
18110 + BUG_ON(!success && (exchange.nr_exchanged || !rc));
18111 + BUG_ON(success && rc);
18112 +#if CONFIG_XEN_COMPAT <= 0x030002
18113 + if (unlikely(rc == -ENOSYS)) {
18114 + /* Compatibility when XENMEM_exchange is unavailable. */
18115 + if (HYPERVISOR_memory_op(XENMEM_decrease_reservation,
18116 + &exchange.in) != (1UL << order))
18117 + BUG();
18118 + success = (HYPERVISOR_memory_op(XENMEM_populate_physmap,
18119 + &exchange.out) == 1);
18120 + if (!success) {
18121 + for (i = 0; i < (1U << order); ++i)
18122 + in_frames[i] = pfn + i;
18123 + if (HYPERVISOR_memory_op(XENMEM_populate_physmap,
18124 + &exchange.in) != (1UL << order))
18125 + BUG();
18126 + }
18127 + }
18128 +#endif
18129 +
18130 + for (i = 0; i < (1U << order); ++i, ++out_frame) {
18131 + if (!success)
18132 + out_frame = in_frames[i];
18133 + set_phys_to_machine(pfn + i, out_frame);
18134 + }
18135 +
18136 + return success ? 0 : -ENOMEM;
18137 +}
18138 +
18139 static void undo_limit_pages(struct page *pages, unsigned int order)
18140 {
18141 BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
18142 @@ -875,42 +941,9 @@ int write_ldt_entry(struct desc_struct *
18143 return HYPERVISOR_update_descriptor(mach_lp, *(const u64*)desc);
18144 }
18145
18146 -#define MAX_BATCHED_FULL_PTES 32
18147 -
18148 -int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
18149 - unsigned long addr, unsigned long end, pgprot_t newprot,
18150 - int dirty_accountable)
18151 +int write_gdt_entry(struct desc_struct *gdt, int entry, const void *desc,
18152 + int type)
18153 {
18154 - int rc = 0, i = 0;
18155 - mmu_update_t u[MAX_BATCHED_FULL_PTES];
18156 - pte_t *pte;
18157 - spinlock_t *ptl;
18158 -
18159 - if (!xen_feature(XENFEAT_mmu_pt_update_preserve_ad))
18160 - return 0;
18161 -
18162 - pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
18163 - do {
18164 - if (pte_present(*pte)) {
18165 - pte_t ptent = pte_modify(*pte, newprot);
18166 -
18167 - if (dirty_accountable && pte_dirty(ptent))
18168 - ptent = pte_mkwrite(ptent);
18169 - u[i].ptr = (__pmd_val(*pmd) & PHYSICAL_PAGE_MASK)
18170 - | ((unsigned long)pte & ~PAGE_MASK)
18171 - | MMU_PT_UPDATE_PRESERVE_AD;
18172 - u[i].val = __pte_val(ptent);
18173 - if (++i == MAX_BATCHED_FULL_PTES) {
18174 - if ((rc = HYPERVISOR_mmu_update(
18175 - &u[0], i, NULL, DOMID_SELF)) != 0)
18176 - break;
18177 - i = 0;
18178 - }
18179 - }
18180 - } while (pte++, addr += PAGE_SIZE, addr != end);
18181 - if (i)
18182 - rc = HYPERVISOR_mmu_update( &u[0], i, NULL, DOMID_SELF);
18183 - pte_unmap_unlock(pte - 1, ptl);
18184 - BUG_ON(rc && rc != -ENOSYS);
18185 - return !rc;
18186 + maddr_t mach_gp = virt_to_machine(gdt + entry);
18187 + return HYPERVISOR_update_descriptor(mach_gp, *(const u64*)desc);
18188 }
18189 --- sle11-2009-06-04.orig/arch/x86/mm/init_32-xen.c 2009-03-16 16:38:05.000000000 +0100
18190 +++ sle11-2009-06-04/arch/x86/mm/init_32-xen.c 2009-06-04 10:21:39.000000000 +0200
18191 @@ -54,6 +54,7 @@
18192
18193 unsigned int __VMALLOC_RESERVE = 128 << 20;
18194
18195 +unsigned long max_low_pfn_mapped;
18196 unsigned long max_pfn_mapped;
18197
18198 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
18199 @@ -61,6 +62,27 @@ unsigned long highstart_pfn, highend_pfn
18200
18201 static noinline int do_test_wp_bit(void);
18202
18203 +
18204 +static unsigned long __initdata table_start;
18205 +static unsigned long __initdata table_end;
18206 +static unsigned long __initdata table_top;
18207 +
18208 +static int __initdata after_init_bootmem;
18209 +
18210 +static __init void *alloc_low_page(unsigned long *phys)
18211 +{
18212 + unsigned long pfn = table_end++;
18213 + void *adr;
18214 +
18215 + if (pfn >= table_top)
18216 + panic("alloc_low_page: ran out of memory");
18217 +
18218 + adr = __va(pfn * PAGE_SIZE);
18219 + memset(adr, 0, PAGE_SIZE);
18220 + *phys = pfn * PAGE_SIZE;
18221 + return adr;
18222 +}
18223 +
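
The alloc_low_page() added above is a simple bump allocator: before bootmem is initialised (after_init_bootmem == 0) it hands out successive page frames from a pre-reserved window and panics once [table_start, table_top) is exhausted. A userspace sketch of the same allocation pattern, with a malloc'd arena standing in for the physical window:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Pages are handed out sequentially from a pre-sized window and never
 * freed; running past the window is fatal. */
#define PAGE_SIZE 4096UL

static unsigned char *arena;
static unsigned long table_end, table_top;      /* both in pages */

static void *alloc_low_page(unsigned long *phys)
{
        unsigned long pfn = table_end++;
        void *adr;

        if (pfn >= table_top) {
                fprintf(stderr, "alloc_low_page: ran out of memory\n");
                exit(1);
        }
        adr = arena + pfn * PAGE_SIZE;
        memset(adr, 0, PAGE_SIZE);
        *phys = pfn * PAGE_SIZE;
        return adr;
}

int main(void)
{
        unsigned long phys;

        table_top = 4;                          /* four-page window */
        arena = malloc(table_top * PAGE_SIZE);
        if (!arena)
                return 1;

        alloc_low_page(&phys);
        alloc_low_page(&phys);
        printf("second page at offset %#lx\n", phys);
        free(arena);
        return 0;
}
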
18224 /*
18225 * Creates a middle page table and puts a pointer to it in the
18226 * given global directory entry. This only returns the gd entry
18227 @@ -72,9 +94,12 @@ static pmd_t * __init one_md_table_init(
18228 pmd_t *pmd_table;
18229
18230 #ifdef CONFIG_X86_PAE
18231 + unsigned long phys;
18232 if (!(__pgd_val(*pgd) & _PAGE_PRESENT)) {
18233 - pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
18234 -
18235 + if (after_init_bootmem)
18236 + pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
18237 + else
18238 + pmd_table = (pmd_t *)alloc_low_page(&phys);
18239 paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
18240 make_lowmem_page_readonly(pmd_table, XENFEAT_writable_page_tables);
18241 set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
18242 @@ -101,12 +126,16 @@ static pte_t * __init one_page_table_ini
18243 #endif
18244 pte_t *page_table = NULL;
18245
18246 + if (after_init_bootmem) {
18247 #ifdef CONFIG_DEBUG_PAGEALLOC
18248 - page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
18249 + page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
18250 #endif
18251 - if (!page_table) {
18252 - page_table =
18253 + if (!page_table)
18254 + page_table =
18255 (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE);
18256 + } else {
18257 + unsigned long phys;
18258 + page_table = (pte_t *)alloc_low_page(&phys);
18259 }
18260
18261 paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT);
18262 @@ -167,24 +196,24 @@ static inline int is_kernel_text(unsigne
18263 * of max_low_pfn pages, by creating page tables starting from address
18264 * PAGE_OFFSET:
18265 */
18266 -static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
18267 +static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
18268 + unsigned long start_pfn,
18269 + unsigned long end_pfn,
18270 + int use_pse)
18271 {
18272 int pgd_idx, pmd_idx, pte_ofs;
18273 unsigned long pfn;
18274 pgd_t *pgd;
18275 pmd_t *pmd;
18276 pte_t *pte;
18277 + unsigned pages_2m = 0, pages_4k = 0;
18278
18279 - unsigned long max_ram_pfn = xen_start_info->nr_pages;
18280 - if (max_ram_pfn > max_low_pfn)
18281 - max_ram_pfn = max_low_pfn;
18282 + if (!cpu_has_pse)
18283 + use_pse = 0;
18284
18285 - pgd_idx = pgd_index(PAGE_OFFSET);
18286 + pfn = start_pfn;
18287 + pgd_idx = pgd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
18288 pgd = pgd_base + pgd_idx;
18289 - pfn = 0;
18290 - pmd_idx = pmd_index(PAGE_OFFSET);
18291 - pte_ofs = pte_index(PAGE_OFFSET);
18292 -
18293 for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
18294 #ifdef CONFIG_XEN
18295 /*
18296 @@ -198,10 +227,16 @@ static void __init kernel_physical_mappi
18297 #else
18298 pmd = one_md_table_init(pgd);
18299 #endif
18300 - if (pfn >= max_low_pfn)
18301 +
18302 + if (pfn >= end_pfn)
18303 continue;
18304 +#ifdef CONFIG_X86_PAE
18305 + pmd_idx = pmd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
18306 pmd += pmd_idx;
18307 - for (; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn;
18308 +#else
18309 + pmd_idx = 0;
18310 +#endif
18311 + for (; pmd_idx < PTRS_PER_PMD && pfn < end_pfn;
18312 pmd++, pmd_idx++) {
18313 unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET;
18314
18315 @@ -211,13 +246,8 @@ static void __init kernel_physical_mappi
18316 /*
18317 * Map with big pages if possible, otherwise
18318 * create normal page tables:
18319 - *
18320 - * Don't use a large page for the first 2/4MB of memory
18321 - * because there are often fixed size MTRRs in there
18322 - * and overlapping MTRRs into large pages can cause
18323 - * slowdowns.
18324 */
18325 - if (cpu_has_pse && !(pgd_idx == 0 && pmd_idx == 0)) {
18326 + if (use_pse) {
18327 unsigned int addr2;
18328 pgprot_t prot = PAGE_KERNEL_LARGE;
18329
18330 @@ -228,49 +258,35 @@ static void __init kernel_physical_mappi
18331 is_kernel_text(addr2))
18332 prot = PAGE_KERNEL_LARGE_EXEC;
18333
18334 + pages_2m++;
18335 set_pmd(pmd, pfn_pmd(pfn, prot));
18336
18337 pfn += PTRS_PER_PTE;
18338 - max_pfn_mapped = pfn;
18339 continue;
18340 }
18341 pte = one_page_table_init(pmd);
18342
18343 - for (pte += pte_ofs;
18344 - pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn;
18345 + pte_ofs = pte_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
18346 + pte += pte_ofs;
18347 + for (; pte_ofs < PTRS_PER_PTE && pfn < end_pfn;
18348 pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) {
18349 pgprot_t prot = PAGE_KERNEL;
18350
18351 /* XEN: Only map initial RAM allocation. */
18352 - if ((pfn >= max_ram_pfn) || pte_present(*pte))
18353 + if (pfn >= xen_start_info->nr_pages || pte_present(*pte))
18354 continue;
18355 if (is_kernel_text(addr))
18356 prot = PAGE_KERNEL_EXEC;
18357
18358 + pages_4k++;
18359 set_pte(pte, pfn_pte(pfn, prot));
18360 }
18361 - max_pfn_mapped = pfn;
18362 - pte_ofs = 0;
18363 }
18364 - pmd_idx = 0;
18365 }
18366 + update_page_count(PG_LEVEL_2M, pages_2m);
18367 + update_page_count(PG_LEVEL_4K, pages_4k);
18368 }
18369
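
kernel_physical_mapping_init() now takes an explicit [start_pfn, end_pfn) window and recomputes the pgd/pmd/pte indices from the address being mapped instead of carrying running counters, so it can be entered at an arbitrary pfn. The index arithmetic on its own (the shifts and PAGE_OFFSET below are assumed 32-bit PAE values chosen for the example, not taken from the patch):

#include <stdio.h>

/* Recompute table indices from the virtual address of the pfn being
 * mapped; 2 MB pmds, 4 KB pages and a 0xc0000000 kernel base assumed. */
#define PAGE_SHIFT      12
#define PMD_SHIFT       21
#define PGDIR_SHIFT     30
#define PTRS_PER_PMD    512
#define PTRS_PER_PTE    512
#define PAGE_OFFSET     0xc0000000UL

static unsigned long pgd_index(unsigned long va) { return va >> PGDIR_SHIFT; }
static unsigned long pmd_index(unsigned long va) { return (va >> PMD_SHIFT) & (PTRS_PER_PMD - 1); }
static unsigned long pte_index(unsigned long va) { return (va >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); }

int main(void)
{
        unsigned long start_pfn = 0x1234;       /* arbitrary resume point */
        unsigned long va = (start_pfn << PAGE_SHIFT) + PAGE_OFFSET;

        printf("pfn %#lx -> pgd %lu, pmd %lu, pte %lu\n",
               start_pfn, pgd_index(va), pmd_index(va), pte_index(va));
        return 0;
}
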
18370 -#ifndef CONFIG_XEN
18371 -
18372 -static inline int page_kills_ppro(unsigned long pagenr)
18373 -{
18374 - if (pagenr >= 0x70000 && pagenr <= 0x7003F)
18375 - return 1;
18376 - return 0;
18377 -}
18378 -
18379 -#else
18380 -
18381 -#define page_kills_ppro(p) 0
18382 -
18383 -#endif
18384 -
18385 /*
18386 * devmem_is_allowed() checks to see if /dev/mem access to a certain address
18387 * is valid. The argument is a physical page number.
18388 @@ -331,30 +347,63 @@ static void __init permanent_kmaps_init(
18389 pkmap_page_table = pte;
18390 }
18391
18392 -void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro)
18393 +static void __init add_one_highpage_init(struct page *page, int pfn)
18394 {
18395 - if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) {
18396 - ClearPageReserved(page);
18397 - init_page_count(page);
18398 - if (pfn < xen_start_info->nr_pages)
18399 - __free_page(page);
18400 - totalhigh_pages++;
18401 - } else
18402 - SetPageReserved(page);
18403 + ClearPageReserved(page);
18404 + init_page_count(page);
18405 + if (pfn < xen_start_info->nr_pages)
18406 + __free_page(page);
18407 + totalhigh_pages++;
18408 +}
18409 +
18410 +struct add_highpages_data {
18411 + unsigned long start_pfn;
18412 + unsigned long end_pfn;
18413 +};
18414 +
18415 +static int __init add_highpages_work_fn(unsigned long start_pfn,
18416 + unsigned long end_pfn, void *datax)
18417 +{
18418 + int node_pfn;
18419 + struct page *page;
18420 + unsigned long final_start_pfn, final_end_pfn;
18421 + struct add_highpages_data *data;
18422 +
18423 + data = (struct add_highpages_data *)datax;
18424 +
18425 + final_start_pfn = max(start_pfn, data->start_pfn);
18426 + final_end_pfn = min(end_pfn, data->end_pfn);
18427 + if (final_start_pfn >= final_end_pfn)
18428 + return 0;
18429 +
18430 + for (node_pfn = final_start_pfn; node_pfn < final_end_pfn;
18431 + node_pfn++) {
18432 + if (!pfn_valid(node_pfn))
18433 + continue;
18434 + page = pfn_to_page(node_pfn);
18435 + add_one_highpage_init(page, node_pfn);
18436 + }
18437 +
18438 + return 0;
18439 +
18440 +}
18441 +
18442 +void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn,
18443 + unsigned long end_pfn)
18444 +{
18445 + struct add_highpages_data data;
18446 +
18447 + data.start_pfn = start_pfn;
18448 + data.end_pfn = end_pfn;
18449 +
18450 + work_with_active_regions(nid, add_highpages_work_fn, &data);
18451 }
18452
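
add_highpages_work_fn() is invoked once per active memory region by work_with_active_regions() and clamps that region against the highmem window passed through the datax cookie, skipping regions that fall entirely outside it. The clamp in isolation (type and function names are illustrative):

#include <stdio.h>

/* Intersect an active region with the highmem window; an empty result
 * means the region is skipped. */
struct pfn_range { unsigned long start_pfn, end_pfn; };

static int clamp_range(struct pfn_range region, struct pfn_range window,
                       struct pfn_range *out)
{
        out->start_pfn = region.start_pfn > window.start_pfn ?
                         region.start_pfn : window.start_pfn;
        out->end_pfn = region.end_pfn < window.end_pfn ?
                       region.end_pfn : window.end_pfn;
        return out->start_pfn < out->end_pfn;
}

int main(void)
{
        struct pfn_range region  = { 0x30000, 0x40000 };
        struct pfn_range highmem = { 0x38000, 0x100000 };
        struct pfn_range hit;

        if (clamp_range(region, highmem, &hit))
                printf("init pfns %#lx - %#lx\n", hit.start_pfn, hit.end_pfn);
        return 0;
}
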
18453 #ifndef CONFIG_NUMA
18454 -static void __init set_highmem_pages_init(int bad_ppro)
18455 +static void __init set_highmem_pages_init(void)
18456 {
18457 - int pfn;
18458 + add_highpages_with_active_regions(0, highstart_pfn, highend_pfn);
18459
18460 - for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) {
18461 - /*
18462 - * Holes under sparsemem might not have no mem_map[]:
18463 - */
18464 - if (pfn_valid(pfn))
18465 - add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro);
18466 - }
18467 totalram_pages += totalhigh_pages;
18468 }
18469 #endif /* !CONFIG_NUMA */
18470 @@ -362,24 +411,11 @@ static void __init set_highmem_pages_ini
18471 #else
18472 # define kmap_init() do { } while (0)
18473 # define permanent_kmaps_init(pgd_base) do { } while (0)
18474 -# define set_highmem_pages_init(bad_ppro) do { } while (0)
18475 +# define set_highmem_pages_init() do { } while (0)
18476 #endif /* CONFIG_HIGHMEM */
18477
18478 -pteval_t __PAGE_KERNEL = _PAGE_KERNEL;
18479 -EXPORT_SYMBOL(__PAGE_KERNEL);
18480 -
18481 -pteval_t __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
18482 -
18483 pgd_t *swapper_pg_dir;
18484
18485 -static void __init xen_pagetable_setup_start(pgd_t *base)
18486 -{
18487 -}
18488 -
18489 -static void __init xen_pagetable_setup_done(pgd_t *base)
18490 -{
18491 -}
18492 -
18493 /*
18494 * Build a proper pagetable for the kernel mappings. Up until this
18495 * point, we've been running on some set of pagetables constructed by
18496 @@ -399,27 +435,10 @@ static void __init xen_pagetable_setup_d
18497 * be partially populated, and so it avoids stomping on any existing
18498 * mappings.
18499 */
18500 -static void __init pagetable_init(void)
18501 +static void __init early_ioremap_page_table_range_init(pgd_t *pgd_base)
18502 {
18503 - pgd_t *pgd_base = (pgd_t *)xen_start_info->pt_base;
18504 unsigned long vaddr, end;
18505
18506 - xen_pagetable_setup_start(pgd_base);
18507 -
18508 - /* Enable PSE if available */
18509 - if (cpu_has_pse)
18510 - set_in_cr4(X86_CR4_PSE);
18511 -
18512 - /* Enable PGE if available */
18513 - if (cpu_has_pge) {
18514 - set_in_cr4(X86_CR4_PGE);
18515 - __PAGE_KERNEL |= _PAGE_GLOBAL;
18516 - __PAGE_KERNEL_EXEC |= _PAGE_GLOBAL;
18517 - }
18518 -
18519 - kernel_physical_mapping_init(pgd_base);
18520 - remap_numa_kva();
18521 -
18522 /*
18523 * Fixed mappings, only the page table structure has to be
18524 * created - mappings will be set by set_fixmap():
18525 @@ -429,10 +448,13 @@ static void __init pagetable_init(void)
18526 end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
18527 page_table_range_init(vaddr, end, pgd_base);
18528 early_ioremap_reset();
18529 +}
18530
18531 - permanent_kmaps_init(pgd_base);
18532 +static void __init pagetable_init(void)
18533 +{
18534 + pgd_t *pgd_base = (pgd_t *)xen_start_info->pt_base;
18535
18536 - xen_pagetable_setup_done(pgd_base);
18537 + permanent_kmaps_init(pgd_base);
18538 }
18539
18540 #if defined(CONFIG_ACPI_SLEEP) && !defined(CONFIG_XEN)
18541 @@ -475,7 +497,7 @@ void zap_low_mappings(void)
18542
18543 int nx_enabled;
18544
18545 -pteval_t __supported_pte_mask __read_mostly = ~_PAGE_NX;
18546 +pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL);
18547 EXPORT_SYMBOL_GPL(__supported_pte_mask);
18548
18549 #ifdef CONFIG_X86_PAE
18550 @@ -528,42 +550,364 @@ static void __init set_nx(void)
18551 }
18552 #endif
18553
18554 +/* user-defined highmem size */
18555 +static unsigned int highmem_pages = -1;
18556 +
18557 /*
18558 - * paging_init() sets up the page tables - note that the first 8MB are
18559 - * already mapped by head.S.
18560 - *
18561 - * This routines also unmaps the page at virtual kernel address 0, so
18562 - * that we can trap those pesky NULL-reference errors in the kernel.
18563 + * highmem=size forces highmem to be exactly 'size' bytes.
18564 + * This works even on boxes that have no highmem otherwise.
18565 + * This also works to reduce highmem size on bigger boxes.
18566 */
18567 -void __init paging_init(void)
18568 +static int __init parse_highmem(char *arg)
18569 +{
18570 + if (!arg)
18571 + return -EINVAL;
18572 +
18573 + highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT;
18574 + return 0;
18575 +}
18576 +early_param("highmem", parse_highmem);
18577 +
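
parse_highmem() converts the highmem= argument from a byte count (memparse() understands the usual K/M/G suffixes) into a page count by shifting right by PAGE_SHIFT. A standalone sketch with a minimal memparse() lookalike, since the real helper is kernel-internal:

#include <stdio.h>
#include <stdlib.h>

/* Turn "size[KMG]" into a page count, as parse_highmem() does. */
#define PAGE_SHIFT 12

static unsigned long long memparse_like(const char *s)
{
        char *end;
        unsigned long long v = strtoull(s, &end, 0);

        switch (*end) {
        case 'G': case 'g': v <<= 10;   /* fall through */
        case 'M': case 'm': v <<= 10;   /* fall through */
        case 'K': case 'k': v <<= 10;   /* fall through */
        default: break;
        }
        return v;
}

int main(void)
{
        unsigned long highmem_pages = memparse_like("512M") >> PAGE_SHIFT;

        printf("highmem_pages = %lu\n", highmem_pages);
        return 0;
}
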
18578 +/*
18579 + * Determine low and high memory ranges:
18580 + */
18581 +void __init find_low_pfn_range(void)
18582 +{
18583 + /* it could update max_pfn */
18584 +
18585 + /* max_low_pfn is 0, we already have early_res support */
18586 +
18587 + max_low_pfn = max_pfn;
18588 + if (max_low_pfn > MAXMEM_PFN) {
18589 + if (highmem_pages == -1)
18590 + highmem_pages = max_pfn - MAXMEM_PFN;
18591 + if (highmem_pages + MAXMEM_PFN < max_pfn)
18592 + max_pfn = MAXMEM_PFN + highmem_pages;
18593 + if (highmem_pages + MAXMEM_PFN > max_pfn) {
18594 + printk(KERN_WARNING "only %luMB highmem pages "
18595 + "available, ignoring highmem size of %uMB.\n",
18596 + pages_to_mb(max_pfn - MAXMEM_PFN),
18597 + pages_to_mb(highmem_pages));
18598 + highmem_pages = 0;
18599 + }
18600 + max_low_pfn = MAXMEM_PFN;
18601 +#ifndef CONFIG_HIGHMEM
18602 + /* Maximum memory usable is what is directly addressable */
18603 + printk(KERN_WARNING "Warning only %ldMB will be used.\n",
18604 + MAXMEM>>20);
18605 + if (max_pfn > MAX_NONPAE_PFN)
18606 + printk(KERN_WARNING
18607 + "Use a HIGHMEM64G enabled kernel.\n");
18608 + else
18609 + printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
18610 + max_pfn = MAXMEM_PFN;
18611 +#else /* !CONFIG_HIGHMEM */
18612 +#ifndef CONFIG_HIGHMEM64G
18613 + if (max_pfn > MAX_NONPAE_PFN) {
18614 + max_pfn = MAX_NONPAE_PFN;
18615 + printk(KERN_WARNING "Warning only 4GB will be used."
18616 + "Use a HIGHMEM64G enabled kernel.\n");
18617 + }
18618 +#endif /* !CONFIG_HIGHMEM64G */
18619 +#endif /* !CONFIG_HIGHMEM */
18620 + } else {
18621 + if (highmem_pages == -1)
18622 + highmem_pages = 0;
18623 +#ifdef CONFIG_HIGHMEM
18624 + if (highmem_pages >= max_pfn) {
18625 + printk(KERN_ERR "highmem size specified (%uMB) is "
18626 + "bigger than pages available (%luMB)!.\n",
18627 + pages_to_mb(highmem_pages),
18628 + pages_to_mb(max_pfn));
18629 + highmem_pages = 0;
18630 + }
18631 + if (highmem_pages) {
18632 + if (max_low_pfn - highmem_pages <
18633 + 64*1024*1024/PAGE_SIZE){
18634 + printk(KERN_ERR "highmem size %uMB results in "
18635 + "smaller than 64MB lowmem, ignoring it.\n"
18636 + , pages_to_mb(highmem_pages));
18637 + highmem_pages = 0;
18638 + }
18639 + max_low_pfn -= highmem_pages;
18640 + }
18641 +#else
18642 + if (highmem_pages)
18643 + printk(KERN_ERR "ignoring highmem size on non-highmem"
18644 + " kernel!\n");
18645 +#endif
18646 + }
18647 +}
18648 +
18649 +#ifndef CONFIG_NEED_MULTIPLE_NODES
18650 +void __init initmem_init(unsigned long start_pfn,
18651 + unsigned long end_pfn)
18652 +{
18653 +#ifdef CONFIG_HIGHMEM
18654 + highstart_pfn = highend_pfn = max_pfn;
18655 + if (max_pfn > max_low_pfn)
18656 + highstart_pfn = max_low_pfn;
18657 + memory_present(0, 0, highend_pfn);
18658 + e820_register_active_regions(0, 0, highend_pfn);
18659 + printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
18660 + pages_to_mb(highend_pfn - highstart_pfn));
18661 + num_physpages = highend_pfn;
18662 + high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
18663 +#else
18664 + memory_present(0, 0, max_low_pfn);
18665 + e820_register_active_regions(0, 0, max_low_pfn);
18666 + num_physpages = max_low_pfn;
18667 + high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
18668 +#endif
18669 +#ifdef CONFIG_FLATMEM
18670 + max_mapnr = num_physpages;
18671 +#endif
18672 + printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
18673 + pages_to_mb(max_low_pfn));
18674 +
18675 + setup_bootmem_allocator();
18676 +}
18677 +#endif /* !CONFIG_NEED_MULTIPLE_NODES */
18678 +
18679 +static void __init zone_sizes_init(void)
18680 +{
18681 + unsigned long max_zone_pfns[MAX_NR_ZONES];
18682 + memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
18683 + max_zone_pfns[ZONE_DMA] =
18684 + virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
18685 + max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
18686 +#ifdef CONFIG_HIGHMEM
18687 + max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
18688 +#endif
18689 +
18690 + free_area_init_nodes(max_zone_pfns);
18691 +}
18692 +
18693 +void __init setup_bootmem_allocator(void)
18694 {
18695 int i;
18696 + unsigned long bootmap_size, bootmap;
18697 + unsigned long end_pfn = min(max_low_pfn, xen_start_info->nr_pages);
18698 +
18699 + /*
18700 + * Initialize the boot-time allocator (with low memory only):
18701 + */
18702 + bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
18703 + bootmap = find_e820_area(min_low_pfn<<PAGE_SHIFT,
18704 + min(max_pfn_mapped, xen_start_info->nr_pages)<<PAGE_SHIFT,
18705 + bootmap_size, PAGE_SIZE);
18706 + if (bootmap == -1L)
18707 + panic("Cannot find bootmem map of size %ld\n", bootmap_size);
18708 + reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
18709 +
18710 + /* don't touch min_low_pfn */
18711 + bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
18712 + min_low_pfn, end_pfn);
18713 + printk(KERN_INFO " mapped low ram: 0 - %08lx\n",
18714 + max_pfn_mapped<<PAGE_SHIFT);
18715 + printk(KERN_INFO " low ram: %08lx - %08lx\n",
18716 + min_low_pfn<<PAGE_SHIFT, max_low_pfn<<PAGE_SHIFT);
18717 + printk(KERN_INFO " bootmap %08lx - %08lx\n",
18718 + bootmap, bootmap + bootmap_size);
18719 + for_each_online_node(i)
18720 + free_bootmem_with_active_regions(i, end_pfn);
18721 + early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
18722 +
18723 + after_init_bootmem = 1;
18724 +}
18725 +
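
setup_bootmem_allocator() sizes the boot-time bitmap with bootmem_bootmap_pages(): one bit per page frame up to end_pfn, rounded up to whole pages, which is the block it then searches for with find_e820_area(). The sizing arithmetic redone in plain C (a simplified stand-in; the real helper also word-aligns the byte count, which is ignored here):

#include <stdio.h>

/* One bit per page frame, rounded up to whole pages. */
#define PAGE_SIZE  4096UL
#define PAGE_SHIFT 12

static unsigned long bootmap_pages(unsigned long pages)
{
        unsigned long bytes = (pages + 7) / 8;

        return (bytes + PAGE_SIZE - 1) >> PAGE_SHIFT;
}

int main(void)
{
        unsigned long end_pfn = 229376;         /* 896 MB of lowmem */

        printf("bitmap needs %lu page(s)\n", bootmap_pages(end_pfn));
        return 0;
}
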
18726 +static unsigned long __init extend_init_mapping(unsigned long tables_space)
18727 +{
18728 + unsigned long start_pfn = (__pa(xen_start_info->pt_base) >> PAGE_SHIFT)
18729 + + xen_start_info->nr_pt_frames;
18730 + unsigned long start = start_pfn, va = (unsigned long)&_text;
18731 + pgd_t *pgd;
18732 + pud_t *pud;
18733 + pmd_t *pmd;
18734 + pte_t *pte;
18735 +
18736 + /* Ensure init mappings cover kernel text/data and initial tables. */
18737 + while (va < PAGE_OFFSET + (start_pfn << PAGE_SHIFT) + tables_space) {
18738 + pgd = pgd_offset_k(va);
18739 + pud = pud_offset(pgd, va);
18740 + pmd = pmd_offset(pud, va);
18741 + if (pmd_none(*pmd)) {
18742 + unsigned long pa = start_pfn++ << PAGE_SHIFT;
18743 +
18744 + memset(__va(pa), 0, PAGE_SIZE);
18745 + make_lowmem_page_readonly(__va(pa),
18746 + XENFEAT_writable_page_tables);
18747 + xen_l2_entry_update(pmd, __pmd(pa | _KERNPG_TABLE));
18748 + }
18749 + pte = pte_offset_kernel(pmd, va);
18750 + if (pte_none(*pte)) {
18751 + pte_t new_pte = __pte(__pa(va) | _KERNPG_TABLE);
18752 +
18753 + if (HYPERVISOR_update_va_mapping(va, new_pte, 0))
18754 + BUG();
18755 + }
18756 + va += PAGE_SIZE;
18757 + }
18758 +
18759 + /* Finally, blow away any spurious initial mappings. */
18760 + while (1) {
18761 + pgd = pgd_offset_k(va);
18762 + pud = pud_offset(pgd, va);
18763 + pmd = pmd_offset(pud, va);
18764 + if (pmd_none(*pmd))
18765 + break;
18766 + if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
18767 + BUG();
18768 + va += PAGE_SIZE;
18769 + }
18770 +
18771 + if (start_pfn > start)
18772 + reserve_early(start << PAGE_SHIFT,
18773 + start_pfn << PAGE_SHIFT, "INITMAP");
18774 +
18775 + return start_pfn;
18776 +}
18777 +
18778 +static void __init find_early_table_space(unsigned long end)
18779 +{
18780 + unsigned long puds, pmds, ptes, tables;
18781 +
18782 + puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
18783 + tables = PAGE_ALIGN(puds * sizeof(pud_t));
18784 +
18785 + pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
18786 + tables += PAGE_ALIGN(pmds * sizeof(pmd_t));
18787 +
18788 + if (cpu_has_pse) {
18789 + unsigned long extra;
18790 +
18791 + extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
18792 + extra += PMD_SIZE;
18793 + ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
18794 + } else
18795 + ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
18796 +
18797 + tables += PAGE_ALIGN(ptes * sizeof(pte_t));
18798 +
18799 + /* for fixmap */
18800 + tables += PAGE_SIZE
18801 + * ((((FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK)
18802 + - (__fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK))
18803 + >> PMD_SHIFT);
18804 +
18805 + table_start = extend_init_mapping(tables);
18806 +
18807 + table_end = table_start;
18808 + table_top = table_start + (tables>>PAGE_SHIFT);
18809 +
18810 + printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
18811 + end, table_start << PAGE_SHIFT,
18812 + (table_start << PAGE_SHIFT) + tables);
18813 +}
18814 +
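
find_early_table_space() estimates up front how many pages of puds, pmds and ptes the direct mapping of `end` bytes will need, so extend_init_mapping() can reserve them as one contiguous block; with PSE only the unaligned head/tail of the range still needs pte pages, hence the extra term. The same estimate as a standalone sketch (PAE-style sizes and 8-byte entries assumed; the fixmap term of the original is left out):

#include <stdio.h>

/* Rough count of page-table bytes needed to direct-map `end` bytes. */
#define PAGE_SIZE 4096UL
#define PMD_SIZE  (1UL << 21)
#define PUD_SIZE  (1UL << 30)
#define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

static unsigned long early_table_bytes(unsigned long end, int use_pse)
{
        unsigned long puds = (end + PUD_SIZE - 1) / PUD_SIZE;
        unsigned long pmds = (end + PMD_SIZE - 1) / PMD_SIZE;
        unsigned long ptes, tables;

        tables  = PAGE_ALIGN(puds * 8);
        tables += PAGE_ALIGN(pmds * 8);

        if (use_pse)    /* only the unaligned tail needs pte pages */
                ptes = (end % PMD_SIZE + PMD_SIZE + PAGE_SIZE - 1) / PAGE_SIZE;
        else
                ptes = (end + PAGE_SIZE - 1) / PAGE_SIZE;
        tables += PAGE_ALIGN(ptes * 8);

        return tables;
}

int main(void)
{
        printf("%lu KB of early page tables for 896 MB with PSE\n",
               early_table_bytes(896UL << 20, 1) >> 10);
        return 0;
}
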
18815 +unsigned long __init_refok init_memory_mapping(unsigned long start,
18816 + unsigned long end)
18817 +{
18818 + pgd_t *pgd_base = swapper_pg_dir;
18819 + unsigned long start_pfn, end_pfn;
18820 + unsigned long big_page_start;
18821 +
18822 + /*
18823 + * Find space for the kernel direct mapping tables.
18824 + */
18825 + if (!after_init_bootmem)
18826 + find_early_table_space(end);
18827
18828 #ifdef CONFIG_X86_PAE
18829 set_nx();
18830 if (nx_enabled)
18831 printk(KERN_INFO "NX (Execute Disable) protection: active\n");
18832 #endif
18833 +
18834 + /* Enable PSE if available */
18835 + if (cpu_has_pse)
18836 + set_in_cr4(X86_CR4_PSE);
18837 +
18838 + /* Enable PGE if available */
18839 + if (cpu_has_pge) {
18840 + set_in_cr4(X86_CR4_PGE);
18841 + __supported_pte_mask |= _PAGE_GLOBAL;
18842 + }
18843 +
18844 + /*
18845 + * Don't use a large page for the first 2/4MB of memory
18846 + * because there are often fixed size MTRRs in there
18847 + * and overlapping MTRRs into large pages can cause
18848 + * slowdowns.
18849 + */
18850 + big_page_start = PMD_SIZE;
18851 +
18852 + if (start < big_page_start) {
18853 + start_pfn = start >> PAGE_SHIFT;
18854 + end_pfn = min(big_page_start>>PAGE_SHIFT, end>>PAGE_SHIFT);
18855 + } else {
18856 + /* head is not big page alignment ? */
18857 + start_pfn = start >> PAGE_SHIFT;
18858 + end_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
18859 + << (PMD_SHIFT - PAGE_SHIFT);
18860 + }
18861 + if (start_pfn < end_pfn)
18862 + kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn, 0);
18863 +
18864 + /* big page range */
18865 + start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
18866 + << (PMD_SHIFT - PAGE_SHIFT);
18867 + if (start_pfn < (big_page_start >> PAGE_SHIFT))
18868 + start_pfn = big_page_start >> PAGE_SHIFT;
18869 + end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
18870 + if (start_pfn < end_pfn)
18871 + kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn,
18872 + cpu_has_pse);
18873 +
18874 + /* tail is not big page alignment ? */
18875 + start_pfn = end_pfn;
18876 + if (start_pfn > (big_page_start>>PAGE_SHIFT)) {
18877 + end_pfn = end >> PAGE_SHIFT;
18878 + if (start_pfn < end_pfn)
18879 + kernel_physical_mapping_init(pgd_base, start_pfn,
18880 + end_pfn, 0);
18881 + }
18882 +
18883 + early_ioremap_page_table_range_init(pgd_base);
18884 +
18885 + __flush_tlb_all();
18886 +
18887 + if (!after_init_bootmem)
18888 + reserve_early(table_start << PAGE_SHIFT,
18889 + table_end << PAGE_SHIFT, "PGTABLE");
18890 +
18891 + if (!after_init_bootmem)
18892 + early_memtest(start, end);
18893 +
18894 + return end >> PAGE_SHIFT;
18895 +}
18896 +
18897 +
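
init_memory_mapping() deliberately maps the region below big_page_start and any unaligned head or tail with 4 KB pages, and only the 2 MB-aligned middle with large pages, to avoid overlapping the fixed MTRRs at the bottom of memory. The range-splitting arithmetic for the common case where start lies below the first large page (PMD_SIZE and the sample addresses are assumptions for the example):

#include <stdio.h>

/* Split [start, end) into a 4 KB head, a 2 MB-aligned body and a 4 KB tail. */
#define PAGE_SHIFT 12
#define PMD_SHIFT  21
#define PMD_SIZE   (1UL << PMD_SHIFT)

int main(void)
{
        unsigned long start = 0x00100000UL;     /* 1 MB   */
        unsigned long end   = 0x38000000UL;     /* 896 MB */

        unsigned long head_s = start >> PAGE_SHIFT;
        unsigned long head_e = ((start + PMD_SIZE - 1) >> PMD_SHIFT)
                               << (PMD_SHIFT - PAGE_SHIFT);
        unsigned long body_e = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
        unsigned long tail_e = end >> PAGE_SHIFT;

        printf("4 KB head: pfn %#lx - %#lx\n", head_s, head_e);
        printf("2 MB body: pfn %#lx - %#lx\n", head_e, body_e);
        printf("4 KB tail: pfn %#lx - %#lx\n", body_e, tail_e);
        return 0;
}
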
18898 +/*
18899 + * paging_init() sets up the page tables - note that the first 8MB are
18900 + * already mapped by head.S.
18901 + *
18902 + * This routines also unmaps the page at virtual kernel address 0, so
18903 + * that we can trap those pesky NULL-reference errors in the kernel.
18904 + */
18905 +void __init paging_init(void)
18906 +{
18907 pagetable_init();
18908
18909 __flush_tlb_all();
18910
18911 kmap_init();
18912
18913 - /* Switch to the real shared_info page, and clear the
18914 - * dummy page. */
18915 - set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
18916 - HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
18917 - memset(empty_zero_page, 0, sizeof(empty_zero_page));
18918 -
18919 - /* Setup mapping of lower 1st MB */
18920 - for (i = 0; i < NR_FIX_ISAMAPS; i++)
18921 - if (is_initial_xendomain())
18922 - set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE);
18923 - else
18924 - __set_fixmap(FIX_ISAMAP_BEGIN - i,
18925 - virt_to_machine(empty_zero_page),
18926 - PAGE_KERNEL_RO);
18927 + /*
18928 + * NOTE: at this point the bootmem allocator is fully available.
18929 + */
18930 + sparse_init();
18931 + zone_sizes_init();
18932 }
18933
18934 /*
18935 @@ -598,7 +942,7 @@ static struct kcore_list kcore_mem, kcor
18936 void __init mem_init(void)
18937 {
18938 int codesize, reservedpages, datasize, initsize;
18939 - int tmp, bad_ppro;
18940 + int tmp;
18941 unsigned long pfn;
18942
18943 pci_iommu_alloc();
18944 @@ -606,19 +950,6 @@ void __init mem_init(void)
18945 #ifdef CONFIG_FLATMEM
18946 BUG_ON(!mem_map);
18947 #endif
18948 - bad_ppro = ppro_with_ram_bug();
18949 -
18950 -#ifdef CONFIG_HIGHMEM
18951 - /* check that fixmap and pkmap do not overlap */
18952 - if (PKMAP_BASE + LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
18953 - printk(KERN_ERR
18954 - "fixmap and kmap areas overlap - this will crash\n");
18955 - printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n",
18956 - PKMAP_BASE, PKMAP_BASE + LAST_PKMAP*PAGE_SIZE,
18957 - FIXADDR_START);
18958 - BUG();
18959 - }
18960 -#endif
18961 /* this will put all low memory onto the freelists */
18962 totalram_pages += free_all_bootmem();
18963 /* XEN: init and count low-mem pages outside initial allocation. */
18964 @@ -636,7 +967,7 @@ void __init mem_init(void)
18965 if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
18966 reservedpages++;
18967
18968 - set_highmem_pages_init(bad_ppro);
18969 + set_highmem_pages_init();
18970
18971 codesize = (unsigned long) &_etext - (unsigned long) &_text;
18972 datasize = (unsigned long) &_edata - (unsigned long) &_etext;
18973 @@ -657,7 +988,6 @@ void __init mem_init(void)
18974 (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10))
18975 );
18976
18977 -#if 1 /* double-sanity-check paranoia */
18978 printk(KERN_INFO "virtual kernel memory layout:\n"
18979 " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
18980 #ifdef CONFIG_HIGHMEM
18981 @@ -698,7 +1028,6 @@ void __init mem_init(void)
18982 #endif
18983 BUG_ON(VMALLOC_START > VMALLOC_END);
18984 BUG_ON((unsigned long)high_memory > VMALLOC_START);
18985 -#endif /* double-sanity-check paranoia */
18986
18987 if (boot_cpu_data.wp_works_ok < 0)
18988 test_wp_bit();
18989 @@ -755,6 +1084,8 @@ void mark_rodata_ro(void)
18990 unsigned long start = PFN_ALIGN(_text);
18991 unsigned long size = PFN_ALIGN(_etext) - start;
18992
18993 +#ifndef CONFIG_DYNAMIC_FTRACE
18994 + /* Dynamic tracing modifies the kernel text section */
18995 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
18996 printk(KERN_INFO "Write protecting the kernel text: %luk\n",
18997 size >> 10);
18998 @@ -767,6 +1098,8 @@ void mark_rodata_ro(void)
18999 printk(KERN_INFO "Testing CPA: write protecting again\n");
19000 set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
19001 #endif
19002 +#endif /* CONFIG_DYNAMIC_FTRACE */
19003 +
19004 start += size;
19005 size = (unsigned long)__end_rodata - start;
19006 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
19007 @@ -829,3 +1162,9 @@ void free_initrd_mem(unsigned long start
19008 free_init_pages("initrd memory", start, end);
19009 }
19010 #endif
19011 +
19012 +int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
19013 + int flags)
19014 +{
19015 + return reserve_bootmem(phys, len, flags);
19016 +}
19017 --- sle11-2009-06-04.orig/arch/x86/mm/init_64-xen.c 2009-03-16 16:38:05.000000000 +0100
19018 +++ sle11-2009-06-04/arch/x86/mm/init_64-xen.c 2009-06-04 10:21:39.000000000 +0200
19019 @@ -21,6 +21,7 @@
19020 #include <linux/swap.h>
19021 #include <linux/smp.h>
19022 #include <linux/init.h>
19023 +#include <linux/initrd.h>
19024 #include <linux/pagemap.h>
19025 #include <linux/bootmem.h>
19026 #include <linux/proc_fs.h>
19027 @@ -52,6 +53,14 @@
19028
19029 #include <xen/features.h>
19030
19031 +/*
19032 + * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
19033 + * The direct mapping extends to max_pfn_mapped, so that we can directly access
19034 + * apertures, ACPI and other tables without having to play with fixmaps.
19035 + */
19036 +unsigned long max_low_pfn_mapped;
19037 +unsigned long max_pfn_mapped;
19038 +
19039 #if CONFIG_XEN_COMPAT <= 0x030002
19040 unsigned int __kernel_page_user;
19041 EXPORT_SYMBOL(__kernel_page_user);
19042 @@ -60,13 +69,12 @@ EXPORT_SYMBOL(__kernel_page_user);
19043 int after_bootmem;
19044
19045 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
19046 -extern unsigned long start_pfn;
19047
19048 extern pmd_t level2_fixmap_pgt[PTRS_PER_PMD];
19049 extern pte_t level1_fixmap_pgt[PTRS_PER_PTE];
19050
19051 #ifndef CONFIG_XEN
19052 -int direct_gbpages __meminitdata
19053 +int direct_gbpages
19054 #ifdef CONFIG_DIRECT_GBPAGES
19055 = 1
19056 #endif
19057 @@ -145,55 +153,23 @@ void __meminit early_make_page_readonly(
19058 * around without checking the pgd every time.
19059 */
19060
19061 -void show_mem(void)
19062 -{
19063 - long i, total = 0, reserved = 0;
19064 - long shared = 0, cached = 0;
19065 - struct page *page;
19066 - pg_data_t *pgdat;
19067 -
19068 - printk(KERN_INFO "Mem-info:\n");
19069 - show_free_areas();
19070 - for_each_online_pgdat(pgdat) {
19071 - for (i = 0; i < pgdat->node_spanned_pages; ++i) {
19072 - /*
19073 - * This loop can take a while with 256 GB and
19074 - * 4k pages so defer the NMI watchdog:
19075 - */
19076 - if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
19077 - touch_nmi_watchdog();
19078 -
19079 - if (!pfn_valid(pgdat->node_start_pfn + i))
19080 - continue;
19081 -
19082 - page = pfn_to_page(pgdat->node_start_pfn + i);
19083 - total++;
19084 - if (PageReserved(page))
19085 - reserved++;
19086 - else if (PageSwapCache(page))
19087 - cached++;
19088 - else if (page_count(page))
19089 - shared += page_count(page) - 1;
19090 - }
19091 - }
19092 - printk(KERN_INFO "%lu pages of RAM\n", total);
19093 - printk(KERN_INFO "%lu reserved pages\n", reserved);
19094 - printk(KERN_INFO "%lu pages shared\n", shared);
19095 - printk(KERN_INFO "%lu pages swap cached\n", cached);
19096 -}
19097 -
19098 static unsigned long __meminitdata table_start;
19099 -static unsigned long __meminitdata table_end;
19100 +static unsigned long __meminitdata table_cur;
19101 +static unsigned long __meminitdata table_top;
19102
19103 -static __init void *spp_getpage(void)
19104 +/*
19105 + * NOTE: This function is marked __ref because it calls __init function
19106 + * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
19107 + */
19108 +static __ref void *spp_getpage(void)
19109 {
19110 void *ptr;
19111
19112 if (after_bootmem)
19113 ptr = (void *) get_zeroed_page(GFP_ATOMIC);
19114 - else if (start_pfn < table_end) {
19115 - ptr = __va(start_pfn << PAGE_SHIFT);
19116 - start_pfn++;
19117 + else if (table_cur < table_top) {
19118 + ptr = __va(table_cur << PAGE_SHIFT);
19119 + table_cur++;
19120 memset(ptr, 0, PAGE_SIZE);
19121 } else
19122 ptr = alloc_bootmem_pages(PAGE_SIZE);
19123 @@ -208,30 +184,18 @@ static __init void *spp_getpage(void)
19124 return ptr;
19125 }
19126
19127 -#define pgd_offset_u(address) (__user_pgd(init_level4_pgt) + pgd_index(address))
19128 -#define pud_offset_u(address) (level3_user_pgt + pud_index(address))
19129 -
19130 -static __init void
19131 -set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot, int user_mode)
19132 +void
19133 +set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
19134 {
19135 - pgd_t *pgd;
19136 pud_t *pud;
19137 pmd_t *pmd;
19138 - pte_t *pte, new_pte;
19139 -
19140 - pr_debug("set_pte_phys %lx to %lx\n", vaddr, phys);
19141 + pte_t *pte;
19142
19143 - pgd = (user_mode ? pgd_offset_u(vaddr) : pgd_offset_k(vaddr));
19144 - if (pgd_none(*pgd)) {
19145 - printk(KERN_ERR
19146 - "PGD FIXMAP MISSING, it should be setup in head.S!\n");
19147 - return;
19148 - }
19149 - pud = (user_mode ? pud_offset_u(vaddr) : pud_offset(pgd, vaddr));
19150 + pud = pud_page + pud_index(vaddr);
19151 if (pud_none(*pud)) {
19152 pmd = (pmd_t *) spp_getpage();
19153 make_page_readonly(pmd, XENFEAT_writable_page_tables);
19154 - set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
19155 + pud_populate(&init_mm, pud, pmd);
19156 if (pmd != pmd_offset(pud, 0)) {
19157 printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
19158 pmd, pmd_offset(pud, 0));
19159 @@ -242,19 +206,20 @@ set_pte_phys(unsigned long vaddr, unsign
19160 if (pmd_none(*pmd)) {
19161 pte = (pte_t *) spp_getpage();
19162 make_page_readonly(pte, XENFEAT_writable_page_tables);
19163 - set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
19164 + pmd_populate_kernel(&init_mm, pmd, pte);
19165 if (pte != pte_offset_kernel(pmd, 0)) {
19166 printk(KERN_ERR "PAGETABLE BUG #02!\n");
19167 return;
19168 }
19169 }
19170 - if (pgprot_val(prot))
19171 - new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);
19172 - else
19173 - new_pte = __pte(0);
19174
19175 pte = pte_offset_kernel(pmd, vaddr);
19176 if (!pte_none(*pte) && __pte_val(new_pte) &&
19177 +#ifdef CONFIG_ACPI
19178 + /* __acpi_map_table() fails to properly call clear_fixmap() */
19179 + (vaddr < __fix_to_virt(FIX_ACPI_END) ||
19180 + vaddr > __fix_to_virt(FIX_ACPI_BEGIN)) &&
19181 +#endif
19182 __pte_val(*pte) != (__pte_val(new_pte) & __supported_pte_mask))
19183 pte_ERROR(*pte);
19184 set_pte(pte, new_pte);
19185 @@ -266,15 +231,13 @@ set_pte_phys(unsigned long vaddr, unsign
19186 __flush_tlb_one(vaddr);
19187 }
19188
19189 -static __init void
19190 -set_pte_phys_ma(unsigned long vaddr, unsigned long phys, pgprot_t prot)
19191 +void
19192 +set_pte_vaddr(unsigned long vaddr, pte_t pteval)
19193 {
19194 pgd_t *pgd;
19195 - pud_t *pud;
19196 - pmd_t *pmd;
19197 - pte_t *pte, new_pte;
19198 + pud_t *pud_page;
19199
19200 - pr_debug("set_pte_phys_ma %lx to %lx\n", vaddr, phys);
19201 + pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, __pte_val(pteval));
19202
19203 pgd = pgd_offset_k(vaddr);
19204 if (pgd_none(*pgd)) {
19205 @@ -282,47 +245,51 @@ set_pte_phys_ma(unsigned long vaddr, uns
19206 "PGD FIXMAP MISSING, it should be setup in head.S!\n");
19207 return;
19208 }
19209 - pud = pud_offset(pgd, vaddr);
19210 - if (pud_none(*pud)) {
19211 - pmd = (pmd_t *) spp_getpage();
19212 - make_page_readonly(pmd, XENFEAT_writable_page_tables);
19213 - set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
19214 - if (pmd != pmd_offset(pud, 0)) {
19215 - printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
19216 - pmd, pmd_offset(pud, 0));
19217 + pud_page = (pud_t*)pgd_page_vaddr(*pgd);
19218 + set_pte_vaddr_pud(pud_page, vaddr, pteval);
19219 +}
19220 +
19221 +#ifndef CONFIG_XEN
19222 +/*
19223 + * Create large page table mappings for a range of physical addresses.
19224 + */
19225 +static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
19226 + pgprot_t prot)
19227 +{
19228 + pgd_t *pgd;
19229 + pud_t *pud;
19230 + pmd_t *pmd;
19231 +
19232 + BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK));
19233 + for (; size; phys += PMD_SIZE, size -= PMD_SIZE) {
19234 + pgd = pgd_offset_k((unsigned long)__va(phys));
19235 + if (pgd_none(*pgd)) {
19236 + pud = (pud_t *) spp_getpage();
19237 + set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE |
19238 + _PAGE_USER));
19239 }
19240 - }
19241 - pmd = pmd_offset(pud, vaddr);
19242 - if (pmd_none(*pmd)) {
19243 - pte = (pte_t *) spp_getpage();
19244 - make_page_readonly(pte, XENFEAT_writable_page_tables);
19245 - set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
19246 - if (pte != pte_offset_kernel(pmd, 0)) {
19247 - printk(KERN_ERR "PAGETABLE BUG #02!\n");
19248 - return;
19249 + pud = pud_offset(pgd, (unsigned long)__va(phys));
19250 + if (pud_none(*pud)) {
19251 + pmd = (pmd_t *) spp_getpage();
19252 + set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE |
19253 + _PAGE_USER));
19254 }
19255 + pmd = pmd_offset(pud, phys);
19256 + BUG_ON(!pmd_none(*pmd));
19257 + set_pmd(pmd, __pmd(phys | pgprot_val(prot)));
19258 }
19259 - new_pte = pfn_pte_ma(phys >> PAGE_SHIFT, prot);
19260 +}
19261
19262 - pte = pte_offset_kernel(pmd, vaddr);
19263 - if (!pte_none(*pte) && __pte_val(new_pte) &&
19264 -#ifdef CONFIG_ACPI
19265 - /* __acpi_map_table() fails to properly call clear_fixmap() */
19266 - (vaddr < __fix_to_virt(FIX_ACPI_END) ||
19267 - vaddr > __fix_to_virt(FIX_ACPI_BEGIN)) &&
19268 -#endif
19269 - __pte_val(*pte) != (__pte_val(new_pte) & __supported_pte_mask))
19270 - pte_ERROR(*pte);
19271 - set_pte(pte, new_pte);
19272 +void __init init_extra_mapping_wb(unsigned long phys, unsigned long size)
19273 +{
19274 + __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE);
19275 +}
19276
19277 - /*
19278 - * It's enough to flush this one mapping.
19279 - * (PGE mappings get flushed as well)
19280 - */
19281 - __flush_tlb_one(vaddr);
19282 +void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
19283 +{
19284 + __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE_NOCACHE);
19285 }
19286
19287 -#ifndef CONFIG_XEN
19288 /*
19289 * The head.S code sets up the kernel high mapping:
19290 *
19291 @@ -352,63 +319,52 @@ void __init cleanup_highmap(void)
19292 }
19293 #endif
19294
19295 -/* NOTE: this is meant to be run only at boot */
19296 -void __init __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
19297 -{
19298 - unsigned long address = __fix_to_virt(idx);
19299 -
19300 - if (idx >= __end_of_fixed_addresses) {
19301 - printk(KERN_ERR "Invalid __set_fixmap\n");
19302 - return;
19303 - }
19304 - switch (idx) {
19305 - case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
19306 - set_pte_phys(address, phys, prot, 0);
19307 - set_pte_phys(address, phys, prot, 1);
19308 - break;
19309 - case FIX_EARLYCON_MEM_BASE:
19310 - xen_l1_entry_update(level1_fixmap_pgt + pte_index(address),
19311 - pfn_pte_ma(phys >> PAGE_SHIFT, prot));
19312 - break;
19313 - default:
19314 - set_pte_phys_ma(address, phys, prot);
19315 - break;
19316 - }
19317 -}
19318 -
19319 -static __meminit void *alloc_static_page(unsigned long *phys)
19320 +static __ref void *alloc_low_page(unsigned long *phys)
19321 {
19322 - unsigned long va = (start_pfn << PAGE_SHIFT) + __START_KERNEL_map;
19323 + unsigned long pfn;
19324 + void *adr;
19325
19326 if (after_bootmem) {
19327 - void *adr = (void *)get_zeroed_page(GFP_ATOMIC);
19328 + adr = (void *)get_zeroed_page(GFP_ATOMIC);
19329 *phys = __pa(adr);
19330
19331 return adr;
19332 }
19333
19334 - *phys = start_pfn << PAGE_SHIFT;
19335 - start_pfn++;
19336 - memset((void *)va, 0, PAGE_SIZE);
19337 - return (void *)va;
19338 + BUG_ON(!table_cur);
19339 + pfn = table_cur++;
19340 + if (pfn >= table_top)
19341 + panic("alloc_low_page: ran out of memory");
19342 +
19343 + adr = early_ioremap(pfn_to_mfn(pfn) * PAGE_SIZE, PAGE_SIZE);
19344 + memset(adr, 0, PAGE_SIZE);
19345 + *phys = pfn * PAGE_SIZE;
19346 + return adr;
19347 }
19348
19349 -#define PTE_SIZE PAGE_SIZE
19350 +static __ref void unmap_low_page(void *adr)
19351 +{
19352 + if (after_bootmem)
19353 + return;
19354 +
19355 + early_iounmap(adr, PAGE_SIZE);
19356 +}
19357
19358 static inline int __meminit make_readonly(unsigned long paddr)
19359 {
19360 extern char __vsyscall_0;
19361 int readonly = 0;
19362
19363 - /* Make new page tables read-only. */
19364 + /* Make new page tables read-only on the first pass. */
19365 if (!xen_feature(XENFEAT_writable_page_tables)
19366 + && !max_pfn_mapped
19367 && (paddr >= (table_start << PAGE_SHIFT))
19368 - && (paddr < (table_end << PAGE_SHIFT)))
19369 + && (paddr < (table_top << PAGE_SHIFT)))
19370 readonly = 1;
19371 /* Make old page tables read-only. */
19372 if (!xen_feature(XENFEAT_writable_page_tables)
19373 && (paddr >= (xen_start_info->pt_base - __START_KERNEL_map))
19374 - && (paddr < (start_pfn << PAGE_SHIFT)))
19375 + && (paddr < (table_cur << PAGE_SHIFT)))
19376 readonly = 1;
19377
19378 /*
19379 @@ -425,118 +381,131 @@ static inline int __meminit make_readonl
19380 return readonly;
19381 }
19382
19383 -#ifndef CONFIG_XEN
19384 -/* Must run before zap_low_mappings */
19385 -__meminit void *early_ioremap(unsigned long addr, unsigned long size)
19386 +static unsigned long __meminit
19387 +phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end)
19388 {
19389 - pmd_t *pmd, *last_pmd;
19390 - unsigned long vaddr;
19391 - int i, pmds;
19392 -
19393 - pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
19394 - vaddr = __START_KERNEL_map;
19395 - pmd = level2_kernel_pgt;
19396 - last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1;
19397 -
19398 - for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) {
19399 - for (i = 0; i < pmds; i++) {
19400 - if (pmd_present(pmd[i]))
19401 - goto continue_outer_loop;
19402 - }
19403 - vaddr += addr & ~PMD_MASK;
19404 - addr &= PMD_MASK;
19405 + unsigned pages = 0;
19406 + unsigned long last_map_addr = end;
19407 + int i;
19408 +
19409 + pte_t *pte = pte_page + pte_index(addr);
19410 +
19411 + for(i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) {
19412 + unsigned long pteval = addr | __PAGE_KERNEL;
19413
19414 - for (i = 0; i < pmds; i++, addr += PMD_SIZE)
19415 - set_pmd(pmd+i, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
19416 - __flush_tlb_all();
19417 -
19418 - return (void *)vaddr;
19419 -continue_outer_loop:
19420 - ;
19421 + if (addr >= end ||
19422 + (!after_bootmem &&
19423 + (addr >> PAGE_SHIFT) >= xen_start_info->nr_pages))
19424 + break;
19425 +
19426 + if (__pte_val(*pte))
19427 + continue;
19428 +
19429 + if (make_readonly(addr))
19430 + pteval &= ~_PAGE_RW;
19431 + if (0)
19432 + printk(" pte=%p addr=%lx pte=%016lx\n",
19433 + pte, addr, pteval);
19434 + if (!after_bootmem)
19435 + *pte = __pte(pteval & __supported_pte_mask);
19436 + else
19437 + set_pte(pte, __pte(pteval & __supported_pte_mask));
19438 + last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
19439 + pages++;
19440 }
19441 - printk("early_ioremap(0x%lx, %lu) failed\n", addr, size);
19442 - return NULL;
19443 + update_page_count(PG_LEVEL_4K, pages);
19444 +
19445 + return last_map_addr;
19446 }
19447
19448 -/*
19449 - * To avoid virtual aliases later:
19450 - */
19451 -__meminit void early_iounmap(void *addr, unsigned long size)
19452 +static unsigned long __meminit
19453 +phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end)
19454 {
19455 - unsigned long vaddr;
19456 - pmd_t *pmd;
19457 - int i, pmds;
19458 -
19459 - vaddr = (unsigned long)addr;
19460 - pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
19461 - pmd = level2_kernel_pgt + pmd_index(vaddr);
19462 -
19463 - for (i = 0; i < pmds; i++)
19464 - pmd_clear(pmd + i);
19465 + pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd);
19466
19467 - __flush_tlb_all();
19468 + BUG_ON(!max_pfn_mapped);
19469 + return phys_pte_init(pte, address, end);
19470 }
19471 -#endif
19472
19473 static unsigned long __meminit
19474 -phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
19475 +phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
19476 + unsigned long page_size_mask)
19477 {
19478 + unsigned long pages = 0;
19479 + unsigned long last_map_addr = end;
19480 + unsigned long start = address;
19481 +
19482 int i = pmd_index(address);
19483
19484 - for (; i < PTRS_PER_PMD; i++) {
19485 + for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
19486 unsigned long pte_phys;
19487 - pmd_t *pmd = pmd_page + i;
19488 - pte_t *pte, *pte_save;
19489 - int k;
19490 + pmd_t *pmd = pmd_page + pmd_index(address);
19491 + pte_t *pte;
19492
19493 if (address >= end)
19494 break;
19495
19496 if (__pmd_val(*pmd)) {
19497 - address += PMD_SIZE;
19498 + if (!pmd_large(*pmd)) {
19499 + spin_lock(&init_mm.page_table_lock);
19500 + last_map_addr = phys_pte_update(pmd, address,
19501 + end);
19502 + spin_unlock(&init_mm.page_table_lock);
19503 + }
19504 + /* Count entries we're using from level2_ident_pgt */
19505 + if (start == 0)
19506 + pages++;
19507 continue;
19508 }
19509
19510 - pte = alloc_static_page(&pte_phys);
19511 - pte_save = pte;
19512 - for (k = 0; k < PTRS_PER_PTE; pte++, k++, address += PTE_SIZE) {
19513 - unsigned long pteval = address | _PAGE_NX | _KERNPG_TABLE;
19514 -
19515 - if (address >= (after_bootmem
19516 - ? end
19517 - : xen_start_info->nr_pages << PAGE_SHIFT))
19518 - pteval = 0;
19519 - else if (make_readonly(address))
19520 - pteval &= ~_PAGE_RW;
19521 - set_pte(pte, __pte(pteval & __supported_pte_mask));
19522 + if (page_size_mask & (1<<PG_LEVEL_2M)) {
19523 + pages++;
19524 + spin_lock(&init_mm.page_table_lock);
19525 + set_pte((pte_t *)pmd,
19526 + pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
19527 + spin_unlock(&init_mm.page_table_lock);
19528 + last_map_addr = (address & PMD_MASK) + PMD_SIZE;
19529 + continue;
19530 }
19531 +
19532 + pte = alloc_low_page(&pte_phys);
19533 + last_map_addr = phys_pte_init(pte, address, end);
19534 + unmap_low_page(pte);
19535 +
19536 if (!after_bootmem) {
19537 - early_make_page_readonly(pte_save, XENFEAT_writable_page_tables);
19538 - *pmd = __pmd(pte_phys | _KERNPG_TABLE);
19539 + if (max_pfn_mapped)
19540 + make_page_readonly(__va(pte_phys),
19541 + XENFEAT_writable_page_tables);
19542 + *pmd = __pmd(pte_phys | _PAGE_TABLE);
19543 } else {
19544 - make_page_readonly(pte_save, XENFEAT_writable_page_tables);
19545 - set_pmd(pmd, __pmd(pte_phys | _KERNPG_TABLE));
19546 + make_page_readonly(pte, XENFEAT_writable_page_tables);
19547 + spin_lock(&init_mm.page_table_lock);
19548 + pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
19549 + spin_unlock(&init_mm.page_table_lock);
19550 }
19551 }
19552 - return address;
19553 + update_page_count(PG_LEVEL_2M, pages);
19554 + return last_map_addr;
19555 }
19556
19557 static unsigned long __meminit
19558 -phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
19559 +phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end,
19560 + unsigned long page_size_mask)
19561 {
19562 pmd_t *pmd = pmd_offset(pud, 0);
19563 unsigned long last_map_addr;
19564
19565 - spin_lock(&init_mm.page_table_lock);
19566 - last_map_addr = phys_pmd_init(pmd, address, end);
19567 - spin_unlock(&init_mm.page_table_lock);
19568 + BUG_ON(!max_pfn_mapped);
19569 + last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask);
19570 __flush_tlb_all();
19571 return last_map_addr;
19572 }
19573
19574 static unsigned long __meminit
19575 -phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
19576 +phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
19577 + unsigned long page_size_mask)
19578 {
19579 + unsigned long pages = 0;
19580 unsigned long last_map_addr = end;
19581 int i = pud_index(addr);
19582
19583 @@ -550,29 +519,55 @@ phys_pud_init(pud_t *pud_page, unsigned
19584
19585 if (__pud_val(*pud)) {
19586 if (!pud_large(*pud))
19587 - last_map_addr = phys_pmd_update(pud, addr, end);
19588 + last_map_addr = phys_pmd_update(pud, addr, end,
19589 + page_size_mask);
19590 continue;
19591 }
19592
19593 - if (direct_gbpages) {
19594 + if (page_size_mask & (1<<PG_LEVEL_1G)) {
19595 + pages++;
19596 + spin_lock(&init_mm.page_table_lock);
19597 set_pte((pte_t *)pud,
19598 pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
19599 + spin_unlock(&init_mm.page_table_lock);
19600 last_map_addr = (addr & PUD_MASK) + PUD_SIZE;
19601 continue;
19602 }
19603
19604 - pmd = alloc_static_page(&pmd_phys);
19605 -
19606 - spin_lock(&init_mm.page_table_lock);
19607 - *pud = __pud(pmd_phys | _KERNPG_TABLE);
19608 - last_map_addr = phys_pmd_init(pmd, addr, end);
19609 - spin_unlock(&init_mm.page_table_lock);
19610 + pmd = alloc_low_page(&pmd_phys);
19611 + last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask);
19612 + unmap_low_page(pmd);
19613
19614 - early_make_page_readonly(pmd, XENFEAT_writable_page_tables);
19615 + if (!after_bootmem) {
19616 + if (max_pfn_mapped)
19617 + make_page_readonly(__va(pmd_phys),
19618 + XENFEAT_writable_page_tables);
19619 + if (page_size_mask & (1 << PG_LEVEL_NUM))
19620 + xen_l3_entry_update(pud, __pud(pmd_phys | _PAGE_TABLE));
19621 + else
19622 + *pud = __pud(pmd_phys | _PAGE_TABLE);
19623 + } else {
19624 + make_page_readonly(pmd, XENFEAT_writable_page_tables);
19625 + spin_lock(&init_mm.page_table_lock);
19626 + pud_populate(&init_mm, pud, __va(pmd_phys));
19627 + spin_unlock(&init_mm.page_table_lock);
19628 + }
19629 }
19630 __flush_tlb_all();
19631 + update_page_count(PG_LEVEL_1G, pages);
19632
19633 - return last_map_addr >> PAGE_SHIFT;
19634 + return last_map_addr;
19635 +}
19636 +
19637 +static unsigned long __meminit
19638 +phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
19639 + unsigned long page_size_mask)
19640 +{
19641 + pud_t *pud;
19642 +
19643 + pud = (pud_t *)pgd_page_vaddr(*pgd);
19644 +
19645 + return phys_pud_init(pud, addr, end, page_size_mask | (1 << PG_LEVEL_NUM));
19646 }
19647
19648 void __init xen_init_pt(void)
19649 @@ -651,86 +646,36 @@ void __init xen_init_pt(void)
19650 }
19651 }
19652
19653 -static void __init extend_init_mapping(unsigned long tables_space)
19654 -{
19655 - unsigned long va = __START_KERNEL_map;
19656 - unsigned long start = start_pfn;
19657 - unsigned long phys, addr, *pte_page;
19658 - pmd_t *pmd;
19659 - pte_t *pte, new_pte;
19660 - unsigned long *page = (unsigned long *)init_level4_pgt;
19661 -
19662 - addr = page[pgd_index(va)];
19663 - addr_to_page(addr, page);
19664 - addr = page[pud_index(va)];
19665 - addr_to_page(addr, page);
19666 -
19667 - /* Kill mapping of low 1MB. */
19668 - while (va < (unsigned long)&_text) {
19669 - if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
19670 - BUG();
19671 - va += PAGE_SIZE;
19672 - }
19673 -
19674 - /* Ensure init mappings cover kernel text/data and initial tables. */
19675 - while (va < (__START_KERNEL_map
19676 - + (start_pfn << PAGE_SHIFT)
19677 - + tables_space)) {
19678 - pmd = (pmd_t *)&page[pmd_index(va)];
19679 - if (pmd_none(*pmd)) {
19680 - pte_page = alloc_static_page(&phys);
19681 - early_make_page_readonly(
19682 - pte_page, XENFEAT_writable_page_tables);
19683 - set_pmd(pmd, __pmd(phys | _KERNPG_TABLE));
19684 - } else {
19685 - addr = page[pmd_index(va)];
19686 - addr_to_page(addr, pte_page);
19687 - }
19688 - pte = (pte_t *)&pte_page[pte_index(va)];
19689 - if (pte_none(*pte)) {
19690 - new_pte = pfn_pte(
19691 - (va - __START_KERNEL_map) >> PAGE_SHIFT,
19692 - __pgprot(_KERNPG_TABLE));
19693 - xen_l1_entry_update(pte, new_pte);
19694 - }
19695 - va += PAGE_SIZE;
19696 - }
19697 -
19698 - /* Finally, blow away any spurious initial mappings. */
19699 - while (1) {
19700 - pmd = (pmd_t *)&page[pmd_index(va)];
19701 - if (pmd_none(*pmd))
19702 - break;
19703 - if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
19704 - BUG();
19705 - va += PAGE_SIZE;
19706 - }
19707 -
19708 - if (start_pfn > start)
19709 - reserve_early(start << PAGE_SHIFT,
19710 - start_pfn << PAGE_SHIFT, "INITMAP");
19711 -}
19712 -
19713 static void __init find_early_table_space(unsigned long end)
19714 {
19715 unsigned long puds, pmds, ptes, tables;
19716
19717 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
19718 + tables = round_up(puds * sizeof(pud_t), PAGE_SIZE);
19719 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
19720 - ptes = (end + PTE_SIZE - 1) >> PAGE_SHIFT;
19721 + tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE);
19722
19723 - tables = round_up(puds * 8, PAGE_SIZE) +
19724 - round_up(pmds * 8, PAGE_SIZE) +
19725 - round_up(ptes * 8, PAGE_SIZE);
19726 + ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
19727 + tables += round_up(ptes * sizeof(pte_t), PAGE_SIZE);
19728
19729 - extend_init_mapping(tables);
19730 + if (!table_top) {
19731 + table_start = (__pa(xen_start_info->pt_base) >> PAGE_SHIFT) +
19732 + xen_start_info->nr_pt_frames;
19733 + table_cur = table_start;
19734 + } else {
19735 + /*
19736 + * [table_start, table_top) gets passed to reserve_early(),
19737 + * so we must not use table_cur here, despite continuing
19738 + * to allocate from there. table_cur possibly being below
19739 + * table_start is otoh not a problem.
19740 + */
19741 + table_start = table_top;
19742 + }
19743
19744 - table_start = start_pfn;
19745 - table_end = table_start + (tables>>PAGE_SHIFT);
19746 + table_top = table_cur + (tables >> PAGE_SHIFT);
19747
19748 - early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
19749 - end, table_start << PAGE_SHIFT,
19750 - (table_start << PAGE_SHIFT) + tables);
19751 + printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
19752 + end, table_cur << PAGE_SHIFT, table_top << PAGE_SHIFT);
19753 }
19754
19755 static void __init xen_finish_init_mapping(void)
19756 @@ -752,18 +697,18 @@ static void __init xen_finish_init_mappi
19757 xen_start_info->mod_start = (unsigned long)
19758 __va(__pa(xen_start_info->mod_start));
19759
19760 - /* Destroy the Xen-created mappings beyond the kernel image as
19761 - * well as the temporary mappings created above. Prevents
19762 - * overlap with modules area (if init mapping is very big).
19763 - */
19764 + /* Destroy the Xen-created mappings beyond the kernel image. */
19765 start = PAGE_ALIGN((unsigned long)_end);
19766 - end = __START_KERNEL_map + (table_end << PAGE_SHIFT);
19767 + end = __START_KERNEL_map + (table_start << PAGE_SHIFT);
19768 for (; start < end; start += PAGE_SIZE)
19769 if (HYPERVISOR_update_va_mapping(start, __pte_ma(0), 0))
19770 BUG();
19771
19772 - /* Allocate pte's for initial fixmaps from 'start_pfn' allocator. */
19773 - table_end = ~0UL;
19774 + /* Allocate pte's for initial fixmaps from 'table_cur' allocator. */
19775 + start = table_top;
19776 + WARN(table_cur != start, "start=%lx cur=%lx top=%lx\n",
19777 + table_start, table_cur, start);
19778 + table_top = ~0UL;
19779
19780 /* Switch to the real shared_info page, and clear the dummy page. */
19781 set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
19782 @@ -780,8 +725,7 @@ static void __init xen_finish_init_mappi
19783 << PAGE_SHIFT,
19784 PAGE_KERNEL_RO);
19785
19786 - /* Disable the 'start_pfn' allocator. */
19787 - table_end = start_pfn;
19788 + table_top = max(table_cur, start);
19789 }
19790
19791 static void __init init_gbpages(void)
19792 @@ -794,126 +738,91 @@ static void __init init_gbpages(void)
19793 #endif
19794 }
19795
19796 -#ifdef CONFIG_MEMTEST_BOOTPARAM
19797 -
19798 -static void __init memtest(unsigned long start_phys, unsigned long size,
19799 - unsigned pattern)
19800 +static unsigned long __init kernel_physical_mapping_init(unsigned long start,
19801 + unsigned long end,
19802 + unsigned long page_size_mask)
19803 {
19804 - unsigned long i;
19805 - unsigned long *start;
19806 - unsigned long start_bad;
19807 - unsigned long last_bad;
19808 - unsigned long val;
19809 - unsigned long start_phys_aligned;
19810 - unsigned long count;
19811 - unsigned long incr;
19812 -
19813 - switch (pattern) {
19814 - case 0:
19815 - val = 0UL;
19816 - break;
19817 - case 1:
19818 - val = -1UL;
19819 - break;
19820 - case 2:
19821 - val = 0x5555555555555555UL;
19822 - break;
19823 - case 3:
19824 - val = 0xaaaaaaaaaaaaaaaaUL;
19825 - break;
19826 - default:
19827 - return;
19828 - }
19829 -
19830 - incr = sizeof(unsigned long);
19831 - start_phys_aligned = ALIGN(start_phys, incr);
19832 - count = (size - (start_phys_aligned - start_phys))/incr;
19833 - start = __va(start_phys_aligned);
19834 - start_bad = 0;
19835 - last_bad = 0;
19836 -
19837 - for (i = 0; i < count; i++)
19838 - start[i] = val;
19839 - for (i = 0; i < count; i++, start++, start_phys_aligned += incr) {
19840 - if (*start != val) {
19841 - if (start_phys_aligned == last_bad + incr) {
19842 - last_bad += incr;
19843 - } else {
19844 - if (start_bad) {
19845 - printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved",
19846 - val, start_bad, last_bad + incr);
19847 - reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
19848 - }
19849 - start_bad = last_bad = start_phys_aligned;
19850 - }
19851 - }
19852 - }
19853 - if (start_bad) {
19854 - printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved",
19855 - val, start_bad, last_bad + incr);
19856 - reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
19857 - }
19858
19859 -}
19860 + unsigned long next, last_map_addr = end;
19861
19862 -static int memtest_pattern __initdata = CONFIG_MEMTEST_BOOTPARAM_VALUE;
19863 + start = (unsigned long)__va(start);
19864 + end = (unsigned long)__va(end);
19865
19866 -static int __init parse_memtest(char *arg)
19867 -{
19868 - if (arg)
19869 - memtest_pattern = simple_strtoul(arg, NULL, 0);
19870 - return 0;
19871 -}
19872 + for (; start < end; start = next) {
19873 + pgd_t *pgd = pgd_offset_k(start);
19874 + unsigned long pud_phys;
19875 + pud_t *pud;
19876
19877 -early_param("memtest", parse_memtest);
19878 + next = (start + PGDIR_SIZE) & PGDIR_MASK;
19879 + if (next > end)
19880 + next = end;
19881
19882 -static void __init early_memtest(unsigned long start, unsigned long end)
19883 -{
19884 - u64 t_start, t_size;
19885 - unsigned pattern;
19886 + if (__pgd_val(*pgd)) {
19887 + last_map_addr = phys_pud_update(pgd, __pa(start),
19888 + __pa(end), page_size_mask);
19889 + continue;
19890 + }
19891
19892 - if (!memtest_pattern)
19893 - return;
19894 + pud = alloc_low_page(&pud_phys);
19895 + last_map_addr = phys_pud_init(pud, __pa(start), __pa(next),
19896 + page_size_mask);
19897 + unmap_low_page(pud);
19898 +
19899 + if(!after_bootmem) {
19900 + if (max_pfn_mapped)
19901 + make_page_readonly(__va(pud_phys),
19902 + XENFEAT_writable_page_tables);
19903 + xen_l4_entry_update(pgd, __pgd(pud_phys | _PAGE_TABLE));
19904 + } else {
19905 + make_page_readonly(pud, XENFEAT_writable_page_tables);
19906 + spin_lock(&init_mm.page_table_lock);
19907 + pgd_populate(&init_mm, pgd, __va(pud_phys));
19908 + spin_unlock(&init_mm.page_table_lock);
19909 + }
19910 + }
19911
19912 - printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern);
19913 - for (pattern = 0; pattern < memtest_pattern; pattern++) {
19914 - t_start = start;
19915 - t_size = 0;
19916 - while (t_start < end) {
19917 - t_start = find_e820_area_size(t_start, &t_size, 1);
19918 + return last_map_addr;
19919 +}
19920
19921 - /* done ? */
19922 - if (t_start >= end)
19923 - break;
19924 - if (t_start + t_size > end)
19925 - t_size = end - t_start;
19926 +struct map_range {
19927 + unsigned long start;
19928 + unsigned long end;
19929 + unsigned page_size_mask;
19930 +};
19931
19932 - printk(KERN_CONT "\n %016llx - %016llx pattern %d",
19933 - (unsigned long long)t_start,
19934 - (unsigned long long)t_start + t_size, pattern);
19935 +#define NR_RANGE_MR 5
19936
19937 - memtest(t_start, t_size, pattern);
19938 +static int save_mr(struct map_range *mr, int nr_range,
19939 + unsigned long start_pfn, unsigned long end_pfn,
19940 + unsigned long page_size_mask)
19941 +{
19942
19943 - t_start += t_size;
19944 - }
19945 + if (start_pfn < end_pfn) {
19946 + if (nr_range >= NR_RANGE_MR)
19947 + panic("run out of range for init_memory_mapping\n");
19948 + mr[nr_range].start = start_pfn<<PAGE_SHIFT;
19949 + mr[nr_range].end = end_pfn<<PAGE_SHIFT;
19950 + mr[nr_range].page_size_mask = page_size_mask;
19951 + nr_range++;
19952 }
19953 - printk(KERN_CONT "\n");
19954 -}
19955 -#else
19956 -static void __init early_memtest(unsigned long start, unsigned long end)
19957 -{
19958 +
19959 + return nr_range;
19960 }
19961 -#endif
19962
19963 /*
19964 * Setup the direct mapping of the physical memory at PAGE_OFFSET.
19965 * This runs before bootmem is initialized and gets pages directly from
19966 * the physical memory. To access them they are temporarily mapped.
19967 */
19968 -unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned long end)
19969 +unsigned long __init_refok init_memory_mapping(unsigned long start,
19970 + unsigned long end)
19971 {
19972 - unsigned long next, last_map_addr = end;
19973 - unsigned long start_phys = start, end_phys = end;
19974 + unsigned long last_map_addr = 0;
19975 + unsigned long page_size_mask = 0;
19976 + unsigned long start_pfn, end_pfn;
19977 +
19978 + struct map_range mr[NR_RANGE_MR];
19979 + int nr_range, i;
19980
19981 printk(KERN_INFO "init_memory_mapping\n");
19982
19983 @@ -924,51 +833,150 @@ unsigned long __init_refok init_memory_m
19984 * memory mapped. Unfortunately this is done currently before the
19985 * nodes are discovered.
19986 */
19987 - if (!after_bootmem) {
19988 + if (!after_bootmem)
19989 init_gbpages();
19990 - find_early_table_space(end);
19991 +
19992 + if (direct_gbpages)
19993 + page_size_mask |= 1 << PG_LEVEL_1G;
19994 + if (cpu_has_pse)
19995 + page_size_mask |= 1 << PG_LEVEL_2M;
19996 +
19997 + memset(mr, 0, sizeof(mr));
19998 + nr_range = 0;
19999 +
20000 + /* head if not big page alignment ?*/
20001 + start_pfn = start >> PAGE_SHIFT;
20002 + end_pfn = ((start + (PMD_SIZE - 1)) >> PMD_SHIFT)
20003 + << (PMD_SHIFT - PAGE_SHIFT);
20004 + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
20005 +
20006 + /* big page (2M) range*/
20007 + start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
20008 + << (PMD_SHIFT - PAGE_SHIFT);
20009 + end_pfn = ((start + (PUD_SIZE - 1))>>PUD_SHIFT)
20010 + << (PUD_SHIFT - PAGE_SHIFT);
20011 + if (end_pfn > ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT)))
20012 + end_pfn = ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT));
20013 + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
20014 + page_size_mask & (1<<PG_LEVEL_2M));
20015 +
20016 + /* big page (1G) range */
20017 + start_pfn = end_pfn;
20018 + end_pfn = (end>>PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
20019 + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
20020 + page_size_mask &
20021 + ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
20022 +
20023 + /* tail is not big page (1G) alignment */
20024 + start_pfn = end_pfn;
20025 + end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
20026 + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
20027 + page_size_mask & (1<<PG_LEVEL_2M));
20028 +
20029 + /* tail is not big page (2M) alignment */
20030 + start_pfn = end_pfn;
20031 + end_pfn = end>>PAGE_SHIFT;
20032 + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
20033 +
20034 + /* try to merge same page size and continuous */
20035 + for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
20036 + unsigned long old_start;
20037 + if (mr[i].end != mr[i+1].start ||
20038 + mr[i].page_size_mask != mr[i+1].page_size_mask)
20039 + continue;
20040 + /* move it */
20041 + old_start = mr[i].start;
20042 + memmove(&mr[i], &mr[i+1],
20043 + (nr_range - 1 - i) * sizeof (struct map_range));
20044 + mr[i--].start = old_start;
20045 + nr_range--;
20046 }
20047
20048 - start = (unsigned long)__va(start);
20049 - end = (unsigned long)__va(end);
20050 + for (i = 0; i < nr_range; i++)
20051 + printk(KERN_DEBUG " %010lx - %010lx page %s\n",
20052 + mr[i].start, mr[i].end,
20053 + (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
20054 + (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
20055
20056 - for (; start < end; start = next) {
20057 - pgd_t *pgd = pgd_offset_k(start);
20058 - unsigned long pud_phys;
20059 - pud_t *pud;
20060 + if (!after_bootmem)
20061 + find_early_table_space(end);
20062
20063 - if (after_bootmem)
20064 - pud = pud_offset(pgd, start & PGDIR_MASK);
20065 - else
20066 - pud = alloc_static_page(&pud_phys);
20067 - next = start + PGDIR_SIZE;
20068 - if (next > end)
20069 - next = end;
20070 - last_map_addr = phys_pud_init(pud, __pa(start), __pa(next));
20071 - if (!after_bootmem) {
20072 - early_make_page_readonly(pud, XENFEAT_writable_page_tables);
20073 - set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
20074 + if (!start) {
20075 + unsigned long addr, va = __START_KERNEL_map;
20076 + unsigned long *page = (unsigned long *)init_level4_pgt;
20077 +
20078 + /* Kill mapping of memory below _text. */
20079 + while (va < (unsigned long)&_text) {
20080 + if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
20081 + BUG();
20082 + va += PAGE_SIZE;
20083 + }
20084 +
20085 + /* Blow away any spurious initial mappings. */
20086 + va = __START_KERNEL_map + (table_start << PAGE_SHIFT);
20087 + addr = page[pgd_index(va)];
20088 + addr_to_page(addr, page);
20089 + addr = page[pud_index(va)];
20090 + addr_to_page(addr, page);
20091 + while (pmd_index(va) | pte_index(va)) {
20092 + if (pmd_none(*(pmd_t *)&page[pmd_index(va)]))
20093 + break;
20094 + if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
20095 + BUG();
20096 + va += PAGE_SIZE;
20097 }
20098 }
20099
20100 - if (!after_bootmem) {
20101 - BUG_ON(start_pfn != table_end);
20102 + for (i = 0; i < nr_range; i++)
20103 + last_map_addr = kernel_physical_mapping_init(
20104 + mr[i].start, mr[i].end,
20105 + mr[i].page_size_mask);
20106 +
20107 + BUG_ON(table_cur > table_top);
20108 + if (!start)
20109 xen_finish_init_mapping();
20110 - }
20111 + else if (table_cur < table_top)
20112 + /* Disable the 'table_cur' allocator. */
20113 + table_top = table_cur;
20114
20115 __flush_tlb_all();
20116
20117 - if (!after_bootmem)
20118 + if (!after_bootmem && table_top > table_start)
20119 reserve_early(table_start << PAGE_SHIFT,
20120 - table_end << PAGE_SHIFT, "PGTABLE");
20121 + table_top << PAGE_SHIFT, "PGTABLE");
20122 +
20123 + printk(KERN_INFO "last_map_addr: %lx end: %lx\n",
20124 + last_map_addr, end);
20125
20126 if (!after_bootmem)
20127 - early_memtest(start_phys, end_phys);
20128 + early_memtest(start, end);
20129
20130 - return last_map_addr;
20131 + return last_map_addr >> PAGE_SHIFT;
20132 }
20133
20134 #ifndef CONFIG_NUMA
20135 +void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn)
20136 +{
20137 + unsigned long bootmap_size, bootmap;
20138 +
20139 + e820_register_active_regions(0, start_pfn, end_pfn);
20140 +#ifdef CONFIG_XEN
20141 + if (end_pfn > xen_start_info->nr_pages)
20142 + end_pfn = xen_start_info->nr_pages;
20143 +#endif
20144 + bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
20145 + bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size,
20146 + PAGE_SIZE);
20147 + if (bootmap == -1L)
20148 + panic("Cannot find bootmem map of size %ld\n", bootmap_size);
20149 + /* don't touch min_low_pfn */
20150 + bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
20151 + 0, end_pfn);
20152 + free_bootmem_with_active_regions(0, end_pfn);
20153 + early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
20154 + reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
20155 +}
20156 +
20157 void __init paging_init(void)
20158 {
20159 unsigned long max_zone_pfns[MAX_NR_ZONES];
20160 @@ -976,9 +984,9 @@ void __init paging_init(void)
20161 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
20162 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
20163 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
20164 - max_zone_pfns[ZONE_NORMAL] = end_pfn;
20165 + max_zone_pfns[ZONE_NORMAL] = max_pfn;
20166
20167 - memory_present(0, 0, end_pfn);
20168 + memory_present(0, 0, max_pfn);
20169 sparse_init();
20170 free_area_init_nodes(max_zone_pfns);
20171
20172 @@ -1069,8 +1077,8 @@ void __init mem_init(void)
20173 init_page_count(pfn_to_page(pfn));
20174 totalram_pages++;
20175 }
20176 - reservedpages = end_pfn - totalram_pages -
20177 - absent_pages_in_range(0, end_pfn);
20178 + reservedpages = max_pfn - totalram_pages -
20179 + absent_pages_in_range(0, max_pfn);
20180 after_bootmem = 1;
20181
20182 codesize = (unsigned long) &_etext - (unsigned long) &_text;
20183 @@ -1089,7 +1097,7 @@ void __init mem_init(void)
20184 printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
20185 "%ldk reserved, %ldk data, %ldk init)\n",
20186 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
20187 - end_pfn << (PAGE_SHIFT-10),
20188 + max_pfn << (PAGE_SHIFT-10),
20189 codesize >> 10,
20190 reservedpages << (PAGE_SHIFT-10),
20191 datasize >> 10,
20192 @@ -1152,6 +1160,13 @@ EXPORT_SYMBOL_GPL(rodata_test_data);
20193 void mark_rodata_ro(void)
20194 {
20195 unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata);
20196 + unsigned long rodata_start =
20197 + ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
20198 +
20199 +#ifdef CONFIG_DYNAMIC_FTRACE
20200 + /* Dynamic tracing modifies the kernel text section */
20201 + start = rodata_start;
20202 +#endif
20203
20204 printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
20205 (end - start) >> 10);
20206 @@ -1161,8 +1176,7 @@ void mark_rodata_ro(void)
20207 * The rodata section (but not the kernel text!) should also be
20208 * not-executable.
20209 */
20210 - start = ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
20211 - set_memory_nx(start, (end - start) >> PAGE_SHIFT);
20212 + set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT);
20213
20214 rodata_test();
20215
20216 @@ -1184,24 +1198,26 @@ void free_initrd_mem(unsigned long start
20217 }
20218 #endif
20219
20220 -void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
20221 +int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
20222 + int flags)
20223 {
20224 #ifdef CONFIG_NUMA
20225 int nid, next_nid;
20226 + int ret;
20227 #endif
20228 unsigned long pfn = phys >> PAGE_SHIFT;
20229
20230 - if (pfn >= end_pfn) {
20231 + if (pfn >= max_pfn) {
20232 /*
20233 * This can happen with kdump kernels when accessing
20234 * firmware tables:
20235 */
20236 if (pfn < max_pfn_mapped)
20237 - return;
20238 + return -EFAULT;
20239
20240 - printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
20241 + printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %lu\n",
20242 phys, len);
20243 - return;
20244 + return -EFAULT;
20245 }
20246
20247 /* Should check here against the e820 map to avoid double free */
20248 @@ -1209,9 +1225,13 @@ void __init reserve_bootmem_generic(unsi
20249 nid = phys_to_nid(phys);
20250 next_nid = phys_to_nid(phys + len - 1);
20251 if (nid == next_nid)
20252 - reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT);
20253 + ret = reserve_bootmem_node(NODE_DATA(nid), phys, len, flags);
20254 else
20255 - reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
20256 + ret = reserve_bootmem(phys, len, flags);
20257 +
20258 + if (ret != 0)
20259 + return ret;
20260 +
20261 #else
20262 reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
20263 #endif
20264 @@ -1222,6 +1242,8 @@ void __init reserve_bootmem_generic(unsi
20265 set_dma_reserve(dma_reserve);
20266 }
20267 #endif
20268 +
20269 + return 0;
20270 }
20271
20272 int kern_addr_valid(unsigned long addr)
20273 @@ -1326,7 +1348,7 @@ vmemmap_populate(struct page *start_page
20274 pmd_t *pmd;
20275
20276 for (; addr < end; addr = next) {
20277 - next = pmd_addr_end(addr, end);
20278 + void *p = NULL;
20279
20280 pgd = vmemmap_pgd_populate(addr, node);
20281 if (!pgd)
20282 @@ -1336,33 +1358,51 @@ vmemmap_populate(struct page *start_page
20283 if (!pud)
20284 return -ENOMEM;
20285
20286 - pmd = pmd_offset(pud, addr);
20287 - if (pmd_none(*pmd)) {
20288 - pte_t entry;
20289 - void *p;
20290 + if (!cpu_has_pse) {
20291 + next = (addr + PAGE_SIZE) & PAGE_MASK;
20292 + pmd = vmemmap_pmd_populate(pud, addr, node);
20293 +
20294 + if (!pmd)
20295 + return -ENOMEM;
20296 +
20297 + p = vmemmap_pte_populate(pmd, addr, node);
20298
20299 - p = vmemmap_alloc_block(PMD_SIZE, node);
20300 if (!p)
20301 return -ENOMEM;
20302
20303 - entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
20304 - PAGE_KERNEL_LARGE);
20305 - set_pmd(pmd, __pmd_ma(__pte_val(entry)));
20306 -
20307 - /* check to see if we have contiguous blocks */
20308 - if (p_end != p || node_start != node) {
20309 - if (p_start)
20310 - printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
20311 - addr_start, addr_end-1, p_start, p_end-1, node_start);
20312 - addr_start = addr;
20313 - node_start = node;
20314 - p_start = p;
20315 - }
20316 - addr_end = addr + PMD_SIZE;
20317 - p_end = p + PMD_SIZE;
20318 + addr_end = addr + PAGE_SIZE;
20319 + p_end = p + PAGE_SIZE;
20320 } else {
20321 - vmemmap_verify((pte_t *)pmd, node, addr, next);
20322 + next = pmd_addr_end(addr, end);
20323 +
20324 + pmd = pmd_offset(pud, addr);
20325 + if (pmd_none(*pmd)) {
20326 + pte_t entry;
20327 +
20328 + p = vmemmap_alloc_block(PMD_SIZE, node);
20329 + if (!p)
20330 + return -ENOMEM;
20331 +
20332 + entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
20333 + PAGE_KERNEL_LARGE);
20334 + set_pmd(pmd, __pmd_ma(__pte_val(entry)));
20335 +
20336 + /* check to see if we have contiguous blocks */
20337 + if (p_end != p || node_start != node) {
20338 + if (p_start)
20339 + printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
20340 + addr_start, addr_end-1, p_start, p_end-1, node_start);
20341 + addr_start = addr;
20342 + node_start = node;
20343 + p_start = p;
20344 + }
20345 +
20346 + addr_end = addr + PMD_SIZE;
20347 + p_end = p + PMD_SIZE;
20348 + } else
20349 + vmemmap_verify((pte_t *)pmd, node, addr, next);
20350 }
20351 +
20352 }
20353 return 0;
20354 }
20355 --- sle11-2009-06-04.orig/arch/x86/mm/ioremap-xen.c 2009-03-16 16:38:05.000000000 +0100
20356 +++ sle11-2009-06-04/arch/x86/mm/ioremap-xen.c 2009-06-04 10:21:39.000000000 +0200
20357 @@ -13,6 +13,7 @@
20358 #include <linux/pfn.h>
20359 #include <linux/slab.h>
20360 #include <linux/vmalloc.h>
20361 +#include <linux/mmiotrace.h>
20362
20363 #include <asm/cacheflush.h>
20364 #include <asm/e820.h>
20365 @@ -274,7 +275,8 @@ int ioremap_check_change_attr(unsigned l
20366 for (sz = rc = 0; sz < size && !rc; ++mfn, sz += PAGE_SIZE) {
20367 unsigned long pfn = mfn_to_local_pfn(mfn);
20368
20369 - if (pfn >= max_pfn_mapped)
20370 + if (pfn >= max_low_pfn_mapped &&
20371 + (pfn < (1UL<<(32 - PAGE_SHIFT)) || pfn >= max_pfn_mapped))
20372 continue;
20373 rc = ioremap_change_attr((unsigned long)__va(pfn << PAGE_SHIFT),
20374 PAGE_SIZE, prot_val);
20375 @@ -297,11 +299,14 @@ static void __iomem *__ioremap_caller(re
20376 {
20377 unsigned long mfn, offset, vaddr;
20378 resource_size_t last_addr;
20379 + const resource_size_t unaligned_phys_addr = phys_addr;
20380 + const unsigned long unaligned_size = size;
20381 struct vm_struct *area;
20382 unsigned long new_prot_val;
20383 pgprot_t prot;
20384 int retval;
20385 domid_t domid = DOMID_IO;
20386 + void __iomem *ret_addr;
20387
20388 /* Don't allow wraparound or zero size */
20389 last_addr = phys_addr + size - 1;
20390 @@ -318,7 +323,7 @@ static void __iomem *__ioremap_caller(re
20391 /*
20392 * Don't remap the low PCI/ISA area, it's always mapped..
20393 */
20394 - if (is_initial_xendomain() && last_addr < ISA_END_ADDRESS)
20395 + if (is_initial_xendomain() && is_ISA_range(phys_addr, last_addr))
20396 return (__force void __iomem *)isa_bus_to_virt((unsigned long)phys_addr);
20397
20398 /*
20399 @@ -342,7 +347,7 @@ static void __iomem *__ioremap_caller(re
20400 phys_addr &= PAGE_MASK;
20401 size = PAGE_ALIGN(last_addr+1) - phys_addr;
20402
20403 - retval = reserve_memtype(phys_addr, phys_addr + size,
20404 + retval = reserve_memtype(phys_addr, (u64)phys_addr + size,
20405 prot_val, &new_prot_val);
20406 if (retval) {
20407 pr_debug("Warning: reserve_memtype returned %d\n", retval);
20408 @@ -410,7 +415,10 @@ static void __iomem *__ioremap_caller(re
20409 return NULL;
20410 }
20411
20412 - return (void __iomem *) (vaddr + offset);
20413 + ret_addr = (void __iomem *) (vaddr + offset);
20414 + mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr);
20415 +
20416 + return ret_addr;
20417 }
20418
20419 /**
20420 @@ -438,7 +446,7 @@ void __iomem *ioremap_nocache(resource_s
20421 {
20422 /*
20423 * Ideally, this should be:
20424 - * pat_wc_enabled ? _PAGE_CACHE_UC : _PAGE_CACHE_UC_MINUS;
20425 + * pat_enabled ? _PAGE_CACHE_UC : _PAGE_CACHE_UC_MINUS;
20426 *
20427 * Till we fix all X drivers to use ioremap_wc(), we will use
20428 * UC MINUS.
20429 @@ -462,7 +470,7 @@ EXPORT_SYMBOL(ioremap_nocache);
20430 */
20431 void __iomem *ioremap_wc(unsigned long phys_addr, unsigned long size)
20432 {
20433 - if (pat_wc_enabled)
20434 + if (pat_enabled)
20435 return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC,
20436 __builtin_return_address(0));
20437 else
20438 @@ -502,6 +510,14 @@ static void __iomem *ioremap_default(res
20439 }
20440 #endif
20441
20442 +void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
20443 + unsigned long prot_val)
20444 +{
20445 + return __ioremap_caller(phys_addr, size, (prot_val & _PAGE_CACHE_MASK),
20446 + __builtin_return_address(0));
20447 +}
20448 +EXPORT_SYMBOL(ioremap_prot);
20449 +
20450 /**
20451 * iounmap - Free a IO remapping
20452 * @addr: virtual address from ioremap_*
20453 @@ -526,6 +542,8 @@ void iounmap(volatile void __iomem *addr
20454 addr = (volatile void __iomem *)
20455 (PAGE_MASK & (unsigned long __force)addr);
20456
20457 + mmiotrace_iounmap(addr);
20458 +
20459 /* Use the vm area unlocked, assuming the caller
20460 ensures there isn't another iounmap for the same address
20461 in parallel. Reuse of the virtual address is prevented by
20462 @@ -533,7 +551,7 @@ void iounmap(volatile void __iomem *addr
20463 cpa takes care of the direct mappings. */
20464 read_lock(&vmlist_lock);
20465 for (p = vmlist; p; p = p->next) {
20466 - if (p->addr == addr)
20467 + if (p->addr == (void __force *)addr)
20468 break;
20469 }
20470 read_unlock(&vmlist_lock);
20471 @@ -547,7 +565,7 @@ void iounmap(volatile void __iomem *addr
20472 free_memtype(p->phys_addr, p->phys_addr + get_vm_area_size(p));
20473
20474 /* Finally remove it */
20475 - o = remove_vm_area((void *)addr);
20476 + o = remove_vm_area((void __force *)addr);
20477 BUG_ON(p != o || o == NULL);
20478 kfree(p);
20479 }
20480 @@ -567,7 +585,7 @@ void *xlate_dev_mem_ptr(unsigned long ph
20481 if (page_is_ram(start >> PAGE_SHIFT))
20482 return __va(phys);
20483
20484 - addr = (void *)ioremap_default(start, PAGE_SIZE);
20485 + addr = (void __force *)ioremap_default(start, PAGE_SIZE);
20486 if (addr)
20487 addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK));
20488
20489 @@ -595,8 +613,7 @@ static int __init early_ioremap_debug_se
20490 early_param("early_ioremap_debug", early_ioremap_debug_setup);
20491
20492 static __initdata int after_paging_init;
20493 -static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)]
20494 - __section(.bss.page_aligned);
20495 +static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss;
20496
20497 #ifdef CONFIG_X86_32
20498 static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
20499 @@ -695,10 +712,11 @@ static void __init __early_set_fixmap(en
20500 return;
20501 }
20502 pte = early_ioremap_pte(addr);
20503 +
20504 if (pgprot_val(flags))
20505 set_pte(pte, pfn_pte_ma(phys >> PAGE_SHIFT, flags));
20506 else
20507 - pte_clear(NULL, addr, pte);
20508 + pte_clear(&init_mm, addr, pte);
20509 __flush_tlb_one(addr);
20510 }
20511
20512 @@ -726,13 +744,11 @@ static int __init check_early_ioremap_le
20513 {
20514 if (!early_ioremap_nested)
20515 return 0;
20516 -
20517 - printk(KERN_WARNING
20518 + WARN(1, KERN_WARNING
20519 "Debug warning: early ioremap leak of %d areas detected.\n",
20520 - early_ioremap_nested);
20521 + early_ioremap_nested);
20522 printk(KERN_WARNING
20523 - "please boot with early_ioremap_debug and report the dmesg.\n");
20524 - WARN_ON(1);
20525 + "please boot with early_ioremap_debug and report the dmesg.\n");
20526
20527 return 1;
20528 }
20529 --- sle11-2009-06-04.orig/arch/x86/mm/pageattr-xen.c 2009-03-16 16:38:05.000000000 +0100
20530 +++ sle11-2009-06-04/arch/x86/mm/pageattr-xen.c 2009-06-04 10:21:39.000000000 +0200
20531 @@ -34,6 +34,47 @@ struct cpa_data {
20532 unsigned force_split : 1;
20533 };
20534
20535 +#ifdef CONFIG_PROC_FS
20536 +static unsigned long direct_pages_count[PG_LEVEL_NUM];
20537 +
20538 +void update_page_count(int level, unsigned long pages)
20539 +{
20540 + unsigned long flags;
20541 +
20542 + /* Protect against CPA */
20543 + spin_lock_irqsave(&pgd_lock, flags);
20544 + direct_pages_count[level] += pages;
20545 + spin_unlock_irqrestore(&pgd_lock, flags);
20546 +}
20547 +
20548 +static void split_page_count(int level)
20549 +{
20550 + direct_pages_count[level]--;
20551 + direct_pages_count[level - 1] += PTRS_PER_PTE;
20552 +}
20553 +
20554 +int arch_report_meminfo(char *page)
20555 +{
20556 + int n = sprintf(page, "DirectMap4k: %8lu kB\n",
20557 + direct_pages_count[PG_LEVEL_4K] << 2);
20558 +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
20559 + n += sprintf(page + n, "DirectMap2M: %8lu kB\n",
20560 + direct_pages_count[PG_LEVEL_2M] << 11);
20561 +#else
20562 + n += sprintf(page + n, "DirectMap4M: %8lu kB\n",
20563 + direct_pages_count[PG_LEVEL_2M] << 12);
20564 +#endif
20565 +#ifdef CONFIG_X86_64
20566 + if (direct_gbpages)
20567 + n += sprintf(page + n, "DirectMap1G: %8lu kB\n",
20568 + direct_pages_count[PG_LEVEL_1G] << 20);
20569 +#endif
20570 + return n;
20571 +}
20572 +#else
20573 +static inline void split_page_count(int level) { }
20574 +#endif
20575 +
20576 #ifdef CONFIG_X86_64
20577
20578 static inline unsigned long highmap_start_pfn(void)
20579 @@ -106,7 +147,7 @@ static void cpa_flush_all(unsigned long
20580 {
20581 BUG_ON(irqs_disabled());
20582
20583 - on_each_cpu(__cpa_flush_all, (void *) cache, 1, 1);
20584 + on_each_cpu(__cpa_flush_all, (void *) cache, 1);
20585 }
20586
20587 static void __cpa_flush_range(void *arg)
20588 @@ -127,7 +168,7 @@ static void cpa_flush_range(unsigned lon
20589 BUG_ON(irqs_disabled());
20590 WARN_ON(PAGE_ALIGN(start) != start);
20591
20592 - on_each_cpu(__cpa_flush_range, NULL, 1, 1);
20593 + on_each_cpu(__cpa_flush_range, NULL, 1);
20594
20595 if (!cache)
20596 return;
20597 @@ -229,6 +270,7 @@ pte_t *lookup_address(unsigned long addr
20598
20599 return pte_offset_kernel(pmd, address);
20600 }
20601 +EXPORT_SYMBOL_GPL(lookup_address);
20602
20603 /*
20604 * Set the new pmd in all the pgds we know about:
20605 @@ -509,6 +551,16 @@ static int split_large_page(pte_t *kpte,
20606 }
20607 #endif
20608
20609 + if (address >= (unsigned long)__va(0) &&
20610 + address < (unsigned long)__va(max_low_pfn_mapped << PAGE_SHIFT))
20611 + split_page_count(level);
20612 +
20613 +#ifdef CONFIG_X86_64
20614 + if (address >= (unsigned long)__va(1UL<<32) &&
20615 + address < (unsigned long)__va(max_pfn_mapped << PAGE_SHIFT))
20616 + split_page_count(level);
20617 +#endif
20618 +
20619 /*
20620 * Get the target mfn from the original entry:
20621 */
20622 @@ -566,10 +618,9 @@ repeat:
20623 if (!__pte_val(old_pte)) {
20624 if (!primary)
20625 return 0;
20626 - printk(KERN_WARNING "CPA: called for zero pte. "
20627 + WARN(1, KERN_WARNING "CPA: called for zero pte. "
20628 "vaddr = %lx cpa->vaddr = %lx\n", address,
20629 cpa->vaddr);
20630 - WARN_ON(1);
20631 return -EINVAL;
20632 }
20633
20634 @@ -634,15 +685,24 @@ static int cpa_process_alias(struct cpa_
20635 struct cpa_data alias_cpa;
20636 int ret = 0;
20637
20638 - if (cpa->pfn > max_pfn_mapped)
20639 + if (cpa->pfn >= max_pfn_mapped)
20640 return 0;
20641
20642 +#ifdef CONFIG_X86_64
20643 + if (cpa->pfn >= max_low_pfn_mapped && cpa->pfn < (1UL<<(32-PAGE_SHIFT)))
20644 + return 0;
20645 +#endif
20646 /*
20647 * No need to redo, when the primary call touched the direct
20648 * mapping already:
20649 */
20650 - if (!within(cpa->vaddr, PAGE_OFFSET,
20651 - PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) {
20652 + if (!(within(cpa->vaddr, PAGE_OFFSET,
20653 + PAGE_OFFSET + (max_low_pfn_mapped << PAGE_SHIFT))
20654 +#ifdef CONFIG_X86_64
20655 + || within(cpa->vaddr, PAGE_OFFSET + (1UL<<32),
20656 + PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))
20657 +#endif
20658 + )) {
20659
20660 alias_cpa = *cpa;
20661 alias_cpa.vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);
20662 @@ -796,6 +856,51 @@ static inline int change_page_attr_clear
20663 return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0);
20664 }
20665
20666 +#ifdef CONFIG_XEN
20667 +static void _free_memtype(u64 pstart, u64 pend)
20668 +{
20669 + u64 pa = pstart &= __PHYSICAL_MASK;
20670 + u64 ma = phys_to_machine(pa);
20671 +
20672 + while ((pa += PAGE_SIZE) < pend) {
20673 + if (phys_to_machine(pa) != ma + (pa - pstart)) {
20674 + free_memtype(ma, ma + (pa - pstart));
20675 + pstart = pa;
20676 + ma = phys_to_machine(pa);
20677 + }
20678 + }
20679 + free_memtype(ma, ma + (pend - pstart));
20680 +}
20681 +#define free_memtype _free_memtype
20682 +
20683 +static int _reserve_memtype(u64 pstart, u64 pend, unsigned long req_type)
20684 +{
20685 + u64 pcur = pstart &= __PHYSICAL_MASK, pa = pcur;
20686 + u64 ma = phys_to_machine(pa);
20687 + int rc = 0;
20688 +
20689 + while ((pa += PAGE_SIZE) < pend) {
20690 + if (phys_to_machine(pa) != ma + (pa - pcur)) {
20691 + rc = reserve_memtype(ma, ma + (pa - pcur),
20692 + req_type, NULL);
20693 + if (rc)
20694 + break;
20695 + pcur = pa;
20696 + ma = phys_to_machine(pa);
20697 + }
20698 + }
20699 + if (likely(!rc))
20700 + rc = reserve_memtype(ma, ma + (pend - pcur), req_type, NULL);
20701 +
20702 + if (unlikely(!rc) && pstart < pcur)
20703 + _free_memtype(pstart, pcur);
20704 +
20705 + return rc;
20706 +}
20707 +#define reserve_memtype(s, e, r, n) \
20708 + _reserve_memtype(s, e, BUILD_BUG_ON_ZERO(n) ?: (r))
20709 +#endif
20710 +
20711 int _set_memory_uc(unsigned long addr, int numpages)
20712 {
20713 /*
20714 @@ -810,7 +915,7 @@ int set_memory_uc(unsigned long addr, in
20715 /*
20716 * for now UC MINUS. see comments in ioremap_nocache()
20717 */
20718 - if (reserve_memtype(addr, addr + numpages * PAGE_SIZE,
20719 + if (reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
20720 _PAGE_CACHE_UC_MINUS, NULL))
20721 return -EINVAL;
20722
20723 @@ -826,10 +931,10 @@ int _set_memory_wc(unsigned long addr, i
20724
20725 int set_memory_wc(unsigned long addr, int numpages)
20726 {
20727 - if (!pat_wc_enabled)
20728 + if (!pat_enabled)
20729 return set_memory_uc(addr, numpages);
20730
20731 - if (reserve_memtype(addr, addr + numpages * PAGE_SIZE,
20732 + if (reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
20733 _PAGE_CACHE_WC, NULL))
20734 return -EINVAL;
20735
20736 @@ -845,7 +950,7 @@ int _set_memory_wb(unsigned long addr, i
20737
20738 int set_memory_wb(unsigned long addr, int numpages)
20739 {
20740 - free_memtype(addr, addr + numpages * PAGE_SIZE);
20741 + free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
20742
20743 return _set_memory_wb(addr, numpages);
20744 }
20745 --- sle11-2009-06-04.orig/arch/x86/mm/pat-xen.c 2009-03-16 16:38:05.000000000 +0100
20746 +++ sle11-2009-06-04/arch/x86/mm/pat-xen.c 2009-06-04 10:21:39.000000000 +0200
20747 @@ -12,6 +12,8 @@
20748 #include <linux/gfp.h>
20749 #include <linux/fs.h>
20750 #include <linux/bootmem.h>
20751 +#include <linux/debugfs.h>
20752 +#include <linux/seq_file.h>
20753
20754 #include <asm/msr.h>
20755 #include <asm/tlbflush.h>
20756 @@ -26,11 +28,11 @@
20757 #include <asm/io.h>
20758
20759 #ifdef CONFIG_X86_PAT
20760 -int __read_mostly pat_wc_enabled = 1;
20761 +int __read_mostly pat_enabled = 1;
20762
20763 void __cpuinit pat_disable(char *reason)
20764 {
20765 - pat_wc_enabled = 0;
20766 + pat_enabled = 0;
20767 printk(KERN_INFO "%s\n", reason);
20768 }
20769
20770 @@ -42,6 +44,19 @@ static int __init nopat(char *str)
20771 early_param("nopat", nopat);
20772 #endif
20773
20774 +
20775 +static int debug_enable;
20776 +static int __init pat_debug_setup(char *str)
20777 +{
20778 + debug_enable = 1;
20779 + return 0;
20780 +}
20781 +__setup("debugpat", pat_debug_setup);
20782 +
20783 +#define dprintk(fmt, arg...) \
20784 + do { if (debug_enable) printk(KERN_INFO fmt, ##arg); } while (0)
20785 +
20786 +
20787 static u64 __read_mostly boot_pat_state;
20788
20789 enum {
20790 @@ -53,24 +68,25 @@ enum {
20791 PAT_UC_MINUS = 7, /* UC, but can be overriden by MTRR */
20792 };
20793
20794 -#define PAT(x,y) ((u64)PAT_ ## y << ((x)*8))
20795 +#define PAT(x, y) ((u64)PAT_ ## y << ((x)*8))
20796
20797 void pat_init(void)
20798 {
20799 u64 pat;
20800
20801 - if (!pat_wc_enabled)
20802 + if (!pat_enabled)
20803 return;
20804
20805 /* Paranoia check. */
20806 - if (!cpu_has_pat) {
20807 - printk(KERN_ERR "PAT enabled, but CPU feature cleared\n");
20808 + if (!cpu_has_pat && boot_pat_state) {
20809 /*
20810 - * Panic if this happens on the secondary CPU, and we
20811 + * If this happens we are on a secondary CPU, but
20812 * switched to PAT on the boot CPU. We have no way to
20813 * undo PAT.
20814 - */
20815 - BUG_ON(boot_pat_state);
20816 + */
20817 + printk(KERN_ERR "PAT enabled, "
20818 + "but not supported by secondary CPU\n");
20819 + BUG();
20820 }
20821
20822 #ifndef CONFIG_XEN
20823 @@ -87,8 +103,8 @@ void pat_init(void)
20824 * 011 UC _PAGE_CACHE_UC
20825 * PAT bit unused
20826 */
20827 - pat = PAT(0,WB) | PAT(1,WC) | PAT(2,UC_MINUS) | PAT(3,UC) |
20828 - PAT(4,WB) | PAT(5,WC) | PAT(6,UC_MINUS) | PAT(7,UC);
20829 + pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
20830 + PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC);
20831
20832 /* Boot CPU check */
20833 if (!boot_pat_state)
20834 @@ -113,13 +129,13 @@ void pat_init(void)
20835 static char *cattr_name(unsigned long flags)
20836 {
20837 switch (flags & _PAGE_CACHE_MASK) {
20838 - case _PAGE_CACHE_UC: return "uncached";
20839 - case _PAGE_CACHE_UC_MINUS: return "uncached-minus";
20840 - case _PAGE_CACHE_WB: return "write-back";
20841 - case _PAGE_CACHE_WC: return "write-combining";
20842 - case _PAGE_CACHE_WP: return "write-protected";
20843 - case _PAGE_CACHE_WT: return "write-through";
20844 - default: return "broken";
20845 + case _PAGE_CACHE_UC: return "uncached";
20846 + case _PAGE_CACHE_UC_MINUS: return "uncached-minus";
20847 + case _PAGE_CACHE_WB: return "write-back";
20848 + case _PAGE_CACHE_WC: return "write-combining";
20849 + case _PAGE_CACHE_WP: return "write-protected";
20850 + case _PAGE_CACHE_WT: return "write-through";
20851 + default: return "broken";
20852 }
20853 }
20854
20855 @@ -157,49 +173,55 @@ static DEFINE_SPINLOCK(memtype_lock); /
20856 * The intersection is based on "Effective Memory Type" tables in IA-32
20857 * SDM vol 3a
20858 */
20859 -static int pat_x_mtrr_type(u64 start, u64 end, unsigned long prot,
20860 - unsigned long *ret_prot)
20861 +static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type)
20862 {
20863 - unsigned long pat_type;
20864 - u8 mtrr_type;
20865 -
20866 - pat_type = prot & _PAGE_CACHE_MASK;
20867 - prot &= (~_PAGE_CACHE_MASK);
20868 -
20869 - /*
20870 - * We return the PAT request directly for types where PAT takes
20871 - * precedence with respect to MTRR and for UC_MINUS.
20872 - * Consistency checks with other PAT requests is done later
20873 - * while going through memtype list.
20874 - */
20875 - if (pat_type == _PAGE_CACHE_WC) {
20876 - *ret_prot = prot | _PAGE_CACHE_WC;
20877 - return 0;
20878 - } else if (pat_type == _PAGE_CACHE_UC_MINUS) {
20879 - *ret_prot = prot | _PAGE_CACHE_UC_MINUS;
20880 - return 0;
20881 - } else if (pat_type == _PAGE_CACHE_UC) {
20882 - *ret_prot = prot | _PAGE_CACHE_UC;
20883 - return 0;
20884 - }
20885 -
20886 /*
20887 * Look for MTRR hint to get the effective type in case where PAT
20888 * request is for WB.
20889 */
20890 - mtrr_type = mtrr_type_lookup(start, end);
20891 + if (req_type == _PAGE_CACHE_WB) {
20892 + u8 mtrr_type;
20893
20894 - if (mtrr_type == MTRR_TYPE_UNCACHABLE) {
20895 - *ret_prot = prot | _PAGE_CACHE_UC;
20896 - } else if (mtrr_type == MTRR_TYPE_WRCOMB) {
20897 - *ret_prot = prot | _PAGE_CACHE_WC;
20898 - } else {
20899 - *ret_prot = prot | _PAGE_CACHE_WB;
20900 + mtrr_type = mtrr_type_lookup(start, end);
20901 + if (mtrr_type == MTRR_TYPE_UNCACHABLE)
20902 + return _PAGE_CACHE_UC;
20903 + if (mtrr_type == MTRR_TYPE_WRCOMB)
20904 + return _PAGE_CACHE_WC;
20905 + }
20906 +
20907 + return req_type;
20908 +}
20909 +
20910 +static int chk_conflict(struct memtype *new, struct memtype *entry,
20911 + unsigned long *type)
20912 +{
20913 + if (new->type != entry->type) {
20914 + if (type) {
20915 + new->type = entry->type;
20916 + *type = entry->type;
20917 + } else
20918 + goto conflict;
20919 }
20920
20921 + /* check overlaps with more than one entry in the list */
20922 + list_for_each_entry_continue(entry, &memtype_list, nd) {
20923 + if (new->end <= entry->start)
20924 + break;
20925 + else if (new->type != entry->type)
20926 + goto conflict;
20927 + }
20928 return 0;
20929 +
20930 + conflict:
20931 + printk(KERN_INFO "%s:%d conflicting memory types "
20932 + "%Lx-%Lx %s<->%s\n", current->comm, current->pid, new->start,
20933 + new->end, cattr_name(new->type), cattr_name(entry->type));
20934 + return -EBUSY;
20935 }
20936
20937 +static struct memtype *cached_entry;
20938 +static u64 cached_start;
20939 +
20940 /*
20941 * req_type typically has one of the:
20942 * - _PAGE_CACHE_WB
20943 @@ -210,37 +232,36 @@ static int pat_x_mtrr_type(u64 start, u6
20944 * req_type will have a special case value '-1', when requester want to inherit
20945 * the memory type from mtrr (if WB), existing PAT, defaulting to UC_MINUS.
20946 *
20947 - * If ret_type is NULL, function will return an error if it cannot reserve the
20948 - * region with req_type. If ret_type is non-null, function will return
20949 - * available type in ret_type in case of no error. In case of any error
20950 + * If new_type is NULL, function will return an error if it cannot reserve the
20951 + * region with req_type. If new_type is non-NULL, function will return
20952 + * available type in new_type in case of no error. In case of any error
20953 * it will return a negative return value.
20954 */
20955 int reserve_memtype(u64 start, u64 end, unsigned long req_type,
20956 - unsigned long *ret_type)
20957 + unsigned long *new_type)
20958 {
20959 - struct memtype *new_entry = NULL;
20960 - struct memtype *parse;
20961 + struct memtype *new, *entry;
20962 unsigned long actual_type;
20963 + struct list_head *where;
20964 int err = 0;
20965
20966 - /* Only track when pat_wc_enabled */
20967 - if (!pat_wc_enabled) {
20968 + BUG_ON(start >= end); /* end is exclusive */
20969 +
20970 + if (!pat_enabled) {
20971 /* This is identical to page table setting without PAT */
20972 - if (ret_type) {
20973 - if (req_type == -1) {
20974 - *ret_type = _PAGE_CACHE_WB;
20975 - } else {
20976 - *ret_type = req_type;
20977 - }
20978 + if (new_type) {
20979 + if (req_type == -1)
20980 + *new_type = _PAGE_CACHE_WB;
20981 + else
20982 + *new_type = req_type & _PAGE_CACHE_MASK;
20983 }
20984 return 0;
20985 }
20986
20987 /* Low ISA region is always mapped WB in page table. No need to track */
20988 - if (start >= ISA_START_ADDRESS && (end - 1) <= ISA_END_ADDRESS) {
20989 - if (ret_type)
20990 - *ret_type = _PAGE_CACHE_WB;
20991 -
20992 + if (is_ISA_range(start, end - 1)) {
20993 + if (new_type)
20994 + *new_type = _PAGE_CACHE_WB;
20995 return 0;
20996 }
20997
20998 @@ -253,206 +274,118 @@ int reserve_memtype(u64 start, u64 end,
20999 */
21000 u8 mtrr_type = mtrr_type_lookup(start, end);
21001
21002 - if (mtrr_type == MTRR_TYPE_WRBACK) {
21003 - req_type = _PAGE_CACHE_WB;
21004 + if (mtrr_type == MTRR_TYPE_WRBACK)
21005 actual_type = _PAGE_CACHE_WB;
21006 - } else {
21007 - req_type = _PAGE_CACHE_UC_MINUS;
21008 + else
21009 actual_type = _PAGE_CACHE_UC_MINUS;
21010 - }
21011 - } else {
21012 - req_type &= _PAGE_CACHE_MASK;
21013 - err = pat_x_mtrr_type(start, end, req_type, &actual_type);
21014 - }
21015 -
21016 - if (err) {
21017 - if (ret_type)
21018 - *ret_type = actual_type;
21019 + } else
21020 + actual_type = pat_x_mtrr_type(start, end,
21021 + req_type & _PAGE_CACHE_MASK);
21022
21023 - return -EINVAL;
21024 - }
21025 -
21026 - new_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL);
21027 - if (!new_entry)
21028 + new = kmalloc(sizeof(struct memtype), GFP_KERNEL);
21029 + if (!new)
21030 return -ENOMEM;
21031
21032 - new_entry->start = start;
21033 - new_entry->end = end;
21034 - new_entry->type = actual_type;
21035 + new->start = start;
21036 + new->end = end;
21037 + new->type = actual_type;
21038
21039 - if (ret_type)
21040 - *ret_type = actual_type;
21041 + if (new_type)
21042 + *new_type = actual_type;
21043
21044 spin_lock(&memtype_lock);
21045
21046 - /* Search for existing mapping that overlaps the current range */
21047 - list_for_each_entry(parse, &memtype_list, nd) {
21048 - struct memtype *saved_ptr;
21049 + if (cached_entry && start >= cached_start)
21050 + entry = cached_entry;
21051 + else
21052 + entry = list_entry(&memtype_list, struct memtype, nd);
21053
21054 - if (parse->start >= end) {
21055 - pr_debug("New Entry\n");
21056 - list_add(&new_entry->nd, parse->nd.prev);
21057 - new_entry = NULL;
21058 + /* Search for existing mapping that overlaps the current range */
21059 + where = NULL;
21060 + list_for_each_entry_continue(entry, &memtype_list, nd) {
21061 + if (end <= entry->start) {
21062 + where = entry->nd.prev;
21063 + cached_entry = list_entry(where, struct memtype, nd);
21064 break;
21065 - }
21066 -
21067 - if (start <= parse->start && end >= parse->start) {
21068 - if (actual_type != parse->type && ret_type) {
21069 - actual_type = parse->type;
21070 - *ret_type = actual_type;
21071 - new_entry->type = actual_type;
21072 - }
21073 -
21074 - if (actual_type != parse->type) {
21075 - printk(
21076 - KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
21077 - current->comm, current->pid,
21078 - start, end,
21079 - cattr_name(actual_type),
21080 - cattr_name(parse->type));
21081 - err = -EBUSY;
21082 - break;
21083 - }
21084 -
21085 - saved_ptr = parse;
21086 - /*
21087 - * Check to see whether the request overlaps more
21088 - * than one entry in the list
21089 - */
21090 - list_for_each_entry_continue(parse, &memtype_list, nd) {
21091 - if (end <= parse->start) {
21092 - break;
21093 - }
21094 -
21095 - if (actual_type != parse->type) {
21096 - printk(
21097 - KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
21098 - current->comm, current->pid,
21099 - start, end,
21100 - cattr_name(actual_type),
21101 - cattr_name(parse->type));
21102 - err = -EBUSY;
21103 - break;
21104 - }
21105 - }
21106 -
21107 - if (err) {
21108 - break;
21109 + } else if (start <= entry->start) { /* end > entry->start */
21110 + err = chk_conflict(new, entry, new_type);
21111 + if (!err) {
21112 + dprintk("Overlap at 0x%Lx-0x%Lx\n",
21113 + entry->start, entry->end);
21114 + where = entry->nd.prev;
21115 + cached_entry = list_entry(where,
21116 + struct memtype, nd);
21117 }
21118 -
21119 - pr_debug("Overlap at 0x%Lx-0x%Lx\n",
21120 - saved_ptr->start, saved_ptr->end);
21121 - /* No conflict. Go ahead and add this new entry */
21122 - list_add(&new_entry->nd, saved_ptr->nd.prev);
21123 - new_entry = NULL;
21124 break;
21125 - }
21126 -
21127 - if (start < parse->end) {
21128 - if (actual_type != parse->type && ret_type) {
21129 - actual_type = parse->type;
21130 - *ret_type = actual_type;
21131 - new_entry->type = actual_type;
21132 - }
21133 -
21134 - if (actual_type != parse->type) {
21135 - printk(
21136 - KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
21137 - current->comm, current->pid,
21138 - start, end,
21139 - cattr_name(actual_type),
21140 - cattr_name(parse->type));
21141 - err = -EBUSY;
21142 - break;
21143 - }
21144 -
21145 - saved_ptr = parse;
21146 - /*
21147 - * Check to see whether the request overlaps more
21148 - * than one entry in the list
21149 - */
21150 - list_for_each_entry_continue(parse, &memtype_list, nd) {
21151 - if (end <= parse->start) {
21152 - break;
21153 - }
21154 -
21155 - if (actual_type != parse->type) {
21156 - printk(
21157 - KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
21158 - current->comm, current->pid,
21159 - start, end,
21160 - cattr_name(actual_type),
21161 - cattr_name(parse->type));
21162 - err = -EBUSY;
21163 - break;
21164 + } else if (start < entry->end) { /* start > entry->start */
21165 + err = chk_conflict(new, entry, new_type);
21166 + if (!err) {
21167 + dprintk("Overlap at 0x%Lx-0x%Lx\n",
21168 + entry->start, entry->end);
21169 + cached_entry = list_entry(entry->nd.prev,
21170 + struct memtype, nd);
21171 +
21172 + /*
21173 + * Move to right position in the linked
21174 + * list to add this new entry
21175 + */
21176 + list_for_each_entry_continue(entry,
21177 + &memtype_list, nd) {
21178 + if (start <= entry->start) {
21179 + where = entry->nd.prev;
21180 + break;
21181 + }
21182 }
21183 }
21184 -
21185 - if (err) {
21186 - break;
21187 - }
21188 -
21189 - pr_debug(KERN_INFO "Overlap at 0x%Lx-0x%Lx\n",
21190 - saved_ptr->start, saved_ptr->end);
21191 - /* No conflict. Go ahead and add this new entry */
21192 - list_add(&new_entry->nd, &saved_ptr->nd);
21193 - new_entry = NULL;
21194 break;
21195 }
21196 }
21197
21198 if (err) {
21199 - printk(KERN_INFO
21200 - "reserve_memtype failed 0x%Lx-0x%Lx, track %s, req %s\n",
21201 - start, end, cattr_name(new_entry->type),
21202 - cattr_name(req_type));
21203 - kfree(new_entry);
21204 + printk(KERN_INFO "reserve_memtype failed 0x%Lx-0x%Lx, "
21205 + "track %s, req %s\n",
21206 + start, end, cattr_name(new->type), cattr_name(req_type));
21207 + kfree(new);
21208 spin_unlock(&memtype_lock);
21209 return err;
21210 }
21211
21212 - if (new_entry) {
21213 - /* No conflict. Not yet added to the list. Add to the tail */
21214 - list_add_tail(&new_entry->nd, &memtype_list);
21215 - pr_debug("New Entry\n");
21216 - }
21217 + cached_start = start;
21218
21219 - if (ret_type) {
21220 - pr_debug(
21221 - "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n",
21222 - start, end, cattr_name(actual_type),
21223 - cattr_name(req_type), cattr_name(*ret_type));
21224 - } else {
21225 - pr_debug(
21226 - "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s\n",
21227 - start, end, cattr_name(actual_type),
21228 - cattr_name(req_type));
21229 - }
21230 + if (where)
21231 + list_add(&new->nd, where);
21232 + else
21233 + list_add_tail(&new->nd, &memtype_list);
21234
21235 spin_unlock(&memtype_lock);
21236 +
21237 + dprintk("reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n",
21238 + start, end, cattr_name(new->type), cattr_name(req_type),
21239 + new_type ? cattr_name(*new_type) : "-");
21240 +
21241 return err;
21242 }
21243
21244 int free_memtype(u64 start, u64 end)
21245 {
21246 - struct memtype *ml;
21247 + struct memtype *entry;
21248 int err = -EINVAL;
21249
21250 - /* Only track when pat_wc_enabled */
21251 - if (!pat_wc_enabled) {
21252 + if (!pat_enabled)
21253 return 0;
21254 - }
21255
21256 /* Low ISA region is always mapped WB. No need to track */
21257 - if (start >= ISA_START_ADDRESS && end <= ISA_END_ADDRESS) {
21258 + if (is_ISA_range(start, end - 1))
21259 return 0;
21260 - }
21261
21262 spin_lock(&memtype_lock);
21263 - list_for_each_entry(ml, &memtype_list, nd) {
21264 - if (ml->start == start && ml->end == end) {
21265 - list_del(&ml->nd);
21266 - kfree(ml);
21267 + list_for_each_entry(entry, &memtype_list, nd) {
21268 + if (entry->start == start && entry->end == end) {
21269 + if (cached_entry == entry || cached_start == start)
21270 + cached_entry = NULL;
21271 +
21272 + list_del(&entry->nd);
21273 + kfree(entry);
21274 err = 0;
21275 break;
21276 }
21277 @@ -464,27 +397,19 @@ int free_memtype(u64 start, u64 end)
21278 current->comm, current->pid, start, end);
21279 }
21280
21281 - pr_debug("free_memtype request 0x%Lx-0x%Lx\n", start, end);
21282 + dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end);
21283 return err;
21284 }
21285
21286
21287 -/*
21288 - * /dev/mem mmap interface. The memtype used for mapping varies:
21289 - * - Use UC for mappings with O_SYNC flag
21290 - * - Without O_SYNC flag, if there is any conflict in reserve_memtype,
21291 - * inherit the memtype from existing mapping.
21292 - * - Else use UC_MINUS memtype (for backward compatibility with existing
21293 - * X drivers.
21294 - */
21295 pgprot_t phys_mem_access_prot(struct file *file, unsigned long mfn,
21296 unsigned long size, pgprot_t vma_prot)
21297 {
21298 return vma_prot;
21299 }
21300
21301 -#ifdef CONFIG_NONPROMISC_DEVMEM
21302 -/* This check is done in drivers/char/mem.c in case of NONPROMISC_DEVMEM*/
21303 +#ifdef CONFIG_STRICT_DEVMEM
21304 +/* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM*/
21305 static inline int range_is_allowed(unsigned long mfn, unsigned long size)
21306 {
21307 return 1;
21308 @@ -508,20 +433,20 @@ static inline int range_is_allowed(unsig
21309 }
21310 return 1;
21311 }
21312 -#endif /* CONFIG_NONPROMISC_DEVMEM */
21313 +#endif /* CONFIG_STRICT_DEVMEM */
21314
21315 int phys_mem_access_prot_allowed(struct file *file, unsigned long mfn,
21316 unsigned long size, pgprot_t *vma_prot)
21317 {
21318 u64 addr = (u64)mfn << PAGE_SHIFT;
21319 - unsigned long flags = _PAGE_CACHE_UC_MINUS;
21320 + unsigned long flags = -1;
21321 int retval;
21322
21323 if (!range_is_allowed(mfn, size))
21324 return 0;
21325
21326 if (file->f_flags & O_SYNC) {
21327 - flags = _PAGE_CACHE_UC;
21328 + flags = _PAGE_CACHE_UC_MINUS;
21329 }
21330
21331 #ifndef CONFIG_X86_32
21332 @@ -534,25 +459,26 @@ int phys_mem_access_prot_allowed(struct
21333 * caching for the high addresses through the KEN pin, but
21334 * we maintain the tradition of paranoia in this code.
21335 */
21336 - if (!pat_wc_enabled &&
21337 - ! ( test_bit(X86_FEATURE_MTRR, boot_cpu_data.x86_capability) ||
21338 - test_bit(X86_FEATURE_K6_MTRR, boot_cpu_data.x86_capability) ||
21339 - test_bit(X86_FEATURE_CYRIX_ARR, boot_cpu_data.x86_capability) ||
21340 - test_bit(X86_FEATURE_CENTAUR_MCR, boot_cpu_data.x86_capability)) &&
21341 - (pfn << PAGE_SHIFT) >= __pa(high_memory)) {
21342 + if (!pat_enabled &&
21343 + !(boot_cpu_has(X86_FEATURE_MTRR) ||
21344 + boot_cpu_has(X86_FEATURE_K6_MTRR) ||
21345 + boot_cpu_has(X86_FEATURE_CYRIX_ARR) ||
21346 + boot_cpu_has(X86_FEATURE_CENTAUR_MCR)) &&
21347 + (pfn << PAGE_SHIFT) >= __pa(high_memory)) {
21348 flags = _PAGE_CACHE_UC;
21349 }
21350 #endif
21351 #endif
21352
21353 /*
21354 - * With O_SYNC, we can only take UC mapping. Fail if we cannot.
21355 + * With O_SYNC, we can only take UC_MINUS mapping. Fail if we cannot.
21356 + *
21357 * Without O_SYNC, we want to get
21358 * - WB for WB-able memory and no other conflicting mappings
21359 * - UC_MINUS for non-WB-able memory with no other conflicting mappings
21360 * - Inherit from confliting mappings otherwise
21361 */
21362 - if (flags != _PAGE_CACHE_UC_MINUS) {
21363 + if (flags != -1) {
21364 retval = reserve_memtype(addr, addr + size, flags, NULL);
21365 } else {
21366 retval = reserve_memtype(addr, addr + size, -1, &flags);
21367 @@ -600,3 +526,88 @@ void unmap_devmem(unsigned long mfn, uns
21368 free_memtype(addr, addr + size);
21369 }
21370
21371 +#if defined(CONFIG_DEBUG_FS)
21372 +
21373 +/* get Nth element of the linked list */
21374 +static struct memtype *memtype_get_idx(loff_t pos)
21375 +{
21376 + struct memtype *list_node, *print_entry;
21377 + int i = 1;
21378 +
21379 + print_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL);
21380 + if (!print_entry)
21381 + return NULL;
21382 +
21383 + spin_lock(&memtype_lock);
21384 + list_for_each_entry(list_node, &memtype_list, nd) {
21385 + if (pos == i) {
21386 + *print_entry = *list_node;
21387 + spin_unlock(&memtype_lock);
21388 + return print_entry;
21389 + }
21390 + ++i;
21391 + }
21392 + spin_unlock(&memtype_lock);
21393 + kfree(print_entry);
21394 + return NULL;
21395 +}
21396 +
21397 +static void *memtype_seq_start(struct seq_file *seq, loff_t *pos)
21398 +{
21399 + if (*pos == 0) {
21400 + ++*pos;
21401 + seq_printf(seq, "PAT memtype list:\n");
21402 + }
21403 +
21404 + return memtype_get_idx(*pos);
21405 +}
21406 +
21407 +static void *memtype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
21408 +{
21409 + ++*pos;
21410 + return memtype_get_idx(*pos);
21411 +}
21412 +
21413 +static void memtype_seq_stop(struct seq_file *seq, void *v)
21414 +{
21415 +}
21416 +
21417 +static int memtype_seq_show(struct seq_file *seq, void *v)
21418 +{
21419 + struct memtype *print_entry = (struct memtype *)v;
21420 +
21421 + seq_printf(seq, "%s @ 0x%Lx-0x%Lx\n", cattr_name(print_entry->type),
21422 + print_entry->start, print_entry->end);
21423 + kfree(print_entry);
21424 + return 0;
21425 +}
21426 +
21427 +static struct seq_operations memtype_seq_ops = {
21428 + .start = memtype_seq_start,
21429 + .next = memtype_seq_next,
21430 + .stop = memtype_seq_stop,
21431 + .show = memtype_seq_show,
21432 +};
21433 +
21434 +static int memtype_seq_open(struct inode *inode, struct file *file)
21435 +{
21436 + return seq_open(file, &memtype_seq_ops);
21437 +}
21438 +
21439 +static const struct file_operations memtype_fops = {
21440 + .open = memtype_seq_open,
21441 + .read = seq_read,
21442 + .llseek = seq_lseek,
21443 + .release = seq_release,
21444 +};
21445 +
21446 +static int __init pat_memtype_list_init(void)
21447 +{
21448 + debugfs_create_file("pat_memtype_list", S_IRUSR, arch_debugfs_dir,
21449 + NULL, &memtype_fops);
21450 + return 0;
21451 +}
21452 +
21453 +late_initcall(pat_memtype_list_init);
21454 +
21455 +#endif /* CONFIG_DEBUG_FS */
21456 --- sle11-2009-06-04.orig/arch/x86/mm/pgtable-xen.c 2009-03-16 16:38:05.000000000 +0100
21457 +++ sle11-2009-06-04/arch/x86/mm/pgtable-xen.c 2009-06-04 10:21:39.000000000 +0200
21458 @@ -4,6 +4,7 @@
21459 #include <asm/pgalloc.h>
21460 #include <asm/pgtable.h>
21461 #include <asm/tlb.h>
21462 +#include <asm/fixmap.h>
21463 #include <asm/hypervisor.h>
21464 #include <asm/mmu_context.h>
21465
21466 @@ -410,15 +411,9 @@ static inline void pgd_list_del(pgd_t *p
21467 static void pgd_ctor(void *p)
21468 {
21469 pgd_t *pgd = p;
21470 - unsigned long flags;
21471
21472 pgd_test_and_unpin(pgd);
21473
21474 - /* Clear usermode parts of PGD */
21475 - memset(pgd, 0, KERNEL_PGD_BOUNDARY*sizeof(pgd_t));
21476 -
21477 - spin_lock_irqsave(&pgd_lock, flags);
21478 -
21479 /* If the pgd points to a shared pagetable level (either the
21480 ptes in non-PAE, or shared PMD in PAE), then just copy the
21481 references from swapper_pg_dir. */
21482 @@ -440,13 +435,9 @@ static void pgd_ctor(void *p)
21483 __pgd(__pa_symbol(level3_user_pgt) | _PAGE_TABLE);
21484 #endif
21485
21486 -#ifndef CONFIG_X86_PAE
21487 /* list required to sync kernel mapping updates */
21488 if (!SHARED_KERNEL_PMD)
21489 pgd_list_add(pgd);
21490 -#endif
21491 -
21492 - spin_unlock_irqrestore(&pgd_lock, flags);
21493 }
21494
21495 static void pgd_dtor(void *pgd)
21496 @@ -475,33 +466,6 @@ static void pgd_dtor(void *pgd)
21497
21498 #ifdef CONFIG_X86_PAE
21499 /*
21500 - * Mop up any pmd pages which may still be attached to the pgd.
21501 - * Normally they will be freed by munmap/exit_mmap, but any pmd we
21502 - * preallocate which never got a corresponding vma will need to be
21503 - * freed manually.
21504 - */
21505 -static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
21506 -{
21507 - int i;
21508 -
21509 - for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) {
21510 - pgd_t pgd = pgdp[i];
21511 -
21512 - if (__pgd_val(pgd) != 0) {
21513 - pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
21514 -
21515 - pgdp[i] = xen_make_pgd(0);
21516 -
21517 - paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
21518 - pmd_free(mm, pmd);
21519 - }
21520 - }
21521 -
21522 - if (!xen_feature(XENFEAT_pae_pgdir_above_4gb))
21523 - xen_destroy_contiguous_region((unsigned long)pgdp, 0);
21524 -}
21525 -
21526 -/*
21527 * In PAE mode, we need to do a cr3 reload (=tlb flush) when
21528 * updating the top-level pagetable entries to guarantee the
21529 * processor notices the update. Since this is expensive, and
21530 @@ -512,61 +476,7 @@ static void pgd_mop_up_pmds(struct mm_st
21531 * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
21532 * and initialize the kernel pmds here.
21533 */
21534 -static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
21535 -{
21536 - pud_t *pud;
21537 - pmd_t *pmds[UNSHARED_PTRS_PER_PGD];
21538 - unsigned long addr, flags;
21539 - int i;
21540 -
21541 - /*
21542 - * We can race save/restore (if we sleep during a GFP_KERNEL memory
21543 - * allocation). We therefore store virtual addresses of pmds as they
21544 - * do not change across save/restore, and poke the machine addresses
21545 - * into the pgdir under the pgd_lock.
21546 - */
21547 - for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD; i++, addr += PUD_SIZE) {
21548 - pmds[i] = pmd_alloc_one(mm, addr);
21549 - if (!pmds[i])
21550 - goto out_oom;
21551 - }
21552 -
21553 - spin_lock_irqsave(&pgd_lock, flags);
21554 -
21555 - /* Protect against save/restore: move below 4GB under pgd_lock. */
21556 - if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)
21557 - && xen_create_contiguous_region((unsigned long)pgd, 0, 32)) {
21558 - spin_unlock_irqrestore(&pgd_lock, flags);
21559 -out_oom:
21560 - while (i--)
21561 - pmd_free(mm, pmds[i]);
21562 - return 0;
21563 - }
21564 -
21565 - /* Copy kernel pmd contents and write-protect the new pmds. */
21566 - pud = pud_offset(pgd, 0);
21567 - for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD;
21568 - i++, pud++, addr += PUD_SIZE) {
21569 - if (i >= KERNEL_PGD_BOUNDARY) {
21570 - memcpy(pmds[i],
21571 - (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
21572 - sizeof(pmd_t) * PTRS_PER_PMD);
21573 - make_lowmem_page_readonly(
21574 - pmds[i], XENFEAT_writable_page_tables);
21575 - }
21576 -
21577 - /* It is safe to poke machine addresses of pmds under the pgd_lock. */
21578 - pud_populate(mm, pud, pmds[i]);
21579 - }
21580 -
21581 - /* List required to sync kernel mapping updates and
21582 - * to pin/unpin on save/restore. */
21583 - pgd_list_add(pgd);
21584 -
21585 - spin_unlock_irqrestore(&pgd_lock, flags);
21586 -
21587 - return 1;
21588 -}
21589 +#define PREALLOCATED_PMDS UNSHARED_PTRS_PER_PGD
21590
21591 void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
21592 {
21593 @@ -596,16 +506,101 @@ void pud_populate(struct mm_struct *mm,
21594 xen_tlb_flush();
21595 }
21596 #else /* !CONFIG_X86_PAE */
21597 +
21598 /* No need to prepopulate any pagetable entries in non-PAE modes. */
21599 -static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
21600 +#define PREALLOCATED_PMDS 0
21601 +
21602 +#endif /* CONFIG_X86_PAE */
21603 +
21604 +static void free_pmds(pmd_t *pmds[], struct mm_struct *mm, bool contig)
21605 {
21606 - return 1;
21607 + int i;
21608 +
21609 +#ifdef CONFIG_X86_PAE
21610 + if (contig)
21611 + xen_destroy_contiguous_region((unsigned long)mm->pgd, 0);
21612 +#endif
21613 +
21614 + for(i = 0; i < PREALLOCATED_PMDS; i++)
21615 + if (pmds[i])
21616 + pmd_free(mm, pmds[i]);
21617 }
21618
21619 -static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgd)
21620 +static int preallocate_pmds(pmd_t *pmds[], struct mm_struct *mm)
21621 {
21622 + int i;
21623 + bool failed = false;
21624 +
21625 + for(i = 0; i < PREALLOCATED_PMDS; i++) {
21626 + pmd_t *pmd = pmd_alloc_one(mm, i << PUD_SHIFT);
21627 + if (pmd == NULL)
21628 + failed = true;
21629 + pmds[i] = pmd;
21630 + }
21631 +
21632 + if (failed) {
21633 + free_pmds(pmds, mm, false);
21634 + return -ENOMEM;
21635 + }
21636 +
21637 + return 0;
21638 +}
21639 +
21640 +/*
21641 + * Mop up any pmd pages which may still be attached to the pgd.
21642 + * Normally they will be freed by munmap/exit_mmap, but any pmd we
21643 + * preallocate which never got a corresponding vma will need to be
21644 + * freed manually.
21645 + */
21646 +static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
21647 +{
21648 + int i;
21649 +
21650 + for(i = 0; i < PREALLOCATED_PMDS; i++) {
21651 + pgd_t pgd = pgdp[i];
21652 +
21653 + if (__pgd_val(pgd) != 0) {
21654 + pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
21655 +
21656 + pgdp[i] = xen_make_pgd(0);
21657 +
21658 + paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
21659 + pmd_free(mm, pmd);
21660 + }
21661 + }
21662 +
21663 +#ifdef CONFIG_X86_PAE
21664 + if (!xen_feature(XENFEAT_pae_pgdir_above_4gb))
21665 + xen_destroy_contiguous_region((unsigned long)pgdp, 0);
21666 +#endif
21667 +}
21668 +
21669 +static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
21670 +{
21671 + pud_t *pud;
21672 + unsigned long addr;
21673 + int i;
21674 +
21675 + if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */
21676 + return;
21677 +
21678 + pud = pud_offset(pgd, 0);
21679 + for (addr = i = 0; i < PREALLOCATED_PMDS;
21680 + i++, pud++, addr += PUD_SIZE) {
21681 + pmd_t *pmd = pmds[i];
21682 +
21683 + if (i >= KERNEL_PGD_BOUNDARY) {
21684 + memcpy(pmd,
21685 + (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
21686 + sizeof(pmd_t) * PTRS_PER_PMD);
21687 + make_lowmem_page_readonly(
21688 + pmd, XENFEAT_writable_page_tables);
21689 + }
21690 +
21691 + /* It is safe to poke machine addresses of pmds under the pgd_lock. */
21692 + pud_populate(mm, pud, pmd);
21693 + }
21694 }
21695 -#endif /* CONFIG_X86_PAE */
21696
21697 #ifdef CONFIG_X86_64
21698 /* We allocate two contiguous pages for kernel and user. */
21699 @@ -616,19 +611,52 @@ static void pgd_mop_up_pmds(struct mm_st
21700
21701 pgd_t *pgd_alloc(struct mm_struct *mm)
21702 {
21703 - pgd_t *pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, PGD_ORDER);
21704 + pgd_t *pgd;
21705 + pmd_t *pmds[PREALLOCATED_PMDS];
21706 + unsigned long flags;
21707 +
21708 + pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, PGD_ORDER);
21709 +
21710 + if (pgd == NULL)
21711 + goto out;
21712
21713 - /* so that alloc_pd can use it */
21714 mm->pgd = pgd;
21715 - if (pgd)
21716 - pgd_ctor(pgd);
21717
21718 - if (pgd && !pgd_prepopulate_pmd(mm, pgd)) {
21719 - free_pages((unsigned long)pgd, PGD_ORDER);
21720 - pgd = NULL;
21721 + if (preallocate_pmds(pmds, mm) != 0)
21722 + goto out_free_pgd;
21723 +
21724 + if (paravirt_pgd_alloc(mm) != 0)
21725 + goto out_free_pmds;
21726 +
21727 + /*
21728 + * Make sure that pre-populating the pmds is atomic with
21729 + * respect to anything walking the pgd_list, so that they
21730 + * never see a partially populated pgd.
21731 + */
21732 + spin_lock_irqsave(&pgd_lock, flags);
21733 +
21734 +#ifdef CONFIG_X86_PAE
21735 + /* Protect against save/restore: move below 4GB under pgd_lock. */
21736 + if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)
21737 + && xen_create_contiguous_region((unsigned long)pgd, 0, 32)) {
21738 + spin_unlock_irqrestore(&pgd_lock, flags);
21739 + goto out_free_pmds;
21740 }
21741 +#endif
21742 +
21743 + pgd_ctor(pgd);
21744 + pgd_prepopulate_pmd(mm, pgd, pmds);
21745 +
21746 + spin_unlock_irqrestore(&pgd_lock, flags);
21747
21748 return pgd;
21749 +
21750 +out_free_pmds:
21751 + free_pmds(pmds, mm, !xen_feature(XENFEAT_pae_pgdir_above_4gb));
21752 +out_free_pgd:
21753 + free_pages((unsigned long)pgd, PGD_ORDER);
21754 +out:
21755 + return NULL;
21756 }
21757
21758 void pgd_free(struct mm_struct *mm, pgd_t *pgd)
21759 @@ -644,6 +672,7 @@ void pgd_free(struct mm_struct *mm, pgd_
21760 pgd_dtor(pgd);
21761
21762 pgd_mop_up_pmds(mm, pgd);
21763 + paravirt_pgd_free(mm, pgd);
21764 free_pages((unsigned long)pgd, PGD_ORDER);
21765 }
21766
21767 @@ -685,7 +714,7 @@ int ptep_test_and_clear_young(struct vm_
21768
21769 if (pte_young(*ptep))
21770 ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
21771 - &ptep->pte);
21772 + (unsigned long *) &ptep->pte);
21773
21774 if (ret)
21775 pte_update(vma->vm_mm, addr, ptep);
21776 @@ -707,3 +736,42 @@ int ptep_clear_flush_young(struct vm_are
21777
21778 return young;
21779 }
21780 +
21781 +int fixmaps_set;
21782 +
21783 +void xen_set_fixmap(enum fixed_addresses idx, maddr_t phys, pgprot_t flags)
21784 +{
21785 + unsigned long address = __fix_to_virt(idx);
21786 + pte_t pte;
21787 +
21788 + if (idx >= __end_of_fixed_addresses) {
21789 + BUG();
21790 + return;
21791 + }
21792 +
21793 + switch (idx) {
21794 +#ifdef CONFIG_X86_64
21795 + extern pte_t level1_fixmap_pgt[PTRS_PER_PTE];
21796 +
21797 + case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
21798 + pte = pfn_pte(phys >> PAGE_SHIFT, flags);
21799 + set_pte_vaddr_pud(level3_user_pgt, address, pte);
21800 + break;
21801 + case FIX_EARLYCON_MEM_BASE:
21802 + xen_l1_entry_update(level1_fixmap_pgt + pte_index(address),
21803 + pfn_pte_ma(phys >> PAGE_SHIFT, flags));
21804 + fixmaps_set++;
21805 + return;
21806 +#else
21807 + case FIX_WP_TEST:
21808 + case FIX_VDSO:
21809 + pte = pfn_pte(phys >> PAGE_SHIFT, flags);
21810 + break;
21811 +#endif
21812 + default:
21813 + pte = pfn_pte_ma(phys >> PAGE_SHIFT, flags);
21814 + break;
21815 + }
21816 + set_pte_vaddr(address, pte);
21817 + fixmaps_set++;
21818 +}
21819 --- sle11-2009-06-04.orig/arch/x86/mm/pgtable_32-xen.c 2009-03-16 16:38:05.000000000 +0100
21820 +++ sle11-2009-06-04/arch/x86/mm/pgtable_32-xen.c 2009-06-04 10:21:39.000000000 +0200
21821 @@ -25,51 +25,49 @@
21822 #include <xen/features.h>
21823 #include <asm/hypervisor.h>
21824
21825 -void show_mem(void)
21826 +/*
21827 + * Associate a virtual page frame with a given physical page frame
21828 + * and protection flags for that frame.
21829 + */
21830 +void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
21831 {
21832 - int total = 0, reserved = 0;
21833 - int shared = 0, cached = 0;
21834 - int highmem = 0;
21835 - struct page *page;
21836 - pg_data_t *pgdat;
21837 - unsigned long i;
21838 - unsigned long flags;
21839 -
21840 - printk(KERN_INFO "Mem-info:\n");
21841 - show_free_areas();
21842 - for_each_online_pgdat(pgdat) {
21843 - pgdat_resize_lock(pgdat, &flags);
21844 - for (i = 0; i < pgdat->node_spanned_pages; ++i) {
21845 - if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
21846 - touch_nmi_watchdog();
21847 - page = pgdat_page_nr(pgdat, i);
21848 - total++;
21849 - if (PageHighMem(page))
21850 - highmem++;
21851 - if (PageReserved(page))
21852 - reserved++;
21853 - else if (PageSwapCache(page))
21854 - cached++;
21855 - else if (page_count(page))
21856 - shared += page_count(page) - 1;
21857 - }
21858 - pgdat_resize_unlock(pgdat, &flags);
21859 - }
21860 - printk(KERN_INFO "%d pages of RAM\n", total);
21861 - printk(KERN_INFO "%d pages of HIGHMEM\n", highmem);
21862 - printk(KERN_INFO "%d reserved pages\n", reserved);
21863 - printk(KERN_INFO "%d pages shared\n", shared);
21864 - printk(KERN_INFO "%d pages swap cached\n", cached);
21865 -
21866 - printk(KERN_INFO "%lu pages dirty\n", global_page_state(NR_FILE_DIRTY));
21867 - printk(KERN_INFO "%lu pages writeback\n",
21868 - global_page_state(NR_WRITEBACK));
21869 - printk(KERN_INFO "%lu pages mapped\n", global_page_state(NR_FILE_MAPPED));
21870 - printk(KERN_INFO "%lu pages slab\n",
21871 - global_page_state(NR_SLAB_RECLAIMABLE) +
21872 - global_page_state(NR_SLAB_UNRECLAIMABLE));
21873 - printk(KERN_INFO "%lu pages pagetables\n",
21874 - global_page_state(NR_PAGETABLE));
21875 +#ifndef CONFIG_XEN
21876 + pgd_t *pgd;
21877 + pud_t *pud;
21878 + pmd_t *pmd;
21879 + pte_t *pte;
21880 +
21881 + pgd = swapper_pg_dir + pgd_index(vaddr);
21882 + if (pgd_none(*pgd)) {
21883 + BUG();
21884 + return;
21885 + }
21886 + pud = pud_offset(pgd, vaddr);
21887 + if (pud_none(*pud)) {
21888 + BUG();
21889 + return;
21890 + }
21891 + pmd = pmd_offset(pud, vaddr);
21892 + if (pmd_none(*pmd)) {
21893 + BUG();
21894 + return;
21895 + }
21896 + pte = pte_offset_kernel(pmd, vaddr);
21897 + if (pte_val(pteval))
21898 + set_pte_present(&init_mm, vaddr, pte, pteval);
21899 + else
21900 + pte_clear(&init_mm, vaddr, pte);
21901 +
21902 + /*
21903 + * It's enough to flush this one mapping.
21904 + * (PGE mappings get flushed as well)
21905 + */
21906 + __flush_tlb_one(vaddr);
21907 +#else
21908 + if (HYPERVISOR_update_va_mapping(vaddr, pteval,
21909 + UVMF_INVLPG|UVMF_ALL))
21910 + BUG();
21911 +#endif
21912 }
21913
21914 /*
21915 @@ -107,35 +105,10 @@ void set_pmd_pfn(unsigned long vaddr, un
21916 __flush_tlb_one(vaddr);
21917 }
21918
21919 -static int fixmaps;
21920 unsigned long hypervisor_virt_start = HYPERVISOR_VIRT_START;
21921 unsigned long __FIXADDR_TOP = (HYPERVISOR_VIRT_START - PAGE_SIZE);
21922 EXPORT_SYMBOL(__FIXADDR_TOP);
21923
21924 -void __set_fixmap (enum fixed_addresses idx, maddr_t phys, pgprot_t flags)
21925 -{
21926 - unsigned long address = __fix_to_virt(idx);
21927 - pte_t pte;
21928 -
21929 - if (idx >= __end_of_fixed_addresses) {
21930 - BUG();
21931 - return;
21932 - }
21933 - switch (idx) {
21934 - case FIX_WP_TEST:
21935 - case FIX_VDSO:
21936 - pte = pfn_pte(phys >> PAGE_SHIFT, flags);
21937 - break;
21938 - default:
21939 - pte = pfn_pte_ma(phys >> PAGE_SHIFT, flags);
21940 - break;
21941 - }
21942 - if (HYPERVISOR_update_va_mapping(address, pte,
21943 - UVMF_INVLPG|UVMF_ALL))
21944 - BUG();
21945 - fixmaps++;
21946 -}
21947 -
21948 /**
21949 * reserve_top_address - reserves a hole in the top of kernel address space
21950 * @reserve - size of hole to reserve
21951 @@ -145,13 +118,48 @@ void __set_fixmap (enum fixed_addresses
21952 */
21953 void __init reserve_top_address(unsigned long reserve)
21954 {
21955 - BUG_ON(fixmaps > 0);
21956 + BUG_ON(fixmaps_set > 0);
21957 printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
21958 (int)-reserve);
21959 __FIXADDR_TOP = -reserve - PAGE_SIZE;
21960 __VMALLOC_RESERVE += reserve;
21961 }
21962
21963 +/*
21964 + * vmalloc=size forces the vmalloc area to be exactly 'size'
21965 + * bytes. This can be used to increase (or decrease) the
21966 + * vmalloc area - the default is 128m.
21967 + */
21968 +static int __init parse_vmalloc(char *arg)
21969 +{
21970 + if (!arg)
21971 + return -EINVAL;
21972 +
21973 + __VMALLOC_RESERVE = memparse(arg, &arg);
21974 + return 0;
21975 +}
21976 +early_param("vmalloc", parse_vmalloc);
21977 +
21978 +#ifndef CONFIG_XEN
21979 +/*
21980 + * reservetop=size reserves a hole at the top of the kernel address space which
21981 + * a hypervisor can load into later. Needed for dynamically loaded hypervisors,
21982 + * so relocating the fixmap can be done before paging initialization.
21983 + */
21984 +static int __init parse_reservetop(char *arg)
21985 +{
21986 + unsigned long address;
21987 +
21988 + if (!arg)
21989 + return -EINVAL;
21990 +
21991 + address = memparse(arg, &arg);
21992 + reserve_top_address(address);
21993 + return 0;
21994 +}
21995 +early_param("reservetop", parse_reservetop);
21996 +#endif
21997 +
21998 void make_lowmem_page_readonly(void *va, unsigned int feature)
21999 {
22000 pte_t *pte;
22001 --- sle11-2009-06-04.orig/arch/x86/pci/amd_bus.c 2009-06-04 11:08:07.000000000 +0200
22002 +++ sle11-2009-06-04/arch/x86/pci/amd_bus.c 2009-06-04 10:21:39.000000000 +0200
22003 @@ -607,6 +607,14 @@ static int __init pci_io_ecs_init(void)
22004 for_each_online_cpu(cpu)
22005 amd_cpu_notify(&amd_cpu_notifier, (unsigned long)CPU_ONLINE,
22006 (void *)(long)cpu);
22007 +#ifdef CONFIG_XEN
22008 + {
22009 + u64 reg;
22010 + rdmsrl(MSR_AMD64_NB_CFG, reg);
22011 + if (!(reg & ENABLE_CF8_EXT_CFG))
22012 + return 0;
22013 + }
22014 +#endif
22015 pci_probe |= PCI_HAS_IO_ECS;
22016
22017 return 0;
22018 @@ -614,6 +622,10 @@ static int __init pci_io_ecs_init(void)
22019
22020 static int __init amd_postcore_init(void)
22021 {
22022 +#ifdef CONFIG_XEN
22023 + if (!is_initial_xendomain())
22024 + return 0;
22025 +#endif
22026 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
22027 return 0;
22028
22029 --- sle11-2009-06-04.orig/arch/x86/pci/irq-xen.c 2009-03-16 16:38:05.000000000 +0100
22030 +++ sle11-2009-06-04/arch/x86/pci/irq-xen.c 2009-06-04 10:21:39.000000000 +0200
22031 @@ -11,8 +11,8 @@
22032 #include <linux/slab.h>
22033 #include <linux/interrupt.h>
22034 #include <linux/dmi.h>
22035 -#include <asm/io.h>
22036 -#include <asm/smp.h>
22037 +#include <linux/io.h>
22038 +#include <linux/smp.h>
22039 #include <asm/io_apic.h>
22040 #include <linux/irq.h>
22041 #include <linux/acpi.h>
22042 @@ -45,7 +45,8 @@ struct irq_router {
22043 char *name;
22044 u16 vendor, device;
22045 int (*get)(struct pci_dev *router, struct pci_dev *dev, int pirq);
22046 - int (*set)(struct pci_dev *router, struct pci_dev *dev, int pirq, int new);
22047 + int (*set)(struct pci_dev *router, struct pci_dev *dev, int pirq,
22048 + int new);
22049 };
22050
22051 struct irq_router_handler {
22052 @@ -61,7 +62,7 @@ void (*pcibios_disable_irq)(struct pci_d
22053 * and perform checksum verification.
22054 */
22055
22056 -static inline struct irq_routing_table * pirq_check_routing_table(u8 *addr)
22057 +static inline struct irq_routing_table *pirq_check_routing_table(u8 *addr)
22058 {
22059 struct irq_routing_table *rt;
22060 int i;
22061 @@ -74,10 +75,11 @@ static inline struct irq_routing_table *
22062 rt->size < sizeof(struct irq_routing_table))
22063 return NULL;
22064 sum = 0;
22065 - for (i=0; i < rt->size; i++)
22066 + for (i = 0; i < rt->size; i++)
22067 sum += addr[i];
22068 if (!sum) {
22069 - DBG(KERN_DEBUG "PCI: Interrupt Routing Table found at 0x%p\n", rt);
22070 + DBG(KERN_DEBUG "PCI: Interrupt Routing Table found at 0x%p\n",
22071 + rt);
22072 return rt;
22073 }
22074 return NULL;
22075 @@ -104,7 +106,9 @@ static struct irq_routing_table * __init
22076 return rt;
22077 printk(KERN_WARNING "PCI: PIRQ table NOT found at pirqaddr\n");
22078 }
22079 - for(addr = (u8 *) isa_bus_to_virt(0xf0000); addr < (u8 *) isa_bus_to_virt(0x100000); addr += 16) {
22080 + for (addr = (u8 *) isa_bus_to_virt(0xf0000);
22081 + addr < (u8 *) isa_bus_to_virt(0x100000);
22082 + addr += 16) {
22083 rt = pirq_check_routing_table(addr);
22084 if (rt)
22085 return rt;
22086 @@ -126,20 +130,20 @@ static void __init pirq_peer_trick(void)
22087 struct irq_info *e;
22088
22089 memset(busmap, 0, sizeof(busmap));
22090 - for(i=0; i < (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info); i++) {
22091 + for (i = 0; i < (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info); i++) {
22092 e = &rt->slots[i];
22093 #ifdef DEBUG
22094 {
22095 int j;
22096 DBG(KERN_DEBUG "%02x:%02x slot=%02x", e->bus, e->devfn/8, e->slot);
22097 - for(j=0; j<4; j++)
22098 + for (j = 0; j < 4; j++)
22099 DBG(" %d:%02x/%04x", j, e->irq[j].link, e->irq[j].bitmap);
22100 DBG("\n");
22101 }
22102 #endif
22103 busmap[e->bus] = 1;
22104 }
22105 - for(i = 1; i < 256; i++) {
22106 + for (i = 1; i < 256; i++) {
22107 int node;
22108 if (!busmap[i] || pci_find_bus(0, i))
22109 continue;
22110 @@ -187,7 +191,8 @@ static unsigned int read_config_nybble(s
22111 return (nr & 1) ? (x >> 4) : (x & 0xf);
22112 }
22113
22114 -static void write_config_nybble(struct pci_dev *router, unsigned offset, unsigned nr, unsigned int val)
22115 +static void write_config_nybble(struct pci_dev *router, unsigned offset,
22116 + unsigned nr, unsigned int val)
22117 {
22118 u8 x;
22119 unsigned reg = offset + (nr >> 1);
22120 @@ -289,7 +294,7 @@ static int pirq_ite_get(struct pci_dev *
22121 static const unsigned char pirqmap[4] = { 1, 0, 2, 3 };
22122
22123 WARN_ON_ONCE(pirq > 4);
22124 - return read_config_nybble(router,0x43, pirqmap[pirq-1]);
22125 + return read_config_nybble(router, 0x43, pirqmap[pirq-1]);
22126 }
22127
22128 static int pirq_ite_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
22129 @@ -318,7 +323,7 @@ static int pirq_opti_set(struct pci_dev
22130
22131 /*
22132 * Cyrix: nibble offset 0x5C
22133 - * 0x5C bits 7:4 is INTB bits 3:0 is INTA
22134 + * 0x5C bits 7:4 is INTB bits 3:0 is INTA
22135 * 0x5D bits 7:4 is INTD bits 3:0 is INTC
22136 */
22137 static int pirq_cyrix_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
22138 @@ -354,7 +359,7 @@ static int pirq_cyrix_set(struct pci_dev
22139 * Apparently there are systems implementing PCI routing table using
22140 * link values 0x01-0x04 and others using 0x41-0x44 for PCI INTA..D.
22141 * We try our best to handle both link mappings.
22142 - *
22143 + *
22144 * Currently (2003-05-21) it appears most SiS chipsets follow the
22145 * definition of routing registers from the SiS-5595 southbridge.
22146 * According to the SiS 5595 datasheets the revision id's of the
22147 @@ -374,7 +379,7 @@ static int pirq_cyrix_set(struct pci_dev
22148 *
22149 * 0x62: USBIRQ:
22150 * bit 6 OHCI function disabled (0), enabled (1)
22151 - *
22152 + *
22153 * 0x6a: ACPI/SCI IRQ: bits 4-6 reserved
22154 *
22155 * 0x7e: Data Acq. Module IRQ - bits 4-6 reserved
22156 @@ -437,7 +442,7 @@ static int pirq_vlsi_get(struct pci_dev
22157 {
22158 WARN_ON_ONCE(pirq >= 9);
22159 if (pirq > 8) {
22160 - printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
22161 + dev_info(&dev->dev, "VLSI router PIRQ escape (%d)\n", pirq);
22162 return 0;
22163 }
22164 return read_config_nybble(router, 0x74, pirq-1);
22165 @@ -447,7 +452,7 @@ static int pirq_vlsi_set(struct pci_dev
22166 {
22167 WARN_ON_ONCE(pirq >= 9);
22168 if (pirq > 8) {
22169 - printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
22170 + dev_info(&dev->dev, "VLSI router PIRQ escape (%d)\n", pirq);
22171 return 0;
22172 }
22173 write_config_nybble(router, 0x74, pirq-1, irq);
22174 @@ -471,7 +476,8 @@ static int pirq_serverworks_get(struct p
22175 return inb(0xc01) & 0xf;
22176 }
22177
22178 -static int pirq_serverworks_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
22179 +static int pirq_serverworks_set(struct pci_dev *router, struct pci_dev *dev,
22180 + int pirq, int irq)
22181 {
22182 outb(pirq, 0xc00);
22183 outb(irq, 0xc01);
22184 @@ -491,22 +497,20 @@ static int pirq_amd756_get(struct pci_de
22185 u8 irq;
22186 irq = 0;
22187 if (pirq <= 4)
22188 - {
22189 irq = read_config_nybble(router, 0x56, pirq - 1);
22190 - }
22191 - printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d get irq : %2d\n",
22192 - dev->vendor, dev->device, pirq, irq);
22193 + dev_info(&dev->dev,
22194 + "AMD756: dev [%04x/%04x], router PIRQ %d get IRQ %d\n",
22195 + dev->vendor, dev->device, pirq, irq);
22196 return irq;
22197 }
22198
22199 static int pirq_amd756_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
22200 {
22201 - printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d SET irq : %2d\n",
22202 - dev->vendor, dev->device, pirq, irq);
22203 + dev_info(&dev->dev,
22204 + "AMD756: dev [%04x/%04x], router PIRQ %d set IRQ %d\n",
22205 + dev->vendor, dev->device, pirq, irq);
22206 if (pirq <= 4)
22207 - {
22208 write_config_nybble(router, 0x56, pirq - 1, irq);
22209 - }
22210 return 1;
22211 }
22212
22213 @@ -553,50 +557,51 @@ static __init int intel_router_probe(str
22214 if (pci_dev_present(pirq_440gx))
22215 return 0;
22216
22217 - switch(device)
22218 - {
22219 - case PCI_DEVICE_ID_INTEL_82371FB_0:
22220 - case PCI_DEVICE_ID_INTEL_82371SB_0:
22221 - case PCI_DEVICE_ID_INTEL_82371AB_0:
22222 - case PCI_DEVICE_ID_INTEL_82371MX:
22223 - case PCI_DEVICE_ID_INTEL_82443MX_0:
22224 - case PCI_DEVICE_ID_INTEL_82801AA_0:
22225 - case PCI_DEVICE_ID_INTEL_82801AB_0:
22226 - case PCI_DEVICE_ID_INTEL_82801BA_0:
22227 - case PCI_DEVICE_ID_INTEL_82801BA_10:
22228 - case PCI_DEVICE_ID_INTEL_82801CA_0:
22229 - case PCI_DEVICE_ID_INTEL_82801CA_12:
22230 - case PCI_DEVICE_ID_INTEL_82801DB_0:
22231 - case PCI_DEVICE_ID_INTEL_82801E_0:
22232 - case PCI_DEVICE_ID_INTEL_82801EB_0:
22233 - case PCI_DEVICE_ID_INTEL_ESB_1:
22234 - case PCI_DEVICE_ID_INTEL_ICH6_0:
22235 - case PCI_DEVICE_ID_INTEL_ICH6_1:
22236 - case PCI_DEVICE_ID_INTEL_ICH7_0:
22237 - case PCI_DEVICE_ID_INTEL_ICH7_1:
22238 - case PCI_DEVICE_ID_INTEL_ICH7_30:
22239 - case PCI_DEVICE_ID_INTEL_ICH7_31:
22240 - case PCI_DEVICE_ID_INTEL_ESB2_0:
22241 - case PCI_DEVICE_ID_INTEL_ICH8_0:
22242 - case PCI_DEVICE_ID_INTEL_ICH8_1:
22243 - case PCI_DEVICE_ID_INTEL_ICH8_2:
22244 - case PCI_DEVICE_ID_INTEL_ICH8_3:
22245 - case PCI_DEVICE_ID_INTEL_ICH8_4:
22246 - case PCI_DEVICE_ID_INTEL_ICH9_0:
22247 - case PCI_DEVICE_ID_INTEL_ICH9_1:
22248 - case PCI_DEVICE_ID_INTEL_ICH9_2:
22249 - case PCI_DEVICE_ID_INTEL_ICH9_3:
22250 - case PCI_DEVICE_ID_INTEL_ICH9_4:
22251 - case PCI_DEVICE_ID_INTEL_ICH9_5:
22252 - case PCI_DEVICE_ID_INTEL_TOLAPAI_0:
22253 - case PCI_DEVICE_ID_INTEL_ICH10_0:
22254 - case PCI_DEVICE_ID_INTEL_ICH10_1:
22255 - case PCI_DEVICE_ID_INTEL_ICH10_2:
22256 - case PCI_DEVICE_ID_INTEL_ICH10_3:
22257 - r->name = "PIIX/ICH";
22258 - r->get = pirq_piix_get;
22259 - r->set = pirq_piix_set;
22260 - return 1;
22261 + switch (device) {
22262 + case PCI_DEVICE_ID_INTEL_82371FB_0:
22263 + case PCI_DEVICE_ID_INTEL_82371SB_0:
22264 + case PCI_DEVICE_ID_INTEL_82371AB_0:
22265 + case PCI_DEVICE_ID_INTEL_82371MX:
22266 + case PCI_DEVICE_ID_INTEL_82443MX_0:
22267 + case PCI_DEVICE_ID_INTEL_82801AA_0:
22268 + case PCI_DEVICE_ID_INTEL_82801AB_0:
22269 + case PCI_DEVICE_ID_INTEL_82801BA_0:
22270 + case PCI_DEVICE_ID_INTEL_82801BA_10:
22271 + case PCI_DEVICE_ID_INTEL_82801CA_0:
22272 + case PCI_DEVICE_ID_INTEL_82801CA_12:
22273 + case PCI_DEVICE_ID_INTEL_82801DB_0:
22274 + case PCI_DEVICE_ID_INTEL_82801E_0:
22275 + case PCI_DEVICE_ID_INTEL_82801EB_0:
22276 + case PCI_DEVICE_ID_INTEL_ESB_1:
22277 + case PCI_DEVICE_ID_INTEL_ICH6_0:
22278 + case PCI_DEVICE_ID_INTEL_ICH6_1:
22279 + case PCI_DEVICE_ID_INTEL_ICH7_0:
22280 + case PCI_DEVICE_ID_INTEL_ICH7_1:
22281 + case PCI_DEVICE_ID_INTEL_ICH7_30:
22282 + case PCI_DEVICE_ID_INTEL_ICH7_31:
22283 + case PCI_DEVICE_ID_INTEL_ESB2_0:
22284 + case PCI_DEVICE_ID_INTEL_ICH8_0:
22285 + case PCI_DEVICE_ID_INTEL_ICH8_1:
22286 + case PCI_DEVICE_ID_INTEL_ICH8_2:
22287 + case PCI_DEVICE_ID_INTEL_ICH8_3:
22288 + case PCI_DEVICE_ID_INTEL_ICH8_4:
22289 + case PCI_DEVICE_ID_INTEL_ICH9_0:
22290 + case PCI_DEVICE_ID_INTEL_ICH9_1:
22291 + case PCI_DEVICE_ID_INTEL_ICH9_2:
22292 + case PCI_DEVICE_ID_INTEL_ICH9_3:
22293 + case PCI_DEVICE_ID_INTEL_ICH9_4:
22294 + case PCI_DEVICE_ID_INTEL_ICH9_5:
22295 + case PCI_DEVICE_ID_INTEL_TOLAPAI_0:
22296 + case PCI_DEVICE_ID_INTEL_ICH10_0:
22297 + case PCI_DEVICE_ID_INTEL_ICH10_1:
22298 + case PCI_DEVICE_ID_INTEL_ICH10_2:
22299 + case PCI_DEVICE_ID_INTEL_ICH10_3:
22300 + case PCI_DEVICE_ID_INTEL_PCH_0:
22301 + case PCI_DEVICE_ID_INTEL_PCH_1:
22302 + r->name = "PIIX/ICH";
22303 + r->get = pirq_piix_get;
22304 + r->set = pirq_piix_set;
22305 + return 1;
22306 }
22307 return 0;
22308 }
22309 @@ -610,7 +615,7 @@ static __init int via_router_probe(struc
22310 * workarounds for some buggy BIOSes
22311 */
22312 if (device == PCI_DEVICE_ID_VIA_82C586_0) {
22313 - switch(router->device) {
22314 + switch (router->device) {
22315 case PCI_DEVICE_ID_VIA_82C686:
22316 /*
22317 * Asus k7m bios wrongly reports 82C686A
22318 @@ -635,7 +640,7 @@ static __init int via_router_probe(struc
22319 }
22320 }
22321
22322 - switch(device) {
22323 + switch (device) {
22324 case PCI_DEVICE_ID_VIA_82C586_0:
22325 r->name = "VIA";
22326 r->get = pirq_via586_get;
22327 @@ -658,28 +663,27 @@ static __init int via_router_probe(struc
22328
22329 static __init int vlsi_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
22330 {
22331 - switch(device)
22332 - {
22333 - case PCI_DEVICE_ID_VLSI_82C534:
22334 - r->name = "VLSI 82C534";
22335 - r->get = pirq_vlsi_get;
22336 - r->set = pirq_vlsi_set;
22337 - return 1;
22338 + switch (device) {
22339 + case PCI_DEVICE_ID_VLSI_82C534:
22340 + r->name = "VLSI 82C534";
22341 + r->get = pirq_vlsi_get;
22342 + r->set = pirq_vlsi_set;
22343 + return 1;
22344 }
22345 return 0;
22346 }
22347
22348
22349 -static __init int serverworks_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
22350 +static __init int serverworks_router_probe(struct irq_router *r,
22351 + struct pci_dev *router, u16 device)
22352 {
22353 - switch(device)
22354 - {
22355 - case PCI_DEVICE_ID_SERVERWORKS_OSB4:
22356 - case PCI_DEVICE_ID_SERVERWORKS_CSB5:
22357 - r->name = "ServerWorks";
22358 - r->get = pirq_serverworks_get;
22359 - r->set = pirq_serverworks_set;
22360 - return 1;
22361 + switch (device) {
22362 + case PCI_DEVICE_ID_SERVERWORKS_OSB4:
22363 + case PCI_DEVICE_ID_SERVERWORKS_CSB5:
22364 + r->name = "ServerWorks";
22365 + r->get = pirq_serverworks_get;
22366 + r->set = pirq_serverworks_set;
22367 + return 1;
22368 }
22369 return 0;
22370 }
22371 @@ -688,7 +692,7 @@ static __init int sis_router_probe(struc
22372 {
22373 if (device != PCI_DEVICE_ID_SI_503)
22374 return 0;
22375 -
22376 +
22377 r->name = "SIS";
22378 r->get = pirq_sis_get;
22379 r->set = pirq_sis_set;
22380 @@ -697,50 +701,45 @@ static __init int sis_router_probe(struc
22381
22382 static __init int cyrix_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
22383 {
22384 - switch(device)
22385 - {
22386 - case PCI_DEVICE_ID_CYRIX_5520:
22387 - r->name = "NatSemi";
22388 - r->get = pirq_cyrix_get;
22389 - r->set = pirq_cyrix_set;
22390 - return 1;
22391 + switch (device) {
22392 + case PCI_DEVICE_ID_CYRIX_5520:
22393 + r->name = "NatSemi";
22394 + r->get = pirq_cyrix_get;
22395 + r->set = pirq_cyrix_set;
22396 + return 1;
22397 }
22398 return 0;
22399 }
22400
22401 static __init int opti_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
22402 {
22403 - switch(device)
22404 - {
22405 - case PCI_DEVICE_ID_OPTI_82C700:
22406 - r->name = "OPTI";
22407 - r->get = pirq_opti_get;
22408 - r->set = pirq_opti_set;
22409 - return 1;
22410 + switch (device) {
22411 + case PCI_DEVICE_ID_OPTI_82C700:
22412 + r->name = "OPTI";
22413 + r->get = pirq_opti_get;
22414 + r->set = pirq_opti_set;
22415 + return 1;
22416 }
22417 return 0;
22418 }
22419
22420 static __init int ite_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
22421 {
22422 - switch(device)
22423 - {
22424 - case PCI_DEVICE_ID_ITE_IT8330G_0:
22425 - r->name = "ITE";
22426 - r->get = pirq_ite_get;
22427 - r->set = pirq_ite_set;
22428 - return 1;
22429 + switch (device) {
22430 + case PCI_DEVICE_ID_ITE_IT8330G_0:
22431 + r->name = "ITE";
22432 + r->get = pirq_ite_get;
22433 + r->set = pirq_ite_set;
22434 + return 1;
22435 }
22436 return 0;
22437 }
22438
22439 static __init int ali_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
22440 {
22441 - switch(device)
22442 - {
22443 + switch (device) {
22444 case PCI_DEVICE_ID_AL_M1533:
22445 case PCI_DEVICE_ID_AL_M1563:
22446 - printk(KERN_DEBUG "PCI: Using ALI IRQ Router\n");
22447 r->name = "ALI";
22448 r->get = pirq_ali_get;
22449 r->set = pirq_ali_set;
22450 @@ -751,25 +750,24 @@ static __init int ali_router_probe(struc
22451
22452 static __init int amd_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
22453 {
22454 - switch(device)
22455 - {
22456 - case PCI_DEVICE_ID_AMD_VIPER_740B:
22457 - r->name = "AMD756";
22458 - break;
22459 - case PCI_DEVICE_ID_AMD_VIPER_7413:
22460 - r->name = "AMD766";
22461 - break;
22462 - case PCI_DEVICE_ID_AMD_VIPER_7443:
22463 - r->name = "AMD768";
22464 - break;
22465 - default:
22466 - return 0;
22467 + switch (device) {
22468 + case PCI_DEVICE_ID_AMD_VIPER_740B:
22469 + r->name = "AMD756";
22470 + break;
22471 + case PCI_DEVICE_ID_AMD_VIPER_7413:
22472 + r->name = "AMD766";
22473 + break;
22474 + case PCI_DEVICE_ID_AMD_VIPER_7443:
22475 + r->name = "AMD768";
22476 + break;
22477 + default:
22478 + return 0;
22479 }
22480 r->get = pirq_amd756_get;
22481 r->set = pirq_amd756_set;
22482 return 1;
22483 }
22484 -
22485 +
22486 static __init int pico_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
22487 {
22488 switch (device) {
22489 @@ -811,7 +809,7 @@ static struct pci_dev *pirq_router_dev;
22490 * FIXME: should we have an option to say "generic for
22491 * chipset" ?
22492 */
22493 -
22494 +
22495 static void __init pirq_find_router(struct irq_router *r)
22496 {
22497 struct irq_routing_table *rt = pirq_table;
22498 @@ -830,7 +828,7 @@ static void __init pirq_find_router(stru
22499 r->name = "default";
22500 r->get = NULL;
22501 r->set = NULL;
22502 -
22503 +
22504 DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for %04x:%04x\n",
22505 rt->rtr_vendor, rt->rtr_device);
22506
22507 @@ -841,19 +839,19 @@ static void __init pirq_find_router(stru
22508 return;
22509 }
22510
22511 - for( h = pirq_routers; h->vendor; h++) {
22512 + for (h = pirq_routers; h->vendor; h++) {
22513 /* First look for a router match */
22514 - if (rt->rtr_vendor == h->vendor && h->probe(r, pirq_router_dev, rt->rtr_device))
22515 + if (rt->rtr_vendor == h->vendor &&
22516 + h->probe(r, pirq_router_dev, rt->rtr_device))
22517 break;
22518 /* Fall back to a device match */
22519 - if (pirq_router_dev->vendor == h->vendor && h->probe(r, pirq_router_dev, pirq_router_dev->device))
22520 + if (pirq_router_dev->vendor == h->vendor &&
22521 + h->probe(r, pirq_router_dev, pirq_router_dev->device))
22522 break;
22523 }
22524 - printk(KERN_INFO "PCI: Using IRQ router %s [%04x/%04x] at %s\n",
22525 - pirq_router.name,
22526 - pirq_router_dev->vendor,
22527 - pirq_router_dev->device,
22528 - pci_name(pirq_router_dev));
22529 + dev_info(&pirq_router_dev->dev, "%s IRQ router [%04x/%04x]\n",
22530 + pirq_router.name,
22531 + pirq_router_dev->vendor, pirq_router_dev->device);
22532
22533 /* The device remains referenced for the kernel lifetime */
22534 }
22535 @@ -861,11 +859,13 @@ static void __init pirq_find_router(stru
22536 static struct irq_info *pirq_get_info(struct pci_dev *dev)
22537 {
22538 struct irq_routing_table *rt = pirq_table;
22539 - int entries = (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info);
22540 + int entries = (rt->size - sizeof(struct irq_routing_table)) /
22541 + sizeof(struct irq_info);
22542 struct irq_info *info;
22543
22544 for (info = rt->slots; entries--; info++)
22545 - if (info->bus == dev->bus->number && PCI_SLOT(info->devfn) == PCI_SLOT(dev->devfn))
22546 + if (info->bus == dev->bus->number &&
22547 + PCI_SLOT(info->devfn) == PCI_SLOT(dev->devfn))
22548 return info;
22549 return NULL;
22550 }
22551 @@ -884,7 +884,7 @@ static int pcibios_lookup_irq(struct pci
22552 /* Find IRQ pin */
22553 pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
22554 if (!pin) {
22555 - DBG(KERN_DEBUG " -> no interrupt pin\n");
22556 + dev_dbg(&dev->dev, "no interrupt pin\n");
22557 return 0;
22558 }
22559 pin = pin - 1;
22560 @@ -893,20 +893,21 @@ static int pcibios_lookup_irq(struct pci
22561
22562 if (!pirq_table)
22563 return 0;
22564 -
22565 - DBG(KERN_DEBUG "IRQ for %s[%c]", pci_name(dev), 'A' + pin);
22566 +
22567 info = pirq_get_info(dev);
22568 if (!info) {
22569 - DBG(" -> not found in routing table\n" KERN_DEBUG);
22570 + dev_dbg(&dev->dev, "PCI INT %c not found in routing table\n",
22571 + 'A' + pin);
22572 return 0;
22573 }
22574 pirq = info->irq[pin].link;
22575 mask = info->irq[pin].bitmap;
22576 if (!pirq) {
22577 - DBG(" -> not routed\n" KERN_DEBUG);
22578 + dev_dbg(&dev->dev, "PCI INT %c not routed\n", 'A' + pin);
22579 return 0;
22580 }
22581 - DBG(" -> PIRQ %02x, mask %04x, excl %04x", pirq, mask, pirq_table->exclusive_irqs);
22582 + dev_dbg(&dev->dev, "PCI INT %c -> PIRQ %02x, mask %04x, excl %04x",
22583 + 'A' + pin, pirq, mask, pirq_table->exclusive_irqs);
22584 mask &= pcibios_irq_mask;
22585
22586 /* Work around broken HP Pavilion Notebooks which assign USB to
22587 @@ -919,7 +920,8 @@ static int pcibios_lookup_irq(struct pci
22588 }
22589
22590 /* same for Acer Travelmate 360, but with CB and irq 11 -> 10 */
22591 - if (acer_tm360_irqrouting && dev->irq == 11 && dev->vendor == PCI_VENDOR_ID_O2) {
22592 + if (acer_tm360_irqrouting && dev->irq == 11 &&
22593 + dev->vendor == PCI_VENDOR_ID_O2) {
22594 pirq = 0x68;
22595 mask = 0x400;
22596 dev->irq = r->get(pirq_router_dev, dev, pirq);
22597 @@ -932,51 +934,50 @@ static int pcibios_lookup_irq(struct pci
22598 */
22599 newirq = dev->irq;
22600 if (newirq && !((1 << newirq) & mask)) {
22601 - if ( pci_probe & PCI_USE_PIRQ_MASK) newirq = 0;
22602 - else printk("\n" KERN_WARNING
22603 - "PCI: IRQ %i for device %s doesn't match PIRQ mask "
22604 - "- try pci=usepirqmask\n" KERN_DEBUG, newirq,
22605 - pci_name(dev));
22606 + if (pci_probe & PCI_USE_PIRQ_MASK)
22607 + newirq = 0;
22608 + else
22609 + dev_warn(&dev->dev, "IRQ %d doesn't match PIRQ mask "
22610 + "%#x; try pci=usepirqmask\n", newirq, mask);
22611 }
22612 if (!newirq && assign) {
22613 for (i = 0; i < 16; i++) {
22614 if (!(mask & (1 << i)))
22615 continue;
22616 - if (pirq_penalty[i] < pirq_penalty[newirq] && can_request_irq(i, IRQF_SHARED))
22617 + if (pirq_penalty[i] < pirq_penalty[newirq] &&
22618 + can_request_irq(i, IRQF_SHARED))
22619 newirq = i;
22620 }
22621 }
22622 - DBG(" -> newirq=%d", newirq);
22623 + dev_dbg(&dev->dev, "PCI INT %c -> newirq %d", 'A' + pin, newirq);
22624
22625 /* Check if it is hardcoded */
22626 if ((pirq & 0xf0) == 0xf0) {
22627 irq = pirq & 0xf;
22628 - DBG(" -> hardcoded IRQ %d\n", irq);
22629 - msg = "Hardcoded";
22630 - } else if ( r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \
22631 - ((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask)) ) {
22632 - DBG(" -> got IRQ %d\n", irq);
22633 - msg = "Found";
22634 + msg = "hardcoded";
22635 + } else if (r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \
22636 + ((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask))) {
22637 + msg = "found";
22638 eisa_set_level_irq(irq);
22639 - } else if (newirq && r->set && (dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) {
22640 - DBG(" -> assigning IRQ %d", newirq);
22641 + } else if (newirq && r->set &&
22642 + (dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) {
22643 if (r->set(pirq_router_dev, dev, pirq, newirq)) {
22644 eisa_set_level_irq(newirq);
22645 - DBG(" ... OK\n");
22646 - msg = "Assigned";
22647 + msg = "assigned";
22648 irq = newirq;
22649 }
22650 }
22651
22652 if (!irq) {
22653 - DBG(" ... failed\n");
22654 if (newirq && mask == (1 << newirq)) {
22655 - msg = "Guessed";
22656 + msg = "guessed";
22657 irq = newirq;
22658 - } else
22659 + } else {
22660 + dev_dbg(&dev->dev, "can't route interrupt\n");
22661 return 0;
22662 + }
22663 }
22664 - printk(KERN_INFO "PCI: %s IRQ %d for device %s\n", msg, irq, pci_name(dev));
22665 + dev_info(&dev->dev, "%s PCI INT %c -> IRQ %d\n", msg, 'A' + pin, irq);
22666
22667 /* Update IRQ for all devices with the same pirq value */
22668 while ((dev2 = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev2)) != NULL) {
22669 @@ -988,20 +989,25 @@ static int pcibios_lookup_irq(struct pci
22670 if (!info)
22671 continue;
22672 if (info->irq[pin].link == pirq) {
22673 - /* We refuse to override the dev->irq information. Give a warning! */
22674 - if ( dev2->irq && dev2->irq != irq && \
22675 + /*
22676 + * We refuse to override the dev->irq
22677 + * information. Give a warning!
22678 + */
22679 + if (dev2->irq && dev2->irq != irq && \
22680 (!(pci_probe & PCI_USE_PIRQ_MASK) || \
22681 - ((1 << dev2->irq) & mask)) ) {
22682 + ((1 << dev2->irq) & mask))) {
22683 #ifndef CONFIG_PCI_MSI
22684 - printk(KERN_INFO "IRQ routing conflict for %s, have irq %d, want irq %d\n",
22685 - pci_name(dev2), dev2->irq, irq);
22686 + dev_info(&dev2->dev, "IRQ routing conflict: "
22687 + "have IRQ %d, want IRQ %d\n",
22688 + dev2->irq, irq);
22689 #endif
22690 - continue;
22691 - }
22692 + continue;
22693 + }
22694 dev2->irq = irq;
22695 pirq_penalty[irq]++;
22696 if (dev != dev2)
22697 - printk(KERN_INFO "PCI: Sharing IRQ %d with %s\n", irq, pci_name(dev2));
22698 + dev_info(&dev->dev, "sharing IRQ %d with %s\n",
22699 + irq, pci_name(dev2));
22700 }
22701 }
22702 return 1;
22703 @@ -1015,15 +1021,20 @@ static void __init pcibios_fixup_irqs(vo
22704 DBG(KERN_DEBUG "PCI: IRQ fixup\n");
22705 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
22706 /*
22707 - * If the BIOS has set an out of range IRQ number, just ignore it.
22708 - * Also keep track of which IRQ's are already in use.
22709 + * If the BIOS has set an out of range IRQ number, just
22710 + * ignore it. Also keep track of which IRQ's are
22711 + * already in use.
22712 */
22713 if (dev->irq >= 16) {
22714 - DBG(KERN_DEBUG "%s: ignoring bogus IRQ %d\n", pci_name(dev), dev->irq);
22715 + dev_dbg(&dev->dev, "ignoring bogus IRQ %d\n", dev->irq);
22716 dev->irq = 0;
22717 }
22718 - /* If the IRQ is already assigned to a PCI device, ignore its ISA use penalty */
22719 - if (pirq_penalty[dev->irq] >= 100 && pirq_penalty[dev->irq] < 100000)
22720 + /*
22721 + * If the IRQ is already assigned to a PCI device,
22722 + * ignore its ISA use penalty
22723 + */
22724 + if (pirq_penalty[dev->irq] >= 100 &&
22725 + pirq_penalty[dev->irq] < 100000)
22726 pirq_penalty[dev->irq] = 0;
22727 pirq_penalty[dev->irq]++;
22728 }
22729 @@ -1035,13 +1046,17 @@ static void __init pcibios_fixup_irqs(vo
22730 /*
22731 * Recalculate IRQ numbers if we use the I/O APIC.
22732 */
22733 - if (io_apic_assign_pci_irqs)
22734 - {
22735 + if (io_apic_assign_pci_irqs) {
22736 int irq;
22737
22738 if (pin) {
22739 - pin--; /* interrupt pins are numbered starting from 1 */
22740 - irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin);
22741 + /*
22742 + * interrupt pins are numbered starting
22743 + * from 1
22744 + */
22745 + pin--;
22746 + irq = IO_APIC_get_PCI_irq_vector(dev->bus->number,
22747 + PCI_SLOT(dev->devfn), pin);
22748 /*
22749 * Busses behind bridges are typically not listed in the MP-table.
22750 * In this case we have to look up the IRQ based on the parent bus,
22751 @@ -1049,18 +1064,18 @@ static void __init pcibios_fixup_irqs(vo
22752 * busses itself so we should get into this branch reliably.
22753 */
22754 if (irq < 0 && dev->bus->parent) { /* go back to the bridge */
22755 - struct pci_dev * bridge = dev->bus->self;
22756 + struct pci_dev *bridge = dev->bus->self;
22757
22758 pin = (pin + PCI_SLOT(dev->devfn)) % 4;
22759 - irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
22760 + irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
22761 PCI_SLOT(bridge->devfn), pin);
22762 if (irq >= 0)
22763 - printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n",
22764 - pci_name(bridge), 'A' + pin, irq);
22765 + dev_warn(&dev->dev, "using bridge %s INT %c to get IRQ %d\n",
22766 + pci_name(bridge),
22767 + 'A' + pin, irq);
22768 }
22769 if (irq >= 0) {
22770 - printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n",
22771 - pci_name(dev), 'A' + pin, irq);
22772 + dev_info(&dev->dev, "PCI->APIC IRQ transform: INT %c -> IRQ %d\n", 'A' + pin, irq);
22773 dev->irq = irq;
22774 }
22775 }
22776 @@ -1082,7 +1097,8 @@ static int __init fix_broken_hp_bios_irq
22777 {
22778 if (!broken_hp_bios_irq9) {
22779 broken_hp_bios_irq9 = 1;
22780 - printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", d->ident);
22781 + printk(KERN_INFO "%s detected - fixing broken IRQ routing\n",
22782 + d->ident);
22783 }
22784 return 0;
22785 }
22786 @@ -1095,7 +1111,8 @@ static int __init fix_acer_tm360_irqrout
22787 {
22788 if (!acer_tm360_irqrouting) {
22789 acer_tm360_irqrouting = 1;
22790 - printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", d->ident);
22791 + printk(KERN_INFO "%s detected - fixing broken IRQ routing\n",
22792 + d->ident);
22793 }
22794 return 0;
22795 }
22796 @@ -1107,7 +1124,8 @@ static struct dmi_system_id __initdata p
22797 .matches = {
22798 DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
22799 DMI_MATCH(DMI_BIOS_VERSION, "GE.M1.03"),
22800 - DMI_MATCH(DMI_PRODUCT_VERSION, "HP Pavilion Notebook Model GE"),
22801 + DMI_MATCH(DMI_PRODUCT_VERSION,
22802 + "HP Pavilion Notebook Model GE"),
22803 DMI_MATCH(DMI_BOARD_VERSION, "OmniBook N32N-736"),
22804 },
22805 },
22806 @@ -1122,7 +1140,7 @@ static struct dmi_system_id __initdata p
22807 { }
22808 };
22809
22810 -static int __init pcibios_irq_init(void)
22811 +int __init pcibios_irq_init(void)
22812 {
22813 DBG(KERN_DEBUG "PCI: IRQ init\n");
22814
22815 @@ -1142,11 +1160,14 @@ static int __init pcibios_irq_init(void)
22816 pirq_find_router(&pirq_router);
22817 if (pirq_table->exclusive_irqs) {
22818 int i;
22819 - for (i=0; i<16; i++)
22820 + for (i = 0; i < 16; i++)
22821 if (!(pirq_table->exclusive_irqs & (1 << i)))
22822 pirq_penalty[i] += 100;
22823 }
22824 - /* If we're using the I/O APIC, avoid using the PCI IRQ routing table */
22825 + /*
22826 + * If we're using the I/O APIC, avoid using the PCI IRQ
22827 + * routing table
22828 + */
22829 if (io_apic_assign_pci_irqs)
22830 pirq_table = NULL;
22831 }
22832 @@ -1157,9 +1178,6 @@ static int __init pcibios_irq_init(void)
22833 return 0;
22834 }
22835
22836 -subsys_initcall(pcibios_irq_init);
22837 -
22838 -
22839 static void pirq_penalize_isa_irq(int irq, int active)
22840 {
22841 /*
22842 @@ -1193,7 +1211,7 @@ static int pirq_enable_irq(struct pci_de
22843 if (pin && !pcibios_lookup_irq(dev, 1) && !dev->irq) {
22844 char *msg = "";
22845
22846 - pin--; /* interrupt pins are numbered starting from 1 */
22847 + pin--; /* interrupt pins are numbered starting from 1 */
22848
22849 if (io_apic_assign_pci_irqs) {
22850 int irq;
22851 @@ -1207,35 +1225,41 @@ static int pirq_enable_irq(struct pci_de
22852 */
22853 temp_dev = dev;
22854 while (irq < 0 && dev->bus->parent) { /* go back to the bridge */
22855 - struct pci_dev * bridge = dev->bus->self;
22856 + struct pci_dev *bridge = dev->bus->self;
22857
22858 pin = (pin + PCI_SLOT(dev->devfn)) % 4;
22859 - irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
22860 + irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
22861 PCI_SLOT(bridge->devfn), pin);
22862 if (irq >= 0)
22863 - printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n",
22864 - pci_name(bridge), 'A' + pin, irq);
22865 + dev_warn(&dev->dev, "using bridge %s "
22866 + "INT %c to get IRQ %d\n",
22867 + pci_name(bridge), 'A' + pin,
22868 + irq);
22869 dev = bridge;
22870 }
22871 dev = temp_dev;
22872 if (irq >= 0) {
22873 - printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n",
22874 - pci_name(dev), 'A' + pin, irq);
22875 + dev_info(&dev->dev, "PCI->APIC IRQ transform: "
22876 + "INT %c -> IRQ %d\n", 'A' + pin, irq);
22877 dev->irq = irq;
22878 return 0;
22879 } else
22880 - msg = " Probably buggy MP table.";
22881 + msg = "; probably buggy MP table";
22882 } else if (pci_probe & PCI_BIOS_IRQ_SCAN)
22883 msg = "";
22884 else
22885 - msg = " Please try using pci=biosirq.";
22886 + msg = "; please try using pci=biosirq";
22887
22888 - /* With IDE legacy devices the IRQ lookup failure is not a problem.. */
22889 - if (dev->class >> 8 == PCI_CLASS_STORAGE_IDE && !(dev->class & 0x5))
22890 + /*
22891 + * With IDE legacy devices the IRQ lookup failure is not
22892 + * a problem..
22893 + */
22894 + if (dev->class >> 8 == PCI_CLASS_STORAGE_IDE &&
22895 + !(dev->class & 0x5))
22896 return 0;
22897
22898 - printk(KERN_WARNING "PCI: No IRQ known for interrupt pin %c of device %s.%s\n",
22899 - 'A' + pin, pci_name(dev), msg);
22900 + dev_warn(&dev->dev, "can't find IRQ for PCI INT %c%s\n",
22901 + 'A' + pin, msg);
22902 }
22903 return 0;
22904 }
22905 --- sle11-2009-06-04.orig/arch/x86/vdso/Makefile 2009-03-16 16:33:40.000000000 +0100
22906 +++ sle11-2009-06-04/arch/x86/vdso/Makefile 2009-06-04 10:21:39.000000000 +0200
22907 @@ -65,9 +65,7 @@ obj-$(VDSO32-y) += vdso32-syms.lds
22908 vdso32.so-$(VDSO32-y) += int80
22909 vdso32.so-$(CONFIG_COMPAT) += syscall
22910 vdso32.so-$(VDSO32-y) += sysenter
22911 -xen-vdso32-$(subst 1,$(CONFIG_COMPAT),$(shell expr $(CONFIG_XEN_COMPAT)0 '<' 0x0302000)) += int80
22912 -xen-vdso32-$(CONFIG_X86_32) += syscall
22913 -vdso32.so-$(CONFIG_XEN) += $(xen-vdso32-y)
22914 +vdso32.so-$(CONFIG_X86_XEN) += syscall
22915
22916 vdso32-images = $(vdso32.so-y:%=vdso32-%.so)
22917
22918 --- sle11-2009-06-04.orig/arch/x86/vdso/vdso32.S 2009-03-16 16:33:40.000000000 +0100
22919 +++ sle11-2009-06-04/arch/x86/vdso/vdso32.S 2009-06-04 10:21:39.000000000 +0200
22920 @@ -9,7 +9,7 @@ vdso32_int80_end:
22921
22922 .globl vdso32_syscall_start, vdso32_syscall_end
22923 vdso32_syscall_start:
22924 -#ifdef CONFIG_COMPAT
22925 +#if defined(CONFIG_COMPAT) || defined(CONFIG_X86_XEN)
22926 .incbin "arch/x86/vdso/vdso32-syscall.so"
22927 #endif
22928 vdso32_syscall_end:
22929 @@ -19,16 +19,4 @@ vdso32_sysenter_start:
22930 .incbin "arch/x86/vdso/vdso32-sysenter.so"
22931 vdso32_sysenter_end:
22932
22933 -#if defined(CONFIG_X86_64_XEN) && CONFIG_XEN_COMPAT < 0x030200
22934 - .globl vdso32_int80_start, vdso32_int80_end
22935 -vdso32_int80_start:
22936 - .incbin "arch/x86/vdso/vdso32-int80.so"
22937 -vdso32_int80_end:
22938 -#elif defined(CONFIG_X86_XEN)
22939 - .globl vdso32_syscall_start, vdso32_syscall_end
22940 -vdso32_syscall_start:
22941 - .incbin "arch/x86/vdso/vdso32-syscall.so"
22942 -vdso32_syscall_end:
22943 -#endif
22944 -
22945 __FINIT
22946 --- sle11-2009-06-04.orig/arch/x86/vdso/vdso32-setup-xen.c 2009-03-16 16:38:05.000000000 +0100
22947 +++ sle11-2009-06-04/arch/x86/vdso/vdso32-setup-xen.c 2009-06-04 10:21:39.000000000 +0200
22948 @@ -195,50 +195,28 @@ static __init void relocate_vdso(Elf32_E
22949 }
22950 }
22951
22952 -/*
22953 - * These symbols are defined by vdso32.S to mark the bounds
22954 - * of the ELF DSO images included therein.
22955 - */
22956 -extern const char vdso32_default_start, vdso32_default_end;
22957 -extern const char vdso32_sysenter_start, vdso32_sysenter_end;
22958 static struct page *vdso32_pages[1];
22959
22960 #ifdef CONFIG_X86_64
22961
22962 -#if CONFIG_XEN_COMPAT < 0x030200
22963 -static int use_int80 = 1;
22964 -#endif
22965 -static int use_sysenter __read_mostly = -1;
22966 -
22967 -#define vdso32_sysenter() (use_sysenter > 0)
22968 +#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SYSENTER32))
22969 +#define vdso32_syscall() (boot_cpu_has(X86_FEATURE_SYSCALL32))
22970
22971 -/* May not be __init: called during resume */
22972 -void syscall32_cpu_init(void)
22973 +void __cpuinit syscall32_cpu_init(void)
22974 {
22975 - static const struct callback_register cstar = {
22976 + static const struct callback_register __cpuinitconst cstar = {
22977 .type = CALLBACKTYPE_syscall32,
22978 .address = (unsigned long)ia32_cstar_target
22979 };
22980 - static const struct callback_register sysenter = {
22981 + static const struct callback_register __cpuinitconst sysenter = {
22982 .type = CALLBACKTYPE_sysenter,
22983 .address = (unsigned long)ia32_sysenter_target
22984 };
22985
22986 - if ((HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) < 0) ||
22987 - (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) < 0))
22988 -#if CONFIG_XEN_COMPAT < 0x030200
22989 - return;
22990 - use_int80 = 0;
22991 -#else
22992 - BUG();
22993 -#endif
22994 -
22995 - if (use_sysenter < 0) {
22996 - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
22997 - use_sysenter = 1;
22998 - if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR)
22999 - use_sysenter = 1;
23000 - }
23001 + if (HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) < 0)
23002 + setup_clear_cpu_cap(X86_FEATURE_SYSENTER32);
23003 + if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) < 0)
23004 + setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
23005 }
23006
23007 #define compat_uses_vma 1
23008 @@ -250,6 +228,7 @@ static inline void map_compat_vdso(int m
23009 #else /* CONFIG_X86_32 */
23010
23011 #define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SEP))
23012 +#define vdso32_syscall() (boot_cpu_has(X86_FEATURE_SYSCALL32))
23013
23014 extern asmlinkage void ia32pv_cstar_target(void);
23015 static const struct callback_register __cpuinitconst cstar = {
23016 @@ -265,13 +244,13 @@ void __cpuinit enable_sep_cpu(void)
23017 .address = { __KERNEL_CS, (unsigned long)ia32pv_sysenter_target },
23018 };
23019
23020 - if (boot_cpu_has(X86_FEATURE_SYSCALL)) {
23021 + if (vdso32_syscall()) {
23022 if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) != 0)
23023 BUG();
23024 return;
23025 }
23026
23027 - if (!boot_cpu_has(X86_FEATURE_SEP))
23028 + if (!vdso32_sysenter())
23029 return;
23030
23031 if (xen_feature(XENFEAT_supervisor_mode_kernel))
23032 @@ -341,34 +320,26 @@ int __init sysenter_setup(void)
23033
23034 #ifdef CONFIG_X86_32
23035 gate_vma_init();
23036 -#endif
23037
23038 -#if defined(CONFIG_X86_64) && CONFIG_XEN_COMPAT < 0x030200
23039 - if (use_int80) {
23040 - extern const char vdso32_int80_start, vdso32_int80_end;
23041 -
23042 - vsyscall = &vdso32_int80_start;
23043 - vsyscall_len = &vdso32_int80_end - &vdso32_int80_start;
23044 - } else
23045 -#elif defined(CONFIG_X86_32)
23046 - if (boot_cpu_has(X86_FEATURE_SYSCALL)
23047 - && (boot_cpu_data.x86_vendor != X86_VENDOR_AMD
23048 - || HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) != 0))
23049 - setup_clear_cpu_cap(X86_FEATURE_SYSCALL);
23050 - barrier(); /* until clear_bit()'s constraints are correct ... */
23051 if (boot_cpu_has(X86_FEATURE_SYSCALL)) {
23052 - extern const char vdso32_syscall_start, vdso32_syscall_end;
23053 -
23054 + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD
23055 + && HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) == 0)
23056 + setup_force_cpu_cap(X86_FEATURE_SYSCALL32);
23057 + else {
23058 + setup_clear_cpu_cap(X86_FEATURE_SYSCALL);
23059 + setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
23060 + }
23061 + }
23062 +#endif
23063 + if (vdso32_syscall()) {
23064 vsyscall = &vdso32_syscall_start;
23065 vsyscall_len = &vdso32_syscall_end - &vdso32_syscall_start;
23066 - } else
23067 -#endif
23068 - if (!vdso32_sysenter()) {
23069 - vsyscall = &vdso32_default_start;
23070 - vsyscall_len = &vdso32_default_end - &vdso32_default_start;
23071 - } else {
23072 + } else if (vdso32_sysenter()){
23073 vsyscall = &vdso32_sysenter_start;
23074 vsyscall_len = &vdso32_sysenter_end - &vdso32_sysenter_start;
23075 + } else {
23076 + vsyscall = &vdso32_int80_start;
23077 + vsyscall_len = &vdso32_int80_end - &vdso32_int80_start;
23078 }
23079
23080 memcpy(syscall_page, vsyscall, vsyscall_len);
23081 --- sle11-2009-06-04.orig/arch/x86/xen/Kconfig 2009-02-16 16:17:21.000000000 +0100
23082 +++ sle11-2009-06-04/arch/x86/xen/Kconfig 2009-06-04 10:21:39.000000000 +0200
23083 @@ -17,7 +17,7 @@ config XEN_MAX_DOMAIN_MEMORY
23084 int "Maximum allowed size of a domain in gigabytes"
23085 default 8 if X86_32
23086 default 32 if X86_64
23087 - depends on XEN
23088 + depends on PARAVIRT_XEN
23089 help
23090 The pseudo-physical to machine address array is sized
23091 according to the maximum possible memory size of a Xen
23092 @@ -26,5 +26,5 @@ config XEN_MAX_DOMAIN_MEMORY
23093
23094 config XEN_SAVE_RESTORE
23095 bool
23096 - depends on PM
23097 + depends on PARAVIRT_XEN && PM
23098 default y
23099 \ No newline at end of file
23100 --- sle11-2009-06-04.orig/drivers/acpi/processor_core.c 2009-03-16 16:38:05.000000000 +0100
23101 +++ sle11-2009-06-04/drivers/acpi/processor_core.c 2009-06-04 10:21:39.000000000 +0200
23102 @@ -721,9 +721,11 @@ static int __cpuinit acpi_processor_star
23103 if (result)
23104 goto end;
23105
23106 - sysdev = get_cpu_sysdev(pr->id);
23107 - if (sysfs_create_link(&device->dev.kobj, &sysdev->kobj, "sysdev"))
23108 - return -EFAULT;
23109 + if (pr->id != -1) {
23110 + sysdev = get_cpu_sysdev(pr->id);
23111 + if (sysfs_create_link(&device->dev.kobj, &sysdev->kobj, "sysdev"))
23112 + return -EFAULT;
23113 + }
23114
23115 status = acpi_install_notify_handler(pr->handle, ACPI_DEVICE_NOTIFY,
23116 acpi_processor_notify, pr);
23117 @@ -895,7 +897,8 @@ static int acpi_processor_remove(struct
23118 status = acpi_remove_notify_handler(pr->handle, ACPI_DEVICE_NOTIFY,
23119 acpi_processor_notify);
23120
23121 - sysfs_remove_link(&device->dev.kobj, "sysdev");
23122 + if (pr->id != -1)
23123 + sysfs_remove_link(&device->dev.kobj, "sysdev");
23124
23125 acpi_processor_remove_fs(device);
23126
23127 --- sle11-2009-06-04.orig/drivers/char/tpm/tpm_vtpm.c 2009-02-16 15:58:14.000000000 +0100
23128 +++ sle11-2009-06-04/drivers/char/tpm/tpm_vtpm.c 2009-06-04 10:21:39.000000000 +0200
23129 @@ -347,7 +347,7 @@ static int _vtpm_send_queued(struct tpm_
23130 {
23131 int rc;
23132 int error = 0;
23133 - long flags;
23134 + unsigned long flags;
23135 unsigned char buffer[1];
23136 struct vtpm_state *vtpms;
23137 vtpms = (struct vtpm_state *)chip_get_private(chip);
23138 --- sle11-2009-06-04.orig/drivers/misc/Kconfig 2009-06-04 11:08:07.000000000 +0200
23139 +++ sle11-2009-06-04/drivers/misc/Kconfig 2009-06-04 10:21:39.000000000 +0200
23140 @@ -438,7 +438,7 @@ config ENCLOSURE_SERVICES
23141 config SGI_XP
23142 tristate "Support communication between SGI SSIs"
23143 depends on NET
23144 - depends on (IA64_GENERIC || IA64_SGI_SN2 || IA64_SGI_UV || X86_64) && SMP
23145 + depends on (IA64_GENERIC || IA64_SGI_SN2 || IA64_SGI_UV || X86_64) && SMP && !XEN
23146 select IA64_UNCACHED_ALLOCATOR if IA64_GENERIC || IA64_SGI_SN2
23147 select GENERIC_ALLOCATOR if IA64_GENERIC || IA64_SGI_SN2
23148 select SGI_GRU if (IA64_GENERIC || IA64_SGI_UV || X86_64) && SMP
23149 @@ -465,7 +465,7 @@ config HP_ILO
23150
23151 config SGI_GRU
23152 tristate "SGI GRU driver"
23153 - depends on (X86_64 || IA64_SGI_UV || IA64_GENERIC) && SMP
23154 + depends on (X86_64 || IA64_SGI_UV || IA64_GENERIC) && SMP && !XEN
23155 default n
23156 select MMU_NOTIFIER
23157 ---help---
23158 --- sle11-2009-06-04.orig/drivers/pci/msi-xen.c 2009-03-16 16:38:05.000000000 +0100
23159 +++ sle11-2009-06-04/drivers/pci/msi-xen.c 2009-06-04 10:21:39.000000000 +0200
23160 @@ -90,12 +90,10 @@ arch_teardown_msi_irqs(struct pci_dev *d
23161 }
23162 #endif
23163
23164 -static void msi_set_enable(struct pci_dev *dev, int enable)
23165 +static void __msi_set_enable(struct pci_dev *dev, int pos, int enable)
23166 {
23167 - int pos;
23168 u16 control;
23169
23170 - pos = pci_find_capability(dev, PCI_CAP_ID_MSI);
23171 if (pos) {
23172 pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &control);
23173 control &= ~PCI_MSI_FLAGS_ENABLE;
23174 @@ -105,6 +103,11 @@ static void msi_set_enable(struct pci_de
23175 }
23176 }
23177
23178 +static void msi_set_enable(struct pci_dev *dev, int enable)
23179 +{
23180 + __msi_set_enable(dev, pci_find_capability(dev, PCI_CAP_ID_MSI), enable);
23181 +}
23182 +
23183 static void msix_set_enable(struct pci_dev *dev, int enable)
23184 {
23185 int pos;
23186 @@ -568,9 +571,8 @@ int pci_enable_msi(struct pci_dev* dev)
23187
23188 /* Check whether driver already requested for MSI-X irqs */
23189 if (dev->msix_enabled) {
23190 - printk(KERN_INFO "PCI: %s: Can't enable MSI. "
23191 - "Device already has MSI-X enabled\n",
23192 - pci_name(dev));
23193 + dev_info(&dev->dev, "can't enable MSI "
23194 + "(MSI-X already enabled)\n");
23195 return -EINVAL;
23196 }
23197
23198 @@ -702,9 +704,8 @@ int pci_enable_msix(struct pci_dev* dev,
23199 temp = dev->irq;
23200 /* Check whether driver already requested for MSI vector */
23201 if (dev->msi_enabled) {
23202 - printk(KERN_INFO "PCI: %s: Can't enable MSI-X. "
23203 - "Device already has an MSI irq assigned\n",
23204 - pci_name(dev));
23205 + dev_info(&dev->dev, "can't enable MSI-X "
23206 + "(MSI IRQ already assigned)\n");
23207 return -EINVAL;
23208 }
23209
23210 --- sle11-2009-06-04.orig/drivers/pci/quirks.c 2009-06-04 11:08:07.000000000 +0200
23211 +++ sle11-2009-06-04/drivers/pci/quirks.c 2009-06-04 10:21:39.000000000 +0200
23212 @@ -44,9 +44,8 @@ static void __devinit quirk_release_reso
23213 /* PCI Host Bridge isn't a target device */
23214 return;
23215 }
23216 - printk(KERN_INFO
23217 - "PCI: Disable memory decoding and release memory resources [%s].\n",
23218 - pci_name(dev));
23219 + dev_info(&dev->dev,
23220 + "disable memory decoding and release memory resources\n");
23221 pci_read_config_word(dev, PCI_COMMAND, &command);
23222 command &= ~PCI_COMMAND_MEMORY;
23223 pci_write_config_word(dev, PCI_COMMAND, command);
23224 --- sle11-2009-06-04.orig/drivers/pci/setup-res.c 2009-06-04 11:08:07.000000000 +0200
23225 +++ sle11-2009-06-04/drivers/pci/setup-res.c 2009-06-04 10:21:39.000000000 +0200
23226 @@ -129,7 +129,7 @@ int pci_claim_resource(struct pci_dev *d
23227 #ifdef CONFIG_PCI_REASSIGN
23228 void pci_disable_bridge_window(struct pci_dev *dev)
23229 {
23230 - printk(KERN_DEBUG "PCI: Disable bridge window on %s\n", pci_name(dev));
23231 + dev_dbg(&dev->dev, "disable bridge window\n");
23232
23233 /* MMIO Base/Limit */
23234 pci_write_config_dword(dev, PCI_MEMORY_BASE, 0x0000fff0);
23235 @@ -190,8 +190,8 @@ int pci_assign_resource(struct pci_dev *
23236 res->flags &= ~IORESOURCE_STARTALIGN;
23237 if (resno < PCI_BRIDGE_RESOURCES) {
23238 #ifdef CONFIG_PCI_REASSIGN
23239 - printk(KERN_DEBUG "PCI: Assign resource(%d) on %s "
23240 - "%016llx - %016llx\n", resno, pci_name(dev),
23241 + dev_dbg(&dev->dev, "assign resource(%d) "
23242 + "%016llx - %016llx\n", resno,
23243 (unsigned long long)res->start,
23244 (unsigned long long)res->end);
23245 #endif
23246 @@ -235,8 +235,8 @@ int pci_assign_resource_fixed(struct pci
23247 (unsigned long long)res->end);
23248 } else if (resno < PCI_BRIDGE_RESOURCES) {
23249 #ifdef CONFIG_PCI_REASSIGN
23250 - printk(KERN_DEBUG "PCI: Assign resource(%d) on %s "
23251 - "%016llx - %016llx\n", resno, pci_name(dev),
23252 + dev_dbg(&dev->dev, "assign resource(%d) "
23253 + "%016llx - %016llx\n", resno,
23254 (unsigned long long)res->start,
23255 (unsigned long long)res->end);
23256 #endif
23257 --- sle11-2009-06-04.orig/drivers/xen/Makefile 2009-03-16 16:38:05.000000000 +0100
23258 +++ sle11-2009-06-04/drivers/xen/Makefile 2009-06-04 10:21:39.000000000 +0200
23259 @@ -1,4 +1,4 @@
23260 -obj-$(CONFIG_PARAVIRT_XEN) += grant-table.o features.o events.o
23261 +obj-$(CONFIG_PARAVIRT_XEN) += grant-table.o features.o events.o manage.o
23262 xen-xencomm-$(CONFIG_PARAVIRT_XEN) := xencomm.o
23263 xen-balloon-$(CONFIG_PARAVIRT_XEN) := balloon.o
23264
23265 --- sle11-2009-06-04.orig/drivers/xen/balloon/sysfs.c 2009-03-16 16:33:40.000000000 +0100
23266 +++ sle11-2009-06-04/drivers/xen/balloon/sysfs.c 2009-06-04 10:21:39.000000000 +0200
23267 @@ -45,6 +45,7 @@
23268
23269 #define BALLOON_SHOW(name, format, args...) \
23270 static ssize_t show_##name(struct sys_device *dev, \
23271 + struct sysdev_attribute *attr, \
23272 char *buf) \
23273 { \
23274 return sprintf(buf, format, ##args); \
23275 @@ -59,14 +60,15 @@ BALLOON_SHOW(hard_limit_kb,
23276 (bs.hard_limit!=~0UL) ? PAGES2KB(bs.hard_limit) : 0);
23277 BALLOON_SHOW(driver_kb, "%lu\n", PAGES2KB(bs.driver_pages));
23278
23279 -static ssize_t show_target_kb(struct sys_device *dev, char *buf)
23280 +static ssize_t show_target_kb(struct sys_device *dev,
23281 + struct sysdev_attribute *attr, char *buf)
23282 {
23283 return sprintf(buf, "%lu\n", PAGES2KB(bs.target_pages));
23284 }
23285
23286 static ssize_t store_target_kb(struct sys_device *dev,
23287 - const char *buf,
23288 - size_t count)
23289 + struct sysdev_attribute *attr,
23290 + const char *buf, size_t count)
23291 {
23292 char memstring[64], *endchar;
23293 unsigned long long target_bytes;
23294 --- sle11-2009-06-04.orig/drivers/xen/blktap/blktap.c 2009-04-20 11:40:14.000000000 +0200
23295 +++ sle11-2009-06-04/drivers/xen/blktap/blktap.c 2009-06-04 10:21:39.000000000 +0200
23296 @@ -54,6 +54,7 @@
23297 #include <linux/gfp.h>
23298 #include <linux/poll.h>
23299 #include <linux/delay.h>
23300 +#include <linux/nsproxy.h>
23301 #include <asm/tlbflush.h>
23302
23303 #define MAX_TAP_DEV 256 /*the maximum number of tapdisk ring devices */
23304 @@ -498,7 +499,7 @@ found:
23305
23306 if ((class = get_xen_class()) != NULL)
23307 device_create(class, NULL, MKDEV(blktap_major, minor),
23308 - "blktap%d", minor);
23309 + NULL, "blktap%d", minor);
23310 }
23311
23312 out:
23313 @@ -1683,7 +1684,8 @@ static int __init blkif_init(void)
23314 * We only create the device when a request of a new device is
23315 * made.
23316 */
23317 - device_create(class, NULL, MKDEV(blktap_major, 0), "blktap0");
23318 + device_create(class, NULL, MKDEV(blktap_major, 0), NULL,
23319 + "blktap0");
23320 } else {
23321 /* this is bad, but not fatal */
23322 WPRINTK("blktap: sysfs xen_class not created\n");
23323 --- sle11-2009-06-04.orig/drivers/xen/char/mem.c 2009-03-16 16:38:05.000000000 +0100
23324 +++ sle11-2009-06-04/drivers/xen/char/mem.c 2009-06-04 10:21:39.000000000 +0200
23325 @@ -35,7 +35,7 @@ static inline int uncached_access(struct
23326
23327 static inline int range_is_allowed(unsigned long pfn, unsigned long size)
23328 {
23329 -#ifdef CONFIG_NONPROMISC_DEVMEM
23330 +#ifdef CONFIG_STRICT_DEVMEM
23331 u64 from = ((u64)pfn) << PAGE_SHIFT;
23332 u64 to = from + size;
23333 u64 cursor = from;
23334 @@ -172,7 +172,10 @@ static void mmap_mem_close(struct vm_are
23335
23336 static struct vm_operations_struct mmap_mem_ops = {
23337 .open = mmap_mem_open,
23338 - .close = mmap_mem_close
23339 + .close = mmap_mem_close,
23340 +#ifdef CONFIG_HAVE_IOREMAP_PROT
23341 + .access = generic_access_phys
23342 +#endif
23343 };
23344
23345 static int xen_mmap_mem(struct file * file, struct vm_area_struct * vma)
23346 --- sle11-2009-06-04.orig/drivers/xen/console/console.c 2009-03-16 16:38:05.000000000 +0100
23347 +++ sle11-2009-06-04/drivers/xen/console/console.c 2009-06-04 10:21:39.000000000 +0200
23348 @@ -432,9 +432,7 @@ static void __xencons_tx_flush(void)
23349
23350 if (work_done && (xencons_tty != NULL)) {
23351 wake_up_interruptible(&xencons_tty->write_wait);
23352 - if ((xencons_tty->flags & (1 << TTY_DO_WRITE_WAKEUP)) &&
23353 - (xencons_tty->ldisc.write_wakeup != NULL))
23354 - (xencons_tty->ldisc.write_wakeup)(xencons_tty);
23355 + tty_wakeup(xencons_tty);
23356 }
23357 }
23358
23359 @@ -635,8 +633,8 @@ static void xencons_close(struct tty_str
23360 tty->closing = 1;
23361 tty_wait_until_sent(tty, 0);
23362 tty_driver_flush_buffer(tty);
23363 - if (tty->ldisc.flush_buffer != NULL)
23364 - tty->ldisc.flush_buffer(tty);
23365 + if (tty->ldisc.ops->flush_buffer != NULL)
23366 + tty->ldisc.ops->flush_buffer(tty);
23367 tty->closing = 0;
23368 spin_lock_irqsave(&xencons_lock, flags);
23369 xencons_tty = NULL;
23370 --- sle11-2009-06-04.orig/drivers/xen/core/evtchn.c 2009-03-16 16:33:40.000000000 +0100
23371 +++ sle11-2009-06-04/drivers/xen/core/evtchn.c 2009-06-04 10:21:39.000000000 +0200
23372 @@ -746,8 +746,9 @@ static struct irq_chip dynirq_chip = {
23373 };
23374
23375 /* Bitmap indicating which PIRQs require Xen to be notified on unmask. */
23376 -static int pirq_eoi_does_unmask;
23377 +static bool pirq_eoi_does_unmask;
23378 static unsigned long *pirq_needs_eoi;
23379 +static DECLARE_BITMAP(probing_pirq, NR_PIRQS);
23380
23381 static void pirq_unmask_and_notify(unsigned int evtchn, unsigned int irq)
23382 {
23383 @@ -794,25 +795,31 @@ static inline void pirq_query_unmask(int
23384 set_bit(irq - PIRQ_BASE, pirq_needs_eoi);
23385 }
23386
23387 -/*
23388 - * On startup, if there is no action associated with the IRQ then we are
23389 - * probing. In this case we should not share with others as it will confuse us.
23390 - */
23391 -#define probing_irq(_irq) (irq_desc[(_irq)].action == NULL)
23392 +static int set_type_pirq(unsigned int irq, unsigned int type)
23393 +{
23394 + if (type != IRQ_TYPE_PROBE)
23395 + return -EINVAL;
23396 + set_bit(irq - PIRQ_BASE, probing_pirq);
23397 + return 0;
23398 +}
23399
23400 static unsigned int startup_pirq(unsigned int irq)
23401 {
23402 struct evtchn_bind_pirq bind_pirq;
23403 int evtchn = evtchn_from_irq(irq);
23404
23405 - if (VALID_EVTCHN(evtchn))
23406 + if (VALID_EVTCHN(evtchn)) {
23407 + clear_bit(irq - PIRQ_BASE, probing_pirq);
23408 goto out;
23409 + }
23410
23411 bind_pirq.pirq = evtchn_get_xen_pirq(irq);
23412 /* NB. We are happy to share unless we are probing. */
23413 - bind_pirq.flags = probing_irq(irq) ? 0 : BIND_PIRQ__WILL_SHARE;
23414 + bind_pirq.flags = test_and_clear_bit(irq - PIRQ_BASE, probing_pirq)
23415 + || (irq_desc[irq].status & IRQ_AUTODETECT)
23416 + ? 0 : BIND_PIRQ__WILL_SHARE;
23417 if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind_pirq) != 0) {
23418 - if (!probing_irq(irq))
23419 + if (bind_pirq.flags)
23420 printk(KERN_INFO "Failed to obtain physical IRQ %d\n",
23421 irq);
23422 return 0;
23423 @@ -891,6 +898,7 @@ static struct irq_chip pirq_chip = {
23424 .mask_ack = ack_pirq,
23425 .ack = ack_pirq,
23426 .end = end_pirq,
23427 + .set_type = set_type_pirq,
23428 #ifdef CONFIG_SMP
23429 .set_affinity = set_affinity_irq,
23430 #endif
23431 @@ -1003,6 +1011,7 @@ void xen_poll_irq(int irq)
23432 BUG();
23433 }
23434
23435 +#ifdef CONFIG_PM_SLEEP
23436 static void restore_cpu_virqs(unsigned int cpu)
23437 {
23438 struct evtchn_bind_virq bind_virq;
23439 @@ -1095,6 +1104,7 @@ void irq_resume(void)
23440 }
23441
23442 }
23443 +#endif
23444
23445 #if defined(CONFIG_X86_IO_APIC)
23446 #define identity_mapped_irq(irq) (!IO_APIC_IRQ((irq) - PIRQ_BASE))
23447 @@ -1177,7 +1187,7 @@ void __init xen_init_IRQ(void)
23448 * BITS_TO_LONGS(ALIGN(NR_PIRQS, PAGE_SIZE * 8)));
23449 eoi_gmfn.gmfn = virt_to_machine(pirq_needs_eoi) >> PAGE_SHIFT;
23450 if (HYPERVISOR_physdev_op(PHYSDEVOP_pirq_eoi_gmfn, &eoi_gmfn) == 0)
23451 - pirq_eoi_does_unmask = 1;
23452 + pirq_eoi_does_unmask = true;
23453
23454 /* No event channels are 'live' right now. */
23455 for (i = 0; i < NR_EVENT_CHANNELS; i++)
23456 --- sle11-2009-06-04.orig/drivers/xen/core/gnttab.c 2008-12-01 11:25:57.000000000 +0100
23457 +++ sle11-2009-06-04/drivers/xen/core/gnttab.c 2009-06-04 10:21:39.000000000 +0200
23458 @@ -449,6 +449,7 @@ static int map_pte_fn(pte_t *pte, struct
23459 return 0;
23460 }
23461
23462 +#ifdef CONFIG_PM_SLEEP
23463 static int unmap_pte_fn(pte_t *pte, struct page *pmd_page,
23464 unsigned long addr, void *data)
23465 {
23466 @@ -456,6 +457,7 @@ static int unmap_pte_fn(pte_t *pte, stru
23467 set_pte_at(&init_mm, addr, pte, __pte(0));
23468 return 0;
23469 }
23470 +#endif
23471
23472 void *arch_gnttab_alloc_shared(unsigned long *frames)
23473 {
23474 @@ -633,6 +635,75 @@ void __gnttab_dma_map_page(struct page *
23475 } while (unlikely(read_seqretry(&gnttab_dma_lock, seq)));
23476 }
23477
23478 +#ifdef __HAVE_ARCH_PTE_SPECIAL
23479 +
23480 +static unsigned int GNTMAP_pte_special;
23481 +
23482 +bool gnttab_pre_map_adjust(unsigned int cmd, struct gnttab_map_grant_ref *map,
23483 + unsigned int count)
23484 +{
23485 + unsigned int i;
23486 +
23487 + if (unlikely(cmd != GNTTABOP_map_grant_ref))
23488 + count = 0;
23489 +
23490 + for (i = 0; i < count; ++i, ++map) {
23491 + if (!(map->flags & GNTMAP_host_map)
23492 + || !(map->flags & GNTMAP_application_map))
23493 + continue;
23494 + if (GNTMAP_pte_special)
23495 + map->flags |= GNTMAP_pte_special;
23496 + else {
23497 + BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
23498 + return true;
23499 + }
23500 + }
23501 +
23502 + return false;
23503 +}
23504 +EXPORT_SYMBOL(gnttab_pre_map_adjust);
23505 +
23506 +#if CONFIG_XEN_COMPAT < 0x030400
23507 +int gnttab_post_map_adjust(const struct gnttab_map_grant_ref *map, unsigned int count)
23508 +{
23509 + unsigned int i;
23510 + int rc = 0;
23511 +
23512 + for (i = 0; i < count && rc == 0; ++i, ++map) {
23513 + pte_t pte;
23514 +
23515 + if (!(map->flags & GNTMAP_host_map)
23516 + || !(map->flags & GNTMAP_application_map))
23517 + continue;
23518 +
23519 +#ifdef CONFIG_X86
23520 + pte = __pte_ma((map->dev_bus_addr | _PAGE_PRESENT | _PAGE_USER
23521 + | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_NX
23522 + | _PAGE_SPECIAL)
23523 + & __supported_pte_mask);
23524 +#else
23525 +#error Architecture not yet supported.
23526 +#endif
23527 + if (!(map->flags & GNTMAP_readonly))
23528 + pte = pte_mkwrite(pte);
23529 +
23530 + if (map->flags & GNTMAP_contains_pte) {
23531 + mmu_update_t u;
23532 +
23533 + u.ptr = map->host_addr;
23534 + u.val = __pte_val(pte);
23535 + rc = HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF);
23536 + } else
23537 + rc = HYPERVISOR_update_va_mapping(map->host_addr, pte, 0);
23538 + }
23539 +
23540 + return rc;
23541 +}
23542 +EXPORT_SYMBOL(gnttab_post_map_adjust);
23543 +#endif
23544 +
23545 +#endif /* __HAVE_ARCH_PTE_SPECIAL */
23546 +
23547 int gnttab_resume(void)
23548 {
23549 if (max_nr_grant_frames() < nr_grant_frames)
23550 @@ -640,6 +711,7 @@ int gnttab_resume(void)
23551 return gnttab_map(0, nr_grant_frames - 1);
23552 }
23553
23554 +#ifdef CONFIG_PM_SLEEP
23555 int gnttab_suspend(void)
23556 {
23557 #ifdef CONFIG_X86
23558 @@ -649,6 +721,7 @@ int gnttab_suspend(void)
23559 #endif
23560 return 0;
23561 }
23562 +#endif
23563
23564 #else /* !CONFIG_XEN */
23565
23566 @@ -759,6 +832,18 @@ int __devinit gnttab_init(void)
23567 gnttab_free_count = nr_init_grefs - NR_RESERVED_ENTRIES;
23568 gnttab_free_head = NR_RESERVED_ENTRIES;
23569
23570 +#if defined(CONFIG_XEN) && defined(__HAVE_ARCH_PTE_SPECIAL)
23571 + if (!xen_feature(XENFEAT_auto_translated_physmap)
23572 + && xen_feature(XENFEAT_gnttab_map_avail_bits)) {
23573 +#ifdef CONFIG_X86
23574 + GNTMAP_pte_special = (__pte_val(pte_mkspecial(__pte_ma(0)))
23575 + >> _PAGE_BIT_UNUSED1) << _GNTMAP_guest_avail0;
23576 +#else
23577 +#error Architecture not yet supported.
23578 +#endif
23579 + }
23580 +#endif
23581 +
23582 return 0;
23583
23584 ini_nomem:
23585 --- sle11-2009-06-04.orig/drivers/xen/core/machine_kexec.c 2009-03-16 16:38:05.000000000 +0100
23586 +++ sle11-2009-06-04/drivers/xen/core/machine_kexec.c 2009-06-04 10:21:39.000000000 +0200
23587 @@ -91,7 +91,7 @@ void __init xen_machine_kexec_setup_reso
23588 xen_hypervisor_res.start = range.start;
23589 xen_hypervisor_res.end = range.start + range.size - 1;
23590 xen_hypervisor_res.flags = IORESOURCE_BUSY | IORESOURCE_MEM;
23591 -#ifdef CONFIG_X86_64
23592 +#ifdef CONFIG_X86
23593 insert_resource(&iomem_resource, &xen_hypervisor_res);
23594 #endif
23595
23596 @@ -106,7 +106,7 @@ void __init xen_machine_kexec_setup_reso
23597 if (range.size) {
23598 crashk_res.start = range.start;
23599 crashk_res.end = range.start + range.size - 1;
23600 -#ifdef CONFIG_X86_64
23601 +#ifdef CONFIG_X86
23602 insert_resource(&iomem_resource, &crashk_res);
23603 #endif
23604 }
23605 @@ -160,7 +160,7 @@ void __init xen_machine_kexec_setup_reso
23606 return;
23607 }
23608
23609 -#ifndef CONFIG_X86_64
23610 +#ifndef CONFIG_X86
23611 void __init xen_machine_kexec_register_resources(struct resource *res)
23612 {
23613 request_resource(res, &xen_hypervisor_res);
23614 --- sle11-2009-06-04.orig/drivers/xen/core/machine_reboot.c 2009-06-04 11:08:07.000000000 +0200
23615 +++ sle11-2009-06-04/drivers/xen/core/machine_reboot.c 2009-06-04 10:21:39.000000000 +0200
23616 @@ -57,6 +57,7 @@ EXPORT_SYMBOL(machine_restart);
23617 EXPORT_SYMBOL(machine_halt);
23618 EXPORT_SYMBOL(machine_power_off);
23619
23620 +#ifdef CONFIG_PM_SLEEP
23621 static void pre_suspend(void)
23622 {
23623 HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
23624 @@ -111,6 +112,7 @@ static void post_suspend(int suspend_can
23625 HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
23626 virt_to_mfn(pfn_to_mfn_frame_list_list);
23627 }
23628 +#endif
23629
23630 #else /* !(defined(__i386__) || defined(__x86_64__)) */
23631
23632 @@ -129,6 +131,7 @@ static void post_suspend(int suspend_can
23633
23634 #endif
23635
23636 +#ifdef CONFIG_PM_SLEEP
23637 struct suspend {
23638 int fast_suspend;
23639 void (*resume_notifier)(int);
23640 @@ -222,7 +225,8 @@ int __xen_suspend(int fast_suspend, void
23641
23642 if (fast_suspend) {
23643 xenbus_suspend();
23644 - err = stop_machine_run(take_machine_down, &suspend, 0);
23645 + err = stop_machine(take_machine_down, &suspend,
23646 + &cpumask_of_cpu(0));
23647 if (err < 0)
23648 xenbus_suspend_cancel();
23649 } else {
23650 @@ -245,3 +249,4 @@ int __xen_suspend(int fast_suspend, void
23651
23652 return 0;
23653 }
23654 +#endif
23655 --- sle11-2009-06-04.orig/drivers/xen/core/reboot.c 2009-02-16 16:17:21.000000000 +0100
23656 +++ sle11-2009-06-04/drivers/xen/core/reboot.c 2009-06-04 10:21:39.000000000 +0200
23657 @@ -29,17 +29,12 @@ MODULE_LICENSE("Dual BSD/GPL");
23658 /* Ignore multiple shutdown requests. */
23659 static int shutting_down = SHUTDOWN_INVALID;
23660
23661 -/* Was last suspend request cancelled? */
23662 -static int suspend_cancelled;
23663 -
23664 /* Can we leave APs online when we suspend? */
23665 static int fast_suspend;
23666
23667 static void __shutdown_handler(struct work_struct *unused);
23668 static DECLARE_DELAYED_WORK(shutdown_work, __shutdown_handler);
23669
23670 -static int setup_suspend_evtchn(void);
23671 -
23672 int __xen_suspend(int fast_suspend, void (*resume_notifier)(int));
23673
23674 static int shutdown_process(void *__unused)
23675 @@ -69,6 +64,13 @@ static int shutdown_process(void *__unus
23676 return 0;
23677 }
23678
23679 +#ifdef CONFIG_PM_SLEEP
23680 +
23681 +static int setup_suspend_evtchn(void);
23682 +
23683 +/* Was last suspend request cancelled? */
23684 +static int suspend_cancelled;
23685 +
23686 static void xen_resume_notifier(int _suspend_cancelled)
23687 {
23688 int old_state = xchg(&shutting_down, SHUTDOWN_RESUMING);
23689 @@ -118,6 +120,10 @@ static int xen_suspend(void *__unused)
23690 return 0;
23691 }
23692
23693 +#else
23694 +# define xen_suspend NULL
23695 +#endif
23696 +
23697 static void switch_shutdown_state(int new_state)
23698 {
23699 int prev_state, old_state = SHUTDOWN_INVALID;
23700 @@ -194,8 +200,10 @@ static void shutdown_handler(struct xenb
23701 new_state = SHUTDOWN_POWEROFF;
23702 else if (strcmp(str, "reboot") == 0)
23703 ctrl_alt_del();
23704 +#ifdef CONFIG_PM_SLEEP
23705 else if (strcmp(str, "suspend") == 0)
23706 new_state = SHUTDOWN_SUSPEND;
23707 +#endif
23708 else if (strcmp(str, "halt") == 0)
23709 new_state = SHUTDOWN_HALT;
23710 else
23711 @@ -247,6 +255,7 @@ static struct xenbus_watch sysrq_watch =
23712 .callback = sysrq_handler
23713 };
23714
23715 +#ifdef CONFIG_PM_SLEEP
23716 static irqreturn_t suspend_int(int irq, void* dev_id)
23717 {
23718 switch_shutdown_state(SHUTDOWN_SUSPEND);
23719 @@ -274,6 +283,9 @@ static int setup_suspend_evtchn(void)
23720
23721 return 0;
23722 }
23723 +#else
23724 +#define setup_suspend_evtchn() 0
23725 +#endif
23726
23727 static int setup_shutdown_watcher(void)
23728 {
23729 --- sle11-2009-06-04.orig/drivers/xen/core/smpboot.c 2009-03-16 16:38:05.000000000 +0100
23730 +++ sle11-2009-06-04/drivers/xen/core/smpboot.c 2009-06-04 10:21:39.000000000 +0200
23731 @@ -27,6 +27,7 @@
23732
23733 extern irqreturn_t smp_reschedule_interrupt(int, void *);
23734 extern irqreturn_t smp_call_function_interrupt(int, void *);
23735 +extern irqreturn_t smp_call_function_single_interrupt(int, void *);
23736
23737 extern int local_setup_timer(unsigned int cpu);
23738 extern void local_teardown_timer(unsigned int cpu);
23739 @@ -50,8 +51,10 @@ EXPORT_PER_CPU_SYMBOL(cpu_info);
23740
23741 static DEFINE_PER_CPU(int, resched_irq);
23742 static DEFINE_PER_CPU(int, callfunc_irq);
23743 +static DEFINE_PER_CPU(int, call1func_irq);
23744 static char resched_name[NR_CPUS][15];
23745 static char callfunc_name[NR_CPUS][15];
23746 +static char call1func_name[NR_CPUS][15];
23747
23748 #ifdef CONFIG_X86_LOCAL_APIC
23749 #define set_cpu_to_apicid(cpu, apicid) (per_cpu(x86_cpu_to_apicid, cpu) = (apicid))
23750 @@ -73,15 +76,13 @@ void __init prefill_possible_map(void)
23751
23752 for (i = 0; i < NR_CPUS; i++) {
23753 rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
23754 - if (rc >= 0)
23755 + if (rc >= 0) {
23756 cpu_set(i, cpu_possible_map);
23757 + nr_cpu_ids = i + 1;
23758 + }
23759 }
23760 }
23761
23762 -void __init smp_alloc_memory(void)
23763 -{
23764 -}
23765 -
23766 static inline void
23767 set_cpu_sibling_map(unsigned int cpu)
23768 {
23769 @@ -110,7 +111,8 @@ static int __cpuinit xen_smp_intr_init(u
23770 {
23771 int rc;
23772
23773 - per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1;
23774 + per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) =
23775 + per_cpu(call1func_irq, cpu) = -1;
23776
23777 sprintf(resched_name[cpu], "resched%u", cpu);
23778 rc = bind_ipi_to_irqhandler(RESCHEDULE_VECTOR,
23779 @@ -134,6 +136,17 @@ static int __cpuinit xen_smp_intr_init(u
23780 goto fail;
23781 per_cpu(callfunc_irq, cpu) = rc;
23782
23783 + sprintf(call1func_name[cpu], "call1func%u", cpu);
23784 + rc = bind_ipi_to_irqhandler(CALL_FUNC_SINGLE_VECTOR,
23785 + cpu,
23786 + smp_call_function_single_interrupt,
23787 + IRQF_DISABLED|IRQF_NOBALANCING,
23788 + call1func_name[cpu],
23789 + NULL);
23790 + if (rc < 0)
23791 + goto fail;
23792 + per_cpu(call1func_irq, cpu) = rc;
23793 +
23794 rc = xen_spinlock_init(cpu);
23795 if (rc < 0)
23796 goto fail;
23797 @@ -148,6 +161,8 @@ static int __cpuinit xen_smp_intr_init(u
23798 unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
23799 if (per_cpu(callfunc_irq, cpu) >= 0)
23800 unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
23801 + if (per_cpu(call1func_irq, cpu) >= 0)
23802 + unbind_from_irqhandler(per_cpu(call1func_irq, cpu), NULL);
23803 xen_spinlock_cleanup(cpu);
23804 return rc;
23805 }
23806 @@ -160,6 +175,7 @@ static void __cpuexit xen_smp_intr_exit(
23807
23808 unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
23809 unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
23810 + unbind_from_irqhandler(per_cpu(call1func_irq, cpu), NULL);
23811 xen_spinlock_cleanup(cpu);
23812 }
23813 #endif
23814 @@ -167,11 +183,7 @@ static void __cpuexit xen_smp_intr_exit(
23815 void __cpuinit cpu_bringup(void)
23816 {
23817 cpu_init();
23818 -#ifdef __i386__
23819 identify_secondary_cpu(&current_cpu_data);
23820 -#else
23821 - identify_cpu(&current_cpu_data);
23822 -#endif
23823 touch_softlockup_watchdog();
23824 preempt_disable();
23825 local_irq_enable();
23826 @@ -251,9 +263,6 @@ void __init smp_prepare_cpus(unsigned in
23827 struct task_struct *idle;
23828 int apicid;
23829 struct vcpu_get_physid cpu_id;
23830 -#ifdef __x86_64__
23831 - struct desc_ptr *gdt_descr;
23832 -#endif
23833 void *gdt_addr;
23834
23835 apicid = 0;
23836 @@ -266,7 +275,7 @@ void __init smp_prepare_cpus(unsigned in
23837
23838 current_thread_info()->cpu = 0;
23839
23840 - for (cpu = 0; cpu < NR_CPUS; cpu++) {
23841 + for_each_possible_cpu (cpu) {
23842 cpus_clear(per_cpu(cpu_sibling_map, cpu));
23843 cpus_clear(per_cpu(cpu_core_map, cpu));
23844 }
23845 @@ -293,21 +302,10 @@ void __init smp_prepare_cpus(unsigned in
23846 if (IS_ERR(idle))
23847 panic("failed fork for CPU %d", cpu);
23848
23849 -#ifdef __x86_64__
23850 - gdt_descr = &cpu_gdt_descr[cpu];
23851 - gdt_descr->address = get_zeroed_page(GFP_KERNEL);
23852 - if (unlikely(!gdt_descr->address)) {
23853 - printk(KERN_CRIT "CPU%d failed to allocate GDT\n",
23854 - cpu);
23855 - continue;
23856 - }
23857 - gdt_descr->size = GDT_SIZE;
23858 - memcpy((void *)gdt_descr->address, cpu_gdt_table, GDT_SIZE);
23859 - gdt_addr = (void *)gdt_descr->address;
23860 -#else
23861 +#ifdef __i386__
23862 init_gdt(cpu);
23863 - gdt_addr = get_cpu_gdt_table(cpu);
23864 #endif
23865 + gdt_addr = get_cpu_gdt_table(cpu);
23866 make_page_readonly(gdt_addr, XENFEAT_writable_descriptor_tables);
23867
23868 apicid = cpu;
23869 @@ -353,8 +351,8 @@ void __init smp_prepare_boot_cpu(void)
23870 {
23871 #ifdef __i386__
23872 init_gdt(smp_processor_id());
23873 - switch_to_new_gdt();
23874 #endif
23875 + switch_to_new_gdt();
23876 prefill_possible_map();
23877 }
23878
23879 --- sle11-2009-06-04.orig/drivers/xen/core/spinlock.c 2009-03-16 16:33:40.000000000 +0100
23880 +++ sle11-2009-06-04/drivers/xen/core/spinlock.c 2009-06-04 10:36:24.000000000 +0200
23881 @@ -5,6 +5,8 @@
23882 * portions of this file.
23883 */
23884
23885 +#if CONFIG_XEN_COMPAT >= 0x030200
23886 +
23887 #include <linux/init.h>
23888 #include <linux/irq.h>
23889 #include <linux/kernel.h>
23890 @@ -73,9 +75,9 @@ int xen_spin_wait(raw_spinlock_t *lock,
23891 /* announce we're spinning */
23892 spinning.ticket = token;
23893 spinning.lock = lock;
23894 - spinning.prev = __get_cpu_var(spinning);
23895 + spinning.prev = x86_read_percpu(spinning);
23896 smp_wmb();
23897 - __get_cpu_var(spinning) = &spinning;
23898 + x86_write_percpu(spinning, &spinning);
23899
23900 /* clear pending */
23901 xen_clear_irq_pending(irq);
23902 @@ -102,7 +104,7 @@ int xen_spin_wait(raw_spinlock_t *lock,
23903 kstat_this_cpu.irqs[irq] += !rc;
23904
23905 /* announce we're done */
23906 - __get_cpu_var(spinning) = spinning.prev;
23907 + x86_write_percpu(spinning, spinning.prev);
23908 rm_lock = &__get_cpu_var(spinning_rm_lock);
23909 raw_local_irq_save(flags);
23910 __raw_write_lock(rm_lock);
23911 @@ -159,3 +161,5 @@ void xen_spin_kick(raw_spinlock_t *lock,
23912 }
23913 }
23914 EXPORT_SYMBOL(xen_spin_kick);
23915 +
23916 +#endif /* CONFIG_XEN_COMPAT >= 0x030200 */
23917 --- sle11-2009-06-04.orig/drivers/xen/fbfront/xenfb.c 2009-03-16 16:38:05.000000000 +0100
23918 +++ sle11-2009-06-04/drivers/xen/fbfront/xenfb.c 2009-06-04 10:21:39.000000000 +0200
23919 @@ -18,6 +18,7 @@
23920 * frame buffer.
23921 */
23922
23923 +#include <linux/console.h>
23924 #include <linux/kernel.h>
23925 #include <linux/errno.h>
23926 #include <linux/fb.h>
23927 @@ -545,6 +546,28 @@ static unsigned long vmalloc_to_mfn(void
23928 return pfn_to_mfn(vmalloc_to_pfn(address));
23929 }
23930
23931 +static __devinit void
23932 +xenfb_make_preferred_console(void)
23933 +{
23934 + struct console *c;
23935 +
23936 + if (console_set_on_cmdline)
23937 + return;
23938 +
23939 + acquire_console_sem();
23940 + for (c = console_drivers; c; c = c->next) {
23941 + if (!strcmp(c->name, "tty") && c->index == 0)
23942 + break;
23943 + }
23944 + release_console_sem();
23945 + if (c) {
23946 + unregister_console(c);
23947 + c->flags |= CON_CONSDEV;
23948 + c->flags &= ~CON_PRINTBUFFER; /* don't print again */
23949 + register_console(c);
23950 + }
23951 +}
23952 +
23953 static int __devinit xenfb_probe(struct xenbus_device *dev,
23954 const struct xenbus_device_id *id)
23955 {
23956 @@ -665,6 +688,7 @@ static int __devinit xenfb_probe(struct
23957 if (ret < 0)
23958 goto error;
23959
23960 + xenfb_make_preferred_console();
23961 return 0;
23962
23963 error_nomem:
23964 @@ -882,4 +906,5 @@ static void __exit xenfb_cleanup(void)
23965 module_init(xenfb_init);
23966 module_exit(xenfb_cleanup);
23967
23968 +MODULE_DESCRIPTION("Xen virtual framebuffer device frontend");
23969 MODULE_LICENSE("GPL");
23970 --- sle11-2009-06-04.orig/drivers/xen/fbfront/xenkbd.c 2009-03-04 11:25:55.000000000 +0100
23971 +++ sle11-2009-06-04/drivers/xen/fbfront/xenkbd.c 2009-06-04 10:21:39.000000000 +0200
23972 @@ -350,4 +350,5 @@ static void __exit xenkbd_cleanup(void)
23973 module_init(xenkbd_init);
23974 module_exit(xenkbd_cleanup);
23975
23976 +MODULE_DESCRIPTION("Xen virtual keyboard/pointer device frontend");
23977 MODULE_LICENSE("GPL");
23978 --- sle11-2009-06-04.orig/drivers/xen/gntdev/gntdev.c 2009-03-16 16:38:05.000000000 +0100
23979 +++ sle11-2009-06-04/drivers/xen/gntdev/gntdev.c 2009-06-04 10:21:39.000000000 +0200
23980 @@ -418,7 +418,7 @@ static int __init gntdev_init(void)
23981 }
23982
23983 device = device_create(class, NULL, MKDEV(gntdev_major, 0),
23984 - GNTDEV_NAME);
23985 + NULL, GNTDEV_NAME);
23986 if (IS_ERR(device)) {
23987 printk(KERN_ERR "Error creating gntdev device in xen_class\n");
23988 printk(KERN_ERR "gntdev created with major number = %d\n",
23989 --- sle11-2009-06-04.orig/drivers/xen/netfront/accel.c 2009-03-30 16:39:19.000000000 +0200
23990 +++ sle11-2009-06-04/drivers/xen/netfront/accel.c 2009-06-04 10:21:39.000000000 +0200
23991 @@ -28,6 +28,7 @@
23992 * IN THE SOFTWARE.
23993 */
23994
23995 +#include <linux/version.h>
23996 #include <linux/netdevice.h>
23997 #include <linux/skbuff.h>
23998 #include <linux/list.h>
23999 --- sle11-2009-06-04.orig/drivers/xen/netfront/netfront.c 2009-03-30 16:40:17.000000000 +0200
24000 +++ sle11-2009-06-04/drivers/xen/netfront/netfront.c 2009-06-04 10:21:39.000000000 +0200
24001 @@ -640,7 +640,7 @@ static int network_open(struct net_devic
24002 }
24003 spin_unlock_bh(&np->rx_lock);
24004
24005 - network_maybe_wake_tx(dev);
24006 + netif_start_queue(dev);
24007
24008 return 0;
24009 }
24010 --- sle11-2009-06-04.orig/drivers/xen/sfc_netback/accel.h 2009-03-30 16:00:09.000000000 +0200
24011 +++ sle11-2009-06-04/drivers/xen/sfc_netback/accel.h 2009-06-04 10:21:39.000000000 +0200
24012 @@ -25,6 +25,7 @@
24013 #ifndef NETBACK_ACCEL_H
24014 #define NETBACK_ACCEL_H
24015
24016 +#include <linux/version.h>
24017 #include <linux/slab.h>
24018 #include <linux/ip.h>
24019 #include <linux/tcp.h>
24020 --- sle11-2009-06-04.orig/drivers/xen/sfc_netfront/accel.h 2009-03-30 16:34:56.000000000 +0200
24021 +++ sle11-2009-06-04/drivers/xen/sfc_netfront/accel.h 2009-06-04 10:21:39.000000000 +0200
24022 @@ -35,6 +35,7 @@
24023 #include <xen/evtchn.h>
24024
24025 #include <linux/kernel.h>
24026 +#include <linux/version.h>
24027 #include <linux/list.h>
24028
24029 enum netfront_accel_post_status {
24030 --- sle11-2009-06-04.orig/drivers/xen/xenbus/xenbus_client.c 2009-03-24 10:13:17.000000000 +0100
24031 +++ sle11-2009-06-04/drivers/xen/xenbus/xenbus_client.c 2009-06-04 10:21:39.000000000 +0200
24032 @@ -150,7 +150,7 @@ int xenbus_watch_pathfmt(struct xenbus_d
24033 char *path;
24034
24035 va_start(ap, pathfmt);
24036 - path = kvasprintf(GFP_KERNEL, pathfmt, ap);
24037 + path = kvasprintf(GFP_NOIO | __GFP_HIGH, pathfmt, ap);
24038 va_end(ap);
24039
24040 if (!path) {
24041 --- sle11-2009-06-04.orig/drivers/xen/xenbus/xenbus_comms.c 2009-02-16 16:17:21.000000000 +0100
24042 +++ sle11-2009-06-04/drivers/xen/xenbus/xenbus_comms.c 2009-06-04 10:21:39.000000000 +0200
24043 @@ -228,14 +228,11 @@ int xb_init_comms(void)
24044 intf->rsp_cons = intf->rsp_prod;
24045 }
24046
24047 +#if defined(CONFIG_XEN) || defined(MODULE)
24048 if (xenbus_irq)
24049 unbind_from_irqhandler(xenbus_irq, &xb_waitq);
24050
24051 -#if defined(CONFIG_XEN) || defined(MODULE)
24052 err = bind_caller_port_to_irqhandler(
24053 -#else
24054 - err = bind_evtchn_to_irqhandler(
24055 -#endif
24056 xen_store_evtchn, wake_waiting,
24057 0, "xenbus", &xb_waitq);
24058 if (err <= 0) {
24059 @@ -244,6 +241,20 @@ int xb_init_comms(void)
24060 }
24061
24062 xenbus_irq = err;
24063 +#else
24064 + if (xenbus_irq) {
24065 + /* Already have an irq; assume we're resuming */
24066 + rebind_evtchn_irq(xen_store_evtchn, xenbus_irq);
24067 + } else {
24068 + err = bind_evtchn_to_irqhandler(xen_store_evtchn, wake_waiting,
24069 + 0, "xenbus", &xb_waitq);
24070 + if (err <= 0) {
24071 + printk(KERN_ERR "XENBUS request irq failed %i\n", err);
24072 + return err;
24073 + }
24074 + xenbus_irq = err;
24075 + }
24076 +#endif
24077
24078 return 0;
24079 }
24080 --- sle11-2009-06-04.orig/drivers/xen/xenbus/xenbus_probe.c 2009-03-16 16:38:05.000000000 +0100
24081 +++ sle11-2009-06-04/drivers/xen/xenbus/xenbus_probe.c 2009-06-04 10:21:39.000000000 +0200
24082 @@ -36,6 +36,7 @@
24083 __FUNCTION__, __LINE__, ##args)
24084
24085 #include <linux/kernel.h>
24086 +#include <linux/version.h>
24087 #include <linux/err.h>
24088 #include <linux/string.h>
24089 #include <linux/ctype.h>
24090 --- sle11-2009-06-04.orig/fs/aio.c 2009-03-24 10:13:25.000000000 +0100
24091 +++ sle11-2009-06-04/fs/aio.c 2009-06-04 10:21:39.000000000 +0200
24092 @@ -1335,7 +1335,7 @@ static int make_aio_fd(struct kioctx *io
24093 int fd;
24094 struct file *file;
24095
24096 - fd = anon_inode_getfd("[aioq]", &aioq_fops, ioctx);
24097 + fd = anon_inode_getfd("[aioq]", &aioq_fops, ioctx, 0);
24098 if (fd < 0)
24099 return fd;
24100
24101 --- sle11-2009-06-04.orig/include/asm-generic/pgtable.h 2009-03-04 11:28:34.000000000 +0100
24102 +++ sle11-2009-06-04/include/asm-generic/pgtable.h 2009-06-04 10:21:39.000000000 +0200
24103 @@ -99,10 +99,6 @@ static inline void ptep_set_wrprotect(st
24104 }
24105 #endif
24106
24107 -#ifndef arch_change_pte_range
24108 -#define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) 0
24109 -#endif
24110 -
24111 #ifndef __HAVE_ARCH_PTE_SAME
24112 #define pte_same(A,B) (pte_val(A) == pte_val(B))
24113 #endif
24114 --- sle11-2009-06-04.orig/include/asm-x86/dma-mapping.h 2009-03-16 16:38:05.000000000 +0100
24115 +++ sle11-2009-06-04/include/asm-x86/dma-mapping.h 2009-06-04 10:21:39.000000000 +0200
24116 @@ -74,7 +74,7 @@ static inline struct dma_mapping_ops *ge
24117 /* Make sure we keep the same behaviour */
24118 static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
24119 {
24120 -#ifdef CONFIG_X86_32
24121 +#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
24122 return 0;
24123 #else
24124 struct dma_mapping_ops *ops = get_dma_ops(dev);
24125 --- sle11-2009-06-04.orig/include/asm-x86/kexec.h 2008-12-01 11:11:08.000000000 +0100
24126 +++ sle11-2009-06-04/include/asm-x86/kexec.h 2009-06-04 10:21:39.000000000 +0200
24127 @@ -10,6 +10,7 @@
24128 # define VA_PTE_0 5
24129 # define PA_PTE_1 6
24130 # define VA_PTE_1 7
24131 +# ifndef CONFIG_XEN
24132 # define PA_SWAP_PAGE 8
24133 # ifdef CONFIG_X86_PAE
24134 # define PA_PMD_0 9
24135 @@ -20,6 +21,18 @@
24136 # else
24137 # define PAGES_NR 9
24138 # endif
24139 +# else /* CONFIG_XEN */
24140 +/*
24141 + * The hypervisor interface implicitly requires that all entries (except
24142 + * for possibly the final one) are arranged in matching PA_/VA_ pairs.
24143 + */
24144 +# define PA_PMD_0 8
24145 +# define VA_PMD_0 9
24146 +# define PA_PMD_1 10
24147 +# define VA_PMD_1 11
24148 +# define PA_SWAP_PAGE 12
24149 +# define PAGES_NR 13
24150 +# endif /* CONFIG_XEN */
24151 #else
24152 # define PA_CONTROL_PAGE 0
24153 # define VA_CONTROL_PAGE 1
24154 --- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/desc.h 2009-03-16 16:38:05.000000000 +0100
24155 +++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/desc.h 2009-06-04 10:21:39.000000000 +0200
24156 @@ -31,11 +31,17 @@ extern struct desc_ptr idt_descr;
24157 extern gate_desc idt_table[];
24158 #endif
24159
24160 +struct gdt_page {
24161 + struct desc_struct gdt[GDT_ENTRIES];
24162 +} __attribute__((aligned(PAGE_SIZE)));
24163 +DECLARE_PER_CPU(struct gdt_page, gdt_page);
24164 +
24165 +static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
24166 +{
24167 + return per_cpu(gdt_page, cpu).gdt;
24168 +}
24169 +
24170 #ifdef CONFIG_X86_64
24171 -extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
24172 -extern struct desc_ptr cpu_gdt_descr[];
24173 -/* the cpu gdt accessor */
24174 -#define get_cpu_gdt_table(x) ((struct desc_struct *)cpu_gdt_descr[x].address)
24175
24176 static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func,
24177 unsigned dpl, unsigned ist, unsigned seg)
24178 @@ -53,16 +59,6 @@ static inline void pack_gate(gate_desc *
24179 }
24180
24181 #else
24182 -struct gdt_page {
24183 - struct desc_struct gdt[GDT_ENTRIES];
24184 -} __attribute__((aligned(PAGE_SIZE)));
24185 -DECLARE_PER_CPU(struct gdt_page, gdt_page);
24186 -
24187 -static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
24188 -{
24189 - return per_cpu(gdt_page, cpu).gdt;
24190 -}
24191 -
24192 static inline void pack_gate(gate_desc *gate, unsigned char type,
24193 unsigned long base, unsigned dpl, unsigned flags,
24194 unsigned short seg)
24195 @@ -333,6 +329,28 @@ static inline void set_intr_gate(unsigne
24196 _set_gate(n, GATE_INTERRUPT, addr, 0, 0, __KERNEL_CS);
24197 }
24198
24199 +#define SYS_VECTOR_FREE 0
24200 +#define SYS_VECTOR_ALLOCED 1
24201 +
24202 +extern int first_system_vector;
24203 +extern char system_vectors[];
24204 +
24205 +static inline void alloc_system_vector(int vector)
24206 +{
24207 + if (system_vectors[vector] == SYS_VECTOR_FREE) {
24208 + system_vectors[vector] = SYS_VECTOR_ALLOCED;
24209 + if (first_system_vector > vector)
24210 + first_system_vector = vector;
24211 + } else
24212 + BUG();
24213 +}
24214 +
24215 +static inline void alloc_intr_gate(unsigned int n, void *addr)
24216 +{
24217 + alloc_system_vector(n);
24218 + set_intr_gate(n, addr);
24219 +}
24220 +
24221 /*
24222 * This routine sets up an interrupt gate at directory privilege level 3.
24223 */
24224 --- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/fixmap.h 2009-03-16 16:38:05.000000000 +0100
24225 +++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/fixmap.h 2009-06-04 10:21:39.000000000 +0200
24226 @@ -7,7 +7,58 @@
24227 # include "fixmap_64.h"
24228 #endif
24229
24230 +extern int fixmaps_set;
24231 +
24232 +void xen_set_fixmap(enum fixed_addresses, maddr_t, pgprot_t);
24233 +
24234 +static inline void __set_fixmap(enum fixed_addresses idx,
24235 + maddr_t phys, pgprot_t flags)
24236 +{
24237 + xen_set_fixmap(idx, phys, flags);
24238 +}
24239 +
24240 +#define set_fixmap(idx, phys) \
24241 + __set_fixmap(idx, phys, PAGE_KERNEL)
24242 +
24243 +/*
24244 + * Some hardware wants to get fixmapped without caching.
24245 + */
24246 +#define set_fixmap_nocache(idx, phys) \
24247 + __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
24248 +
24249 #define clear_fixmap(idx) \
24250 __set_fixmap(idx, 0, __pgprot(0))
24251
24252 +#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
24253 +#define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
24254 +
24255 +extern void __this_fixmap_does_not_exist(void);
24256 +
24257 +/*
24258 + * 'index to address' translation. If anyone tries to use the idx
24259 + * directly without translation, we catch the bug with a NULL-deference
24260 + * kernel oops. Illegal ranges of incoming indices are caught too.
24261 + */
24262 +static __always_inline unsigned long fix_to_virt(const unsigned int idx)
24263 +{
24264 + /*
24265 + * this branch gets completely eliminated after inlining,
24266 + * except when someone tries to use fixaddr indices in an
24267 + * illegal way. (such as mixing up address types or using
24268 + * out-of-range indices).
24269 + *
24270 + * If it doesn't get removed, the linker will complain
24271 + * loudly with a reasonably clear error message..
24272 + */
24273 + if (idx >= __end_of_fixed_addresses)
24274 + __this_fixmap_does_not_exist();
24275 +
24276 + return __fix_to_virt(idx);
24277 +}
24278 +
24279 +static inline unsigned long virt_to_fix(const unsigned long vaddr)
24280 +{
24281 + BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
24282 + return __virt_to_fix(vaddr);
24283 +}
24284 #endif
24285 --- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/fixmap_32.h 2009-03-16 16:38:05.000000000 +0100
24286 +++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/fixmap_32.h 2009-06-04 10:21:39.000000000 +0200
24287 @@ -58,10 +58,17 @@ enum fixed_addresses {
24288 #ifdef CONFIG_X86_LOCAL_APIC
24289 FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */
24290 #endif
24291 -#if defined(CONFIG_X86_IO_APIC) && !defined(CONFIG_XEN)
24292 +#ifndef CONFIG_XEN
24293 +#ifdef CONFIG_X86_IO_APIC
24294 FIX_IO_APIC_BASE_0,
24295 FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1,
24296 #endif
24297 +#else
24298 + FIX_SHARED_INFO,
24299 +#define NR_FIX_ISAMAPS 256
24300 + FIX_ISAMAP_END,
24301 + FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
24302 +#endif
24303 #ifdef CONFIG_X86_VISWS_APIC
24304 FIX_CO_CPU, /* Cobalt timer */
24305 FIX_CO_APIC, /* Cobalt APIC Redirection Table */
24306 @@ -78,51 +85,38 @@ enum fixed_addresses {
24307 FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
24308 FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
24309 #endif
24310 -#ifdef CONFIG_ACPI
24311 - FIX_ACPI_BEGIN,
24312 - FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
24313 -#endif
24314 #ifdef CONFIG_PCI_MMCONFIG
24315 FIX_PCIE_MCFG,
24316 #endif
24317 #ifdef CONFIG_PARAVIRT
24318 FIX_PARAVIRT_BOOTMAP,
24319 #endif
24320 - FIX_SHARED_INFO,
24321 -#define NR_FIX_ISAMAPS 256
24322 - FIX_ISAMAP_END,
24323 - FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
24324 __end_of_permanent_fixed_addresses,
24325 /*
24326 * 256 temporary boot-time mappings, used by early_ioremap(),
24327 * before ioremap() is functional.
24328 *
24329 - * We round it up to the next 512 pages boundary so that we
24330 + * We round it up to the next 256 pages boundary so that we
24331 * can have a single pgd entry and a single pte table:
24332 */
24333 #define NR_FIX_BTMAPS 64
24334 #define FIX_BTMAPS_NESTING 4
24335 - FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 512 -
24336 - (__end_of_permanent_fixed_addresses & 511),
24337 + FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 -
24338 + (__end_of_permanent_fixed_addresses & 255),
24339 FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_NESTING - 1,
24340 FIX_WP_TEST,
24341 +#ifdef CONFIG_ACPI
24342 + FIX_ACPI_BEGIN,
24343 + FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
24344 +#endif
24345 #ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
24346 FIX_OHCI1394_BASE,
24347 #endif
24348 __end_of_fixed_addresses
24349 };
24350
24351 -extern void __set_fixmap(enum fixed_addresses idx,
24352 - maddr_t phys, pgprot_t flags);
24353 extern void reserve_top_address(unsigned long reserve);
24354
24355 -#define set_fixmap(idx, phys) \
24356 - __set_fixmap(idx, phys, PAGE_KERNEL)
24357 -/*
24358 - * Some hardware wants to get fixmapped without caching.
24359 - */
24360 -#define set_fixmap_nocache(idx, phys) \
24361 - __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
24362
24363 #define FIXADDR_TOP ((unsigned long)__FIXADDR_TOP)
24364
24365 @@ -131,38 +125,5 @@ extern void reserve_top_address(unsigned
24366 #define FIXADDR_START (FIXADDR_TOP - __FIXADDR_SIZE)
24367 #define FIXADDR_BOOT_START (FIXADDR_TOP - __FIXADDR_BOOT_SIZE)
24368
24369 -#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
24370 -#define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
24371 -
24372 -extern void __this_fixmap_does_not_exist(void);
24373 -
24374 -/*
24375 - * 'index to address' translation. If anyone tries to use the idx
24376 - * directly without tranlation, we catch the bug with a NULL-deference
24377 - * kernel oops. Illegal ranges of incoming indices are caught too.
24378 - */
24379 -static __always_inline unsigned long fix_to_virt(const unsigned int idx)
24380 -{
24381 - /*
24382 - * this branch gets completely eliminated after inlining,
24383 - * except when someone tries to use fixaddr indices in an
24384 - * illegal way. (such as mixing up address types or using
24385 - * out-of-range indices).
24386 - *
24387 - * If it doesn't get removed, the linker will complain
24388 - * loudly with a reasonably clear error message..
24389 - */
24390 - if (idx >= __end_of_fixed_addresses)
24391 - __this_fixmap_does_not_exist();
24392 -
24393 - return __fix_to_virt(idx);
24394 -}
24395 -
24396 -static inline unsigned long virt_to_fix(const unsigned long vaddr)
24397 -{
24398 - BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
24399 - return __virt_to_fix(vaddr);
24400 -}
24401 -
24402 #endif /* !__ASSEMBLY__ */
24403 #endif
24404 --- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/fixmap_64.h 2009-03-16 16:38:05.000000000 +0100
24405 +++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/fixmap_64.h 2009-06-04 10:21:39.000000000 +0200
24406 @@ -12,6 +12,7 @@
24407 #define _ASM_FIXMAP_64_H
24408
24409 #include <linux/kernel.h>
24410 +#include <asm/acpi.h>
24411 #include <asm/apicdef.h>
24412 #include <asm/page.h>
24413 #include <asm/vsyscall.h>
24414 @@ -40,7 +41,6 @@ enum fixed_addresses {
24415 VSYSCALL_HPET,
24416 FIX_DBGP_BASE,
24417 FIX_EARLYCON_MEM_BASE,
24418 - FIX_HPET_BASE,
24419 #ifdef CONFIG_X86_LOCAL_APIC
24420 FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */
24421 #endif
24422 @@ -53,14 +53,21 @@ enum fixed_addresses {
24423 FIX_EFI_IO_MAP_FIRST_PAGE = FIX_EFI_IO_MAP_LAST_PAGE
24424 + MAX_EFI_IO_PAGES - 1,
24425 #endif
24426 +#ifdef CONFIG_PARAVIRT
24427 + FIX_PARAVIRT_BOOTMAP,
24428 +#else
24429 + FIX_SHARED_INFO,
24430 +#endif
24431 #ifdef CONFIG_ACPI
24432 FIX_ACPI_BEGIN,
24433 FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
24434 #endif
24435 - FIX_SHARED_INFO,
24436 #define NR_FIX_ISAMAPS 256
24437 FIX_ISAMAP_END,
24438 FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
24439 +#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
24440 + FIX_OHCI1394_BASE,
24441 +#endif
24442 __end_of_permanent_fixed_addresses,
24443 /*
24444 * 256 temporary boot-time mappings, used by early_ioremap(),
24445 @@ -71,27 +78,12 @@ enum fixed_addresses {
24446 */
24447 #define NR_FIX_BTMAPS 64
24448 #define FIX_BTMAPS_NESTING 4
24449 - FIX_BTMAP_END =
24450 - __end_of_permanent_fixed_addresses + 512 -
24451 + FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 512 -
24452 (__end_of_permanent_fixed_addresses & 511),
24453 FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_NESTING - 1,
24454 -#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
24455 - FIX_OHCI1394_BASE,
24456 -#endif
24457 __end_of_fixed_addresses
24458 };
24459
24460 -extern void __set_fixmap(enum fixed_addresses idx,
24461 - unsigned long phys, pgprot_t flags);
24462 -
24463 -#define set_fixmap(idx, phys) \
24464 - __set_fixmap(idx, phys, PAGE_KERNEL)
24465 -/*
24466 - * Some hardware wants to get fixmapped without caching.
24467 - */
24468 -#define set_fixmap_nocache(idx, phys) \
24469 - __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
24470 -
24471 #define FIXADDR_TOP (VSYSCALL_END-PAGE_SIZE)
24472 #define FIXADDR_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
24473 #define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE)
24474 @@ -100,30 +92,4 @@ extern void __set_fixmap(enum fixed_addr
24475 #define FIXADDR_USER_START ((unsigned long)VSYSCALL32_VSYSCALL)
24476 #define FIXADDR_USER_END (FIXADDR_USER_START + PAGE_SIZE)
24477
24478 -#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
24479 -
24480 -extern void __this_fixmap_does_not_exist(void);
24481 -
24482 -/*
24483 - * 'index to address' translation. If anyone tries to use the idx
24484 - * directly without translation, we catch the bug with a NULL-deference
24485 - * kernel oops. Illegal ranges of incoming indices are caught too.
24486 - */
24487 -static __always_inline unsigned long fix_to_virt(const unsigned int idx)
24488 -{
24489 - /*
24490 - * this branch gets completely eliminated after inlining,
24491 - * except when someone tries to use fixaddr indices in an
24492 - * illegal way. (such as mixing up address types or using
24493 - * out-of-range indices).
24494 - *
24495 - * If it doesn't get removed, the linker will complain
24496 - * loudly with a reasonably clear error message..
24497 - */
24498 - if (idx >= __end_of_fixed_addresses)
24499 - __this_fixmap_does_not_exist();
24500 -
24501 - return __fix_to_virt(idx);
24502 -}
24503 -
24504 #endif
24505 --- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/highmem.h 2009-03-16 16:38:05.000000000 +0100
24506 +++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/highmem.h 2009-06-04 10:21:39.000000000 +0200
24507 @@ -74,6 +74,9 @@ struct page *kmap_atomic_to_page(void *p
24508
24509 #define flush_cache_kmaps() do { } while (0)
24510
24511 +extern void add_highpages_with_active_regions(int nid, unsigned long start_pfn,
24512 + unsigned long end_pfn);
24513 +
24514 void clear_highpage(struct page *);
24515 static inline void clear_user_highpage(struct page *page, unsigned long vaddr)
24516 {
24517 --- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/hypercall.h 2009-02-16 16:18:36.000000000 +0100
24518 +++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/hypercall.h 2009-06-04 10:21:39.000000000 +0200
24519 @@ -323,9 +323,19 @@ static inline int __must_check
24520 HYPERVISOR_grant_table_op(
24521 unsigned int cmd, void *uop, unsigned int count)
24522 {
24523 + bool fixup = false;
24524 + int rc;
24525 +
24526 if (arch_use_lazy_mmu_mode())
24527 xen_multicall_flush(false);
24528 - return _hypercall3(int, grant_table_op, cmd, uop, count);
24529 +#ifdef GNTTABOP_map_grant_ref
24530 + if (cmd == GNTTABOP_map_grant_ref)
24531 +#endif
24532 + fixup = gnttab_pre_map_adjust(cmd, uop, count);
24533 + rc = _hypercall3(int, grant_table_op, cmd, uop, count);
24534 + if (rc == 0 && fixup)
24535 + rc = gnttab_post_map_adjust(uop, count);
24536 + return rc;
24537 }
24538
24539 static inline int __must_check
24540 --- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/hypervisor.h 2009-03-16 16:33:40.000000000 +0100
24541 +++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/hypervisor.h 2009-06-04 10:21:39.000000000 +0200
24542 @@ -35,7 +35,6 @@
24543
24544 #include <linux/types.h>
24545 #include <linux/kernel.h>
24546 -#include <linux/version.h>
24547 #include <linux/errno.h>
24548 #include <xen/interface/xen.h>
24549 #include <xen/interface/platform.h>
24550 @@ -112,6 +111,8 @@ int xen_create_contiguous_region(
24551 unsigned long vstart, unsigned int order, unsigned int address_bits);
24552 void xen_destroy_contiguous_region(
24553 unsigned long vstart, unsigned int order);
24554 +int early_create_contiguous_region(unsigned long pfn, unsigned int order,
24555 + unsigned int address_bits);
24556
24557 struct page;
24558
24559 @@ -181,6 +182,29 @@ static inline void xen_multicall_flush(b
24560
24561 #endif /* CONFIG_XEN && !MODULE */
24562
24563 +#ifdef CONFIG_XEN
24564 +
24565 +struct gnttab_map_grant_ref;
24566 +bool gnttab_pre_map_adjust(unsigned int cmd, struct gnttab_map_grant_ref *,
24567 + unsigned int count);
24568 +#if CONFIG_XEN_COMPAT < 0x030400
24569 +int gnttab_post_map_adjust(const struct gnttab_map_grant_ref *, unsigned int);
24570 +#else
24571 +static inline int gnttab_post_map_adjust(const struct gnttab_map_grant_ref *m,
24572 + unsigned int count)
24573 +{
24574 + BUG();
24575 + return -ENOSYS;
24576 +}
24577 +#endif
24578 +
24579 +#else /* !CONFIG_XEN */
24580 +
24581 +#define gnttab_pre_map_adjust(...) false
24582 +#define gnttab_post_map_adjust(...) ({ BUG(); -ENOSYS; })
24583 +
24584 +#endif /* CONFIG_XEN */
24585 +
24586 #if defined(CONFIG_X86_64)
24587 #define MULTI_UVMFLAGS_INDEX 2
24588 #define MULTI_UVMDOMID_INDEX 3
24589 --- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/io.h 2009-03-16 16:38:05.000000000 +0100
24590 +++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/io.h 2009-06-04 10:21:39.000000000 +0200
24591 @@ -3,20 +3,139 @@
24592
24593 #define ARCH_HAS_IOREMAP_WC
24594
24595 +#include <linux/compiler.h>
24596 +
24597 +/*
24598 + * early_ioremap() and early_iounmap() are for temporary early boot-time
24599 + * mappings, before the real ioremap() is functional.
24600 + * A boot-time mapping is currently limited to at most 16 pages.
24601 + */
24602 +#ifndef __ASSEMBLY__
24603 +extern void early_ioremap_init(void);
24604 +extern void early_ioremap_clear(void);
24605 +extern void early_ioremap_reset(void);
24606 +extern void *early_ioremap(unsigned long offset, unsigned long size);
24607 +extern void early_iounmap(void *addr, unsigned long size);
24608 +extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
24609 +#endif
24610 +
24611 +#define build_mmio_read(name, size, type, reg, barrier) \
24612 +static inline type name(const volatile void __iomem *addr) \
24613 +{ type ret; asm volatile("mov" size " %1,%0":reg (ret) \
24614 +:"m" (*(volatile type __force *)addr) barrier); return ret; }
24615 +
24616 +#define build_mmio_write(name, size, type, reg, barrier) \
24617 +static inline void name(type val, volatile void __iomem *addr) \
24618 +{ asm volatile("mov" size " %0,%1": :reg (val), \
24619 +"m" (*(volatile type __force *)addr) barrier); }
24620 +
24621 +build_mmio_read(readb, "b", unsigned char, "=q", :"memory")
24622 +build_mmio_read(readw, "w", unsigned short, "=r", :"memory")
24623 +build_mmio_read(readl, "l", unsigned int, "=r", :"memory")
24624 +
24625 +build_mmio_read(__readb, "b", unsigned char, "=q", )
24626 +build_mmio_read(__readw, "w", unsigned short, "=r", )
24627 +build_mmio_read(__readl, "l", unsigned int, "=r", )
24628 +
24629 +build_mmio_write(writeb, "b", unsigned char, "q", :"memory")
24630 +build_mmio_write(writew, "w", unsigned short, "r", :"memory")
24631 +build_mmio_write(writel, "l", unsigned int, "r", :"memory")
24632 +
24633 +build_mmio_write(__writeb, "b", unsigned char, "q", )
24634 +build_mmio_write(__writew, "w", unsigned short, "r", )
24635 +build_mmio_write(__writel, "l", unsigned int, "r", )
24636 +
24637 +#define readb_relaxed(a) __readb(a)
24638 +#define readw_relaxed(a) __readw(a)
24639 +#define readl_relaxed(a) __readl(a)
24640 +#define __raw_readb __readb
24641 +#define __raw_readw __readw
24642 +#define __raw_readl __readl
24643 +
24644 +#define __raw_writeb __writeb
24645 +#define __raw_writew __writew
24646 +#define __raw_writel __writel
24647 +
24648 +#define mmiowb() barrier()
24649 +
24650 +#ifdef CONFIG_X86_64
24651 +build_mmio_read(readq, "q", unsigned long, "=r", :"memory")
24652 +build_mmio_read(__readq, "q", unsigned long, "=r", )
24653 +build_mmio_write(writeq, "q", unsigned long, "r", :"memory")
24654 +build_mmio_write(__writeq, "q", unsigned long, "r", )
24655 +
24656 +#define readq_relaxed(a) __readq(a)
24657 +#define __raw_readq __readq
24658 +#define __raw_writeq writeq
24659 +
24660 +/* Let people know we have them */
24661 +#define readq readq
24662 +#define writeq writeq
24663 +#endif
24664 +
24665 +#define native_io_delay xen_io_delay
24666 +
24667 #ifdef CONFIG_X86_32
24668 -# include "io_32.h"
24669 +# include "../../io_32.h"
24670 #else
24671 -# include "io_64.h"
24672 +# include "../../io_64.h"
24673 +#endif
24674 +
24675 +#if defined(__KERNEL__) && !defined(__ASSEMBLY__)
24676 +
24677 +/* We will be supplying our own /dev/mem implementation */
24678 +#define ARCH_HAS_DEV_MEM
24679 +
24680 +#define page_to_pseudophys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT)
24681 +#undef page_to_phys
24682 +#define page_to_phys(page) (phys_to_machine(page_to_pseudophys(page)))
24683 +#define page_to_bus(page) (phys_to_machine(page_to_pseudophys(page)))
24684 +
24685 +#define bvec_to_pseudophys(bv) (page_to_pseudophys((bv)->bv_page) + \
24686 + (unsigned long) (bv)->bv_offset)
24687 +
24688 +#define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \
24689 + (bvec_to_phys(vec1) + (vec1)->bv_len == bvec_to_phys(vec2) \
24690 + && bvec_to_pseudophys(vec1) + (vec1)->bv_len \
24691 + == bvec_to_pseudophys(vec2))
24692 +
24693 +#undef virt_to_bus
24694 +#undef bus_to_virt
24695 +#define virt_to_bus(_x) phys_to_machine(__pa(_x))
24696 +#define bus_to_virt(_x) __va(machine_to_phys(_x))
24697 +
24698 +#include <asm/fixmap.h>
24699 +
24700 +#undef isa_virt_to_bus
24701 +#undef isa_page_to_bus
24702 +#undef isa_bus_to_virt
24703 +#define isa_virt_to_bus(_x) ({ BUG(); virt_to_bus(_x); })
24704 +#define isa_page_to_bus(_x) isa_page_to_bus_is_UNSUPPORTED->_x
24705 +#define isa_bus_to_virt(_x) ((void *)__fix_to_virt(FIX_ISAMAP_BEGIN) + (_x))
24706 +
24707 +#undef __ISA_IO_base
24708 +#define __ISA_IO_base ((char __iomem *)fix_to_virt(FIX_ISAMAP_BEGIN))
24709 +
24710 #endif
24711
24712 extern void *xlate_dev_mem_ptr(unsigned long phys);
24713 extern void unxlate_dev_mem_ptr(unsigned long phys, void *addr);
24714
24715 -extern void map_devmem(unsigned long pfn, unsigned long len, pgprot_t);
24716 -extern void unmap_devmem(unsigned long pfn, unsigned long len, pgprot_t);
24717 -
24718 extern int ioremap_check_change_attr(unsigned long mfn, unsigned long size,
24719 unsigned long prot_val);
24720 extern void __iomem *ioremap_wc(unsigned long offset, unsigned long size);
24721
24722 +/*
24723 + * early_ioremap() and early_iounmap() are for temporary early boot-time
24724 + * mappings, before the real ioremap() is functional.
24725 + * A boot-time mapping is currently limited to at most 16 pages.
24726 + */
24727 +extern void early_ioremap_init(void);
24728 +extern void early_ioremap_clear(void);
24729 +extern void early_ioremap_reset(void);
24730 +extern void *early_ioremap(unsigned long offset, unsigned long size);
24731 +extern void early_iounmap(void *addr, unsigned long size);
24732 +extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
24733 +
24734 +
24735 #endif /* _ASM_X86_IO_H */
24736 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
24737 +++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/irq_vectors.h 2009-06-04 10:21:39.000000000 +0200
24738 @@ -0,0 +1,52 @@
24739 +#ifndef _ASM_IRQ_VECTORS_H
24740 +#define _ASM_IRQ_VECTORS_H
24741 +
24742 +#ifdef CONFIG_X86_32
24743 +# define SYSCALL_VECTOR 0x80
24744 +#else
24745 +# define IA32_SYSCALL_VECTOR 0x80
24746 +#endif
24747 +
24748 +#define RESCHEDULE_VECTOR 0
24749 +#define CALL_FUNCTION_VECTOR 1
24750 +#define CALL_FUNC_SINGLE_VECTOR 2
24751 +#define SPIN_UNLOCK_VECTOR 3
24752 +#define NR_IPIS 4
24753 +
24754 +/*
24755 + * The maximum number of vectors supported by i386 processors
24756 + * is limited to 256. For processors other than i386, NR_VECTORS
24757 + * should be changed accordingly.
24758 + */
24759 +#define NR_VECTORS 256
24760 +
24761 +#define FIRST_VM86_IRQ 3
24762 +#define LAST_VM86_IRQ 15
24763 +#define invalid_vm86_irq(irq) ((irq) < 3 || (irq) > 15)
24764 +
24765 +/*
24766 + * The flat IRQ space is divided into two regions:
24767 + * 1. A one-to-one mapping of real physical IRQs. This space is only used
24768 + * if we have physical device-access privilege. This region is at the
24769 + * start of the IRQ space so that existing device drivers do not need
24770 + * to be modified to translate physical IRQ numbers into our IRQ space.
24771 + * 3. A dynamic mapping of inter-domain and Xen-sourced virtual IRQs. These
24772 + * are bound using the provided bind/unbind functions.
24773 + */
24774 +
24775 +#define PIRQ_BASE 0
24776 +#if defined(NR_CPUS) && defined(MAX_IO_APICS)
24777 +# if NR_CPUS < MAX_IO_APICS
24778 +# define NR_PIRQS (NR_VECTORS + 32 * NR_CPUS)
24779 +# else
24780 +# define NR_PIRQS (NR_VECTORS + 32 * MAX_IO_APICS)
24781 +# endif
24782 +#endif
24783 +
24784 +#define DYNIRQ_BASE (PIRQ_BASE + NR_PIRQS)
24785 +#define NR_DYNIRQS 256
24786 +
24787 +#define NR_IRQS (NR_PIRQS + NR_DYNIRQS)
24788 +#define NR_IRQ_VECTORS NR_IRQS
24789 +
24790 +#endif /* _ASM_IRQ_VECTORS_H */
24791 --- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/irqflags.h 2009-03-16 16:38:05.000000000 +0100
24792 +++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/irqflags.h 2009-06-04 10:21:39.000000000 +0200
24793 @@ -118,7 +118,7 @@ static inline void halt(void)
24794
24795 #ifndef CONFIG_X86_64
24796 #define INTERRUPT_RETURN iret
24797 -#define ENABLE_INTERRUPTS_SYSCALL_RET __ENABLE_INTERRUPTS ; \
24798 +#define ENABLE_INTERRUPTS_SYSEXIT __ENABLE_INTERRUPTS ; \
24799 sysexit_scrit: /**** START OF SYSEXIT CRITICAL REGION ****/ ; \
24800 __TEST_PENDING ; \
24801 jnz 14f /* process more events if necessary... */ ; \
24802 @@ -175,18 +175,6 @@ static inline void trace_hardirqs_fixup_
24803 #else
24804
24805 #ifdef CONFIG_X86_64
24806 -/*
24807 - * Currently paravirt can't handle swapgs nicely when we
24808 - * don't have a stack we can rely on (such as a user space
24809 - * stack). So we either find a way around these or just fault
24810 - * and emulate if a guest tries to call swapgs directly.
24811 - *
24812 - * Either way, this is a good way to document that we don't
24813 - * have a reliable stack. x86_64 only.
24814 - */
24815 -#define SWAPGS_UNSAFE_STACK swapgs
24816 -#define ARCH_TRACE_IRQS_ON call trace_hardirqs_on_thunk
24817 -#define ARCH_TRACE_IRQS_OFF call trace_hardirqs_off_thunk
24818 #define ARCH_LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk
24819 #define ARCH_LOCKDEP_SYS_EXIT_IRQ \
24820 TRACE_IRQS_ON; \
24821 @@ -198,24 +186,6 @@ static inline void trace_hardirqs_fixup_
24822 TRACE_IRQS_OFF;
24823
24824 #else
24825 -#define ARCH_TRACE_IRQS_ON \
24826 - pushl %eax; \
24827 - pushl %ecx; \
24828 - pushl %edx; \
24829 - call trace_hardirqs_on; \
24830 - popl %edx; \
24831 - popl %ecx; \
24832 - popl %eax;
24833 -
24834 -#define ARCH_TRACE_IRQS_OFF \
24835 - pushl %eax; \
24836 - pushl %ecx; \
24837 - pushl %edx; \
24838 - call trace_hardirqs_off; \
24839 - popl %edx; \
24840 - popl %ecx; \
24841 - popl %eax;
24842 -
24843 #define ARCH_LOCKDEP_SYS_EXIT \
24844 pushl %eax; \
24845 pushl %ecx; \
24846 @@ -229,8 +199,8 @@ static inline void trace_hardirqs_fixup_
24847 #endif
24848
24849 #ifdef CONFIG_TRACE_IRQFLAGS
24850 -# define TRACE_IRQS_ON ARCH_TRACE_IRQS_ON
24851 -# define TRACE_IRQS_OFF ARCH_TRACE_IRQS_OFF
24852 +# define TRACE_IRQS_ON call trace_hardirqs_on_thunk;
24853 +# define TRACE_IRQS_OFF call trace_hardirqs_off_thunk;
24854 #else
24855 # define TRACE_IRQS_ON
24856 # define TRACE_IRQS_OFF
24857 --- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/mmu_context.h 2009-02-16 16:18:36.000000000 +0100
24858 +++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/mmu_context.h 2009-06-04 10:21:39.000000000 +0200
24859 @@ -1,5 +1,42 @@
24860 +#ifndef __ASM_X86_MMU_CONTEXT_H
24861 +#define __ASM_X86_MMU_CONTEXT_H
24862 +
24863 +#include <asm/desc.h>
24864 +#include <asm/atomic.h>
24865 +#include <asm/pgalloc.h>
24866 +#include <asm/tlbflush.h>
24867 +
24868 +void arch_exit_mmap(struct mm_struct *mm);
24869 +void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
24870 +
24871 +void mm_pin(struct mm_struct *mm);
24872 +void mm_unpin(struct mm_struct *mm);
24873 +void mm_pin_all(void);
24874 +
24875 +static inline void xen_activate_mm(struct mm_struct *prev,
24876 + struct mm_struct *next)
24877 +{
24878 + if (!PagePinned(virt_to_page(next->pgd)))
24879 + mm_pin(next);
24880 +}
24881 +
24882 +/*
24883 + * Used for LDT copy/destruction.
24884 + */
24885 +int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
24886 +void destroy_context(struct mm_struct *mm);
24887 +
24888 #ifdef CONFIG_X86_32
24889 # include "mmu_context_32.h"
24890 #else
24891 # include "mmu_context_64.h"
24892 #endif
24893 +
24894 +#define activate_mm(prev, next) \
24895 +do { \
24896 + xen_activate_mm(prev, next); \
24897 + switch_mm((prev), (next), NULL); \
24898 +} while (0);
24899 +
24900 +
24901 +#endif /* __ASM_X86_MMU_CONTEXT_H */
24902 --- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/mmu_context_32.h 2009-03-16 16:38:05.000000000 +0100
24903 +++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/mmu_context_32.h 2009-06-04 10:21:39.000000000 +0200
24904 @@ -1,32 +1,6 @@
24905 #ifndef __I386_SCHED_H
24906 #define __I386_SCHED_H
24907
24908 -#include <asm/desc.h>
24909 -#include <asm/atomic.h>
24910 -#include <asm/pgalloc.h>
24911 -#include <asm/tlbflush.h>
24912 -
24913 -void arch_exit_mmap(struct mm_struct *mm);
24914 -void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
24915 -
24916 -void mm_pin(struct mm_struct *mm);
24917 -void mm_unpin(struct mm_struct *mm);
24918 -void mm_pin_all(void);
24919 -
24920 -static inline void xen_activate_mm(struct mm_struct *prev,
24921 - struct mm_struct *next)
24922 -{
24923 - if (!PagePinned(virt_to_page(next->pgd)))
24924 - mm_pin(next);
24925 -}
24926 -
24927 -/*
24928 - * Used for LDT copy/destruction.
24929 - */
24930 -int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
24931 -void destroy_context(struct mm_struct *mm);
24932 -
24933 -
24934 static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
24935 {
24936 #if 0 /* XEN: no lazy tlb */
24937 @@ -107,10 +81,4 @@ static inline void switch_mm(struct mm_s
24938 #define deactivate_mm(tsk, mm) \
24939 asm("movl %0,%%gs": :"r" (0));
24940
24941 -#define activate_mm(prev, next) \
24942 -do { \
24943 - xen_activate_mm(prev, next); \
24944 - switch_mm((prev), (next), NULL); \
24945 -} while (0)
24946 -
24947 #endif
24948 --- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/mmu_context_64.h 2009-03-16 16:38:05.000000000 +0100
24949 +++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/mmu_context_64.h 2009-06-04 10:21:39.000000000 +0200
24950 @@ -1,23 +1,6 @@
24951 #ifndef __X86_64_MMU_CONTEXT_H
24952 #define __X86_64_MMU_CONTEXT_H
24953
24954 -#include <asm/desc.h>
24955 -#include <asm/atomic.h>
24956 -#include <asm/pgalloc.h>
24957 -#include <asm/page.h>
24958 -#include <asm/pda.h>
24959 -#include <asm/pgtable.h>
24960 -#include <asm/tlbflush.h>
24961 -
24962 -void arch_exit_mmap(struct mm_struct *mm);
24963 -void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
24964 -
24965 -/*
24966 - * possibly do the LDT unload here?
24967 - */
24968 -int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
24969 -void destroy_context(struct mm_struct *mm);
24970 -
24971 static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
24972 {
24973 #if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
24974 @@ -58,10 +41,6 @@ static inline void __prepare_arch_switch
24975 }
24976 }
24977
24978 -extern void mm_pin(struct mm_struct *mm);
24979 -extern void mm_unpin(struct mm_struct *mm);
24980 -void mm_pin_all(void);
24981 -
24982 static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
24983 struct task_struct *tsk)
24984 {
24985 @@ -124,11 +103,4 @@ do { \
24986 asm volatile("movl %0,%%fs"::"r"(0)); \
24987 } while (0)
24988
24989 -static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next)
24990 -{
24991 - if (!PagePinned(virt_to_page(next->pgd)))
24992 - mm_pin(next);
24993 - switch_mm(prev, next, NULL);
24994 -}
24995 -
24996 #endif
24997 --- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/page.h 2009-03-16 16:38:05.000000000 +0100
24998 +++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/page.h 2009-06-04 10:21:39.000000000 +0200
24999 @@ -16,9 +16,9 @@
25000 * below. The preprocessor will warn if the two definitions aren't identical.
25001 */
25002 #define _PAGE_BIT_PRESENT 0
25003 -#define _PAGE_PRESENT (_AC(1, L)<<_PAGE_BIT_PRESENT)
25004 -#define _PAGE_BIT_IO 9
25005 -#define _PAGE_IO (_AC(1, L)<<_PAGE_BIT_IO)
25006 +#define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT)
25007 +#define _PAGE_BIT_IO 11
25008 +#define _PAGE_IO (_AT(pteval_t, 1) << _PAGE_BIT_IO)
25009
25010 #define __PHYSICAL_MASK ((phys_addr_t)(1ULL << __PHYSICAL_MASK_SHIFT) - 1)
25011 #define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1)
25012 @@ -28,8 +28,11 @@
25013 (ie, 32-bit PAE). */
25014 #define PHYSICAL_PAGE_MASK (((signed long)PAGE_MASK) & __PHYSICAL_MASK)
25015
25016 -/* PTE_MASK extracts the PFN from a (pte|pmd|pud|pgd)val_t */
25017 -#define PTE_MASK ((pteval_t)PHYSICAL_PAGE_MASK)
25018 +/* PTE_PFN_MASK extracts the PFN from a (pte|pmd|pud|pgd)val_t */
25019 +#define PTE_PFN_MASK ((pteval_t)PHYSICAL_PAGE_MASK)
25020 +
25021 +/* PTE_FLAGS_MASK extracts the flags from a (pte|pmd|pud|pgd)val_t */
25022 +#define PTE_FLAGS_MASK (~PTE_PFN_MASK)
25023
25024 #define PMD_PAGE_SIZE (_AC(1, UL) << PMD_SHIFT)
25025 #define PMD_PAGE_MASK (~(PMD_PAGE_SIZE-1))
25026 @@ -39,8 +42,7 @@
25027 #define HPAGE_MASK (~(HPAGE_SIZE - 1))
25028 #define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
25029
25030 -/* to align the pointer to the (next) page boundary */
25031 -#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK)
25032 +#define HUGE_MAX_HSTATE 2
25033
25034 #ifndef __ASSEMBLY__
25035 #include <linux/types.h>
25036 @@ -61,9 +63,17 @@
25037
25038 #ifndef __ASSEMBLY__
25039
25040 +typedef struct { pgdval_t pgd; } pgd_t;
25041 +typedef struct { pgprotval_t pgprot; } pgprot_t;
25042 +
25043 extern int page_is_ram(unsigned long pagenr);
25044 extern int devmem_is_allowed(unsigned long pagenr);
25045 +extern void map_devmem(unsigned long pfn, unsigned long size,
25046 + pgprot_t vma_prot);
25047 +extern void unmap_devmem(unsigned long pfn, unsigned long size,
25048 + pgprot_t vma_prot);
25049
25050 +extern unsigned long max_low_pfn_mapped;
25051 extern unsigned long max_pfn_mapped;
25052
25053 struct page;
25054 @@ -84,15 +94,11 @@ static inline void copy_user_page(void *
25055 alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr)
25056 #define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
25057
25058 -typedef struct { pgprotval_t pgprot; } pgprot_t;
25059 -
25060 #define pgprot_val(x) ((x).pgprot)
25061 #define __pgprot(x) ((pgprot_t) { (x) } )
25062
25063 #include <asm/maddr.h>
25064
25065 -typedef struct { pgdval_t pgd; } pgd_t;
25066 -
25067 #define __pgd_ma(x) ((pgd_t) { (x) } )
25068 static inline pgd_t xen_make_pgd(pgdval_t val)
25069 {
25070 @@ -196,6 +202,11 @@ static inline pteval_t xen_pte_val(pte_t
25071 return ret;
25072 }
25073
25074 +static inline pteval_t xen_pte_flags(pte_t pte)
25075 +{
25076 + return __pte_val(pte) & PTE_FLAGS_MASK;
25077 +}
25078 +
25079 #define pgd_val(x) xen_pgd_val(x)
25080 #define __pgd(x) xen_make_pgd(x)
25081
25082 @@ -210,6 +221,7 @@ static inline pteval_t xen_pte_val(pte_t
25083 #endif
25084
25085 #define pte_val(x) xen_pte_val(x)
25086 +#define pte_flags(x) xen_pte_flags(x)
25087 #define __pte(x) xen_make_pte(x)
25088
25089 #define __pa(x) __phys_addr((unsigned long)(x))
25090 --- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/page_64.h 2009-03-16 16:38:05.000000000 +0100
25091 +++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/page_64.h 2009-06-04 10:21:39.000000000 +0200
25092 @@ -26,6 +26,12 @@
25093 #define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT)
25094 #define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1))
25095
25096 +/*
25097 + * Set __PAGE_OFFSET to the most negative possible address +
25098 + * PGDIR_SIZE*16 (pgd slot 272). The gap is to allow a space for a
25099 + * hypervisor to fit. Choosing 16 slots here is arbitrary, but it's
25100 + * what Xen requires.
25101 + */
25102 #define __PAGE_OFFSET _AC(0xffff880000000000, UL)
25103
25104 #define __PHYSICAL_START CONFIG_PHYSICAL_START
25105 @@ -63,7 +69,8 @@
25106 void clear_page(void *page);
25107 void copy_page(void *to, void *from);
25108
25109 -extern unsigned long end_pfn;
25110 +/* duplicated to the one in bootmem.h */
25111 +extern unsigned long max_pfn;
25112
25113 static inline unsigned long __phys_addr(unsigned long x)
25114 {
25115 @@ -91,6 +98,11 @@ typedef union { pteval_t pte; unsigned i
25116 extern unsigned long init_memory_mapping(unsigned long start,
25117 unsigned long end);
25118
25119 +extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn);
25120 +
25121 +extern void init_extra_mapping_uc(unsigned long phys, unsigned long size);
25122 +extern void init_extra_mapping_wb(unsigned long phys, unsigned long size);
25123 +
25124 #endif /* !__ASSEMBLY__ */
25125
25126 #ifdef CONFIG_FLATMEM
25127 --- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/pci.h 2009-03-16 16:38:05.000000000 +0100
25128 +++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/pci.h 2009-06-04 10:21:39.000000000 +0200
25129 @@ -21,6 +21,8 @@ struct pci_sysdata {
25130 #endif
25131 };
25132
25133 +extern int pci_routeirq;
25134 +
25135 /* scan a bus after allocating a pci_sysdata for it */
25136 extern struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops,
25137 int node);
25138 --- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/pci_32.h 2009-02-16 16:18:36.000000000 +0100
25139 +++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/pci_32.h 2009-06-04 10:21:39.000000000 +0200
25140 @@ -38,12 +38,14 @@ struct pci_dev;
25141 #define PCI_DMA_BUS_IS_PHYS (1)
25142
25143 /* pci_unmap_{page,single} is a nop so... */
25144 -#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME)
25145 -#define DECLARE_PCI_UNMAP_LEN(LEN_NAME)
25146 -#define pci_unmap_addr(PTR, ADDR_NAME) (0)
25147 -#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) do { } while (0)
25148 -#define pci_unmap_len(PTR, LEN_NAME) (0)
25149 -#define pci_unmap_len_set(PTR, LEN_NAME, VAL) do { } while (0)
25150 +#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) dma_addr_t ADDR_NAME[0];
25151 +#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) unsigned LEN_NAME[0];
25152 +#define pci_unmap_addr(PTR, ADDR_NAME) sizeof((PTR)->ADDR_NAME)
25153 +#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \
25154 + do { break; } while (pci_unmap_addr(PTR, ADDR_NAME))
25155 +#define pci_unmap_len(PTR, LEN_NAME) sizeof((PTR)->LEN_NAME)
25156 +#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \
25157 + do { break; } while (pci_unmap_len(PTR, LEN_NAME))
25158
25159 #endif
25160
25161 --- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/pgalloc.h 2009-03-16 16:38:05.000000000 +0100
25162 +++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/pgalloc.h 2009-06-04 10:21:39.000000000 +0200
25163 @@ -7,6 +7,9 @@
25164
25165 #include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */
25166
25167 +static inline int paravirt_pgd_alloc(struct mm_struct *mm) { return 0; }
25168 +static inline void paravirt_pgd_free(struct mm_struct *mm, pgd_t *pgd) {}
25169 +
25170 static inline void paravirt_alloc_pte(struct mm_struct *mm, unsigned long pfn) {}
25171 static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn) {}
25172 static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long clonepfn,
25173 --- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/pgtable.h 2009-03-16 16:38:05.000000000 +0100
25174 +++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/pgtable.h 2009-06-04 10:21:39.000000000 +0200
25175 @@ -13,11 +13,12 @@
25176 #define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */
25177 #define _PAGE_BIT_PAT 7 /* on 4KB pages */
25178 #define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
25179 -#define _PAGE_BIT_IO 9 /* Mapped page is I/O or foreign and
25180 +#define _PAGE_BIT_UNUSED1 9 /* available for programmer */
25181 +#define _PAGE_BIT_UNUSED2 10
25182 +#define _PAGE_BIT_IO 11 /* Mapped page is I/O or foreign and
25183 * has no associated page struct. */
25184 -#define _PAGE_BIT_UNUSED2 10 /* available for programmer */
25185 -#define _PAGE_BIT_UNUSED3 11
25186 #define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
25187 +#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1
25188 #define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
25189
25190 /* If _PAGE_BIT_PRESENT is clear, we use these: */
25191 @@ -28,34 +29,31 @@
25192 /* if the user mapped it with PROT_NONE; pte_present gives true */
25193 #define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL
25194
25195 -/*
25196 - * Note: we use _AC(1, L) instead of _AC(1, UL) so that we get a
25197 - * sign-extended value on 32-bit with all 1's in the upper word,
25198 - * which preserves the upper pte values on 64-bit ptes:
25199 - */
25200 -#define _PAGE_PRESENT (_AC(1, L)<<_PAGE_BIT_PRESENT)
25201 -#define _PAGE_RW (_AC(1, L)<<_PAGE_BIT_RW)
25202 -#define _PAGE_USER (_AC(1, L)<<_PAGE_BIT_USER)
25203 -#define _PAGE_PWT (_AC(1, L)<<_PAGE_BIT_PWT)
25204 -#define _PAGE_PCD (_AC(1, L)<<_PAGE_BIT_PCD)
25205 -#define _PAGE_ACCESSED (_AC(1, L)<<_PAGE_BIT_ACCESSED)
25206 -#define _PAGE_DIRTY (_AC(1, L)<<_PAGE_BIT_DIRTY)
25207 -#define _PAGE_PSE (_AC(1, L)<<_PAGE_BIT_PSE) /* 2MB page */
25208 -#define _PAGE_GLOBAL (_AC(1, L)<<_PAGE_BIT_GLOBAL) /* Global TLB entry */
25209 -#define _PAGE_IO (_AC(1, L)<<_PAGE_BIT_IO)
25210 -#define _PAGE_UNUSED2 (_AC(1, L)<<_PAGE_BIT_UNUSED2)
25211 -#define _PAGE_UNUSED3 (_AC(1, L)<<_PAGE_BIT_UNUSED3)
25212 -#define _PAGE_PAT (_AC(1, L)<<_PAGE_BIT_PAT)
25213 -#define _PAGE_PAT_LARGE (_AC(1, L)<<_PAGE_BIT_PAT_LARGE)
25214 +#define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT)
25215 +#define _PAGE_RW (_AT(pteval_t, 1) << _PAGE_BIT_RW)
25216 +#define _PAGE_USER (_AT(pteval_t, 1) << _PAGE_BIT_USER)
25217 +#define _PAGE_PWT (_AT(pteval_t, 1) << _PAGE_BIT_PWT)
25218 +#define _PAGE_PCD (_AT(pteval_t, 1) << _PAGE_BIT_PCD)
25219 +#define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED)
25220 +#define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
25221 +#define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE)
25222 +#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
25223 +#define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1)
25224 +#define _PAGE_UNUSED2 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED2)
25225 +#define _PAGE_IO (_AT(pteval_t, 1) << _PAGE_BIT_IO)
25226 +#define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
25227 +#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
25228 +#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
25229 +#define __HAVE_ARCH_PTE_SPECIAL
25230
25231 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
25232 -#define _PAGE_NX (_AC(1, ULL) << _PAGE_BIT_NX)
25233 +#define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX)
25234 #else
25235 -#define _PAGE_NX 0
25236 +#define _PAGE_NX (_AT(pteval_t, 0))
25237 #endif
25238
25239 -#define _PAGE_FILE (_AC(1, L)<<_PAGE_BIT_FILE)
25240 -#define _PAGE_PROTNONE (_AC(1, L)<<_PAGE_BIT_PROTNONE)
25241 +#define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE)
25242 +#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
25243
25244 #ifndef __ASSEMBLY__
25245 #if defined(CONFIG_X86_64) && CONFIG_XEN_COMPAT <= 0x030002
25246 @@ -71,8 +69,8 @@ extern unsigned int __kernel_page_user;
25247 _PAGE_DIRTY | __kernel_page_user)
25248
25249 /* Set of bits not changed in pte_modify */
25250 -#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_CACHE_MASK | _PAGE_IO | \
25251 - _PAGE_ACCESSED | _PAGE_DIRTY)
25252 +#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_CACHE_MASK | _PAGE_IO | \
25253 + _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY)
25254
25255 /*
25256 * PAT settings are part of the hypervisor interface, which sets the
25257 @@ -102,19 +100,9 @@ extern unsigned int __kernel_page_user;
25258 #define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
25259 _PAGE_ACCESSED)
25260
25261 -#ifdef CONFIG_X86_32
25262 -#define _PAGE_KERNEL_EXEC \
25263 - (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED)
25264 -#define _PAGE_KERNEL (_PAGE_KERNEL_EXEC | _PAGE_NX)
25265 -
25266 -#ifndef __ASSEMBLY__
25267 -extern pteval_t __PAGE_KERNEL, __PAGE_KERNEL_EXEC;
25268 -#endif /* __ASSEMBLY__ */
25269 -#else
25270 #define __PAGE_KERNEL_EXEC \
25271 (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | __kernel_page_user)
25272 #define __PAGE_KERNEL (__PAGE_KERNEL_EXEC | _PAGE_NX)
25273 -#endif
25274
25275 #define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW)
25276 #define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW)
25277 @@ -125,25 +113,22 @@ extern pteval_t __PAGE_KERNEL, __PAGE_KE
25278 #define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER)
25279 #define __PAGE_KERNEL_VSYSCALL_NOCACHE (__PAGE_KERNEL_VSYSCALL | _PAGE_PCD | _PAGE_PWT)
25280 #define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE)
25281 +#define __PAGE_KERNEL_LARGE_NOCACHE (__PAGE_KERNEL | _PAGE_CACHE_UC | _PAGE_PSE)
25282 #define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE)
25283
25284 -/*
25285 - * We don't support GLOBAL page in xenolinux64
25286 - */
25287 -#define MAKE_GLOBAL(x) __pgprot((x))
25288 -
25289 -#define PAGE_KERNEL MAKE_GLOBAL(__PAGE_KERNEL)
25290 -#define PAGE_KERNEL_RO MAKE_GLOBAL(__PAGE_KERNEL_RO)
25291 -#define PAGE_KERNEL_EXEC MAKE_GLOBAL(__PAGE_KERNEL_EXEC)
25292 -#define PAGE_KERNEL_RX MAKE_GLOBAL(__PAGE_KERNEL_RX)
25293 -#define PAGE_KERNEL_WC MAKE_GLOBAL(__PAGE_KERNEL_WC)
25294 -#define PAGE_KERNEL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_NOCACHE)
25295 -#define PAGE_KERNEL_UC_MINUS MAKE_GLOBAL(__PAGE_KERNEL_UC_MINUS)
25296 -#define PAGE_KERNEL_EXEC_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_EXEC_NOCACHE)
25297 -#define PAGE_KERNEL_LARGE MAKE_GLOBAL(__PAGE_KERNEL_LARGE)
25298 -#define PAGE_KERNEL_LARGE_EXEC MAKE_GLOBAL(__PAGE_KERNEL_LARGE_EXEC)
25299 -#define PAGE_KERNEL_VSYSCALL MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL)
25300 -#define PAGE_KERNEL_VSYSCALL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL_NOCACHE)
25301 +#define PAGE_KERNEL __pgprot(__PAGE_KERNEL)
25302 +#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO)
25303 +#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC)
25304 +#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX)
25305 +#define PAGE_KERNEL_WC __pgprot(__PAGE_KERNEL_WC)
25306 +#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE)
25307 +#define PAGE_KERNEL_UC_MINUS __pgprot(__PAGE_KERNEL_UC_MINUS)
25308 +#define PAGE_KERNEL_EXEC_NOCACHE __pgprot(__PAGE_KERNEL_EXEC_NOCACHE)
25309 +#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE)
25310 +#define PAGE_KERNEL_LARGE_NOCACHE __pgprot(__PAGE_KERNEL_LARGE_NOCACHE)
25311 +#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC)
25312 +#define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL)
25313 +#define PAGE_KERNEL_VSYSCALL_NOCACHE __pgprot(__PAGE_KERNEL_VSYSCALL_NOCACHE)
25314
25315 /* xwr */
25316 #define __P000 PAGE_NONE
25317 @@ -182,27 +167,27 @@ extern struct list_head pgd_list;
25318 */
25319 static inline int pte_dirty(pte_t pte)
25320 {
25321 - return __pte_val(pte) & _PAGE_DIRTY;
25322 + return pte_flags(pte) & _PAGE_DIRTY;
25323 }
25324
25325 static inline int pte_young(pte_t pte)
25326 {
25327 - return __pte_val(pte) & _PAGE_ACCESSED;
25328 + return pte_flags(pte) & _PAGE_ACCESSED;
25329 }
25330
25331 static inline int pte_write(pte_t pte)
25332 {
25333 - return __pte_val(pte) & _PAGE_RW;
25334 + return pte_flags(pte) & _PAGE_RW;
25335 }
25336
25337 static inline int pte_file(pte_t pte)
25338 {
25339 - return __pte_val(pte) & _PAGE_FILE;
25340 + return pte_flags(pte) & _PAGE_FILE;
25341 }
25342
25343 static inline int pte_huge(pte_t pte)
25344 {
25345 - return __pte_val(pte) & _PAGE_PSE;
25346 + return pte_flags(pte) & _PAGE_PSE;
25347 }
25348
25349 static inline int pte_global(pte_t pte)
25350 @@ -212,12 +197,12 @@ static inline int pte_global(pte_t pte)
25351
25352 static inline int pte_exec(pte_t pte)
25353 {
25354 - return !(__pte_val(pte) & _PAGE_NX);
25355 + return !(pte_flags(pte) & _PAGE_NX);
25356 }
25357
25358 static inline int pte_special(pte_t pte)
25359 {
25360 - return 0;
25361 + return pte_flags(pte) & _PAGE_SPECIAL;
25362 }
25363
25364 static inline int pmd_large(pmd_t pte)
25365 @@ -228,22 +213,22 @@ static inline int pmd_large(pmd_t pte)
25366
25367 static inline pte_t pte_mkclean(pte_t pte)
25368 {
25369 - return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_DIRTY);
25370 + return __pte_ma(__pte_val(pte) & ~_PAGE_DIRTY);
25371 }
25372
25373 static inline pte_t pte_mkold(pte_t pte)
25374 {
25375 - return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_ACCESSED);
25376 + return __pte_ma(__pte_val(pte) & ~_PAGE_ACCESSED);
25377 }
25378
25379 static inline pte_t pte_wrprotect(pte_t pte)
25380 {
25381 - return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_RW);
25382 + return __pte_ma(__pte_val(pte) & ~_PAGE_RW);
25383 }
25384
25385 static inline pte_t pte_mkexec(pte_t pte)
25386 {
25387 - return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_NX);
25388 + return __pte_ma(__pte_val(pte) & ~_PAGE_NX);
25389 }
25390
25391 static inline pte_t pte_mkdirty(pte_t pte)
25392 @@ -268,7 +253,7 @@ static inline pte_t pte_mkhuge(pte_t pte
25393
25394 static inline pte_t pte_clrhuge(pte_t pte)
25395 {
25396 - return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_PSE);
25397 + return __pte_ma(__pte_val(pte) & ~_PAGE_PSE);
25398 }
25399
25400 static inline pte_t pte_mkglobal(pte_t pte)
25401 @@ -283,35 +268,46 @@ static inline pte_t pte_clrglobal(pte_t
25402
25403 static inline pte_t pte_mkspecial(pte_t pte)
25404 {
25405 - return pte;
25406 + return __pte_ma(__pte_val(pte) | _PAGE_SPECIAL);
25407 }
25408
25409 extern pteval_t __supported_pte_mask;
25410
25411 static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
25412 {
25413 - return __pte((((phys_addr_t)page_nr << PAGE_SHIFT) |
25414 - pgprot_val(pgprot)) & __supported_pte_mask);
25415 + pgprotval_t prot = pgprot_val(pgprot);
25416 +
25417 + if (prot & _PAGE_PRESENT)
25418 + prot &= __supported_pte_mask;
25419 + return __pte(((phys_addr_t)page_nr << PAGE_SHIFT) | prot);
25420 }
25421
25422 static inline pte_t pfn_pte_ma(unsigned long page_nr, pgprot_t pgprot)
25423 {
25424 - return __pte_ma((((phys_addr_t)page_nr << PAGE_SHIFT) |
25425 - pgprot_val(pgprot)) & __supported_pte_mask);
25426 + pgprotval_t prot = pgprot_val(pgprot);
25427 +
25428 + if (prot & _PAGE_PRESENT)
25429 + prot &= __supported_pte_mask;
25430 + return __pte_ma(((phys_addr_t)page_nr << PAGE_SHIFT) | prot);
25431 }
25432
25433 static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
25434 {
25435 - return __pmd((((phys_addr_t)page_nr << PAGE_SHIFT) |
25436 - pgprot_val(pgprot)) & __supported_pte_mask);
25437 + pgprotval_t prot = pgprot_val(pgprot);
25438 +
25439 + if (prot & _PAGE_PRESENT)
25440 + prot &= __supported_pte_mask;
25441 + return __pmd(((phys_addr_t)page_nr << PAGE_SHIFT) | prot);
25442 }
25443
25444 static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
25445 {
25446 - pteval_t val = pte_val(pte);
25447 + pgprotval_t prot = pgprot_val(newprot);
25448 + pteval_t val = pte_val(pte) & _PAGE_CHG_MASK;
25449
25450 - val &= _PAGE_CHG_MASK;
25451 - val |= pgprot_val(newprot) & (~_PAGE_CHG_MASK) & __supported_pte_mask;
25452 + if (prot & _PAGE_PRESENT)
25453 + prot &= __supported_pte_mask;
25454 + val |= prot & ~_PAGE_CHG_MASK;
25455
25456 return __pte(val);
25457 }
25458 @@ -325,9 +321,11 @@ static inline pgprot_t pgprot_modify(pgp
25459 return __pgprot(preservebits | addbits);
25460 }
25461
25462 -#define pte_pgprot(x) __pgprot(__pte_val(x) & ~PTE_MASK)
25463 +#define pte_pgprot(x) __pgprot(pte_flags(x) & PTE_FLAGS_MASK)
25464
25465 -#define canon_pgprot(p) __pgprot(pgprot_val(p) & __supported_pte_mask)
25466 +#define canon_pgprot(p) __pgprot(pgprot_val(p) & _PAGE_PRESENT \
25467 + ? pgprot_val(p) & __supported_pte_mask \
25468 + : pgprot_val(p))
25469
25470 #ifndef __ASSEMBLY__
25471 #define __HAVE_PHYS_MEM_ACCESS_PROT
25472 @@ -338,6 +336,17 @@ int phys_mem_access_prot_allowed(struct
25473 unsigned long size, pgprot_t *vma_prot);
25474 #endif
25475
25476 +/* Install a pte for a particular vaddr in kernel space. */
25477 +void set_pte_vaddr(unsigned long vaddr, pte_t pte);
25478 +
25479 +#ifndef CONFIG_XEN
25480 +extern void native_pagetable_setup_start(pgd_t *base);
25481 +extern void native_pagetable_setup_done(pgd_t *base);
25482 +#else
25483 +static inline void xen_pagetable_setup_start(pgd_t *base) {}
25484 +static inline void xen_pagetable_setup_done(pgd_t *base) {}
25485 +#endif
25486 +
25487 #define set_pte(ptep, pte) xen_set_pte(ptep, pte)
25488 #define set_pte_at(mm, addr, ptep, pte) xen_set_pte_at(mm, addr, ptep, pte)
25489
25490 @@ -373,6 +382,26 @@ int phys_mem_access_prot_allowed(struct
25491 # include "pgtable_64.h"
25492 #endif
25493
25494 +/*
25495 + * the pgd page can be thought of an array like this: pgd_t[PTRS_PER_PGD]
25496 + *
25497 + * this macro returns the index of the entry in the pgd page which would
25498 + * control the given virtual address
25499 + */
25500 +#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
25501 +
25502 +/*
25503 + * pgd_offset() returns a (pgd_t *)
25504 + * pgd_index() is used get the offset into the pgd page's array of pgd_t's;
25505 + */
25506 +#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address)))
25507 +/*
25508 + * a shortcut which implies the use of the kernel's pgd, instead
25509 + * of a process's
25510 + */
25511 +#define pgd_offset_k(address) pgd_offset(&init_mm, (address))
25512 +
25513 +
25514 #define KERNEL_PGD_BOUNDARY pgd_index(PAGE_OFFSET)
25515 #define KERNEL_PGD_PTRS (PTRS_PER_PGD - KERNEL_PGD_BOUNDARY)
25516
25517 @@ -383,8 +412,15 @@ enum {
25518 PG_LEVEL_4K,
25519 PG_LEVEL_2M,
25520 PG_LEVEL_1G,
25521 + PG_LEVEL_NUM
25522 };
25523
25524 +#ifdef CONFIG_PROC_FS
25525 +extern void update_page_count(int level, unsigned long pages);
25526 +#else
25527 +static inline void update_page_count(int level, unsigned long pages) { }
25528 +#endif
25529 +
25530 /*
25531 * Helper function that returns the kernel pagetable entry controlling
25532 * the virtual address 'address'. NULL means no pagetable entry present.
25533 @@ -441,6 +477,8 @@ static inline void xen_pte_clear(struct
25534 * race with other CPU's that might be updating the dirty
25535 * bit at the same time.
25536 */
25537 +struct vm_area_struct;
25538 +
25539 #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
25540 extern int ptep_set_access_flags(struct vm_area_struct *vma,
25541 unsigned long address, pte_t *ptep,
25542 @@ -523,9 +561,6 @@ static inline void clone_pgd_range(pgd_t
25543 memcpy(dst, src, count * sizeof(pgd_t));
25544 }
25545
25546 -#define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) \
25547 - xen_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable)
25548 -
25549 #define arbitrary_virt_to_machine(va) \
25550 ({ \
25551 unsigned int __lvl; \
25552 @@ -548,6 +583,34 @@ struct page *kmap_atomic_to_page(void *)
25553 #define ptep_to_machine(ptep) virt_to_machine(ptep)
25554 #endif
25555
25556 +#define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
25557 +static inline pte_t ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr,
25558 + pte_t *ptep)
25559 +{
25560 +#if CONFIG_XEN_COMPAT < 0x030300
25561 + if (unlikely(!xen_feature(XENFEAT_mmu_pt_update_preserve_ad)))
25562 + return ptep_get_and_clear(mm, addr, ptep);
25563 +#endif
25564 + return *ptep;
25565 +}
25566 +
25567 +static inline void ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
25568 + pte_t *ptep, pte_t pte)
25569 +{
25570 + mmu_update_t u;
25571 +
25572 +#if CONFIG_XEN_COMPAT < 0x030300
25573 + if (unlikely(!xen_feature(XENFEAT_mmu_pt_update_preserve_ad))) {
25574 + set_pte_at(mm, addr, ptep, pte);
25575 + return;
25576 + }
25577 +#endif
25578 + u.ptr = ptep_to_machine(ptep) | MMU_PT_UPDATE_PRESERVE_AD;
25579 + u.val = __pte_val(pte);
25580 + if (HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF))
25581 + BUG();
25582 +}
25583 +
25584 #include <asm-generic/pgtable.h>
25585
25586 #include <xen/features.h>
25587 @@ -576,10 +639,6 @@ int touch_pte_range(struct mm_struct *mm
25588 unsigned long address,
25589 unsigned long size);
25590
25591 -int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
25592 - unsigned long addr, unsigned long end, pgprot_t newprot,
25593 - int dirty_accountable);
25594 -
25595 #endif /* __ASSEMBLY__ */
25596
25597 #endif /* _ASM_X86_PGTABLE_H */
25598 --- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/pgtable-3level.h 2009-03-16 16:38:05.000000000 +0100
25599 +++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/pgtable-3level.h 2009-06-04 10:21:39.000000000 +0200
25600 @@ -14,11 +14,11 @@
25601 #define pmd_ERROR(e) \
25602 printk("%s:%d: bad pmd %p(%016Lx pfn %08Lx).\n", \
25603 __FILE__, __LINE__, &(e), __pmd_val(e), \
25604 - (pmd_val(e) & PTE_MASK) >> PAGE_SHIFT)
25605 + (pmd_val(e) & PTE_PFN_MASK) >> PAGE_SHIFT)
25606 #define pgd_ERROR(e) \
25607 printk("%s:%d: bad pgd %p(%016Lx pfn %08Lx).\n", \
25608 __FILE__, __LINE__, &(e), __pgd_val(e), \
25609 - (pgd_val(e) & PTE_MASK) >> PAGE_SHIFT)
25610 + (pgd_val(e) & PTE_PFN_MASK) >> PAGE_SHIFT)
25611
25612 static inline int pud_none(pud_t pud)
25613 {
25614 @@ -27,7 +27,7 @@ static inline int pud_none(pud_t pud)
25615 }
25616 static inline int pud_bad(pud_t pud)
25617 {
25618 - return (__pud_val(pud) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER)) != 0;
25619 + return (__pud_val(pud) & ~(PTE_PFN_MASK | _KERNPG_TABLE | _PAGE_USER)) != 0;
25620 }
25621
25622 static inline int pud_present(pud_t pud)
25623 @@ -102,9 +102,9 @@ static inline void pud_clear(pud_t *pudp
25624 xen_tlb_flush();
25625 }
25626
25627 -#define pud_page(pud) ((struct page *) __va(pud_val(pud) & PTE_MASK))
25628 +#define pud_page(pud) ((struct page *) __va(pud_val(pud) & PTE_PFN_MASK))
25629
25630 -#define pud_page_vaddr(pud) ((unsigned long) __va(pud_val(pud) & PTE_MASK))
25631 +#define pud_page_vaddr(pud) ((unsigned long) __va(pud_val(pud) & PTE_PFN_MASK))
25632
25633
25634 /* Find an entry in the second-level page table.. */
25635 --- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/pgtable_32.h 2009-03-16 16:38:05.000000000 +0100
25636 +++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/pgtable_32.h 2009-06-04 10:21:39.000000000 +0200
25637 @@ -89,10 +89,10 @@ extern unsigned long pg0[];
25638 /* pmd_present doesn't just test the _PAGE_PRESENT bit since wr.p.t.
25639 can temporarily clear it. */
25640 #define pmd_present(x) (__pmd_val(x))
25641 -#define pmd_bad(x) ((__pmd_val(x) & (~PTE_MASK & ~_PAGE_USER & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT))
25642 +#define pmd_bad(x) ((__pmd_val(x) & (PTE_FLAGS_MASK & ~_PAGE_USER & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT))
25643 #else
25644 #define pmd_present(x) (__pmd_val(x) & _PAGE_PRESENT)
25645 -#define pmd_bad(x) ((__pmd_val(x) & (~PTE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
25646 +#define pmd_bad(x) ((__pmd_val(x) & (PTE_FLAGS_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
25647 #endif
25648
25649
25650 @@ -119,26 +119,6 @@ extern unsigned long pg0[];
25651 */
25652 #define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
25653
25654 -/*
25655 - * the pgd page can be thought of an array like this: pgd_t[PTRS_PER_PGD]
25656 - *
25657 - * this macro returns the index of the entry in the pgd page which would
25658 - * control the given virtual address
25659 - */
25660 -#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
25661 -#define pgd_index_k(addr) pgd_index((addr))
25662 -
25663 -/*
25664 - * pgd_offset() returns a (pgd_t *)
25665 - * pgd_index() is used get the offset into the pgd page's array of pgd_t's;
25666 - */
25667 -#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address)))
25668 -
25669 -/*
25670 - * a shortcut which implies the use of the kernel's pgd, instead
25671 - * of a process's
25672 - */
25673 -#define pgd_offset_k(address) pgd_offset(&init_mm, (address))
25674
25675 static inline int pud_large(pud_t pud) { return 0; }
25676
25677 @@ -165,7 +145,7 @@ static inline int pud_large(pud_t pud) {
25678 #define pmd_page(pmd) (pfn_to_page(pmd_val((pmd)) >> PAGE_SHIFT))
25679
25680 #define pmd_page_vaddr(pmd) \
25681 - ((unsigned long)__va(pmd_val((pmd)) & PTE_MASK))
25682 + ((unsigned long)__va(pmd_val((pmd)) & PTE_PFN_MASK))
25683
25684 #if defined(CONFIG_HIGHPTE)
25685 #define pte_offset_map(dir, address) \
25686 --- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/pgtable_64.h 2009-03-16 16:38:05.000000000 +0100
25687 +++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/pgtable_64.h 2009-06-04 10:21:39.000000000 +0200
25688 @@ -23,6 +23,8 @@ extern void xen_init_pt(void);
25689 extern pud_t level3_kernel_pgt[512];
25690 extern pud_t level3_ident_pgt[512];
25691 extern pmd_t level2_kernel_pgt[512];
25692 +extern pmd_t level2_fixmap_pgt[512];
25693 +extern pmd_t level2_ident_pgt[512];
25694 extern pgd_t init_level4_pgt[];
25695
25696 #define swapper_pg_dir init_level4_pgt
25697 @@ -79,6 +81,9 @@ extern void paging_init(void);
25698
25699 struct mm_struct;
25700
25701 +void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte);
25702 +
25703 +
25704 #define __xen_pte_clear(ptep) xen_set_pte(ptep, __pte(0))
25705
25706 static inline void xen_set_pte(pte_t *ptep, pte_t pte)
25707 @@ -145,29 +150,29 @@ static inline void xen_pgd_clear(pgd_t *
25708 #define PGDIR_MASK (~(PGDIR_SIZE - 1))
25709
25710
25711 -#define MAXMEM _AC(0x00003fffffffffff, UL)
25712 +#define MAXMEM _AC(0x000004ffffffffff, UL)
25713 #define VMALLOC_START _AC(0xffffc20000000000, UL)
25714 #define VMALLOC_END _AC(0xffffe1ffffffffff, UL)
25715 #define VMEMMAP_START _AC(0xffffe20000000000, UL)
25716 #define MODULES_VADDR _AC(0xffffffffa0000000, UL)
25717 -#define MODULES_END _AC(0xfffffffffff00000, UL)
25718 +#define MODULES_END _AC(0xffffffffff000000, UL)
25719 #define MODULES_LEN (MODULES_END - MODULES_VADDR)
25720
25721 #ifndef __ASSEMBLY__
25722
25723 static inline int pgd_bad(pgd_t pgd)
25724 {
25725 - return (__pgd_val(pgd) & ~(PTE_MASK | _PAGE_USER)) != _KERNPG_TABLE;
25726 + return (__pgd_val(pgd) & ~(PTE_PFN_MASK | _PAGE_USER)) != _KERNPG_TABLE;
25727 }
25728
25729 static inline int pud_bad(pud_t pud)
25730 {
25731 - return (__pud_val(pud) & ~(PTE_MASK | _PAGE_USER)) != _KERNPG_TABLE;
25732 + return (__pud_val(pud) & ~(PTE_PFN_MASK | _PAGE_USER)) != _KERNPG_TABLE;
25733 }
25734
25735 static inline int pmd_bad(pmd_t pmd)
25736 {
25737 - return (__pmd_val(pmd) & ~(PTE_MASK | _PAGE_USER)) != _KERNPG_TABLE;
25738 + return (__pmd_val(pmd) & ~(PTE_PFN_MASK | _PAGE_USER)) != _KERNPG_TABLE;
25739 }
25740
25741 #define pte_none(x) (!(x).pte)
25742 @@ -175,7 +180,7 @@ static inline int pmd_bad(pmd_t pmd)
25743
25744 #define pages_to_mb(x) ((x) >> (20 - PAGE_SHIFT)) /* FIXME: is this right? */
25745
25746 -#define __pte_mfn(_pte) (((_pte).pte & PTE_MASK) >> PAGE_SHIFT)
25747 +#define __pte_mfn(_pte) (((_pte).pte & PTE_PFN_MASK) >> PAGE_SHIFT)
25748 #define pte_mfn(_pte) ((_pte).pte & _PAGE_PRESENT ? \
25749 __pte_mfn(_pte) : pfn_to_mfn(__pte_mfn(_pte)))
25750 #define pte_pfn(_pte) ((_pte).pte & _PAGE_IO ? max_mapnr : \
25751 @@ -200,11 +205,8 @@ static inline int pmd_bad(pmd_t pmd)
25752 * Level 4 access.
25753 */
25754 #define pgd_page_vaddr(pgd) \
25755 - ((unsigned long)__va((unsigned long)pgd_val((pgd)) & PTE_MASK))
25756 + ((unsigned long)__va((unsigned long)pgd_val((pgd)) & PTE_PFN_MASK))
25757 #define pgd_page(pgd) (pfn_to_page(pgd_val((pgd)) >> PAGE_SHIFT))
25758 -#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
25759 -#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address)))
25760 -#define pgd_offset_k(address) (init_level4_pgt + pgd_index((address)))
25761 #define pgd_present(pgd) (__pgd_val(pgd) & _PAGE_PRESENT)
25762 static inline int pgd_large(pgd_t pgd) { return 0; }
25763 #define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE)
25764 @@ -226,7 +228,7 @@ static inline int pud_large(pud_t pte)
25765 }
25766
25767 /* PMD - Level 2 access */
25768 -#define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val((pmd)) & PTE_MASK))
25769 +#define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val((pmd)) & PTE_PFN_MASK))
25770 #define pmd_page(pmd) (pfn_to_page(pmd_val((pmd)) >> PAGE_SHIFT))
25771
25772 #define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))
25773 --- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/processor.h 2009-03-16 16:38:05.000000000 +0100
25774 +++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/processor.h 2009-06-04 10:21:39.000000000 +0200
25775 @@ -134,7 +134,7 @@ extern __u32 cleared_cpu_caps[NCAPINTS
25776 #ifdef CONFIG_SMP
25777 DECLARE_PER_CPU(struct cpuinfo_x86, cpu_info);
25778 #define cpu_data(cpu) per_cpu(cpu_info, cpu)
25779 -#define current_cpu_data cpu_data(smp_processor_id())
25780 +#define current_cpu_data __get_cpu_var(cpu_info)
25781 #else
25782 #define cpu_data(cpu) boot_cpu_data
25783 #define current_cpu_data boot_cpu_data
25784 @@ -153,7 +153,7 @@ static inline int hlt_works(int cpu)
25785
25786 extern void cpu_detect(struct cpuinfo_x86 *c);
25787
25788 -extern void identify_cpu(struct cpuinfo_x86 *);
25789 +extern void early_cpu_init(void);
25790 extern void identify_boot_cpu(void);
25791 extern void identify_secondary_cpu(struct cpuinfo_x86 *);
25792 extern void print_cpu_info(struct cpuinfo_x86 *);
25793 @@ -267,15 +267,11 @@ struct tss_struct {
25794 struct thread_struct *io_bitmap_owner;
25795
25796 /*
25797 - * Pad the TSS to be cacheline-aligned (size is 0x100):
25798 - */
25799 - unsigned long __cacheline_filler[35];
25800 - /*
25801 * .. and then another 0x100 bytes for the emergency kernel stack:
25802 */
25803 unsigned long stack[64];
25804
25805 -} __attribute__((packed));
25806 +} ____cacheline_aligned;
25807
25808 DECLARE_PER_CPU(struct tss_struct, init_tss);
25809
25810 @@ -668,11 +664,36 @@ static inline void __sti_mwait(unsigned
25811
25812 extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
25813
25814 -extern int force_mwait;
25815 -
25816 extern void select_idle_routine(const struct cpuinfo_x86 *c);
25817
25818 extern unsigned long boot_option_idle_override;
25819 +extern unsigned long idle_halt;
25820 +extern unsigned long idle_nomwait;
25821 +
25822 +#ifndef CONFIG_XEN
25823 +/*
25824 + * on systems with caches, caches must be flushed as the absolute
25825 + * last instruction before going into a suspended halt. Otherwise,
25826 + * dirty data can linger in the cache and become stale on resume,
25827 + * leading to strange errors.
25828 + *
25829 + * perform a variety of operations to guarantee that the compiler
25830 + * will not reorder instructions. wbinvd itself is serializing
25831 + * so the processor will not reorder.
25832 + *
25833 + * Systems without cache can just go into halt.
25834 + */
25835 +static inline void wbinvd_halt(void)
25836 +{
25837 + mb();
25838 + /* check for clflush to determine if wbinvd is legal */
25839 + if (cpu_has_clflush)
25840 + asm volatile("cli; wbinvd; 1: hlt; jmp 1b" : : : "memory");
25841 + else
25842 + while (1)
25843 + halt();
25844 +}
25845 +#endif
25846
25847 extern void enable_sep_cpu(void);
25848 extern int sysenter_setup(void);
25849 --- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/segment.h 2009-03-16 16:38:05.000000000 +0100
25850 +++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/segment.h 2009-06-04 10:21:39.000000000 +0200
25851 @@ -1,6 +1,15 @@
25852 #ifndef _ASM_X86_SEGMENT_H_
25853 #define _ASM_X86_SEGMENT_H_
25854
25855 +/* Constructor for a conventional segment GDT (or LDT) entry */
25856 +/* This is a macro so it can be used in initializers */
25857 +#define GDT_ENTRY(flags, base, limit) \
25858 + ((((base) & 0xff000000ULL) << (56-24)) | \
25859 + (((flags) & 0x0000f0ffULL) << 40) | \
25860 + (((limit) & 0x000f0000ULL) << (48-16)) | \
25861 + (((base) & 0x00ffffffULL) << 16) | \
25862 + (((limit) & 0x0000ffffULL)))
25863 +
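As a worked check (not part of the patch): encoding a flat 4 GiB ring-0 code segment — flags 0xc09b, base 0, limit 0xfffff, i.e. the kind of descriptor used for the boot CS below — the two base terms vanish and the macro yields the familiar quadword:

    GDT_ENTRY(0xc09b, 0, 0xfffff)
        = ((0xc09bULL  & 0x0000f0ff) << 40)        /* type/DPL/P plus the G/D flag nibble */
        | ((0xfffffULL & 0x000f0000) << (48 - 16)) /* limit bits 19..16                   */
        |  (0xfffffULL & 0x0000ffff)               /* limit bits 15..0                    */
        = 0x00cf9b000000ffff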
25864 /* Simple and small GDT entries for booting only */
25865
25866 #define GDT_ENTRY_BOOT_CS 2
25867 @@ -61,18 +70,14 @@
25868 #define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
25869
25870 #define GDT_ENTRY_DEFAULT_USER_CS 14
25871 -#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS * 8 + 3)
25872
25873 #define GDT_ENTRY_DEFAULT_USER_DS 15
25874 -#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS * 8 + 3)
25875
25876 #define GDT_ENTRY_KERNEL_BASE 12
25877
25878 #define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE + 0)
25879 -#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8)
25880
25881 #define GDT_ENTRY_KERNEL_DS (GDT_ENTRY_KERNEL_BASE + 1)
25882 -#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8)
25883
25884 #define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE + 4)
25885 #define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE + 5)
25886 @@ -143,10 +148,11 @@
25887 #else
25888 #include <asm/cache.h>
25889
25890 -#define __KERNEL_CS 0x10
25891 -#define __KERNEL_DS 0x18
25892 +#define GDT_ENTRY_KERNEL32_CS 1
25893 +#define GDT_ENTRY_KERNEL_CS 2
25894 +#define GDT_ENTRY_KERNEL_DS 3
25895
25896 -#define __KERNEL32_CS 0x08
25897 +#define __KERNEL32_CS (GDT_ENTRY_KERNEL32_CS * 8)
25898
25899 /*
25900 * we cannot use the same code segment descriptor for user and kernel
25901 @@ -154,10 +160,10 @@
25902 * The segment offset needs to contain a RPL. Grr. -AK
25903 * GDT layout to get 64bit syscall right (sysret hardcodes gdt offsets)
25904 */
25905 -
25906 -#define __USER32_CS 0x23 /* 4*8+3 */
25907 -#define __USER_DS 0x2b /* 5*8+3 */
25908 -#define __USER_CS 0x33 /* 6*8+3 */
25909 +#define GDT_ENTRY_DEFAULT_USER32_CS 4
25910 +#define GDT_ENTRY_DEFAULT_USER_DS 5
25911 +#define GDT_ENTRY_DEFAULT_USER_CS 6
25912 +#define __USER32_CS (GDT_ENTRY_DEFAULT_USER32_CS * 8 + 3)
25913 #define __USER32_DS __USER_DS
25914
25915 #define GDT_ENTRY_TSS 8 /* needs two entries */
25916 @@ -179,6 +185,11 @@
25917
25918 #endif
25919
25920 +#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8)
25921 +#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8)
25922 +#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS* 8 + 3)
25923 +#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS* 8 + 3)
25924 +
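Expanding these (not part of the patch) shows the new defines reproduce exactly the hard-coded selectors deleted above, the "+ 3" being the user RPL:

    __KERNEL32_CS = 1 * 8     = 0x08        __USER32_CS = 4 * 8 + 3 = 0x23
    __KERNEL_CS   = 2 * 8     = 0x10        __USER_DS   = 5 * 8 + 3 = 0x2b
    __KERNEL_DS   = 3 * 8     = 0x18        __USER_CS   = 6 * 8 + 3 = 0x33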
25925 /* User mode is privilege level 3 */
25926 #define USER_RPL 0x3
25927 /* LDT segment has TI set, GDT has it cleared */
25928 --- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/smp.h 2009-03-16 16:38:05.000000000 +0100
25929 +++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/smp.h 2009-06-04 10:21:39.000000000 +0200
25930 @@ -25,23 +25,16 @@ extern cpumask_t cpu_initialized;
25931 extern void (*mtrr_hook)(void);
25932 extern void zap_low_mappings(void);
25933
25934 +extern int __cpuinit get_local_pda(int cpu);
25935 +
25936 extern int smp_num_siblings;
25937 extern unsigned int num_processors;
25938 extern cpumask_t cpu_initialized;
25939
25940 -#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
25941 -extern u16 x86_cpu_to_apicid_init[];
25942 -extern u16 x86_bios_cpu_apicid_init[];
25943 -extern void *x86_cpu_to_apicid_early_ptr;
25944 -extern void *x86_bios_cpu_apicid_early_ptr;
25945 -#else
25946 -#define x86_cpu_to_apicid_early_ptr NULL
25947 -#define x86_bios_cpu_apicid_early_ptr NULL
25948 -#endif
25949 -
25950 DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
25951 DECLARE_PER_CPU(cpumask_t, cpu_core_map);
25952 DECLARE_PER_CPU(u16, cpu_llc_id);
25953 +
25954 DECLARE_PER_CPU(u16, x86_cpu_to_apicid);
25955 DECLARE_PER_CPU(u16, x86_bios_cpu_apicid);
25956
25957 @@ -63,9 +56,9 @@ struct smp_ops {
25958
25959 void (*smp_send_stop)(void);
25960 void (*smp_send_reschedule)(int cpu);
25961 - int (*smp_call_function_mask)(cpumask_t mask,
25962 - void (*func)(void *info), void *info,
25963 - int wait);
25964 +
25965 + void (*send_call_func_ipi)(cpumask_t mask);
25966 + void (*send_call_func_single_ipi)(int cpu);
25967 };
25968
25969 /* Globals due to paravirt */
25970 @@ -106,11 +99,14 @@ static inline void smp_send_reschedule(i
25971 smp_ops.smp_send_reschedule(cpu);
25972 }
25973
25974 -static inline int smp_call_function_mask(cpumask_t mask,
25975 - void (*func) (void *info), void *info,
25976 - int wait)
25977 +static inline void arch_send_call_function_single_ipi(int cpu)
25978 {
25979 - return smp_ops.smp_call_function_mask(mask, func, info, wait);
25980 + smp_ops.send_call_func_single_ipi(cpu);
25981 +}
25982 +
25983 +static inline void arch_send_call_function_ipi(cpumask_t mask)
25984 +{
25985 + smp_ops.send_call_func_ipi(mask);
25986 }
25987
25988 void native_smp_prepare_boot_cpu(void);
25989 @@ -122,23 +118,19 @@ int native_cpu_up(unsigned int cpunum);
25990
25991 void xen_smp_send_stop(void);
25992 void xen_smp_send_reschedule(int cpu);
25993 -int xen_smp_call_function_mask(cpumask_t mask,
25994 - void (*func) (void *info), void *info,
25995 - int wait);
25996 +void xen_send_call_func_ipi(cpumask_t mask);
25997 +void xen_send_call_func_single_ipi(int cpu);
25998
25999 #define smp_send_stop xen_smp_send_stop
26000 #define smp_send_reschedule xen_smp_send_reschedule
26001 -#define smp_call_function_mask xen_smp_call_function_mask
26002 -
26003 -extern void prefill_possible_map(void);
26004 +#define arch_send_call_function_single_ipi xen_send_call_func_single_ipi
26005 +#define arch_send_call_function_ipi xen_send_call_func_ipi
26006
26007 #endif /* CONFIG_XEN */
26008
26009 extern int __cpu_disable(void);
26010 extern void __cpu_die(unsigned int cpu);
26011
26012 -extern void prefill_possible_map(void);
26013 -
26014 void smp_store_cpu_info(int id);
26015 #define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
26016
26017 @@ -149,6 +141,14 @@ static inline int num_booting_cpus(void)
26018 }
26019 #endif /* CONFIG_SMP */
26020
26021 +#if defined(CONFIG_SMP) && (defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_XEN))
26022 +extern void prefill_possible_map(void);
26023 +#else
26024 +static inline void prefill_possible_map(void)
26025 +{
26026 +}
26027 +#endif
26028 +
26029 extern unsigned disabled_cpus __cpuinitdata;
26030
26031 #ifdef CONFIG_X86_32_SMP
26032 @@ -216,12 +216,8 @@ static inline int hard_smp_processor_id(
26033 #endif /* CONFIG_X86_LOCAL_APIC */
26034
26035 #ifdef CONFIG_HOTPLUG_CPU
26036 -extern void cpu_exit_clear(void);
26037 extern void cpu_uninit(void);
26038 #endif
26039
26040 -extern void smp_alloc_memory(void);
26041 -extern void lock_ipi_call_lock(void);
26042 -extern void unlock_ipi_call_lock(void);
26043 #endif /* __ASSEMBLY__ */
26044 #endif
26045 --- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/spinlock.h 2009-03-16 16:38:05.000000000 +0100
26046 +++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/spinlock.h 2009-06-04 11:09:05.000000000 +0200
26047 @@ -38,6 +38,11 @@
26048 # define UNLOCK_LOCK_PREFIX
26049 #endif
26050
26051 +/*
26052 + * Xen versions prior to 3.2.x have a race condition with HYPERVISOR_poll().
26053 + */
26054 +#if CONFIG_XEN_COMPAT >= 0x030200
26055 +
26056 int xen_spinlock_init(unsigned int cpu);
26057 void xen_spinlock_cleanup(unsigned int cpu);
26058 extern int xen_spin_wait(raw_spinlock_t *, unsigned int token);
26059 @@ -65,14 +70,14 @@ extern void xen_spin_kick(raw_spinlock_t
26060 */
26061 #if (NR_CPUS < 256)
26062 #define TICKET_SHIFT 8
26063 -#define __raw_spin_lock_preamble \
26064 +#define __ticket_spin_lock_preamble \
26065 asm(LOCK_PREFIX "xaddw %w0, %2\n\t" \
26066 "cmpb %h0, %b0\n\t" \
26067 "sete %1" \
26068 : "=&Q" (token), "=qm" (free), "+m" (lock->slock) \
26069 : "0" (0x0100) \
26070 : "memory", "cc")
26071 -#define __raw_spin_lock_body \
26072 +#define __ticket_spin_lock_body \
26073 asm("1:\t" \
26074 "cmpb %h0, %b0\n\t" \
26075 "je 2f\n\t" \
26076 @@ -88,7 +93,7 @@ extern void xen_spin_kick(raw_spinlock_t
26077 : "memory", "cc")
26078
26079
26080 -static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock)
26081 +static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock)
26082 {
26083 int tmp, new;
26084
26085 @@ -107,7 +112,7 @@ static __always_inline int __raw_spin_tr
26086 return tmp;
26087 }
26088
26089 -static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock)
26090 +static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock)
26091 {
26092 unsigned int token;
26093 unsigned char kick;
26094 @@ -124,7 +129,7 @@ static __always_inline void __raw_spin_u
26095 }
26096 #else
26097 #define TICKET_SHIFT 16
26098 -#define __raw_spin_lock_preamble \
26099 +#define __ticket_spin_lock_preamble \
26100 do { \
26101 unsigned int tmp; \
26102 asm(LOCK_PREFIX "xaddl %0, %2\n\t" \
26103 @@ -136,7 +141,7 @@ static __always_inline void __raw_spin_u
26104 : "0" (0x00010000) \
26105 : "memory", "cc"); \
26106 } while (0)
26107 -#define __raw_spin_lock_body \
26108 +#define __ticket_spin_lock_body \
26109 do { \
26110 unsigned int tmp; \
26111 asm("shldl $16, %0, %2\n" \
26112 @@ -155,7 +160,7 @@ static __always_inline void __raw_spin_u
26113 : "memory", "cc"); \
26114 } while (0)
26115
26116 -static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock)
26117 +static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock)
26118 {
26119 int tmp;
26120 int new;
26121 @@ -177,7 +182,7 @@ static __always_inline int __raw_spin_tr
26122 return tmp;
26123 }
26124
26125 -static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock)
26126 +static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock)
26127 {
26128 unsigned int token, tmp;
26129 bool kick;
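For reference, a minimal user-space model (not part of the patch, and omitting the xen_spin_wait()/xen_spin_kick() slow path) of the 8-bit ticket encoding that the asm above manipulates when NR_CPUS < 256: the low byte of ->slock is the ticket currently being served, the high byte is the next ticket to hand out, so the lock is free exactly when the two bytes match — which is what the "xaddw ... 0x0100; cmpb %h0,%b0" preamble tests after taking a ticket:

    #include <stdatomic.h>
    #include <stdint.h>

    struct ticket_model { _Atomic uint16_t slock; }; /* low byte: owner, high byte: next ticket */

    static void model_lock(struct ticket_model *l)
    {
        uint16_t old = atomic_fetch_add(&l->slock, 0x0100); /* LOCK xaddw: take a ticket */
        uint8_t ticket = old >> 8;
        while ((uint8_t)atomic_load(&l->slock) != ticket)   /* spin until owner == our ticket */
            ;
    }

    static void model_unlock(struct ticket_model *l)
    {
        uint16_t v = atomic_load(&l->slock), nv;
        do {                                                /* bump only the owner byte, like "incb" */
            nv = (v & 0xff00) | ((v + 1) & 0x00ff);
        } while (!atomic_compare_exchange_weak(&l->slock, &v, nv));
    }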
26130 @@ -195,49 +200,161 @@ static __always_inline void __raw_spin_u
26131 }
26132 #endif
26133
26134 -static inline int __raw_spin_is_locked(raw_spinlock_t *lock)
26135 +static inline int __ticket_spin_is_locked(raw_spinlock_t *lock)
26136 {
26137 int tmp = ACCESS_ONCE(lock->slock);
26138
26139 return !!(((tmp >> TICKET_SHIFT) ^ tmp) & ((1 << TICKET_SHIFT) - 1));
26140 }
26141
26142 -static inline int __raw_spin_is_contended(raw_spinlock_t *lock)
26143 +static inline int __ticket_spin_is_contended(raw_spinlock_t *lock)
26144 {
26145 int tmp = ACCESS_ONCE(lock->slock);
26146
26147 return (((tmp >> TICKET_SHIFT) - tmp) & ((1 << TICKET_SHIFT) - 1)) > 1;
26148 }
26149
26150 -static __always_inline void __raw_spin_lock(raw_spinlock_t *lock)
26151 +static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock)
26152 {
26153 unsigned int token, count;
26154 bool free;
26155
26156 - __raw_spin_lock_preamble;
26157 + __ticket_spin_lock_preamble;
26158 if (unlikely(!free))
26159 token = xen_spin_adjust(lock, token);
26160 do {
26161 count = 1 << 10;
26162 - __raw_spin_lock_body;
26163 + __ticket_spin_lock_body;
26164 } while (unlikely(!count) && !xen_spin_wait(lock, token));
26165 }
26166
26167 -static __always_inline void __raw_spin_lock_flags(raw_spinlock_t *lock,
26168 - unsigned long flags)
26169 +static __always_inline void __ticket_spin_lock_flags(raw_spinlock_t *lock,
26170 + unsigned long flags)
26171 {
26172 unsigned int token, count;
26173 bool free;
26174
26175 - __raw_spin_lock_preamble;
26176 + __ticket_spin_lock_preamble;
26177 if (unlikely(!free))
26178 token = xen_spin_adjust(lock, token);
26179 do {
26180 count = 1 << 10;
26181 - __raw_spin_lock_body;
26182 + __ticket_spin_lock_body;
26183 } while (unlikely(!count) && !xen_spin_wait_flags(lock, &token, flags));
26184 }
26185
26186 +#define __raw_spin(n) __ticket_spin_##n
26187 +
26188 +#else /* CONFIG_XEN_COMPAT < 0x030200 */
26189 +/*
26190 + * Define virtualization-friendly old-style lock byte lock, for use in
26191 + * pv_lock_ops if desired.
26192 + *
26193 + * This differs from the pre-2.6.24 spinlock by always using xchgb
26194 + * rather than decb to take the lock; this allows it to use a
26195 + * zero-initialized lock structure. It also maintains a 1-byte
26196 + * contention counter, so that we can implement
26197 + * __byte_spin_is_contended.
26198 + */
26199 +struct __byte_spinlock {
26200 + u8 lock;
26201 +#if NR_CPUS < 256
26202 + u8 spinners;
26203 +#else
26204 +#error NR_CPUS >= 256 support not implemented
26205 +#endif
26206 +};
26207 +
26208 +static inline int xen_spinlock_init(unsigned int cpu) { return 0; }
26209 +static inline void xen_spinlock_cleanup(unsigned int cpu) {}
26210 +
26211 +static inline int __byte_spin_is_locked(raw_spinlock_t *lock)
26212 +{
26213 + struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
26214 + return bl->lock != 0;
26215 +}
26216 +
26217 +static inline int __byte_spin_is_contended(raw_spinlock_t *lock)
26218 +{
26219 + struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
26220 + return bl->spinners != 0;
26221 +}
26222 +
26223 +static inline void __byte_spin_lock(raw_spinlock_t *lock)
26224 +{
26225 + struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
26226 + s8 val = 1;
26227 +
26228 + asm("1: xchgb %1, %0\n"
26229 + " test %1,%1\n"
26230 + " jz 3f\n"
26231 + " " LOCK_PREFIX "incb %2\n"
26232 + "2: rep;nop\n"
26233 + " cmpb $1, %0\n"
26234 + " je 2b\n"
26235 + " " LOCK_PREFIX "decb %2\n"
26236 + " jmp 1b\n"
26237 + "3:"
26238 + : "+m" (bl->lock), "+q" (val), "+m" (bl->spinners): : "memory");
26239 +}
26240 +
26241 +#define __byte_spin_lock_flags(lock, flags) __byte_spin_lock(lock)
26242 +
26243 +static inline int __byte_spin_trylock(raw_spinlock_t *lock)
26244 +{
26245 + struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
26246 + u8 old = 1;
26247 +
26248 + asm("xchgb %1,%0"
26249 + : "+m" (bl->lock), "+q" (old) : : "memory");
26250 +
26251 + return old == 0;
26252 +}
26253 +
26254 +static inline void __byte_spin_unlock(raw_spinlock_t *lock)
26255 +{
26256 + struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
26257 + smp_wmb();
26258 + bl->lock = 0;
26259 +}
26260 +
26261 +#define __raw_spin(n) __byte_spin_##n
26262 +
26263 +#endif /* CONFIG_XEN_COMPAT */
26264 +
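A minimal user-space sketch (not from the patch) of the xchg-based idea described above: exchanging in a 1 both takes and tests the lock, so an all-zero structure is already a valid unlocked lock, unlike the pre-2.6.24 decb protocol; the contention counter is omitted here:

    #include <stdatomic.h>
    #include <stdint.h>

    struct byte_lock_model { _Atomic uint8_t lock; }; /* zero-initialised == unlocked */

    static void model_byte_lock(struct byte_lock_model *l)
    {
        while (atomic_exchange(&l->lock, 1))   /* xchgb: old value 0 means we got the lock */
            while (atomic_load(&l->lock))      /* rep;nop-style polite spin                */
                ;
    }

    static int model_byte_trylock(struct byte_lock_model *l)
    {
        return atomic_exchange(&l->lock, 1) == 0;
    }

    static void model_byte_unlock(struct byte_lock_model *l)
    {
        atomic_store(&l->lock, 0);             /* matches the smp_wmb() + plain store above */
    }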
26265 +static inline int __raw_spin_is_locked(raw_spinlock_t *lock)
26266 +{
26267 + return __raw_spin(is_locked)(lock);
26268 +}
26269 +
26270 +static inline int __raw_spin_is_contended(raw_spinlock_t *lock)
26271 +{
26272 + return __raw_spin(is_contended)(lock);
26273 +}
26274 +
26275 +static __always_inline void __raw_spin_lock(raw_spinlock_t *lock)
26276 +{
26277 + __raw_spin(lock)(lock);
26278 +}
26279 +
26280 +static __always_inline void __raw_spin_lock_flags(raw_spinlock_t *lock,
26281 + unsigned long flags)
26282 +{
26283 + __raw_spin(lock_flags)(lock, flags);
26284 +}
26285 +
26286 +static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock)
26287 +{
26288 + return __raw_spin(trylock)(lock);
26289 +}
26290 +
26291 +static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock)
26292 +{
26293 + __raw_spin(unlock)(lock);
26294 +}
26295 +
26296 +#undef __raw_spin
26297 +
26298 static inline void __raw_spin_unlock_wait(raw_spinlock_t *lock)
26299 {
26300 while (__raw_spin_is_locked(lock))
26301 --- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/system.h 2009-03-16 16:38:05.000000000 +0100
26302 +++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/system.h 2009-06-04 10:21:39.000000000 +0200
26303 @@ -137,7 +137,7 @@ __asm__ __volatile__ ("movw %%dx,%1\n\t"
26304 #define set_base(ldt, base) _set_base(((char *)&(ldt)) , (base))
26305 #define set_limit(ldt, limit) _set_limit(((char *)&(ldt)) , ((limit)-1))
26306
26307 -extern void load_gs_index(unsigned);
26308 +extern void xen_load_gs_index(unsigned);
26309
26310 /*
26311 * Load a segment. Fall back on loading the zero
26312 @@ -154,14 +154,14 @@ extern void load_gs_index(unsigned);
26313 "jmp 2b\n" \
26314 ".previous\n" \
26315 _ASM_EXTABLE(1b,3b) \
26316 - : :"r" (value), "r" (0))
26317 + : :"r" (value), "r" (0) : "memory")
26318
26319
26320 /*
26321 * Save a segment register away
26322 */
26323 #define savesegment(seg, value) \
26324 - asm volatile("mov %%" #seg ",%0":"=rm" (value))
26325 + asm("mov %%" #seg ",%0":"=r" (value) : : "memory")
26326
26327 static inline unsigned long get_limit(unsigned long segment)
26328 {
26329 @@ -269,6 +269,7 @@ static inline void xen_wbinvd(void)
26330 #ifdef CONFIG_X86_64
26331 #define read_cr8() (xen_read_cr8())
26332 #define write_cr8(x) (xen_write_cr8(x))
26333 +#define load_gs_index xen_load_gs_index
26334 #endif
26335
26336 /* Clear the 'TS' bit */
26337 @@ -287,13 +288,12 @@ static inline void clflush(volatile void
26338 void disable_hlt(void);
26339 void enable_hlt(void);
26340
26341 -extern int es7000_plat;
26342 void cpu_idle_wait(void);
26343
26344 extern unsigned long arch_align_stack(unsigned long sp);
26345 extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
26346
26347 -void default_idle(void);
26348 +void xen_idle(void);
26349
26350 /*
26351 * Force strict CPU ordering.
26352 --- sle11-2009-06-04.orig/include/asm-x86/mach-xen/asm/xor_64.h 2009-03-16 16:38:05.000000000 +0100
26353 +++ sle11-2009-06-04/include/asm-x86/mach-xen/asm/xor_64.h 2009-06-04 10:21:39.000000000 +0200
26354 @@ -1,3 +1,6 @@
26355 +#ifndef ASM_X86__XOR_64_H
26356 +#define ASM_X86__XOR_64_H
26357 +
26358 /*
26359 * x86-64 changes / gcc fixes from Andi Kleen.
26360 * Copyright 2002 Andi Kleen, SuSE Labs.
26361 @@ -330,3 +333,5 @@ do { \
26362 We may also be able to load into the L1 only depending on how the cpu
26363 deals with a load to a line that is being prefetched. */
26364 #define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse)
26365 +
26366 +#endif /* ASM_X86__XOR_64_H */
26367 --- sle11-2009-06-04.orig/include/asm-x86/mach-xen/irq_vectors.h 2009-03-16 16:33:40.000000000 +0100
26368 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
26369 @@ -1,126 +0,0 @@
26370 -/*
26371 - * This file should contain #defines for all of the interrupt vector
26372 - * numbers used by this architecture.
26373 - *
26374 - * In addition, there are some standard defines:
26375 - *
26376 - * FIRST_EXTERNAL_VECTOR:
26377 - * The first free place for external interrupts
26378 - *
26379 - * SYSCALL_VECTOR:
26380 - * The IRQ vector a syscall makes the user to kernel transition
26381 - * under.
26382 - *
26383 - * TIMER_IRQ:
26384 - * The IRQ number the timer interrupt comes in at.
26385 - *
26386 - * NR_IRQS:
26387 - * The total number of interrupt vectors (including all the
26388 - * architecture specific interrupts) needed.
26389 - *
26390 - */
26391 -#ifndef _ASM_IRQ_VECTORS_H
26392 -#define _ASM_IRQ_VECTORS_H
26393 -
26394 -/*
26395 - * IDT vectors usable for external interrupt sources start
26396 - * at 0x20:
26397 - */
26398 -#define FIRST_EXTERNAL_VECTOR 0x20
26399 -
26400 -#define SYSCALL_VECTOR 0x80
26401 -
26402 -/*
26403 - * Vectors 0x20-0x2f are used for ISA interrupts.
26404 - */
26405 -
26406 -#if 0
26407 -/*
26408 - * Special IRQ vectors used by the SMP architecture, 0xf0-0xff
26409 - *
26410 - * some of the following vectors are 'rare', they are merged
26411 - * into a single vector (CALL_FUNCTION_VECTOR) to save vector space.
26412 - * TLB, reschedule and local APIC vectors are performance-critical.
26413 - *
26414 - * Vectors 0xf0-0xfa are free (reserved for future Linux use).
26415 - */
26416 -#define SPURIOUS_APIC_VECTOR 0xff
26417 -#define ERROR_APIC_VECTOR 0xfe
26418 -#define INVALIDATE_TLB_VECTOR 0xfd
26419 -#define RESCHEDULE_VECTOR 0xfc
26420 -#define CALL_FUNCTION_VECTOR 0xfb
26421 -
26422 -#define THERMAL_APIC_VECTOR 0xf0
26423 -/*
26424 - * Local APIC timer IRQ vector is on a different priority level,
26425 - * to work around the 'lost local interrupt if more than 2 IRQ
26426 - * sources per level' errata.
26427 - */
26428 -#define LOCAL_TIMER_VECTOR 0xef
26429 -#endif
26430 -
26431 -#define SPURIOUS_APIC_VECTOR 0xff
26432 -#define ERROR_APIC_VECTOR 0xfe
26433 -
26434 -/*
26435 - * First APIC vector available to drivers: (vectors 0x30-0xee)
26436 - * we start at 0x31 to spread out vectors evenly between priority
26437 - * levels. (0x80 is the syscall vector)
26438 - */
26439 -#define FIRST_DEVICE_VECTOR 0x31
26440 -#define FIRST_SYSTEM_VECTOR 0xef
26441 -
26442 -/*
26443 - * 16 8259A IRQ's, 208 potential APIC interrupt sources.
26444 - * Right now the APIC is mostly only used for SMP.
26445 - * 256 vectors is an architectural limit. (we can have
26446 - * more than 256 devices theoretically, but they will
26447 - * have to use shared interrupts)
26448 - * Since vectors 0x00-0x1f are used/reserved for the CPU,
26449 - * the usable vector space is 0x20-0xff (224 vectors)
26450 - */
26451 -
26452 -#define RESCHEDULE_VECTOR 0
26453 -#define CALL_FUNCTION_VECTOR 1
26454 -#define SPIN_UNLOCK_VECTOR 2
26455 -#define NR_IPIS 3
26456 -
26457 -/*
26458 - * The maximum number of vectors supported by i386 processors
26459 - * is limited to 256. For processors other than i386, NR_VECTORS
26460 - * should be changed accordingly.
26461 - */
26462 -#define NR_VECTORS 256
26463 -
26464 -#define FPU_IRQ 13
26465 -
26466 -#define FIRST_VM86_IRQ 3
26467 -#define LAST_VM86_IRQ 15
26468 -#define invalid_vm86_irq(irq) ((irq) < 3 || (irq) > 15)
26469 -
26470 -/*
26471 - * The flat IRQ space is divided into two regions:
26472 - * 1. A one-to-one mapping of real physical IRQs. This space is only used
26473 - * if we have physical device-access privilege. This region is at the
26474 - * start of the IRQ space so that existing device drivers do not need
26475 - * to be modified to translate physical IRQ numbers into our IRQ space.
26476 - * 3. A dynamic mapping of inter-domain and Xen-sourced virtual IRQs. These
26477 - * are bound using the provided bind/unbind functions.
26478 - */
26479 -
26480 -#define PIRQ_BASE 0
26481 -#if !defined(MAX_IO_APICS)
26482 -# define NR_PIRQS (NR_VECTORS + 32 * NR_CPUS)
26483 -#elif NR_CPUS < MAX_IO_APICS
26484 -# define NR_PIRQS (NR_VECTORS + 32 * NR_CPUS)
26485 -#else
26486 -# define NR_PIRQS (NR_VECTORS + 32 * MAX_IO_APICS)
26487 -#endif
26488 -
26489 -#define DYNIRQ_BASE (PIRQ_BASE + NR_PIRQS)
26490 -#define NR_DYNIRQS 256
26491 -
26492 -#define NR_IRQS (NR_PIRQS + NR_DYNIRQS)
26493 -#define NR_IRQ_VECTORS NR_IRQS
26494 -
26495 -#endif /* _ASM_IRQ_VECTORS_H */
26496 --- sle11-2009-06-04.orig/include/asm-x86/mach-xen/setup_arch_post.h 2009-06-04 11:08:07.000000000 +0200
26497 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
26498 @@ -1,63 +0,0 @@
26499 -/**
26500 - * machine_specific_* - Hooks for machine specific setup.
26501 - *
26502 - * Description:
26503 - * This is included late in kernel/setup.c so that it can make
26504 - * use of all of the static functions.
26505 - **/
26506 -
26507 -#include <xen/interface/callback.h>
26508 -
26509 -extern void hypervisor_callback(void);
26510 -extern void failsafe_callback(void);
26511 -extern void nmi(void);
26512 -
26513 -static void __init machine_specific_arch_setup(void)
26514 -{
26515 - int ret;
26516 - static struct callback_register __initdata event = {
26517 - .type = CALLBACKTYPE_event,
26518 - .address = (unsigned long) hypervisor_callback,
26519 - };
26520 - static struct callback_register __initdata failsafe = {
26521 - .type = CALLBACKTYPE_failsafe,
26522 - .address = (unsigned long)failsafe_callback,
26523 - };
26524 - static struct callback_register __initdata syscall = {
26525 - .type = CALLBACKTYPE_syscall,
26526 - .address = (unsigned long)system_call,
26527 - };
26528 -#ifdef CONFIG_X86_LOCAL_APIC
26529 - static struct callback_register __initdata nmi_cb = {
26530 - .type = CALLBACKTYPE_nmi,
26531 - .address = (unsigned long)nmi,
26532 - };
26533 -#endif
26534 -
26535 - ret = HYPERVISOR_callback_op(CALLBACKOP_register, &event);
26536 - if (ret == 0)
26537 - ret = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe);
26538 - if (ret == 0)
26539 - ret = HYPERVISOR_callback_op(CALLBACKOP_register, &syscall);
26540 -#if CONFIG_XEN_COMPAT <= 0x030002
26541 - if (ret == -ENOSYS)
26542 - ret = HYPERVISOR_set_callbacks(
26543 - event.address,
26544 - failsafe.address,
26545 - syscall.address);
26546 -#endif
26547 - BUG_ON(ret);
26548 -
26549 -#ifdef CONFIG_X86_LOCAL_APIC
26550 - ret = HYPERVISOR_callback_op(CALLBACKOP_register, &nmi_cb);
26551 -#if CONFIG_XEN_COMPAT <= 0x030002
26552 - if (ret == -ENOSYS) {
26553 - static struct xennmi_callback __initdata cb = {
26554 - .handler_address = (unsigned long)nmi
26555 - };
26556 -
26557 - HYPERVISOR_nmi_op(XENNMI_register_callback, &cb);
26558 - }
26559 -#endif
26560 -#endif
26561 -}
26562 --- sle11-2009-06-04.orig/include/asm-x86/mach-xen/setup_arch_pre.h 2009-06-04 11:08:07.000000000 +0200
26563 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
26564 @@ -1,5 +0,0 @@
26565 -/* Hook to call BIOS initialisation function */
26566 -
26567 -#define ARCH_SETUP machine_specific_arch_setup();
26568 -
26569 -static void __init machine_specific_arch_setup(void);
26570 --- sle11-2009-06-04.orig/include/asm-x86/traps.h 2009-06-04 11:08:07.000000000 +0200
26571 +++ sle11-2009-06-04/include/asm-x86/traps.h 2009-06-04 10:21:39.000000000 +0200
26572 @@ -23,6 +23,9 @@ asmlinkage void spurious_interrupt_bug(v
26573 #ifdef CONFIG_X86_MCE
26574 asmlinkage void machine_check(void);
26575 #endif /* CONFIG_X86_MCE */
26576 +#ifdef CONFIG_X86_XEN
26577 +asmlinkage void fixup_4gb_segment(void);
26578 +#endif
26579
26580 void do_divide_error(struct pt_regs *, long);
26581 void do_overflow(struct pt_regs *, long);
26582 @@ -48,6 +51,9 @@ void math_error(void __user *);
26583 void do_coprocessor_error(struct pt_regs *, long);
26584 void do_simd_coprocessor_error(struct pt_regs *, long);
26585 void do_spurious_interrupt_bug(struct pt_regs *, long);
26586 +#ifdef CONFIG_XEN
26587 +void do_fixup_4gb_segment(struct pt_regs *, long);
26588 +#endif
26589 unsigned long patch_espfix_desc(unsigned long, unsigned long);
26590 asmlinkage void math_emulate(long);
26591
26592 --- sle11-2009-06-04.orig/include/asm-x86/xen/interface_64.h 2009-06-04 11:08:07.000000000 +0200
26593 +++ sle11-2009-06-04/include/asm-x86/xen/interface_64.h 2009-06-04 10:21:39.000000000 +0200
26594 @@ -136,7 +136,7 @@ struct cpu_user_regs {
26595 uint16_t fs, _pad5[3]; /* Non-zero => takes precedence over fs_base. */
26596 uint16_t gs, _pad6[3]; /* Non-zero => takes precedence over gs_base_usr. */
26597 };
26598 -DEFINE_GUEST_HANDLE_STRUCT(cpu_user_regs);
26599 +DEFINE_XEN_GUEST_HANDLE_STRUCT(cpu_user_regs);
26600
26601 #undef __DECL_REG
26602
26603 --- sle11-2009-06-04.orig/include/linux/page-flags.h 2009-03-16 16:38:05.000000000 +0100
26604 +++ sle11-2009-06-04/include/linux/page-flags.h 2009-06-04 10:21:39.000000000 +0200
26605 @@ -110,9 +110,11 @@ enum pageflags {
26606 /* Filesystems */
26607 PG_checked = PG_owner_priv_1,
26608
26609 +#ifdef CONFIG_PARAVIRT_XEN
26610 /* XEN */
26611 PG_pinned = PG_owner_priv_1,
26612 PG_savepinned = PG_dirty,
26613 +#endif
26614
26615 /* SLOB */
26616 PG_slob_page = PG_active,
26617 @@ -187,8 +189,12 @@ PAGEFLAG(LRU, lru) __CLEARPAGEFLAG(LRU,
26618 PAGEFLAG(Active, active) __CLEARPAGEFLAG(Active, active)
26619 __PAGEFLAG(Slab, slab)
26620 PAGEFLAG(Checked, checked) /* Used by some filesystems */
26621 +#if defined(CONFIG_XEN) || defined(CONFIG_PARAVIRT_XEN)
26622 PAGEFLAG(Pinned, pinned) TESTSCFLAG(Pinned, pinned) /* Xen */
26623 +#endif
26624 +#ifdef CONFIG_PARAVIRT_XEN
26625 PAGEFLAG(SavePinned, savepinned); /* Xen */
26626 +#endif
26627 PAGEFLAG(Reserved, reserved) __CLEARPAGEFLAG(Reserved, reserved)
26628 PAGEFLAG(Private, private) __CLEARPAGEFLAG(Private, private)
26629 __SETPAGEFLAG(Private, private)
26630 --- sle11-2009-06-04.orig/include/xen/interface/memory.h 2009-03-16 16:38:05.000000000 +0100
26631 +++ sle11-2009-06-04/include/xen/interface/memory.h 2009-06-04 10:21:39.000000000 +0200
26632 @@ -82,6 +82,7 @@ struct xen_memory_reservation {
26633 domid_t domid;
26634
26635 };
26636 +DEFINE_XEN_GUEST_HANDLE_STRUCT(xen_memory_reservation);
26637 typedef struct xen_memory_reservation xen_memory_reservation_t;
26638 DEFINE_XEN_GUEST_HANDLE(xen_memory_reservation_t);
26639
26640 @@ -167,11 +168,7 @@ struct xen_machphys_mfn_list {
26641 * any large discontiguities in the machine address space, 2MB gaps in
26642 * the machphys table will be represented by an MFN base of zero.
26643 */
26644 -#ifndef CONFIG_PARAVIRT_XEN
26645 XEN_GUEST_HANDLE(xen_pfn_t) extent_start;
26646 -#else
26647 - ulong extent_start;
26648 -#endif
26649
26650 /*
26651 * Number of extents written to the above array. This will be smaller
26652 @@ -179,6 +176,7 @@ struct xen_machphys_mfn_list {
26653 */
26654 unsigned int nr_extents;
26655 };
26656 +DEFINE_XEN_GUEST_HANDLE_STRUCT(xen_machphys_mfn_list);
26657 typedef struct xen_machphys_mfn_list xen_machphys_mfn_list_t;
26658 DEFINE_XEN_GUEST_HANDLE(xen_machphys_mfn_list_t);
26659
26660 @@ -218,6 +216,7 @@ struct xen_add_to_physmap {
26661 /* GPFN where the source mapping page should appear. */
26662 xen_pfn_t gpfn;
26663 };
26664 +DEFINE_XEN_GUEST_HANDLE_STRUCT(xen_add_to_physmap);
26665 typedef struct xen_add_to_physmap xen_add_to_physmap_t;
26666 DEFINE_XEN_GUEST_HANDLE(xen_add_to_physmap_t);
26667
26668 @@ -250,21 +249,13 @@ struct xen_translate_gpfn_list {
26669 xen_ulong_t nr_gpfns;
26670
26671 /* List of GPFNs to translate. */
26672 -#ifndef CONFIG_PARAVIRT_XEN
26673 XEN_GUEST_HANDLE(xen_pfn_t) gpfn_list;
26674 -#else
26675 - ulong gpfn_list;
26676 -#endif
26677
26678 /*
26679 * Output list to contain MFN translations. May be the same as the input
26680 * list (in which case each input GPFN is overwritten with the output MFN).
26681 */
26682 -#ifndef CONFIG_PARAVIRT_XEN
26683 XEN_GUEST_HANDLE(xen_pfn_t) mfn_list;
26684 -#else
26685 - ulong mfn_list;
26686 -#endif
26687 };
26688 DEFINE_XEN_GUEST_HANDLE_STRUCT(xen_translate_gpfn_list);
26689 typedef struct xen_translate_gpfn_list xen_translate_gpfn_list_t;
26690 --- sle11-2009-06-04.orig/kernel/hrtimer.c 2009-06-04 11:08:07.000000000 +0200
26691 +++ sle11-2009-06-04/kernel/hrtimer.c 2009-06-04 10:21:39.000000000 +0200
26692 @@ -1084,7 +1084,7 @@ ktime_t hrtimer_get_remaining(const stru
26693 }
26694 EXPORT_SYMBOL_GPL(hrtimer_get_remaining);
26695
26696 -#ifdef CONFIG_NO_HZ
26697 +#if defined(CONFIG_NO_HZ) || defined(CONFIG_NO_IDLE_HZ)
26698 /**
26699 * hrtimer_get_next_event - get the time until next expiry event
26700 *
26701 --- sle11-2009-06-04.orig/kernel/kexec.c 2009-02-17 12:38:20.000000000 +0100
26702 +++ sle11-2009-06-04/kernel/kexec.c 2009-06-04 10:21:39.000000000 +0200
26703 @@ -54,7 +54,7 @@ int dump_after_notifier;
26704 unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
26705 u32
26706 #if defined(CONFIG_XEN) && defined(CONFIG_X86)
26707 -__attribute__((__section__(".bss.page_aligned"), __aligned__(PAGE_SIZE)))
26708 +__page_aligned_bss
26709 #endif
26710 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
26711 size_t vmcoreinfo_size;
26712 --- sle11-2009-06-04.orig/kernel/timer.c 2009-06-04 11:08:07.000000000 +0200
26713 +++ sle11-2009-06-04/kernel/timer.c 2009-06-04 10:21:39.000000000 +0200
26714 @@ -884,7 +884,7 @@ static inline void __run_timers(struct t
26715 spin_unlock_irq(&base->lock);
26716 }
26717
26718 -#ifdef CONFIG_NO_HZ
26719 +#if defined(CONFIG_NO_HZ) || defined(CONFIG_NO_IDLE_HZ)
26720 /*
26721 * Find out when the next timer event is due to happen. This
26722 * is used on S/390 to stop all activity when a cpus is idle.
26723 --- sle11-2009-06-04.orig/lib/swiotlb-xen.c 2009-03-16 16:38:05.000000000 +0100
26724 +++ sle11-2009-06-04/lib/swiotlb-xen.c 2009-06-04 10:21:39.000000000 +0200
26725 @@ -750,7 +750,7 @@ swiotlb_sync_sg_for_device(struct device
26726 }
26727
26728 int
26729 -swiotlb_dma_mapping_error(dma_addr_t dma_addr)
26730 +swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr)
26731 {
26732 return (dma_addr == virt_to_bus(io_tlb_overflow_buffer));
26733 }
26734 --- sle11-2009-06-04.orig/mm/mprotect.c 2009-03-04 11:28:34.000000000 +0100
26735 +++ sle11-2009-06-04/mm/mprotect.c 2009-06-04 10:21:39.000000000 +0200
26736 @@ -92,8 +92,6 @@ static inline void change_pmd_range(stru
26737 next = pmd_addr_end(addr, end);
26738 if (pmd_none_or_clear_bad(pmd))
26739 continue;
26740 - if (arch_change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable))
26741 - continue;
26742 change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable);
26743 } while (pmd++, addr = next, addr != end);
26744 }