1 From: Linux Kernel Mailing List <linux-kernel@vger.kernel.org>
2 Subject: [PATCH] Linux: Update to 2.6.27
3 Patch-mainline: 2.6.27
4
5 This patch contains the differences between Linux 2.6.26 and 2.6.27.
6
7 Acked-by: Jeff Mahoney <jeffm@suse.com>
8 Automatically created from "patches.kernel.org/patch-2.6.27" by xen-port-patches.py
9
10 --- sle11-2009-10-16.orig/arch/x86/Kconfig 2009-03-16 16:38:05.000000000 +0100
11 +++ sle11-2009-10-16/arch/x86/Kconfig 2009-06-04 10:21:39.000000000 +0200
12 @@ -594,7 +594,7 @@ config CALGARY_IOMMU_ENABLED_BY_DEFAULT
13 config AMD_IOMMU
14 bool "AMD IOMMU support"
15 select SWIOTLB
16 - depends on X86_64 && PCI && ACPI
17 + depends on X86_64 && PCI && ACPI && !X86_64_XEN
18 help
19 With this option you can enable support for AMD IOMMU hardware in
20 your system. An IOMMU is a hardware component which provides
21 @@ -629,8 +629,10 @@ config MAXSMP
22
23 config NR_CPUS
24 int "Maximum number of CPUs (2-4096)"
25 + range 2 32 if XEN
26 range 2 4096
27 depends on SMP
28 + default "32" if MAXSMP && XEN
29 default "4096" if MAXSMP
30 default "32" if X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000
31 default "16" if X86_64_XEN
32 @@ -1227,7 +1229,7 @@ config MTRR
33 config MTRR_SANITIZER
34 bool
35 prompt "MTRR cleanup support"
36 - depends on MTRR
37 + depends on MTRR && !XEN
38 help
39 Convert MTRR layout from continuous to discrete, so X drivers can
40 add writeback entries.
41 --- sle11-2009-10-16.orig/arch/x86/Kconfig.debug 2009-03-16 16:33:40.000000000 +0100
42 +++ sle11-2009-10-16/arch/x86/Kconfig.debug 2009-06-04 10:21:39.000000000 +0200
43 @@ -25,6 +25,7 @@ config STRICT_DEVMEM
44 config X86_VERBOSE_BOOTUP
45 bool "Enable verbose x86 bootup info messages"
46 default y
47 + depends on !XEN
48 help
49 Enables the informational output from the decompression stage
50 (e.g. bzImage) of the boot. If you disable this you will still
51 @@ -179,7 +180,7 @@ config MMIOTRACE_HOOKS
52
53 config MMIOTRACE
54 bool "Memory mapped IO tracing"
55 - depends on DEBUG_KERNEL && PCI
56 + depends on DEBUG_KERNEL && PCI && !XEN
57 select TRACING
58 select MMIOTRACE_HOOKS
59 help
60 --- sle11-2009-10-16.orig/arch/x86/Makefile 2009-02-16 16:18:36.000000000 +0100
61 +++ sle11-2009-10-16/arch/x86/Makefile 2009-06-04 10:21:39.000000000 +0200
62 @@ -116,8 +116,8 @@ mflags-$(CONFIG_X86_VOYAGER) := -Iinclud
63 mcore-$(CONFIG_X86_VOYAGER) := arch/x86/mach-voyager/
64
65 # Xen subarch support
66 -mflags-$(CONFIG_X86_XEN) := -Iinclude/asm-x86/mach-xen
67 -mcore-$(CONFIG_X86_XEN) := arch/x86/mach-xen/
68 +mflags-$(CONFIG_XEN) := -Iinclude/asm-x86/mach-xen
69 +mcore-$(CONFIG_XEN) := arch/x86/mach-xen/
70
71 # generic subarchitecture
72 mflags-$(CONFIG_X86_GENERICARCH):= -Iinclude/asm-x86/mach-generic
73 @@ -128,8 +128,10 @@ mcore-$(CONFIG_X86_GENERICARCH) := arch/
74 mflags-y += -Iinclude/asm-x86/mach-default
75
76 # 64 bit does not support subarch support - clear sub arch variables
77 +ifneq ($(CONFIG_XEN),y)
78 fcore-$(CONFIG_X86_64) :=
79 mcore-$(CONFIG_X86_64) :=
80 +endif
81
82 KBUILD_CFLAGS += $(mflags-y)
83 KBUILD_AFLAGS += $(mflags-y)
84 --- sle11-2009-10-16.orig/arch/x86/ia32/ia32entry-xen.S 2009-03-16 16:38:05.000000000 +0100
85 +++ sle11-2009-10-16/arch/x86/ia32/ia32entry-xen.S 2009-10-16 14:51:56.000000000 +0200
86 @@ -15,6 +15,16 @@
87 #include <asm/irqflags.h>
88 #include <linux/linkage.h>
89
90 +/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
91 +#include <linux/elf-em.h>
92 +#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE)
93 +#define __AUDIT_ARCH_LE 0x40000000
94 +
95 +#ifndef CONFIG_AUDITSYSCALL
96 +#define sysexit_audit int_ret_from_sys_call
97 +#define sysretl_audit int_ret_from_sys_call
98 +#endif
99 +
100 #define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8)
101
102 .macro IA32_ARG_FIXUP noebp=0
103 @@ -37,6 +47,11 @@
104 movq %rax,R8(%rsp)
105 .endm
106
107 + /*
108 + * Reload arg registers from stack in case ptrace changed them.
109 + * We don't reload %eax because syscall_trace_enter() returned
110 + * the value it wants us to use in the table lookup.
111 + */
112 .macro LOAD_ARGS32 offset
113 movl \offset(%rsp),%r11d
114 movl \offset+8(%rsp),%r10d
115 @@ -46,7 +61,6 @@
116 movl \offset+48(%rsp),%edx
117 movl \offset+56(%rsp),%esi
118 movl \offset+64(%rsp),%edi
119 - movl \offset+72(%rsp),%eax
120 .endm
121
122 .macro CFI_STARTPROC32 simple
123 @@ -61,6 +75,19 @@
124 CFI_UNDEFINED r15
125 .endm
126
127 +#ifdef CONFIG_PARAVIRT
128 +ENTRY(native_usergs_sysret32)
129 + swapgs
130 + sysretl
131 +ENDPROC(native_usergs_sysret32)
132 +
133 +ENTRY(native_irq_enable_sysexit)
134 + swapgs
135 + sti
136 + sysexit
137 +ENDPROC(native_irq_enable_sysexit)
138 +#endif
139 +
140 /*
141 * 32bit SYSENTER instruction entry.
142 *
143 @@ -98,7 +125,7 @@ ENTRY(ia32_sysenter_target)
144 CFI_RESTORE rcx
145 movl %ebp,%ebp /* zero extension */
146 movl %eax,%eax
147 - movl 48-THREAD_SIZE+threadinfo_sysenter_return(%rsp),%r10d
148 + movl 48-THREAD_SIZE+TI_sysenter_return(%rsp),%r10d
149 movl $__USER32_DS,40(%rsp)
150 movq %rbp,32(%rsp)
151 movl $__USER32_CS,16(%rsp)
152 @@ -113,19 +140,75 @@ ENTRY(ia32_sysenter_target)
153 .quad 1b,ia32_badarg
154 .previous
155 GET_THREAD_INFO(%r10)
156 - orl $TS_COMPAT,threadinfo_status(%r10)
157 - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
158 + orl $TS_COMPAT,TI_status(%r10)
159 + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
160 jnz sysenter_tracesys
161 -sysenter_do_call:
162 cmpl $(IA32_NR_syscalls-1),%eax
163 ja ia32_badsys
164 +sysenter_do_call:
165 IA32_ARG_FIXUP 1
166 +sysenter_dispatch:
167 call *ia32_sys_call_table(,%rax,8)
168 movq %rax,RAX-ARGOFFSET(%rsp)
169 + GET_THREAD_INFO(%r10)
170 + DISABLE_INTERRUPTS(CLBR_NONE)
171 + TRACE_IRQS_OFF
172 + testl $_TIF_ALLWORK_MASK,TI_flags(%r10)
173 + jnz sysexit_audit
174 jmp int_ret_from_sys_call
175
176 +#ifdef CONFIG_AUDITSYSCALL
177 + .macro auditsys_entry_common
178 + movl %esi,%r9d /* 6th arg: 4th syscall arg */
179 + movl %edx,%r8d /* 5th arg: 3rd syscall arg */
180 + /* (already in %ecx) 4th arg: 2nd syscall arg */
181 + movl %ebx,%edx /* 3rd arg: 1st syscall arg */
182 + movl %eax,%esi /* 2nd arg: syscall number */
183 + movl $AUDIT_ARCH_I386,%edi /* 1st arg: audit arch */
184 + call audit_syscall_entry
185 + movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall number */
186 + cmpl $(IA32_NR_syscalls-1),%eax
187 + ja ia32_badsys
188 + movl %ebx,%edi /* reload 1st syscall arg */
189 + movl RCX-ARGOFFSET(%rsp),%esi /* reload 2nd syscall arg */
190 + movl RDX-ARGOFFSET(%rsp),%edx /* reload 3rd syscall arg */
191 + movl RSI-ARGOFFSET(%rsp),%ecx /* reload 4th syscall arg */
192 + movl RDI-ARGOFFSET(%rsp),%r8d /* reload 5th syscall arg */
193 + .endm
194 +
195 + .macro auditsys_exit exit,ebpsave=RBP
196 + testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
197 + jnz int_ret_from_sys_call
198 + TRACE_IRQS_ON
199 + ENABLE_INTERRUPTS(CLBR_NONE)
200 + movl %eax,%esi /* second arg, syscall return value */
201 + cmpl $0,%eax /* is it < 0? */
202 + setl %al /* 1 if so, 0 if not */
203 + movzbl %al,%edi /* zero-extend that into %edi */
204 + inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
205 + call audit_syscall_exit
206 + movl \ebpsave-ARGOFFSET(%rsp),%ebp /* reload user register value */
207 + movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
208 + DISABLE_INTERRUPTS(CLBR_NONE)
209 + TRACE_IRQS_OFF
210 + jmp int_with_check
211 + .endm
212 +
213 +sysenter_auditsys:
214 + auditsys_entry_common
215 + movl %ebp,%r9d /* reload 6th syscall arg */
216 + jmp sysenter_dispatch
217 +
218 +sysexit_audit:
219 + auditsys_exit sysexit_from_sys_call
220 +#endif
221 +
222 sysenter_tracesys:
223 xchgl %r9d,%ebp
224 +#ifdef CONFIG_AUDITSYSCALL
225 + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
226 + jz sysenter_auditsys
227 +#endif
228 SAVE_REST
229 CLEAR_RREGS
230 movq %r9,R9(%rsp)
231 @@ -186,18 +269,38 @@ ENTRY(ia32_cstar_target)
232 .quad 1b,ia32_badarg
233 .previous
234 GET_THREAD_INFO(%r10)
235 - orl $TS_COMPAT,threadinfo_status(%r10)
236 - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
237 + orl $TS_COMPAT,TI_status(%r10)
238 + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
239 jnz cstar_tracesys
240 cstar_do_call:
241 cmpl $IA32_NR_syscalls-1,%eax
242 ja ia32_badsys
243 IA32_ARG_FIXUP 1
244 +cstar_dispatch:
245 call *ia32_sys_call_table(,%rax,8)
246 movq %rax,RAX-ARGOFFSET(%rsp)
247 + GET_THREAD_INFO(%r10)
248 + DISABLE_INTERRUPTS(CLBR_NONE)
249 + testl $_TIF_ALLWORK_MASK,TI_flags(%r10)
250 + jnz sysretl_audit
251 jmp int_ret_from_sys_call
252
253 -cstar_tracesys:
254 +#ifdef CONFIG_AUDITSYSCALL
255 +cstar_auditsys:
256 + movl %r9d,R9-ARGOFFSET(%rsp) /* register to be clobbered by call */
257 + auditsys_entry_common
258 + movl R9-ARGOFFSET(%rsp),%r9d /* reload 6th syscall arg */
259 + jmp cstar_dispatch
260 +
261 +sysretl_audit:
262 + auditsys_exit sysretl_from_sys_call, RCX /* user %ebp in RCX slot */
263 +#endif
264 +
265 +cstar_tracesys:
266 +#ifdef CONFIG_AUDITSYSCALL
267 + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
268 + jz cstar_auditsys
269 +#endif
270 xchgl %r9d,%ebp
271 SAVE_REST
272 CLEAR_RREGS
273 @@ -263,8 +366,8 @@ ENTRY(ia32_syscall)
274 this could be a problem. */
275 SAVE_ARGS 0,0,1
276 GET_THREAD_INFO(%r10)
277 - orl $TS_COMPAT,threadinfo_status(%r10)
278 - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
279 + orl $TS_COMPAT,TI_status(%r10)
280 + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
281 jnz ia32_tracesys
282 ia32_do_syscall:
283 cmpl $(IA32_NR_syscalls-1),%eax
284 @@ -309,13 +412,11 @@ quiet_ni_syscall:
285 PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn, %rdi
286 PTREGSCALL stub32_sigreturn, sys32_sigreturn, %rdi
287 PTREGSCALL stub32_sigaltstack, sys32_sigaltstack, %rdx
288 - PTREGSCALL stub32_sigsuspend, sys32_sigsuspend, %rcx
289 PTREGSCALL stub32_execve, sys32_execve, %rcx
290 PTREGSCALL stub32_fork, sys_fork, %rdi
291 PTREGSCALL stub32_clone, sys32_clone, %rdx
292 PTREGSCALL stub32_vfork, sys_vfork, %rdi
293 PTREGSCALL stub32_iopl, sys_iopl, %rsi
294 - PTREGSCALL stub32_rt_sigsuspend, sys_rt_sigsuspend, %rdx
295
296 ENTRY(ia32_ptregs_common)
297 popq %r11
298 @@ -415,7 +516,7 @@ ia32_sys_call_table:
299 .quad sys_ssetmask
300 .quad sys_setreuid16 /* 70 */
301 .quad sys_setregid16
302 - .quad stub32_sigsuspend
303 + .quad sys32_sigsuspend
304 .quad compat_sys_sigpending
305 .quad sys_sethostname
306 .quad compat_sys_setrlimit /* 75 */
307 @@ -522,7 +623,7 @@ ia32_sys_call_table:
308 .quad sys32_rt_sigpending
309 .quad compat_sys_rt_sigtimedwait
310 .quad sys32_rt_sigqueueinfo
311 - .quad stub32_rt_sigsuspend
312 + .quad sys_rt_sigsuspend
313 .quad sys32_pread /* 180 */
314 .quad sys32_pwrite
315 .quad sys_chown16
316 @@ -670,4 +771,10 @@ ia32_sys_call_table:
317 .quad sys32_fallocate
318 .quad compat_sys_timerfd_settime /* 325 */
319 .quad compat_sys_timerfd_gettime
320 + .quad compat_sys_signalfd4
321 + .quad sys_eventfd2
322 + .quad sys_epoll_create1
323 + .quad sys_dup3 /* 330 */
324 + .quad sys_pipe2
325 + .quad sys_inotify_init1
326 ia32_syscall_end:
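The ia32entry-xen.S hunks above route the 32-bit compat entry points (sysenter, syscall, int $0x80) through the new syscall-audit fast path: auditsys_entry_common marshals the syscall number and arguments into the C calling convention before calling audit_syscall_entry, and auditsys_exit reports the result through audit_syscall_exit. The sketch below shows, in plain C, the convention the assembly glue appears to target; the prototypes follow the 2.6.27-era audit API as best understood here, and the helper name compat_syscall_audited as well as the printf stand-ins are purely illustrative.

/*
 * Illustrative sketch only: the C-level calling convention behind the
 * auditsys_entry_common / auditsys_exit assembly macros above.  The
 * prototypes mirror the 2.6.27-era audit hooks as understood here;
 * verify against include/linux/audit.h of the tree being patched.
 */
#include <stdio.h>

#define AUDIT_ARCH_I386   (3 /* EM_386 */ | 0x40000000 /* __AUDIT_ARCH_LE */)
#define AUDITSC_SUCCESS   1
#define AUDITSC_FAILURE   2

/* Stand-ins for the real kernel hooks, so the sketch is self-contained. */
static void audit_syscall_entry(int arch, int major, unsigned long a1,
				unsigned long a2, unsigned long a3,
				unsigned long a4)
{
	printf("audit entry: arch=%#x nr=%d args=%lu,%lu,%lu,%lu\n",
	       (unsigned)arch, major, a1, a2, a3, a4);
}

static void audit_syscall_exit(int valid, long return_code)
{
	printf("audit exit: %s ret=%ld\n",
	       valid == AUDITSC_SUCCESS ? "success" : "failure", return_code);
}

/* Roughly what the fast path does around the ia32_sys_call_table dispatch. */
static long compat_syscall_audited(int nr, unsigned long a1, unsigned long a2,
				   unsigned long a3, unsigned long a4)
{
	long ret;

	audit_syscall_entry(AUDIT_ARCH_I386, nr, a1, a2, a3, a4);
	ret = -38;	/* placeholder; the real path calls ia32_sys_call_table[nr] */
	audit_syscall_exit(ret >= 0 ? AUDITSC_SUCCESS : AUDITSC_FAILURE, ret);
	return ret;
}

int main(void)
{
	compat_syscall_audited(4 /* write */, 1, 0, 0, 0);
	return 0;
}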
327 --- sle11-2009-10-16.orig/arch/x86/kernel/Makefile 2009-03-16 16:38:05.000000000 +0100
328 +++ sle11-2009-10-16/arch/x86/kernel/Makefile 2009-06-04 10:21:39.000000000 +0200
329 @@ -120,9 +120,10 @@ ifeq ($(CONFIG_X86_64),y)
330
331 obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o
332
333 - obj-$(CONFIG_XEN) += nmi_64.o
334 + obj-$(CONFIG_XEN) += nmi.o
335 time_64-$(CONFIG_XEN) += time_32.o
336 endif
337
338 -disabled-obj-$(CONFIG_XEN) := crash.o early-quirks.o hpet.o i8253.o i8259_$(BITS).o \
339 - pci-swiotlb_64.o reboot.o smpboot.o tlb_$(BITS).o tsc_$(BITS).o tsc_sync.o vsmp_64.o
340 +disabled-obj-$(CONFIG_XEN) := %_uv.o crash.o early-quirks.o hpet.o i8253.o \
341 + i8259.o irqinit_$(BITS).o pci-swiotlb_64.o reboot.o smpboot.o \
342 + tlb_$(BITS).o tsc.o tsc_sync.o vsmp_64.o
343 --- sle11-2009-10-16.orig/arch/x86/kernel/acpi/boot.c 2009-08-26 11:55:26.000000000 +0200
344 +++ sle11-2009-10-16/arch/x86/kernel/acpi/boot.c 2009-08-26 12:03:49.000000000 +0200
345 @@ -949,7 +949,9 @@ void __init mp_register_ioapic(int id, u
346 mp_ioapics[idx].mp_flags = MPC_APIC_USABLE;
347 mp_ioapics[idx].mp_apicaddr = address;
348
349 +#ifndef CONFIG_XEN
350 set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
351 +#endif
352 mp_ioapics[idx].mp_apicid = uniq_ioapic_id(id);
353 #ifdef CONFIG_X86_32
354 mp_ioapics[idx].mp_apicver = io_apic_get_version(idx);
355 @@ -1106,7 +1108,7 @@ int mp_register_gsi(u32 gsi, int trigger
356 {
357 int ioapic;
358 int ioapic_pin;
359 -#ifdef CONFIG_X86_32
360 +#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
361 #define MAX_GSI_NUM 4096
362 #define IRQ_COMPRESSION_START 64
363
364 @@ -1154,7 +1156,7 @@ int mp_register_gsi(u32 gsi, int trigger
365 if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) {
366 pr_debug(KERN_DEBUG "Pin %d-%d already programmed\n",
367 mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
368 -#ifdef CONFIG_X86_32
369 +#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
370 return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
371 #else
372 return gsi;
373 @@ -1162,7 +1164,7 @@ int mp_register_gsi(u32 gsi, int trigger
374 }
375
376 set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed);
377 -#ifdef CONFIG_X86_32
378 +#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
379 /*
380 * For GSI >= 64, use IRQ compression
381 */
382 --- sle11-2009-10-16.orig/arch/x86/kernel/acpi/sleep-xen.c 2009-03-16 16:38:05.000000000 +0100
383 +++ sle11-2009-10-16/arch/x86/kernel/acpi/sleep-xen.c 2009-06-04 10:21:39.000000000 +0200
384 @@ -9,6 +9,7 @@
385 #include <linux/bootmem.h>
386 #include <linux/dmi.h>
387 #include <linux/cpumask.h>
388 +#include <asm/segment.h>
389
390 #include "realmode/wakeup.h"
391 #include "sleep.h"
392 @@ -20,7 +21,7 @@ unsigned long acpi_realmode_flags;
393 /* address in low memory of the wakeup routine. */
394 static unsigned long acpi_realmode;
395
396 -#ifdef CONFIG_64BIT
397 +#if defined(CONFIG_SMP) && defined(CONFIG_64BIT)
398 static char temp_stack[10240];
399 #endif
400 #endif
401 @@ -54,18 +55,27 @@ int acpi_save_state_mem(void)
402 header->video_mode = saved_video_mode;
403
404 header->wakeup_jmp_seg = acpi_wakeup_address >> 4;
405 +
406 + /*
407 + * Set up the wakeup GDT. We set these up as Big Real Mode,
408 + * that is, with limits set to 4 GB. At least the Lenovo
409 + * Thinkpad X61 is known to need this for the video BIOS
410 + * initialization quirk to work; this is likely to also
411 + * be the case for other laptops or integrated video devices.
412 + */
413 +
414 /* GDT[0]: GDT self-pointer */
415 header->wakeup_gdt[0] =
416 (u64)(sizeof(header->wakeup_gdt) - 1) +
417 ((u64)(acpi_wakeup_address +
418 ((char *)&header->wakeup_gdt - (char *)acpi_realmode))
419 << 16);
420 - /* GDT[1]: real-mode-like code segment */
421 - header->wakeup_gdt[1] = (0x009bULL << 40) +
422 - ((u64)acpi_wakeup_address << 16) + 0xffff;
423 - /* GDT[2]: real-mode-like data segment */
424 - header->wakeup_gdt[2] = (0x0093ULL << 40) +
425 - ((u64)acpi_wakeup_address << 16) + 0xffff;
426 + /* GDT[1]: big real mode-like code segment */
427 + header->wakeup_gdt[1] =
428 + GDT_ENTRY(0x809b, acpi_wakeup_address, 0xfffff);
429 + /* GDT[2]: big real mode-like data segment */
430 + header->wakeup_gdt[2] =
431 + GDT_ENTRY(0x8093, acpi_wakeup_address, 0xfffff);
432
433 #ifndef CONFIG_64BIT
434 store_gdt((struct desc_ptr *)&header->pmode_gdt);
435 @@ -79,7 +89,7 @@ int acpi_save_state_mem(void)
436 #endif /* !CONFIG_64BIT */
437
438 header->pmode_cr0 = read_cr0();
439 - header->pmode_cr4 = read_cr4();
440 + header->pmode_cr4 = read_cr4_safe();
441 header->realmode_flags = acpi_realmode_flags;
442 header->real_magic = 0x12345678;
443
444 @@ -89,7 +99,9 @@ int acpi_save_state_mem(void)
445 saved_magic = 0x12345678;
446 #else /* CONFIG_64BIT */
447 header->trampoline_segment = setup_trampoline() >> 4;
448 - init_rsp = (unsigned long)temp_stack + 4096;
449 +#ifdef CONFIG_SMP
450 + stack_start.sp = temp_stack + 4096;
451 +#endif
452 initial_code = (unsigned long)wakeup_long64;
453 saved_magic = 0x123456789abcdef0;
454 #endif /* CONFIG_64BIT */
455 @@ -145,6 +157,12 @@ static int __init acpi_sleep_setup(char
456 acpi_realmode_flags |= 2;
457 if (strncmp(str, "s3_beep", 7) == 0)
458 acpi_realmode_flags |= 4;
459 +#ifdef CONFIG_HIBERNATION
460 + if (strncmp(str, "s4_nohwsig", 10) == 0)
461 + acpi_no_s4_hw_signature();
462 +#endif
463 + if (strncmp(str, "old_ordering", 12) == 0)
464 + acpi_old_suspend_ordering();
465 str = strchr(str, ',');
466 if (str != NULL)
467 str += strspn(str, ", \t");
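The sleep-xen.c hunk above replaces the hand-packed wakeup GDT descriptors with GDT_ENTRY(0x809b, ...) and GDT_ENTRY(0x8093, ...), i.e. "big real mode" code and data segments with a 4 GB limit, as the new comment explains. The sketch below unpacks what those calls encode; the GDT_ENTRY macro shown mirrors the mainline <asm/segment.h> helper as best understood here, and the sample wakeup address is hypothetical.

/*
 * Sketch of the segment-descriptor packing behind the new
 * GDT_ENTRY(0x809b, acpi_wakeup_address, 0xfffff) calls above.
 * Illustrative only; not the authoritative kernel definition.
 */
#include <stdio.h>
#include <stdint.h>

#define GDT_ENTRY(flags, base, limit)                              \
	((((uint64_t)(base)  & 0xff000000ULL) << (56 - 24)) |      \
	 (((uint64_t)(flags) & 0x0000f0ffULL) << 40)        |      \
	 (((uint64_t)(limit) & 0x000f0000ULL) << (48 - 16)) |      \
	 (((uint64_t)(base)  & 0x00ffffffULL) << 16)        |      \
	  ((uint64_t)(limit) & 0x0000ffffULL))

int main(void)
{
	/* Hypothetical low-memory wakeup trampoline address, for illustration. */
	uint32_t acpi_wakeup_address = 0x9a000;

	/*
	 * flags 0x809b: present, DPL 0, code segment (exec/read, accessed),
	 * granularity bit set -- with limit 0xfffff that yields a 4 GB limit,
	 * the big-real-mode segment the hunk's comment describes; 0x8093 is
	 * the matching data segment.
	 */
	uint64_t code = GDT_ENTRY(0x809b, acpi_wakeup_address, 0xfffff);
	uint64_t data = GDT_ENTRY(0x8093, acpi_wakeup_address, 0xfffff);

	printf("wakeup code descriptor: %#018llx\n", (unsigned long long)code);
	printf("wakeup data descriptor: %#018llx\n", (unsigned long long)data);
	return 0;
}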
468 --- sle11-2009-10-16.orig/arch/x86/kernel/apic_32-xen.c 2009-03-16 16:33:40.000000000 +0100
469 +++ sle11-2009-10-16/arch/x86/kernel/apic_32-xen.c 2009-06-04 10:21:39.000000000 +0200
470 @@ -59,7 +59,10 @@ static cpumask_t timer_bcast_ipi;
471 /*
472 * Debug level, exported for io_apic.c
473 */
474 -int apic_verbosity;
475 +unsigned int apic_verbosity;
476 +
477 +/* Have we found an MP table */
478 +int smp_found_config;
479
480 #ifndef CONFIG_XEN
481 static int modern_apic(void)
482 --- sle11-2009-10-16.orig/arch/x86/kernel/apic_64-xen.c 2009-03-16 16:33:40.000000000 +0100
483 +++ sle11-2009-10-16/arch/x86/kernel/apic_64-xen.c 2009-06-04 10:21:39.000000000 +0200
484 @@ -39,7 +39,10 @@ int disable_apic;
485 /*
486 * Debug level, exported for io_apic.c
487 */
488 -int apic_verbosity;
489 +unsigned int apic_verbosity;
490 +
491 +/* Have we found an MP table */
492 +int smp_found_config;
493
494 /*
495 * The guts of the apic timer interrupt
496 --- sle11-2009-10-16.orig/arch/x86/kernel/asm-offsets_64.c 2008-11-25 12:35:54.000000000 +0100
497 +++ sle11-2009-10-16/arch/x86/kernel/asm-offsets_64.c 2009-06-04 10:21:39.000000000 +0200
498 @@ -138,7 +138,7 @@ int main(void)
499
500 BLANK();
501 DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
502 -#ifdef CONFIG_XEN
503 +#ifdef CONFIG_PARAVIRT_XEN
504 BLANK();
505 OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
506 OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
507 --- sle11-2009-10-16.orig/arch/x86/kernel/cpu/amd_64.c 2009-10-28 14:55:02.000000000 +0100
508 +++ sle11-2009-10-16/arch/x86/kernel/cpu/amd_64.c 2009-06-04 10:21:39.000000000 +0200
509 @@ -193,6 +193,7 @@ static void __cpuinit init_amd(struct cp
510 fam10h_check_enable_mmcfg();
511 }
512
513 +#ifndef CONFIG_XEN
514 if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
515 unsigned long long tseg;
516
517 @@ -211,6 +212,7 @@ static void __cpuinit init_amd(struct cp
518 set_memory_4k((unsigned long)__va(tseg), 1);
519 }
520 }
521 +#endif
522 }
523
524 static struct cpu_dev amd_cpu_dev __cpuinitdata = {
525 --- sle11-2009-10-16.orig/arch/x86/kernel/cpu/bugs_64.c 2009-10-28 14:55:02.000000000 +0100
526 +++ sle11-2009-10-16/arch/x86/kernel/cpu/bugs_64.c 2009-06-04 10:21:39.000000000 +0200
527 @@ -20,6 +20,7 @@ void __init check_bugs(void)
528 #endif
529 alternative_instructions();
530
531 +#ifndef CONFIG_XEN
532 /*
533 * Make sure the first 2MB area is not mapped by huge pages
534 * There are typically fixed size MTRRs in there and overlapping
535 @@ -30,4 +31,5 @@ void __init check_bugs(void)
536 */
537 if (!direct_gbpages)
538 set_memory_4k((unsigned long)__va(0), 1);
539 +#endif
540 }
541 --- sle11-2009-10-16.orig/arch/x86/kernel/cpu/common-xen.c 2009-03-16 16:38:05.000000000 +0100
542 +++ sle11-2009-10-16/arch/x86/kernel/cpu/common-xen.c 2009-06-04 10:21:39.000000000 +0200
543 @@ -13,6 +13,7 @@
544 #include <asm/mtrr.h>
545 #include <asm/mce.h>
546 #include <asm/pat.h>
547 +#include <asm/asm.h>
548 #ifdef CONFIG_X86_LOCAL_APIC
549 #include <asm/mpspec.h>
550 #include <asm/apic.h>
551 @@ -341,11 +342,24 @@ static void __init early_cpu_detect(void
552
553 get_cpu_vendor(c, 1);
554
555 + early_get_cap(c);
556 +
557 if (c->x86_vendor != X86_VENDOR_UNKNOWN &&
558 cpu_devs[c->x86_vendor]->c_early_init)
559 cpu_devs[c->x86_vendor]->c_early_init(c);
560 +}
561
562 - early_get_cap(c);
563 +/*
564 + * The NOPL instruction is supposed to exist on all CPUs with
565 + * family >= 6; unfortunately, that's not true in practice because
566 + * of early VIA chips and (more importantly) broken virtualizers that
567 + * are not easy to detect. In the latter case it doesn't even *fail*
568 + * reliably, so probing for it doesn't even work. Disable it completely
569 + * unless we can find a reliable way to detect all the broken cases.
570 + */
571 +static void __cpuinit detect_nopl(struct cpuinfo_x86 *c)
572 +{
573 + clear_cpu_cap(c, X86_FEATURE_NOPL);
574 }
575
576 static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
577 @@ -402,8 +416,8 @@ static void __cpuinit generic_identify(s
578 }
579
580 init_scattered_cpuid_features(c);
581 + detect_nopl(c);
582 }
583 -
584 }
585
586 static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
587 @@ -434,7 +448,7 @@ __setup("serialnumber", x86_serial_nr_se
588 /*
589 * This does the hard work of actually picking apart the CPU stuff...
590 */
591 -void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
592 +static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
593 {
594 int i;
595
596 @@ -448,6 +462,8 @@ void __cpuinit identify_cpu(struct cpuin
597 c->x86_max_cores = 1;
598 c->x86_clflush_size = 32;
599 memset(&c->x86_capability, 0, sizeof c->x86_capability);
600 + if (boot_cpu_has(X86_FEATURE_SYSCALL32))
601 + set_cpu_cap(c, X86_FEATURE_SYSCALL32);
602
603 if (!have_cpuid_p()) {
604 /*
605 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
606 +++ sle11-2009-10-16/arch/x86/kernel/cpu/common_64-xen.c 2009-06-04 10:21:39.000000000 +0200
607 @@ -0,0 +1,771 @@
608 +#include <linux/init.h>
609 +#include <linux/kernel.h>
610 +#include <linux/sched.h>
611 +#include <linux/string.h>
612 +#include <linux/bootmem.h>
613 +#include <linux/bitops.h>
614 +#include <linux/module.h>
615 +#include <linux/kgdb.h>
616 +#include <linux/topology.h>
617 +#include <linux/delay.h>
618 +#include <linux/smp.h>
619 +#include <linux/percpu.h>
620 +#include <asm/i387.h>
621 +#include <asm/msr.h>
622 +#include <asm/io.h>
623 +#include <asm/linkage.h>
624 +#include <asm/mmu_context.h>
625 +#include <asm/mtrr.h>
626 +#include <asm/mce.h>
627 +#include <asm/pat.h>
628 +#include <asm/asm.h>
629 +#include <asm/numa.h>
630 +#ifdef CONFIG_X86_LOCAL_APIC
631 +#include <asm/mpspec.h>
632 +#include <asm/apic.h>
633 +#include <mach_apic.h>
634 +#elif defined(CONFIG_XEN)
635 +#include <mach_apic.h>
636 +#endif
637 +#include <asm/pda.h>
638 +#include <asm/pgtable.h>
639 +#include <asm/processor.h>
640 +#include <asm/desc.h>
641 +#include <asm/atomic.h>
642 +#include <asm/proto.h>
643 +#include <asm/sections.h>
644 +#include <asm/setup.h>
645 +#include <asm/genapic.h>
646 +
647 +#include "cpu.h"
648 +
649 +/* We need valid kernel segments for data and code in long mode too
650 + * IRET will check the segment types kkeil 2000/10/28
651 + * Also sysret mandates a special GDT layout
652 + */
653 +/* The TLS descriptors are currently at a different place compared to i386.
654 + Hopefully nobody expects them at a fixed place (Wine?) */
655 +DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
656 + [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
657 + [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },
658 + [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },
659 + [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },
660 + [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },
661 + [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } },
662 +} };
663 +EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
664 +
665 +__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
666 +
667 +/* Current gdt points %fs at the "master" per-cpu area: after this,
668 + * it's on the real one. */
669 +void switch_to_new_gdt(void)
670 +{
671 +#ifndef CONFIG_XEN
672 + struct desc_ptr gdt_descr;
673 +
674 + gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
675 + gdt_descr.size = GDT_SIZE - 1;
676 + load_gdt(&gdt_descr);
677 +#else
678 + void *va, *gdt_addr = get_cpu_gdt_table(smp_processor_id());
679 + unsigned long frames[16];
680 + unsigned int f = 0;
681 +
682 + for (va = gdt_addr; va < gdt_addr + GDT_SIZE; va += PAGE_SIZE) {
683 + frames[f++] = virt_to_mfn(va);
684 + make_page_readonly(va, XENFEAT_writable_descriptor_tables);
685 + }
686 + if (HYPERVISOR_set_gdt(frames, GDT_SIZE / sizeof(struct desc_struct)))
687 + BUG();
688 +#endif
689 +}
690 +
691 +struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
692 +
693 +static void __cpuinit default_init(struct cpuinfo_x86 *c)
694 +{
695 + display_cacheinfo(c);
696 +}
697 +
698 +static struct cpu_dev __cpuinitdata default_cpu = {
699 + .c_init = default_init,
700 + .c_vendor = "Unknown",
701 +};
702 +static struct cpu_dev *this_cpu __cpuinitdata = &default_cpu;
703 +
704 +int __cpuinit get_model_name(struct cpuinfo_x86 *c)
705 +{
706 + unsigned int *v;
707 +
708 + if (c->extended_cpuid_level < 0x80000004)
709 + return 0;
710 +
711 + v = (unsigned int *) c->x86_model_id;
712 + cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
713 + cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
714 + cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
715 + c->x86_model_id[48] = 0;
716 + return 1;
717 +}
718 +
719 +
720 +void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
721 +{
722 + unsigned int n, dummy, ebx, ecx, edx;
723 +
724 + n = c->extended_cpuid_level;
725 +
726 + if (n >= 0x80000005) {
727 + cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
728 + printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), "
729 + "D cache %dK (%d bytes/line)\n",
730 + edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
731 + c->x86_cache_size = (ecx>>24) + (edx>>24);
732 + /* On K8 L1 TLB is inclusive, so don't count it */
733 + c->x86_tlbsize = 0;
734 + }
735 +
736 + if (n >= 0x80000006) {
737 + cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
738 + ecx = cpuid_ecx(0x80000006);
739 + c->x86_cache_size = ecx >> 16;
740 + c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
741 +
742 + printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
743 + c->x86_cache_size, ecx & 0xFF);
744 + }
745 +}
746 +
747 +void __cpuinit detect_ht(struct cpuinfo_x86 *c)
748 +{
749 +#ifdef CONFIG_SMP
750 + u32 eax, ebx, ecx, edx;
751 + int index_msb, core_bits;
752 +
753 + cpuid(1, &eax, &ebx, &ecx, &edx);
754 +
755 +
756 + if (!cpu_has(c, X86_FEATURE_HT))
757 + return;
758 + if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
759 + goto out;
760 +
761 + smp_num_siblings = (ebx & 0xff0000) >> 16;
762 +
763 + if (smp_num_siblings == 1) {
764 + printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
765 + } else if (smp_num_siblings > 1) {
766 +
767 + if (smp_num_siblings > NR_CPUS) {
768 + printk(KERN_WARNING "CPU: Unsupported number of "
769 + "siblings %d", smp_num_siblings);
770 + smp_num_siblings = 1;
771 + return;
772 + }
773 +
774 + index_msb = get_count_order(smp_num_siblings);
775 + c->phys_proc_id = phys_pkg_id(index_msb);
776 +
777 + smp_num_siblings = smp_num_siblings / c->x86_max_cores;
778 +
779 + index_msb = get_count_order(smp_num_siblings);
780 +
781 + core_bits = get_count_order(c->x86_max_cores);
782 +
783 + c->cpu_core_id = phys_pkg_id(index_msb) &
784 + ((1 << core_bits) - 1);
785 + }
786 +out:
787 + if ((c->x86_max_cores * smp_num_siblings) > 1) {
788 + printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
789 + c->phys_proc_id);
790 + printk(KERN_INFO "CPU: Processor Core ID: %d\n",
791 + c->cpu_core_id);
792 + }
793 +
794 +#endif
795 +}
796 +
797 +static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
798 +{
799 + char *v = c->x86_vendor_id;
800 + int i;
801 + static int printed;
802 +
803 + for (i = 0; i < X86_VENDOR_NUM; i++) {
804 + if (cpu_devs[i]) {
805 + if (!strcmp(v, cpu_devs[i]->c_ident[0]) ||
806 + (cpu_devs[i]->c_ident[1] &&
807 + !strcmp(v, cpu_devs[i]->c_ident[1]))) {
808 + c->x86_vendor = i;
809 + this_cpu = cpu_devs[i];
810 + return;
811 + }
812 + }
813 + }
814 + if (!printed) {
815 + printed++;
816 + printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n");
817 + printk(KERN_ERR "CPU: Your system may be unstable.\n");
818 + }
819 + c->x86_vendor = X86_VENDOR_UNKNOWN;
820 +}
821 +
822 +static void __init early_cpu_support_print(void)
823 +{
824 + int i,j;
825 + struct cpu_dev *cpu_devx;
826 +
827 + printk("KERNEL supported cpus:\n");
828 + for (i = 0; i < X86_VENDOR_NUM; i++) {
829 + cpu_devx = cpu_devs[i];
830 + if (!cpu_devx)
831 + continue;
832 + for (j = 0; j < 2; j++) {
833 + if (!cpu_devx->c_ident[j])
834 + continue;
835 + printk(" %s %s\n", cpu_devx->c_vendor,
836 + cpu_devx->c_ident[j]);
837 + }
838 + }
839 +}
840 +
841 +/*
842 + * The NOPL instruction is supposed to exist on all CPUs with
843 + * family >= 6, unfortunately, that's not true in practice because
844 + * of early VIA chips and (more importantly) broken virtualizers that
845 + * are not easy to detect. Hence, probe for it based on first
846 + * principles.
847 + *
848 + * Note: no 64-bit chip is known to lack these, but put the code here
849 + * for consistency with 32 bits, and to make it utterly trivial to
850 + * diagnose the problem should it ever surface.
851 + */
852 +static void __cpuinit detect_nopl(struct cpuinfo_x86 *c)
853 +{
854 + const u32 nopl_signature = 0x888c53b1; /* Random number */
855 + u32 has_nopl = nopl_signature;
856 +
857 + clear_cpu_cap(c, X86_FEATURE_NOPL);
858 + if (c->x86 >= 6) {
859 + asm volatile("\n"
860 + "1: .byte 0x0f,0x1f,0xc0\n" /* nopl %eax */
861 + "2:\n"
862 + " .section .fixup,\"ax\"\n"
863 + "3: xor %0,%0\n"
864 + " jmp 2b\n"
865 + " .previous\n"
866 + _ASM_EXTABLE(1b,3b)
867 + : "+a" (has_nopl));
868 +
869 + if (has_nopl == nopl_signature)
870 + set_cpu_cap(c, X86_FEATURE_NOPL);
871 + }
872 +}
873 +
874 +static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c);
875 +
876 +void __init early_cpu_init(void)
877 +{
878 + struct cpu_vendor_dev *cvdev;
879 +
880 + for (cvdev = __x86cpuvendor_start ;
881 + cvdev < __x86cpuvendor_end ;
882 + cvdev++)
883 + cpu_devs[cvdev->vendor] = cvdev->cpu_dev;
884 + early_cpu_support_print();
885 + early_identify_cpu(&boot_cpu_data);
886 +}
887 +
888 +/* Do some early cpuid on the boot CPU to get some parameter that are
889 + needed before check_bugs. Everything advanced is in identify_cpu
890 + below. */
891 +static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
892 +{
893 + u32 tfms, xlvl;
894 +
895 + c->loops_per_jiffy = loops_per_jiffy;
896 + c->x86_cache_size = -1;
897 + c->x86_vendor = X86_VENDOR_UNKNOWN;
898 + c->x86_model = c->x86_mask = 0; /* So far unknown... */
899 + c->x86_vendor_id[0] = '\0'; /* Unset */
900 + c->x86_model_id[0] = '\0'; /* Unset */
901 + c->x86_clflush_size = 64;
902 + c->x86_cache_alignment = c->x86_clflush_size;
903 + c->x86_max_cores = 1;
904 + c->x86_coreid_bits = 0;
905 + c->extended_cpuid_level = 0;
906 + memset(&c->x86_capability, 0, sizeof c->x86_capability);
907 +
908 + /* Get vendor name */
909 + cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
910 + (unsigned int *)&c->x86_vendor_id[0],
911 + (unsigned int *)&c->x86_vendor_id[8],
912 + (unsigned int *)&c->x86_vendor_id[4]);
913 +
914 + get_cpu_vendor(c);
915 +
916 + /* Initialize the standard set of capabilities */
917 + /* Note that the vendor-specific code below might override */
918 +
919 + /* Intel-defined flags: level 0x00000001 */
920 + if (c->cpuid_level >= 0x00000001) {
921 + __u32 misc;
922 + cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4],
923 + &c->x86_capability[0]);
924 + c->x86 = (tfms >> 8) & 0xf;
925 + c->x86_model = (tfms >> 4) & 0xf;
926 + c->x86_mask = tfms & 0xf;
927 + if (c->x86 == 0xf)
928 + c->x86 += (tfms >> 20) & 0xff;
929 + if (c->x86 >= 0x6)
930 + c->x86_model += ((tfms >> 16) & 0xF) << 4;
931 + if (test_cpu_cap(c, X86_FEATURE_CLFLSH))
932 + c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
933 + } else {
934 + /* Have CPUID level 0 only - unheard of */
935 + c->x86 = 4;
936 + }
937 +
938 + c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xff;
939 +#ifdef CONFIG_SMP
940 + c->phys_proc_id = c->initial_apicid;
941 +#endif
942 + /* AMD-defined flags: level 0x80000001 */
943 + xlvl = cpuid_eax(0x80000000);
944 + c->extended_cpuid_level = xlvl;
945 + if ((xlvl & 0xffff0000) == 0x80000000) {
946 + if (xlvl >= 0x80000001) {
947 + c->x86_capability[1] = cpuid_edx(0x80000001);
948 + c->x86_capability[6] = cpuid_ecx(0x80000001);
949 + }
950 + if (xlvl >= 0x80000004)
951 + get_model_name(c); /* Default name */
952 + }
953 +
954 + /* Transmeta-defined flags: level 0x80860001 */
955 + xlvl = cpuid_eax(0x80860000);
956 + if ((xlvl & 0xffff0000) == 0x80860000) {
957 + /* Don't set x86_cpuid_level here for now to not confuse. */
958 + if (xlvl >= 0x80860001)
959 + c->x86_capability[2] = cpuid_edx(0x80860001);
960 + }
961 +
962 + if (c->extended_cpuid_level >= 0x80000007)
963 + c->x86_power = cpuid_edx(0x80000007);
964 +
965 + if (c->extended_cpuid_level >= 0x80000008) {
966 + u32 eax = cpuid_eax(0x80000008);
967 +
968 + c->x86_virt_bits = (eax >> 8) & 0xff;
969 + c->x86_phys_bits = eax & 0xff;
970 + }
971 +
972 + detect_nopl(c);
973 +
974 + if (c->x86_vendor != X86_VENDOR_UNKNOWN &&
975 + cpu_devs[c->x86_vendor]->c_early_init)
976 + cpu_devs[c->x86_vendor]->c_early_init(c);
977 +
978 + validate_pat_support(c);
979 +}
980 +
981 +/*
982 + * This does the hard work of actually picking apart the CPU stuff...
983 + */
984 +static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
985 +{
986 + int i;
987 +
988 + early_identify_cpu(c);
989 +
990 + init_scattered_cpuid_features(c);
991 +
992 + c->apicid = phys_pkg_id(0);
993 +
994 + /*
995 + * Vendor-specific initialization. In this section we
996 + * canonicalize the feature flags, meaning if there are
997 + * features a certain CPU supports which CPUID doesn't
998 + * tell us, CPUID claiming incorrect flags, or other bugs,
999 + * we handle them here.
1000 + *
1001 + * At the end of this section, c->x86_capability better
1002 + * indicate the features this CPU genuinely supports!
1003 + */
1004 + if (this_cpu->c_init)
1005 + this_cpu->c_init(c);
1006 +
1007 + detect_ht(c);
1008 +
1009 + /*
1010 + * On SMP, boot_cpu_data holds the common feature set between
1011 + * all CPUs; so make sure that we indicate which features are
1012 + * common between the CPUs. The first time this routine gets
1013 + * executed, c == &boot_cpu_data.
1014 + */
1015 + if (c != &boot_cpu_data) {
1016 + /* AND the already accumulated flags with these */
1017 + for (i = 0; i < NCAPINTS; i++)
1018 + boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
1019 + }
1020 +
1021 + /* Clear all flags overriden by options */
1022 + for (i = 0; i < NCAPINTS; i++)
1023 + c->x86_capability[i] &= ~cleared_cpu_caps[i];
1024 +
1025 +#ifdef CONFIG_X86_MCE
1026 + mcheck_init(c);
1027 +#endif
1028 + select_idle_routine(c);
1029 +
1030 +#ifdef CONFIG_NUMA
1031 + numa_add_cpu(smp_processor_id());
1032 +#endif
1033 +
1034 +}
1035 +
1036 +void __cpuinit identify_boot_cpu(void)
1037 +{
1038 + identify_cpu(&boot_cpu_data);
1039 +}
1040 +
1041 +void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
1042 +{
1043 + BUG_ON(c == &boot_cpu_data);
1044 + identify_cpu(c);
1045 + mtrr_ap_init();
1046 +}
1047 +
1048 +static __init int setup_noclflush(char *arg)
1049 +{
1050 + setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
1051 + return 1;
1052 +}
1053 +__setup("noclflush", setup_noclflush);
1054 +
1055 +void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
1056 +{
1057 + if (c->x86_model_id[0])
1058 + printk(KERN_CONT "%s", c->x86_model_id);
1059 +
1060 + if (c->x86_mask || c->cpuid_level >= 0)
1061 + printk(KERN_CONT " stepping %02x\n", c->x86_mask);
1062 + else
1063 + printk(KERN_CONT "\n");
1064 +}
1065 +
1066 +static __init int setup_disablecpuid(char *arg)
1067 +{
1068 + int bit;
1069 + if (get_option(&arg, &bit) && bit < NCAPINTS*32)
1070 + setup_clear_cpu_cap(bit);
1071 + else
1072 + return 0;
1073 + return 1;
1074 +}
1075 +__setup("clearcpuid=", setup_disablecpuid);
1076 +
1077 +cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
1078 +
1079 +struct x8664_pda **_cpu_pda __read_mostly;
1080 +EXPORT_SYMBOL(_cpu_pda);
1081 +
1082 +#ifndef CONFIG_X86_NO_IDT
1083 +struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
1084 +#endif
1085 +
1086 +char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss;
1087 +
1088 +unsigned long __supported_pte_mask __read_mostly = ~0UL;
1089 +EXPORT_SYMBOL_GPL(__supported_pte_mask);
1090 +
1091 +static int do_not_nx __cpuinitdata;
1092 +
1093 +/* noexec=on|off
1094 +Control non executable mappings for 64bit processes.
1095 +
1096 +on Enable(default)
1097 +off Disable
1098 +*/
1099 +static int __init nonx_setup(char *str)
1100 +{
1101 + if (!str)
1102 + return -EINVAL;
1103 + if (!strncmp(str, "on", 2)) {
1104 + __supported_pte_mask |= _PAGE_NX;
1105 + do_not_nx = 0;
1106 + } else if (!strncmp(str, "off", 3)) {
1107 + do_not_nx = 1;
1108 + __supported_pte_mask &= ~_PAGE_NX;
1109 + }
1110 + return 0;
1111 +}
1112 +early_param("noexec", nonx_setup);
1113 +
1114 +int force_personality32;
1115 +
1116 +/* noexec32=on|off
1117 +Control non executable heap for 32bit processes.
1118 +To control the stack too use noexec=off
1119 +
1120 +on PROT_READ does not imply PROT_EXEC for 32bit processes (default)
1121 +off PROT_READ implies PROT_EXEC
1122 +*/
1123 +static int __init nonx32_setup(char *str)
1124 +{
1125 + if (!strcmp(str, "on"))
1126 + force_personality32 &= ~READ_IMPLIES_EXEC;
1127 + else if (!strcmp(str, "off"))
1128 + force_personality32 |= READ_IMPLIES_EXEC;
1129 + return 1;
1130 +}
1131 +__setup("noexec32=", nonx32_setup);
1132 +
1133 +static void __init_refok switch_pt(int cpu)
1134 +{
1135 +#ifdef CONFIG_XEN
1136 + if (cpu == 0)
1137 + xen_init_pt();
1138 + xen_pt_switch(__pa_symbol(init_level4_pgt));
1139 + xen_new_user_pt(__pa_symbol(__user_pgd(init_level4_pgt)));
1140 +#endif
1141 +}
1142 +
1143 +void pda_init(int cpu)
1144 +{
1145 + struct x8664_pda *pda = cpu_pda(cpu);
1146 +
1147 + /* Setup up data that may be needed in __get_free_pages early */
1148 + loadsegment(fs, 0);
1149 + loadsegment(gs, 0);
1150 +#ifndef CONFIG_XEN
1151 + /* Memory clobbers used to order PDA accessed */
1152 + mb();
1153 + wrmsrl(MSR_GS_BASE, pda);
1154 + mb();
1155 +#else
1156 + if (HYPERVISOR_set_segment_base(SEGBASE_GS_KERNEL,
1157 + (unsigned long)pda))
1158 + BUG();
1159 +#endif
1160 +
1161 + pda->cpunumber = cpu;
1162 + pda->irqcount = -1;
1163 + pda->kernelstack = (unsigned long)stack_thread_info() -
1164 + PDA_STACKOFFSET + THREAD_SIZE;
1165 + pda->active_mm = &init_mm;
1166 + pda->mmu_state = 0;
1167 +
1168 + if (cpu == 0) {
1169 + /* others are initialized in smpboot.c */
1170 + pda->pcurrent = &init_task;
1171 + pda->irqstackptr = boot_cpu_stack;
1172 + pda->irqstackptr += IRQSTACKSIZE - 64;
1173 + } else {
1174 + if (!pda->irqstackptr) {
1175 + pda->irqstackptr = (char *)
1176 + __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
1177 + if (!pda->irqstackptr)
1178 + panic("cannot allocate irqstack for cpu %d",
1179 + cpu);
1180 + pda->irqstackptr += IRQSTACKSIZE - 64;
1181 + }
1182 +
1183 + if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE)
1184 + pda->nodenumber = cpu_to_node(cpu);
1185 + }
1186 +
1187 + switch_pt(cpu);
1188 +}
1189 +
1190 +#ifndef CONFIG_X86_NO_TSS
1191 +char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
1192 + DEBUG_STKSZ] __page_aligned_bss;
1193 +#endif
1194 +
1195 +extern asmlinkage void ignore_sysret(void);
1196 +
1197 +void __cpuinit syscall_init(void)
1198 +{
1199 +#ifndef CONFIG_XEN
1200 + /*
1201 + * LSTAR and STAR live in a bit strange symbiosis.
1202 + * They both write to the same internal register. STAR allows to
1203 + * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip.
1204 + */
1205 + wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
1206 + wrmsrl(MSR_LSTAR, system_call);
1207 + wrmsrl(MSR_CSTAR, ignore_sysret);
1208 +
1209 + /* Flags to clear on syscall */
1210 + wrmsrl(MSR_SYSCALL_MASK,
1211 + X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
1212 +#endif
1213 +#ifdef CONFIG_IA32_EMULATION
1214 + syscall32_cpu_init();
1215 +#else
1216 + static const struct callback_register __cpuinitconst cstar = {
1217 + .type = CALLBACKTYPE_syscall32,
1218 + .address = (unsigned long)ignore_sysret
1219 + };
1220 +
1221 + if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar))
1222 + printk(KERN_WARNING "Unable to register CSTAR callback\n");
1223 +#endif
1224 +}
1225 +
1226 +void __cpuinit check_efer(void)
1227 +{
1228 + unsigned long efer;
1229 +
1230 + rdmsrl(MSR_EFER, efer);
1231 + if (!(efer & EFER_NX) || do_not_nx)
1232 + __supported_pte_mask &= ~_PAGE_NX;
1233 +}
1234 +
1235 +unsigned long kernel_eflags;
1236 +
1237 +#ifndef CONFIG_X86_NO_TSS
1238 +/*
1239 + * Copies of the original ist values from the tss are only accessed during
1240 + * debugging, no special alignment required.
1241 + */
1242 +DEFINE_PER_CPU(struct orig_ist, orig_ist);
1243 +#endif
1244 +
1245 +/*
1246 + * cpu_init() initializes state that is per-CPU. Some data is already
1247 + * initialized (naturally) in the bootstrap process, such as the GDT
1248 + * and IDT. We reload them nevertheless, this function acts as a
1249 + * 'CPU state barrier', nothing should get across.
1250 + * A lot of state is already set up in PDA init.
1251 + */
1252 +void __cpuinit cpu_init(void)
1253 +{
1254 + int cpu = stack_smp_processor_id();
1255 +#ifndef CONFIG_X86_NO_TSS
1256 + struct tss_struct *t = &per_cpu(init_tss, cpu);
1257 + struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
1258 + unsigned long v;
1259 + char *estacks = NULL;
1260 + int i;
1261 +#endif
1262 + struct task_struct *me;
1263 +
1264 + /* CPU 0 is initialised in head64.c */
1265 + if (cpu != 0)
1266 + pda_init(cpu);
1267 +#ifndef CONFIG_X86_NO_TSS
1268 + else
1269 + estacks = boot_exception_stacks;
1270 +#endif
1271 +
1272 + me = current;
1273 +
1274 + if (cpu_test_and_set(cpu, cpu_initialized))
1275 + panic("CPU#%d already initialized!\n", cpu);
1276 +
1277 + printk(KERN_INFO "Initializing CPU#%d\n", cpu);
1278 +
1279 + clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
1280 +
1281 + /*
1282 + * Initialize the per-CPU GDT with the boot GDT,
1283 + * and set up the GDT descriptor:
1284 + */
1285 +
1286 + switch_to_new_gdt();
1287 +#ifndef CONFIG_X86_NO_IDT
1288 + load_idt((const struct desc_ptr *)&idt_descr);
1289 +#endif
1290 +
1291 + memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
1292 + syscall_init();
1293 +
1294 + wrmsrl(MSR_FS_BASE, 0);
1295 + wrmsrl(MSR_KERNEL_GS_BASE, 0);
1296 + barrier();
1297 +
1298 + check_efer();
1299 +
1300 +#ifndef CONFIG_X86_NO_TSS
1301 + /*
1302 + * set up and load the per-CPU TSS
1303 + */
1304 + if (!orig_ist->ist[0]) {
1305 + static const unsigned int order[N_EXCEPTION_STACKS] = {
1306 + [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
1307 + [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
1308 + };
1309 + for (v = 0; v < N_EXCEPTION_STACKS; v++) {
1310 + if (cpu) {
1311 + estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
1312 + if (!estacks)
1313 + panic("Cannot allocate exception "
1314 + "stack %ld %d\n", v, cpu);
1315 + }
1316 + estacks += PAGE_SIZE << order[v];
1317 + orig_ist->ist[v] = t->x86_tss.ist[v] =
1318 + (unsigned long)estacks;
1319 + }
1320 + }
1321 +
1322 + t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
1323 + /*
1324 + * <= is required because the CPU will access up to
1325 + * 8 bits beyond the end of the IO permission bitmap.
1326 + */
1327 + for (i = 0; i <= IO_BITMAP_LONGS; i++)
1328 + t->io_bitmap[i] = ~0UL;
1329 +#endif
1330 +
1331 + atomic_inc(&init_mm.mm_count);
1332 + me->active_mm = &init_mm;
1333 + if (me->mm)
1334 + BUG();
1335 + enter_lazy_tlb(&init_mm, me);
1336 +
1337 + load_sp0(t, &current->thread);
1338 +#ifndef CONFIG_X86_NO_TSS
1339 + set_tss_desc(cpu, t);
1340 + load_TR_desc();
1341 +#endif
1342 + load_LDT(&init_mm.context);
1343 +
1344 +#ifdef CONFIG_KGDB
1345 + /*
1346 + * If the kgdb is connected no debug regs should be altered. This
1347 + * is only applicable when KGDB and a KGDB I/O module are built
1348 + * into the kernel and you are using early debugging with
1349 + * kgdbwait. KGDB will control the kernel HW breakpoint registers.
1350 + */
1351 + if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
1352 + arch_kgdb_ops.correct_hw_break();
1353 + else {
1354 +#endif
1355 + /*
1356 + * Clear all 6 debug registers:
1357 + */
1358 +
1359 + set_debugreg(0UL, 0);
1360 + set_debugreg(0UL, 1);
1361 + set_debugreg(0UL, 2);
1362 + set_debugreg(0UL, 3);
1363 + set_debugreg(0UL, 6);
1364 + set_debugreg(0UL, 7);
1365 +#ifdef CONFIG_KGDB
1366 + /* If the kgdb is connected no debug regs should be altered. */
1367 + }
1368 +#endif
1369 +
1370 + fpu_init();
1371 +
1372 + asm ("pushfq; popq %0" : "=rm" (kernel_eflags));
1373 + if (raw_irqs_disabled())
1374 + kernel_eflags &= ~X86_EFLAGS_IF;
1375 +
1376 + if (is_uv_system())
1377 + uv_cpu_init();
1378 +}
1379 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
1380 +++ sle11-2009-10-16/arch/x86/kernel/e820-xen.c 2009-06-04 10:21:39.000000000 +0200
1381 @@ -0,0 +1,1545 @@
1382 +/*
1383 + * Handle the memory map.
1384 + * The functions here do the job until bootmem takes over.
1385 + *
1386 + * Getting sanitize_e820_map() in sync with i386 version by applying change:
1387 + * - Provisions for empty E820 memory regions (reported by certain BIOSes).
1388 + * Alex Achenbach <xela@slit.de>, December 2002.
1389 + * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
1390 + *
1391 + */
1392 +#include <linux/kernel.h>
1393 +#include <linux/types.h>
1394 +#include <linux/init.h>
1395 +#include <linux/bootmem.h>
1396 +#include <linux/ioport.h>
1397 +#include <linux/string.h>
1398 +#include <linux/kexec.h>
1399 +#include <linux/module.h>
1400 +#include <linux/mm.h>
1401 +#include <linux/pfn.h>
1402 +#include <linux/suspend.h>
1403 +#include <linux/firmware-map.h>
1404 +
1405 +#include <asm/pgtable.h>
1406 +#include <asm/page.h>
1407 +#include <asm/e820.h>
1408 +#include <asm/proto.h>
1409 +#include <asm/setup.h>
1410 +#include <xen/interface/memory.h>
1411 +
1412 +/*
1413 + * The e820 map is the map that gets modified e.g. with command line parameters
1414 + * and that is also registered with modifications in the kernel resource tree
1415 + * with the iomem_resource as parent.
1416 + *
1417 + * The e820_saved is directly saved after the BIOS-provided memory map is
1418 + * copied. It doesn't get modified afterwards. It's registered for the
1419 + * /sys/firmware/memmap interface.
1420 + *
1421 + * That memory map is not modified and is used as base for kexec. The kexec'd
1422 + * kernel should get the same memory map as the firmware provides. Then the
1423 + * user can e.g. boot the original kernel with mem=1G while still booting the
1424 + * next kernel with full memory.
1425 + */
1426 +struct e820map e820;
1427 +#ifndef CONFIG_XEN
1428 +struct e820map e820_saved;
1429 +#else
1430 +static struct e820map machine_e820;
1431 +#define e820_saved machine_e820
1432 +#endif
1433 +
1434 +/* For PCI or other memory-mapped resources */
1435 +unsigned long pci_mem_start = 0xaeedbabe;
1436 +#ifdef CONFIG_PCI
1437 +EXPORT_SYMBOL(pci_mem_start);
1438 +#endif
1439 +
1440 +/*
1441 + * This function checks if any part of the range <start,end> is mapped
1442 + * with type.
1443 + */
1444 +int
1445 +e820_any_mapped(u64 start, u64 end, unsigned type)
1446 +{
1447 + int i;
1448 +
1449 +#ifndef CONFIG_XEN
1450 + for (i = 0; i < e820.nr_map; i++) {
1451 + struct e820entry *ei = &e820.map[i];
1452 +#else
1453 + if (!is_initial_xendomain())
1454 + return 0;
1455 + for (i = 0; i < machine_e820.nr_map; ++i) {
1456 + const struct e820entry *ei = &machine_e820.map[i];
1457 +#endif
1458 +
1459 + if (type && ei->type != type)
1460 + continue;
1461 + if (ei->addr >= end || ei->addr + ei->size <= start)
1462 + continue;
1463 + return 1;
1464 + }
1465 + return 0;
1466 +}
1467 +EXPORT_SYMBOL_GPL(e820_any_mapped);
1468 +
1469 +/*
1470 + * This function checks if the entire range <start,end> is mapped with type.
1471 + *
1472 + * Note: this function only works correct if the e820 table is sorted and
1473 + * not-overlapping, which is the case
1474 + */
1475 +int __init e820_all_mapped(u64 start, u64 end, unsigned type)
1476 +{
1477 + int i;
1478 +
1479 +#ifndef CONFIG_XEN
1480 + for (i = 0; i < e820.nr_map; i++) {
1481 + struct e820entry *ei = &e820.map[i];
1482 +#else
1483 + if (!is_initial_xendomain())
1484 + return 0;
1485 + for (i = 0; i < machine_e820.nr_map; ++i) {
1486 + const struct e820entry *ei = &machine_e820.map[i];
1487 +#endif
1488 +
1489 + if (type && ei->type != type)
1490 + continue;
1491 + /* is the region (part) in overlap with the current region ?*/
1492 + if (ei->addr >= end || ei->addr + ei->size <= start)
1493 + continue;
1494 +
1495 + /* if the region is at the beginning of <start,end> we move
1496 + * start to the end of the region since it's ok until there
1497 + */
1498 + if (ei->addr <= start)
1499 + start = ei->addr + ei->size;
1500 + /*
1501 + * if start is now at or beyond end, we're done, full
1502 + * coverage
1503 + */
1504 + if (start >= end)
1505 + return 1;
1506 + }
1507 + return 0;
1508 +}
1509 +
1510 +/*
1511 + * Add a memory region to the kernel e820 map.
1512 + */
1513 +void __init e820_add_region(u64 start, u64 size, int type)
1514 +{
1515 + int x = e820.nr_map;
1516 +
1517 + if (x == ARRAY_SIZE(e820.map)) {
1518 + printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
1519 + return;
1520 + }
1521 +
1522 + e820.map[x].addr = start;
1523 + e820.map[x].size = size;
1524 + e820.map[x].type = type;
1525 + e820.nr_map++;
1526 +}
1527 +
1528 +void __init e820_print_map(char *who)
1529 +{
1530 + int i;
1531 +
1532 + for (i = 0; i < e820.nr_map; i++) {
1533 + printk(KERN_INFO " %s: %016Lx - %016Lx ", who,
1534 + (unsigned long long) e820.map[i].addr,
1535 + (unsigned long long)
1536 + (e820.map[i].addr + e820.map[i].size));
1537 + switch (e820.map[i].type) {
1538 + case E820_RAM:
1539 + case E820_RESERVED_KERN:
1540 + printk(KERN_CONT "(usable)\n");
1541 + break;
1542 + case E820_RESERVED:
1543 + printk(KERN_CONT "(reserved)\n");
1544 + break;
1545 + case E820_ACPI:
1546 + printk(KERN_CONT "(ACPI data)\n");
1547 + break;
1548 + case E820_NVS:
1549 + printk(KERN_CONT "(ACPI NVS)\n");
1550 + break;
1551 + default:
1552 + printk(KERN_CONT "type %u\n", e820.map[i].type);
1553 + break;
1554 + }
1555 + }
1556 +}
1557 +
1558 +/*
1559 + * Sanitize the BIOS e820 map.
1560 + *
1561 + * Some e820 responses include overlapping entries. The following
1562 + * replaces the original e820 map with a new one, removing overlaps,
1563 + * and resolving conflicting memory types in favor of highest
1564 + * numbered type.
1565 + *
1566 + * The input parameter biosmap points to an array of 'struct
1567 + * e820entry' which on entry has elements in the range [0, *pnr_map)
1568 + * valid, and which has space for up to max_nr_map entries.
1569 + * On return, the resulting sanitized e820 map entries will be in
1570 + * overwritten in the same location, starting at biosmap.
1571 + *
1572 + * The integer pointed to by pnr_map must be valid on entry (the
1573 + * current number of valid entries located at biosmap) and will
1574 + * be updated on return, with the new number of valid entries
1575 + * (something no more than max_nr_map.)
1576 + *
1577 + * The return value from sanitize_e820_map() is zero if it
1578 + * successfully 'sanitized' the map entries passed in, and is -1
1579 + * if it did nothing, which can happen if either of (1) it was
1580 + * only passed one map entry, or (2) any of the input map entries
1581 + * were invalid (start + size < start, meaning that the size was
1582 + * so big the described memory range wrapped around through zero.)
1583 + *
1584 + * Visually we're performing the following
1585 + * (1,2,3,4 = memory types)...
1586 + *
1587 + * Sample memory map (w/overlaps):
1588 + * ____22__________________
1589 + * ______________________4_
1590 + * ____1111________________
1591 + * _44_____________________
1592 + * 11111111________________
1593 + * ____________________33__
1594 + * ___________44___________
1595 + * __________33333_________
1596 + * ______________22________
1597 + * ___________________2222_
1598 + * _________111111111______
1599 + * _____________________11_
1600 + * _________________4______
1601 + *
1602 + * Sanitized equivalent (no overlap):
1603 + * 1_______________________
1604 + * _44_____________________
1605 + * ___1____________________
1606 + * ____22__________________
1607 + * ______11________________
1608 + * _________1______________
1609 + * __________3_____________
1610 + * ___________44___________
1611 + * _____________33_________
1612 + * _______________2________
1613 + * ________________1_______
1614 + * _________________4______
1615 + * ___________________2____
1616 + * ____________________33__
1617 + * ______________________4_
1618 + */
1619 +
1620 +int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
1621 + int *pnr_map)
1622 +{
1623 + struct change_member {
1624 + struct e820entry *pbios; /* pointer to original bios entry */
1625 + unsigned long long addr; /* address for this change point */
1626 + };
1627 + static struct change_member change_point_list[2*E820_X_MAX] __initdata;
1628 + static struct change_member *change_point[2*E820_X_MAX] __initdata;
1629 + static struct e820entry *overlap_list[E820_X_MAX] __initdata;
1630 + static struct e820entry new_bios[E820_X_MAX] __initdata;
1631 + struct change_member *change_tmp;
1632 + unsigned long current_type, last_type;
1633 + unsigned long long last_addr;
1634 + int chgidx, still_changing;
1635 + int overlap_entries;
1636 + int new_bios_entry;
1637 + int old_nr, new_nr, chg_nr;
1638 + int i;
1639 +
1640 + /* if there's only one memory region, don't bother */
1641 +#ifdef CONFIG_XEN
1642 + if (*pnr_map == 1)
1643 + return 0;
1644 +#endif
1645 + if (*pnr_map < 2)
1646 + return -1;
1647 +
1648 + old_nr = *pnr_map;
1649 + BUG_ON(old_nr > max_nr_map);
1650 +
1651 + /* bail out if we find any unreasonable addresses in bios map */
1652 + for (i = 0; i < old_nr; i++)
1653 + if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
1654 + return -1;
1655 +
1656 + /* create pointers for initial change-point information (for sorting) */
1657 + for (i = 0; i < 2 * old_nr; i++)
1658 + change_point[i] = &change_point_list[i];
1659 +
1660 + /* record all known change-points (starting and ending addresses),
1661 + omitting those that are for empty memory regions */
1662 + chgidx = 0;
1663 + for (i = 0; i < old_nr; i++) {
1664 + if (biosmap[i].size != 0) {
1665 + change_point[chgidx]->addr = biosmap[i].addr;
1666 + change_point[chgidx++]->pbios = &biosmap[i];
1667 + change_point[chgidx]->addr = biosmap[i].addr +
1668 + biosmap[i].size;
1669 + change_point[chgidx++]->pbios = &biosmap[i];
1670 + }
1671 + }
1672 + chg_nr = chgidx;
1673 +
1674 + /* sort change-point list by memory addresses (low -> high) */
1675 + still_changing = 1;
1676 + while (still_changing) {
1677 + still_changing = 0;
1678 + for (i = 1; i < chg_nr; i++) {
1679 + unsigned long long curaddr, lastaddr;
1680 + unsigned long long curpbaddr, lastpbaddr;
1681 +
1682 + curaddr = change_point[i]->addr;
1683 + lastaddr = change_point[i - 1]->addr;
1684 + curpbaddr = change_point[i]->pbios->addr;
1685 + lastpbaddr = change_point[i - 1]->pbios->addr;
1686 +
1687 + /*
1688 + * swap entries, when:
1689 + *
1690 + * curaddr > lastaddr or
1691 + * curaddr == lastaddr and curaddr == curpbaddr and
1692 + * lastaddr != lastpbaddr
1693 + */
1694 + if (curaddr < lastaddr ||
1695 + (curaddr == lastaddr && curaddr == curpbaddr &&
1696 + lastaddr != lastpbaddr)) {
1697 + change_tmp = change_point[i];
1698 + change_point[i] = change_point[i-1];
1699 + change_point[i-1] = change_tmp;
1700 + still_changing = 1;
1701 + }
1702 + }
1703 + }
1704 +
1705 + /* create a new bios memory map, removing overlaps */
1706 + overlap_entries = 0; /* number of entries in the overlap table */
1707 + new_bios_entry = 0; /* index for creating new bios map entries */
1708 + last_type = 0; /* start with undefined memory type */
1709 + last_addr = 0; /* start with 0 as last starting address */
1710 +
1711 + /* loop through change-points, determining affect on the new bios map */
1712 + for (chgidx = 0; chgidx < chg_nr; chgidx++) {
1713 + /* keep track of all overlapping bios entries */
1714 + if (change_point[chgidx]->addr ==
1715 + change_point[chgidx]->pbios->addr) {
1716 + /*
1717 + * add map entry to overlap list (> 1 entry
1718 + * implies an overlap)
1719 + */
1720 + overlap_list[overlap_entries++] =
1721 + change_point[chgidx]->pbios;
1722 + } else {
1723 + /*
1724 + * remove entry from list (order independent,
1725 + * so swap with last)
1726 + */
1727 + for (i = 0; i < overlap_entries; i++) {
1728 + if (overlap_list[i] ==
1729 + change_point[chgidx]->pbios)
1730 + overlap_list[i] =
1731 + overlap_list[overlap_entries-1];
1732 + }
1733 + overlap_entries--;
1734 + }
1735 + /*
1736 + * if there are overlapping entries, decide which
1737 + * "type" to use (larger value takes precedence --
1738 + * 1=usable, 2,3,4,4+=unusable)
1739 + */
1740 + current_type = 0;
1741 + for (i = 0; i < overlap_entries; i++)
1742 + if (overlap_list[i]->type > current_type)
1743 + current_type = overlap_list[i]->type;
1744 + /*
1745 + * continue building up new bios map based on this
1746 + * information
1747 + */
1748 + if (current_type != last_type) {
1749 + if (last_type != 0) {
1750 + new_bios[new_bios_entry].size =
1751 + change_point[chgidx]->addr - last_addr;
1752 + /*
1753 + * move forward only if the new size
1754 + * was non-zero
1755 + */
1756 + if (new_bios[new_bios_entry].size != 0)
1757 + /*
1758 + * no more space left for new
1759 + * bios entries ?
1760 + */
1761 + if (++new_bios_entry >= max_nr_map)
1762 + break;
1763 + }
1764 + if (current_type != 0) {
1765 + new_bios[new_bios_entry].addr =
1766 + change_point[chgidx]->addr;
1767 + new_bios[new_bios_entry].type = current_type;
1768 + last_addr = change_point[chgidx]->addr;
1769 + }
1770 + last_type = current_type;
1771 + }
1772 + }
1773 + /* retain count for new bios entries */
1774 + new_nr = new_bios_entry;
1775 +
1776 + /* copy new bios mapping into original location */
1777 + memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry));
1778 + *pnr_map = new_nr;
1779 +
1780 + return 0;
1781 +}
1782 +
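The sanitize pass above is a sweep over sorted change points: every non-empty region contributes a start point and an end point, and while sweeping the highest type among the regions currently overlapping wins. Below is a minimal standalone sketch of the same idea; the region values, the tiny active[] table and the output format are invented for illustration and are not the kernel's structures.

/* Illustrative sweep: merge overlapping regions, highest type wins.
 * Hypothetical regions and types -- a sketch, not the kernel code. */
#include <stdio.h>
#include <stdlib.h>

struct region { unsigned long long start, end; unsigned type; };
struct boundary { unsigned long long addr; int is_start; unsigned type; int id; };

static int cmp(const void *a, const void *b)
{
	const struct boundary *x = a, *y = b;

	if (x->addr != y->addr)
		return x->addr < y->addr ? -1 : 1;
	return y->is_start - x->is_start;	/* starts sort before ends */
}

int main(void)
{
	struct region in[] = {
		{ 0x00000, 0xa0000, 1 },	/* "usable" */
		{ 0x90000, 0xb0000, 2 },	/* "reserved", overlaps the above */
	};
	enum { N = sizeof(in) / sizeof(in[0]) };
	struct boundary b[2 * N];
	unsigned active[N] = { 0 };		/* type while inside region id, else 0 */
	unsigned long long last_addr = 0;
	unsigned last_type = 0;
	int i, j;

	for (i = 0; i < N; i++) {
		b[2 * i]     = (struct boundary){ in[i].start, 1, in[i].type, i };
		b[2 * i + 1] = (struct boundary){ in[i].end,   0, in[i].type, i };
	}
	qsort(b, 2 * N, sizeof(b[0]), cmp);

	for (i = 0; i < 2 * N; i++) {
		unsigned cur = 0;

		active[b[i].id] = b[i].is_start ? b[i].type : 0;
		for (j = 0; j < N; j++)		/* highest active type wins */
			if (active[j] > cur)
				cur = active[j];
		if (cur != last_type) {
			if (last_type)
				printf("[%#llx-%#llx) type %u\n",
				       last_addr, b[i].addr, last_type);
			last_addr = b[i].addr;
			last_type = cur;
		}
	}
	return 0;
}

Run on the two overlapping sample regions, the sweep emits a usable range up to 0x90000 and a reserved range from 0x90000 to 0xb0000, i.e. the reserved type takes precedence over the overlap, just as the comment in the hunk above describes.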
1783 +static int __init __append_e820_map(struct e820entry *biosmap, int nr_map)
1784 +{
1785 + while (nr_map) {
1786 + u64 start = biosmap->addr;
1787 + u64 size = biosmap->size;
1788 + u64 end = start + size;
1789 + u32 type = biosmap->type;
1790 +
1791 + /* Overflow in 64 bits? Ignore the memory map. */
1792 + if (start > end)
1793 + return -1;
1794 +
1795 + e820_add_region(start, size, type);
1796 +
1797 + biosmap++;
1798 + nr_map--;
1799 + }
1800 + return 0;
1801 +}
1802 +
1803 +/*
1804 + * Copy the BIOS e820 map into a safe place.
1805 + *
1806 + * Sanity-check it while we're at it..
1807 + *
1808 + * If we're lucky and live on a modern system, the setup code
1809 + * will have given us a memory map that we can use to properly
1810 + * set up memory. If we aren't, we'll fake a memory map.
1811 + */
1812 +static int __init append_e820_map(struct e820entry *biosmap, int nr_map)
1813 +{
1814 +#ifndef CONFIG_XEN
1815 + /* Only one memory region (or negative)? Ignore it */
1816 + if (nr_map < 2)
1817 + return -1;
1818 +#else
1819 + BUG_ON(nr_map < 1);
1820 +#endif
1821 +
1822 + return __append_e820_map(biosmap, nr_map);
1823 +}
1824 +
1825 +static u64 __init e820_update_range_map(struct e820map *e820x, u64 start,
1826 + u64 size, unsigned old_type,
1827 + unsigned new_type)
1828 +{
1829 + unsigned int i, x;
1830 + u64 real_updated_size = 0;
1831 +
1832 + BUG_ON(old_type == new_type);
1833 +
1834 + if (size > (ULLONG_MAX - start))
1835 + size = ULLONG_MAX - start;
1836 +
1837 + for (i = 0; i < e820x->nr_map; i++) {
1838 + struct e820entry *ei = &e820x->map[i];
1839 + u64 final_start, final_end;
1840 + if (ei->type != old_type)
1841 + continue;
1842 + /* totally covered? */
1843 + if (ei->addr >= start &&
1844 + (ei->addr + ei->size) <= (start + size)) {
1845 + ei->type = new_type;
1846 + real_updated_size += ei->size;
1847 + continue;
1848 + }
1849 + /* partially covered */
1850 + final_start = max(start, ei->addr);
1851 + final_end = min(start + size, ei->addr + ei->size);
1852 + if (final_start >= final_end)
1853 + continue;
1854 +
1855 + x = e820x->nr_map;
1856 + if (x == ARRAY_SIZE(e820x->map)) {
1857 + printk(KERN_ERR "Too many memory map entries!\n");
1858 + break;
1859 + }
1860 + e820x->map[x].addr = final_start;
1861 + e820x->map[x].size = final_end - final_start;
1862 + e820x->map[x].type = new_type;
1863 + e820x->nr_map++;
1864 +
1865 + real_updated_size += final_end - final_start;
1866 +
1867 + if (ei->addr < final_start)
1868 + continue;
1869 + ei->addr = final_end;
1870 + ei->size -= final_end - final_start;
1871 + }
1872 + return real_updated_size;
1873 +}
1874 +
1875 +u64 __init e820_update_range(u64 start, u64 size, unsigned old_type,
1876 + unsigned new_type)
1877 +{
1878 + return e820_update_range_map(&e820, start, size, old_type, new_type);
1879 +}
1880 +
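e820_update_range_map() above distinguishes three cases per map entry: no overlap, full coverage (the entry is retyped in place), and partial coverage (a new entry of the new type is appended for the intersection, and the original is only trimmed when it does not extend below the range; any leftover overlap is left for a later sanitize pass to merge). The sketch below isolates just that interval arithmetic on an invented fixed-size map; it is illustrative only, not the kernel structures.

/* Sketch of the clipping arithmetic in e820_update_range_map().
 * Hypothetical entry type and map size; not the kernel structures. */
#include <stdio.h>

struct entry { unsigned long long addr, size; unsigned type; };

#define MAP_MAX 8

static unsigned long long update_range(struct entry *map, unsigned *nr,
					unsigned long long start,
					unsigned long long size,
					unsigned old_type, unsigned new_type)
{
	unsigned long long updated = 0;
	unsigned i, n = *nr;

	for (i = 0; i < n; i++) {
		struct entry *e = &map[i];
		unsigned long long lo, hi;

		if (e->type != old_type)
			continue;
		if (e->addr >= start && e->addr + e->size <= start + size) {
			e->type = new_type;		/* totally covered */
			updated += e->size;
			continue;
		}
		lo = e->addr > start ? e->addr : start;
		hi = e->addr + e->size < start + size ? e->addr + e->size
						      : start + size;
		if (lo >= hi || *nr == MAP_MAX)
			continue;			/* no overlap, or map full */
		map[*nr] = (struct entry){ lo, hi - lo, new_type };
		(*nr)++;
		updated += hi - lo;
		if (e->addr < lo)	/* starts below the range: left whole, as above */
			continue;
		e->addr = hi;		/* otherwise trim the overlapped front off */
		e->size -= hi - lo;
	}
	return updated;
}

int main(void)
{
	struct entry map[MAP_MAX] = { { 0x100000, 0x100000, 1 } }; /* 1MB of "RAM" at 1MB */
	unsigned nr = 1;

	/* retype the middle 256KB as "reserved" (type 2) */
	update_range(map, &nr, 0x140000, 0x40000, 1, 2);
	for (unsigned i = 0; i < nr; i++)
		printf("%#llx +%#llx type %u\n", map[i].addr, map[i].size, map[i].type);
	return 0;
}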
1881 +static u64 __init e820_update_range_saved(u64 start, u64 size,
1882 + unsigned old_type, unsigned new_type)
1883 +{
1884 +#ifdef CONFIG_XEN
1885 + if (is_initial_xendomain())
1886 + return e820_update_range_map(&machine_e820,
1887 + phys_to_machine(start), size,
1888 + old_type, new_type);
1889 +#endif
1890 + return e820_update_range_map(&e820_saved, start, size, old_type,
1891 + new_type);
1892 +}
1893 +
1894 +/* make e820 not cover the range */
1895 +u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type,
1896 + int checktype)
1897 +{
1898 + int i;
1899 + u64 real_removed_size = 0;
1900 +
1901 + if (size > (ULLONG_MAX - start))
1902 + size = ULLONG_MAX - start;
1903 +
1904 + for (i = 0; i < e820.nr_map; i++) {
1905 + struct e820entry *ei = &e820.map[i];
1906 + u64 final_start, final_end;
1907 +
1908 + if (checktype && ei->type != old_type)
1909 + continue;
1910 + /* totally covered? */
1911 + if (ei->addr >= start &&
1912 + (ei->addr + ei->size) <= (start + size)) {
1913 + real_removed_size += ei->size;
1914 + memset(ei, 0, sizeof(struct e820entry));
1915 + continue;
1916 + }
1917 + /* partially covered */
1918 + final_start = max(start, ei->addr);
1919 + final_end = min(start + size, ei->addr + ei->size);
1920 + if (final_start >= final_end)
1921 + continue;
1922 + real_removed_size += final_end - final_start;
1923 +
1924 + ei->size -= final_end - final_start;
1925 + if (ei->addr < final_start)
1926 + continue;
1927 + ei->addr = final_end;
1928 + }
1929 + return real_removed_size;
1930 +}
1931 +
1932 +void __init update_e820(void)
1933 +{
1934 + int nr_map;
1935 +
1936 + nr_map = e820.nr_map;
1937 + if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr_map))
1938 + return;
1939 + e820.nr_map = nr_map;
1940 + printk(KERN_INFO "modified physical RAM map:\n");
1941 + e820_print_map("modified");
1942 +}
1943 +static void __init update_e820_saved(void)
1944 +{
1945 + int nr_map;
1946 +
1947 + nr_map = e820_saved.nr_map;
1948 + if (sanitize_e820_map(e820_saved.map, ARRAY_SIZE(e820_saved.map), &nr_map))
1949 + return;
1950 + e820_saved.nr_map = nr_map;
1951 +}
1952 +
1953 +#ifdef CONFIG_XEN
1954 +#define e820 machine_e820
1955 +#endif
1956 +
1957 +#define MAX_GAP_END 0x100000000ull
1958 +/*
1959 + * Search for a gap in the e820 memory space from start_addr to end_addr.
1960 + */
1961 +__init int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize,
1962 + unsigned long start_addr, unsigned long long end_addr)
1963 +{
1964 + unsigned long long last;
1965 + int i = e820.nr_map;
1966 + int found = 0;
1967 +
1968 + last = (end_addr && end_addr < MAX_GAP_END) ? end_addr : MAX_GAP_END;
1969 +#ifdef CONFIG_X86_64
1970 + if (start_addr >= MAX_GAP_END)
1971 + last = end_addr ?: (1UL << boot_cpu_data.x86_phys_bits);
1972 +#endif
1973 +
1974 + while (--i >= 0) {
1975 + unsigned long long start = e820.map[i].addr;
1976 + unsigned long long end = start + e820.map[i].size;
1977 +
1978 + if (end < start_addr)
1979 + continue;
1980 +
1981 + /*
1982 + * Since "last" is at most 4GB, we know we'll
1983 + * fit in 32 bits if this condition is true
1984 + */
1985 + if (last > end) {
1986 + unsigned long gap = last - end;
1987 +
1988 + if (gap >= *gapsize) {
1989 + *gapsize = gap;
1990 + *gapstart = end;
1991 + found = 1;
1992 + }
1993 + }
1994 + if (start < last)
1995 + last = start;
1996 + }
1997 + return found;
1998 +}
1999 +
2000 +/*
2001 + * Search for the biggest gap in the low 32 bits of the e820
2002 + * memory space. We pass this space to PCI so it can assign MMIO
2003 + * resources in it for hotplug or unconfigured devices.
2004 + * Hopefully the BIOS left enough space for this.
2005 + */
2006 +__init void e820_setup_gap(void)
2007 +{
2008 + unsigned long gapstart, gapsize, round;
2009 + int found;
2010 +
2011 + gapstart = 0x10000000;
2012 + gapsize = 0x400000;
2013 + found = e820_search_gap(&gapstart, &gapsize, 0, MAX_GAP_END);
2014 +
2015 +#ifdef CONFIG_X86_64
2016 + if (!found) {
2017 + printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit "
2018 + "address range\n"
2019 + KERN_ERR "PCI: Unassigned devices with 32bit resource "
2020 + "registers may break!\n");
2021 + found = e820_search_gap(&gapstart, &gapsize, MAX_GAP_END, 0);
2022 + BUG_ON(!found);
2023 + }
2024 +#endif
2025 +
2026 + /*
2027 + * See how much we want to round up: start off with
2028 + * rounding to the next 1MB area.
2029 + */
2030 + round = 0x100000;
2031 + while ((gapsize >> 4) > round)
2032 + round += round;
2033 + /* Fun with two's complement */
2034 + pci_mem_start = (gapstart + round) & -round;
2035 +
2036 + printk(KERN_INFO
2037 + "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
2038 + pci_mem_start, gapstart, gapsize);
2039 +}
2040 +
2041 +#undef e820
2042 +
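Two details in e820_setup_gap() are worth spelling out: the rounding granularity is grown to roughly one sixteenth of the gap (but never below 1MB), and for a power-of-two round the identity -round == ~(round - 1) makes (gapstart + round) & -round the next multiple of round strictly above gapstart. A tiny standalone check of that, with made-up gap values:

/* Standalone check of the rounding used for pci_mem_start above.
 * gapstart/gapsize values are made up for illustration. */
#include <stdio.h>
#include <assert.h>

int main(void)
{
	unsigned long gapstart = 0xcfe00123;	/* hypothetical end of RAM  */
	unsigned long gapsize  = 0x30000000;	/* hypothetical 768MB gap   */
	unsigned long round = 0x100000;		/* start at 1MB granularity */
	unsigned long pci_mem_start;

	while ((gapsize >> 4) > round)		/* grow to ~1/16 of the gap */
		round += round;

	/* for power-of-two round, -round == ~(round - 1), so this rounds
	 * gapstart up to the next multiple of round above it */
	pci_mem_start = (gapstart + round) & -round;

	assert((pci_mem_start & (round - 1)) == 0);
	assert(pci_mem_start > gapstart);
	printf("round=%#lx pci_mem_start=%#lx\n", round, pci_mem_start);
	return 0;
}

With these numbers round ends up at 64MB and pci_mem_start at 0xd0000000, a 64MB-aligned address just past the end of the example gap's start.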
2043 +#ifndef CONFIG_XEN
2044 +/**
2045 + * Because of the size limitation of struct boot_params, only the first
2046 + * 128 E820 memory entries are passed to the kernel via
2047 + * boot_params.e820_map; the rest are passed via the SETUP_E820_EXT node
2048 + * of the struct setup_data linked list, which is parsed here.
2049 + */
2050 +void __init parse_e820_ext(struct setup_data *sdata, unsigned long pa_data)
2051 +{
2052 + u32 map_len;
2053 + int entries;
2054 + struct e820entry *extmap;
2055 +
2056 + entries = sdata->len / sizeof(struct e820entry);
2057 + map_len = sdata->len + sizeof(struct setup_data);
2058 + if (map_len > PAGE_SIZE)
2059 + sdata = early_ioremap(pa_data, map_len);
2060 + extmap = (struct e820entry *)(sdata->data);
2061 + __append_e820_map(extmap, entries);
2062 + sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
2063 + if (map_len > PAGE_SIZE)
2064 + early_iounmap(sdata, map_len);
2065 + printk(KERN_INFO "extended physical RAM map:\n");
2066 + e820_print_map("extended");
2067 +}
2068 +
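The kernel-doc above describes a small protocol: the first 128 entries travel inline, and any excess arrives through a chain of setup_data nodes. As a purely illustrative model of that split (invented structures and payloads; the real code follows physical addresses via early_ioremap(), not C pointers):

/* Purely illustrative model of "first few entries inline, the rest in a
 * chained node": invented structures, not boot_params/struct setup_data. */
#include <stdio.h>
#include <string.h>

struct entry { unsigned long long addr, size; unsigned type; };

struct ext_node {
	struct ext_node *next;		/* the real chain uses physical addrs */
	unsigned nr;
	struct entry entries[4];
};

#define INLINE_MAX 3			/* stands in for the 128-entry limit */

int main(void)
{
	struct entry inline_map[INLINE_MAX] = {
		{ 0x0,        0xa0000,    1 },
		{ 0x100000,   0x3ff00000, 1 },
		{ 0x40000000, 0x1000,     2 },
	};
	struct ext_node ext = { NULL, 1, { { 0x40001000, 0x2000, 2 } } };
	struct entry full[8];
	unsigned nr;

	memcpy(full, inline_map, sizeof(inline_map));	/* inline part first */
	nr = INLINE_MAX;
	for (struct ext_node *n = &ext; n; n = n->next)	/* then each chained node */
		for (unsigned i = 0; i < n->nr && nr < 8; i++)
			full[nr++] = n->entries[i];

	for (unsigned i = 0; i < nr; i++)
		printf("%#llx +%#llx type %u\n", full[i].addr, full[i].size, full[i].type);
	return 0;
}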
2069 +#if defined(CONFIG_X86_64) || \
2070 + (defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION))
2071 +/**
2072 + * Find the ranges of physical addresses that do not correspond to
2073 + * e820 RAM areas and mark the corresponding pages as nosave for
2074 + * hibernation (32 bit) or software suspend and suspend to RAM (64 bit).
2075 + *
2076 + * This function requires the e820 map to be sorted and without any
2077 + * overlapping entries and assumes the first e820 area to be RAM.
2078 + */
2079 +void __init e820_mark_nosave_regions(unsigned long limit_pfn)
2080 +{
2081 + int i;
2082 + unsigned long pfn;
2083 +
2084 + pfn = PFN_DOWN(e820.map[0].addr + e820.map[0].size);
2085 + for (i = 1; i < e820.nr_map; i++) {
2086 + struct e820entry *ei = &e820.map[i];
2087 +
2088 + if (pfn < PFN_UP(ei->addr))
2089 + register_nosave_region(pfn, PFN_UP(ei->addr));
2090 +
2091 + pfn = PFN_DOWN(ei->addr + ei->size);
2092 + if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN)
2093 + register_nosave_region(PFN_UP(ei->addr), pfn);
2094 +
2095 + if (pfn >= limit_pfn)
2096 + break;
2097 + }
2098 +}
2099 +#endif
2100 +#endif
2101 +
2102 +/*
2103 + * Early reserved memory areas.
2104 + */
2105 +#define MAX_EARLY_RES 20
2106 +
2107 +struct early_res {
2108 + u64 start, end;
2109 + char name[16];
2110 + char overlap_ok;
2111 +};
2112 +static struct early_res early_res[MAX_EARLY_RES] __initdata = {
2113 +#ifndef CONFIG_XEN
2114 + { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */
2115 +#if defined(CONFIG_X86_64) && defined(CONFIG_X86_TRAMPOLINE)
2116 + { TRAMPOLINE_BASE, TRAMPOLINE_BASE + 2 * PAGE_SIZE, "TRAMPOLINE" },
2117 +#endif
2118 +#if defined(CONFIG_X86_32) && defined(CONFIG_SMP)
2119 + /*
2120 + * But first pinch a few for the stack/trampoline stuff
2121 + * FIXME: Don't need the extra page at 4K, but need to fix
2122 + * trampoline before removing it. (see the GDT stuff)
2123 + */
2124 + { PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE" },
2125 + /*
2126 + * Has to be in very low memory so we can execute
2127 + * real-mode AP code.
2128 + */
2129 + { TRAMPOLINE_BASE, TRAMPOLINE_BASE + PAGE_SIZE, "TRAMPOLINE" },
2130 +#endif
2131 +#endif
2132 + {}
2133 +};
2134 +
2135 +static int __init find_overlapped_early(u64 start, u64 end)
2136 +{
2137 + int i;
2138 + struct early_res *r;
2139 +
2140 + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
2141 + r = &early_res[i];
2142 + if (end > r->start && start < r->end)
2143 + break;
2144 + }
2145 +
2146 + return i;
2147 +}
2148 +
2149 +/*
2150 + * Drop the i-th range from the early reservation map,
2151 + * by copying any higher ranges down one over it, and
2152 + * clearing what had been the last slot.
2153 + */
2154 +static void __init drop_range(int i)
2155 +{
2156 + int j;
2157 +
2158 + for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++)
2159 + ;
2160 +
2161 + memmove(&early_res[i], &early_res[i + 1],
2162 + (j - 1 - i) * sizeof(struct early_res));
2163 +
2164 + early_res[j - 1].end = 0;
2165 +}
2166 +
2167 +/*
2168 + * Split any existing ranges that:
2169 + * 1) are marked 'overlap_ok', and
2170 + * 2) overlap with the stated range [start, end)
2171 + * into whatever portion (if any) of the existing range is entirely
2172 + * below or entirely above the stated range. Drop the portion
2173 + * of the existing range that overlaps with the stated range,
2174 + * which will allow the caller of this routine to then add that
2175 + * stated range without conflicting with any existing range.
2176 + */
2177 +static void __init drop_overlaps_that_are_ok(u64 start, u64 end)
2178 +{
2179 + int i;
2180 + struct early_res *r;
2181 + u64 lower_start, lower_end;
2182 + u64 upper_start, upper_end;
2183 + char name[16];
2184 +
2185 + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
2186 + r = &early_res[i];
2187 +
2188 + /* Continue past non-overlapping ranges */
2189 + if (end <= r->start || start >= r->end)
2190 + continue;
2191 +
2192 + /*
2193 + * Leave non-ok overlaps as is; let caller
2194 + * panic "Overlapping early reservations"
2195 + * when it hits this overlap.
2196 + */
2197 + if (!r->overlap_ok)
2198 + return;
2199 +
2200 + /*
2201 + * We have an ok overlap. We will drop it from the early
2202 + * reservation map, and add back in any non-overlapping
2203 + * portions (lower or upper) as separate, overlap_ok,
2204 + * non-overlapping ranges.
2205 + */
2206 +
2207 + /* 1. Note any non-overlapping (lower or upper) ranges. */
2208 + strncpy(name, r->name, sizeof(name) - 1);
2209 +
2210 + lower_start = lower_end = 0;
2211 + upper_start = upper_end = 0;
2212 + if (r->start < start) {
2213 + lower_start = r->start;
2214 + lower_end = start;
2215 + }
2216 + if (r->end > end) {
2217 + upper_start = end;
2218 + upper_end = r->end;
2219 + }
2220 +
2221 + /* 2. Drop the original ok overlapping range */
2222 + drop_range(i);
2223 +
2224 + i--; /* resume for-loop on copied down entry */
2225 +
2226 + /* 3. Add back in any non-overlapping ranges. */
2227 + if (lower_end)
2228 + reserve_early_overlap_ok(lower_start, lower_end, name);
2229 + if (upper_end)
2230 + reserve_early_overlap_ok(upper_start, upper_end, name);
2231 + }
2232 +}
2233 +
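The split performed by drop_overlaps_that_are_ok() is an interval difference: whatever part of the old overlap_ok range lies entirely below or entirely above the new range survives and is re-reserved. A minimal sketch of just that computation, with a hypothetical range type and made-up numbers:

/* Sketch of the lower/upper split in drop_overlaps_that_are_ok().
 * The range type and the example numbers are hypothetical. */
#include <stdio.h>

struct range { unsigned long long start, end; };

/* Print the pieces of 'r' that survive once [start, end) is carved out. */
static void split_around(struct range r, unsigned long long start,
			 unsigned long long end)
{
	if (end <= r.start || start >= r.end) {		/* no overlap at all */
		printf("keep  [%#llx, %#llx)\n", r.start, r.end);
		return;
	}
	if (r.start < start)				/* lower remainder */
		printf("lower [%#llx, %#llx)\n", r.start, start);
	if (r.end > end)				/* upper remainder */
		printf("upper [%#llx, %#llx)\n", end, r.end);
}

int main(void)
{
	struct range old = { 0x8000, 0x20000 };

	split_around(old, 0x10000, 0x18000);	/* carve a hole in the middle */
	return 0;
}

Carving [0x10000, 0x18000) out of [0x8000, 0x20000) leaves the lower piece [0x8000, 0x10000) and the upper piece [0x18000, 0x20000), which in the hunk above are handed back to reserve_early_overlap_ok().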
2234 +static void __init __reserve_early(u64 start, u64 end, char *name,
2235 + int overlap_ok)
2236 +{
2237 + int i;
2238 + struct early_res *r;
2239 +
2240 + i = find_overlapped_early(start, end);
2241 + if (i >= MAX_EARLY_RES)
2242 + panic("Too many early reservations");
2243 + r = &early_res[i];
2244 + if (r->end)
2245 + panic("Overlapping early reservations "
2246 + "%llx-%llx %s to %llx-%llx %s\n",
2247 + start, end - 1, name?name:"", r->start,
2248 + r->end - 1, r->name);
2249 + r->start = start;
2250 + r->end = end;
2251 + r->overlap_ok = overlap_ok;
2252 + if (name)
2253 + strncpy(r->name, name, sizeof(r->name) - 1);
2254 +}
2255 +
2256 +/*
2257 + * A few early reservations come here.
2258 + *
2259 + * The 'overlap_ok' in the name of this routine does -not- mean it
2260 + * is ok for these reservations to overlap an earlier reservation.
2261 + * Rather it means that it is ok for subsequent reservations to
2262 + * overlap this one.
2263 + *
2264 + * Use this entry point to reserve early ranges when you are doing
2265 + * so out of "Paranoia", reserving perhaps more memory than you need,
2266 + * just in case, and don't mind a subsequent overlapping reservation
2267 + * that is known to be needed.
2268 + *
2269 + * The drop_overlaps_that_are_ok() call here isn't really needed.
2270 + * It would be needed if we had two colliding 'overlap_ok'
2271 + * reservations, so that the second such would not panic on the
2272 + * overlap with the first. We don't have any such as of this
2273 + * writing, but might as well tolerate such if it happens in
2274 + * the future.
2275 + */
2276 +void __init reserve_early_overlap_ok(u64 start, u64 end, char *name)
2277 +{
2278 + drop_overlaps_that_are_ok(start, end);
2279 + __reserve_early(start, end, name, 1);
2280 +}
2281 +
2282 +/*
2283 + * Most early reservations come here.
2284 + *
2285 + * We first have drop_overlaps_that_are_ok() drop any pre-existing
2286 + * 'overlap_ok' ranges, so that we can then reserve this memory
2287 + * range without risk of panic'ing on an overlapping overlap_ok
2288 + * early reservation.
2289 + */
2290 +void __init reserve_early(u64 start, u64 end, char *name)
2291 +{
2292 + drop_overlaps_that_are_ok(start, end);
2293 + __reserve_early(start, end, name, 0);
2294 +}
2295 +
2296 +void __init free_early(u64 start, u64 end)
2297 +{
2298 + struct early_res *r;
2299 + int i;
2300 +
2301 + i = find_overlapped_early(start, end);
2302 + r = &early_res[i];
2303 + if (i >= MAX_EARLY_RES || r->end != end || r->start != start)
2304 + panic("free_early on not reserved area: %llx-%llx!",
2305 + start, end - 1);
2306 +
2307 + drop_range(i);
2308 +}
2309 +
2310 +void __init early_res_to_bootmem(u64 start, u64 end)
2311 +{
2312 + int i, count;
2313 + u64 final_start, final_end;
2314 +
2315 + count = 0;
2316 + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++)
2317 + count++;
2318 +
2319 + printk(KERN_INFO "(%d early reservations) ==> bootmem [%010llx - %010llx]\n",
2320 + count, start, end);
2321 + for (i = 0; i < count; i++) {
2322 + struct early_res *r = &early_res[i];
2323 + printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i,
2324 + r->start, r->end, r->name);
2325 + final_start = max(start, r->start);
2326 + final_end = min(end, r->end);
2327 + if (final_start >= final_end) {
2328 + printk(KERN_CONT "\n");
2329 + continue;
2330 + }
2331 + printk(KERN_CONT " ==> [%010llx - %010llx]\n",
2332 + final_start, final_end);
2333 + reserve_bootmem_generic(final_start, final_end - final_start,
2334 + BOOTMEM_DEFAULT);
2335 + }
2336 +}
2337 +
2338 +/* Check for already reserved areas */
2339 +static inline int __init bad_addr(u64 *addrp, u64 size, u64 align)
2340 +{
2341 + int i;
2342 + u64 addr = *addrp;
2343 + int changed = 0;
2344 + struct early_res *r;
2345 +again:
2346 + i = find_overlapped_early(addr, addr + size);
2347 + r = &early_res[i];
2348 + if (i < MAX_EARLY_RES && r->end) {
2349 + *addrp = addr = round_up(r->end, align);
2350 + changed = 1;
2351 + goto again;
2352 + }
2353 + return changed;
2354 +}
2355 +
2356 +/* Check for already reserved areas */
2357 +static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align)
2358 +{
2359 + int i;
2360 + u64 addr = *addrp, last;
2361 + u64 size = *sizep;
2362 + int changed = 0;
2363 +again:
2364 + last = addr + size;
2365 + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
2366 + struct early_res *r = &early_res[i];
2367 + if (last > r->start && addr < r->start) {
2368 + size = r->start - addr;
2369 + changed = 1;
2370 + goto again;
2371 + }
2372 + if (last > r->end && addr < r->end) {
2373 + addr = round_up(r->end, align);
2374 + size = last - addr;
2375 + changed = 1;
2376 + goto again;
2377 + }
2378 + if (last <= r->end && addr >= r->start) {
2379 + (*sizep)++;
2380 + return 0;
2381 + }
2382 + }
2383 + if (changed) {
2384 + *addrp = addr;
2385 + *sizep = size;
2386 + }
2387 + return changed;
2388 +}
2389 +
2390 +/*
2391 + * Find a free area with specified alignment in a specific range.
2392 + */
2393 +u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align)
2394 +{
2395 + int i;
2396 +
2397 + for (i = 0; i < e820.nr_map; i++) {
2398 + struct e820entry *ei = &e820.map[i];
2399 + u64 addr, last;
2400 + u64 ei_last;
2401 +
2402 + if (ei->type != E820_RAM)
2403 + continue;
2404 + addr = round_up(ei->addr, align);
2405 + ei_last = ei->addr + ei->size;
2406 + if (addr < start)
2407 + addr = round_up(start, align);
2408 + if (addr >= ei_last)
2409 + continue;
2410 + while (bad_addr(&addr, size, align) && addr+size <= ei_last)
2411 + ;
2412 + last = addr + size;
2413 + if (last > ei_last)
2414 + continue;
2415 + if (last > end)
2416 + continue;
2417 + return addr;
2418 + }
2419 + return -1ULL;
2420 +}
2421 +
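find_e820_area() walks RAM entries, aligns a candidate address, and relies on bad_addr() to bump the candidate past any early reservation it collides with until the request either fits or falls out of the entry. The sketch below reproduces that bump-and-retry loop against an invented reservation list and RAM window; it is not the kernel's early_res machinery.

/* Sketch of the bump-past-reservations loop behind find_e820_area().
 * The reservation list and the RAM window are invented for illustration. */
#include <stdio.h>

struct res { unsigned long long start, end; };

static unsigned long long round_up_ull(unsigned long long x, unsigned long long a)
{
	return (x + a - 1) & ~(a - 1);		/* a must be a power of two */
}

int main(void)
{
	const struct res reserved[] = {		/* hypothetical early reservations */
		{ 0x09f000, 0x0a0000 },
		{ 0x100000, 0x380000 },
	};
	const unsigned long long ram_start = 0x090000, ram_end = 0x800000;
	const unsigned long long size = 0x40000, align = 0x10000;
	unsigned long long addr = round_up_ull(ram_start, align);
	int moved = 1;

	while (moved && addr + size <= ram_end) {
		moved = 0;
		for (unsigned i = 0; i < sizeof(reserved) / sizeof(reserved[0]); i++) {
			if (addr + size > reserved[i].start && addr < reserved[i].end) {
				addr = round_up_ull(reserved[i].end, align);
				moved = 1;	/* collided: bump past and rescan */
			}
		}
	}
	if (addr + size <= ram_end)
		printf("free %#llx-byte slot at %#llx\n", size, addr);
	else
		printf("no slot found\n");
	return 0;
}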
2422 +/*
2423 + * Find next free range after *start
2424 + */
2425 +u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align)
2426 +{
2427 + int i;
2428 +
2429 + for (i = 0; i < e820.nr_map; i++) {
2430 + struct e820entry *ei = &e820.map[i];
2431 + u64 addr, last;
2432 + u64 ei_last;
2433 +
2434 + if (ei->type != E820_RAM)
2435 + continue;
2436 + addr = round_up(ei->addr, align);
2437 + ei_last = ei->addr + ei->size;
2438 + if (addr < start)
2439 + addr = round_up(start, align);
2440 + if (addr >= ei_last)
2441 + continue;
2442 + *sizep = ei_last - addr;
2443 + while (bad_addr_size(&addr, sizep, align) &&
2444 + addr + *sizep <= ei_last)
2445 + ;
2446 + last = addr + *sizep;
2447 + if (last > ei_last)
2448 + continue;
2449 + return addr;
2450 + }
2451 +
2452 + return -1ULL;
2453 +}
2454 +
2455 +/*
2456 + * Pre-allocate 4k and reserve it in the e820 map.
2457 + */
2458 +u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align)
2459 +{
2460 + u64 size = 0;
2461 + u64 addr;
2462 + u64 start;
2463 +#ifdef CONFIG_XEN
2464 + unsigned int order = get_order(sizet);
2465 +
2466 + if (is_initial_xendomain()) {
2467 + sizet = PAGE_SIZE << order;
2468 + if (align < PAGE_SIZE)
2469 + align = PAGE_SIZE;
2470 + }
2471 +#endif
2472 + for (start = startt; ; start += size) {
2473 + start = find_e820_area_size(start, &size, align);
2474 + if (!(start + 1))
2475 + return 0;
2476 + if (size >= sizet)
2477 + break;
2478 + }
2479 +
2480 +#ifdef CONFIG_X86_32
2481 + if (start >= MAXMEM)
2482 + return 0;
2483 + if (start + size > MAXMEM)
2484 + size = MAXMEM - start;
2485 +#endif
2486 +#ifdef CONFIG_XEN
2487 + if ((start >> PAGE_SHIFT) >= xen_start_info->nr_pages)
2488 + return 0;
2489 + if (PFN_UP(start + size) > xen_start_info->nr_pages)
2490 + size = ((u64)xen_start_info->nr_pages << PAGE_SHIFT) - start;
2491 +#endif
2492 +
2493 + addr = round_down(start + size - sizet, align);
2494 + if (addr < start)
2495 + return 0;
2496 +#ifdef CONFIG_XEN
2497 + if (is_initial_xendomain()) {
2498 + int rc;
2499 + unsigned long max_initmap_pfn;
2500 +
2501 + max_initmap_pfn = ALIGN(PFN_UP(__pa(xen_start_info->pt_base))
2502 + + xen_start_info->nr_pt_frames
2503 + + 1 + (1 << (19 - PAGE_SHIFT)),
2504 + 1UL << (22 - PAGE_SHIFT));
2505 +#ifdef CONFIG_X86_32
2506 + if ((addr >> PAGE_SHIFT)
2507 + < max(max_initmap_pfn, max_pfn_mapped))
2508 + rc = xen_create_contiguous_region((unsigned long)
2509 + __va(addr),
2510 + order, 32);
2511 +#else
2512 + if ((addr >> PAGE_SHIFT) < max_pfn_mapped)
2513 + rc = xen_create_contiguous_region((unsigned long)
2514 + __va(addr),
2515 + order, 32);
2516 + else if ((addr >> PAGE_SHIFT) < max_initmap_pfn)
2517 + rc = xen_create_contiguous_region(__START_KERNEL_map
2518 + + addr,
2519 + order, 32);
2520 +#endif
2521 + else
2522 + rc = early_create_contiguous_region(addr >> PAGE_SHIFT,
2523 + order, 32);
2524 + if (rc)
2525 + return 0;
2526 + }
2527 +#endif
2528 + e820_update_range(addr, sizet, E820_RAM, E820_RESERVED);
2529 + e820_update_range_saved(addr, sizet, E820_RAM, E820_RESERVED);
2530 + printk(KERN_INFO "update e820 for early_reserve_e820\n");
2531 + update_e820();
2532 + update_e820_saved();
2533 +
2534 + return addr;
2535 +}
2536 +
2537 +#ifdef CONFIG_X86_32
2538 +# ifdef CONFIG_X86_PAE
2539 +# define MAX_ARCH_PFN (1ULL<<(40-PAGE_SHIFT))
2540 +# else
2541 +# define MAX_ARCH_PFN (1ULL<<(32-PAGE_SHIFT))
2542 +# endif
2543 +#else /* CONFIG_X86_32 */
2544 +# define MAX_ARCH_PFN MAXMEM>>PAGE_SHIFT
2545 +#endif
2546 +
2547 +/*
2548 + * Find the highest page frame number we have available
2549 + */
2550 +static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
2551 +{
2552 + int i;
2553 + unsigned long last_pfn = 0;
2554 + unsigned long max_arch_pfn = MAX_ARCH_PFN;
2555 +
2556 + for (i = 0; i < e820.nr_map; i++) {
2557 + struct e820entry *ei = &e820.map[i];
2558 + unsigned long start_pfn;
2559 + unsigned long end_pfn;
2560 +
2561 + if (ei->type != type)
2562 + continue;
2563 +
2564 + start_pfn = ei->addr >> PAGE_SHIFT;
2565 + end_pfn = (ei->addr + ei->size) >> PAGE_SHIFT;
2566 +
2567 + if (start_pfn >= limit_pfn)
2568 + continue;
2569 + if (end_pfn > limit_pfn) {
2570 + last_pfn = limit_pfn;
2571 + break;
2572 + }
2573 + if (end_pfn > last_pfn)
2574 + last_pfn = end_pfn;
2575 + }
2576 +
2577 + if (last_pfn > max_arch_pfn)
2578 + last_pfn = max_arch_pfn;
2579 +
2580 + printk(KERN_INFO "last_pfn = %#lx max_arch_pfn = %#lx\n",
2581 + last_pfn, max_arch_pfn);
2582 + return last_pfn;
2583 +}
2584 +unsigned long __init e820_end_of_ram_pfn(void)
2585 +{
2586 + return e820_end_pfn(MAX_ARCH_PFN, E820_RAM);
2587 +}
2588 +
2589 +unsigned long __init e820_end_of_low_ram_pfn(void)
2590 +{
2591 + return e820_end_pfn(1UL<<(32 - PAGE_SHIFT), E820_RAM);
2592 +}
2593 +/*
2594 + * Finds an active region in the address range from start_pfn to last_pfn and
2595 + * returns its range in ei_startpfn and ei_endpfn for the e820 entry.
2596 + */
2597 +int __init e820_find_active_region(const struct e820entry *ei,
2598 + unsigned long start_pfn,
2599 + unsigned long last_pfn,
2600 + unsigned long *ei_startpfn,
2601 + unsigned long *ei_endpfn)
2602 +{
2603 + u64 align = PAGE_SIZE;
2604 +
2605 + *ei_startpfn = round_up(ei->addr, align) >> PAGE_SHIFT;
2606 + *ei_endpfn = round_down(ei->addr + ei->size, align) >> PAGE_SHIFT;
2607 +
2608 + /* Skip map entries smaller than a page */
2609 + if (*ei_startpfn >= *ei_endpfn)
2610 + return 0;
2611 +
2612 + /* Skip if map is outside the node */
2613 + if (ei->type != E820_RAM || *ei_endpfn <= start_pfn ||
2614 + *ei_startpfn >= last_pfn)
2615 + return 0;
2616 +
2617 + /* Check for overlaps */
2618 + if (*ei_startpfn < start_pfn)
2619 + *ei_startpfn = start_pfn;
2620 + if (*ei_endpfn > last_pfn)
2621 + *ei_endpfn = last_pfn;
2622 +
2623 + return 1;
2624 +}
2625 +
2626 +/* Walk the e820 map and register active regions within a node */
2627 +void __init e820_register_active_regions(int nid, unsigned long start_pfn,
2628 + unsigned long last_pfn)
2629 +{
2630 + unsigned long ei_startpfn;
2631 + unsigned long ei_endpfn;
2632 + int i;
2633 +
2634 + for (i = 0; i < e820.nr_map; i++)
2635 + if (e820_find_active_region(&e820.map[i],
2636 + start_pfn, last_pfn,
2637 + &ei_startpfn, &ei_endpfn))
2638 + add_active_range(nid, ei_startpfn, ei_endpfn);
2639 +}
2640 +
2641 +/*
2642 + * Find the hole size (in bytes) in the memory range.
2643 + * @start: starting address of the memory range to scan
2644 + * @end: ending address of the memory range to scan
2645 + */
2646 +u64 __init e820_hole_size(u64 start, u64 end)
2647 +{
2648 + unsigned long start_pfn = start >> PAGE_SHIFT;
2649 + unsigned long last_pfn = end >> PAGE_SHIFT;
2650 + unsigned long ei_startpfn, ei_endpfn, ram = 0;
2651 + int i;
2652 +
2653 + for (i = 0; i < e820.nr_map; i++) {
2654 + if (e820_find_active_region(&e820.map[i],
2655 + start_pfn, last_pfn,
2656 + &ei_startpfn, &ei_endpfn))
2657 + ram += ei_endpfn - ei_startpfn;
2658 + }
2659 + return end - start - ((u64)ram << PAGE_SHIFT);
2660 +}
2661 +
2662 +static void early_panic(char *msg)
2663 +{
2664 + early_printk(msg);
2665 + panic(msg);
2666 +}
2667 +
2668 +static int userdef __initdata;
2669 +
2670 +/* "mem=nopentium" disables the 4MB page tables. */
2671 +static int __init parse_memopt(char *p)
2672 +{
2673 + u64 mem_size, current_end;
2674 + unsigned int i;
2675 +
2676 + if (!p)
2677 + return -EINVAL;
2678 +
2679 +#ifdef CONFIG_X86_32
2680 + if (!strcmp(p, "nopentium")) {
2681 + setup_clear_cpu_cap(X86_FEATURE_PSE);
2682 + return 0;
2683 + }
2684 +#endif
2685 +
2686 + userdef = 1;
2687 + mem_size = memparse(p, &p);
2688 + e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
2689 +
2690 + i = e820.nr_map - 1;
2691 + current_end = e820.map[i].addr + e820.map[i].size;
2692 + if (current_end < mem_size) {
2693 + /*
2694 + * The e820 map ends before our requested size so
2695 + * extend the final entry to the requested address.
2696 + */
2697 + if (e820.map[i].type == E820_RAM)
2698 + e820.map[i].size = mem_size - e820.map[i].addr;
2699 + else
2700 + e820_add_region(current_end, mem_size - current_end, E820_RAM);
2701 + }
2702 +
2703 + return 0;
2704 +}
2705 +early_param("mem", parse_memopt);
2706 +
2707 +#ifndef CONFIG_XEN
2708 +static int __init parse_memmap_opt(char *p)
2709 +{
2710 + char *oldp;
2711 + u64 start_at, mem_size;
2712 +
2713 + if (!p)
2714 + return -EINVAL;
2715 +
2716 + if (!strncmp(p, "exactmap", 8)) {
2717 +#ifdef CONFIG_CRASH_DUMP
2718 + /*
2719 + * If we are doing a crash dump, we still need to know
2720 + * the real mem size before original memory map is
2721 + * reset.
2722 + */
2723 + saved_max_pfn = e820_end_of_ram_pfn();
2724 +#endif
2725 + e820.nr_map = 0;
2726 + userdef = 1;
2727 + return 0;
2728 + }
2729 +
2730 + oldp = p;
2731 + mem_size = memparse(p, &p);
2732 + if (p == oldp)
2733 + return -EINVAL;
2734 +
2735 + userdef = 1;
2736 + if (*p == '@') {
2737 + start_at = memparse(p+1, &p);
2738 + e820_add_region(start_at, mem_size, E820_RAM);
2739 + } else if (*p == '#') {
2740 + start_at = memparse(p+1, &p);
2741 + e820_add_region(start_at, mem_size, E820_ACPI);
2742 + } else if (*p == '$') {
2743 + start_at = memparse(p+1, &p);
2744 + e820_add_region(start_at, mem_size, E820_RESERVED);
2745 + } else
2746 + e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
2747 +
2748 + return *p == '\0' ? 0 : -EINVAL;
2749 +}
2750 +early_param("memmap", parse_memmap_opt);
2751 +#endif
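For reference, parse_memmap_opt() accepts size@addr to add RAM, size#addr for ACPI data, size$addr for a reserved range, and a bare size that truncates RAM above it, with the numeric syntax handled by memparse(). The following is a simplified stand-in (only K/M/G suffixes, no kernel integration) that just shows how such strings decompose:

/* Simplified stand-in for the memmap= parsing above: not memparse(),
 * just enough to show how "size[@#$]address" strings decompose. */
#include <stdio.h>
#include <stdlib.h>

static unsigned long long parse_size(const char *s, char **end)
{
	unsigned long long v = strtoull(s, end, 0);

	switch (**end) {
	case 'G': case 'g': v <<= 10; /* fall through */
	case 'M': case 'm': v <<= 10; /* fall through */
	case 'K': case 'k': v <<= 10; (*end)++; break;
	}
	return v;
}

int main(void)
{
	const char *args[] = { "64M@4G", "16M#0x1000000", "512K$0xa0000", "2G" };

	for (unsigned i = 0; i < sizeof(args) / sizeof(args[0]); i++) {
		char *p;
		unsigned long long size = parse_size(args[i], &p);
		const char *kind = *p == '@' ? "RAM" :
				   *p == '#' ? "ACPI" :
				   *p == '$' ? "reserved" : "limit";
		unsigned long long addr = (*p == '@' || *p == '#' || *p == '$')
					  ? parse_size(p + 1, &p) : 0;

		printf("memmap=%-14s -> %-8s size=%#llx addr=%#llx\n",
		       args[i], kind, size, addr);
	}
	return 0;
}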
2752 +
2753 +void __init finish_e820_parsing(void)
2754 +{
2755 + if (userdef) {
2756 + int nr = e820.nr_map;
2757 +
2758 + if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr) < 0)
2759 + early_panic("Invalid user supplied memory map");
2760 + e820.nr_map = nr;
2761 +
2762 + printk(KERN_INFO "user-defined physical RAM map:\n");
2763 + e820_print_map("user");
2764 + }
2765 +}
2766 +
2767 +static inline const char *e820_type_to_string(int e820_type)
2768 +{
2769 + switch (e820_type) {
2770 + case E820_RESERVED_KERN:
2771 + case E820_RAM: return "System RAM";
2772 + case E820_ACPI: return "ACPI Tables";
2773 + case E820_NVS: return "ACPI Non-volatile Storage";
2774 + default: return "reserved";
2775 + }
2776 +}
2777 +
2778 +#ifdef CONFIG_XEN
2779 +#define e820 machine_e820
2780 +#endif
2781 +
2782 +/*
2783 + * Mark e820 reserved areas as busy for the resource manager.
2784 + */
2785 +void __init e820_reserve_resources(void)
2786 +{
2787 + int i;
2788 + struct resource *res;
2789 + u64 end;
2790 +
2791 + res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map);
2792 + for (i = 0; i < e820.nr_map; i++) {
2793 + end = e820.map[i].addr + e820.map[i].size - 1;
2794 +#ifndef CONFIG_RESOURCES_64BIT
2795 + if (end > 0x100000000ULL) {
2796 + res++;
2797 + continue;
2798 + }
2799 +#endif
2800 + res->name = e820_type_to_string(e820.map[i].type);
2801 + res->start = e820.map[i].addr;
2802 + res->end = end;
2803 +
2804 + res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
2805 + insert_resource(&iomem_resource, res);
2806 + res++;
2807 + }
2808 +
2809 + for (i = 0; i < e820_saved.nr_map; i++) {
2810 + struct e820entry *entry = &e820_saved.map[i];
2811 + firmware_map_add_early(entry->addr,
2812 + entry->addr + entry->size - 1,
2813 + e820_type_to_string(entry->type));
2814 + }
2815 +}
2816 +
2817 +#undef e820
2818 +
2819 +#ifndef CONFIG_XEN
2820 +char *__init default_machine_specific_memory_setup(void)
2821 +{
2822 + char *who = "BIOS-e820";
2823 + int new_nr;
2824 + /*
2825 + * Try to copy the BIOS-supplied E820-map.
2826 + *
2827 + * Otherwise fake a memory map; one section from 0k->640k,
2828 + * the next section from 1mb->appropriate_mem_k
2829 + */
2830 + new_nr = boot_params.e820_entries;
2831 + sanitize_e820_map(boot_params.e820_map,
2832 + ARRAY_SIZE(boot_params.e820_map),
2833 + &new_nr);
2834 + boot_params.e820_entries = new_nr;
2835 + if (append_e820_map(boot_params.e820_map, boot_params.e820_entries)
2836 + < 0) {
2837 + u64 mem_size;
2838 +
2839 + /* compare results from other methods and take the greater */
2840 + if (boot_params.alt_mem_k
2841 + < boot_params.screen_info.ext_mem_k) {
2842 + mem_size = boot_params.screen_info.ext_mem_k;
2843 + who = "BIOS-88";
2844 + } else {
2845 + mem_size = boot_params.alt_mem_k;
2846 + who = "BIOS-e801";
2847 + }
2848 +
2849 + e820.nr_map = 0;
2850 + e820_add_region(0, LOWMEMSIZE(), E820_RAM);
2851 + e820_add_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
2852 + }
2853 +
2854 + /* In case someone cares... */
2855 + return who;
2856 +}
2857 +
2858 +char *__init __attribute__((weak)) machine_specific_memory_setup(void)
2859 +{
2860 + if (x86_quirks->arch_memory_setup) {
2861 + char *who = x86_quirks->arch_memory_setup();
2862 +
2863 + if (who)
2864 + return who;
2865 + }
2866 + return default_machine_specific_memory_setup();
2867 +}
2868 +#endif
2869 +
2870 +char * __init memory_setup(void)
2871 +{
2872 + int rc, nr_map;
2873 + struct xen_memory_map memmap;
2874 + /*
2875 + * This is rather large for a stack variable but this early in
2876 + * the boot process we know we have plenty of slack space.
2877 + */
2878 + struct e820entry map[E820MAX];
2879 +
2880 + memmap.nr_entries = E820MAX;
2881 + set_xen_guest_handle(memmap.buffer, map);
2882 +
2883 + rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
2884 + if (rc == -ENOSYS) {
2885 + memmap.nr_entries = 1;
2886 + map[0].addr = 0ULL;
2887 + map[0].size = PFN_PHYS((unsigned long long)xen_start_info->nr_pages);
2888 + /* 8MB slack (to balance backend allocations). */
2889 + map[0].size += 8ULL << 20;
2890 + map[0].type = E820_RAM;
2891 + rc = 0;
2892 + }
2893 + BUG_ON(rc);
2894 +
2895 + nr_map = memmap.nr_entries;
2896 + sanitize_e820_map(map, ARRAY_SIZE(map), &nr_map);
2897 +
2898 + if (append_e820_map(map, nr_map) < 0)
2899 + BUG();
2900 +
2901 +#ifdef CONFIG_XEN
2902 + if (is_initial_xendomain()) {
2903 + memmap.nr_entries = E820MAX;
2904 + set_xen_guest_handle(memmap.buffer, machine_e820.map);
2905 +
2906 + if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap))
2907 + BUG();
2908 + machine_e820.nr_map = memmap.nr_entries;
2909 + }
2910 +#endif
2911 +
2912 + return "Xen";
2913 +}
2914 +
2915 +void __init setup_memory_map(void)
2916 +{
2917 + char *who;
2918 +
2919 + who = memory_setup();
2920 +#ifdef CONFIG_XEN
2921 + if (!is_initial_xendomain())
2922 +#endif
2923 + memcpy(&e820_saved, &e820, sizeof(struct e820map));
2924 + printk(KERN_INFO "Xen-provided physical RAM map:\n");
2925 + e820_print_map(who);
2926 +}
2927 --- sle11-2009-10-16.orig/arch/x86/kernel/e820_32-xen.c 2009-03-16 16:38:05.000000000 +0100
2928 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
2929 @@ -1,873 +0,0 @@
2930 -#include <linux/kernel.h>
2931 -#include <linux/types.h>
2932 -#include <linux/init.h>
2933 -#include <linux/bootmem.h>
2934 -#include <linux/ioport.h>
2935 -#include <linux/string.h>
2936 -#include <linux/kexec.h>
2937 -#include <linux/module.h>
2938 -#include <linux/mm.h>
2939 -#include <linux/pfn.h>
2940 -#include <linux/uaccess.h>
2941 -#include <linux/suspend.h>
2942 -
2943 -#include <asm/pgtable.h>
2944 -#include <asm/page.h>
2945 -#include <asm/e820.h>
2946 -#include <asm/setup.h>
2947 -#include <xen/interface/memory.h>
2948 -
2949 -struct e820map e820;
2950 -struct change_member {
2951 - struct e820entry *pbios; /* pointer to original bios entry */
2952 - unsigned long long addr; /* address for this change point */
2953 -};
2954 -static struct change_member change_point_list[2*E820MAX] __initdata;
2955 -static struct change_member *change_point[2*E820MAX] __initdata;
2956 -static struct e820entry *overlap_list[E820MAX] __initdata;
2957 -static struct e820entry new_bios[E820MAX] __initdata;
2958 -/* For PCI or other memory-mapped resources */
2959 -unsigned long pci_mem_start = 0x10000000;
2960 -#ifdef CONFIG_PCI
2961 -EXPORT_SYMBOL(pci_mem_start);
2962 -#endif
2963 -extern int user_defined_memmap;
2964 -
2965 -static struct resource system_rom_resource = {
2966 - .name = "System ROM",
2967 - .start = 0xf0000,
2968 - .end = 0xfffff,
2969 - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
2970 -};
2971 -
2972 -static struct resource extension_rom_resource = {
2973 - .name = "Extension ROM",
2974 - .start = 0xe0000,
2975 - .end = 0xeffff,
2976 - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
2977 -};
2978 -
2979 -static struct resource adapter_rom_resources[] = { {
2980 - .name = "Adapter ROM",
2981 - .start = 0xc8000,
2982 - .end = 0,
2983 - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
2984 -}, {
2985 - .name = "Adapter ROM",
2986 - .start = 0,
2987 - .end = 0,
2988 - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
2989 -}, {
2990 - .name = "Adapter ROM",
2991 - .start = 0,
2992 - .end = 0,
2993 - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
2994 -}, {
2995 - .name = "Adapter ROM",
2996 - .start = 0,
2997 - .end = 0,
2998 - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
2999 -}, {
3000 - .name = "Adapter ROM",
3001 - .start = 0,
3002 - .end = 0,
3003 - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
3004 -}, {
3005 - .name = "Adapter ROM",
3006 - .start = 0,
3007 - .end = 0,
3008 - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
3009 -} };
3010 -
3011 -static struct resource video_rom_resource = {
3012 - .name = "Video ROM",
3013 - .start = 0xc0000,
3014 - .end = 0xc7fff,
3015 - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
3016 -};
3017 -
3018 -#define ROMSIGNATURE 0xaa55
3019 -
3020 -static int __init romsignature(const unsigned char *rom)
3021 -{
3022 - const unsigned short * const ptr = (const unsigned short *)rom;
3023 - unsigned short sig;
3024 -
3025 - return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE;
3026 -}
3027 -
3028 -static int __init romchecksum(const unsigned char *rom, unsigned long length)
3029 -{
3030 - unsigned char sum, c;
3031 -
3032 - for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--)
3033 - sum += c;
3034 - return !length && !sum;
3035 -}
3036 -
3037 -static void __init probe_roms(void)
3038 -{
3039 - const unsigned char *rom;
3040 - unsigned long start, length, upper;
3041 - unsigned char c;
3042 - int i;
3043 -
3044 -#ifdef CONFIG_XEN
3045 - /* Nothing to do if not running in dom0. */
3046 - if (!is_initial_xendomain())
3047 - return;
3048 -#endif
3049 -
3050 - /* video rom */
3051 - upper = adapter_rom_resources[0].start;
3052 - for (start = video_rom_resource.start; start < upper; start += 2048) {
3053 - rom = isa_bus_to_virt(start);
3054 - if (!romsignature(rom))
3055 - continue;
3056 -
3057 - video_rom_resource.start = start;
3058 -
3059 - if (probe_kernel_address(rom + 2, c) != 0)
3060 - continue;
3061 -
3062 - /* 0 < length <= 0x7f * 512, historically */
3063 - length = c * 512;
3064 -
3065 - /* if checksum okay, trust length byte */
3066 - if (length && romchecksum(rom, length))
3067 - video_rom_resource.end = start + length - 1;
3068 -
3069 - request_resource(&iomem_resource, &video_rom_resource);
3070 - break;
3071 - }
3072 -
3073 - start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
3074 - if (start < upper)
3075 - start = upper;
3076 -
3077 - /* system rom */
3078 - request_resource(&iomem_resource, &system_rom_resource);
3079 - upper = system_rom_resource.start;
3080 -
3081 - /* check for extension rom (ignore length byte!) */
3082 - rom = isa_bus_to_virt((unsigned long)extension_rom_resource.start);
3083 - if (romsignature(rom)) {
3084 - length = extension_rom_resource.end - extension_rom_resource.start + 1;
3085 - if (romchecksum(rom, length)) {
3086 - request_resource(&iomem_resource, &extension_rom_resource);
3087 - upper = extension_rom_resource.start;
3088 - }
3089 - }
3090 -
3091 - /* check for adapter roms on 2k boundaries */
3092 - for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) {
3093 - rom = isa_bus_to_virt(start);
3094 - if (!romsignature(rom))
3095 - continue;
3096 -
3097 - if (probe_kernel_address(rom + 2, c) != 0)
3098 - continue;
3099 -
3100 - /* 0 < length <= 0x7f * 512, historically */
3101 - length = c * 512;
3102 -
3103 - /* but accept any length that fits if checksum okay */
3104 - if (!length || start + length > upper || !romchecksum(rom, length))
3105 - continue;
3106 -
3107 - adapter_rom_resources[i].start = start;
3108 - adapter_rom_resources[i].end = start + length - 1;
3109 - request_resource(&iomem_resource, &adapter_rom_resources[i]);
3110 -
3111 - start = adapter_rom_resources[i++].end & ~2047UL;
3112 - }
3113 -}
3114 -
3115 -#ifdef CONFIG_XEN
3116 -static struct e820map machine_e820;
3117 -#define e820 machine_e820
3118 -#endif
3119 -
3120 -/*
3121 - * Request address space for all standard RAM and ROM resources
3122 - * and also for regions reported as reserved by the e820.
3123 - */
3124 -void __init init_iomem_resources(struct resource *code_resource,
3125 - struct resource *data_resource,
3126 - struct resource *bss_resource)
3127 -{
3128 - int i;
3129 -
3130 - probe_roms();
3131 - for (i = 0; i < e820.nr_map; i++) {
3132 - struct resource *res;
3133 -#ifndef CONFIG_RESOURCES_64BIT
3134 - if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL)
3135 - continue;
3136 -#endif
3137 - res = kzalloc(sizeof(struct resource), GFP_ATOMIC);
3138 - switch (e820.map[i].type) {
3139 - case E820_RAM: res->name = "System RAM"; break;
3140 - case E820_ACPI: res->name = "ACPI Tables"; break;
3141 - case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
3142 - default: res->name = "reserved";
3143 - }
3144 - res->start = e820.map[i].addr;
3145 - res->end = res->start + e820.map[i].size - 1;
3146 - res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
3147 - if (request_resource(&iomem_resource, res)) {
3148 - kfree(res);
3149 - continue;
3150 - }
3151 - if (e820.map[i].type == E820_RAM) {
3152 - /*
3153 - * We don't know which RAM region contains kernel data,
3154 - * so we try it repeatedly and let the resource manager
3155 - * test it.
3156 - */
3157 -#ifndef CONFIG_XEN
3158 - request_resource(res, code_resource);
3159 - request_resource(res, data_resource);
3160 - request_resource(res, bss_resource);
3161 -#endif
3162 -#ifdef CONFIG_KEXEC
3163 - if (crashk_res.start != crashk_res.end)
3164 - request_resource(res, &crashk_res);
3165 -#ifdef CONFIG_XEN
3166 - xen_machine_kexec_register_resources(res);
3167 -#endif
3168 -#endif
3169 - }
3170 - }
3171 -}
3172 -
3173 -#undef e820
3174 -
3175 -#if defined(CONFIG_PM) && defined(CONFIG_HIBERNATION)
3176 -/**
3177 - * e820_mark_nosave_regions - Find the ranges of physical addresses that do not
3178 - * correspond to e820 RAM areas and mark the corresponding pages as nosave for
3179 - * hibernation.
3180 - *
3181 - * This function requires the e820 map to be sorted and without any
3182 - * overlapping entries and assumes the first e820 area to be RAM.
3183 - */
3184 -void __init e820_mark_nosave_regions(void)
3185 -{
3186 - int i;
3187 - unsigned long pfn;
3188 -
3189 - pfn = PFN_DOWN(e820.map[0].addr + e820.map[0].size);
3190 - for (i = 1; i < e820.nr_map; i++) {
3191 - struct e820entry *ei = &e820.map[i];
3192 -
3193 - if (pfn < PFN_UP(ei->addr))
3194 - register_nosave_region(pfn, PFN_UP(ei->addr));
3195 -
3196 - pfn = PFN_DOWN(ei->addr + ei->size);
3197 - if (ei->type != E820_RAM)
3198 - register_nosave_region(PFN_UP(ei->addr), pfn);
3199 -
3200 - if (pfn >= max_low_pfn)
3201 - break;
3202 - }
3203 -}
3204 -#endif
3205 -
3206 -void __init add_memory_region(unsigned long long start,
3207 - unsigned long long size, int type)
3208 -{
3209 - int x;
3210 -
3211 - x = e820.nr_map;
3212 -
3213 - if (x == E820MAX) {
3214 - printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
3215 - return;
3216 - }
3217 -
3218 - e820.map[x].addr = start;
3219 - e820.map[x].size = size;
3220 - e820.map[x].type = type;
3221 - e820.nr_map++;
3222 -} /* add_memory_region */
3223 -
3224 -/*
3225 - * Sanitize the BIOS e820 map.
3226 - *
3227 - * Some e820 responses include overlapping entries. The following
3228 - * replaces the original e820 map with a new one, removing overlaps.
3229 - *
3230 - */
3231 -int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
3232 -{
3233 - struct change_member *change_tmp;
3234 - unsigned long current_type, last_type;
3235 - unsigned long long last_addr;
3236 - int chgidx, still_changing;
3237 - int overlap_entries;
3238 - int new_bios_entry;
3239 - int old_nr, new_nr, chg_nr;
3240 - int i;
3241 -
3242 - /*
3243 - Visually we're performing the following (1,2,3,4 = memory types)...
3244 -
3245 - Sample memory map (w/overlaps):
3246 - ____22__________________
3247 - ______________________4_
3248 - ____1111________________
3249 - _44_____________________
3250 - 11111111________________
3251 - ____________________33__
3252 - ___________44___________
3253 - __________33333_________
3254 - ______________22________
3255 - ___________________2222_
3256 - _________111111111______
3257 - _____________________11_
3258 - _________________4______
3259 -
3260 - Sanitized equivalent (no overlap):
3261 - 1_______________________
3262 - _44_____________________
3263 - ___1____________________
3264 - ____22__________________
3265 - ______11________________
3266 - _________1______________
3267 - __________3_____________
3268 - ___________44___________
3269 - _____________33_________
3270 - _______________2________
3271 - ________________1_______
3272 - _________________4______
3273 - ___________________2____
3274 - ____________________33__
3275 - ______________________4_
3276 - */
3277 - /* if there's only one memory region, don't bother */
3278 - if (*pnr_map < 2) {
3279 - return -1;
3280 - }
3281 -
3282 - old_nr = *pnr_map;
3283 -
3284 - /* bail out if we find any unreasonable addresses in bios map */
3285 - for (i=0; i<old_nr; i++)
3286 - if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) {
3287 - return -1;
3288 - }
3289 -
3290 - /* create pointers for initial change-point information (for sorting) */
3291 - for (i=0; i < 2*old_nr; i++)
3292 - change_point[i] = &change_point_list[i];
3293 -
3294 - /* record all known change-points (starting and ending addresses),
3295 - omitting those that are for empty memory regions */
3296 - chgidx = 0;
3297 - for (i=0; i < old_nr; i++) {
3298 - if (biosmap[i].size != 0) {
3299 - change_point[chgidx]->addr = biosmap[i].addr;
3300 - change_point[chgidx++]->pbios = &biosmap[i];
3301 - change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
3302 - change_point[chgidx++]->pbios = &biosmap[i];
3303 - }
3304 - }
3305 - chg_nr = chgidx; /* true number of change-points */
3306 -
3307 - /* sort change-point list by memory addresses (low -> high) */
3308 - still_changing = 1;
3309 - while (still_changing) {
3310 - still_changing = 0;
3311 - for (i=1; i < chg_nr; i++) {
3312 - /* if <current_addr> > <last_addr>, swap */
3313 - /* or, if current=<start_addr> & last=<end_addr>, swap */
3314 - if ((change_point[i]->addr < change_point[i-1]->addr) ||
3315 - ((change_point[i]->addr == change_point[i-1]->addr) &&
3316 - (change_point[i]->addr == change_point[i]->pbios->addr) &&
3317 - (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
3318 - )
3319 - {
3320 - change_tmp = change_point[i];
3321 - change_point[i] = change_point[i-1];
3322 - change_point[i-1] = change_tmp;
3323 - still_changing=1;
3324 - }
3325 - }
3326 - }
3327 -
3328 - /* create a new bios memory map, removing overlaps */
3329 - overlap_entries=0; /* number of entries in the overlap table */
3330 - new_bios_entry=0; /* index for creating new bios map entries */
3331 - last_type = 0; /* start with undefined memory type */
3332 - last_addr = 0; /* start with 0 as last starting address */
3333 - /* loop through change-points, determining affect on the new bios map */
3334 - for (chgidx=0; chgidx < chg_nr; chgidx++)
3335 - {
3336 - /* keep track of all overlapping bios entries */
3337 - if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
3338 - {
3339 - /* add map entry to overlap list (> 1 entry implies an overlap) */
3340 - overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
3341 - }
3342 - else
3343 - {
3344 - /* remove entry from list (order independent, so swap with last) */
3345 - for (i=0; i<overlap_entries; i++)
3346 - {
3347 - if (overlap_list[i] == change_point[chgidx]->pbios)
3348 - overlap_list[i] = overlap_list[overlap_entries-1];
3349 - }
3350 - overlap_entries--;
3351 - }
3352 - /* if there are overlapping entries, decide which "type" to use */
3353 - /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
3354 - current_type = 0;
3355 - for (i=0; i<overlap_entries; i++)
3356 - if (overlap_list[i]->type > current_type)
3357 - current_type = overlap_list[i]->type;
3358 - /* continue building up new bios map based on this information */
3359 - if (current_type != last_type) {
3360 - if (last_type != 0) {
3361 - new_bios[new_bios_entry].size =
3362 - change_point[chgidx]->addr - last_addr;
3363 - /* move forward only if the new size was non-zero */
3364 - if (new_bios[new_bios_entry].size != 0)
3365 - if (++new_bios_entry >= E820MAX)
3366 - break; /* no more space left for new bios entries */
3367 - }
3368 - if (current_type != 0) {
3369 - new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
3370 - new_bios[new_bios_entry].type = current_type;
3371 - last_addr=change_point[chgidx]->addr;
3372 - }
3373 - last_type = current_type;
3374 - }
3375 - }
3376 - new_nr = new_bios_entry; /* retain count for new bios entries */
3377 -
3378 - /* copy new bios mapping into original location */
3379 - memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
3380 - *pnr_map = new_nr;
3381 -
3382 - return 0;
3383 -}
3384 -
3385 -/*
3386 - * Copy the BIOS e820 map into a safe place.
3387 - *
3388 - * Sanity-check it while we're at it..
3389 - *
3390 - * If we're lucky and live on a modern system, the setup code
3391 - * will have given us a memory map that we can use to properly
3392 - * set up memory. If we aren't, we'll fake a memory map.
3393 - *
3394 - * We check to see that the memory map contains at least 2 elements
3395 - * before we'll use it, because the detection code in setup.S may
3396 - * not be perfect and most every PC known to man has two memory
3397 - * regions: one from 0 to 640k, and one from 1mb up. (The IBM
3398 - * thinkpad 560x, for example, does not cooperate with the memory
3399 - * detection code.)
3400 - */
3401 -int __init copy_e820_map(struct e820entry *biosmap, int nr_map)
3402 -{
3403 -#ifndef CONFIG_XEN
3404 - /* Only one memory region (or negative)? Ignore it */
3405 - if (nr_map < 2)
3406 - return -1;
3407 -#else
3408 - BUG_ON(nr_map < 1);
3409 -#endif
3410 -
3411 - do {
3412 - u64 start = biosmap->addr;
3413 - u64 size = biosmap->size;
3414 - u64 end = start + size;
3415 - u32 type = biosmap->type;
3416 -
3417 - /* Overflow in 64 bits? Ignore the memory map. */
3418 - if (start > end)
3419 - return -1;
3420 -
3421 - add_memory_region(start, size, type);
3422 - } while (biosmap++, --nr_map);
3423 -
3424 -#ifdef CONFIG_XEN
3425 - if (is_initial_xendomain()) {
3426 - struct xen_memory_map memmap;
3427 -
3428 - memmap.nr_entries = E820MAX;
3429 - set_xen_guest_handle(memmap.buffer, machine_e820.map);
3430 -
3431 - if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap))
3432 - BUG();
3433 - machine_e820.nr_map = memmap.nr_entries;
3434 - } else
3435 - machine_e820 = e820;
3436 -#endif
3437 -
3438 - return 0;
3439 -}
3440 -
3441 -/*
3442 - * Find the highest page frame number we have available
3443 - */
3444 -void __init propagate_e820_map(void)
3445 -{
3446 - int i;
3447 -
3448 - max_pfn = 0;
3449 -
3450 - for (i = 0; i < e820.nr_map; i++) {
3451 - unsigned long start, end;
3452 - /* RAM? */
3453 - if (e820.map[i].type != E820_RAM)
3454 - continue;
3455 - start = PFN_UP(e820.map[i].addr);
3456 - end = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
3457 - if (start >= end)
3458 - continue;
3459 - if (end > max_pfn)
3460 - max_pfn = end;
3461 - memory_present(0, start, end);
3462 - }
3463 -}
3464 -
3465 -/*
3466 - * Register fully available low RAM pages with the bootmem allocator.
3467 - */
3468 -void __init register_bootmem_low_pages(unsigned long max_low_pfn)
3469 -{
3470 - int i;
3471 -
3472 - for (i = 0; i < e820.nr_map; i++) {
3473 - unsigned long curr_pfn, last_pfn, size;
3474 - /*
3475 - * Reserve usable low memory
3476 - */
3477 - if (e820.map[i].type != E820_RAM)
3478 - continue;
3479 - /*
3480 - * We are rounding up the start address of usable memory:
3481 - */
3482 - curr_pfn = PFN_UP(e820.map[i].addr);
3483 - if (curr_pfn >= max_low_pfn)
3484 - continue;
3485 - /*
3486 - * ... and at the end of the usable range downwards:
3487 - */
3488 - last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
3489 -
3490 -#ifdef CONFIG_XEN
3491 - /*
3492 - * Truncate to the number of actual pages currently
3493 - * present.
3494 - */
3495 - if (last_pfn > xen_start_info->nr_pages)
3496 - last_pfn = xen_start_info->nr_pages;
3497 -#endif
3498 -
3499 - if (last_pfn > max_low_pfn)
3500 - last_pfn = max_low_pfn;
3501 -
3502 - /*
3503 - * .. finally, did all the rounding and playing
3504 - * around just make the area go away?
3505 - */
3506 - if (last_pfn <= curr_pfn)
3507 - continue;
3508 -
3509 - size = last_pfn - curr_pfn;
3510 - free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size));
3511 - }
3512 -}
3513 -
3514 -void __init e820_register_memory(void)
3515 -{
3516 - unsigned long gapstart, gapsize, round;
3517 - unsigned long long last;
3518 - int i;
3519 -
3520 -#ifdef CONFIG_XEN
3521 - if (is_initial_xendomain()) {
3522 - struct xen_memory_map memmap;
3523 -
3524 - memmap.nr_entries = E820MAX;
3525 - set_xen_guest_handle(memmap.buffer, machine_e820.map);
3526 -
3527 - if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap))
3528 - BUG();
3529 - machine_e820.nr_map = memmap.nr_entries;
3530 - }
3531 - else
3532 - machine_e820 = e820;
3533 -#define e820 machine_e820
3534 -#endif
3535 -
3536 - /*
3537 - * Search for the biggest gap in the low 32 bits of the e820
3538 - * memory space.
3539 - */
3540 - last = 0x100000000ull;
3541 - gapstart = 0x10000000;
3542 - gapsize = 0x400000;
3543 - i = e820.nr_map;
3544 - while (--i >= 0) {
3545 - unsigned long long start = e820.map[i].addr;
3546 - unsigned long long end = start + e820.map[i].size;
3547 -
3548 - /*
3549 - * Since "last" is at most 4GB, we know we'll
3550 - * fit in 32 bits if this condition is true
3551 - */
3552 - if (last > end) {
3553 - unsigned long gap = last - end;
3554 -
3555 - if (gap > gapsize) {
3556 - gapsize = gap;
3557 - gapstart = end;
3558 - }
3559 - }
3560 - if (start < last)
3561 - last = start;
3562 - }
3563 -#undef e820
3564 -
3565 - /*
3566 - * See how much we want to round up: start off with
3567 - * rounding to the next 1MB area.
3568 - */
3569 - round = 0x100000;
3570 - while ((gapsize >> 4) > round)
3571 - round += round;
3572 - /* Fun with two's complement */
3573 - pci_mem_start = (gapstart + round) & -round;
3574 -
3575 - printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n",
3576 - pci_mem_start, gapstart, gapsize);
3577 -}
3578 -
3579 -void __init print_memory_map(char *who)
3580 -{
3581 - int i;
3582 -
3583 - for (i = 0; i < e820.nr_map; i++) {
3584 - printk(" %s: %016Lx - %016Lx ", who,
3585 - e820.map[i].addr,
3586 - e820.map[i].addr + e820.map[i].size);
3587 - switch (e820.map[i].type) {
3588 - case E820_RAM: printk("(usable)\n");
3589 - break;
3590 - case E820_RESERVED:
3591 - printk("(reserved)\n");
3592 - break;
3593 - case E820_ACPI:
3594 - printk("(ACPI data)\n");
3595 - break;
3596 - case E820_NVS:
3597 - printk("(ACPI NVS)\n");
3598 - break;
3599 - default: printk("type %u\n", e820.map[i].type);
3600 - break;
3601 - }
3602 - }
3603 -}
3604 -
3605 -void __init limit_regions(unsigned long long size)
3606 -{
3607 - unsigned long long current_addr = 0;
3608 - int i;
3609 -
3610 - print_memory_map("limit_regions start");
3611 - for (i = 0; i < e820.nr_map; i++) {
3612 - current_addr = e820.map[i].addr + e820.map[i].size;
3613 - if (current_addr < size)
3614 - continue;
3615 -
3616 - if (e820.map[i].type != E820_RAM)
3617 - continue;
3618 -
3619 - if (e820.map[i].addr >= size) {
3620 - /*
3621 - * This region starts past the end of the
3622 - * requested size, skip it completely.
3623 - */
3624 - e820.nr_map = i;
3625 - } else {
3626 - e820.nr_map = i + 1;
3627 - e820.map[i].size -= current_addr - size;
3628 - }
3629 - print_memory_map("limit_regions endfor");
3630 - return;
3631 - }
3632 -#ifdef CONFIG_XEN
3633 - if (current_addr < size) {
3634 - /*
3635 - * The e820 map finished before our requested size so
3636 - * extend the final entry to the requested address.
3637 - */
3638 - --i;
3639 - if (e820.map[i].type == E820_RAM)
3640 - e820.map[i].size -= current_addr - size;
3641 - else
3642 - add_memory_region(current_addr, size - current_addr, E820_RAM);
3643 - }
3644 -#endif
3645 - print_memory_map("limit_regions endfunc");
3646 -}
3647 -
3648 -/*
3649 - * This function checks if any part of the range <start,end> is mapped
3650 - * with type.
3651 - */
3652 -int
3653 -e820_any_mapped(u64 start, u64 end, unsigned type)
3654 -{
3655 - int i;
3656 -
3657 -#ifndef CONFIG_XEN
3658 - for (i = 0; i < e820.nr_map; i++) {
3659 - const struct e820entry *ei = &e820.map[i];
3660 -#else
3661 - if (!is_initial_xendomain())
3662 - return 0;
3663 - for (i = 0; i < machine_e820.nr_map; ++i) {
3664 - const struct e820entry *ei = &machine_e820.map[i];
3665 -#endif
3666 -
3667 - if (type && ei->type != type)
3668 - continue;
3669 - if (ei->addr >= end || ei->addr + ei->size <= start)
3670 - continue;
3671 - return 1;
3672 - }
3673 - return 0;
3674 -}
3675 -EXPORT_SYMBOL_GPL(e820_any_mapped);
3676 -
3677 - /*
3678 - * This function checks if the entire range <start,end> is mapped with type.
3679 - *
3680 - * Note: this function only works correct if the e820 table is sorted and
3681 - * not-overlapping, which is the case
3682 - */
3683 -int __init
3684 -e820_all_mapped(unsigned long s, unsigned long e, unsigned type)
3685 -{
3686 - u64 start = s;
3687 - u64 end = e;
3688 - int i;
3689 -
3690 -#ifndef CONFIG_XEN
3691 - for (i = 0; i < e820.nr_map; i++) {
3692 - struct e820entry *ei = &e820.map[i];
3693 -#else
3694 - if (!is_initial_xendomain())
3695 - return 0;
3696 - for (i = 0; i < machine_e820.nr_map; ++i) {
3697 - const struct e820entry *ei = &machine_e820.map[i];
3698 -#endif
3699 -
3700 - if (type && ei->type != type)
3701 - continue;
3702 - /* is the region (part) in overlap with the current region ?*/
3703 - if (ei->addr >= end || ei->addr + ei->size <= start)
3704 - continue;
3705 - /* if the region is at the beginning of <start,end> we move
3706 - * start to the end of the region since it's ok until there
3707 - */
3708 - if (ei->addr <= start)
3709 - start = ei->addr + ei->size;
3710 - /* if start is now at or beyond end, we're done, full
3711 - * coverage */
3712 - if (start >= end)
3713 - return 1; /* we're done */
3714 - }
3715 - return 0;
3716 -}
3717 -
3718 -static int __init parse_memmap(char *arg)
3719 -{
3720 - if (!arg)
3721 - return -EINVAL;
3722 -
3723 - if (strcmp(arg, "exactmap") == 0) {
3724 -#ifdef CONFIG_CRASH_DUMP
3725 - /* If we are doing a crash dump, we
3726 - * still need to know the real mem
3727 - * size before original memory map is
3728 - * reset.
3729 - */
3730 - propagate_e820_map();
3731 - saved_max_pfn = max_pfn;
3732 -#endif
3733 - e820.nr_map = 0;
3734 - user_defined_memmap = 1;
3735 - } else {
3736 - /* If the user specifies memory size, we
3737 - * limit the BIOS-provided memory map to
3738 - * that size. exactmap can be used to specify
3739 - * the exact map. mem=number can be used to
3740 - * trim the existing memory map.
3741 - */
3742 - unsigned long long start_at, mem_size;
3743 -
3744 - mem_size = memparse(arg, &arg);
3745 - if (*arg == '@') {
3746 - start_at = memparse(arg+1, &arg);
3747 - add_memory_region(start_at, mem_size, E820_RAM);
3748 - } else if (*arg == '#') {
3749 - start_at = memparse(arg+1, &arg);
3750 - add_memory_region(start_at, mem_size, E820_ACPI);
3751 - } else if (*arg == '$') {
3752 - start_at = memparse(arg+1, &arg);
3753 - add_memory_region(start_at, mem_size, E820_RESERVED);
3754 - } else {
3755 - limit_regions(mem_size);
3756 - user_defined_memmap = 1;
3757 - }
3758 - }
3759 - return 0;
3760 -}
3761 -early_param("memmap", parse_memmap);
3762 -
3763 -#ifndef CONFIG_XEN
3764 -void __init update_memory_range(u64 start, u64 size, unsigned old_type,
3765 - unsigned new_type)
3766 -{
3767 - int i;
3768 -
3769 - BUG_ON(old_type == new_type);
3770 -
3771 - for (i = 0; i < e820.nr_map; i++) {
3772 - struct e820entry *ei = &e820.map[i];
3773 - u64 final_start, final_end;
3774 - if (ei->type != old_type)
3775 - continue;
3776 - /* totally covered? */
3777 - if (ei->addr >= start && ei->size <= size) {
3778 - ei->type = new_type;
3779 - continue;
3780 - }
3781 - /* partially covered */
3782 - final_start = max(start, ei->addr);
3783 - final_end = min(start + size, ei->addr + ei->size);
3784 - if (final_start >= final_end)
3785 - continue;
3786 - add_memory_region(final_start, final_end - final_start,
3787 - new_type);
3788 - }
3789 -}
3790 -
3791 -void __init update_e820(void)
3792 -{
3793 - u8 nr_map;
3794 -
3795 - nr_map = e820.nr_map;
3796 - if (sanitize_e820_map(e820.map, &nr_map))
3797 - return;
3798 - e820.nr_map = nr_map;
3799 - printk(KERN_INFO "modified physical RAM map:\n");
3800 - print_memory_map("modified");
3801 -}
3802 -#endif
3803 --- sle11-2009-10-16.orig/arch/x86/kernel/e820_64-xen.c 2009-03-16 16:38:05.000000000 +0100
3804 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
3805 @@ -1,1045 +0,0 @@
3806 -/*
3807 - * Handle the memory map.
3808 - * The functions here do the job until bootmem takes over.
3809 - *
3810 - * Getting sanitize_e820_map() in sync with i386 version by applying change:
3811 - * - Provisions for empty E820 memory regions (reported by certain BIOSes).
3812 - * Alex Achenbach <xela@slit.de>, December 2002.
3813 - * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
3814 - *
3815 - */
3816 -#include <linux/kernel.h>
3817 -#include <linux/types.h>
3818 -#include <linux/init.h>
3819 -#include <linux/bootmem.h>
3820 -#include <linux/ioport.h>
3821 -#include <linux/string.h>
3822 -#include <linux/kexec.h>
3823 -#include <linux/module.h>
3824 -#include <linux/mm.h>
3825 -#include <linux/suspend.h>
3826 -#include <linux/pfn.h>
3827 -
3828 -#include <asm/pgtable.h>
3829 -#include <asm/page.h>
3830 -#include <asm/e820.h>
3831 -#include <asm/proto.h>
3832 -#include <asm/setup.h>
3833 -#include <asm/sections.h>
3834 -#include <asm/kdebug.h>
3835 -#include <xen/interface/memory.h>
3836 -
3837 -struct e820map e820 __initdata;
3838 -#ifdef CONFIG_XEN
3839 -struct e820map machine_e820;
3840 -#endif
3841 -
3842 -/*
3843 - * PFN of last memory page.
3844 - */
3845 -unsigned long end_pfn;
3846 -
3847 -/*
3848 - * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
3849 - * The direct mapping extends to max_pfn_mapped, so that we can directly access
3850 - * apertures, ACPI and other tables without having to play with fixmaps.
3851 - */
3852 -unsigned long max_pfn_mapped;
3853 -
3854 -/*
3855 - * Last pfn which the user wants to use.
3856 - */
3857 -static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT;
3858 -
3859 -/*
3860 - * Early reserved memory areas.
3861 - */
3862 -#define MAX_EARLY_RES 20
3863 -
3864 -struct early_res {
3865 - unsigned long start, end;
3866 - char name[16];
3867 -};
3868 -static struct early_res early_res[MAX_EARLY_RES] __initdata = {
3869 -#ifndef CONFIG_XEN
3870 - { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */
3871 -#ifdef CONFIG_X86_TRAMPOLINE
3872 - { TRAMPOLINE_BASE, TRAMPOLINE_BASE + 2 * PAGE_SIZE, "TRAMPOLINE" },
3873 -#endif
3874 -#endif
3875 - {}
3876 -};
3877 -
3878 -void __init reserve_early(unsigned long start, unsigned long end, char *name)
3879 -{
3880 - int i;
3881 - struct early_res *r;
3882 - for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
3883 - r = &early_res[i];
3884 - if (end > r->start && start < r->end)
3885 - panic("Overlapping early reservations %lx-%lx %s to %lx-%lx %s\n",
3886 - start, end - 1, name?name:"", r->start, r->end - 1, r->name);
3887 - }
3888 - if (i >= MAX_EARLY_RES)
3889 - panic("Too many early reservations");
3890 - r = &early_res[i];
3891 - r->start = start;
3892 - r->end = end;
3893 - if (name)
3894 - strncpy(r->name, name, sizeof(r->name) - 1);
3895 -}
3896 -
3897 -void __init free_early(unsigned long start, unsigned long end)
3898 -{
3899 - struct early_res *r;
3900 - int i, j;
3901 -
3902 - for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
3903 - r = &early_res[i];
3904 - if (start == r->start && end == r->end)
3905 - break;
3906 - }
3907 - if (i >= MAX_EARLY_RES || !early_res[i].end)
3908 - panic("free_early on not reserved area: %lx-%lx!", start, end);
3909 -
3910 - for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++)
3911 - ;
3912 -
3913 - memmove(&early_res[i], &early_res[i + 1],
3914 - (j - 1 - i) * sizeof(struct early_res));
3915 -
3916 - early_res[j - 1].end = 0;
3917 -}
3918 -
3919 -void __init early_res_to_bootmem(unsigned long start, unsigned long end)
3920 -{
3921 - int i;
3922 - unsigned long final_start, final_end;
3923 - for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
3924 - struct early_res *r = &early_res[i];
3925 - final_start = max(start, r->start);
3926 - final_end = min(end, r->end);
3927 - if (final_start >= final_end)
3928 - continue;
3929 - printk(KERN_INFO " early res: %d [%lx-%lx] %s\n", i,
3930 - final_start, final_end - 1, r->name);
3931 - reserve_bootmem_generic(final_start, final_end - final_start);
3932 - }
3933 -}
3934 -
3935 -/* Check for already reserved areas */
3936 -static inline int __init
3937 -bad_addr(unsigned long *addrp, unsigned long size, unsigned long align)
3938 -{
3939 - int i;
3940 - unsigned long addr = *addrp, last;
3941 - int changed = 0;
3942 -again:
3943 - last = addr + size;
3944 - for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
3945 - struct early_res *r = &early_res[i];
3946 - if (last >= r->start && addr < r->end) {
3947 - *addrp = addr = round_up(r->end, align);
3948 - changed = 1;
3949 - goto again;
3950 - }
3951 - }
3952 - return changed;
3953 -}
3954 -
3955 -/* Check for already reserved areas */
3956 -static inline int __init
3957 -bad_addr_size(unsigned long *addrp, unsigned long *sizep, unsigned long align)
3958 -{
3959 - int i;
3960 - unsigned long addr = *addrp, last;
3961 - unsigned long size = *sizep;
3962 - int changed = 0;
3963 -again:
3964 - last = addr + size;
3965 - for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
3966 - struct early_res *r = &early_res[i];
3967 - if (last > r->start && addr < r->start) {
3968 - size = r->start - addr;
3969 - changed = 1;
3970 - goto again;
3971 - }
3972 - if (last > r->end && addr < r->end) {
3973 - addr = round_up(r->end, align);
3974 - size = last - addr;
3975 - changed = 1;
3976 - goto again;
3977 - }
3978 - if (last <= r->end && addr >= r->start) {
3979 - (*sizep)++;
3980 - return 0;
3981 - }
3982 - }
3983 - if (changed) {
3984 - *addrp = addr;
3985 - *sizep = size;
3986 - }
3987 - return changed;
3988 -}
3989 -/*
3990 - * This function checks if any part of the range <start,end> is mapped
3991 - * with type.
3992 - */
3993 -int
3994 -e820_any_mapped(unsigned long start, unsigned long end, unsigned type)
3995 -{
3996 - int i;
3997 -
3998 -#ifndef CONFIG_XEN
3999 - for (i = 0; i < e820.nr_map; i++) {
4000 - struct e820entry *ei = &e820.map[i];
4001 -#else
4002 - if (!is_initial_xendomain())
4003 - return 0;
4004 - for (i = 0; i < machine_e820.nr_map; i++) {
4005 - const struct e820entry *ei = &machine_e820.map[i];
4006 -#endif
4007 -
4008 - if (type && ei->type != type)
4009 - continue;
4010 - if (ei->addr >= end || ei->addr + ei->size <= start)
4011 - continue;
4012 - return 1;
4013 - }
4014 - return 0;
4015 -}
4016 -EXPORT_SYMBOL_GPL(e820_any_mapped);
4017 -
4018 -/*
4019 - * This function checks if the entire range <start,end> is mapped with type.
4020 - *
4021 - * Note: this function only works correct if the e820 table is sorted and
4022 - * not-overlapping, which is the case
4023 - */
4024 -int __init e820_all_mapped(unsigned long start, unsigned long end,
4025 - unsigned type)
4026 -{
4027 - int i;
4028 -
4029 -#ifndef CONFIG_XEN
4030 - for (i = 0; i < e820.nr_map; i++) {
4031 - struct e820entry *ei = &e820.map[i];
4032 -#else
4033 - if (!is_initial_xendomain())
4034 - return 0;
4035 - for (i = 0; i < machine_e820.nr_map; i++) {
4036 - const struct e820entry *ei = &machine_e820.map[i];
4037 -#endif
4038 -
4039 - if (type && ei->type != type)
4040 - continue;
4041 - /* is the region (part) in overlap with the current region ?*/
4042 - if (ei->addr >= end || ei->addr + ei->size <= start)
4043 - continue;
4044 -
4045 - /* if the region is at the beginning of <start,end> we move
4046 - * start to the end of the region since it's ok until there
4047 - */
4048 - if (ei->addr <= start)
4049 - start = ei->addr + ei->size;
4050 - /*
4051 - * if start is now at or beyond end, we're done, full
4052 - * coverage
4053 - */
4054 - if (start >= end)
4055 - return 1;
4056 - }
4057 - return 0;
4058 -}
4059 -
4060 -/*
4061 - * Find a free area with specified alignment in a specific range.
4062 - */
4063 -unsigned long __init find_e820_area(unsigned long start, unsigned long end,
4064 - unsigned long size, unsigned long align)
4065 -{
4066 - int i;
4067 -
4068 - for (i = 0; i < e820.nr_map; i++) {
4069 - struct e820entry *ei = &e820.map[i];
4070 - unsigned long addr, last;
4071 - unsigned long ei_last;
4072 -
4073 - if (ei->type != E820_RAM)
4074 - continue;
4075 - addr = round_up(ei->addr, align);
4076 - ei_last = ei->addr + ei->size;
4077 - if (addr < start)
4078 - addr = round_up(start, align);
4079 - if (addr >= ei_last)
4080 - continue;
4081 - while (bad_addr(&addr, size, align) && addr+size <= ei_last)
4082 - ;
4083 - last = addr + size;
4084 - if (last > ei_last)
4085 - continue;
4086 - if (last > end)
4087 - continue;
4088 - return addr;
4089 - }
4090 - return -1UL;
4091 -}
4092 -
4093 -/*
4094 - * Find next free range after *start
4095 - */
4096 -unsigned long __init find_e820_area_size(unsigned long start,
4097 - unsigned long *sizep,
4098 - unsigned long align)
4099 -{
4100 - int i;
4101 -
4102 - for (i = 0; i < e820.nr_map; i++) {
4103 - struct e820entry *ei = &e820.map[i];
4104 - unsigned long addr, last;
4105 - unsigned long ei_last;
4106 -
4107 - if (ei->type != E820_RAM)
4108 - continue;
4109 - addr = round_up(ei->addr, align);
4110 - ei_last = ei->addr + ei->size;
4111 - if (addr < start)
4112 - addr = round_up(start, align);
4113 - if (addr >= ei_last)
4114 - continue;
4115 - *sizep = ei_last - addr;
4116 - while (bad_addr_size(&addr, sizep, align) &&
4117 - addr + *sizep <= ei_last)
4118 - ;
4119 - last = addr + *sizep;
4120 - if (last > ei_last)
4121 - continue;
4122 - return addr;
4123 - }
4124 - return -1UL;
4125 -
4126 -}
4127 -/*
4128 - * Find the highest page frame number we have available
4129 - */
4130 -unsigned long __init e820_end_of_ram(void)
4131 -{
4132 - unsigned long end_pfn;
4133 -
4134 - end_pfn = find_max_pfn_with_active_regions();
4135 -
4136 - if (end_pfn > max_pfn_mapped)
4137 - max_pfn_mapped = end_pfn;
4138 - if (max_pfn_mapped > MAXMEM>>PAGE_SHIFT)
4139 - max_pfn_mapped = MAXMEM>>PAGE_SHIFT;
4140 - if (end_pfn > end_user_pfn)
4141 - end_pfn = end_user_pfn;
4142 - if (end_pfn > max_pfn_mapped)
4143 - end_pfn = max_pfn_mapped;
4144 -
4145 - printk(KERN_INFO "max_pfn_mapped = %lu\n", max_pfn_mapped);
4146 - return end_pfn;
4147 -}
4148 -
4149 -/*
4150 - * Mark e820 reserved areas as busy for the resource manager.
4151 - */
4152 -void __init e820_reserve_resources(struct e820entry *e820, int nr_map)
4153 -{
4154 - int i;
4155 - struct resource *res;
4156 -
4157 - res = alloc_bootmem_low(sizeof(struct resource) * nr_map);
4158 - for (i = 0; i < nr_map; i++) {
4159 - switch (e820[i].type) {
4160 - case E820_RAM: res->name = "System RAM"; break;
4161 - case E820_ACPI: res->name = "ACPI Tables"; break;
4162 - case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
4163 - default: res->name = "reserved";
4164 - }
4165 - res->start = e820[i].addr;
4166 - res->end = res->start + e820[i].size - 1;
4167 - res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
4168 - insert_resource(&iomem_resource, res);
4169 - res++;
4170 - }
4171 -}
4172 -
4173 -#ifndef CONFIG_XEN
4174 -/*
4175 - * Find the ranges of physical addresses that do not correspond to
4176 - * e820 RAM areas and mark the corresponding pages as nosave for software
4177 - * suspend and suspend to RAM.
4178 - *
4179 - * This function requires the e820 map to be sorted and without any
4180 - * overlapping entries and assumes the first e820 area to be RAM.
4181 - */
4182 -void __init e820_mark_nosave_regions(void)
4183 -{
4184 - int i;
4185 - unsigned long paddr;
4186 -
4187 - paddr = round_down(e820.map[0].addr + e820.map[0].size, PAGE_SIZE);
4188 - for (i = 1; i < e820.nr_map; i++) {
4189 - struct e820entry *ei = &e820.map[i];
4190 -
4191 - if (paddr < ei->addr)
4192 - register_nosave_region(PFN_DOWN(paddr),
4193 - PFN_UP(ei->addr));
4194 -
4195 - paddr = round_down(ei->addr + ei->size, PAGE_SIZE);
4196 - if (ei->type != E820_RAM)
4197 - register_nosave_region(PFN_UP(ei->addr),
4198 - PFN_DOWN(paddr));
4199 -
4200 - if (paddr >= (end_pfn << PAGE_SHIFT))
4201 - break;
4202 - }
4203 -}
4204 -#endif
4205 -
4206 -/*
4207 - * Finds an active region in the address range from start_pfn to end_pfn and
4208 - * returns its range in ei_startpfn and ei_endpfn for the e820 entry.
4209 - */
4210 -static int __init e820_find_active_region(const struct e820entry *ei,
4211 - unsigned long start_pfn,
4212 - unsigned long end_pfn,
4213 - unsigned long *ei_startpfn,
4214 - unsigned long *ei_endpfn)
4215 -{
4216 - *ei_startpfn = round_up(ei->addr, PAGE_SIZE) >> PAGE_SHIFT;
4217 - *ei_endpfn = round_down(ei->addr + ei->size, PAGE_SIZE) >> PAGE_SHIFT;
4218 -
4219 - /* Skip map entries smaller than a page */
4220 - if (*ei_startpfn >= *ei_endpfn)
4221 - return 0;
4222 -
4223 - /* Check if max_pfn_mapped should be updated */
4224 - if (ei->type != E820_RAM && *ei_endpfn > max_pfn_mapped)
4225 - max_pfn_mapped = *ei_endpfn;
4226 -
4227 - /* Skip if map is outside the node */
4228 - if (ei->type != E820_RAM || *ei_endpfn <= start_pfn ||
4229 - *ei_startpfn >= end_pfn)
4230 - return 0;
4231 -
4232 - /* Check for overlaps */
4233 - if (*ei_startpfn < start_pfn)
4234 - *ei_startpfn = start_pfn;
4235 - if (*ei_endpfn > end_pfn)
4236 - *ei_endpfn = end_pfn;
4237 -
4238 - /* Obey end_user_pfn to save on memmap */
4239 - if (*ei_startpfn >= end_user_pfn)
4240 - return 0;
4241 - if (*ei_endpfn > end_user_pfn)
4242 - *ei_endpfn = end_user_pfn;
4243 -
4244 - return 1;
4245 -}
4246 -
4247 -/* Walk the e820 map and register active regions within a node */
4248 -void __init
4249 -e820_register_active_regions(int nid, unsigned long start_pfn,
4250 - unsigned long end_pfn)
4251 -{
4252 - unsigned long ei_startpfn;
4253 - unsigned long ei_endpfn;
4254 - int i;
4255 -
4256 - for (i = 0; i < e820.nr_map; i++)
4257 - if (e820_find_active_region(&e820.map[i],
4258 - start_pfn, end_pfn,
4259 - &ei_startpfn, &ei_endpfn))
4260 - add_active_range(nid, ei_startpfn, ei_endpfn);
4261 -}
4262 -
4263 -/*
4264 - * Add a memory region to the kernel e820 map.
4265 - */
4266 -void __init add_memory_region(unsigned long start, unsigned long size, int type)
4267 -{
4268 - int x = e820.nr_map;
4269 -
4270 - if (x == E820MAX) {
4271 - printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
4272 - return;
4273 - }
4274 -
4275 - e820.map[x].addr = start;
4276 - e820.map[x].size = size;
4277 - e820.map[x].type = type;
4278 - e820.nr_map++;
4279 -}
4280 -
4281 -/*
4282 - * Find the hole size (in bytes) in the memory range.
4283 - * @start: starting address of the memory range to scan
4284 - * @end: ending address of the memory range to scan
4285 - */
4286 -unsigned long __init e820_hole_size(unsigned long start, unsigned long end)
4287 -{
4288 - unsigned long start_pfn = start >> PAGE_SHIFT;
4289 - unsigned long end_pfn = end >> PAGE_SHIFT;
4290 - unsigned long ei_startpfn, ei_endpfn, ram = 0;
4291 - int i;
4292 -
4293 - for (i = 0; i < e820.nr_map; i++) {
4294 - if (e820_find_active_region(&e820.map[i],
4295 - start_pfn, end_pfn,
4296 - &ei_startpfn, &ei_endpfn))
4297 - ram += ei_endpfn - ei_startpfn;
4298 - }
4299 - return end - start - (ram << PAGE_SHIFT);
4300 -}
4301 -
4302 -static void __init e820_print_map(char *who)
4303 -{
4304 - int i;
4305 -
4306 - for (i = 0; i < e820.nr_map; i++) {
4307 - printk(KERN_INFO " %s: %016Lx - %016Lx ", who,
4308 - (unsigned long long) e820.map[i].addr,
4309 - (unsigned long long)
4310 - (e820.map[i].addr + e820.map[i].size));
4311 - switch (e820.map[i].type) {
4312 - case E820_RAM:
4313 - printk(KERN_CONT "(usable)\n");
4314 - break;
4315 - case E820_RESERVED:
4316 - printk(KERN_CONT "(reserved)\n");
4317 - break;
4318 - case E820_ACPI:
4319 - printk(KERN_CONT "(ACPI data)\n");
4320 - break;
4321 - case E820_NVS:
4322 - printk(KERN_CONT "(ACPI NVS)\n");
4323 - break;
4324 - default:
4325 - printk(KERN_CONT "type %u\n", e820.map[i].type);
4326 - break;
4327 - }
4328 - }
4329 -}
4330 -
4331 -/*
4332 - * Sanitize the BIOS e820 map.
4333 - *
4334 - * Some e820 responses include overlapping entries. The following
4335 - * replaces the original e820 map with a new one, removing overlaps.
4336 - *
4337 - */
4338 -static int __init sanitize_e820_map(struct e820entry *biosmap, char *pnr_map)
4339 -{
4340 - struct change_member {
4341 - struct e820entry *pbios; /* pointer to original bios entry */
4342 - unsigned long long addr; /* address for this change point */
4343 - };
4344 - static struct change_member change_point_list[2*E820MAX] __initdata;
4345 - static struct change_member *change_point[2*E820MAX] __initdata;
4346 - static struct e820entry *overlap_list[E820MAX] __initdata;
4347 - static struct e820entry new_bios[E820MAX] __initdata;
4348 - struct change_member *change_tmp;
4349 - unsigned long current_type, last_type;
4350 - unsigned long long last_addr;
4351 - int chgidx, still_changing;
4352 - int overlap_entries;
4353 - int new_bios_entry;
4354 - int old_nr, new_nr, chg_nr;
4355 - int i;
4356 -
4357 - /*
4358 - Visually we're performing the following
4359 - (1,2,3,4 = memory types)...
4360 -
4361 - Sample memory map (w/overlaps):
4362 - ____22__________________
4363 - ______________________4_
4364 - ____1111________________
4365 - _44_____________________
4366 - 11111111________________
4367 - ____________________33__
4368 - ___________44___________
4369 - __________33333_________
4370 - ______________22________
4371 - ___________________2222_
4372 - _________111111111______
4373 - _____________________11_
4374 - _________________4______
4375 -
4376 - Sanitized equivalent (no overlap):
4377 - 1_______________________
4378 - _44_____________________
4379 - ___1____________________
4380 - ____22__________________
4381 - ______11________________
4382 - _________1______________
4383 - __________3_____________
4384 - ___________44___________
4385 - _____________33_________
4386 - _______________2________
4387 - ________________1_______
4388 - _________________4______
4389 - ___________________2____
4390 - ____________________33__
4391 - ______________________4_
4392 - */
4393 -
4394 - /* if there's only one memory region, don't bother */
4395 - if (*pnr_map < 2)
4396 - return -1;
4397 -
4398 - old_nr = *pnr_map;
4399 -
4400 - /* bail out if we find any unreasonable addresses in bios map */
4401 - for (i = 0; i < old_nr; i++)
4402 - if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
4403 - return -1;
4404 -
4405 - /* create pointers for initial change-point information (for sorting) */
4406 - for (i = 0; i < 2 * old_nr; i++)
4407 - change_point[i] = &change_point_list[i];
4408 -
4409 - /* record all known change-points (starting and ending addresses),
4410 - omitting those that are for empty memory regions */
4411 - chgidx = 0;
4412 - for (i = 0; i < old_nr; i++) {
4413 - if (biosmap[i].size != 0) {
4414 - change_point[chgidx]->addr = biosmap[i].addr;
4415 - change_point[chgidx++]->pbios = &biosmap[i];
4416 - change_point[chgidx]->addr = biosmap[i].addr +
4417 - biosmap[i].size;
4418 - change_point[chgidx++]->pbios = &biosmap[i];
4419 - }
4420 - }
4421 - chg_nr = chgidx;
4422 -
4423 - /* sort change-point list by memory addresses (low -> high) */
4424 - still_changing = 1;
4425 - while (still_changing) {
4426 - still_changing = 0;
4427 - for (i = 1; i < chg_nr; i++) {
4428 - unsigned long long curaddr, lastaddr;
4429 - unsigned long long curpbaddr, lastpbaddr;
4430 -
4431 - curaddr = change_point[i]->addr;
4432 - lastaddr = change_point[i - 1]->addr;
4433 - curpbaddr = change_point[i]->pbios->addr;
4434 - lastpbaddr = change_point[i - 1]->pbios->addr;
4435 -
4436 - /*
4437 - * swap entries, when:
4438 - *
4439 - * curaddr > lastaddr or
4440 - * curaddr == lastaddr and curaddr == curpbaddr and
4441 - * lastaddr != lastpbaddr
4442 - */
4443 - if (curaddr < lastaddr ||
4444 - (curaddr == lastaddr && curaddr == curpbaddr &&
4445 - lastaddr != lastpbaddr)) {
4446 - change_tmp = change_point[i];
4447 - change_point[i] = change_point[i-1];
4448 - change_point[i-1] = change_tmp;
4449 - still_changing = 1;
4450 - }
4451 - }
4452 - }
4453 -
4454 - /* create a new bios memory map, removing overlaps */
4455 - overlap_entries = 0; /* number of entries in the overlap table */
4456 - new_bios_entry = 0; /* index for creating new bios map entries */
4457 - last_type = 0; /* start with undefined memory type */
4458 - last_addr = 0; /* start with 0 as last starting address */
4459 -
4460 - /* loop through change-points, determining affect on the new bios map */
4461 - for (chgidx = 0; chgidx < chg_nr; chgidx++) {
4462 - /* keep track of all overlapping bios entries */
4463 - if (change_point[chgidx]->addr ==
4464 - change_point[chgidx]->pbios->addr) {
4465 - /*
4466 - * add map entry to overlap list (> 1 entry
4467 - * implies an overlap)
4468 - */
4469 - overlap_list[overlap_entries++] =
4470 - change_point[chgidx]->pbios;
4471 - } else {
4472 - /*
4473 - * remove entry from list (order independent,
4474 - * so swap with last)
4475 - */
4476 - for (i = 0; i < overlap_entries; i++) {
4477 - if (overlap_list[i] ==
4478 - change_point[chgidx]->pbios)
4479 - overlap_list[i] =
4480 - overlap_list[overlap_entries-1];
4481 - }
4482 - overlap_entries--;
4483 - }
4484 - /*
4485 - * if there are overlapping entries, decide which
4486 - * "type" to use (larger value takes precedence --
4487 - * 1=usable, 2,3,4,4+=unusable)
4488 - */
4489 - current_type = 0;
4490 - for (i = 0; i < overlap_entries; i++)
4491 - if (overlap_list[i]->type > current_type)
4492 - current_type = overlap_list[i]->type;
4493 - /*
4494 - * continue building up new bios map based on this
4495 - * information
4496 - */
4497 - if (current_type != last_type) {
4498 - if (last_type != 0) {
4499 - new_bios[new_bios_entry].size =
4500 - change_point[chgidx]->addr - last_addr;
4501 - /*
4502 - * move forward only if the new size
4503 - * was non-zero
4504 - */
4505 - if (new_bios[new_bios_entry].size != 0)
4506 - /*
4507 - * no more space left for new
4508 - * bios entries ?
4509 - */
4510 - if (++new_bios_entry >= E820MAX)
4511 - break;
4512 - }
4513 - if (current_type != 0) {
4514 - new_bios[new_bios_entry].addr =
4515 - change_point[chgidx]->addr;
4516 - new_bios[new_bios_entry].type = current_type;
4517 - last_addr = change_point[chgidx]->addr;
4518 - }
4519 - last_type = current_type;
4520 - }
4521 - }
4522 - /* retain count for new bios entries */
4523 - new_nr = new_bios_entry;
4524 -
4525 - /* copy new bios mapping into original location */
4526 - memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry));
4527 - *pnr_map = new_nr;
4528 -
4529 - return 0;
4530 -}
4531 -
4532 -/*
4533 - * Copy the BIOS e820 map into a safe place.
4534 - *
4535 - * Sanity-check it while we're at it..
4536 - *
4537 - * If we're lucky and live on a modern system, the setup code
4538 - * will have given us a memory map that we can use to properly
4539 - * set up memory. If we aren't, we'll fake a memory map.
4540 - */
4541 -static int __init copy_e820_map(struct e820entry *biosmap, int nr_map)
4542 -{
4543 -#ifndef CONFIG_XEN
4544 - /* Only one memory region (or negative)? Ignore it */
4545 - if (nr_map < 2)
4546 - return -1;
4547 -#else
4548 - BUG_ON(nr_map < 1);
4549 -#endif
4550 -
4551 - do {
4552 - u64 start = biosmap->addr;
4553 - u64 size = biosmap->size;
4554 - u64 end = start + size;
4555 - u32 type = biosmap->type;
4556 -
4557 - /* Overflow in 64 bits? Ignore the memory map. */
4558 - if (start > end)
4559 - return -1;
4560 -
4561 - add_memory_region(start, size, type);
4562 - } while (biosmap++, --nr_map);
4563 -
4564 -#ifdef CONFIG_XEN
4565 - if (is_initial_xendomain()) {
4566 - struct xen_memory_map memmap;
4567 -
4568 - memmap.nr_entries = E820MAX;
4569 - set_xen_guest_handle(memmap.buffer, machine_e820.map);
4570 -
4571 - if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap))
4572 - BUG();
4573 - machine_e820.nr_map = memmap.nr_entries;
4574 - } else
4575 - machine_e820 = e820;
4576 -#endif
4577 -
4578 - return 0;
4579 -}
4580 -
4581 -static void early_panic(char *msg)
4582 -{
4583 - early_printk(msg);
4584 - panic(msg);
4585 -}
4586 -
4587 -/* We're not void only for x86 32-bit compat */
4588 -char * __init machine_specific_memory_setup(void)
4589 -{
4590 -#ifndef CONFIG_XEN
4591 - char *who = "BIOS-e820";
4592 - /*
4593 - * Try to copy the BIOS-supplied E820-map.
4594 - *
4595 - * Otherwise fake a memory map; one section from 0k->640k,
4596 - * the next section from 1mb->appropriate_mem_k
4597 - */
4598 - sanitize_e820_map(boot_params.e820_map, &boot_params.e820_entries);
4599 - if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0)
4600 - early_panic("Cannot find a valid memory map");
4601 -#else /* CONFIG_XEN */
4602 - char *who = "Xen";
4603 - int rc;
4604 - struct xen_memory_map memmap;
4605 - /*
4606 - * This is rather large for a stack variable but this early in
4607 - * the boot process we know we have plenty slack space.
4608 - */
4609 - struct e820entry map[E820MAX];
4610 -
4611 - memmap.nr_entries = E820MAX;
4612 - set_xen_guest_handle(memmap.buffer, map);
4613 -
4614 - rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
4615 - if ( rc == -ENOSYS ) {
4616 - memmap.nr_entries = 1;
4617 - map[0].addr = 0ULL;
4618 - map[0].size = xen_start_info->nr_pages << PAGE_SHIFT;
4619 - /* 8MB slack (to balance backend allocations). */
4620 - map[0].size += 8 << 20;
4621 - map[0].type = E820_RAM;
4622 - rc = 0;
4623 - }
4624 - BUG_ON(rc);
4625 -
4626 - sanitize_e820_map(map, (char *)&memmap.nr_entries);
4627 -
4628 - if (copy_e820_map(map, (char)memmap.nr_entries) < 0)
4629 - early_panic("Cannot find a valid memory map");
4630 -#endif
4631 - printk(KERN_INFO "BIOS-provided physical RAM map:\n");
4632 - e820_print_map(who);
4633 -
4634 - /* In case someone cares... */
4635 - return who;
4636 -}
4637 -
4638 -static int __init parse_memopt(char *p)
4639 -{
4640 - int i;
4641 - unsigned long current_end;
4642 - unsigned long end;
4643 -
4644 - if (!p)
4645 - return -EINVAL;
4646 - end_user_pfn = memparse(p, &p);
4647 - end_user_pfn >>= PAGE_SHIFT;
4648 -
4649 - end = end_user_pfn<<PAGE_SHIFT;
4650 - i = e820.nr_map-1;
4651 - current_end = e820.map[i].addr + e820.map[i].size;
4652 -
4653 - if (current_end < end) {
4654 - /*
4655 - * The e820 map ends before our requested size so
4656 - * extend the final entry to the requested address.
4657 - */
4658 - if (e820.map[i].type == E820_RAM)
4659 - e820.map[i].size = end - e820.map[i].addr;
4660 - else
4661 - add_memory_region(current_end, end - current_end, E820_RAM);
4662 - }
4663 -
4664 - return 0;
4665 -}
4666 -early_param("mem", parse_memopt);
4667 -
4668 -static int userdef __initdata;
4669 -
4670 -static int __init parse_memmap_opt(char *p)
4671 -{
4672 - char *oldp;
4673 - unsigned long long start_at, mem_size;
4674 -
4675 - if (!strcmp(p, "exactmap")) {
4676 -#ifdef CONFIG_CRASH_DUMP
4677 - /*
4678 - * If we are doing a crash dump, we still need to know
4679 - * the real mem size before original memory map is
4680 - * reset.
4681 - */
4682 - e820_register_active_regions(0, 0, -1UL);
4683 - saved_max_pfn = e820_end_of_ram();
4684 - remove_all_active_ranges();
4685 -#endif
4686 - max_pfn_mapped = 0;
4687 - e820.nr_map = 0;
4688 - userdef = 1;
4689 - return 0;
4690 - }
4691 -
4692 - oldp = p;
4693 - mem_size = memparse(p, &p);
4694 - if (p == oldp)
4695 - return -EINVAL;
4696 -
4697 - userdef = 1;
4698 - if (*p == '@') {
4699 - start_at = memparse(p+1, &p);
4700 - add_memory_region(start_at, mem_size, E820_RAM);
4701 - } else if (*p == '#') {
4702 - start_at = memparse(p+1, &p);
4703 - add_memory_region(start_at, mem_size, E820_ACPI);
4704 - } else if (*p == '$') {
4705 - start_at = memparse(p+1, &p);
4706 - add_memory_region(start_at, mem_size, E820_RESERVED);
4707 - } else {
4708 - end_user_pfn = (mem_size >> PAGE_SHIFT);
4709 - }
4710 - return *p == '\0' ? 0 : -EINVAL;
4711 -}
4712 -early_param("memmap", parse_memmap_opt);
4713 -
4714 -void __init finish_e820_parsing(void)
4715 -{
4716 - if (userdef) {
4717 - char nr = e820.nr_map;
4718 -
4719 - if (sanitize_e820_map(e820.map, &nr) < 0)
4720 - early_panic("Invalid user supplied memory map");
4721 - e820.nr_map = nr;
4722 -
4723 - printk(KERN_INFO "user-defined physical RAM map:\n");
4724 - e820_print_map("user");
4725 - }
4726 -}
4727 -
4728 -#ifndef CONFIG_XEN
4729 -void __init update_memory_range(u64 start, u64 size, unsigned old_type,
4730 - unsigned new_type)
4731 -{
4732 - int i;
4733 -
4734 - BUG_ON(old_type == new_type);
4735 -
4736 - for (i = 0; i < e820.nr_map; i++) {
4737 - struct e820entry *ei = &e820.map[i];
4738 - u64 final_start, final_end;
4739 - if (ei->type != old_type)
4740 - continue;
4741 - /* totally covered? */
4742 - if (ei->addr >= start && ei->size <= size) {
4743 - ei->type = new_type;
4744 - continue;
4745 - }
4746 - /* partially covered */
4747 - final_start = max(start, ei->addr);
4748 - final_end = min(start + size, ei->addr + ei->size);
4749 - if (final_start >= final_end)
4750 - continue;
4751 - add_memory_region(final_start, final_end - final_start,
4752 - new_type);
4753 - }
4754 -}
4755 -
4756 -void __init update_e820(void)
4757 -{
4758 - u8 nr_map;
4759 -
4760 - nr_map = e820.nr_map;
4761 - if (sanitize_e820_map(e820.map, &nr_map))
4762 - return;
4763 - e820.nr_map = nr_map;
4764 - printk(KERN_INFO "modified physical RAM map:\n");
4765 - e820_print_map("modified");
4766 -}
4767 -#endif
4768 -
4769 -unsigned long pci_mem_start = 0xaeedbabe;
4770 -EXPORT_SYMBOL(pci_mem_start);
4771 -
4772 -/*
4773 - * Search for the biggest gap in the low 32 bits of the e820
4774 - * memory space. We pass this space to PCI to assign MMIO resources
4775 - * for hotplug or unconfigured devices in.
4776 - * Hopefully the BIOS let enough space left.
4777 - */
4778 -__init void e820_setup_gap(struct e820entry *e820, int nr_map)
4779 -{
4780 - unsigned long gapstart, gapsize, round;
4781 - unsigned long last;
4782 - int i;
4783 - int found = 0;
4784 -
4785 - last = 0x100000000ull;
4786 - gapstart = 0x10000000;
4787 - gapsize = 0x400000;
4788 - i = nr_map;
4789 - while (--i >= 0) {
4790 - unsigned long long start = e820[i].addr;
4791 - unsigned long long end = start + e820[i].size;
4792 -
4793 - /*
4794 - * Since "last" is at most 4GB, we know we'll
4795 - * fit in 32 bits if this condition is true
4796 - */
4797 - if (last > end) {
4798 - unsigned long gap = last - end;
4799 -
4800 - if (gap > gapsize) {
4801 - gapsize = gap;
4802 - gapstart = end;
4803 - found = 1;
4804 - }
4805 - }
4806 - if (start < last)
4807 - last = start;
4808 - }
4809 -
4810 - if (!found) {
4811 - gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024;
4812 - printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit "
4813 - "address range\n"
4814 - KERN_ERR "PCI: Unassigned devices with 32bit resource "
4815 - "registers may break!\n");
4816 - }
4817 -
4818 - /*
4819 - * See how much we want to round up: start off with
4820 - * rounding to the next 1MB area.
4821 - */
4822 - round = 0x100000;
4823 - while ((gapsize >> 4) > round)
4824 - round += round;
4825 - /* Fun with two's complement */
4826 - pci_mem_start = (gapstart + round) & -round;
4827 -
4828 - printk(KERN_INFO
4829 - "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
4830 - pci_mem_start, gapstart, gapsize);
4831 -}
4832 -
4833 -int __init arch_get_ram_range(int slot, u64 *addr, u64 *size)
4834 -{
4835 - int i;
4836 -
4837 - if (slot < 0 || slot >= e820.nr_map)
4838 - return -1;
4839 - for (i = slot; i < e820.nr_map; i++) {
4840 - if (e820.map[i].type != E820_RAM)
4841 - continue;
4842 - break;
4843 - }
4844 - if (i == e820.nr_map || e820.map[i].addr > (max_pfn << PAGE_SHIFT))
4845 - return -1;
4846 - *addr = e820.map[i].addr;
4847 - *size = min_t(u64, e820.map[i].size + e820.map[i].addr,
4848 - max_pfn << PAGE_SHIFT) - *addr;
4849 - return i + 1;
4850 -}
4851 --- sle11-2009-10-16.orig/arch/x86/kernel/early_printk-xen.c 2009-09-24 10:29:16.000000000 +0200
4852 +++ sle11-2009-10-16/arch/x86/kernel/early_printk-xen.c 2009-06-04 10:21:39.000000000 +0200
4853 @@ -225,7 +225,7 @@ static struct console simnow_console = {
4854 static struct console *early_console = &early_vga_console;
4855 static int early_console_initialized;
4856
4857 -void early_printk(const char *fmt, ...)
4858 +asmlinkage void early_printk(const char *fmt, ...)
4859 {
4860 char buf[512];
4861 int n;
4862 --- sle11-2009-10-16.orig/arch/x86/kernel/entry_32-xen.S 2009-03-16 16:38:05.000000000 +0100
4863 +++ sle11-2009-10-16/arch/x86/kernel/entry_32-xen.S 2009-06-04 10:21:39.000000000 +0200
4864 @@ -51,15 +51,26 @@
4865 #include <asm/percpu.h>
4866 #include <asm/dwarf2.h>
4867 #include <asm/processor-flags.h>
4868 -#include "irq_vectors.h"
4869 +#include <asm/ftrace.h>
4870 +#include <asm/irq_vectors.h>
4871 #include <xen/interface/xen.h>
4872
4873 +/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
4874 +#include <linux/elf-em.h>
4875 +#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE)
4876 +#define __AUDIT_ARCH_LE 0x40000000
4877 +
4878 +#ifndef CONFIG_AUDITSYSCALL
4879 +#define sysenter_audit syscall_trace_entry
4880 +#define sysexit_audit syscall_exit_work
4881 +#endif
4882 +
4883 /*
4884 * We use macros for low-level operations which need to be overridden
4885 * for paravirtualization. The following will never clobber any registers:
4886 * INTERRUPT_RETURN (aka. "iret")
4887 * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
4888 - * ENABLE_INTERRUPTS_SYSCALL_RET (aka "sti; sysexit").
4889 + * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
4890 *
4891 * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
4892 * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
4893 @@ -277,11 +288,6 @@ END(resume_kernel)
4894 #endif
4895 CFI_ENDPROC
4896
4897 - .macro test_tif ti_reg # system call tracing in operation / emulation
4898 - /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
4899 - testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(\ti_reg)
4900 - .endm
4901 -
4902 /* SYSENTER_RETURN points to after the "sysenter" instruction in
4903 the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */
4904
4905 @@ -338,8 +344,9 @@ sysenter_past_esp:
4906 .previous
4907
4908 GET_THREAD_INFO(%ebp)
4909 - test_tif %ebp
4910 - jnz syscall_trace_entry
4911 + testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
4912 + jnz sysenter_audit
4913 +sysenter_do_call:
4914 cmpl $(nr_syscalls), %eax
4915 jae syscall_badsys
4916 call *sys_call_table(,%eax,4)
4917 @@ -349,14 +356,54 @@ sysenter_past_esp:
4918 TRACE_IRQS_OFF
4919 movl TI_flags(%ebp), %ecx
4920 testw $_TIF_ALLWORK_MASK, %cx
4921 - jne syscall_exit_work
4922 + jne sysexit_audit
4923 +sysenter_exit:
4924 /* if something modifies registers it must also disable sysexit */
4925 movl PT_EIP(%esp), %edx
4926 movl PT_OLDESP(%esp), %ecx
4927 xorl %ebp,%ebp
4928 TRACE_IRQS_ON
4929 1: mov PT_FS(%esp), %fs
4930 - ENABLE_INTERRUPTS_SYSCALL_RET
4931 + ENABLE_INTERRUPTS_SYSEXIT
4932 +
4933 +#ifdef CONFIG_AUDITSYSCALL
4934 +sysenter_audit:
4935 + testw $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
4936 + jnz syscall_trace_entry
4937 + addl $4,%esp
4938 + CFI_ADJUST_CFA_OFFSET -4
4939 + /* %esi already in 8(%esp) 6th arg: 4th syscall arg */
4940 + /* %edx already in 4(%esp) 5th arg: 3rd syscall arg */
4941 + /* %ecx already in 0(%esp) 4th arg: 2nd syscall arg */
4942 + movl %ebx,%ecx /* 3rd arg: 1st syscall arg */
4943 + movl %eax,%edx /* 2nd arg: syscall number */
4944 + movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */
4945 + call audit_syscall_entry
4946 + pushl %ebx
4947 + CFI_ADJUST_CFA_OFFSET 4
4948 + movl PT_EAX(%esp),%eax /* reload syscall number */
4949 + jmp sysenter_do_call
4950 +
4951 +sysexit_audit:
4952 + testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx
4953 + jne syscall_exit_work
4954 + TRACE_IRQS_ON
4955 + ENABLE_INTERRUPTS(CLBR_ANY)
4956 + movl %eax,%edx /* second arg, syscall return value */
4957 + cmpl $0,%eax /* is it < 0? */
4958 + setl %al /* 1 if so, 0 if not */
4959 + movzbl %al,%eax /* zero-extend that */
4960 + inc %eax /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
4961 + call audit_syscall_exit
4962 + DISABLE_INTERRUPTS(CLBR_ANY)
4963 + TRACE_IRQS_OFF
4964 + movl TI_flags(%ebp), %ecx
4965 + testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx
4966 + jne syscall_exit_work
4967 + movl PT_EAX(%esp),%eax /* reload syscall return value */
4968 + jmp sysenter_exit
4969 +#endif
4970 +
4971 CFI_ENDPROC
4972 .pushsection .fixup,"ax"
4973 2: movl $0,PT_FS(%esp)
4974 @@ -400,7 +447,7 @@ ENTRY(system_call)
4975 CFI_ADJUST_CFA_OFFSET 4
4976 SAVE_ALL
4977 GET_THREAD_INFO(%ebp)
4978 - test_tif %ebp
4979 + testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
4980 jnz syscall_trace_entry
4981 cmpl $(nr_syscalls), %eax
4982 jae syscall_badsys
4983 @@ -413,10 +460,6 @@ syscall_exit:
4984 # setting need_resched or sigpending
4985 # between sampling and the iret
4986 TRACE_IRQS_OFF
4987 - testl $X86_EFLAGS_TF,PT_EFLAGS(%esp) # If tracing set singlestep flag on exit
4988 - jz no_singlestep
4989 - orl $_TIF_SINGLESTEP,TI_flags(%ebp)
4990 -no_singlestep:
4991 movl TI_flags(%ebp), %ecx
4992 testw $_TIF_ALLWORK_MASK, %cx # current->work
4993 jne syscall_exit_work
4994 @@ -588,12 +631,8 @@ END(work_pending)
4995 syscall_trace_entry:
4996 movl $-ENOSYS,PT_EAX(%esp)
4997 movl %esp, %eax
4998 - xorl %edx,%edx
4999 - call do_syscall_trace
5000 - cmpl $0, %eax
5001 - jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU,
5002 - # so must skip actual syscall
5003 - movl PT_ORIG_EAX(%esp), %eax
5004 + call syscall_trace_enter
5005 + /* What it returned is what we'll actually use. */
5006 cmpl $(nr_syscalls), %eax
5007 jnae syscall_call
5008 jmp syscall_exit
5009 @@ -602,14 +641,13 @@ END(syscall_trace_entry)
5010 # perform syscall exit tracing
5011 ALIGN
5012 syscall_exit_work:
5013 - testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl
5014 + testb $_TIF_WORK_SYSCALL_EXIT, %cl
5015 jz work_pending
5016 TRACE_IRQS_ON
5017 - ENABLE_INTERRUPTS(CLBR_ANY) # could let do_syscall_trace() call
5018 + ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call
5019 # schedule() instead
5020 movl %esp, %eax
5021 - movl $1, %edx
5022 - call do_syscall_trace
5023 + call syscall_trace_leave
5024 jmp resume_userspace
5025 END(syscall_exit_work)
5026 CFI_ENDPROC
5027 @@ -1113,10 +1151,10 @@ ENTRY(native_iret)
5028 .previous
5029 END(native_iret)
5030
5031 -ENTRY(native_irq_enable_syscall_ret)
5032 +ENTRY(native_irq_enable_sysexit)
5033 sti
5034 sysexit
5035 -END(native_irq_enable_syscall_ret)
5036 +END(native_irq_enable_sysexit)
5037 #endif
5038
5039 KPROBE_ENTRY(int3)
5040 @@ -1265,6 +1303,77 @@ ENTRY(kernel_thread_helper)
5041 CFI_ENDPROC
5042 ENDPROC(kernel_thread_helper)
5043
5044 +#ifdef CONFIG_FTRACE
5045 +#ifdef CONFIG_DYNAMIC_FTRACE
5046 +
5047 +ENTRY(mcount)
5048 + pushl %eax
5049 + pushl %ecx
5050 + pushl %edx
5051 + movl 0xc(%esp), %eax
5052 + subl $MCOUNT_INSN_SIZE, %eax
5053 +
5054 +.globl mcount_call
5055 +mcount_call:
5056 + call ftrace_stub
5057 +
5058 + popl %edx
5059 + popl %ecx
5060 + popl %eax
5061 +
5062 + ret
5063 +END(mcount)
5064 +
5065 +ENTRY(ftrace_caller)
5066 + pushl %eax
5067 + pushl %ecx
5068 + pushl %edx
5069 + movl 0xc(%esp), %eax
5070 + movl 0x4(%ebp), %edx
5071 + subl $MCOUNT_INSN_SIZE, %eax
5072 +
5073 +.globl ftrace_call
5074 +ftrace_call:
5075 + call ftrace_stub
5076 +
5077 + popl %edx
5078 + popl %ecx
5079 + popl %eax
5080 +
5081 +.globl ftrace_stub
5082 +ftrace_stub:
5083 + ret
5084 +END(ftrace_caller)
5085 +
5086 +#else /* ! CONFIG_DYNAMIC_FTRACE */
5087 +
5088 +ENTRY(mcount)
5089 + cmpl $ftrace_stub, ftrace_trace_function
5090 + jnz trace
5091 +.globl ftrace_stub
5092 +ftrace_stub:
5093 + ret
5094 +
5095 + /* taken from glibc */
5096 +trace:
5097 + pushl %eax
5098 + pushl %ecx
5099 + pushl %edx
5100 + movl 0xc(%esp), %eax
5101 + movl 0x4(%ebp), %edx
5102 + subl $MCOUNT_INSN_SIZE, %eax
5103 +
5104 + call *ftrace_trace_function
5105 +
5106 + popl %edx
5107 + popl %ecx
5108 + popl %eax
5109 +
5110 + jmp ftrace_stub
5111 +END(mcount)
5112 +#endif /* CONFIG_DYNAMIC_FTRACE */
5113 +#endif /* CONFIG_FTRACE */
5114 +
5115 #include <asm/alternative-asm.h>
5116
5117 # pv syscall call handler stub
5118 @@ -1290,7 +1399,7 @@ ENTRY(ia32pv_cstar_target)
5119 .previous
5120 SAVE_ALL
5121 GET_THREAD_INFO(%ebp)
5122 - test_tif %ebp
5123 + testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
5124 jnz cstar_trace_entry
5125 cmpl $nr_syscalls,%eax
5126 jae cstar_badsys
5127 @@ -1324,29 +1433,21 @@ cstar_trace_entry:
5128 btl %eax,cstar_special
5129 jc .Lcstar_trace_special
5130 1: movl %esp,%eax
5131 - xorl %edx,%edx
5132 LOCK_PREFIX
5133 orl $_TIF_CSTAR,TI_flags(%ebp)
5134 - call do_syscall_trace
5135 + call syscall_trace_enter
5136 LOCK_PREFIX
5137 andl $~_TIF_CSTAR,TI_flags(%ebp)
5138 - testl %eax,%eax
5139 - jne .Lcstar_resume # ret != 0 -> running under PTRACE_SYSEMU,
5140 - # so must skip actual syscall
5141 - movl PT_ORIG_EAX(%esp),%eax
5142 + /* What it returned is what we'll actually use. */
5143 cmpl $nr_syscalls,%eax
5144 jb .Lcstar_call
5145 jmp .Lcstar_exit
5146 .Lcstar_trace_special:
5147 movl PT_ECX(%esp),%ecx
5148 movl %esp,%eax
5149 - xorl %edx,%edx
5150 movl %ecx,PT_EBP(%esp) # put user EBP back in place
5151 - call do_syscall_trace
5152 - testl %eax,%eax
5153 - jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU,
5154 - # so must skip actual syscall
5155 - movl PT_ORIG_EAX(%esp),%eax
5156 + call syscall_trace_enter
5157 + /* What it returned is what we'll actually use. */
5158 cmpl $nr_syscalls,%eax
5159 jb syscall_call
5160 jmp syscall_exit
5161 --- sle11-2009-10-16.orig/arch/x86/kernel/entry_64.S 2009-10-28 14:55:02.000000000 +0100
5162 +++ sle11-2009-10-16/arch/x86/kernel/entry_64.S 2009-06-04 10:21:39.000000000 +0200
5163 @@ -1409,7 +1409,7 @@ ENTRY(arch_unwind_init_running)
5164 ENDPROC(arch_unwind_init_running)
5165 #endif
5166
5167 -#ifdef CONFIG_XEN
5168 +#ifdef CONFIG_PARAVIRT_XEN
5169 ENTRY(xen_hypervisor_callback)
5170 zeroentry xen_do_hypervisor_callback
5171 END(xen_hypervisor_callback)
5172 @@ -1507,7 +1507,7 @@ ENTRY(xen_failsafe_callback)
5173 CFI_ENDPROC
5174 END(xen_failsafe_callback)
5175
5176 -#endif /* CONFIG_XEN */
5177 +#endif /* CONFIG_PARAVIRT_XEN */
5178
5179 #ifdef CONFIG_KDB
5180
5181 --- sle11-2009-10-16.orig/arch/x86/kernel/entry_64-xen.S 2009-03-16 16:38:05.000000000 +0100
5182 +++ sle11-2009-10-16/arch/x86/kernel/entry_64-xen.S 2009-06-04 10:21:39.000000000 +0200
5183 @@ -53,19 +53,130 @@
5184 #include <asm/hw_irq.h>
5185 #include <asm/page.h>
5186 #include <asm/irqflags.h>
5187 +#include <asm/ftrace.h>
5188 #include <asm/errno.h>
5189 #include <xen/interface/xen.h>
5190 #include <xen/interface/features.h>
5191
5192 +/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
5193 +#include <linux/elf-em.h>
5194 +#define AUDIT_ARCH_X86_64 (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
5195 +#define __AUDIT_ARCH_64BIT 0x80000000
5196 +#define __AUDIT_ARCH_LE 0x40000000
5197 +
5198 .code64
5199
5200 +#ifdef CONFIG_FTRACE
5201 +#ifdef CONFIG_DYNAMIC_FTRACE
5202 +ENTRY(mcount)
5203 +
5204 + subq $0x38, %rsp
5205 + movq %rax, (%rsp)
5206 + movq %rcx, 8(%rsp)
5207 + movq %rdx, 16(%rsp)
5208 + movq %rsi, 24(%rsp)
5209 + movq %rdi, 32(%rsp)
5210 + movq %r8, 40(%rsp)
5211 + movq %r9, 48(%rsp)
5212 +
5213 + movq 0x38(%rsp), %rdi
5214 + subq $MCOUNT_INSN_SIZE, %rdi
5215 +
5216 +.globl mcount_call
5217 +mcount_call:
5218 + call ftrace_stub
5219 +
5220 + movq 48(%rsp), %r9
5221 + movq 40(%rsp), %r8
5222 + movq 32(%rsp), %rdi
5223 + movq 24(%rsp), %rsi
5224 + movq 16(%rsp), %rdx
5225 + movq 8(%rsp), %rcx
5226 + movq (%rsp), %rax
5227 + addq $0x38, %rsp
5228 +
5229 + retq
5230 +END(mcount)
5231 +
5232 +ENTRY(ftrace_caller)
5233 +
5234 + /* taken from glibc */
5235 + subq $0x38, %rsp
5236 + movq %rax, (%rsp)
5237 + movq %rcx, 8(%rsp)
5238 + movq %rdx, 16(%rsp)
5239 + movq %rsi, 24(%rsp)
5240 + movq %rdi, 32(%rsp)
5241 + movq %r8, 40(%rsp)
5242 + movq %r9, 48(%rsp)
5243 +
5244 + movq 0x38(%rsp), %rdi
5245 + movq 8(%rbp), %rsi
5246 + subq $MCOUNT_INSN_SIZE, %rdi
5247 +
5248 +.globl ftrace_call
5249 +ftrace_call:
5250 + call ftrace_stub
5251 +
5252 + movq 48(%rsp), %r9
5253 + movq 40(%rsp), %r8
5254 + movq 32(%rsp), %rdi
5255 + movq 24(%rsp), %rsi
5256 + movq 16(%rsp), %rdx
5257 + movq 8(%rsp), %rcx
5258 + movq (%rsp), %rax
5259 + addq $0x38, %rsp
5260 +
5261 +.globl ftrace_stub
5262 +ftrace_stub:
5263 + retq
5264 +END(ftrace_caller)
5265 +
5266 +#else /* ! CONFIG_DYNAMIC_FTRACE */
5267 +ENTRY(mcount)
5268 + cmpq $ftrace_stub, ftrace_trace_function
5269 + jnz trace
5270 +.globl ftrace_stub
5271 +ftrace_stub:
5272 + retq
5273 +
5274 +trace:
5275 + /* taken from glibc */
5276 + subq $0x38, %rsp
5277 + movq %rax, (%rsp)
5278 + movq %rcx, 8(%rsp)
5279 + movq %rdx, 16(%rsp)
5280 + movq %rsi, 24(%rsp)
5281 + movq %rdi, 32(%rsp)
5282 + movq %r8, 40(%rsp)
5283 + movq %r9, 48(%rsp)
5284 +
5285 + movq 0x38(%rsp), %rdi
5286 + movq 8(%rbp), %rsi
5287 + subq $MCOUNT_INSN_SIZE, %rdi
5288 +
5289 + call *ftrace_trace_function
5290 +
5291 + movq 48(%rsp), %r9
5292 + movq 40(%rsp), %r8
5293 + movq 32(%rsp), %rdi
5294 + movq 24(%rsp), %rsi
5295 + movq 16(%rsp), %rdx
5296 + movq 8(%rsp), %rcx
5297 + movq (%rsp), %rax
5298 + addq $0x38, %rsp
5299 +
5300 + jmp ftrace_stub
5301 +END(mcount)
5302 +#endif /* CONFIG_DYNAMIC_FTRACE */
5303 +#endif /* CONFIG_FTRACE */
5304 +
5305 #ifndef CONFIG_PREEMPT
5306 #define retint_kernel retint_restore_args
5307 #endif
5308
5309 #ifdef CONFIG_PARAVIRT
5310 -ENTRY(native_irq_enable_syscall_ret)
5311 - movq %gs:pda_oldrsp,%rsp
5312 +ENTRY(native_usergs_sysret64)
5313 swapgs
5314 sysretq
5315 #endif /* CONFIG_PARAVIRT */
5316 @@ -102,7 +213,7 @@ NMI_MASK = 0x80000000
5317 .macro FAKE_STACK_FRAME child_rip
5318 /* push in order ss, rsp, eflags, cs, rip */
5319 xorl %eax, %eax
5320 - pushq %rax /* ss */
5321 + pushq $__KERNEL_DS /* ss */
5322 CFI_ADJUST_CFA_OFFSET 8
5323 /*CFI_REL_OFFSET ss,0*/
5324 pushq %rax /* rsp */
5325 @@ -197,13 +308,13 @@ ENTRY(ret_from_fork)
5326 CFI_ADJUST_CFA_OFFSET -4
5327 call schedule_tail
5328 GET_THREAD_INFO(%rcx)
5329 - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx)
5330 + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
5331 jnz rff_trace
5332 rff_action:
5333 RESTORE_REST
5334 testl $3,CS-ARGOFFSET(%rsp) # from kernel_thread?
5335 je int_ret_from_sys_call
5336 - testl $_TIF_IA32,threadinfo_flags(%rcx)
5337 + testl $_TIF_IA32,TI_flags(%rcx)
5338 jnz int_ret_from_sys_call
5339 RESTORE_TOP_OF_STACK %rdi,ARGOFFSET
5340 jmp ret_from_sys_call
5341 @@ -265,8 +376,9 @@ ENTRY(system_call)
5342 SAVE_ARGS -8,0
5343 movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
5344 GET_THREAD_INFO(%rcx)
5345 - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx)
5346 + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx)
5347 jnz tracesys
5348 +system_call_fastpath:
5349 cmpq $__NR_syscall_max,%rax
5350 ja badsys
5351 movq %r10,%rcx
5352 @@ -284,7 +396,7 @@ sysret_check:
5353 GET_THREAD_INFO(%rcx)
5354 DISABLE_INTERRUPTS(CLBR_NONE)
5355 TRACE_IRQS_OFF
5356 - movl threadinfo_flags(%rcx),%edx
5357 + movl TI_flags(%rcx),%edx
5358 andl %edi,%edx
5359 jnz sysret_careful
5360 CFI_REMEMBER_STATE
5361 @@ -315,16 +427,16 @@ sysret_careful:
5362 sysret_signal:
5363 TRACE_IRQS_ON
5364 ENABLE_INTERRUPTS(CLBR_NONE)
5365 - testl $_TIF_DO_NOTIFY_MASK,%edx
5366 - jz 1f
5367 -
5368 - /* Really a signal */
5369 +#ifdef CONFIG_AUDITSYSCALL
5370 + bt $TIF_SYSCALL_AUDIT,%edx
5371 + jc sysret_audit
5372 +#endif
5373 /* edx: work flags (arg3) */
5374 leaq do_notify_resume(%rip),%rax
5375 leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1
5376 xorl %esi,%esi # oldset -> arg2
5377 call ptregscall_common
5378 -1: movl $_TIF_NEED_RESCHED,%edi
5379 + movl $_TIF_WORK_MASK,%edi
5380 /* Use IRET because user could have changed frame. This
5381 works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
5382 DISABLE_INTERRUPTS(CLBR_NONE)
5383 @@ -335,14 +447,56 @@ badsys:
5384 movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
5385 jmp ret_from_sys_call
5386
5387 +#ifdef CONFIG_AUDITSYSCALL
5388 + /*
5389 + * Fast path for syscall audit without full syscall trace.
5390 + * We just call audit_syscall_entry() directly, and then
5391 + * jump back to the normal fast path.
5392 + */
5393 +auditsys:
5394 + movq %r10,%r9 /* 6th arg: 4th syscall arg */
5395 + movq %rdx,%r8 /* 5th arg: 3rd syscall arg */
5396 + movq %rsi,%rcx /* 4th arg: 2nd syscall arg */
5397 + movq %rdi,%rdx /* 3rd arg: 1st syscall arg */
5398 + movq %rax,%rsi /* 2nd arg: syscall number */
5399 + movl $AUDIT_ARCH_X86_64,%edi /* 1st arg: audit arch */
5400 + call audit_syscall_entry
5401 + LOAD_ARGS 0 /* reload call-clobbered registers */
5402 + jmp system_call_fastpath
5403 +
5404 + /*
5405 + * Return fast path for syscall audit. Call audit_syscall_exit()
5406 + * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
5407 + * masked off.
5408 + */
5409 +sysret_audit:
5410 + movq %rax,%rsi /* second arg, syscall return value */
5411 + cmpq $0,%rax /* is it < 0? */
5412 + setl %al /* 1 if so, 0 if not */
5413 + movzbl %al,%edi /* zero-extend that into %edi */
5414 + inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
5415 + call audit_syscall_exit
5416 + movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
5417 + jmp sysret_check
5418 +#endif /* CONFIG_AUDITSYSCALL */
5419 +
5420 /* Do syscall tracing */
5421 tracesys:
5422 +#ifdef CONFIG_AUDITSYSCALL
5423 + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
5424 + jz auditsys
5425 +#endif
5426 SAVE_REST
5427 movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
5428 FIXUP_TOP_OF_STACK %rdi
5429 movq %rsp,%rdi
5430 call syscall_trace_enter
5431 - LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */
5432 + /*
5433 + * Reload arg registers from stack in case ptrace changed them.
5434 + * We don't reload %rax because syscall_trace_enter() returned
5435 + * the value it wants us to use in the table lookup.
5436 + */
5437 + LOAD_ARGS ARGOFFSET, 1
5438 RESTORE_REST
5439 cmpq $__NR_syscall_max,%rax
5440 ja int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */
5441 @@ -356,6 +510,7 @@ tracesys:
5442 * Has correct top of stack, but partial stack frame.
5443 */
5444 .globl int_ret_from_sys_call
5445 + .globl int_with_check
5446 int_ret_from_sys_call:
5447 DISABLE_INTERRUPTS(CLBR_NONE)
5448 TRACE_IRQS_OFF
5449 @@ -370,10 +525,10 @@ int_ret_from_sys_call:
5450 int_with_check:
5451 LOCKDEP_SYS_EXIT_IRQ
5452 GET_THREAD_INFO(%rcx)
5453 - movl threadinfo_flags(%rcx),%edx
5454 + movl TI_flags(%rcx),%edx
5455 andl %edi,%edx
5456 jnz int_careful
5457 - andl $~TS_COMPAT,threadinfo_status(%rcx)
5458 + andl $~TS_COMPAT,TI_status(%rcx)
5459 jmp retint_restore_args
5460
5461 /* Either reschedule or signal or syscall exit tracking needed. */
5462 @@ -399,7 +554,7 @@ int_very_careful:
5463 ENABLE_INTERRUPTS(CLBR_NONE)
5464 SAVE_REST
5465 /* Check for syscall exit trace */
5466 - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx
5467 + testl $_TIF_WORK_SYSCALL_EXIT,%edx
5468 jz int_signal
5469 pushq %rdi
5470 CFI_ADJUST_CFA_OFFSET 8
5471 @@ -407,7 +562,7 @@ int_very_careful:
5472 call syscall_trace_leave
5473 popq %rdi
5474 CFI_ADJUST_CFA_OFFSET -8
5475 - andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi
5476 + andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
5477 jmp int_restore_rest
5478
5479 int_signal:
5480 @@ -416,7 +571,7 @@ int_signal:
5481 movq %rsp,%rdi # &ptregs -> arg1
5482 xorl %esi,%esi # oldset -> arg2
5483 call do_notify_resume
5484 -1: movl $_TIF_NEED_RESCHED,%edi
5485 +1: movl $_TIF_WORK_MASK,%edi
5486 int_restore_rest:
5487 RESTORE_REST
5488 DISABLE_INTERRUPTS(CLBR_NONE)
5489 @@ -443,7 +598,6 @@ END(\label)
5490 PTREGSCALL stub_clone, sys_clone, %r8
5491 PTREGSCALL stub_fork, sys_fork, %rdi
5492 PTREGSCALL stub_vfork, sys_vfork, %rdi
5493 - PTREGSCALL stub_rt_sigsuspend, sys_rt_sigsuspend, %rdx
5494 PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx
5495 PTREGSCALL stub_iopl, sys_iopl, %rsi
5496
5497 @@ -517,10 +671,12 @@ END(stub_rt_sigreturn)
5498 *
5499 */
5500
5501 -retint_check:
5502 +retint_with_reschedule:
5503 CFI_DEFAULT_STACK adj=1
5504 + movl $_TIF_WORK_MASK,%edi
5505 +retint_check:
5506 LOCKDEP_SYS_EXIT_IRQ
5507 - movl threadinfo_flags(%rcx),%edx
5508 + movl TI_flags(%rcx),%edx
5509 andl %edi,%edx
5510 CFI_REMEMBER_STATE
5511 jnz retint_careful
5512 @@ -565,17 +721,16 @@ retint_signal:
5513 RESTORE_REST
5514 DISABLE_INTERRUPTS(CLBR_NONE)
5515 TRACE_IRQS_OFF
5516 - movl $_TIF_NEED_RESCHED,%edi
5517 GET_THREAD_INFO(%rcx)
5518 - jmp retint_check
5519 + jmp retint_with_reschedule
5520
5521 #ifdef CONFIG_PREEMPT
5522 /* Returning to kernel space. Check if we need preemption */
5523 /* rcx: threadinfo. interrupts off. */
5524 ENTRY(retint_kernel)
5525 - cmpl $0,threadinfo_preempt_count(%rcx)
5526 + cmpl $0,TI_preempt_count(%rcx)
5527 jnz retint_restore_args
5528 - bt $TIF_NEED_RESCHED,threadinfo_flags(%rcx)
5529 + bt $TIF_NEED_RESCHED,TI_flags(%rcx)
5530 jnc retint_restore_args
5531 bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */
5532 jnc retint_restore_args
5533 @@ -630,6 +785,9 @@ END(invalidate_interrupt\num)
5534 ENTRY(call_function_interrupt)
5535 apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt
5536 END(call_function_interrupt)
5537 +ENTRY(call_function_single_interrupt)
5538 + apicinterrupt CALL_FUNCTION_SINGLE_VECTOR,smp_call_function_single_interrupt
5539 +END(call_function_single_interrupt)
5540 ENTRY(irq_move_cleanup_interrupt)
5541 apicinterrupt IRQ_MOVE_CLEANUP_VECTOR,smp_irq_move_cleanup_interrupt
5542 END(irq_move_cleanup_interrupt)
5543 @@ -639,6 +797,10 @@ ENTRY(apic_timer_interrupt)
5544 apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt
5545 END(apic_timer_interrupt)
5546
5547 +ENTRY(uv_bau_message_intr1)
5548 + apicinterrupt 220,uv_bau_message_interrupt
5549 +END(uv_bau_message_intr1)
5550 +
5551 ENTRY(error_interrupt)
5552 apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt
5553 END(error_interrupt)
5554 @@ -752,7 +914,7 @@ paranoid_restore\trace:
5555 jmp irq_return
5556 paranoid_userspace\trace:
5557 GET_THREAD_INFO(%rcx)
5558 - movl threadinfo_flags(%rcx),%ebx
5559 + movl TI_flags(%rcx),%ebx
5560 andl $_TIF_WORK_MASK,%ebx
5561 jz paranoid_swapgs\trace
5562 movq %rsp,%rdi /* &pt_regs */
5563 @@ -849,7 +1011,7 @@ error_exit:
5564 testb $3,CS-ARGOFFSET(%rsp)
5565 jz retint_kernel
5566 LOCKDEP_SYS_EXIT_IRQ
5567 - movl threadinfo_flags(%rcx),%edx
5568 + movl TI_flags(%rcx),%edx
5569 movl $_TIF_WORK_MASK,%edi
5570 andl %edi,%edx
5571 jnz retint_careful
5572 @@ -871,11 +1033,11 @@ error_kernelspace:
5573 iret run with kernel gs again, so don't set the user space flag.
5574 B stepping K8s sometimes report an truncated RIP for IRET
5575 exceptions returning to compat mode. Check for these here too. */
5576 - leaq irq_return(%rip),%rbp
5577 - cmpq %rbp,RIP(%rsp)
5578 + leaq irq_return(%rip),%rcx
5579 + cmpq %rcx,RIP(%rsp)
5580 je error_swapgs
5581 - movl %ebp,%ebp /* zero extend */
5582 - cmpq %rbp,RIP(%rsp)
5583 + movl %ecx,%ecx /* zero extend */
5584 + cmpq %rcx,RIP(%rsp)
5585 je error_swapgs
5586 cmpq $gs_change,RIP(%rsp)
5587 je error_swapgs
5588 @@ -1121,6 +1283,7 @@ END(device_not_available)
5589 /* runs on exception stack */
5590 KPROBE_ENTRY(debug)
5591 /* INTR_FRAME
5592 + PARAVIRT_ADJUST_EXCEPTION_FRAME
5593 pushq $0
5594 CFI_ADJUST_CFA_OFFSET 8 */
5595 zeroentry do_debug
5596 @@ -1148,6 +1311,7 @@ END(do_nmi_callback)
5597
5598 KPROBE_ENTRY(int3)
5599 /* INTR_FRAME
5600 + PARAVIRT_ADJUST_EXCEPTION_FRAME
5601 pushq $0
5602 CFI_ADJUST_CFA_OFFSET 8 */
5603 zeroentry do_int3
5604 @@ -1171,14 +1335,11 @@ ENTRY(coprocessor_segment_overrun)
5605 zeroentry do_coprocessor_segment_overrun
5606 END(coprocessor_segment_overrun)
5607
5608 -ENTRY(reserved)
5609 - zeroentry do_reserved
5610 -END(reserved)
5611 -
5612 #if 0
5613 /* runs on exception stack */
5614 ENTRY(double_fault)
5615 XCPT_FRAME
5616 + PARAVIRT_ADJUST_EXCEPTION_FRAME
5617 paranoidentry do_double_fault
5618 jmp paranoid_exit1
5619 CFI_ENDPROC
5620 @@ -1196,6 +1357,7 @@ END(segment_not_present)
5621 /* runs on exception stack */
5622 ENTRY(stack_segment)
5623 /* XCPT_FRAME
5624 + PARAVIRT_ADJUST_EXCEPTION_FRAME
5625 paranoidentry do_stack_segment */
5626 errorentry do_stack_segment
5627 /* jmp paranoid_exit1
5628 @@ -1222,6 +1384,7 @@ END(spurious_interrupt_bug)
5629 /* runs on exception stack */
5630 ENTRY(machine_check)
5631 INTR_FRAME
5632 + PARAVIRT_ADJUST_EXCEPTION_FRAME
5633 pushq $0
5634 CFI_ADJUST_CFA_OFFSET 8
5635 paranoidentry do_machine_check
5636 --- sle11-2009-10-16.orig/arch/x86/kernel/fixup.c 2009-03-16 16:33:40.000000000 +0100
5637 +++ sle11-2009-10-16/arch/x86/kernel/fixup.c 2009-06-04 10:21:39.000000000 +0200
5638 @@ -33,6 +33,7 @@
5639 #include <linux/kernel.h>
5640 #include <linux/delay.h>
5641 #include <linux/version.h>
5642 +#include <asm/traps.h>
5643
5644 #define DP(_f, _args...) printk(KERN_ALERT " " _f "\n" , ## _args )
5645
5646 --- sle11-2009-10-16.orig/arch/x86/kernel/genapic_64-xen.c 2009-03-16 16:38:05.000000000 +0100
5647 +++ sle11-2009-10-16/arch/x86/kernel/genapic_64-xen.c 2009-06-04 10:21:39.000000000 +0200
5648 @@ -58,7 +58,7 @@ void __init setup_apic_routing(void)
5649 else
5650 #endif
5651
5652 - if (num_possible_cpus() <= 8)
5653 + if (max_physical_apicid < 8)
5654 genapic = &apic_flat;
5655 else
5656 genapic = &apic_physflat;
5657 @@ -121,4 +121,5 @@ int is_uv_system(void)
5658 {
5659 return uv_system_type != UV_NONE;
5660 }
5661 +EXPORT_SYMBOL_GPL(is_uv_system);
5662 #endif
5663 --- sle11-2009-10-16.orig/arch/x86/kernel/genapic_xen_64.c 2009-03-16 16:38:05.000000000 +0100
5664 +++ sle11-2009-10-16/arch/x86/kernel/genapic_xen_64.c 2009-06-04 10:21:39.000000000 +0200
5665 @@ -43,7 +43,7 @@ void xen_send_IPI_shortcut(unsigned int
5666 __send_IPI_one(smp_processor_id(), vector);
5667 break;
5668 case APIC_DEST_ALLBUT:
5669 - for (cpu = 0; cpu < NR_CPUS; ++cpu) {
5670 + for_each_possible_cpu(cpu) {
5671 if (cpu == smp_processor_id())
5672 continue;
5673 if (cpu_isset(cpu, cpu_online_map)) {
5674 @@ -52,7 +52,7 @@ void xen_send_IPI_shortcut(unsigned int
5675 }
5676 break;
5677 case APIC_DEST_ALLINC:
5678 - for (cpu = 0; cpu < NR_CPUS; ++cpu) {
5679 + for_each_possible_cpu(cpu) {
5680 if (cpu_isset(cpu, cpu_online_map)) {
5681 __send_IPI_one(cpu, vector);
5682 }
5683 @@ -81,8 +81,6 @@ static cpumask_t xen_vector_allocation_d
5684 */
5685 static void xen_init_apic_ldr(void)
5686 {
5687 - Dprintk("%s\n", __FUNCTION__);
5688 - return;
5689 }
5690
5691 static void xen_send_IPI_allbutself(int vector)
5692 @@ -92,14 +90,12 @@ static void xen_send_IPI_allbutself(int
5693 * we get an APIC send error if we try to broadcast.
5694 * thus we have to avoid sending IPIs in this case.
5695 */
5696 - Dprintk("%s\n", __FUNCTION__);
5697 if (num_online_cpus() > 1)
5698 xen_send_IPI_shortcut(APIC_DEST_ALLBUT, vector, APIC_DEST_LOGICAL);
5699 }
5700
5701 static void xen_send_IPI_all(int vector)
5702 {
5703 - Dprintk("%s\n", __FUNCTION__);
5704 xen_send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL);
5705 }
5706
5707 @@ -109,11 +105,10 @@ static void xen_send_IPI_mask(cpumask_t
5708 unsigned int cpu;
5709 unsigned long flags;
5710
5711 - Dprintk("%s\n", __FUNCTION__);
5712 local_irq_save(flags);
5713 WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]);
5714
5715 - for (cpu = 0; cpu < NR_CPUS; ++cpu) {
5716 + for_each_possible_cpu(cpu) {
5717 if (cpu_isset(cpu, cpumask)) {
5718 __send_IPI_one(cpu, vector);
5719 }
5720 @@ -125,14 +120,12 @@ static void xen_send_IPI_mask(cpumask_t
5721 static int xen_apic_id_registered(void)
5722 {
5723 /* better be set */
5724 - Dprintk("%s\n", __FUNCTION__);
5725 return physid_isset(smp_processor_id(), phys_cpu_present_map);
5726 }
5727 #endif
5728
5729 static unsigned int xen_cpu_mask_to_apicid(cpumask_t cpumask)
5730 {
5731 - Dprintk("%s\n", __FUNCTION__);
5732 return cpus_addr(cpumask)[0] & APIC_ALL_CPUS;
5733 }
5734
5735 @@ -140,7 +133,6 @@ static unsigned int phys_pkg_id(int inde
5736 {
5737 u32 ebx;
5738
5739 - Dprintk("%s\n", __FUNCTION__);
5740 ebx = cpuid_ebx(1);
5741 return ((ebx >> 24) & 0xFF) >> index_msb;
5742 }
5743 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
5744 +++ sle11-2009-10-16/arch/x86/kernel/head-xen.c 2009-06-04 10:21:39.000000000 +0200
5745 @@ -0,0 +1,57 @@
5746 +#include <linux/kernel.h>
5747 +#include <linux/init.h>
5748 +
5749 +#include <asm/setup.h>
5750 +#include <asm/bios_ebda.h>
5751 +
5752 +#define BIOS_LOWMEM_KILOBYTES 0x413
5753 +
5754 +/*
5755 + * The BIOS places the EBDA/XBDA at the top of conventional
5756 + * memory, and usually decreases the reported amount of
5757 + * conventional memory (int 0x12) too. This also contains a
5758 + * workaround for Dell systems that neglect to reserve EBDA.
5759 + * The same workaround also avoids a problem with the AMD768MPX
5760 + * chipset: reserve a page before VGA to prevent PCI prefetch
5761 + * into it (errata #56). Usually the page is reserved anyways,
5762 + * unless you have no PS/2 mouse plugged in.
5763 + */
5764 +void __init reserve_ebda_region(void)
5765 +{
5766 +#ifndef CONFIG_XEN
5767 + unsigned int lowmem, ebda_addr;
5768 +
5769 + /* To determine the position of the EBDA and the */
5770 + /* end of conventional memory, we need to look at */
5771 + /* the BIOS data area. In a paravirtual environment */
5772 + /* that area is absent. We'll just have to assume */
5773 + /* that the paravirt case can handle memory setup */
5774 + /* correctly, without our help. */
5775 + if (paravirt_enabled())
5776 + return;
5777 +
5778 + /* end of low (conventional) memory */
5779 + lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
5780 + lowmem <<= 10;
5781 +
5782 + /* start of EBDA area */
5783 + ebda_addr = get_bios_ebda();
5784 +
5785 + /* Fixup: bios puts an EBDA in the top 64K segment */
5786 + /* of conventional memory, but does not adjust lowmem. */
5787 + if ((lowmem - ebda_addr) <= 0x10000)
5788 + lowmem = ebda_addr;
5789 +
5790 + /* Fixup: bios does not report an EBDA at all. */
5791 + /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
5792 + if ((ebda_addr == 0) && (lowmem >= 0x9f000))
5793 + lowmem = 0x9f000;
5794 +
5795 + /* Paranoia: should never happen, but... */
5796 + if ((lowmem == 0) || (lowmem >= 0x100000))
5797 + lowmem = 0x9f000;
5798 +
5799 + /* reserve all memory between lowmem and the 1MB mark */
5800 + reserve_early_overlap_ok(lowmem, 0x100000, "BIOS reserved");
5801 +#endif
5802 +}
5803 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
5804 +++ sle11-2009-10-16/arch/x86/kernel/head32-xen.c 2009-06-04 10:21:39.000000000 +0200
5805 @@ -0,0 +1,57 @@
5806 +/*
5807 + * linux/arch/i386/kernel/head32.c -- prepare to run common code
5808 + *
5809 + * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
5810 + * Copyright (C) 2007 Eric Biederman <ebiederm@xmission.com>
5811 + */
5812 +
5813 +#include <linux/init.h>
5814 +#include <linux/start_kernel.h>
5815 +
5816 +#include <asm/setup.h>
5817 +#include <asm/sections.h>
5818 +#include <asm/e820.h>
5819 +#include <asm/bios_ebda.h>
5820 +
5821 +void __init i386_start_kernel(void)
5822 +{
5823 + reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
5824 +
5825 +#ifndef CONFIG_XEN
5826 +#ifdef CONFIG_BLK_DEV_INITRD
5827 + /* Reserve INITRD */
5828 + if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
5829 + u64 ramdisk_image = boot_params.hdr.ramdisk_image;
5830 + u64 ramdisk_size = boot_params.hdr.ramdisk_size;
5831 + u64 ramdisk_end = ramdisk_image + ramdisk_size;
5832 + reserve_early(ramdisk_image, ramdisk_end, "RAMDISK");
5833 + }
5834 +#endif
5835 + reserve_early(init_pg_tables_start, init_pg_tables_end,
5836 + "INIT_PG_TABLE");
5837 +#else
5838 + reserve_early(ALIGN(__pa_symbol(&_end), PAGE_SIZE),
5839 + __pa(xen_start_info->pt_base)
5840 + + (xen_start_info->nr_pt_frames << PAGE_SHIFT),
5841 + "Xen provided");
5842 +
5843 + {
5844 + int max_cmdline;
5845 +
5846 + if ((max_cmdline = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE)
5847 + max_cmdline = COMMAND_LINE_SIZE;
5848 + memcpy(boot_command_line, xen_start_info->cmd_line, max_cmdline);
5849 + boot_command_line[max_cmdline-1] = '\0';
5850 + }
5851 +#endif
5852 +
5853 + reserve_ebda_region();
5854 +
5855 + /*
5856 + * At this point everything still needed from the boot loader
5857 + * or BIOS or kernel text should be early reserved or marked not
5858 + * RAM in e820. All other memory is free game.
5859 + */
5860 +
5861 + start_kernel();
5862 +}
5863 --- sle11-2009-10-16.orig/arch/x86/kernel/head64-xen.c 2009-03-16 16:38:05.000000000 +0100
5864 +++ sle11-2009-10-16/arch/x86/kernel/head64-xen.c 2009-06-04 10:21:39.000000000 +0200
5865 @@ -32,7 +32,26 @@
5866 #include <asm/e820.h>
5867 #include <asm/bios_ebda.h>
5868
5869 -unsigned long start_pfn;
5870 +/* boot cpu pda */
5871 +static struct x8664_pda _boot_cpu_pda __read_mostly;
5872 +
5873 +#ifdef CONFIG_SMP
5874 +/*
5875 + * We install an empty cpu_pda pointer table to indicate to early users
5876 + * (numa_set_node) that the cpu_pda pointer table for cpus other than
5877 + * the boot cpu is not yet setup.
5878 + */
5879 +static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata;
5880 +#else
5881 +static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly;
5882 +#endif
5883 +
5884 +void __init x86_64_init_pda(void)
5885 +{
5886 + _cpu_pda = __cpu_pda;
5887 + cpu_pda(0) = &_boot_cpu_pda;
5888 + pda_init(0);
5889 +}
5890
5891 #ifndef CONFIG_XEN
5892 static void __init zap_identity_mappings(void)
5893 @@ -77,83 +96,10 @@ EXPORT_SYMBOL(machine_to_phys_mapping);
5894 unsigned int machine_to_phys_order;
5895 EXPORT_SYMBOL(machine_to_phys_order);
5896
5897 -#define BIOS_LOWMEM_KILOBYTES 0x413
5898 -
5899 -/*
5900 - * The BIOS places the EBDA/XBDA at the top of conventional
5901 - * memory, and usually decreases the reported amount of
5902 - * conventional memory (int 0x12) too. This also contains a
5903 - * workaround for Dell systems that neglect to reserve EBDA.
5904 - * The same workaround also avoids a problem with the AMD768MPX
5905 - * chipset: reserve a page before VGA to prevent PCI prefetch
5906 - * into it (errata #56). Usually the page is reserved anyways,
5907 - * unless you have no PS/2 mouse plugged in.
5908 - */
5909 -static void __init reserve_ebda_region(void)
5910 -{
5911 -#ifndef CONFIG_XEN
5912 - unsigned int lowmem, ebda_addr;
5913 -
5914 - /* To determine the position of the EBDA and the */
5915 - /* end of conventional memory, we need to look at */
5916 - /* the BIOS data area. In a paravirtual environment */
5917 - /* that area is absent. We'll just have to assume */
5918 - /* that the paravirt case can handle memory setup */
5919 - /* correctly, without our help. */
5920 - if (paravirt_enabled())
5921 - return;
5922 -
5923 - /* end of low (conventional) memory */
5924 - lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
5925 - lowmem <<= 10;
5926 -
5927 - /* start of EBDA area */
5928 - ebda_addr = get_bios_ebda();
5929 -
5930 - /* Fixup: bios puts an EBDA in the top 64K segment */
5931 - /* of conventional memory, but does not adjust lowmem. */
5932 - if ((lowmem - ebda_addr) <= 0x10000)
5933 - lowmem = ebda_addr;
5934 -
5935 - /* Fixup: bios does not report an EBDA at all. */
5936 - /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
5937 - if ((ebda_addr == 0) && (lowmem >= 0x9f000))
5938 - lowmem = 0x9f000;
5939 -
5940 - /* Paranoia: should never happen, but... */
5941 - if ((lowmem == 0) || (lowmem >= 0x100000))
5942 - lowmem = 0x9f000;
5943 -
5944 - /* reserve all memory between lowmem and the 1MB mark */
5945 - reserve_early(lowmem, 0x100000, "BIOS reserved");
5946 -#endif
5947 -}
5948 -
5949 -static void __init reserve_setup_data(void)
5950 -{
5951 -#ifndef CONFIG_XEN
5952 - struct setup_data *data;
5953 - unsigned long pa_data;
5954 - char buf[32];
5955 -
5956 - if (boot_params.hdr.version < 0x0209)
5957 - return;
5958 - pa_data = boot_params.hdr.setup_data;
5959 - while (pa_data) {
5960 - data = early_ioremap(pa_data, sizeof(*data));
5961 - sprintf(buf, "setup data %x", data->type);
5962 - reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf);
5963 - pa_data = data->next;
5964 - early_iounmap(data, sizeof(*data));
5965 - }
5966 -#endif
5967 -}
5968 -
5969 void __init x86_64_start_kernel(char * real_mode_data)
5970 {
5971 struct xen_machphys_mapping mapping;
5972 unsigned long machine_to_phys_nr_ents;
5973 - int i;
5974
5975 /*
5976 * Build-time sanity checks on the kernel image and module
5977 @@ -167,6 +113,7 @@ void __init x86_64_start_kernel(char * r
5978 BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
5979 BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
5980 (__START_KERNEL & PGDIR_MASK)));
5981 + BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);
5982
5983 xen_setup_features();
5984
5985 @@ -174,8 +121,6 @@ void __init x86_64_start_kernel(char * r
5986 if (!xen_feature(XENFEAT_auto_translated_physmap))
5987 phys_to_machine_mapping =
5988 (unsigned long *)xen_start_info->mfn_list;
5989 - start_pfn = (__pa(xen_start_info->pt_base) >> PAGE_SHIFT) +
5990 - xen_start_info->nr_pt_frames;
5991
5992 machine_to_phys_mapping = (unsigned long *)MACH2PHYS_VIRT_START;
5993 machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES;
5994 @@ -208,19 +153,23 @@ void __init x86_64_start_kernel(char * r
5995
5996 early_printk("Kernel alive\n");
5997
5998 - for (i = 0; i < NR_CPUS; i++)
5999 - cpu_pda(i) = &boot_cpu_pda[i];
6000 + x86_64_init_pda();
6001
6002 - pda_init(0);
6003 + early_printk("Kernel really alive\n");
6004 +
6005 + x86_64_start_reservations(real_mode_data);
6006 +}
6007 +
6008 +void __init x86_64_start_reservations(char *real_mode_data)
6009 +{
6010 copy_bootdata(__va(real_mode_data));
6011
6012 reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
6013
6014 reserve_early(round_up(__pa_symbol(&_end), PAGE_SIZE),
6015 - start_pfn << PAGE_SHIFT, "Xen provided");
6016 -
6017 - reserve_ebda_region();
6018 - reserve_setup_data();
6019 + __pa(xen_start_info->pt_base)
6020 + + (xen_start_info->nr_pt_frames << PAGE_SHIFT),
6021 + "Xen provided");
6022
6023 /*
6024 * At this point everything still needed from the boot loader
6025 --- sle11-2009-10-16.orig/arch/x86/kernel/head_64-xen.S 2009-02-16 16:17:21.000000000 +0100
6026 +++ sle11-2009-10-16/arch/x86/kernel/head_64-xen.S 2009-06-04 10:21:39.000000000 +0200
6027 @@ -95,53 +95,6 @@ NEXT_PAGE(hypercall_page)
6028
6029 #undef NEXT_PAGE
6030
6031 - .data
6032 -
6033 - .align 16
6034 - .globl cpu_gdt_descr
6035 -cpu_gdt_descr:
6036 - .word gdt_end-cpu_gdt_table-1
6037 -gdt:
6038 - .quad cpu_gdt_table
6039 -#ifdef CONFIG_SMP
6040 - .rept NR_CPUS-1
6041 - .word 0
6042 - .quad 0
6043 - .endr
6044 -#endif
6045 -
6046 -/* We need valid kernel segments for data and code in long mode too
6047 - * IRET will check the segment types kkeil 2000/10/28
6048 - * Also sysret mandates a special GDT layout
6049 - */
6050 -
6051 - .section .data.page_aligned, "aw"
6052 - .align PAGE_SIZE
6053 -
6054 -/* The TLS descriptors are currently at a different place compared to i386.
6055 - Hopefully nobody expects them at a fixed place (Wine?) */
6056 -
6057 -ENTRY(cpu_gdt_table)
6058 - .quad 0x0000000000000000 /* NULL descriptor */
6059 - .quad 0x00cf9b000000ffff /* __KERNEL32_CS */
6060 - .quad 0x00af9b000000ffff /* __KERNEL_CS */
6061 - .quad 0x00cf93000000ffff /* __KERNEL_DS */
6062 - .quad 0x00cffb000000ffff /* __USER32_CS */
6063 - .quad 0x00cff3000000ffff /* __USER_DS, __USER32_DS */
6064 - .quad 0x00affb000000ffff /* __USER_CS */
6065 - .quad 0x0 /* unused */
6066 - .quad 0,0 /* TSS */
6067 - .quad 0,0 /* LDT */
6068 - .quad 0,0,0 /* three TLS descriptors */
6069 - .quad 0x0000f40000000000 /* node/CPU stored in limit */
6070 -gdt_end:
6071 - /* asm/segment.h:GDT_ENTRIES must match this */
6072 - /* This should be a multiple of the cache line size */
6073 - /* GDTs of other CPUs are now dynamically allocated */
6074 -
6075 - /* zero the remaining page */
6076 - .fill PAGE_SIZE / 8 - GDT_ENTRIES,8,0
6077 -
6078 .section .bss.page_aligned, "aw", @nobits
6079 .align PAGE_SIZE
6080 ENTRY(empty_zero_page)
6081 --- sle11-2009-10-16.orig/arch/x86/kernel/io_apic_32-xen.c 2009-03-16 16:38:05.000000000 +0100
6082 +++ sle11-2009-10-16/arch/x86/kernel/io_apic_32-xen.c 2009-06-04 10:21:39.000000000 +0200
6083 @@ -25,6 +25,7 @@
6084 #include <linux/init.h>
6085 #include <linux/delay.h>
6086 #include <linux/sched.h>
6087 +#include <linux/bootmem.h>
6088 #include <linux/mc146818rtc.h>
6089 #include <linux/compiler.h>
6090 #include <linux/acpi.h>
6091 @@ -75,7 +76,7 @@ static struct { int pin, apic; } ioapic_
6092 static DEFINE_SPINLOCK(ioapic_lock);
6093 static DEFINE_SPINLOCK(vector_lock);
6094
6095 -int timer_over_8254 __initdata = 1;
6096 +int timer_through_8259 __initdata;
6097
6098 /*
6099 * Is the SiS APIC rmw bug present ?
6100 @@ -89,15 +90,21 @@ int sis_apic_bug = -1;
6101 int nr_ioapic_registers[MAX_IO_APICS];
6102
6103 /* I/O APIC entries */
6104 -struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
6105 +struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
6106 int nr_ioapics;
6107
6108 /* MP IRQ source entries */
6109 -struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
6110 +struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
6111
6112 /* # of MP IRQ source entries */
6113 int mp_irq_entries;
6114
6115 +#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
6116 +int mp_bus_id_to_type[MAX_MP_BUSSES];
6117 +#endif
6118 +
6119 +DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
6120 +
6121 static int disable_timer_pin_1 __initdata;
6122
6123 /*
6124 @@ -128,7 +135,7 @@ struct io_apic {
6125 static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
6126 {
6127 return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
6128 - + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK);
6129 + + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK);
6130 }
6131 #endif
6132
6133 @@ -142,7 +149,7 @@ static inline unsigned int io_apic_read(
6134 struct physdev_apic apic_op;
6135 int ret;
6136
6137 - apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
6138 + apic_op.apic_physbase = mp_ioapics[apic].mp_apicaddr;
6139 apic_op.reg = reg;
6140 ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op);
6141 if (ret)
6142 @@ -160,7 +167,7 @@ static inline void io_apic_write(unsigne
6143 #else
6144 struct physdev_apic apic_op;
6145
6146 - apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
6147 + apic_op.apic_physbase = mp_ioapics[apic].mp_apicaddr;
6148 apic_op.reg = reg;
6149 apic_op.value = value;
6150 WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op));
6151 @@ -288,7 +295,7 @@ static void __init replace_pin_at_irq(un
6152 }
6153 }
6154
6155 -static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsigned long disable)
6156 +static void __modify_IO_APIC_irq(unsigned int irq, unsigned long enable, unsigned long disable)
6157 {
6158 struct irq_pin_list *entry = irq_2_pin + irq;
6159 unsigned int pin, reg;
6160 @@ -308,30 +315,32 @@ static void __modify_IO_APIC_irq (unsign
6161 }
6162
6163 /* mask = 1 */
6164 -static void __mask_IO_APIC_irq (unsigned int irq)
6165 +static void __mask_IO_APIC_irq(unsigned int irq)
6166 {
6167 - __modify_IO_APIC_irq(irq, 0x00010000, 0);
6168 + __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED, 0);
6169 }
6170
6171 /* mask = 0 */
6172 -static void __unmask_IO_APIC_irq (unsigned int irq)
6173 +static void __unmask_IO_APIC_irq(unsigned int irq)
6174 {
6175 - __modify_IO_APIC_irq(irq, 0, 0x00010000);
6176 + __modify_IO_APIC_irq(irq, 0, IO_APIC_REDIR_MASKED);
6177 }
6178
6179 /* mask = 1, trigger = 0 */
6180 -static void __mask_and_edge_IO_APIC_irq (unsigned int irq)
6181 +static void __mask_and_edge_IO_APIC_irq(unsigned int irq)
6182 {
6183 - __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000);
6184 + __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED,
6185 + IO_APIC_REDIR_LEVEL_TRIGGER);
6186 }
6187
6188 /* mask = 0, trigger = 1 */
6189 -static void __unmask_and_level_IO_APIC_irq (unsigned int irq)
6190 +static void __unmask_and_level_IO_APIC_irq(unsigned int irq)
6191 {
6192 - __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000);
6193 + __modify_IO_APIC_irq(irq, IO_APIC_REDIR_LEVEL_TRIGGER,
6194 + IO_APIC_REDIR_MASKED);
6195 }
6196
6197 -static void mask_IO_APIC_irq (unsigned int irq)
6198 +static void mask_IO_APIC_irq(unsigned int irq)
6199 {
6200 unsigned long flags;
6201
6202 @@ -340,7 +349,7 @@ static void mask_IO_APIC_irq (unsigned i
6203 spin_unlock_irqrestore(&ioapic_lock, flags);
6204 }
6205
6206 -static void unmask_IO_APIC_irq (unsigned int irq)
6207 +static void unmask_IO_APIC_irq(unsigned int irq)
6208 {
6209 unsigned long flags;
6210
6211 @@ -352,7 +361,7 @@ static void unmask_IO_APIC_irq (unsigned
6212 static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
6213 {
6214 struct IO_APIC_route_entry entry;
6215 -
6216 +
6217 /* Check delivery_mode to be sure we're not clearing an SMI pin */
6218 entry = ioapic_read_entry(apic, pin);
6219 if (entry.delivery_mode == dest_SMI)
6220 @@ -364,7 +373,7 @@ static void clear_IO_APIC_pin(unsigned i
6221 ioapic_mask_entry(apic, pin);
6222 }
6223
6224 -static void clear_IO_APIC (void)
6225 +static void clear_IO_APIC(void)
6226 {
6227 int apic, pin;
6228
6229 @@ -381,7 +390,7 @@ static void set_ioapic_affinity_irq(unsi
6230 struct irq_pin_list *entry = irq_2_pin + irq;
6231 unsigned int apicid_value;
6232 cpumask_t tmp;
6233 -
6234 +
6235 cpus_and(tmp, cpumask, cpu_online_map);
6236 if (cpus_empty(tmp))
6237 tmp = TARGET_CPUS;
6238 @@ -410,7 +419,7 @@ static void set_ioapic_affinity_irq(unsi
6239 # include <linux/kernel_stat.h> /* kstat */
6240 # include <linux/slab.h> /* kmalloc() */
6241 # include <linux/timer.h>
6242 -
6243 +
6244 #define IRQBALANCE_CHECK_ARCH -999
6245 #define MAX_BALANCED_IRQ_INTERVAL (5*HZ)
6246 #define MIN_BALANCED_IRQ_INTERVAL (HZ/2)
6247 @@ -422,14 +431,14 @@ static int physical_balance __read_mostl
6248 static long balanced_irq_interval __read_mostly = MAX_BALANCED_IRQ_INTERVAL;
6249
6250 static struct irq_cpu_info {
6251 - unsigned long * last_irq;
6252 - unsigned long * irq_delta;
6253 + unsigned long *last_irq;
6254 + unsigned long *irq_delta;
6255 unsigned long irq;
6256 } irq_cpu_data[NR_CPUS];
6257
6258 #define CPU_IRQ(cpu) (irq_cpu_data[cpu].irq)
6259 -#define LAST_CPU_IRQ(cpu,irq) (irq_cpu_data[cpu].last_irq[irq])
6260 -#define IRQ_DELTA(cpu,irq) (irq_cpu_data[cpu].irq_delta[irq])
6261 +#define LAST_CPU_IRQ(cpu, irq) (irq_cpu_data[cpu].last_irq[irq])
6262 +#define IRQ_DELTA(cpu, irq) (irq_cpu_data[cpu].irq_delta[irq])
6263
6264 #define IDLE_ENOUGH(cpu,now) \
6265 (idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1))
6266 @@ -468,8 +477,8 @@ inside:
6267 if (cpu == -1)
6268 cpu = NR_CPUS-1;
6269 }
6270 - } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu,allowed_mask) ||
6271 - (search_idle && !IDLE_ENOUGH(cpu,now)));
6272 + } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu, allowed_mask) ||
6273 + (search_idle && !IDLE_ENOUGH(cpu, now)));
6274
6275 return cpu;
6276 }
6277 @@ -479,15 +488,14 @@ static inline void balance_irq(int cpu,
6278 unsigned long now = jiffies;
6279 cpumask_t allowed_mask;
6280 unsigned int new_cpu;
6281 -
6282 +
6283 if (irqbalance_disabled)
6284 - return;
6285 + return;
6286
6287 cpus_and(allowed_mask, cpu_online_map, balance_irq_affinity[irq]);
6288 new_cpu = move(cpu, allowed_mask, now, 1);
6289 - if (cpu != new_cpu) {
6290 + if (cpu != new_cpu)
6291 set_pending_irq(irq, cpumask_of_cpu(new_cpu));
6292 - }
6293 }
6294
6295 static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold)
6296 @@ -499,14 +507,14 @@ static inline void rotate_irqs_among_cpu
6297 if (!irq_desc[j].action)
6298 continue;
6299 /* Is it a significant load ? */
6300 - if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i),j) <
6301 + if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i), j) <
6302 useful_load_threshold)
6303 continue;
6304 balance_irq(i, j);
6305 }
6306 }
6307 balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
6308 - balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
6309 + balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
6310 return;
6311 }
6312
6313 @@ -535,22 +543,22 @@ static void do_irq_balance(void)
6314 /* Is this an active IRQ or balancing disabled ? */
6315 if (!irq_desc[j].action || irq_balancing_disabled(j))
6316 continue;
6317 - if ( package_index == i )
6318 - IRQ_DELTA(package_index,j) = 0;
6319 + if (package_index == i)
6320 + IRQ_DELTA(package_index, j) = 0;
6321 /* Determine the total count per processor per IRQ */
6322 value_now = (unsigned long) kstat_cpu(i).irqs[j];
6323
6324 /* Determine the activity per processor per IRQ */
6325 - delta = value_now - LAST_CPU_IRQ(i,j);
6326 + delta = value_now - LAST_CPU_IRQ(i, j);
6327
6328 /* Update last_cpu_irq[][] for the next time */
6329 - LAST_CPU_IRQ(i,j) = value_now;
6330 + LAST_CPU_IRQ(i, j) = value_now;
6331
6332 /* Ignore IRQs whose rate is less than the clock */
6333 if (delta < useful_load_threshold)
6334 continue;
6335 /* update the load for the processor or package total */
6336 - IRQ_DELTA(package_index,j) += delta;
6337 + IRQ_DELTA(package_index, j) += delta;
6338
6339 /* Keep track of the higher numbered sibling as well */
6340 if (i != package_index)
6341 @@ -576,7 +584,8 @@ static void do_irq_balance(void)
6342 max_cpu_irq = ULONG_MAX;
6343
6344 tryanothercpu:
6345 - /* Look for heaviest loaded processor.
6346 + /*
6347 + * Look for heaviest loaded processor.
6348 * We may come back to get the next heaviest loaded processor.
6349 * Skip processors with trivial loads.
6350 */
6351 @@ -585,7 +594,7 @@ tryanothercpu:
6352 for_each_online_cpu(i) {
6353 if (i != CPU_TO_PACKAGEINDEX(i))
6354 continue;
6355 - if (max_cpu_irq <= CPU_IRQ(i))
6356 + if (max_cpu_irq <= CPU_IRQ(i))
6357 continue;
6358 if (tmp_cpu_irq < CPU_IRQ(i)) {
6359 tmp_cpu_irq = CPU_IRQ(i);
6360 @@ -594,8 +603,9 @@ tryanothercpu:
6361 }
6362
6363 if (tmp_loaded == -1) {
6364 - /* In the case of small number of heavy interrupt sources,
6365 - * loading some of the cpus too much. We use Ingo's original
6366 + /*
6367 + * In the case of small number of heavy interrupt sources,
6368 + * loading some of the cpus too much. We use Ingo's original
6369 * approach to rotate them around.
6370 */
6371 if (!first_attempt && imbalance >= useful_load_threshold) {
6372 @@ -604,13 +614,14 @@ tryanothercpu:
6373 }
6374 goto not_worth_the_effort;
6375 }
6376 -
6377 +
6378 first_attempt = 0; /* heaviest search */
6379 max_cpu_irq = tmp_cpu_irq; /* load */
6380 max_loaded = tmp_loaded; /* processor */
6381 imbalance = (max_cpu_irq - min_cpu_irq) / 2;
6382 -
6383 - /* if imbalance is less than approx 10% of max load, then
6384 +
6385 + /*
6386 + * if imbalance is less than approx 10% of max load, then
6387 * observe diminishing returns action. - quit
6388 */
6389 if (imbalance < (max_cpu_irq >> 3))
6390 @@ -626,26 +637,25 @@ tryanotherirq:
6391 /* Is this an active IRQ? */
6392 if (!irq_desc[j].action)
6393 continue;
6394 - if (imbalance <= IRQ_DELTA(max_loaded,j))
6395 + if (imbalance <= IRQ_DELTA(max_loaded, j))
6396 continue;
6397 /* Try to find the IRQ that is closest to the imbalance
6398 * without going over.
6399 */
6400 - if (move_this_load < IRQ_DELTA(max_loaded,j)) {
6401 - move_this_load = IRQ_DELTA(max_loaded,j);
6402 + if (move_this_load < IRQ_DELTA(max_loaded, j)) {
6403 + move_this_load = IRQ_DELTA(max_loaded, j);
6404 selected_irq = j;
6405 }
6406 }
6407 - if (selected_irq == -1) {
6408 + if (selected_irq == -1)
6409 goto tryanothercpu;
6410 - }
6411
6412 imbalance = move_this_load;
6413 -
6414 +
6415 /* For physical_balance case, we accumulated both load
6416 * values in the one of the siblings cpu_irq[],
6417 * to use the same code for physical and logical processors
6418 - * as much as possible.
6419 + * as much as possible.
6420 *
6421 * NOTE: the cpu_irq[] array holds the sum of the load for
6422 * sibling A and sibling B in the slot for the lowest numbered
6423 @@ -674,11 +684,11 @@ tryanotherirq:
6424 /* mark for change destination */
6425 set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded));
6426
6427 - /* Since we made a change, come back sooner to
6428 + /* Since we made a change, come back sooner to
6429 * check for more variation.
6430 */
6431 balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
6432 - balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
6433 + balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
6434 return;
6435 }
6436 goto tryanotherirq;
6437 @@ -689,7 +699,7 @@ not_worth_the_effort:
6438 * upward
6439 */
6440 balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL,
6441 - balanced_irq_interval + BALANCED_IRQ_MORE_DELTA);
6442 + balanced_irq_interval + BALANCED_IRQ_MORE_DELTA);
6443 return;
6444 }
6445
6446 @@ -728,13 +738,13 @@ static int __init balanced_irq_init(void
6447 cpumask_t tmp;
6448
6449 cpus_shift_right(tmp, cpu_online_map, 2);
6450 - c = &boot_cpu_data;
6451 + c = &boot_cpu_data;
6452 /* When not overwritten by the command line ask subarchitecture. */
6453 if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH)
6454 irqbalance_disabled = NO_BALANCE_IRQ;
6455 if (irqbalance_disabled)
6456 return 0;
6457 -
6458 +
6459 /* disable irqbalance completely if there is only one processor online */
6460 if (num_online_cpus() < 2) {
6461 irqbalance_disabled = 1;
6462 @@ -748,16 +758,14 @@ static int __init balanced_irq_init(void
6463 physical_balance = 1;
6464
6465 for_each_online_cpu(i) {
6466 - irq_cpu_data[i].irq_delta = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
6467 - irq_cpu_data[i].last_irq = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
6468 + irq_cpu_data[i].irq_delta = kzalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
6469 + irq_cpu_data[i].last_irq = kzalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
6470 if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) {
6471 printk(KERN_ERR "balanced_irq_init: out of memory");
6472 goto failed;
6473 }
6474 - memset(irq_cpu_data[i].irq_delta,0,sizeof(unsigned long) * NR_IRQS);
6475 - memset(irq_cpu_data[i].last_irq,0,sizeof(unsigned long) * NR_IRQS);
6476 }
6477 -
6478 +
6479 printk(KERN_INFO "Starting balanced_irq\n");
6480 if (!IS_ERR(kthread_run(balanced_irq, NULL, "kirqd")))
6481 return 0;
6482 @@ -799,7 +807,7 @@ void send_IPI_self(int vector)
6483 /*
6484 * Send the IPI. The write to APIC_ICR fires this off.
6485 */
6486 - apic_write_around(APIC_ICR, cfg);
6487 + apic_write(APIC_ICR, cfg);
6488 #endif
6489 }
6490 #endif /* !CONFIG_SMP */
6491 @@ -853,10 +861,10 @@ static int find_irq_entry(int apic, int
6492 int i;
6493
6494 for (i = 0; i < mp_irq_entries; i++)
6495 - if (mp_irqs[i].mpc_irqtype == type &&
6496 - (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
6497 - mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
6498 - mp_irqs[i].mpc_dstirq == pin)
6499 + if (mp_irqs[i].mp_irqtype == type &&
6500 + (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid ||
6501 + mp_irqs[i].mp_dstapic == MP_APIC_ALL) &&
6502 + mp_irqs[i].mp_dstirq == pin)
6503 return i;
6504
6505 return -1;
6506 @@ -871,13 +879,13 @@ static int __init find_isa_irq_pin(int i
6507 int i;
6508
6509 for (i = 0; i < mp_irq_entries; i++) {
6510 - int lbus = mp_irqs[i].mpc_srcbus;
6511 + int lbus = mp_irqs[i].mp_srcbus;
6512
6513 if (test_bit(lbus, mp_bus_not_pci) &&
6514 - (mp_irqs[i].mpc_irqtype == type) &&
6515 - (mp_irqs[i].mpc_srcbusirq == irq))
6516 + (mp_irqs[i].mp_irqtype == type) &&
6517 + (mp_irqs[i].mp_srcbusirq == irq))
6518
6519 - return mp_irqs[i].mpc_dstirq;
6520 + return mp_irqs[i].mp_dstirq;
6521 }
6522 return -1;
6523 }
6524 @@ -887,17 +895,17 @@ static int __init find_isa_irq_apic(int
6525 int i;
6526
6527 for (i = 0; i < mp_irq_entries; i++) {
6528 - int lbus = mp_irqs[i].mpc_srcbus;
6529 + int lbus = mp_irqs[i].mp_srcbus;
6530
6531 if (test_bit(lbus, mp_bus_not_pci) &&
6532 - (mp_irqs[i].mpc_irqtype == type) &&
6533 - (mp_irqs[i].mpc_srcbusirq == irq))
6534 + (mp_irqs[i].mp_irqtype == type) &&
6535 + (mp_irqs[i].mp_srcbusirq == irq))
6536 break;
6537 }
6538 if (i < mp_irq_entries) {
6539 int apic;
6540 - for(apic = 0; apic < nr_ioapics; apic++) {
6541 - if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic)
6542 + for (apic = 0; apic < nr_ioapics; apic++) {
6543 + if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic)
6544 return apic;
6545 }
6546 }
6547 @@ -918,28 +926,28 @@ int IO_APIC_get_PCI_irq_vector(int bus,
6548
6549 apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, "
6550 "slot:%d, pin:%d.\n", bus, slot, pin);
6551 - if (mp_bus_id_to_pci_bus[bus] == -1) {
6552 + if (test_bit(bus, mp_bus_not_pci)) {
6553 printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
6554 return -1;
6555 }
6556 for (i = 0; i < mp_irq_entries; i++) {
6557 - int lbus = mp_irqs[i].mpc_srcbus;
6558 + int lbus = mp_irqs[i].mp_srcbus;
6559
6560 for (apic = 0; apic < nr_ioapics; apic++)
6561 - if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
6562 - mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
6563 + if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic ||
6564 + mp_irqs[i].mp_dstapic == MP_APIC_ALL)
6565 break;
6566
6567 if (!test_bit(lbus, mp_bus_not_pci) &&
6568 - !mp_irqs[i].mpc_irqtype &&
6569 + !mp_irqs[i].mp_irqtype &&
6570 (bus == lbus) &&
6571 - (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
6572 - int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
6573 + (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) {
6574 + int irq = pin_2_irq(i, apic, mp_irqs[i].mp_dstirq);
6575
6576 if (!(apic || IO_APIC_IRQ(irq)))
6577 continue;
6578
6579 - if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
6580 + if (pin == (mp_irqs[i].mp_srcbusirq & 3))
6581 return irq;
6582 /*
6583 * Use the first all-but-pin matching entry as a
6584 @@ -954,7 +962,7 @@ int IO_APIC_get_PCI_irq_vector(int bus,
6585 EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
6586
6587 /*
6588 - * This function currently is only a helper for the i386 smp boot process where
6589 + * This function currently is only a helper for the i386 smp boot process where
6590 * we need to reprogram the ioredtbls to cater for the cpus which have come online
6591 * so mask in all cases should simply be TARGET_CPUS
6592 */
6593 @@ -1008,7 +1016,7 @@ static int EISA_ELCR(unsigned int irq)
6594 * EISA conforming in the MP table, that means its trigger type must
6595 * be read in from the ELCR */
6596
6597 -#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq))
6598 +#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mp_srcbusirq))
6599 #define default_EISA_polarity(idx) default_ISA_polarity(idx)
6600
6601 /* PCI interrupts are always polarity one level triggered,
6602 @@ -1025,118 +1033,115 @@ static int EISA_ELCR(unsigned int irq)
6603
6604 static int MPBIOS_polarity(int idx)
6605 {
6606 - int bus = mp_irqs[idx].mpc_srcbus;
6607 + int bus = mp_irqs[idx].mp_srcbus;
6608 int polarity;
6609
6610 /*
6611 * Determine IRQ line polarity (high active or low active):
6612 */
6613 - switch (mp_irqs[idx].mpc_irqflag & 3)
6614 + switch (mp_irqs[idx].mp_irqflag & 3) {
6615 + case 0: /* conforms, ie. bus-type dependent polarity */
6616 {
6617 - case 0: /* conforms, ie. bus-type dependent polarity */
6618 - {
6619 - polarity = test_bit(bus, mp_bus_not_pci)?
6620 - default_ISA_polarity(idx):
6621 - default_PCI_polarity(idx);
6622 - break;
6623 - }
6624 - case 1: /* high active */
6625 - {
6626 - polarity = 0;
6627 - break;
6628 - }
6629 - case 2: /* reserved */
6630 - {
6631 - printk(KERN_WARNING "broken BIOS!!\n");
6632 - polarity = 1;
6633 - break;
6634 - }
6635 - case 3: /* low active */
6636 - {
6637 - polarity = 1;
6638 - break;
6639 - }
6640 - default: /* invalid */
6641 - {
6642 - printk(KERN_WARNING "broken BIOS!!\n");
6643 - polarity = 1;
6644 - break;
6645 - }
6646 + polarity = test_bit(bus, mp_bus_not_pci)?
6647 + default_ISA_polarity(idx):
6648 + default_PCI_polarity(idx);
6649 + break;
6650 + }
6651 + case 1: /* high active */
6652 + {
6653 + polarity = 0;
6654 + break;
6655 + }
6656 + case 2: /* reserved */
6657 + {
6658 + printk(KERN_WARNING "broken BIOS!!\n");
6659 + polarity = 1;
6660 + break;
6661 + }
6662 + case 3: /* low active */
6663 + {
6664 + polarity = 1;
6665 + break;
6666 + }
6667 + default: /* invalid */
6668 + {
6669 + printk(KERN_WARNING "broken BIOS!!\n");
6670 + polarity = 1;
6671 + break;
6672 + }
6673 }
6674 return polarity;
6675 }
6676
6677 static int MPBIOS_trigger(int idx)
6678 {
6679 - int bus = mp_irqs[idx].mpc_srcbus;
6680 + int bus = mp_irqs[idx].mp_srcbus;
6681 int trigger;
6682
6683 /*
6684 * Determine IRQ trigger mode (edge or level sensitive):
6685 */
6686 - switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
6687 + switch ((mp_irqs[idx].mp_irqflag>>2) & 3) {
6688 + case 0: /* conforms, ie. bus-type dependent */
6689 {
6690 - case 0: /* conforms, ie. bus-type dependent */
6691 - {
6692 - trigger = test_bit(bus, mp_bus_not_pci)?
6693 - default_ISA_trigger(idx):
6694 - default_PCI_trigger(idx);
6695 + trigger = test_bit(bus, mp_bus_not_pci)?
6696 + default_ISA_trigger(idx):
6697 + default_PCI_trigger(idx);
6698 #if defined(CONFIG_EISA) || defined(CONFIG_MCA)
6699 - switch (mp_bus_id_to_type[bus])
6700 - {
6701 - case MP_BUS_ISA: /* ISA pin */
6702 - {
6703 - /* set before the switch */
6704 - break;
6705 - }
6706 - case MP_BUS_EISA: /* EISA pin */
6707 - {
6708 - trigger = default_EISA_trigger(idx);
6709 - break;
6710 - }
6711 - case MP_BUS_PCI: /* PCI pin */
6712 - {
6713 - /* set before the switch */
6714 - break;
6715 - }
6716 - case MP_BUS_MCA: /* MCA pin */
6717 - {
6718 - trigger = default_MCA_trigger(idx);
6719 - break;
6720 - }
6721 - default:
6722 - {
6723 - printk(KERN_WARNING "broken BIOS!!\n");
6724 - trigger = 1;
6725 - break;
6726 - }
6727 - }
6728 -#endif
6729 + switch (mp_bus_id_to_type[bus]) {
6730 + case MP_BUS_ISA: /* ISA pin */
6731 + {
6732 + /* set before the switch */
6733 break;
6734 }
6735 - case 1: /* edge */
6736 + case MP_BUS_EISA: /* EISA pin */
6737 {
6738 - trigger = 0;
6739 + trigger = default_EISA_trigger(idx);
6740 break;
6741 }
6742 - case 2: /* reserved */
6743 + case MP_BUS_PCI: /* PCI pin */
6744 {
6745 - printk(KERN_WARNING "broken BIOS!!\n");
6746 - trigger = 1;
6747 + /* set before the switch */
6748 break;
6749 }
6750 - case 3: /* level */
6751 + case MP_BUS_MCA: /* MCA pin */
6752 {
6753 - trigger = 1;
6754 + trigger = default_MCA_trigger(idx);
6755 break;
6756 }
6757 - default: /* invalid */
6758 + default:
6759 {
6760 printk(KERN_WARNING "broken BIOS!!\n");
6761 - trigger = 0;
6762 + trigger = 1;
6763 break;
6764 }
6765 }
6766 +#endif
6767 + break;
6768 + }
6769 + case 1: /* edge */
6770 + {
6771 + trigger = 0;
6772 + break;
6773 + }
6774 + case 2: /* reserved */
6775 + {
6776 + printk(KERN_WARNING "broken BIOS!!\n");
6777 + trigger = 1;
6778 + break;
6779 + }
6780 + case 3: /* level */
6781 + {
6782 + trigger = 1;
6783 + break;
6784 + }
6785 + default: /* invalid */
6786 + {
6787 + printk(KERN_WARNING "broken BIOS!!\n");
6788 + trigger = 0;
6789 + break;
6790 + }
6791 + }
6792 return trigger;
6793 }
6794
6795 @@ -1153,16 +1158,16 @@ static inline int irq_trigger(int idx)
6796 static int pin_2_irq(int idx, int apic, int pin)
6797 {
6798 int irq, i;
6799 - int bus = mp_irqs[idx].mpc_srcbus;
6800 + int bus = mp_irqs[idx].mp_srcbus;
6801
6802 /*
6803 * Debugging check, we are in big trouble if this message pops up!
6804 */
6805 - if (mp_irqs[idx].mpc_dstirq != pin)
6806 + if (mp_irqs[idx].mp_dstirq != pin)
6807 printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
6808
6809 if (test_bit(bus, mp_bus_not_pci))
6810 - irq = mp_irqs[idx].mpc_srcbusirq;
6811 + irq = mp_irqs[idx].mp_srcbusirq;
6812 else {
6813 /*
6814 * PCI IRQs are mapped in order
6815 @@ -1204,8 +1209,8 @@ static inline int IO_APIC_irq_trigger(in
6816
6817 for (apic = 0; apic < nr_ioapics; apic++) {
6818 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
6819 - idx = find_irq_entry(apic,pin,mp_INT);
6820 - if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin)))
6821 + idx = find_irq_entry(apic, pin, mp_INT);
6822 + if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin)))
6823 return irq_trigger(idx);
6824 }
6825 }
6826 @@ -1291,25 +1296,25 @@ static void __init setup_IO_APIC_irqs(vo
6827 /*
6828 * add it to the IO-APIC irq-routing table:
6829 */
6830 - memset(&entry,0,sizeof(entry));
6831 + memset(&entry, 0, sizeof(entry));
6832
6833 entry.delivery_mode = INT_DELIVERY_MODE;
6834 entry.dest_mode = INT_DEST_MODE;
6835 entry.mask = 0; /* enable IRQ */
6836 - entry.dest.logical.logical_dest =
6837 + entry.dest.logical.logical_dest =
6838 cpu_mask_to_apicid(TARGET_CPUS);
6839
6840 - idx = find_irq_entry(apic,pin,mp_INT);
6841 + idx = find_irq_entry(apic, pin, mp_INT);
6842 if (idx == -1) {
6843 if (first_notcon) {
6844 apic_printk(APIC_VERBOSE, KERN_DEBUG
6845 " IO-APIC (apicid-pin) %d-%d",
6846 - mp_ioapics[apic].mpc_apicid,
6847 + mp_ioapics[apic].mp_apicid,
6848 pin);
6849 first_notcon = 0;
6850 } else
6851 apic_printk(APIC_VERBOSE, ", %d-%d",
6852 - mp_ioapics[apic].mpc_apicid, pin);
6853 + mp_ioapics[apic].mp_apicid, pin);
6854 continue;
6855 }
6856
6857 @@ -1343,7 +1348,7 @@ static void __init setup_IO_APIC_irqs(vo
6858 vector = assign_irq_vector(irq);
6859 entry.vector = vector;
6860 ioapic_register_intr(irq, vector, IOAPIC_AUTO);
6861 -
6862 +
6863 if (!apic && (irq < 16))
6864 disable_8259A_irq(irq);
6865 }
6866 @@ -1355,27 +1360,23 @@ static void __init setup_IO_APIC_irqs(vo
6867 apic_printk(APIC_VERBOSE, " not connected.\n");
6868 }
6869
6870 +#ifndef CONFIG_XEN
6871 /*
6872 - * Set up the 8259A-master output pin:
6873 + * Set up the timer pin, possibly with the 8259A-master behind.
6874 */
6875 -#ifndef CONFIG_XEN
6876 -static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
6877 +static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
6878 + int vector)
6879 {
6880 struct IO_APIC_route_entry entry;
6881
6882 - memset(&entry,0,sizeof(entry));
6883 -
6884 - disable_8259A_irq(0);
6885 -
6886 - /* mask LVT0 */
6887 - apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
6888 + memset(&entry, 0, sizeof(entry));
6889
6890 /*
6891 * We use logical delivery to get the timer IRQ
6892 * to the first CPU.
6893 */
6894 entry.dest_mode = INT_DEST_MODE;
6895 - entry.mask = 0; /* unmask IRQ now */
6896 + entry.mask = 1; /* mask IRQ now */
6897 entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
6898 entry.delivery_mode = INT_DELIVERY_MODE;
6899 entry.polarity = 0;
6900 @@ -1384,17 +1385,14 @@ static void __init setup_ExtINT_IRQ0_pin
6901
6902 /*
6903 * The timer IRQ doesn't have to know that behind the
6904 - * scene we have a 8259A-master in AEOI mode ...
6905 + * scene we may have a 8259A-master in AEOI mode ...
6906 */
6907 - irq_desc[0].chip = &ioapic_chip;
6908 - set_irq_handler(0, handle_edge_irq);
6909 + ioapic_register_intr(0, vector, IOAPIC_EDGE);
6910
6911 /*
6912 * Add it to the IO-APIC irq-routing table:
6913 */
6914 ioapic_write_entry(apic, pin, entry);
6915 -
6916 - enable_8259A_irq(0);
6917 }
6918
6919 void __init print_IO_APIC(void)
6920 @@ -1409,10 +1407,10 @@ void __init print_IO_APIC(void)
6921 if (apic_verbosity == APIC_QUIET)
6922 return;
6923
6924 - printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
6925 + printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
6926 for (i = 0; i < nr_ioapics; i++)
6927 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
6928 - mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
6929 + mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]);
6930
6931 /*
6932 * We are a bit conservative about what we expect. We have to
6933 @@ -1431,7 +1429,7 @@ void __init print_IO_APIC(void)
6934 reg_03.raw = io_apic_read(apic, 3);
6935 spin_unlock_irqrestore(&ioapic_lock, flags);
6936
6937 - printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
6938 + printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid);
6939 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
6940 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
6941 printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
6942 @@ -1512,7 +1510,7 @@ void __init print_IO_APIC(void)
6943 return;
6944 }
6945
6946 -static void print_APIC_bitfield (int base)
6947 +static void print_APIC_bitfield(int base)
6948 {
6949 unsigned int v;
6950 int i, j;
6951 @@ -1533,7 +1531,7 @@ static void print_APIC_bitfield (int bas
6952 }
6953 }
6954
6955 -void /*__init*/ print_local_APIC(void * dummy)
6956 +void /*__init*/ print_local_APIC(void *dummy)
6957 {
6958 unsigned int v, ver, maxlvt;
6959
6960 @@ -1542,6 +1540,7 @@ void /*__init*/ print_local_APIC(void *
6961
6962 printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
6963 smp_processor_id(), hard_smp_processor_id());
6964 + v = apic_read(APIC_ID);
6965 printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v,
6966 GET_APIC_ID(read_apic_id()));
6967 v = apic_read(APIC_LVR);
6968 @@ -1616,9 +1615,9 @@ void /*__init*/ print_local_APIC(void *
6969 printk("\n");
6970 }
6971
6972 -void print_all_local_APICs (void)
6973 +void print_all_local_APICs(void)
6974 {
6975 - on_each_cpu(print_local_APIC, NULL, 1, 1);
6976 + on_each_cpu(print_local_APIC, NULL, 1);
6977 }
6978
6979 void /*__init*/ print_PIC(void)
6980 @@ -1639,11 +1638,11 @@ void /*__init*/ print_PIC(void)
6981 v = inb(0xa0) << 8 | inb(0x20);
6982 printk(KERN_DEBUG "... PIC IRR: %04x\n", v);
6983
6984 - outb(0x0b,0xa0);
6985 - outb(0x0b,0x20);
6986 + outb(0x0b, 0xa0);
6987 + outb(0x0b, 0x20);
6988 v = inb(0xa0) << 8 | inb(0x20);
6989 - outb(0x0a,0xa0);
6990 - outb(0x0a,0x20);
6991 + outb(0x0a, 0xa0);
6992 + outb(0x0a, 0x20);
6993
6994 spin_unlock_irqrestore(&i8259A_lock, flags);
6995
6996 @@ -1652,6 +1651,8 @@ void /*__init*/ print_PIC(void)
6997 v = inb(0x4d1) << 8 | inb(0x4d0);
6998 printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
6999 }
7000 +#else
7001 +void __init print_IO_APIC(void) {}
7002 #endif /* !CONFIG_XEN */
7003
7004 static void __init enable_IO_APIC(void)
7005 @@ -1681,7 +1682,7 @@ static void __init enable_IO_APIC(void)
7006 nr_ioapic_registers[apic] = reg_01.bits.entries+1;
7007 }
7008 #ifndef CONFIG_XEN
7009 - for(apic = 0; apic < nr_ioapics; apic++) {
7010 + for (apic = 0; apic < nr_ioapics; apic++) {
7011 int pin;
7012 /* See if any of the pins is in ExtINT mode */
7013 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
7014 @@ -1774,7 +1775,7 @@ void disable_IO_APIC(void)
7015 * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
7016 */
7017
7018 -#if !defined(CONFIG_XEN) && !defined(CONFIG_X86_NUMAQ)
7019 +#ifndef CONFIG_XEN
7020 static void __init setup_ioapic_ids_from_mpc(void)
7021 {
7022 union IO_APIC_reg_00 reg_00;
7023 @@ -1784,6 +1785,11 @@ static void __init setup_ioapic_ids_from
7024 unsigned char old_id;
7025 unsigned long flags;
7026
7027 +#ifdef CONFIG_X86_NUMAQ
7028 + if (found_numaq)
7029 + return;
7030 +#endif
7031 +
7032 /*
7033 * Don't check I/O APIC IDs for xAPIC systems. They have
7034 * no meaning without the serial APIC bus.
7035 @@ -1806,15 +1812,15 @@ static void __init setup_ioapic_ids_from
7036 spin_lock_irqsave(&ioapic_lock, flags);
7037 reg_00.raw = io_apic_read(apic, 0);
7038 spin_unlock_irqrestore(&ioapic_lock, flags);
7039 -
7040 - old_id = mp_ioapics[apic].mpc_apicid;
7041
7042 - if (mp_ioapics[apic].mpc_apicid >= get_physical_broadcast()) {
7043 + old_id = mp_ioapics[apic].mp_apicid;
7044 +
7045 + if (mp_ioapics[apic].mp_apicid >= get_physical_broadcast()) {
7046 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
7047 - apic, mp_ioapics[apic].mpc_apicid);
7048 + apic, mp_ioapics[apic].mp_apicid);
7049 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
7050 reg_00.bits.ID);
7051 - mp_ioapics[apic].mpc_apicid = reg_00.bits.ID;
7052 + mp_ioapics[apic].mp_apicid = reg_00.bits.ID;
7053 }
7054
7055 /*
7056 @@ -1823,9 +1829,9 @@ static void __init setup_ioapic_ids_from
7057 * 'stuck on smp_invalidate_needed IPI wait' messages.
7058 */
7059 if (check_apicid_used(phys_id_present_map,
7060 - mp_ioapics[apic].mpc_apicid)) {
7061 + mp_ioapics[apic].mp_apicid)) {
7062 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
7063 - apic, mp_ioapics[apic].mpc_apicid);
7064 + apic, mp_ioapics[apic].mp_apicid);
7065 for (i = 0; i < get_physical_broadcast(); i++)
7066 if (!physid_isset(i, phys_id_present_map))
7067 break;
7068 @@ -1834,13 +1840,13 @@ static void __init setup_ioapic_ids_from
7069 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
7070 i);
7071 physid_set(i, phys_id_present_map);
7072 - mp_ioapics[apic].mpc_apicid = i;
7073 + mp_ioapics[apic].mp_apicid = i;
7074 } else {
7075 physid_mask_t tmp;
7076 - tmp = apicid_to_cpu_present(mp_ioapics[apic].mpc_apicid);
7077 + tmp = apicid_to_cpu_present(mp_ioapics[apic].mp_apicid);
7078 apic_printk(APIC_VERBOSE, "Setting %d in the "
7079 "phys_id_present_map\n",
7080 - mp_ioapics[apic].mpc_apicid);
7081 + mp_ioapics[apic].mp_apicid);
7082 physids_or(phys_id_present_map, phys_id_present_map, tmp);
7083 }
7084
7085 @@ -1849,21 +1855,21 @@ static void __init setup_ioapic_ids_from
7086 * We need to adjust the IRQ routing table
7087 * if the ID changed.
7088 */
7089 - if (old_id != mp_ioapics[apic].mpc_apicid)
7090 + if (old_id != mp_ioapics[apic].mp_apicid)
7091 for (i = 0; i < mp_irq_entries; i++)
7092 - if (mp_irqs[i].mpc_dstapic == old_id)
7093 - mp_irqs[i].mpc_dstapic
7094 - = mp_ioapics[apic].mpc_apicid;
7095 + if (mp_irqs[i].mp_dstapic == old_id)
7096 + mp_irqs[i].mp_dstapic
7097 + = mp_ioapics[apic].mp_apicid;
7098
7099 /*
7100 * Read the right value from the MPC table and
7101 * write it into the ID register.
7102 - */
7103 + */
7104 apic_printk(APIC_VERBOSE, KERN_INFO
7105 "...changing IO-APIC physical APIC ID to %d ...",
7106 - mp_ioapics[apic].mpc_apicid);
7107 + mp_ioapics[apic].mp_apicid);
7108
7109 - reg_00.bits.ID = mp_ioapics[apic].mpc_apicid;
7110 + reg_00.bits.ID = mp_ioapics[apic].mp_apicid;
7111 spin_lock_irqsave(&ioapic_lock, flags);
7112 io_apic_write(apic, 0, reg_00.raw);
7113 spin_unlock_irqrestore(&ioapic_lock, flags);
7114 @@ -1874,17 +1880,13 @@ static void __init setup_ioapic_ids_from
7115 spin_lock_irqsave(&ioapic_lock, flags);
7116 reg_00.raw = io_apic_read(apic, 0);
7117 spin_unlock_irqrestore(&ioapic_lock, flags);
7118 - if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid)
7119 + if (reg_00.bits.ID != mp_ioapics[apic].mp_apicid)
7120 printk("could not set ID!\n");
7121 else
7122 apic_printk(APIC_VERBOSE, " ok.\n");
7123 }
7124 }
7125 -#else
7126 -static void __init setup_ioapic_ids_from_mpc(void) { }
7127 -#endif
7128
7129 -#ifndef CONFIG_XEN
7130 int no_timer_check __initdata;
7131
7132 static int __init notimercheck(char *s)
7133 @@ -2077,45 +2079,53 @@ static inline void init_IO_APIC_traps(vo
7134 * The local APIC irq-chip implementation:
7135 */
7136
7137 -static void ack_apic(unsigned int irq)
7138 +static void ack_lapic_irq(unsigned int irq)
7139 {
7140 ack_APIC_irq();
7141 }
7142
7143 -static void mask_lapic_irq (unsigned int irq)
7144 +static void mask_lapic_irq(unsigned int irq)
7145 {
7146 unsigned long v;
7147
7148 v = apic_read(APIC_LVT0);
7149 - apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
7150 + apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
7151 }
7152
7153 -static void unmask_lapic_irq (unsigned int irq)
7154 +static void unmask_lapic_irq(unsigned int irq)
7155 {
7156 unsigned long v;
7157
7158 v = apic_read(APIC_LVT0);
7159 - apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED);
7160 + apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
7161 }
7162
7163 static struct irq_chip lapic_chip __read_mostly = {
7164 - .name = "local-APIC-edge",
7165 + .name = "local-APIC",
7166 .mask = mask_lapic_irq,
7167 .unmask = unmask_lapic_irq,
7168 - .eoi = ack_apic,
7169 + .ack = ack_lapic_irq,
7170 };
7171
7172 +static void lapic_register_intr(int irq, int vector)
7173 +{
7174 + irq_desc[irq].status &= ~IRQ_LEVEL;
7175 + set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
7176 + "edge");
7177 + set_intr_gate(vector, interrupt[irq]);
7178 +}
7179 +
7180 static void __init setup_nmi(void)
7181 {
7182 /*
7183 - * Dirty trick to enable the NMI watchdog ...
7184 + * Dirty trick to enable the NMI watchdog ...
7185 * We put the 8259A master into AEOI mode and
7186 * unmask on all local APICs LVT0 as NMI.
7187 *
7188 * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
7189 * is from Maciej W. Rozycki - so we do not have to EOI from
7190 * the NMI handler or the timer interrupt.
7191 - */
7192 + */
7193 apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
7194
7195 enable_NMI_through_LVT0();
7196 @@ -2191,11 +2201,16 @@ static inline void __init unlock_ExtINT_
7197 static inline void __init check_timer(void)
7198 {
7199 int apic1, pin1, apic2, pin2;
7200 + int no_pin1 = 0;
7201 int vector;
7202 + unsigned int ver;
7203 unsigned long flags;
7204
7205 local_irq_save(flags);
7206
7207 + ver = apic_read(APIC_LVR);
7208 + ver = GET_APIC_VERSION(ver);
7209 +
7210 /*
7211 * get/set the timer IRQ vector:
7212 */
7213 @@ -2204,34 +2219,54 @@ static inline void __init check_timer(vo
7214 set_intr_gate(vector, interrupt[0]);
7215
7216 /*
7217 - * Subtle, code in do_timer_interrupt() expects an AEOI
7218 - * mode for the 8259A whenever interrupts are routed
7219 - * through I/O APICs. Also IRQ0 has to be enabled in
7220 - * the 8259A which implies the virtual wire has to be
7221 - * disabled in the local APIC.
7222 + * As IRQ0 is to be enabled in the 8259A, the virtual
7223 + * wire has to be disabled in the local APIC. Also
7224 + * timer interrupts need to be acknowledged manually in
7225 + * the 8259A for the i82489DX when using the NMI
7226 + * watchdog as that APIC treats NMIs as level-triggered.
7227 + * The AEOI mode will finish them in the 8259A
7228 + * automatically.
7229 */
7230 - apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
7231 + apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
7232 init_8259A(1);
7233 - timer_ack = 1;
7234 - if (timer_over_8254 > 0)
7235 - enable_8259A_irq(0);
7236 + timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
7237
7238 pin1 = find_isa_irq_pin(0, mp_INT);
7239 apic1 = find_isa_irq_apic(0, mp_INT);
7240 pin2 = ioapic_i8259.pin;
7241 apic2 = ioapic_i8259.apic;
7242
7243 - printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
7244 - vector, apic1, pin1, apic2, pin2);
7245 + apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X "
7246 + "apic1=%d pin1=%d apic2=%d pin2=%d\n",
7247 + vector, apic1, pin1, apic2, pin2);
7248 +
7249 + /*
7250 + * Some BIOS writers are clueless and report the ExtINTA
7251 + * I/O APIC input from the cascaded 8259A as the timer
7252 + * interrupt input. So just in case, if only one pin
7253 + * was found above, try it both directly and through the
7254 + * 8259A.
7255 + */
7256 + if (pin1 == -1) {
7257 + pin1 = pin2;
7258 + apic1 = apic2;
7259 + no_pin1 = 1;
7260 + } else if (pin2 == -1) {
7261 + pin2 = pin1;
7262 + apic2 = apic1;
7263 + }
7264
7265 if (pin1 != -1) {
7266 /*
7267 * Ok, does IRQ0 through the IOAPIC work?
7268 */
7269 + if (no_pin1) {
7270 + add_pin_to_irq(0, apic1, pin1);
7271 + setup_timer_IRQ0_pin(apic1, pin1, vector);
7272 + }
7273 unmask_IO_APIC_irq(0);
7274 if (timer_irq_works()) {
7275 if (nmi_watchdog == NMI_IO_APIC) {
7276 - disable_8259A_irq(0);
7277 setup_nmi();
7278 enable_8259A_irq(0);
7279 }
7280 @@ -2240,71 +2275,77 @@ static inline void __init check_timer(vo
7281 goto out;
7282 }
7283 clear_IO_APIC_pin(apic1, pin1);
7284 - printk(KERN_ERR "..MP-BIOS bug: 8254 timer not connected to "
7285 - "IO-APIC\n");
7286 - }
7287 -
7288 - printk(KERN_INFO "...trying to set up timer (IRQ0) through the 8259A ... ");
7289 - if (pin2 != -1) {
7290 - printk("\n..... (found pin %d) ...", pin2);
7291 + if (!no_pin1)
7292 + apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
7293 + "8254 timer not connected to IO-APIC\n");
7294 +
7295 + apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer "
7296 + "(IRQ0) through the 8259A ...\n");
7297 + apic_printk(APIC_QUIET, KERN_INFO
7298 + "..... (found apic %d pin %d) ...\n", apic2, pin2);
7299 /*
7300 * legacy devices should be connected to IO APIC #0
7301 */
7302 - setup_ExtINT_IRQ0_pin(apic2, pin2, vector);
7303 + replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
7304 + setup_timer_IRQ0_pin(apic2, pin2, vector);
7305 + unmask_IO_APIC_irq(0);
7306 + enable_8259A_irq(0);
7307 if (timer_irq_works()) {
7308 - printk("works.\n");
7309 - if (pin1 != -1)
7310 - replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
7311 - else
7312 - add_pin_to_irq(0, apic2, pin2);
7313 + apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
7314 + timer_through_8259 = 1;
7315 if (nmi_watchdog == NMI_IO_APIC) {
7316 + disable_8259A_irq(0);
7317 setup_nmi();
7318 + enable_8259A_irq(0);
7319 }
7320 goto out;
7321 }
7322 /*
7323 * Cleanup, just in case ...
7324 */
7325 + disable_8259A_irq(0);
7326 clear_IO_APIC_pin(apic2, pin2);
7327 + apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
7328 }
7329 - printk(" failed.\n");
7330
7331 if (nmi_watchdog == NMI_IO_APIC) {
7332 - printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
7333 - nmi_watchdog = 0;
7334 + apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work "
7335 + "through the IO-APIC - disabling NMI Watchdog!\n");
7336 + nmi_watchdog = NMI_NONE;
7337 }
7338 + timer_ack = 0;
7339
7340 - printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
7341 + apic_printk(APIC_QUIET, KERN_INFO
7342 + "...trying to set up timer as Virtual Wire IRQ...\n");
7343
7344 - disable_8259A_irq(0);
7345 - set_irq_chip_and_handler_name(0, &lapic_chip, handle_fasteoi_irq,
7346 - "fasteoi");
7347 - apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
7348 + lapic_register_intr(0, vector);
7349 + apic_write(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
7350 enable_8259A_irq(0);
7351
7352 if (timer_irq_works()) {
7353 - printk(" works.\n");
7354 + apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
7355 goto out;
7356 }
7357 - apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
7358 - printk(" failed.\n");
7359 + disable_8259A_irq(0);
7360 + apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
7361 + apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
7362
7363 - printk(KERN_INFO "...trying to set up timer as ExtINT IRQ...");
7364 + apic_printk(APIC_QUIET, KERN_INFO
7365 + "...trying to set up timer as ExtINT IRQ...\n");
7366
7367 - timer_ack = 0;
7368 init_8259A(0);
7369 make_8259A_irq(0);
7370 - apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
7371 + apic_write(APIC_LVT0, APIC_DM_EXTINT);
7372
7373 unlock_ExtINT_logic();
7374
7375 if (timer_irq_works()) {
7376 - printk(" works.\n");
7377 + apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
7378 goto out;
7379 }
7380 - printk(" failed :(.\n");
7381 + apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
7382 panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
7383 - "report. Then try booting with the 'noapic' option");
7384 + "report. Then try booting with the 'noapic' option.\n");
7385 out:
7386 local_irq_restore(flags);
7387 }
7388 @@ -2314,11 +2355,21 @@ int timer_uses_ioapic_pin_0 = 0;
7389 #endif
7390
7391 /*
7392 - *
7393 - * IRQ's that are handled by the PIC in the MPS IOAPIC case.
7394 - * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ.
7395 - * Linux doesn't really care, as it's not actually used
7396 - * for any interrupt handling anyway.
7397 + * Traditionally ISA IRQ2 is the cascade IRQ, and is not available
7398 + * to devices. However there may be an I/O APIC pin available for
7399 + * this interrupt regardless. The pin may be left unconnected, but
7400 + * typically it will be reused as an ExtINT cascade interrupt for
7401 + * the master 8259A. In the MPS case such a pin will normally be
7402 + * reported as an ExtINT interrupt in the MP table. With ACPI
7403 + * there is no provision for ExtINT interrupts, and in the absence
7404 + * of an override it would be treated as an ordinary ISA I/O APIC
7405 + * interrupt, that is edge-triggered and unmasked by default. We
7406 + * used to do this, but it caused problems on some systems because
7407 + * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using
7408 + * the same ExtINT cascade interrupt to drive the local APIC of the
7409 + * bootstrap processor. Therefore we refrain from routing IRQ2 to
7410 + * the I/O APIC in all cases now. No actual device should request
7411 + * it anyway. --macro
7412 */
7413 #define PIC_IRQS (1 << PIC_CASCADE_IR)
7414
7415 @@ -2328,25 +2379,22 @@ void __init setup_IO_APIC(void)
7416 int i;
7417
7418 /* Reserve all the system vectors. */
7419 - for (i = FIRST_SYSTEM_VECTOR; i < NR_VECTORS; i++)
7420 + for (i = first_system_vector; i < NR_VECTORS; i++)
7421 set_bit(i, used_vectors);
7422 #endif
7423
7424 enable_IO_APIC();
7425
7426 - if (acpi_ioapic)
7427 - io_apic_irqs = ~0; /* all IRQs go through IOAPIC */
7428 - else
7429 - io_apic_irqs = ~PIC_IRQS;
7430 + io_apic_irqs = ~PIC_IRQS;
7431
7432 printk("ENABLING IO-APIC IRQs\n");
7433
7434 +#ifndef CONFIG_XEN
7435 /*
7436 * Set up IO-APIC IRQ routing.
7437 */
7438 if (!acpi_ioapic)
7439 setup_ioapic_ids_from_mpc();
7440 -#ifndef CONFIG_XEN
7441 sync_Arb_IDs();
7442 #endif
7443 setup_IO_APIC_irqs();
7444 @@ -2356,28 +2404,14 @@ void __init setup_IO_APIC(void)
7445 print_IO_APIC();
7446 }
7447
7448 -static int __init setup_disable_8254_timer(char *s)
7449 -{
7450 - timer_over_8254 = -1;
7451 - return 1;
7452 -}
7453 -static int __init setup_enable_8254_timer(char *s)
7454 -{
7455 - timer_over_8254 = 2;
7456 - return 1;
7457 -}
7458 -
7459 -__setup("disable_8254_timer", setup_disable_8254_timer);
7460 -__setup("enable_8254_timer", setup_enable_8254_timer);
7461 -
7462 /*
7463 * Called after all the initialization is done. If we didnt find any
7464 * APIC bugs then we can allow the modify fast path
7465 */
7466 -
7467 +
7468 static int __init io_apic_bug_finalize(void)
7469 {
7470 - if(sis_apic_bug == -1)
7471 + if (sis_apic_bug == -1)
7472 sis_apic_bug = 0;
7473 if (is_initial_xendomain()) {
7474 struct xen_platform_op op = { .cmd = XENPF_platform_quirk };
7475 @@ -2396,17 +2430,17 @@ struct sysfs_ioapic_data {
7476 struct sys_device dev;
7477 struct IO_APIC_route_entry entry[0];
7478 };
7479 -static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS];
7480 +static struct sysfs_ioapic_data *mp_ioapic_data[MAX_IO_APICS];
7481
7482 static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
7483 {
7484 struct IO_APIC_route_entry *entry;
7485 struct sysfs_ioapic_data *data;
7486 int i;
7487 -
7488 +
7489 data = container_of(dev, struct sysfs_ioapic_data, dev);
7490 entry = data->entry;
7491 - for (i = 0; i < nr_ioapic_registers[dev->id]; i ++)
7492 + for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
7493 entry[i] = ioapic_read_entry(dev->id, i);
7494
7495 return 0;
7496 @@ -2419,18 +2453,18 @@ static int ioapic_resume(struct sys_devi
7497 unsigned long flags;
7498 union IO_APIC_reg_00 reg_00;
7499 int i;
7500 -
7501 +
7502 data = container_of(dev, struct sysfs_ioapic_data, dev);
7503 entry = data->entry;
7504
7505 spin_lock_irqsave(&ioapic_lock, flags);
7506 reg_00.raw = io_apic_read(dev->id, 0);
7507 - if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
7508 - reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
7509 + if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) {
7510 + reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid;
7511 io_apic_write(dev->id, 0, reg_00.raw);
7512 }
7513 spin_unlock_irqrestore(&ioapic_lock, flags);
7514 - for (i = 0; i < nr_ioapic_registers[dev->id]; i ++)
7515 + for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
7516 ioapic_write_entry(dev->id, i, entry[i]);
7517
7518 return 0;
7519 @@ -2444,24 +2478,23 @@ static struct sysdev_class ioapic_sysdev
7520
7521 static int __init ioapic_init_sysfs(void)
7522 {
7523 - struct sys_device * dev;
7524 + struct sys_device *dev;
7525 int i, size, error = 0;
7526
7527 error = sysdev_class_register(&ioapic_sysdev_class);
7528 if (error)
7529 return error;
7530
7531 - for (i = 0; i < nr_ioapics; i++ ) {
7532 - size = sizeof(struct sys_device) + nr_ioapic_registers[i]
7533 + for (i = 0; i < nr_ioapics; i++) {
7534 + size = sizeof(struct sys_device) + nr_ioapic_registers[i]
7535 * sizeof(struct IO_APIC_route_entry);
7536 - mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL);
7537 + mp_ioapic_data[i] = kzalloc(size, GFP_KERNEL);
7538 if (!mp_ioapic_data[i]) {
7539 printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
7540 continue;
7541 }
7542 - memset(mp_ioapic_data[i], 0, size);
7543 dev = &mp_ioapic_data[i]->dev;
7544 - dev->id = i;
7545 + dev->id = i;
7546 dev->cls = &ioapic_sysdev_class;
7547 error = sysdev_register(dev);
7548 if (error) {
7549 @@ -2538,7 +2571,7 @@ static int msi_compose_msg(struct pci_de
7550 msg->address_lo =
7551 MSI_ADDR_BASE_LO |
7552 ((INT_DEST_MODE == 0) ?
7553 - MSI_ADDR_DEST_MODE_PHYSICAL:
7554 +MSI_ADDR_DEST_MODE_PHYSICAL:
7555 MSI_ADDR_DEST_MODE_LOGICAL) |
7556 ((INT_DELIVERY_MODE != dest_LowestPrio) ?
7557 MSI_ADDR_REDIRECTION_CPU:
7558 @@ -2549,7 +2582,7 @@ static int msi_compose_msg(struct pci_de
7559 MSI_DATA_TRIGGER_EDGE |
7560 MSI_DATA_LEVEL_ASSERT |
7561 ((INT_DELIVERY_MODE != dest_LowestPrio) ?
7562 - MSI_DATA_DELIVERY_FIXED:
7563 +MSI_DATA_DELIVERY_FIXED:
7564 MSI_DATA_DELIVERY_LOWPRI) |
7565 MSI_DATA_VECTOR(vector);
7566 }
7567 @@ -2720,12 +2753,12 @@ int arch_setup_ht_irq(unsigned int irq,
7568 #endif /* CONFIG_HT_IRQ */
7569
7570 /* --------------------------------------------------------------------------
7571 - ACPI-based IOAPIC Configuration
7572 + ACPI-based IOAPIC Configuration
7573 -------------------------------------------------------------------------- */
7574
7575 #ifdef CONFIG_ACPI
7576
7577 -int __init io_apic_get_unique_id (int ioapic, int apic_id)
7578 +int __init io_apic_get_unique_id(int ioapic, int apic_id)
7579 {
7580 #ifndef CONFIG_XEN
7581 union IO_APIC_reg_00 reg_00;
7582 @@ -2735,10 +2768,10 @@ int __init io_apic_get_unique_id (int io
7583 int i = 0;
7584
7585 /*
7586 - * The P4 platform supports up to 256 APIC IDs on two separate APIC
7587 - * buses (one for LAPICs, one for IOAPICs), where predecessors only
7588 + * The P4 platform supports up to 256 APIC IDs on two separate APIC
7589 + * buses (one for LAPICs, one for IOAPICs), where predecessors only
7590 * supports up to 16 on one shared APIC bus.
7591 - *
7592 + *
7593 * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
7594 * advantage of new APIC bus architecture.
7595 */
7596 @@ -2757,7 +2790,7 @@ int __init io_apic_get_unique_id (int io
7597 }
7598
7599 /*
7600 - * Every APIC in a system must have a unique ID or we get lots of nice
7601 + * Every APIC in a system must have a unique ID or we get lots of nice
7602 * 'stuck on smp_invalidate_needed IPI wait' messages.
7603 */
7604 if (check_apicid_used(apic_id_map, apic_id)) {
7605 @@ -2774,7 +2807,7 @@ int __init io_apic_get_unique_id (int io
7606 "trying %d\n", ioapic, apic_id, i);
7607
7608 apic_id = i;
7609 - }
7610 + }
7611
7612 tmp = apicid_to_cpu_present(apic_id);
7613 physids_or(apic_id_map, apic_id_map, tmp);
7614 @@ -2802,7 +2835,7 @@ int __init io_apic_get_unique_id (int io
7615 }
7616
7617
7618 -int __init io_apic_get_version (int ioapic)
7619 +int __init io_apic_get_version(int ioapic)
7620 {
7621 union IO_APIC_reg_01 reg_01;
7622 unsigned long flags;
7623 @@ -2815,7 +2848,7 @@ int __init io_apic_get_version (int ioap
7624 }
7625
7626
7627 -int __init io_apic_get_redir_entries (int ioapic)
7628 +int __init io_apic_get_redir_entries(int ioapic)
7629 {
7630 union IO_APIC_reg_01 reg_01;
7631 unsigned long flags;
7632 @@ -2828,7 +2861,7 @@ int __init io_apic_get_redir_entries (in
7633 }
7634
7635
7636 -int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low)
7637 +int io_apic_set_pci_routing(int ioapic, int pin, int irq, int edge_level, int active_high_low)
7638 {
7639 struct IO_APIC_route_entry entry;
7640
7641 @@ -2844,7 +2877,7 @@ int io_apic_set_pci_routing (int ioapic,
7642 * corresponding device driver registers for this IRQ.
7643 */
7644
7645 - memset(&entry,0,sizeof(entry));
7646 + memset(&entry, 0, sizeof(entry));
7647
7648 entry.delivery_mode = INT_DELIVERY_MODE;
7649 entry.dest_mode = INT_DEST_MODE;
7650 @@ -2863,7 +2896,7 @@ int io_apic_set_pci_routing (int ioapic,
7651
7652 apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry "
7653 "(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic,
7654 - mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq,
7655 + mp_ioapics[ioapic].mp_apicid, pin, entry.vector, irq,
7656 edge_level, active_high_low);
7657
7658 ioapic_register_intr(irq, entry.vector, edge_level);
7659 @@ -2884,8 +2917,8 @@ int acpi_get_override_irq(int bus_irq, i
7660 return -1;
7661
7662 for (i = 0; i < mp_irq_entries; i++)
7663 - if (mp_irqs[i].mpc_irqtype == mp_INT &&
7664 - mp_irqs[i].mpc_srcbusirq == bus_irq)
7665 + if (mp_irqs[i].mp_irqtype == mp_INT &&
7666 + mp_irqs[i].mp_srcbusirq == bus_irq)
7667 break;
7668 if (i >= mp_irq_entries)
7669 return -1;
7670 @@ -2918,3 +2951,35 @@ static int __init parse_noapic(char *arg
7671 return 0;
7672 }
7673 early_param("noapic", parse_noapic);
7674 +
7675 +#ifndef CONFIG_XEN
7676 +void __init ioapic_init_mappings(void)
7677 +{
7678 + unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
7679 + int i;
7680 +
7681 + for (i = 0; i < nr_ioapics; i++) {
7682 + if (smp_found_config) {
7683 + ioapic_phys = mp_ioapics[i].mp_apicaddr;
7684 + if (!ioapic_phys) {
7685 + printk(KERN_ERR
7686 + "WARNING: bogus zero IO-APIC "
7687 + "address found in MPTABLE, "
7688 + "disabling IO/APIC support!\n");
7689 + smp_found_config = 0;
7690 + skip_ioapic_setup = 1;
7691 + goto fake_ioapic_page;
7692 + }
7693 + } else {
7694 +fake_ioapic_page:
7695 + ioapic_phys = (unsigned long)
7696 + alloc_bootmem_pages(PAGE_SIZE);
7697 + ioapic_phys = __pa(ioapic_phys);
7698 + }
7699 + set_fixmap_nocache(idx, ioapic_phys);
7700 + printk(KERN_DEBUG "mapped IOAPIC to %08lx (%08lx)\n",
7701 + __fix_to_virt(idx), ioapic_phys);
7702 + idx++;
7703 + }
7704 +}
7705 +#endif
7706 --- sle11-2009-10-16.orig/arch/x86/kernel/io_apic_64-xen.c 2009-03-16 16:38:05.000000000 +0100
7707 +++ sle11-2009-10-16/arch/x86/kernel/io_apic_64-xen.c 2009-06-04 10:21:39.000000000 +0200
7708 @@ -45,6 +45,7 @@
7709 #include <asm/proto.h>
7710 #include <asm/acpi.h>
7711 #include <asm/dma.h>
7712 +#include <asm/i8259.h>
7713 #include <asm/nmi.h>
7714 #include <asm/msidef.h>
7715 #include <asm/hypertransport.h>
7716 @@ -63,10 +64,16 @@ struct irq_cfg {
7717 };
7718
7719 /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
7720 -struct irq_cfg irq_cfg[NR_IRQS] __read_mostly;
7721 +static struct irq_cfg irq_cfg[NR_IRQS] __read_mostly;
7722
7723 static int assign_irq_vector(int irq, cpumask_t mask);
7724
7725 +#ifndef CONFIG_XEN
7726 +int first_system_vector = 0xfe;
7727 +
7728 +char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE};
7729 +#endif
7730 +
7731 #define __apicdebuginit __init
7732
7733 int sis_apic_bug; /* not actually supported, dummy for compile */
7734 @@ -89,14 +96,14 @@ unsigned long io_apic_irqs;
7735
7736 #define clear_IO_APIC() ((void)0)
7737 #else
7738 -int timer_over_8254 __initdata = 1;
7739 +int timer_through_8259 __initdata;
7740
7741 /* Where if anywhere is the i8259 connect in external int mode */
7742 static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
7743 #endif
7744
7745 static DEFINE_SPINLOCK(ioapic_lock);
7746 -DEFINE_SPINLOCK(vector_lock);
7747 +static DEFINE_SPINLOCK(vector_lock);
7748
7749 /*
7750 * # of IRQ routing registers
7751 @@ -104,15 +111,17 @@ DEFINE_SPINLOCK(vector_lock);
7752 int nr_ioapic_registers[MAX_IO_APICS];
7753
7754 /* I/O APIC entries */
7755 -struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
7756 +struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
7757 int nr_ioapics;
7758
7759 /* MP IRQ source entries */
7760 -struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
7761 +struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
7762
7763 /* # of MP IRQ source entries */
7764 int mp_irq_entries;
7765
7766 +DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
7767 +
7768 /*
7769 * Rough estimation of how many shared IRQs there are, can
7770 * be changed anytime.
7771 @@ -141,7 +150,7 @@ struct io_apic {
7772 static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
7773 {
7774 return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
7775 - + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK);
7776 + + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK);
7777 }
7778 #endif
7779
7780 @@ -155,7 +164,7 @@ static inline unsigned int io_apic_read(
7781 struct physdev_apic apic_op;
7782 int ret;
7783
7784 - apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
7785 + apic_op.apic_physbase = mp_ioapics[apic].mp_apicaddr;
7786 apic_op.reg = reg;
7787 ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op);
7788 if (ret)
7789 @@ -173,7 +182,7 @@ static inline void io_apic_write(unsigne
7790 #else
7791 struct physdev_apic apic_op;
7792
7793 - apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
7794 + apic_op.apic_physbase = mp_ioapics[apic].mp_apicaddr;
7795 apic_op.reg = reg;
7796 apic_op.value = value;
7797 WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op));
7798 @@ -209,7 +218,7 @@ static bool io_apic_level_ack_pending(un
7799 break;
7800 reg = io_apic_read(entry->apic, 0x10 + pin*2);
7801 /* Is the remote IRR bit set? */
7802 - if ((reg >> 14) & 1) {
7803 + if (reg & IO_APIC_REDIR_REMOTE_IRR) {
7804 spin_unlock_irqrestore(&ioapic_lock, flags);
7805 return true;
7806 }
7807 @@ -308,7 +317,7 @@ static void __target_IO_APIC_irq(unsigne
7808 break;
7809 io_apic_write(apic, 0x11 + pin*2, dest);
7810 reg = io_apic_read(apic, 0x10 + pin*2);
7811 - reg &= ~0x000000ff;
7812 + reg &= ~IO_APIC_REDIR_VECTOR_MASK;
7813 reg |= vector;
7814 io_apic_modify(apic, reg);
7815 if (!entry->next)
7816 @@ -372,6 +381,26 @@ static void add_pin_to_irq(unsigned int
7817 }
7818
7819 #ifndef CONFIG_XEN
7820 +/*
7821 + * Reroute an IRQ to a different pin.
7822 + */
7823 +static void __init replace_pin_at_irq(unsigned int irq,
7824 + int oldapic, int oldpin,
7825 + int newapic, int newpin)
7826 +{
7827 + struct irq_pin_list *entry = irq_2_pin + irq;
7828 +
7829 + while (1) {
7830 + if (entry->apic == oldapic && entry->pin == oldpin) {
7831 + entry->apic = newapic;
7832 + entry->pin = newpin;
7833 + }
7834 + if (!entry->next)
7835 + break;
7836 + entry = irq_2_pin + entry->next;
7837 + }
7838 +}
7839 +
7840 #define __DO_ACTION(R, ACTION, FINAL) \
7841 \
7842 { \
7843 @@ -399,10 +428,11 @@ static void add_pin_to_irq(unsigned int
7844 static void name##_IO_APIC_irq (unsigned int irq) \
7845 __DO_ACTION(R, ACTION, FINAL)
7846
7847 -DO_ACTION( __mask, 0, |= 0x00010000, io_apic_sync(entry->apic) )
7848 - /* mask = 1 */
7849 -DO_ACTION( __unmask, 0, &= 0xfffeffff, )
7850 - /* mask = 0 */
7851 +/* mask = 1 */
7852 +DO_ACTION(__mask, 0, |= IO_APIC_REDIR_MASKED, io_apic_sync(entry->apic))
7853 +
7854 +/* mask = 0 */
7855 +DO_ACTION(__unmask, 0, &= ~IO_APIC_REDIR_MASKED, )
7856
7857 static void mask_IO_APIC_irq (unsigned int irq)
7858 {
7859 @@ -465,22 +495,6 @@ static int __init disable_timer_pin_setu
7860 }
7861 __setup("disable_timer_pin_1", disable_timer_pin_setup);
7862
7863 -#ifndef CONFIG_XEN
7864 -static int __init setup_disable_8254_timer(char *s)
7865 -{
7866 - timer_over_8254 = -1;
7867 - return 1;
7868 -}
7869 -static int __init setup_enable_8254_timer(char *s)
7870 -{
7871 - timer_over_8254 = 2;
7872 - return 1;
7873 -}
7874 -
7875 -__setup("disable_8254_timer", setup_disable_8254_timer);
7876 -__setup("enable_8254_timer", setup_enable_8254_timer);
7877 -#endif /* !CONFIG_XEN */
7878 -
7879
7880 /*
7881 * Find the IRQ entry number of a certain pin.
7882 @@ -490,10 +504,10 @@ static int find_irq_entry(int apic, int
7883 int i;
7884
7885 for (i = 0; i < mp_irq_entries; i++)
7886 - if (mp_irqs[i].mpc_irqtype == type &&
7887 - (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
7888 - mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
7889 - mp_irqs[i].mpc_dstirq == pin)
7890 + if (mp_irqs[i].mp_irqtype == type &&
7891 + (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid ||
7892 + mp_irqs[i].mp_dstapic == MP_APIC_ALL) &&
7893 + mp_irqs[i].mp_dstirq == pin)
7894 return i;
7895
7896 return -1;
7897 @@ -508,13 +522,13 @@ static int __init find_isa_irq_pin(int i
7898 int i;
7899
7900 for (i = 0; i < mp_irq_entries; i++) {
7901 - int lbus = mp_irqs[i].mpc_srcbus;
7902 + int lbus = mp_irqs[i].mp_srcbus;
7903
7904 if (test_bit(lbus, mp_bus_not_pci) &&
7905 - (mp_irqs[i].mpc_irqtype == type) &&
7906 - (mp_irqs[i].mpc_srcbusirq == irq))
7907 + (mp_irqs[i].mp_irqtype == type) &&
7908 + (mp_irqs[i].mp_srcbusirq == irq))
7909
7910 - return mp_irqs[i].mpc_dstirq;
7911 + return mp_irqs[i].mp_dstirq;
7912 }
7913 return -1;
7914 }
7915 @@ -524,17 +538,17 @@ static int __init find_isa_irq_apic(int
7916 int i;
7917
7918 for (i = 0; i < mp_irq_entries; i++) {
7919 - int lbus = mp_irqs[i].mpc_srcbus;
7920 + int lbus = mp_irqs[i].mp_srcbus;
7921
7922 if (test_bit(lbus, mp_bus_not_pci) &&
7923 - (mp_irqs[i].mpc_irqtype == type) &&
7924 - (mp_irqs[i].mpc_srcbusirq == irq))
7925 + (mp_irqs[i].mp_irqtype == type) &&
7926 + (mp_irqs[i].mp_srcbusirq == irq))
7927 break;
7928 }
7929 if (i < mp_irq_entries) {
7930 int apic;
7931 for(apic = 0; apic < nr_ioapics; apic++) {
7932 - if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic)
7933 + if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic)
7934 return apic;
7935 }
7936 }
7937 @@ -555,28 +569,28 @@ int IO_APIC_get_PCI_irq_vector(int bus,
7938
7939 apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
7940 bus, slot, pin);
7941 - if (mp_bus_id_to_pci_bus[bus] == -1) {
7942 + if (test_bit(bus, mp_bus_not_pci)) {
7943 apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
7944 return -1;
7945 }
7946 for (i = 0; i < mp_irq_entries; i++) {
7947 - int lbus = mp_irqs[i].mpc_srcbus;
7948 + int lbus = mp_irqs[i].mp_srcbus;
7949
7950 for (apic = 0; apic < nr_ioapics; apic++)
7951 - if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
7952 - mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
7953 + if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic ||
7954 + mp_irqs[i].mp_dstapic == MP_APIC_ALL)
7955 break;
7956
7957 if (!test_bit(lbus, mp_bus_not_pci) &&
7958 - !mp_irqs[i].mpc_irqtype &&
7959 + !mp_irqs[i].mp_irqtype &&
7960 (bus == lbus) &&
7961 - (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
7962 - int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
7963 + (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) {
7964 + int irq = pin_2_irq(i,apic,mp_irqs[i].mp_dstirq);
7965
7966 if (!(apic || IO_APIC_IRQ(irq)))
7967 continue;
7968
7969 - if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
7970 + if (pin == (mp_irqs[i].mp_srcbusirq & 3))
7971 return irq;
7972 /*
7973 * Use the first all-but-pin matching entry as a
7974 @@ -604,13 +618,13 @@ int IO_APIC_get_PCI_irq_vector(int bus,
7975
7976 static int MPBIOS_polarity(int idx)
7977 {
7978 - int bus = mp_irqs[idx].mpc_srcbus;
7979 + int bus = mp_irqs[idx].mp_srcbus;
7980 int polarity;
7981
7982 /*
7983 * Determine IRQ line polarity (high active or low active):
7984 */
7985 - switch (mp_irqs[idx].mpc_irqflag & 3)
7986 + switch (mp_irqs[idx].mp_irqflag & 3)
7987 {
7988 case 0: /* conforms, ie. bus-type dependent polarity */
7989 if (test_bit(bus, mp_bus_not_pci))
7990 @@ -646,13 +660,13 @@ static int MPBIOS_polarity(int idx)
7991
7992 static int MPBIOS_trigger(int idx)
7993 {
7994 - int bus = mp_irqs[idx].mpc_srcbus;
7995 + int bus = mp_irqs[idx].mp_srcbus;
7996 int trigger;
7997
7998 /*
7999 * Determine IRQ trigger mode (edge or level sensitive):
8000 */
8001 - switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
8002 + switch ((mp_irqs[idx].mp_irqflag>>2) & 3)
8003 {
8004 case 0: /* conforms, ie. bus-type dependent */
8005 if (test_bit(bus, mp_bus_not_pci))
8006 @@ -699,16 +713,16 @@ static inline int irq_trigger(int idx)
8007 static int pin_2_irq(int idx, int apic, int pin)
8008 {
8009 int irq, i;
8010 - int bus = mp_irqs[idx].mpc_srcbus;
8011 + int bus = mp_irqs[idx].mp_srcbus;
8012
8013 /*
8014 * Debugging check, we are in big trouble if this message pops up!
8015 */
8016 - if (mp_irqs[idx].mpc_dstirq != pin)
8017 + if (mp_irqs[idx].mp_dstirq != pin)
8018 printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
8019
8020 if (test_bit(bus, mp_bus_not_pci)) {
8021 - irq = mp_irqs[idx].mpc_srcbusirq;
8022 + irq = mp_irqs[idx].mp_srcbusirq;
8023 } else {
8024 /*
8025 * PCI IRQs are mapped in order
8026 @@ -722,6 +736,19 @@ static int pin_2_irq(int idx, int apic,
8027 return irq;
8028 }
8029
8030 +void lock_vector_lock(void)
8031 +{
8032 + /* Used to the online set of cpus does not change
8033 + * during assign_irq_vector.
8034 + */
8035 + spin_lock(&vector_lock);
8036 +}
8037 +
8038 +void unlock_vector_lock(void)
8039 +{
8040 + spin_unlock(&vector_lock);
8041 +}
8042 +
8043 static int __assign_irq_vector(int irq, cpumask_t mask)
8044 {
8045 struct physdev_irq irq_op;
8046 @@ -773,7 +800,7 @@ static void __clear_irq_vector(int irq)
8047
8048 vector = cfg->vector;
8049 cpus_and(mask, cfg->domain, cpu_online_map);
8050 - for_each_cpu_mask(cpu, mask)
8051 + for_each_cpu_mask_nr(cpu, mask)
8052 per_cpu(vector_irq, cpu)[vector] = -1;
8053
8054 cfg->vector = 0;
8055 @@ -842,7 +869,7 @@ static void setup_IO_APIC_irq(int apic,
8056 apic_printk(APIC_VERBOSE,KERN_DEBUG
8057 "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
8058 "IRQ %d Mode:%i Active:%i)\n",
8059 - apic, mp_ioapics[apic].mpc_apicid, pin, cfg->vector,
8060 + apic, mp_ioapics[apic].mp_apicid, pin, cfg->vector,
8061 irq, trigger, polarity);
8062
8063 /*
8064 @@ -883,10 +910,10 @@ static void __init setup_IO_APIC_irqs(vo
8065 idx = find_irq_entry(apic,pin,mp_INT);
8066 if (idx == -1) {
8067 if (first_notcon) {
8068 - apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mpc_apicid, pin);
8069 + apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mp_apicid, pin);
8070 first_notcon = 0;
8071 } else
8072 - apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mpc_apicid, pin);
8073 + apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mp_apicid, pin);
8074 continue;
8075 }
8076 if (!first_notcon) {
8077 @@ -908,26 +935,21 @@ static void __init setup_IO_APIC_irqs(vo
8078
8079 #ifndef CONFIG_XEN
8080 /*
8081 - * Set up the 8259A-master output pin as broadcast to all
8082 - * CPUs.
8083 + * Set up the timer pin, possibly with the 8259A-master behind.
8084 */
8085 -static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
8086 +static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
8087 + int vector)
8088 {
8089 struct IO_APIC_route_entry entry;
8090
8091 memset(&entry, 0, sizeof(entry));
8092
8093 - disable_8259A_irq(0);
8094 -
8095 - /* mask LVT0 */
8096 - apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
8097 -
8098 /*
8099 * We use logical delivery to get the timer IRQ
8100 * to the first CPU.
8101 */
8102 entry.dest_mode = INT_DEST_MODE;
8103 - entry.mask = 0; /* unmask IRQ now */
8104 + entry.mask = 1; /* mask IRQ now */
8105 entry.dest = cpu_mask_to_apicid(TARGET_CPUS);
8106 entry.delivery_mode = INT_DELIVERY_MODE;
8107 entry.polarity = 0;
8108 @@ -936,7 +958,7 @@ static void __init setup_ExtINT_IRQ0_pin
8109
8110 /*
8111 * The timer IRQ doesn't have to know that behind the
8112 - * scene we have a 8259A-master in AEOI mode ...
8113 + * scene we may have a 8259A-master in AEOI mode ...
8114 */
8115 set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge");
8116
8117 @@ -944,8 +966,6 @@ static void __init setup_ExtINT_IRQ0_pin
8118 * Add it to the IO-APIC irq-routing table:
8119 */
8120 ioapic_write_entry(apic, pin, entry);
8121 -
8122 - enable_8259A_irq(0);
8123 }
8124
8125 void __apicdebuginit print_IO_APIC(void)
8126 @@ -962,7 +982,7 @@ void __apicdebuginit print_IO_APIC(void)
8127 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
8128 for (i = 0; i < nr_ioapics; i++)
8129 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
8130 - mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
8131 + mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]);
8132
8133 /*
8134 * We are a bit conservative about what we expect. We have to
8135 @@ -980,7 +1000,7 @@ void __apicdebuginit print_IO_APIC(void)
8136 spin_unlock_irqrestore(&ioapic_lock, flags);
8137
8138 printk("\n");
8139 - printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
8140 + printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid);
8141 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
8142 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
8143
8144 @@ -1072,6 +1092,7 @@ void __apicdebuginit print_local_APIC(vo
8145
8146 printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
8147 smp_processor_id(), hard_smp_processor_id());
8148 + v = apic_read(APIC_ID);
8149 printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(read_apic_id()));
8150 v = apic_read(APIC_LVR);
8151 printk(KERN_INFO "... APIC VERSION: %08x\n", v);
8152 @@ -1141,7 +1162,7 @@ void __apicdebuginit print_local_APIC(vo
8153
8154 void print_all_local_APICs (void)
8155 {
8156 - on_each_cpu(print_local_APIC, NULL, 1, 1);
8157 + on_each_cpu(print_local_APIC, NULL, 1);
8158 }
8159
8160 void __apicdebuginit print_PIC(void)
8161 @@ -1175,6 +1196,8 @@ void __apicdebuginit print_PIC(void)
8162 v = inb(0x4d1) << 8 | inb(0x4d0);
8163 printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
8164 }
8165 +#else
8166 +void __apicdebuginit print_IO_APIC(void) {}
8167 #endif /* !CONFIG_XEN */
8168
8169 void __init enable_IO_APIC(void)
8170 @@ -1359,12 +1382,10 @@ static unsigned int startup_ioapic_irq(u
8171 static int ioapic_retrigger_irq(unsigned int irq)
8172 {
8173 struct irq_cfg *cfg = &irq_cfg[irq];
8174 - cpumask_t mask;
8175 unsigned long flags;
8176
8177 spin_lock_irqsave(&vector_lock, flags);
8178 - mask = cpumask_of_cpu(first_cpu(cfg->domain));
8179 - send_IPI_mask(mask, cfg->vector);
8180 + send_IPI_mask(cpumask_of_cpu(first_cpu(cfg->domain)), cfg->vector);
8181 spin_unlock_irqrestore(&vector_lock, flags);
8182
8183 return 1;
8184 @@ -1545,7 +1566,7 @@ static inline void init_IO_APIC_traps(vo
8185 }
8186
8187 #ifndef CONFIG_XEN
8188 -static void enable_lapic_irq (unsigned int irq)
8189 +static void unmask_lapic_irq(unsigned int irq)
8190 {
8191 unsigned long v;
8192
8193 @@ -1553,7 +1574,7 @@ static void enable_lapic_irq (unsigned i
8194 apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
8195 }
8196
8197 -static void disable_lapic_irq (unsigned int irq)
8198 +static void mask_lapic_irq(unsigned int irq)
8199 {
8200 unsigned long v;
8201
8202 @@ -1566,19 +1587,20 @@ static void ack_lapic_irq (unsigned int
8203 ack_APIC_irq();
8204 }
8205
8206 -static void end_lapic_irq (unsigned int i) { /* nothing */ }
8207 -
8208 -static struct hw_interrupt_type lapic_irq_type __read_mostly = {
8209 - .name = "local-APIC",
8210 - .typename = "local-APIC-edge",
8211 - .startup = NULL, /* startup_irq() not used for IRQ0 */
8212 - .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */
8213 - .enable = enable_lapic_irq,
8214 - .disable = disable_lapic_irq,
8215 - .ack = ack_lapic_irq,
8216 - .end = end_lapic_irq,
8217 +static struct irq_chip lapic_chip __read_mostly = {
8218 + .name = "local-APIC",
8219 + .mask = mask_lapic_irq,
8220 + .unmask = unmask_lapic_irq,
8221 + .ack = ack_lapic_irq,
8222 };
8223
8224 +static void lapic_register_intr(int irq)
8225 +{
8226 + irq_desc[irq].status &= ~IRQ_LEVEL;
8227 + set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
8228 + "edge");
8229 +}
8230 +
8231 static void __init setup_nmi(void)
8232 {
8233 /*
8234 @@ -1664,6 +1686,7 @@ static inline void __init check_timer(vo
8235 struct irq_cfg *cfg = irq_cfg + 0;
8236 int apic1, pin1, apic2, pin2;
8237 unsigned long flags;
8238 + int no_pin1 = 0;
8239
8240 local_irq_save(flags);
8241
8242 @@ -1674,34 +1697,48 @@ static inline void __init check_timer(vo
8243 assign_irq_vector(0, TARGET_CPUS);
8244
8245 /*
8246 - * Subtle, code in do_timer_interrupt() expects an AEOI
8247 - * mode for the 8259A whenever interrupts are routed
8248 - * through I/O APICs. Also IRQ0 has to be enabled in
8249 - * the 8259A which implies the virtual wire has to be
8250 - * disabled in the local APIC.
8251 + * As IRQ0 is to be enabled in the 8259A, the virtual
8252 + * wire has to be disabled in the local APIC.
8253 */
8254 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
8255 init_8259A(1);
8256 - if (timer_over_8254 > 0)
8257 - enable_8259A_irq(0);
8258
8259 pin1 = find_isa_irq_pin(0, mp_INT);
8260 apic1 = find_isa_irq_apic(0, mp_INT);
8261 pin2 = ioapic_i8259.pin;
8262 apic2 = ioapic_i8259.apic;
8263
8264 - apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
8265 - cfg->vector, apic1, pin1, apic2, pin2);
8266 + apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X "
8267 + "apic1=%d pin1=%d apic2=%d pin2=%d\n",
8268 + cfg->vector, apic1, pin1, apic2, pin2);
8269 +
8270 + /*
8271 + * Some BIOS writers are clueless and report the ExtINTA
8272 + * I/O APIC input from the cascaded 8259A as the timer
8273 + * interrupt input. So just in case, if only one pin
8274 + * was found above, try it both directly and through the
8275 + * 8259A.
8276 + */
8277 + if (pin1 == -1) {
8278 + pin1 = pin2;
8279 + apic1 = apic2;
8280 + no_pin1 = 1;
8281 + } else if (pin2 == -1) {
8282 + pin2 = pin1;
8283 + apic2 = apic1;
8284 + }
8285
8286 if (pin1 != -1) {
8287 /*
8288 * Ok, does IRQ0 through the IOAPIC work?
8289 */
8290 + if (no_pin1) {
8291 + add_pin_to_irq(0, apic1, pin1);
8292 + setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
8293 + }
8294 unmask_IO_APIC_irq(0);
8295 if (!no_timer_check && timer_irq_works()) {
8296 - nmi_watchdog_default();
8297 if (nmi_watchdog == NMI_IO_APIC) {
8298 - disable_8259A_irq(0);
8299 setup_nmi();
8300 enable_8259A_irq(0);
8301 }
8302 @@ -1710,54 +1747,62 @@ static inline void __init check_timer(vo
8303 goto out;
8304 }
8305 clear_IO_APIC_pin(apic1, pin1);
8306 - apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: 8254 timer not "
8307 - "connected to IO-APIC\n");
8308 - }
8309 -
8310 - apic_printk(APIC_VERBOSE,KERN_INFO "...trying to set up timer (IRQ0) "
8311 - "through the 8259A ... ");
8312 - if (pin2 != -1) {
8313 - apic_printk(APIC_VERBOSE,"\n..... (found apic %d pin %d) ...",
8314 - apic2, pin2);
8315 + if (!no_pin1)
8316 + apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
8317 + "8254 timer not connected to IO-APIC\n");
8318 +
8319 + apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer "
8320 + "(IRQ0) through the 8259A ...\n");
8321 + apic_printk(APIC_QUIET, KERN_INFO
8322 + "..... (found apic %d pin %d) ...\n", apic2, pin2);
8323 /*
8324 * legacy devices should be connected to IO APIC #0
8325 */
8326 - setup_ExtINT_IRQ0_pin(apic2, pin2, cfg->vector);
8327 + replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
8328 + setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
8329 + unmask_IO_APIC_irq(0);
8330 + enable_8259A_irq(0);
8331 if (timer_irq_works()) {
8332 - apic_printk(APIC_VERBOSE," works.\n");
8333 - nmi_watchdog_default();
8334 + apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
8335 + timer_through_8259 = 1;
8336 if (nmi_watchdog == NMI_IO_APIC) {
8337 + disable_8259A_irq(0);
8338 setup_nmi();
8339 + enable_8259A_irq(0);
8340 }
8341 goto out;
8342 }
8343 /*
8344 * Cleanup, just in case ...
8345 */
8346 + disable_8259A_irq(0);
8347 clear_IO_APIC_pin(apic2, pin2);
8348 + apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
8349 }
8350 - apic_printk(APIC_VERBOSE," failed.\n");
8351
8352 if (nmi_watchdog == NMI_IO_APIC) {
8353 - printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
8354 - nmi_watchdog = 0;
8355 + apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work "
8356 + "through the IO-APIC - disabling NMI Watchdog!\n");
8357 + nmi_watchdog = NMI_NONE;
8358 }
8359
8360 - apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
8361 + apic_printk(APIC_QUIET, KERN_INFO
8362 + "...trying to set up timer as Virtual Wire IRQ...\n");
8363
8364 - disable_8259A_irq(0);
8365 - irq_desc[0].chip = &lapic_irq_type;
8366 + lapic_register_intr(0);
8367 apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */
8368 enable_8259A_irq(0);
8369
8370 if (timer_irq_works()) {
8371 - apic_printk(APIC_VERBOSE," works.\n");
8372 + apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
8373 goto out;
8374 }
8375 + disable_8259A_irq(0);
8376 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
8377 - apic_printk(APIC_VERBOSE," failed.\n");
8378 + apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
8379
8380 - apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as ExtINT IRQ...");
8381 + apic_printk(APIC_QUIET, KERN_INFO
8382 + "...trying to set up timer as ExtINT IRQ...\n");
8383
8384 init_8259A(0);
8385 make_8259A_irq(0);
8386 @@ -1766,11 +1811,12 @@ static inline void __init check_timer(vo
8387 unlock_ExtINT_logic();
8388
8389 if (timer_irq_works()) {
8390 - apic_printk(APIC_VERBOSE," works.\n");
8391 + apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
8392 goto out;
8393 }
8394 - apic_printk(APIC_VERBOSE," failed :(.\n");
8395 - panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n");
8396 + apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
8397 + panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
8398 + "report. Then try booting with the 'noapic' option.\n");
8399 out:
8400 local_irq_restore(flags);
8401 }
8402 @@ -1788,10 +1834,21 @@ __setup("no_timer_check", notimercheck);
8403
8404 /*
8405 *
8406 - * IRQs that are handled by the PIC in the MPS IOAPIC case.
8407 - * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ.
8408 - * Linux doesn't really care, as it's not actually used
8409 - * for any interrupt handling anyway.
8410 + * Traditionally ISA IRQ2 is the cascade IRQ, and is not available
8411 + * to devices. However there may be an I/O APIC pin available for
8412 + * this interrupt regardless. The pin may be left unconnected, but
8413 + * typically it will be reused as an ExtINT cascade interrupt for
8414 + * the master 8259A. In the MPS case such a pin will normally be
8415 + * reported as an ExtINT interrupt in the MP table. With ACPI
8416 + * there is no provision for ExtINT interrupts, and in the absence
8417 + * of an override it would be treated as an ordinary ISA I/O APIC
8418 + * interrupt, that is edge-triggered and unmasked by default. We
8419 + * used to do this, but it caused problems on some systems because
8420 + * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using
8421 + * the same ExtINT cascade interrupt to drive the local APIC of the
8422 + * bootstrap processor. Therefore we refrain from routing IRQ2 to
8423 + * the I/O APIC in all cases now. No actual device should request
8424 + * it anyway. --macro
8425 */
8426 #define PIC_IRQS (1<<2)
8427
8428 @@ -1799,10 +1856,7 @@ void __init setup_IO_APIC(void)
8429 {
8430 enable_IO_APIC();
8431
8432 - if (acpi_ioapic)
8433 - io_apic_irqs = ~0; /* all IRQs go through IOAPIC */
8434 - else
8435 - io_apic_irqs = ~PIC_IRQS;
8436 + io_apic_irqs = ~PIC_IRQS;
8437
8438 apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
8439
8440 @@ -1851,8 +1905,8 @@ static int ioapic_resume(struct sys_devi
8441
8442 spin_lock_irqsave(&ioapic_lock, flags);
8443 reg_00.raw = io_apic_read(dev->id, 0);
8444 - if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
8445 - reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
8446 + if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) {
8447 + reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid;
8448 io_apic_write(dev->id, 0, reg_00.raw);
8449 }
8450 spin_unlock_irqrestore(&ioapic_lock, flags);
8451 @@ -2254,8 +2308,8 @@ int acpi_get_override_irq(int bus_irq, i
8452 return -1;
8453
8454 for (i = 0; i < mp_irq_entries; i++)
8455 - if (mp_irqs[i].mpc_irqtype == mp_INT &&
8456 - mp_irqs[i].mpc_srcbusirq == bus_irq)
8457 + if (mp_irqs[i].mp_irqtype == mp_INT &&
8458 + mp_irqs[i].mp_srcbusirq == bus_irq)
8459 break;
8460 if (i >= mp_irq_entries)
8461 return -1;
8462 @@ -2349,7 +2403,7 @@ void __init ioapic_init_mappings(void)
8463 ioapic_res = ioapic_setup_resources();
8464 for (i = 0; i < nr_ioapics; i++) {
8465 if (smp_found_config) {
8466 - ioapic_phys = mp_ioapics[i].mpc_apicaddr;
8467 + ioapic_phys = mp_ioapics[i].mp_apicaddr;
8468 } else {
8469 ioapic_phys = (unsigned long)
8470 alloc_bootmem_pages(PAGE_SIZE);
8471 --- sle11-2009-10-16.orig/arch/x86/kernel/ipi-xen.c 2009-03-16 16:38:05.000000000 +0100
8472 +++ sle11-2009-10-16/arch/x86/kernel/ipi-xen.c 2009-06-04 10:21:39.000000000 +0200
8473 @@ -8,7 +8,6 @@
8474 #include <linux/kernel_stat.h>
8475 #include <linux/mc146818rtc.h>
8476 #include <linux/cache.h>
8477 -#include <linux/interrupt.h>
8478 #include <linux/cpu.h>
8479 #include <linux/module.h>
8480
8481 @@ -85,7 +84,7 @@ void __send_IPI_shortcut(unsigned int sh
8482 /*
8483 * Send the IPI. The write to APIC_ICR fires this off.
8484 */
8485 - apic_write_around(APIC_ICR, cfg);
8486 + apic_write(APIC_ICR, cfg);
8487 #else
8488 int cpu;
8489
8490 @@ -132,7 +131,7 @@ static inline void __send_IPI_dest_field
8491 * prepare target chip field
8492 */
8493 cfg = __prepare_ICR2(mask);
8494 - apic_write_around(APIC_ICR2, cfg);
8495 + apic_write(APIC_ICR2, cfg);
8496
8497 /*
8498 * program the ICR
8499 @@ -142,7 +141,7 @@ static inline void __send_IPI_dest_field
8500 /*
8501 * Send the IPI. The write to APIC_ICR fires this off.
8502 */
8503 - apic_write_around(APIC_ICR, cfg);
8504 + apic_write(APIC_ICR, cfg);
8505 }
8506 #endif
8507
8508 --- sle11-2009-10-16.orig/arch/x86/kernel/irq_32-xen.c 2009-03-16 16:38:05.000000000 +0100
8509 +++ sle11-2009-10-16/arch/x86/kernel/irq_32-xen.c 2009-06-04 10:21:39.000000000 +0200
8510 @@ -48,6 +48,29 @@ void ack_bad_irq(unsigned int irq)
8511 #endif
8512 }
8513
8514 +#ifdef CONFIG_DEBUG_STACKOVERFLOW
8515 +/* Debugging check for stack overflow: is there less than 1KB free? */
8516 +static int check_stack_overflow(void)
8517 +{
8518 + long sp;
8519 +
8520 + __asm__ __volatile__("andl %%esp,%0" :
8521 + "=r" (sp) : "0" (THREAD_SIZE - 1));
8522 +
8523 + return sp < (sizeof(struct thread_info) + STACK_WARN);
8524 +}
8525 +
8526 +static void print_stack_overflow(void)
8527 +{
8528 + printk(KERN_WARNING "low stack detected by irq handler\n");
8529 + dump_stack();
8530 +}
8531 +
8532 +#else
8533 +static inline int check_stack_overflow(void) { return 0; }
8534 +static inline void print_stack_overflow(void) { }
8535 +#endif
8536 +
8537 #ifdef CONFIG_4KSTACKS
8538 /*
8539 * per-CPU IRQ handling contexts (thread information and stack)
8540 @@ -59,48 +82,26 @@ union irq_ctx {
8541
8542 static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly;
8543 static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly;
8544 -#endif
8545 -
8546 -/*
8547 - * do_IRQ handles all normal device IRQ's (the special
8548 - * SMP cross-CPU interrupts have their own specific
8549 - * handlers).
8550 - */
8551 -unsigned int do_IRQ(struct pt_regs *regs)
8552 -{
8553 - struct pt_regs *old_regs;
8554 - /* high bit used in ret_from_ code */
8555 - int irq = ~regs->orig_ax;
8556 - struct irq_desc *desc = irq_desc + irq;
8557 -#ifdef CONFIG_4KSTACKS
8558 - union irq_ctx *curctx, *irqctx;
8559 - u32 *isp;
8560 -#endif
8561
8562 - if (unlikely((unsigned)irq >= NR_IRQS)) {
8563 - printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
8564 - __func__, irq);
8565 - BUG();
8566 - }
8567 +static char softirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
8568 +static char hardirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
8569
8570 - old_regs = set_irq_regs(regs);
8571 - /*irq_enter();*/
8572 -#ifdef CONFIG_DEBUG_STACKOVERFLOW
8573 - /* Debugging check for stack overflow: is there less than 1KB free? */
8574 - {
8575 - long sp;
8576 -
8577 - __asm__ __volatile__("andl %%esp,%0" :
8578 - "=r" (sp) : "0" (THREAD_SIZE - 1));
8579 - if (unlikely(sp < (sizeof(struct thread_info) + STACK_WARN))) {
8580 - printk("do_IRQ: stack overflow: %ld\n",
8581 - sp - sizeof(struct thread_info));
8582 - dump_stack();
8583 - }
8584 - }
8585 -#endif
8586 +static void call_on_stack(void *func, void *stack)
8587 +{
8588 + asm volatile("xchgl %%ebx,%%esp \n"
8589 + "call *%%edi \n"
8590 + "movl %%ebx,%%esp \n"
8591 + : "=b" (stack)
8592 + : "0" (stack),
8593 + "D"(func)
8594 + : "memory", "cc", "edx", "ecx", "eax");
8595 +}
8596
8597 -#ifdef CONFIG_4KSTACKS
8598 +static inline int
8599 +execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
8600 +{
8601 + union irq_ctx *curctx, *irqctx;
8602 + u32 *isp, arg1, arg2;
8603
8604 curctx = (union irq_ctx *) current_thread_info();
8605 irqctx = hardirq_ctx[smp_processor_id()];
8606 @@ -111,52 +112,39 @@ unsigned int do_IRQ(struct pt_regs *regs
8607 * handler) we can't do that and just have to keep using the
8608 * current stack (which is the irq stack already after all)
8609 */
8610 - if (curctx != irqctx) {
8611 - int arg1, arg2, bx;
8612 -
8613 - /* build the stack frame on the IRQ stack */
8614 - isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
8615 - irqctx->tinfo.task = curctx->tinfo.task;
8616 - irqctx->tinfo.previous_esp = current_stack_pointer;
8617 + if (unlikely(curctx == irqctx))
8618 + return 0;
8619
8620 - /*
8621 - * Copy the softirq bits in preempt_count so that the
8622 - * softirq checks work in the hardirq context.
8623 - */
8624 - irqctx->tinfo.preempt_count =
8625 - (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) |
8626 - (curctx->tinfo.preempt_count & SOFTIRQ_MASK);
8627 -
8628 - asm volatile(
8629 - " xchgl %%ebx,%%esp \n"
8630 - " call *%%edi \n"
8631 - " movl %%ebx,%%esp \n"
8632 - : "=a" (arg1), "=d" (arg2), "=b" (bx)
8633 - : "0" (irq), "1" (desc), "2" (isp),
8634 - "D" (desc->handle_irq)
8635 - : "memory", "cc", "ecx"
8636 - );
8637 - } else
8638 -#endif
8639 - desc->handle_irq(irq, desc);
8640 + /* build the stack frame on the IRQ stack */
8641 + isp = (u32 *) ((char*)irqctx + sizeof(*irqctx));
8642 + irqctx->tinfo.task = curctx->tinfo.task;
8643 + irqctx->tinfo.previous_esp = current_stack_pointer;
8644
8645 - /*irq_exit();*/
8646 - set_irq_regs(old_regs);
8647 + /*
8648 + * Copy the softirq bits in preempt_count so that the
8649 + * softirq checks work in the hardirq context.
8650 + */
8651 + irqctx->tinfo.preempt_count =
8652 + (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) |
8653 + (curctx->tinfo.preempt_count & SOFTIRQ_MASK);
8654 +
8655 + if (unlikely(overflow))
8656 + call_on_stack(print_stack_overflow, isp);
8657 +
8658 + asm volatile("xchgl %%ebx,%%esp \n"
8659 + "call *%%edi \n"
8660 + "movl %%ebx,%%esp \n"
8661 + : "=a" (arg1), "=d" (arg2), "=b" (isp)
8662 + : "0" (irq), "1" (desc), "2" (isp),
8663 + "D" (desc->handle_irq)
8664 + : "memory", "cc", "ecx");
8665 return 1;
8666 }
8667
8668 -#ifdef CONFIG_4KSTACKS
8669 -
8670 -static char softirq_stack[NR_CPUS * THREAD_SIZE]
8671 - __attribute__((__section__(".bss.page_aligned")));
8672 -
8673 -static char hardirq_stack[NR_CPUS * THREAD_SIZE]
8674 - __attribute__((__section__(".bss.page_aligned")));
8675 -
8676 /*
8677 * allocate per-cpu stacks for hardirq and for softirq processing
8678 */
8679 -void irq_ctx_init(int cpu)
8680 +void __cpuinit irq_ctx_init(int cpu)
8681 {
8682 union irq_ctx *irqctx;
8683
8684 @@ -164,25 +152,25 @@ void irq_ctx_init(int cpu)
8685 return;
8686
8687 irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE];
8688 - irqctx->tinfo.task = NULL;
8689 - irqctx->tinfo.exec_domain = NULL;
8690 - irqctx->tinfo.cpu = cpu;
8691 - irqctx->tinfo.preempt_count = HARDIRQ_OFFSET;
8692 - irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
8693 + irqctx->tinfo.task = NULL;
8694 + irqctx->tinfo.exec_domain = NULL;
8695 + irqctx->tinfo.cpu = cpu;
8696 + irqctx->tinfo.preempt_count = HARDIRQ_OFFSET;
8697 + irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
8698
8699 hardirq_ctx[cpu] = irqctx;
8700
8701 irqctx = (union irq_ctx*) &softirq_stack[cpu*THREAD_SIZE];
8702 - irqctx->tinfo.task = NULL;
8703 - irqctx->tinfo.exec_domain = NULL;
8704 - irqctx->tinfo.cpu = cpu;
8705 - irqctx->tinfo.preempt_count = 0;
8706 - irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
8707 + irqctx->tinfo.task = NULL;
8708 + irqctx->tinfo.exec_domain = NULL;
8709 + irqctx->tinfo.cpu = cpu;
8710 + irqctx->tinfo.preempt_count = 0;
8711 + irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
8712
8713 softirq_ctx[cpu] = irqctx;
8714
8715 - printk("CPU %u irqstacks, hard=%p soft=%p\n",
8716 - cpu,hardirq_ctx[cpu],softirq_ctx[cpu]);
8717 + printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n",
8718 + cpu,hardirq_ctx[cpu],softirq_ctx[cpu]);
8719 }
8720
8721 void irq_ctx_exit(int cpu)
8722 @@ -211,25 +199,56 @@ asmlinkage void do_softirq(void)
8723 /* build the stack frame on the softirq stack */
8724 isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
8725
8726 - asm volatile(
8727 - " xchgl %%ebx,%%esp \n"
8728 - " call __do_softirq \n"
8729 - " movl %%ebx,%%esp \n"
8730 - : "=b"(isp)
8731 - : "0"(isp)
8732 - : "memory", "cc", "edx", "ecx", "eax"
8733 - );
8734 + call_on_stack(__do_softirq, isp);
8735 /*
8736 * Shouldnt happen, we returned above if in_interrupt():
8737 - */
8738 + */
8739 WARN_ON_ONCE(softirq_count());
8740 }
8741
8742 local_irq_restore(flags);
8743 }
8744 +
8745 +#else
8746 +static inline int
8747 +execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) { return 0; }
8748 #endif
8749
8750 /*
8751 + * do_IRQ handles all normal device IRQ's (the special
8752 + * SMP cross-CPU interrupts have their own specific
8753 + * handlers).
8754 + */
8755 +unsigned int do_IRQ(struct pt_regs *regs)
8756 +{
8757 + struct pt_regs *old_regs;
8758 + /* high bit used in ret_from_ code */
8759 + int overflow, irq = ~regs->orig_ax;
8760 + struct irq_desc *desc = irq_desc + irq;
8761 +
8762 + if (unlikely((unsigned)irq >= NR_IRQS)) {
8763 + printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
8764 + __func__, irq);
8765 + BUG();
8766 + }
8767 +
8768 + old_regs = set_irq_regs(regs);
8769 + /*irq_enter();*/
8770 +
8771 + overflow = check_stack_overflow();
8772 +
8773 + if (!execute_on_irq_stack(overflow, desc, irq)) {
8774 + if (unlikely(overflow))
8775 + print_stack_overflow();
8776 + desc->handle_irq(irq, desc);
8777 + }
8778 +
8779 + /*irq_exit();*/
8780 + set_irq_regs(old_regs);
8781 + return 1;
8782 +}
8783 +
8784 +/*
8785 * Interrupt statistics:
8786 */
8787
8788 @@ -337,6 +356,42 @@ skip:
8789 return 0;
8790 }
8791
8792 +/*
8793 + * /proc/stat helpers
8794 + */
8795 +u64 arch_irq_stat_cpu(unsigned int cpu)
8796 +{
8797 + u64 sum = nmi_count(cpu);
8798 +
8799 +#ifdef CONFIG_X86_LOCAL_APIC
8800 + sum += per_cpu(irq_stat, cpu).apic_timer_irqs;
8801 +#endif
8802 +#ifdef CONFIG_SMP
8803 + sum += per_cpu(irq_stat, cpu).irq_resched_count;
8804 + sum += per_cpu(irq_stat, cpu).irq_call_count;
8805 +#ifndef CONFIG_XEN
8806 + sum += per_cpu(irq_stat, cpu).irq_tlb_count;
8807 +#endif
8808 +#endif
8809 +#ifdef CONFIG_X86_MCE
8810 + sum += per_cpu(irq_stat, cpu).irq_thermal_count;
8811 +#endif
8812 +#ifdef CONFIG_X86_LOCAL_APIC
8813 + sum += per_cpu(irq_stat, cpu).irq_spurious_count;
8814 +#endif
8815 + return sum;
8816 +}
8817 +
8818 +u64 arch_irq_stat(void)
8819 +{
8820 + u64 sum = atomic_read(&irq_err_count);
8821 +
8822 +#ifdef CONFIG_X86_IO_APIC
8823 + sum += atomic_read(&irq_mis_count);
8824 +#endif
8825 + return sum;
8826 +}
8827 +
8828 #ifdef CONFIG_HOTPLUG_CPU
8829
8830 void fixup_irqs(cpumask_t map)
8831 --- sle11-2009-10-16.orig/arch/x86/kernel/irq_64-xen.c 2009-03-16 16:33:40.000000000 +0100
8832 +++ sle11-2009-10-16/arch/x86/kernel/irq_64-xen.c 2009-06-04 10:21:39.000000000 +0200
8833 @@ -163,6 +163,34 @@ skip:
8834 }
8835
8836 /*
8837 + * /proc/stat helpers
8838 + */
8839 +u64 arch_irq_stat_cpu(unsigned int cpu)
8840 +{
8841 + u64 sum = cpu_pda(cpu)->__nmi_count;
8842 +
8843 + sum += cpu_pda(cpu)->apic_timer_irqs;
8844 +#ifdef CONFIG_SMP
8845 + sum += cpu_pda(cpu)->irq_resched_count;
8846 + sum += cpu_pda(cpu)->irq_call_count;
8847 +#ifndef CONFIG_XEN
8848 + sum += cpu_pda(cpu)->irq_tlb_count;
8849 +#endif
8850 +#endif
8851 +#ifdef CONFIG_X86_MCE
8852 + sum += cpu_pda(cpu)->irq_thermal_count;
8853 + sum += cpu_pda(cpu)->irq_threshold_count;
8854 +#endif
8855 + sum += cpu_pda(cpu)->irq_spurious_count;
8856 + return sum;
8857 +}
8858 +
8859 +u64 arch_irq_stat(void)
8860 +{
8861 + return atomic_read(&irq_err_count);
8862 +}
8863 +
8864 +/*
8865 * do_IRQ handles all normal device IRQ's (the special
8866 * SMP cross-CPU interrupts have their own specific
8867 * handlers).
8868 --- sle11-2009-10-16.orig/arch/x86/kernel/ldt-xen.c 2009-03-16 16:33:40.000000000 +0100
8869 +++ sle11-2009-10-16/arch/x86/kernel/ldt-xen.c 2009-06-04 10:21:39.000000000 +0200
8870 @@ -20,9 +20,9 @@
8871 #include <asm/mmu_context.h>
8872
8873 #ifdef CONFIG_SMP
8874 -static void flush_ldt(void *null)
8875 +static void flush_ldt(void *current_mm)
8876 {
8877 - if (current->active_mm)
8878 + if (current->active_mm == current_mm)
8879 load_LDT(&current->active_mm->context);
8880 }
8881 #endif
8882 @@ -62,8 +62,6 @@ static int alloc_ldt(mm_context_t *pc, i
8883
8884 if (reload) {
8885 #ifdef CONFIG_SMP
8886 - cpumask_t mask;
8887 -
8888 preempt_disable();
8889 #endif
8890 make_pages_readonly(newldt,
8891 @@ -71,9 +69,9 @@ static int alloc_ldt(mm_context_t *pc, i
8892 XENFEAT_writable_descriptor_tables);
8893 load_LDT(pc);
8894 #ifdef CONFIG_SMP
8895 - mask = cpumask_of_cpu(smp_processor_id());
8896 - if (!cpus_equal(current->mm->cpu_vm_mask, mask))
8897 - smp_call_function(flush_ldt, NULL, 1, 1);
8898 + if (!cpus_equal(current->mm->cpu_vm_mask,
8899 + cpumask_of_cpu(smp_processor_id())))
8900 + smp_call_function(flush_ldt, current->mm, 1);
8901 preempt_enable();
8902 #endif
8903 }
8904 --- sle11-2009-10-16.orig/arch/x86/kernel/machine_kexec_32.c 2008-11-25 12:35:53.000000000 +0100
8905 +++ sle11-2009-10-16/arch/x86/kernel/machine_kexec_32.c 2009-06-04 10:21:39.000000000 +0200
8906 @@ -68,6 +68,8 @@ void machine_kexec_setup_load_arg(xen_ke
8907 xki->page_list[PA_PTE_0] = __ma(kexec_pte0);
8908 xki->page_list[PA_PTE_1] = __ma(kexec_pte1);
8909
8910 + if (image->type == KEXEC_TYPE_DEFAULT)
8911 + xki->page_list[PA_SWAP_PAGE] = page_to_phys(image->swap_page);
8912 }
8913
8914 int __init machine_kexec_setup_resources(struct resource *hypervisor,
8915 --- sle11-2009-10-16.orig/arch/x86/kernel/microcode-xen.c 2009-03-16 16:38:05.000000000 +0100
8916 +++ sle11-2009-10-16/arch/x86/kernel/microcode-xen.c 2009-06-04 10:21:39.000000000 +0200
8917 @@ -5,13 +5,14 @@
8918 * 2006 Shaohua Li <shaohua.li@intel.com>
8919 *
8920 * This driver allows to upgrade microcode on Intel processors
8921 - * belonging to IA-32 family - PentiumPro, Pentium II,
8922 + * belonging to IA-32 family - PentiumPro, Pentium II,
8923 * Pentium III, Xeon, Pentium 4, etc.
8924 *
8925 - * Reference: Section 8.10 of Volume III, Intel Pentium 4 Manual,
8926 - * Order Number 245472 or free download from:
8927 - *
8928 - * http://developer.intel.com/design/pentium4/manuals/245472.htm
8929 + * Reference: Section 8.11 of Volume 3a, IA-32 Intel® Architecture
8930 + * Software Developer's Manual
8931 + * Order Number 253668 or free download from:
8932 + *
8933 + * http://developer.intel.com/design/pentium4/manuals/253668.htm
8934 *
8935 * For more information, go to http://www.urbanmyth.org/microcode
8936 *
8937 @@ -26,6 +27,7 @@
8938 #include <linux/kernel.h>
8939 #include <linux/init.h>
8940 #include <linux/sched.h>
8941 +#include <linux/smp_lock.h>
8942 #include <linux/cpumask.h>
8943 #include <linux/module.h>
8944 #include <linux/slab.h>
8945 @@ -86,6 +88,7 @@ static int do_microcode_update (const vo
8946
8947 static int microcode_open (struct inode *unused1, struct file *unused2)
8948 {
8949 + cycle_kernel_lock();
8950 return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
8951 }
8952
8953 @@ -162,7 +165,7 @@ static int request_microcode(void)
8954 c->x86, c->x86_model, c->x86_mask);
8955 error = request_firmware(&firmware, name, &microcode_pdev->dev);
8956 if (error) {
8957 - pr_debug("microcode: ucode data file %s load failed\n", name);
8958 + pr_debug("microcode: data file %s load failed\n", name);
8959 return error;
8960 }
8961
8962 @@ -183,6 +186,9 @@ static int __init microcode_init (void)
8963 {
8964 int error;
8965
8966 + printk(KERN_INFO
8967 + "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@aivazian.fsnet.co.uk>\n");
8968 +
8969 error = microcode_dev_init();
8970 if (error)
8971 return error;
8972 @@ -195,8 +201,6 @@ static int __init microcode_init (void)
8973
8974 request_microcode();
8975
8976 - printk(KERN_INFO
8977 - "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@aivazian.fsnet.co.uk>\n");
8978 return 0;
8979 }
8980
8981 --- sle11-2009-10-16.orig/arch/x86/kernel/mpparse-xen.c 2009-03-16 16:38:05.000000000 +0100
8982 +++ sle11-2009-10-16/arch/x86/kernel/mpparse-xen.c 2009-06-04 10:21:39.000000000 +0200
8983 @@ -25,6 +25,9 @@
8984 #include <asm/proto.h>
8985 #include <asm/acpi.h>
8986 #include <asm/bios_ebda.h>
8987 +#include <asm/e820.h>
8988 +#include <asm/trampoline.h>
8989 +#include <asm/setup.h>
8990
8991 #include <mach_apic.h>
8992 #ifdef CONFIG_X86_32
8993 @@ -32,27 +35,10 @@
8994 #include <mach_mpparse.h>
8995 #endif
8996
8997 -/* Have we found an MP table */
8998 -int smp_found_config;
8999 -
9000 -/*
9001 - * Various Linux-internal data structures created from the
9002 - * MP-table.
9003 - */
9004 -#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
9005 -int mp_bus_id_to_type[MAX_MP_BUSSES];
9006 -#endif
9007 -
9008 -DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
9009 -int mp_bus_id_to_pci_bus[MAX_MP_BUSSES] = {[0 ... MAX_MP_BUSSES - 1] = -1 };
9010 -
9011 -static int mp_current_pci_id;
9012 -
9013 -int pic_mode;
9014 -
9015 -/*
9016 - * Intel MP BIOS table parsing routines:
9017 - */
9018 +static void *_bus_to_virt(unsigned long ma)
9019 +{
9020 + return is_ISA_range(ma, ma) ? isa_bus_to_virt(ma) : bus_to_virt(ma);
9021 +}
9022
9023 /*
9024 * Checksum an MP configuration block.
9025 @@ -68,19 +54,7 @@ static int __init mpf_checksum(unsigned
9026 return sum & 0xFF;
9027 }
9028
9029 -#ifdef CONFIG_X86_NUMAQ
9030 -/*
9031 - * Have to match translation table entries to main table entries by counter
9032 - * hence the mpc_record variable .... can't see a less disgusting way of
9033 - * doing this ....
9034 - */
9035 -
9036 -static int mpc_record;
9037 -static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY]
9038 - __cpuinitdata;
9039 -#endif
9040 -
9041 -static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
9042 +static void __init MP_processor_info(struct mpc_config_processor *m)
9043 {
9044 #ifndef CONFIG_XEN
9045 int apicid;
9046 @@ -90,11 +64,12 @@ static void __cpuinit MP_processor_info(
9047 disabled_cpus++;
9048 return;
9049 }
9050 -#ifdef CONFIG_X86_NUMAQ
9051 - apicid = mpc_apic_id(m, translation_table[mpc_record]);
9052 -#else
9053 - apicid = m->mpc_apicid;
9054 -#endif
9055 +
9056 + if (x86_quirks->mpc_apic_id)
9057 + apicid = x86_quirks->mpc_apic_id(m);
9058 + else
9059 + apicid = m->mpc_apicid;
9060 +
9061 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
9062 bootup_cpu = " (Bootup-CPU)";
9063 boot_cpu_physical_apicid = m->mpc_apicid;
9064 @@ -107,18 +82,17 @@ static void __cpuinit MP_processor_info(
9065 #endif
9066 }
9067
9068 +#ifdef CONFIG_X86_IO_APIC
9069 static void __init MP_bus_info(struct mpc_config_bus *m)
9070 {
9071 char str[7];
9072 -
9073 memcpy(str, m->mpc_bustype, 6);
9074 str[6] = 0;
9075
9076 -#ifdef CONFIG_X86_NUMAQ
9077 - mpc_oem_bus_info(m, str, translation_table[mpc_record]);
9078 -#else
9079 - Dprintk("Bus #%d is %s\n", m->mpc_busid, str);
9080 -#endif
9081 + if (x86_quirks->mpc_oem_bus_info)
9082 + x86_quirks->mpc_oem_bus_info(m, str);
9083 + else
9084 + apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->mpc_busid, str);
9085
9086 #if MAX_MP_BUSSES < 256
9087 if (m->mpc_busid >= MAX_MP_BUSSES) {
9088 @@ -135,12 +109,10 @@ static void __init MP_bus_info(struct mp
9089 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
9090 #endif
9091 } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) {
9092 -#ifdef CONFIG_X86_NUMAQ
9093 - mpc_oem_pci_bus(m, translation_table[mpc_record]);
9094 -#endif
9095 + if (x86_quirks->mpc_oem_pci_bus)
9096 + x86_quirks->mpc_oem_pci_bus(m);
9097 +
9098 clear_bit(m->mpc_busid, mp_bus_not_pci);
9099 - mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
9100 - mp_current_pci_id++;
9101 #if defined(CONFIG_EISA) || defined (CONFIG_MCA)
9102 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
9103 } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) {
9104 @@ -151,6 +123,7 @@ static void __init MP_bus_info(struct mp
9105 } else
9106 printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
9107 }
9108 +#endif
9109
9110 #ifdef CONFIG_X86_IO_APIC
9111
9112 @@ -180,117 +153,111 @@ static void __init MP_ioapic_info(struct
9113 if (bad_ioapic(m->mpc_apicaddr))
9114 return;
9115
9116 - mp_ioapics[nr_ioapics] = *m;
9117 + mp_ioapics[nr_ioapics].mp_apicaddr = m->mpc_apicaddr;
9118 + mp_ioapics[nr_ioapics].mp_apicid = m->mpc_apicid;
9119 + mp_ioapics[nr_ioapics].mp_type = m->mpc_type;
9120 + mp_ioapics[nr_ioapics].mp_apicver = m->mpc_apicver;
9121 + mp_ioapics[nr_ioapics].mp_flags = m->mpc_flags;
9122 nr_ioapics++;
9123 }
9124
9125 -static void __init MP_intsrc_info(struct mpc_config_intsrc *m)
9126 +static void print_MP_intsrc_info(struct mpc_config_intsrc *m)
9127 {
9128 - mp_irqs[mp_irq_entries] = *m;
9129 - Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
9130 + apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
9131 " IRQ %02x, APIC ID %x, APIC INT %02x\n",
9132 m->mpc_irqtype, m->mpc_irqflag & 3,
9133 (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
9134 m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
9135 - if (++mp_irq_entries == MAX_IRQ_SOURCES)
9136 - panic("Max # of irq sources exceeded!!\n");
9137 }
9138
9139 -#endif
9140 -
9141 -static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m)
9142 +static void __init print_mp_irq_info(struct mp_config_intsrc *mp_irq)
9143 {
9144 - Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
9145 - " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
9146 - m->mpc_irqtype, m->mpc_irqflag & 3,
9147 - (m->mpc_irqflag >> 2) & 3, m->mpc_srcbusid,
9148 - m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
9149 + apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
9150 + " IRQ %02x, APIC ID %x, APIC INT %02x\n",
9151 + mp_irq->mp_irqtype, mp_irq->mp_irqflag & 3,
9152 + (mp_irq->mp_irqflag >> 2) & 3, mp_irq->mp_srcbus,
9153 + mp_irq->mp_srcbusirq, mp_irq->mp_dstapic, mp_irq->mp_dstirq);
9154 }
9155
9156 -#ifdef CONFIG_X86_NUMAQ
9157 -static void __init MP_translation_info(struct mpc_config_translation *m)
9158 +static void __init assign_to_mp_irq(struct mpc_config_intsrc *m,
9159 + struct mp_config_intsrc *mp_irq)
9160 {
9161 - printk(KERN_INFO
9162 - "Translation: record %d, type %d, quad %d, global %d, local %d\n",
9163 - mpc_record, m->trans_type, m->trans_quad, m->trans_global,
9164 - m->trans_local);
9165 + mp_irq->mp_dstapic = m->mpc_dstapic;
9166 + mp_irq->mp_type = m->mpc_type;
9167 + mp_irq->mp_irqtype = m->mpc_irqtype;
9168 + mp_irq->mp_irqflag = m->mpc_irqflag;
9169 + mp_irq->mp_srcbus = m->mpc_srcbus;
9170 + mp_irq->mp_srcbusirq = m->mpc_srcbusirq;
9171 + mp_irq->mp_dstirq = m->mpc_dstirq;
9172 +}
9173
9174 - if (mpc_record >= MAX_MPC_ENTRY)
9175 - printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
9176 - else
9177 - translation_table[mpc_record] = m; /* stash this for later */
9178 - if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
9179 - node_set_online(m->trans_quad);
9180 +static void __init assign_to_mpc_intsrc(struct mp_config_intsrc *mp_irq,
9181 + struct mpc_config_intsrc *m)
9182 +{
9183 + m->mpc_dstapic = mp_irq->mp_dstapic;
9184 + m->mpc_type = mp_irq->mp_type;
9185 + m->mpc_irqtype = mp_irq->mp_irqtype;
9186 + m->mpc_irqflag = mp_irq->mp_irqflag;
9187 + m->mpc_srcbus = mp_irq->mp_srcbus;
9188 + m->mpc_srcbusirq = mp_irq->mp_srcbusirq;
9189 + m->mpc_dstirq = mp_irq->mp_dstirq;
9190 }
9191
9192 -/*
9193 - * Read/parse the MPC oem tables
9194 - */
9195 +static int __init mp_irq_mpc_intsrc_cmp(struct mp_config_intsrc *mp_irq,
9196 + struct mpc_config_intsrc *m)
9197 +{
9198 + if (mp_irq->mp_dstapic != m->mpc_dstapic)
9199 + return 1;
9200 + if (mp_irq->mp_type != m->mpc_type)
9201 + return 2;
9202 + if (mp_irq->mp_irqtype != m->mpc_irqtype)
9203 + return 3;
9204 + if (mp_irq->mp_irqflag != m->mpc_irqflag)
9205 + return 4;
9206 + if (mp_irq->mp_srcbus != m->mpc_srcbus)
9207 + return 5;
9208 + if (mp_irq->mp_srcbusirq != m->mpc_srcbusirq)
9209 + return 6;
9210 + if (mp_irq->mp_dstirq != m->mpc_dstirq)
9211 + return 7;
9212
9213 -static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable,
9214 - unsigned short oemsize)
9215 + return 0;
9216 +}
9217 +
9218 +static void __init MP_intsrc_info(struct mpc_config_intsrc *m)
9219 {
9220 - int count = sizeof(*oemtable); /* the header size */
9221 - unsigned char *oemptr = ((unsigned char *)oemtable) + count;
9222 + int i;
9223
9224 - mpc_record = 0;
9225 - printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n",
9226 - oemtable);
9227 - if (memcmp(oemtable->oem_signature, MPC_OEM_SIGNATURE, 4)) {
9228 - printk(KERN_WARNING
9229 - "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
9230 - oemtable->oem_signature[0], oemtable->oem_signature[1],
9231 - oemtable->oem_signature[2], oemtable->oem_signature[3]);
9232 - return;
9233 - }
9234 - if (mpf_checksum((unsigned char *)oemtable, oemtable->oem_length)) {
9235 - printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
9236 - return;
9237 - }
9238 - while (count < oemtable->oem_length) {
9239 - switch (*oemptr) {
9240 - case MP_TRANSLATION:
9241 - {
9242 - struct mpc_config_translation *m =
9243 - (struct mpc_config_translation *)oemptr;
9244 - MP_translation_info(m);
9245 - oemptr += sizeof(*m);
9246 - count += sizeof(*m);
9247 - ++mpc_record;
9248 - break;
9249 - }
9250 - default:
9251 - {
9252 - printk(KERN_WARNING
9253 - "Unrecognised OEM table entry type! - %d\n",
9254 - (int)*oemptr);
9255 - return;
9256 - }
9257 - }
9258 + print_MP_intsrc_info(m);
9259 +
9260 + for (i = 0; i < mp_irq_entries; i++) {
9261 + if (!mp_irq_mpc_intsrc_cmp(&mp_irqs[i], m))
9262 + return;
9263 }
9264 +
9265 + assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]);
9266 + if (++mp_irq_entries == MAX_IRQ_SOURCES)
9267 + panic("Max # of irq sources exceeded!!\n");
9268 }
9269
9270 -static inline void mps_oem_check(struct mp_config_table *mpc, char *oem,
9271 - char *productid)
9272 +#endif
9273 +
9274 +static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m)
9275 {
9276 - if (strncmp(oem, "IBM NUMA", 8))
9277 - printk("Warning! May not be a NUMA-Q system!\n");
9278 - if (mpc->mpc_oemptr)
9279 - smp_read_mpc_oem((struct mp_config_oemtable *)mpc->mpc_oemptr,
9280 - mpc->mpc_oemsize);
9281 + apic_printk(APIC_VERBOSE, "Lint: type %d, pol %d, trig %d, bus %02x,"
9282 + " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
9283 + m->mpc_irqtype, m->mpc_irqflag & 3,
9284 + (m->mpc_irqflag >> 2) & 3, m->mpc_srcbusid,
9285 + m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
9286 }
9287 -#endif /* CONFIG_X86_NUMAQ */
9288
9289 /*
9290 * Read/parse the MPC
9291 */
9292
9293 -static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
9294 +static int __init smp_check_mpc(struct mp_config_table *mpc, char *oem,
9295 + char *str)
9296 {
9297 - char str[16];
9298 - char oem[10];
9299 - int count = sizeof(*mpc);
9300 - unsigned char *mpt = ((unsigned char *)mpc) + count;
9301
9302 if (memcmp(mpc->mpc_signature, MPC_SIGNATURE, 4)) {
9303 printk(KERN_ERR "MPTABLE: bad signature [%c%c%c%c]!\n",
9304 @@ -313,19 +280,41 @@ static int __init smp_read_mpc(struct mp
9305 }
9306 memcpy(oem, mpc->mpc_oem, 8);
9307 oem[8] = 0;
9308 - printk(KERN_INFO "MPTABLE: OEM ID: %s ", oem);
9309 + printk(KERN_INFO "MPTABLE: OEM ID: %s\n", oem);
9310
9311 memcpy(str, mpc->mpc_productid, 12);
9312 str[12] = 0;
9313 - printk("Product ID: %s ", str);
9314
9315 -#ifdef CONFIG_X86_32
9316 - mps_oem_check(mpc, oem, str);
9317 -#endif
9318 - printk(KERN_INFO "MPTABLE: Product ID: %s ", str);
9319 + printk(KERN_INFO "MPTABLE: Product ID: %s\n", str);
9320
9321 printk(KERN_INFO "MPTABLE: APIC at: 0x%X\n", mpc->mpc_lapic);
9322
9323 + return 1;
9324 +}
9325 +
9326 +static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
9327 +{
9328 + char str[16];
9329 + char oem[10];
9330 +
9331 + int count = sizeof(*mpc);
9332 + unsigned char *mpt = ((unsigned char *)mpc) + count;
9333 +
9334 + if (!smp_check_mpc(mpc, oem, str))
9335 + return 0;
9336 +
9337 +#ifdef CONFIG_X86_32
9338 + /*
9339 + * need to make sure summit and es7000's mps_oem_check is safe to be
9340 + * called early via genericarch 's mps_oem_check
9341 + */
9342 + if (early) {
9343 +#ifdef CONFIG_X86_NUMAQ
9344 + numaq_mps_oem_check(mpc, oem, str);
9345 +#endif
9346 + } else
9347 + mps_oem_check(mpc, oem, str);
9348 +#endif
9349 /* save the local APIC address, it might be non-default */
9350 if (!acpi_lapic)
9351 mp_lapic_addr = mpc->mpc_lapic;
9352 @@ -333,12 +322,17 @@ static int __init smp_read_mpc(struct mp
9353 if (early)
9354 return 1;
9355
9356 + if (mpc->mpc_oemptr && x86_quirks->smp_read_mpc_oem) {
9357 + struct mp_config_oemtable *oem_table = (struct mp_config_oemtable *)(unsigned long)mpc->mpc_oemptr;
9358 + x86_quirks->smp_read_mpc_oem(oem_table, mpc->mpc_oemsize);
9359 + }
9360 +
9361 /*
9362 * Now process the configuration blocks.
9363 */
9364 -#ifdef CONFIG_X86_NUMAQ
9365 - mpc_record = 0;
9366 -#endif
9367 + if (x86_quirks->mpc_record)
9368 + *x86_quirks->mpc_record = 0;
9369 +
9370 while (count < mpc->mpc_length) {
9371 switch (*mpt) {
9372 case MP_PROCESSOR:
9373 @@ -356,7 +350,9 @@ static int __init smp_read_mpc(struct mp
9374 {
9375 struct mpc_config_bus *m =
9376 (struct mpc_config_bus *)mpt;
9377 +#ifdef CONFIG_X86_IO_APIC
9378 MP_bus_info(m);
9379 +#endif
9380 mpt += sizeof(*m);
9381 count += sizeof(*m);
9382 break;
9383 @@ -402,10 +398,14 @@ static int __init smp_read_mpc(struct mp
9384 count = mpc->mpc_length;
9385 break;
9386 }
9387 -#ifdef CONFIG_X86_NUMAQ
9388 - ++mpc_record;
9389 -#endif
9390 + if (x86_quirks->mpc_record)
9391 + (*x86_quirks->mpc_record)++;
9392 }
9393 +
9394 +#ifdef CONFIG_X86_GENERICARCH
9395 + generic_bigsmp_probe();
9396 +#endif
9397 +
9398 setup_apic_routing();
9399 if (!num_processors)
9400 printk(KERN_ERR "MPTABLE: no processors registered!\n");
9401 @@ -431,7 +431,7 @@ static void __init construct_default_ioi
9402 intsrc.mpc_type = MP_INTSRC;
9403 intsrc.mpc_irqflag = 0; /* conforming */
9404 intsrc.mpc_srcbus = 0;
9405 - intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
9406 + intsrc.mpc_dstapic = mp_ioapics[0].mp_apicid;
9407
9408 intsrc.mpc_irqtype = mp_INT;
9409
9410 @@ -492,40 +492,11 @@ static void __init construct_default_ioi
9411 MP_intsrc_info(&intsrc);
9412 }
9413
9414 -#endif
9415
9416 -static inline void __init construct_default_ISA_mptable(int mpc_default_type)
9417 +static void __init construct_ioapic_table(int mpc_default_type)
9418 {
9419 - struct mpc_config_processor processor;
9420 - struct mpc_config_bus bus;
9421 -#ifdef CONFIG_X86_IO_APIC
9422 struct mpc_config_ioapic ioapic;
9423 -#endif
9424 - struct mpc_config_lintsrc lintsrc;
9425 - int linttypes[2] = { mp_ExtINT, mp_NMI };
9426 - int i;
9427 -
9428 - /*
9429 - * local APIC has default address
9430 - */
9431 - mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
9432 -
9433 - /*
9434 - * 2 CPUs, numbered 0 & 1.
9435 - */
9436 - processor.mpc_type = MP_PROCESSOR;
9437 - /* Either an integrated APIC or a discrete 82489DX. */
9438 - processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
9439 - processor.mpc_cpuflag = CPU_ENABLED;
9440 - processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
9441 - (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
9442 - processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
9443 - processor.mpc_reserved[0] = 0;
9444 - processor.mpc_reserved[1] = 0;
9445 - for (i = 0; i < 2; i++) {
9446 - processor.mpc_apicid = i;
9447 - MP_processor_info(&processor);
9448 - }
9449 + struct mpc_config_bus bus;
9450
9451 bus.mpc_type = MP_BUS;
9452 bus.mpc_busid = 0;
9453 @@ -554,7 +525,6 @@ static inline void __init construct_defa
9454 MP_bus_info(&bus);
9455 }
9456
9457 -#ifdef CONFIG_X86_IO_APIC
9458 ioapic.mpc_type = MP_IOAPIC;
9459 ioapic.mpc_apicid = 2;
9460 ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
9461 @@ -566,7 +536,42 @@ static inline void __init construct_defa
9462 * We set up most of the low 16 IO-APIC pins according to MPS rules.
9463 */
9464 construct_default_ioirq_mptable(mpc_default_type);
9465 +}
9466 +#else
9467 +static inline void __init construct_ioapic_table(int mpc_default_type) { }
9468 #endif
9469 +
9470 +static inline void __init construct_default_ISA_mptable(int mpc_default_type)
9471 +{
9472 + struct mpc_config_processor processor;
9473 + struct mpc_config_lintsrc lintsrc;
9474 + int linttypes[2] = { mp_ExtINT, mp_NMI };
9475 + int i;
9476 +
9477 + /*
9478 + * local APIC has default address
9479 + */
9480 + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
9481 +
9482 + /*
9483 + * 2 CPUs, numbered 0 & 1.
9484 + */
9485 + processor.mpc_type = MP_PROCESSOR;
9486 + /* Either an integrated APIC or a discrete 82489DX. */
9487 + processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
9488 + processor.mpc_cpuflag = CPU_ENABLED;
9489 + processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
9490 + (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
9491 + processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
9492 + processor.mpc_reserved[0] = 0;
9493 + processor.mpc_reserved[1] = 0;
9494 + for (i = 0; i < 2; i++) {
9495 + processor.mpc_apicid = i;
9496 + MP_processor_info(&processor);
9497 + }
9498 +
9499 + construct_ioapic_table(mpc_default_type);
9500 +
9501 lintsrc.mpc_type = MP_LINTSRC;
9502 lintsrc.mpc_irqflag = 0; /* conforming */
9503 lintsrc.mpc_srcbusid = 0;
9504 @@ -584,10 +589,14 @@ static struct intel_mp_floating *mpf_fou
9505 /*
9506 * Scan the memory blocks for an SMP configuration block.
9507 */
9508 -static void __init __get_smp_config(unsigned early)
9509 +static void __init __get_smp_config(unsigned int early)
9510 {
9511 struct intel_mp_floating *mpf = mpf_found;
9512
9513 + if (x86_quirks->mach_get_smp_config) {
9514 + if (x86_quirks->mach_get_smp_config(early))
9515 + return;
9516 + }
9517 if (acpi_lapic && early)
9518 return;
9519 /*
9520 @@ -604,7 +613,7 @@ static void __init __get_smp_config(unsi
9521
9522 printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n",
9523 mpf->mpf_specification);
9524 -#ifdef CONFIG_X86_32
9525 +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
9526 if (mpf->mpf_feature2 & (1 << 7)) {
9527 printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
9528 pic_mode = 1;
9529 @@ -635,8 +644,10 @@ static void __init __get_smp_config(unsi
9530 * Read the physical hardware table. Anything here will
9531 * override the defaults.
9532 */
9533 - if (!smp_read_mpc(isa_bus_to_virt(mpf->mpf_physptr), early)) {
9534 + if (!smp_read_mpc(_bus_to_virt(mpf->mpf_physptr), early)) {
9535 +#ifdef CONFIG_X86_LOCAL_APIC
9536 smp_found_config = 0;
9537 +#endif
9538 printk(KERN_ERR
9539 "BIOS bug, MP table errors detected!...\n");
9540 printk(KERN_ERR "... disabling SMP support. "
9541 @@ -690,10 +701,11 @@ void __init get_smp_config(void)
9542 static int __init smp_scan_config(unsigned long base, unsigned long length,
9543 unsigned reserve)
9544 {
9545 - unsigned int *bp = isa_bus_to_virt(base);
9546 + unsigned int *bp = _bus_to_virt(base);
9547 struct intel_mp_floating *mpf;
9548
9549 - Dprintk("Scan SMP from %p for %ld bytes.\n", bp, length);
9550 + apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n",
9551 + bp, length);
9552 BUILD_BUG_ON(sizeof(*mpf) != 16);
9553
9554 while (length > 0) {
9555 @@ -703,16 +715,22 @@ static int __init smp_scan_config(unsign
9556 !mpf_checksum((unsigned char *)bp, 16) &&
9557 ((mpf->mpf_specification == 1)
9558 || (mpf->mpf_specification == 4))) {
9559 -
9560 +#ifdef CONFIG_X86_LOCAL_APIC
9561 smp_found_config = 1;
9562 +#endif
9563 mpf_found = mpf;
9564 -#ifdef CONFIG_X86_32
9565 +
9566 #ifndef CONFIG_XEN
9567 printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
9568 mpf, virt_to_phys(mpf));
9569 - reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE,
9570 +
9571 + if (!reserve)
9572 + return 1;
9573 + reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE,
9574 BOOTMEM_DEFAULT);
9575 if (mpf->mpf_physptr) {
9576 + unsigned long size = PAGE_SIZE;
9577 +#ifdef CONFIG_X86_32
9578 /*
9579 * We cannot access to MPC table to compute
9580 * table size yet, as only few megabytes from
9581 @@ -722,27 +740,18 @@ static int __init smp_scan_config(unsign
9582 * PAGE_SIZE from mpg->mpf_physptr yields BUG()
9583 * in reserve_bootmem.
9584 */
9585 - unsigned long size = PAGE_SIZE;
9586 unsigned long end = max_low_pfn * PAGE_SIZE;
9587 if (mpf->mpf_physptr + size > end)
9588 size = end - mpf->mpf_physptr;
9589 - reserve_bootmem(mpf->mpf_physptr, size,
9590 +#endif
9591 + reserve_bootmem_generic(mpf->mpf_physptr, size,
9592 BOOTMEM_DEFAULT);
9593 }
9594 #else
9595 printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
9596 - mpf, ((void *)bp - isa_bus_to_virt(base)) + base);
9597 -#endif
9598 -#elif !defined(CONFIG_XEN)
9599 - if (!reserve)
9600 - return 1;
9601 -
9602 - reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE);
9603 - if (mpf->mpf_physptr)
9604 - reserve_bootmem_generic(mpf->mpf_physptr,
9605 - PAGE_SIZE);
9606 + mpf, ((void *)bp - _bus_to_virt(base)) + base);
9607 #endif
9608 - return 1;
9609 + return 1;
9610 }
9611 bp += 4;
9612 length -= 16;
9613 @@ -750,12 +759,16 @@ static int __init smp_scan_config(unsign
9614 return 0;
9615 }
9616
9617 -static void __init __find_smp_config(unsigned reserve)
9618 +static void __init __find_smp_config(unsigned int reserve)
9619 {
9620 #ifndef CONFIG_XEN
9621 unsigned int address;
9622 #endif
9623
9624 + if (x86_quirks->mach_find_smp_config) {
9625 + if (x86_quirks->mach_find_smp_config(reserve))
9626 + return;
9627 + }
9628 /*
9629 * FIXME: Linux assumes you have 640K of base ram..
9630 * this continues the error...
9631 @@ -802,300 +815,297 @@ void __init find_smp_config(void)
9632 __find_smp_config(1);
9633 }
9634
9635 -/* --------------------------------------------------------------------------
9636 - ACPI-based MP Configuration
9637 - -------------------------------------------------------------------------- */
9638 -
9639 -/*
9640 - * Keep this outside and initialized to 0, for !CONFIG_ACPI builds:
9641 - */
9642 -int es7000_plat;
9643 -
9644 -#ifdef CONFIG_ACPI
9645 +#ifdef CONFIG_X86_IO_APIC
9646 +static u8 __initdata irq_used[MAX_IRQ_SOURCES];
9647
9648 -#ifdef CONFIG_X86_IO_APIC
9649 +static int __init get_MP_intsrc_index(struct mpc_config_intsrc *m)
9650 +{
9651 + int i;
9652
9653 -#define MP_ISA_BUS 0
9654 + if (m->mpc_irqtype != mp_INT)
9655 + return 0;
9656
9657 -extern struct mp_ioapic_routing mp_ioapic_routing[MAX_IO_APICS];
9658 + if (m->mpc_irqflag != 0x0f)
9659 + return 0;
9660
9661 -static int mp_find_ioapic(int gsi)
9662 -{
9663 - int i = 0;
9664 + /* not legacy */
9665
9666 - /* Find the IOAPIC that manages this GSI. */
9667 - for (i = 0; i < nr_ioapics; i++) {
9668 - if ((gsi >= mp_ioapic_routing[i].gsi_base)
9669 - && (gsi <= mp_ioapic_routing[i].gsi_end))
9670 - return i;
9671 + for (i = 0; i < mp_irq_entries; i++) {
9672 + if (mp_irqs[i].mp_irqtype != mp_INT)
9673 + continue;
9674 +
9675 + if (mp_irqs[i].mp_irqflag != 0x0f)
9676 + continue;
9677 +
9678 + if (mp_irqs[i].mp_srcbus != m->mpc_srcbus)
9679 + continue;
9680 + if (mp_irqs[i].mp_srcbusirq != m->mpc_srcbusirq)
9681 + continue;
9682 + if (irq_used[i]) {
9683 + /* already claimed */
9684 + return -2;
9685 + }
9686 + irq_used[i] = 1;
9687 + return i;
9688 }
9689
9690 - printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
9691 + /* not found */
9692 return -1;
9693 }
9694
9695 -static u8 __init uniq_ioapic_id(u8 id)
9696 -{
9697 -#ifdef CONFIG_X86_32
9698 - if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
9699 - !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
9700 - return io_apic_get_unique_id(nr_ioapics, id);
9701 - else
9702 - return id;
9703 -#else
9704 - int i;
9705 - DECLARE_BITMAP(used, 256);
9706 - bitmap_zero(used, 256);
9707 - for (i = 0; i < nr_ioapics; i++) {
9708 - struct mpc_config_ioapic *ia = &mp_ioapics[i];
9709 - __set_bit(ia->mpc_apicid, used);
9710 - }
9711 - if (!test_bit(id, used))
9712 - return id;
9713 - return find_first_zero_bit(used, 256);
9714 +#define SPARE_SLOT_NUM 20
9715 +
9716 +static struct mpc_config_intsrc __initdata *m_spare[SPARE_SLOT_NUM];
9717 #endif
9718 -}
9719
9720 -void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
9721 +static int __init replace_intsrc_all(struct mp_config_table *mpc,
9722 + unsigned long mpc_new_phys,
9723 + unsigned long mpc_new_length)
9724 {
9725 - int idx = 0;
9726 -
9727 - if (bad_ioapic(address))
9728 - return;
9729 +#ifdef CONFIG_X86_IO_APIC
9730 + int i;
9731 + int nr_m_spare = 0;
9732 +#endif
9733
9734 - idx = nr_ioapics;
9735 + int count = sizeof(*mpc);
9736 + unsigned char *mpt = ((unsigned char *)mpc) + count;
9737
9738 - mp_ioapics[idx].mpc_type = MP_IOAPIC;
9739 - mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
9740 - mp_ioapics[idx].mpc_apicaddr = address;
9741 + printk(KERN_INFO "mpc_length %x\n", mpc->mpc_length);
9742 + while (count < mpc->mpc_length) {
9743 + switch (*mpt) {
9744 + case MP_PROCESSOR:
9745 + {
9746 + struct mpc_config_processor *m =
9747 + (struct mpc_config_processor *)mpt;
9748 + mpt += sizeof(*m);
9749 + count += sizeof(*m);
9750 + break;
9751 + }
9752 + case MP_BUS:
9753 + {
9754 + struct mpc_config_bus *m =
9755 + (struct mpc_config_bus *)mpt;
9756 + mpt += sizeof(*m);
9757 + count += sizeof(*m);
9758 + break;
9759 + }
9760 + case MP_IOAPIC:
9761 + {
9762 + mpt += sizeof(struct mpc_config_ioapic);
9763 + count += sizeof(struct mpc_config_ioapic);
9764 + break;
9765 + }
9766 + case MP_INTSRC:
9767 + {
9768 +#ifdef CONFIG_X86_IO_APIC
9769 + struct mpc_config_intsrc *m =
9770 + (struct mpc_config_intsrc *)mpt;
9771
9772 -#ifndef CONFIG_XEN
9773 - set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
9774 + apic_printk(APIC_VERBOSE, "OLD ");
9775 + print_MP_intsrc_info(m);
9776 + i = get_MP_intsrc_index(m);
9777 + if (i > 0) {
9778 + assign_to_mpc_intsrc(&mp_irqs[i], m);
9779 + apic_printk(APIC_VERBOSE, "NEW ");
9780 + print_mp_irq_info(&mp_irqs[i]);
9781 + } else if (!i) {
9782 + /* legacy, do nothing */
9783 + } else if (nr_m_spare < SPARE_SLOT_NUM) {
9784 + /*
9785 + * not found (-1), or duplicated (-2)
9786 + * are invalid entries,
9787 + * we need to use the slot later
9788 + */
9789 + m_spare[nr_m_spare] = m;
9790 + nr_m_spare++;
9791 + }
9792 #endif
9793 - mp_ioapics[idx].mpc_apicid = uniq_ioapic_id(id);
9794 -#ifdef CONFIG_X86_32
9795 - mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
9796 -#else
9797 - mp_ioapics[idx].mpc_apicver = 0;
9798 + mpt += sizeof(struct mpc_config_intsrc);
9799 + count += sizeof(struct mpc_config_intsrc);
9800 + break;
9801 + }
9802 + case MP_LINTSRC:
9803 + {
9804 + struct mpc_config_lintsrc *m =
9805 + (struct mpc_config_lintsrc *)mpt;
9806 + mpt += sizeof(*m);
9807 + count += sizeof(*m);
9808 + break;
9809 + }
9810 + default:
9811 + /* wrong mptable */
9812 + printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n");
9813 + printk(KERN_ERR "type %x\n", *mpt);
9814 + print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16,
9815 + 1, mpc, mpc->mpc_length, 1);
9816 + goto out;
9817 + }
9818 + }
9819 +
9820 +#ifdef CONFIG_X86_IO_APIC
9821 + for (i = 0; i < mp_irq_entries; i++) {
9822 + if (irq_used[i])
9823 + continue;
9824 +
9825 + if (mp_irqs[i].mp_irqtype != mp_INT)
9826 + continue;
9827 +
9828 + if (mp_irqs[i].mp_irqflag != 0x0f)
9829 + continue;
9830 +
9831 + if (nr_m_spare > 0) {
9832 + apic_printk(APIC_VERBOSE, "*NEW* found\n");
9833 + nr_m_spare--;
9834 + assign_to_mpc_intsrc(&mp_irqs[i], m_spare[nr_m_spare]);
9835 + m_spare[nr_m_spare] = NULL;
9836 + } else {
9837 + struct mpc_config_intsrc *m =
9838 + (struct mpc_config_intsrc *)mpt;
9839 + count += sizeof(struct mpc_config_intsrc);
9840 + if (!mpc_new_phys) {
9841 + printk(KERN_INFO "No spare slots, try to append...take your risk, new mpc_length %x\n", count);
9842 + } else {
9843 + if (count <= mpc_new_length)
9844 + printk(KERN_INFO "No spare slots, try to append..., new mpc_length %x\n", count);
9845 + else {
9846 + printk(KERN_ERR "mpc_new_length %lx is too small\n", mpc_new_length);
9847 + goto out;
9848 + }
9849 + }
9850 + assign_to_mpc_intsrc(&mp_irqs[i], m);
9851 + mpc->mpc_length = count;
9852 + mpt += sizeof(struct mpc_config_intsrc);
9853 + }
9854 + print_mp_irq_info(&mp_irqs[i]);
9855 + }
9856 #endif
9857 - /*
9858 - * Build basic GSI lookup table to facilitate gsi->io_apic lookups
9859 - * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
9860 - */
9861 - mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
9862 - mp_ioapic_routing[idx].gsi_base = gsi_base;
9863 - mp_ioapic_routing[idx].gsi_end = gsi_base +
9864 - io_apic_get_redir_entries(idx);
9865 -
9866 - printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
9867 - "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
9868 - mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
9869 - mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end);
9870 +out:
9871 + /* update checksum */
9872 + mpc->mpc_checksum = 0;
9873 + mpc->mpc_checksum -= mpf_checksum((unsigned char *)mpc,
9874 + mpc->mpc_length);
9875
9876 - nr_ioapics++;
9877 + return 0;
9878 }
9879
9880 -void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
9881 -{
9882 - struct mpc_config_intsrc intsrc;
9883 - int ioapic = -1;
9884 - int pin = -1;
9885 -
9886 - /*
9887 - * Convert 'gsi' to 'ioapic.pin'.
9888 - */
9889 - ioapic = mp_find_ioapic(gsi);
9890 - if (ioapic < 0)
9891 - return;
9892 - pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
9893 +static int __initdata enable_update_mptable;
9894
9895 - /*
9896 - * TBD: This check is for faulty timer entries, where the override
9897 - * erroneously sets the trigger to level, resulting in a HUGE
9898 - * increase of timer interrupts!
9899 - */
9900 - if ((bus_irq == 0) && (trigger == 3))
9901 - trigger = 1;
9902 +static int __init update_mptable_setup(char *str)
9903 +{
9904 + enable_update_mptable = 1;
9905 + return 0;
9906 +}
9907 +early_param("update_mptable", update_mptable_setup);
9908
9909 - intsrc.mpc_type = MP_INTSRC;
9910 - intsrc.mpc_irqtype = mp_INT;
9911 - intsrc.mpc_irqflag = (trigger << 2) | polarity;
9912 - intsrc.mpc_srcbus = MP_ISA_BUS;
9913 - intsrc.mpc_srcbusirq = bus_irq; /* IRQ */
9914 - intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */
9915 - intsrc.mpc_dstirq = pin; /* INTIN# */
9916 +static unsigned long __initdata mpc_new_phys;
9917 +static unsigned long mpc_new_length __initdata = 4096;
9918
9919 - MP_intsrc_info(&intsrc);
9920 +/* alloc_mptable or alloc_mptable=4k */
9921 +static int __initdata alloc_mptable;
9922 +static int __init parse_alloc_mptable_opt(char *p)
9923 +{
9924 + enable_update_mptable = 1;
9925 + alloc_mptable = 1;
9926 + if (!p)
9927 + return 0;
9928 + mpc_new_length = memparse(p, &p);
9929 + return 0;
9930 }
9931 +early_param("alloc_mptable", parse_alloc_mptable_opt);
9932
9933 -void __init mp_config_acpi_legacy_irqs(void)
9934 +void __init early_reserve_e820_mpc_new(void)
9935 {
9936 - struct mpc_config_intsrc intsrc;
9937 - int i = 0;
9938 - int ioapic = -1;
9939 -
9940 -#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
9941 - /*
9942 - * Fabricate the legacy ISA bus (bus #31).
9943 - */
9944 - mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
9945 + if (enable_update_mptable && alloc_mptable) {
9946 + u64 startt = 0;
9947 +#ifdef CONFIG_X86_TRAMPOLINE
9948 + startt = TRAMPOLINE_BASE;
9949 #endif
9950 - set_bit(MP_ISA_BUS, mp_bus_not_pci);
9951 - Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
9952 -
9953 - /*
9954 - * Older generations of ES7000 have no legacy identity mappings
9955 - */
9956 - if (es7000_plat == 1)
9957 - return;
9958 -
9959 - /*
9960 - * Locate the IOAPIC that manages the ISA IRQs (0-15).
9961 - */
9962 - ioapic = mp_find_ioapic(0);
9963 - if (ioapic < 0)
9964 - return;
9965 -
9966 - intsrc.mpc_type = MP_INTSRC;
9967 - intsrc.mpc_irqflag = 0; /* Conforming */
9968 - intsrc.mpc_srcbus = MP_ISA_BUS;
9969 -#ifdef CONFIG_X86_IO_APIC
9970 - intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
9971 -#endif
9972 - /*
9973 - * Use the default configuration for the IRQs 0-15. Unless
9974 - * overridden by (MADT) interrupt source override entries.
9975 - */
9976 - for (i = 0; i < 16; i++) {
9977 - int idx;
9978 -
9979 - for (idx = 0; idx < mp_irq_entries; idx++) {
9980 - struct mpc_config_intsrc *irq = mp_irqs + idx;
9981 -
9982 - /* Do we already have a mapping for this ISA IRQ? */
9983 - if (irq->mpc_srcbus == MP_ISA_BUS
9984 - && irq->mpc_srcbusirq == i)
9985 - break;
9986 -
9987 - /* Do we already have a mapping for this IOAPIC pin */
9988 - if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
9989 - (irq->mpc_dstirq == i))
9990 - break;
9991 - }
9992 -
9993 - if (idx != mp_irq_entries) {
9994 - printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
9995 - continue; /* IRQ already used */
9996 - }
9997 -
9998 - intsrc.mpc_irqtype = mp_INT;
9999 - intsrc.mpc_srcbusirq = i; /* Identity mapped */
10000 - intsrc.mpc_dstirq = i;
10001 -
10002 - MP_intsrc_info(&intsrc);
10003 + mpc_new_phys = early_reserve_e820(startt, mpc_new_length, 4);
10004 }
10005 }
10006
10007 -int mp_register_gsi(u32 gsi, int triggering, int polarity)
10008 +static int __init update_mp_table(void)
10009 {
10010 - int ioapic;
10011 - int ioapic_pin;
10012 -#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
10013 -#define MAX_GSI_NUM 4096
10014 -#define IRQ_COMPRESSION_START 64
10015 + char str[16];
10016 + char oem[10];
10017 + struct intel_mp_floating *mpf;
10018 + struct mp_config_table *mpc;
10019 + struct mp_config_table *mpc_new;
10020 +
10021 + if (!enable_update_mptable)
10022 + return 0;
10023 +
10024 + mpf = mpf_found;
10025 + if (!mpf)
10026 + return 0;
10027
10028 - static int pci_irq = IRQ_COMPRESSION_START;
10029 /*
10030 - * Mapping between Global System Interrupts, which
10031 - * represent all possible interrupts, and IRQs
10032 - * assigned to actual devices.
10033 + * Now see if we need to go further.
10034 */
10035 - static int gsi_to_irq[MAX_GSI_NUM];
10036 -#else
10037 + if (mpf->mpf_feature1 != 0)
10038 + return 0;
10039
10040 - if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
10041 - return gsi;
10042 -#endif
10043 + if (!mpf->mpf_physptr)
10044 + return 0;
10045
10046 - /* Don't set up the ACPI SCI because it's already set up */
10047 - if (acpi_gbl_FADT.sci_interrupt == gsi)
10048 - return gsi;
10049 + mpc = _bus_to_virt(mpf->mpf_physptr);
10050
10051 - ioapic = mp_find_ioapic(gsi);
10052 - if (ioapic < 0) {
10053 - printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
10054 - return gsi;
10055 - }
10056 + if (!smp_check_mpc(mpc, oem, str))
10057 + return 0;
10058
10059 - ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
10060 + printk(KERN_INFO "mpf: %lx\n", (long)arbitrary_virt_to_machine(mpf));
10061 + printk(KERN_INFO "mpf_physptr: %x\n", mpf->mpf_physptr);
10062
10063 -#ifndef CONFIG_X86_32
10064 - if (ioapic_renumber_irq)
10065 - gsi = ioapic_renumber_irq(ioapic, gsi);
10066 -#endif
10067 + if (mpc_new_phys && mpc->mpc_length > mpc_new_length) {
10068 + mpc_new_phys = 0;
10069 + printk(KERN_INFO "mpc_new_length is %ld, please use alloc_mptable=8k\n",
10070 + mpc_new_length);
10071 + }
10072 +
10073 + if (!mpc_new_phys) {
10074 + unsigned char old, new;
10075 + /* check if we can change the postion */
10076 + mpc->mpc_checksum = 0;
10077 + old = mpf_checksum((unsigned char *)mpc, mpc->mpc_length);
10078 + mpc->mpc_checksum = 0xff;
10079 + new = mpf_checksum((unsigned char *)mpc, mpc->mpc_length);
10080 + if (old == new) {
10081 + printk(KERN_INFO "mpc is readonly, please try alloc_mptable instead\n");
10082 + return 0;
10083 + }
10084 + printk(KERN_INFO "use in-positon replacing\n");
10085 + } else {
10086 + maddr_t mpc_new_bus;
10087
10088 - /*
10089 - * Avoid pin reprogramming. PRTs typically include entries
10090 - * with redundant pin->gsi mappings (but unique PCI devices);
10091 - * we only program the IOAPIC on the first.
10092 - */
10093 - if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
10094 - printk(KERN_ERR "Invalid reference to IOAPIC pin "
10095 - "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
10096 - ioapic_pin);
10097 - return gsi;
10098 - }
10099 - if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) {
10100 - Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
10101 - mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
10102 -#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
10103 - return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
10104 -#else
10105 - return gsi;
10106 -#endif
10107 + mpc_new_bus = phys_to_machine(mpc_new_phys);
10108 + mpf->mpf_physptr = mpc_new_bus;
10109 + mpc_new = phys_to_virt(mpc_new_phys);
10110 + memcpy(mpc_new, mpc, mpc->mpc_length);
10111 + mpc = mpc_new;
10112 + /* check if we can modify that */
10113 + if (mpc_new_bus - mpf->mpf_physptr) {
10114 + struct intel_mp_floating *mpf_new;
10115 + /* steal 16 bytes from [0, 1k) */
10116 + printk(KERN_INFO "mpf new: %x\n", 0x400 - 16);
10117 + mpf_new = isa_bus_to_virt(0x400 - 16);
10118 + memcpy(mpf_new, mpf, 16);
10119 + mpf = mpf_new;
10120 + mpf->mpf_physptr = mpc_new_bus;
10121 + }
10122 + mpf->mpf_checksum = 0;
10123 + mpf->mpf_checksum -= mpf_checksum((unsigned char *)mpf, 16);
10124 + printk(KERN_INFO "mpf_physptr new: %x\n", mpf->mpf_physptr);
10125 }
10126
10127 - set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed);
10128 -#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
10129 /*
10130 - * For GSI >= 64, use IRQ compression
10131 + * only replace the one with mp_INT and
10132 + * MP_IRQ_TRIGGER_LEVEL|MP_IRQ_POLARITY_LOW,
10133 + * already in mp_irqs , stored by ... and mp_config_acpi_gsi,
10134 + * may need pci=routeirq for all coverage
10135 */
10136 - if ((gsi >= IRQ_COMPRESSION_START)
10137 - && (triggering == ACPI_LEVEL_SENSITIVE)) {
10138 - /*
10139 - * For PCI devices assign IRQs in order, avoiding gaps
10140 - * due to unused I/O APIC pins.
10141 - */
10142 - int irq = gsi;
10143 - if (gsi < MAX_GSI_NUM) {
10144 - /*
10145 - * Retain the VIA chipset work-around (gsi > 15), but
10146 - * avoid a problem where the 8254 timer (IRQ0) is setup
10147 - * via an override (so it's not on pin 0 of the ioapic),
10148 - * and at the same time, the pin 0 interrupt is a PCI
10149 - * type. The gsi > 15 test could cause these two pins
10150 - * to be shared as IRQ0, and they are not shareable.
10151 - * So test for this condition, and if necessary, avoid
10152 - * the pin collision.
10153 - */
10154 - gsi = pci_irq++;
10155 - /*
10156 - * Don't assign IRQ used by ACPI SCI
10157 - */
10158 - if (gsi == acpi_gbl_FADT.sci_interrupt)
10159 - gsi = pci_irq++;
10160 - gsi_to_irq[irq] = gsi;
10161 - } else {
10162 - printk(KERN_ERR "GSI %u is too high\n", gsi);
10163 - return gsi;
10164 - }
10165 - }
10166 -#endif
10167 - io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
10168 - triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
10169 - polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
10170 - return gsi;
10171 + replace_intsrc_all(mpc, mpc_new_phys, mpc_new_length);
10172 +
10173 + return 0;
10174 }
10175
10176 -#endif /* CONFIG_X86_IO_APIC */
10177 -#endif /* CONFIG_ACPI */
10178 +late_initcall(update_mp_table);
10179 --- sle11-2009-10-16.orig/arch/x86/kernel/nmi.c 2009-10-28 14:55:02.000000000 +0100
10180 +++ sle11-2009-10-16/arch/x86/kernel/nmi.c 2009-06-04 10:21:39.000000000 +0200
10181 @@ -27,7 +27,9 @@
10182 #include <linux/kdebug.h>
10183 #include <linux/smp.h>
10184
10185 +#ifndef CONFIG_XEN
10186 #include <asm/i8259.h>
10187 +#endif
10188 #include <asm/io_apic.h>
10189 #include <asm/smp.h>
10190 #include <asm/nmi.h>
10191 @@ -179,8 +181,10 @@ int __init check_nmi_watchdog(void)
10192 kfree(prev_nmi_count);
10193 return 0;
10194 error:
10195 +#ifndef CONFIG_XEN
10196 if (nmi_watchdog == NMI_IO_APIC && !timer_through_8259)
10197 disable_8259A_irq(0);
10198 +#endif
10199 #ifdef CONFIG_X86_32
10200 timer_ack = 0;
10201 #endif
10202 --- sle11-2009-10-16.orig/arch/x86/kernel/pci-dma-xen.c 2009-10-22 11:31:59.000000000 +0200
10203 +++ sle11-2009-10-16/arch/x86/kernel/pci-dma-xen.c 2009-06-04 10:21:39.000000000 +0200
10204 @@ -5,13 +5,13 @@
10205
10206 #include <asm/proto.h>
10207 #include <asm/dma.h>
10208 -#include <asm/gart.h>
10209 +#include <asm/iommu.h>
10210 #include <asm/calgary.h>
10211 +#include <asm/amd_iommu.h>
10212
10213 -int forbid_dac __read_mostly;
10214 -EXPORT_SYMBOL(forbid_dac);
10215 +static int forbid_dac __read_mostly;
10216
10217 -const struct dma_mapping_ops *dma_ops;
10218 +struct dma_mapping_ops *dma_ops;
10219 EXPORT_SYMBOL(dma_ops);
10220
10221 static int iommu_sac_force __read_mostly;
10222 @@ -74,13 +74,17 @@ early_param("dma32_size", parse_dma32_si
10223 void __init dma32_reserve_bootmem(void)
10224 {
10225 unsigned long size, align;
10226 - if (end_pfn <= MAX_DMA32_PFN)
10227 + if (max_pfn <= MAX_DMA32_PFN)
10228 return;
10229
10230 + /*
10231 + * check aperture_64.c allocate_aperture() for reason about
10232 + * using 512M as goal
10233 + */
10234 align = 64ULL<<20;
10235 size = round_up(dma32_bootmem_size, align);
10236 dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align,
10237 - __pa(MAX_DMA_ADDRESS));
10238 + 512ULL<<20);
10239 if (dma32_bootmem_ptr)
10240 dma32_bootmem_size = size;
10241 else
10242 @@ -88,17 +92,14 @@ void __init dma32_reserve_bootmem(void)
10243 }
10244 static void __init dma32_free_bootmem(void)
10245 {
10246 - int node;
10247
10248 - if (end_pfn <= MAX_DMA32_PFN)
10249 + if (max_pfn <= MAX_DMA32_PFN)
10250 return;
10251
10252 if (!dma32_bootmem_ptr)
10253 return;
10254
10255 - for_each_online_node(node)
10256 - free_bootmem_node(NODE_DATA(node), __pa(dma32_bootmem_ptr),
10257 - dma32_bootmem_size);
10258 + free_bootmem(__pa(dma32_bootmem_ptr), dma32_bootmem_size);
10259
10260 dma32_bootmem_ptr = NULL;
10261 dma32_bootmem_size = 0;
10262 @@ -107,7 +108,7 @@ static void __init dma32_free_bootmem(vo
10263 #define dma32_free_bootmem() ((void)0)
10264 #endif
10265
10266 -static const struct dma_mapping_ops swiotlb_dma_ops = {
10267 +static struct dma_mapping_ops swiotlb_dma_ops = {
10268 .mapping_error = swiotlb_dma_mapping_error,
10269 .map_single = swiotlb_map_single_phys,
10270 .unmap_single = swiotlb_unmap_single,
10271 @@ -130,25 +131,31 @@ void __init pci_iommu_alloc(void)
10272 * The order of these functions is important for
10273 * fall-back/fail-over reasons
10274 */
10275 -#ifdef CONFIG_GART_IOMMU
10276 gart_iommu_hole_init();
10277 -#endif
10278
10279 -#ifdef CONFIG_CALGARY_IOMMU
10280 detect_calgary();
10281 -#endif
10282
10283 detect_intel_iommu();
10284
10285 -#ifdef CONFIG_SWIOTLB
10286 + amd_iommu_detect();
10287 +
10288 swiotlb_init();
10289 if (swiotlb) {
10290 printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n");
10291 dma_ops = &swiotlb_dma_ops;
10292 }
10293 -#endif
10294 }
10295
10296 +#ifndef CONFIG_XEN
10297 +unsigned long iommu_num_pages(unsigned long addr, unsigned long len)
10298 +{
10299 + unsigned long size = roundup((addr & ~PAGE_MASK) + len, PAGE_SIZE);
10300 +
10301 + return size >> PAGE_SHIFT;
10302 +}
10303 +EXPORT_SYMBOL(iommu_num_pages);
10304 +#endif
10305 +
10306 /*
10307 * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter
10308 * documentation.
10309 @@ -201,9 +208,7 @@ static __init int iommu_setup(char *p)
10310 swiotlb = 1;
10311 #endif
10312
10313 -#ifdef CONFIG_GART_IOMMU
10314 gart_parse_options(p);
10315 -#endif
10316
10317 #ifdef CONFIG_CALGARY_IOMMU
10318 if (!strncmp(p, "calgary", 7))
10319 @@ -245,136 +250,19 @@ int range_straddles_page_boundary(paddr_
10320 !check_pages_physically_contiguous(pfn, offset, size));
10321 }
10322
10323 -#ifdef CONFIG_X86_32
10324 -int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
10325 - dma_addr_t device_addr, size_t size, int flags)
10326 -{
10327 - void __iomem *mem_base = NULL;
10328 - int pages = size >> PAGE_SHIFT;
10329 - int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long);
10330 -
10331 - if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0)
10332 - goto out;
10333 - if (!size)
10334 - goto out;
10335 - if (dev->dma_mem)
10336 - goto out;
10337 -
10338 - /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */
10339 -
10340 - mem_base = ioremap(bus_addr, size);
10341 - if (!mem_base)
10342 - goto out;
10343 -
10344 - dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
10345 - if (!dev->dma_mem)
10346 - goto out;
10347 - dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
10348 - if (!dev->dma_mem->bitmap)
10349 - goto free1_out;
10350 -
10351 - dev->dma_mem->virt_base = mem_base;
10352 - dev->dma_mem->device_base = device_addr;
10353 - dev->dma_mem->size = pages;
10354 - dev->dma_mem->flags = flags;
10355 -
10356 - if (flags & DMA_MEMORY_MAP)
10357 - return DMA_MEMORY_MAP;
10358 -
10359 - return DMA_MEMORY_IO;
10360 -
10361 - free1_out:
10362 - kfree(dev->dma_mem);
10363 - out:
10364 - if (mem_base)
10365 - iounmap(mem_base);
10366 - return 0;
10367 -}
10368 -EXPORT_SYMBOL(dma_declare_coherent_memory);
10369 -
10370 -void dma_release_declared_memory(struct device *dev)
10371 -{
10372 - struct dma_coherent_mem *mem = dev->dma_mem;
10373 -
10374 - if (!mem)
10375 - return;
10376 - dev->dma_mem = NULL;
10377 - iounmap(mem->virt_base);
10378 - kfree(mem->bitmap);
10379 - kfree(mem);
10380 -}
10381 -EXPORT_SYMBOL(dma_release_declared_memory);
10382 -
10383 -void *dma_mark_declared_memory_occupied(struct device *dev,
10384 - dma_addr_t device_addr, size_t size)
10385 -{
10386 - struct dma_coherent_mem *mem = dev->dma_mem;
10387 - int pos, err;
10388 - int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1);
10389 -
10390 - pages >>= PAGE_SHIFT;
10391 -
10392 - if (!mem)
10393 - return ERR_PTR(-EINVAL);
10394 -
10395 - pos = (device_addr - mem->device_base) >> PAGE_SHIFT;
10396 - err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages));
10397 - if (err != 0)
10398 - return ERR_PTR(err);
10399 - return mem->virt_base + (pos << PAGE_SHIFT);
10400 -}
10401 -EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
10402 -
10403 -static int dma_alloc_from_coherent_mem(struct device *dev, ssize_t size,
10404 - dma_addr_t *dma_handle, void **ret)
10405 -{
10406 - struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
10407 - int order = get_order(size);
10408 -
10409 - if (mem) {
10410 - int page = bitmap_find_free_region(mem->bitmap, mem->size,
10411 - order);
10412 - if (page >= 0) {
10413 - *dma_handle = mem->device_base + (page << PAGE_SHIFT);
10414 - *ret = mem->virt_base + (page << PAGE_SHIFT);
10415 - memset(*ret, 0, size);
10416 - }
10417 - if (mem->flags & DMA_MEMORY_EXCLUSIVE)
10418 - *ret = NULL;
10419 - }
10420 - return (mem != NULL);
10421 -}
10422 -
10423 -static int dma_release_coherent(struct device *dev, int order, void *vaddr)
10424 -{
10425 - struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
10426 -
10427 - if (mem && vaddr >= mem->virt_base && vaddr <
10428 - (mem->virt_base + (mem->size << PAGE_SHIFT))) {
10429 - int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
10430 -
10431 - bitmap_release_region(mem->bitmap, page, order);
10432 - return 1;
10433 - }
10434 - return 0;
10435 -}
10436 -#else
10437 -#define dma_alloc_from_coherent_mem(dev, size, handle, ret) (0)
10438 -#define dma_release_coherent(dev, order, vaddr) (0)
10439 -#endif /* CONFIG_X86_32 */
10440 -
10441 int dma_supported(struct device *dev, u64 mask)
10442 {
10443 + struct dma_mapping_ops *ops = get_dma_ops(dev);
10444 +
10445 #ifdef CONFIG_PCI
10446 if (mask > 0xffffffff && forbid_dac > 0) {
10447 - printk(KERN_INFO "PCI: Disallowing DAC for device %s\n",
10448 - dev->bus_id);
10449 + dev_info(dev, "PCI: Disallowing DAC for device\n");
10450 return 0;
10451 }
10452 #endif
10453
10454 - if (dma_ops->dma_supported)
10455 - return dma_ops->dma_supported(dev, mask);
10456 + if (ops->dma_supported)
10457 + return ops->dma_supported(dev, mask);
10458
10459 /* Copied from i386. Doesn't make much sense, because it will
10460 only work for pci_alloc_coherent.
10461 @@ -395,8 +283,7 @@ int dma_supported(struct device *dev, u6
10462 type. Normally this doesn't make any difference, but gives
10463 more gentle handling of IOMMU overflow. */
10464 if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) {
10465 - printk(KERN_INFO "%s: Force SAC with mask %Lx\n",
10466 - dev->bus_id, mask);
10467 + dev_info(dev, "Force SAC with mask %Lx\n", mask);
10468 return 0;
10469 }
10470
10471 @@ -422,6 +309,9 @@ void *
10472 dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
10473 gfp_t gfp)
10474 {
10475 +#ifndef CONFIG_XEN
10476 + struct dma_mapping_ops *ops = get_dma_ops(dev);
10477 +#endif
10478 void *memory = NULL;
10479 struct page *page;
10480 unsigned long dma_mask = 0;
10481 @@ -431,7 +321,7 @@ dma_alloc_coherent(struct device *dev, s
10482 /* ignore region specifiers */
10483 gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
10484
10485 - if (dma_alloc_from_coherent_mem(dev, size, dma_handle, &memory))
10486 + if (dma_alloc_from_coherent(dev, size, dma_handle, &memory))
10487 return memory;
10488
10489 if (!dev) {
10490 @@ -491,8 +381,8 @@ dma_alloc_coherent(struct device *dev, s
10491 /* Let low level make its own zone decisions */
10492 gfp &= ~(GFP_DMA32|GFP_DMA);
10493
10494 - if (dma_ops->alloc_coherent)
10495 - return dma_ops->alloc_coherent(dev, size,
10496 + if (ops->alloc_coherent)
10497 + return ops->alloc_coherent(dev, size,
10498 dma_handle, gfp);
10499 return NULL;
10500 }
10501 @@ -504,14 +394,14 @@ dma_alloc_coherent(struct device *dev, s
10502 }
10503 }
10504
10505 - if (dma_ops->alloc_coherent) {
10506 + if (ops->alloc_coherent) {
10507 free_pages((unsigned long)memory, order);
10508 gfp &= ~(GFP_DMA|GFP_DMA32);
10509 - return dma_ops->alloc_coherent(dev, size, dma_handle, gfp);
10510 + return ops->alloc_coherent(dev, size, dma_handle, gfp);
10511 }
10512
10513 - if (dma_ops->map_simple) {
10514 - *dma_handle = dma_ops->map_simple(dev, virt_to_bus(memory),
10515 + if (ops->map_simple) {
10516 + *dma_handle = ops->map_simple(dev, virt_to_bus(memory),
10517 size,
10518 PCI_DMA_BIDIRECTIONAL);
10519 if (*dma_handle != bad_dma_address)
10520 @@ -542,13 +432,17 @@ EXPORT_SYMBOL(dma_alloc_coherent);
10521 void dma_free_coherent(struct device *dev, size_t size,
10522 void *vaddr, dma_addr_t bus)
10523 {
10524 +#ifndef CONFIG_XEN
10525 + struct dma_mapping_ops *ops = get_dma_ops(dev);
10526 +#endif
10527 +
10528 int order = get_order(size);
10529 WARN_ON(irqs_disabled()); /* for portability */
10530 - if (dma_release_coherent(dev, order, vaddr))
10531 + if (dma_release_from_coherent(dev, order, vaddr))
10532 return;
10533 #ifndef CONFIG_XEN
10534 - if (dma_ops->unmap_single)
10535 - dma_ops->unmap_single(dev, bus, size, 0);
10536 + if (ops->unmap_single)
10537 + ops->unmap_single(dev, bus, size, 0);
10538 #endif
10539 xen_destroy_contiguous_region((unsigned long)vaddr, order);
10540 free_pages((unsigned long)vaddr, order);
10541 @@ -557,15 +451,13 @@ EXPORT_SYMBOL(dma_free_coherent);
10542
10543 static int __init pci_iommu_init(void)
10544 {
10545 -#ifdef CONFIG_CALGARY_IOMMU
10546 calgary_iommu_init();
10547 -#endif
10548
10549 intel_iommu_init();
10550
10551 -#ifdef CONFIG_GART_IOMMU
10552 + amd_iommu_init();
10553 +
10554 gart_iommu_init();
10555 -#endif
10556
10557 no_iommu_init();
10558 return 0;
10559 --- sle11-2009-10-16.orig/arch/x86/kernel/pci-nommu-xen.c 2009-03-16 16:38:05.000000000 +0100
10560 +++ sle11-2009-10-16/arch/x86/kernel/pci-nommu-xen.c 2009-06-04 10:21:39.000000000 +0200
10561 @@ -79,18 +79,12 @@ gnttab_unmap_single(struct device *dev,
10562 gnttab_dma_unmap_page(dma_addr);
10563 }
10564
10565 -static int nommu_mapping_error(dma_addr_t dma_addr)
10566 -{
10567 - return (dma_addr == bad_dma_address);
10568 -}
10569 -
10570 -static const struct dma_mapping_ops nommu_dma_ops = {
10571 +static struct dma_mapping_ops nommu_dma_ops = {
10572 .map_single = gnttab_map_single,
10573 .unmap_single = gnttab_unmap_single,
10574 .map_sg = gnttab_map_sg,
10575 .unmap_sg = gnttab_unmap_sg,
10576 .dma_supported = swiotlb_dma_supported,
10577 - .mapping_error = nommu_mapping_error
10578 };
10579
10580 void __init no_iommu_init(void)
10581 --- sle11-2009-10-16.orig/arch/x86/kernel/probe_roms_32.c 2009-10-28 14:55:02.000000000 +0100
10582 +++ sle11-2009-10-16/arch/x86/kernel/probe_roms_32.c 2009-06-04 10:21:39.000000000 +0200
10583 @@ -99,6 +99,11 @@ void __init probe_roms(void)
10584 unsigned char c;
10585 int i;
10586
10587 +#ifdef CONFIG_XEN
10588 + if (!is_initial_xendomain())
10589 + return;
10590 +#endif
10591 +
10592 /* video rom */
10593 upper = adapter_rom_resources[0].start;
10594 for (start = video_rom_resource.start; start < upper; start += 2048) {
10595 @@ -131,7 +136,7 @@ void __init probe_roms(void)
10596 upper = system_rom_resource.start;
10597
10598 /* check for extension rom (ignore length byte!) */
10599 - rom = isa_bus_to_virt(extension_rom_resource.start);
10600 + rom = isa_bus_to_virt((unsigned long)extension_rom_resource.start);
10601 if (romsignature(rom)) {
10602 length = extension_rom_resource.end - extension_rom_resource.start + 1;
10603 if (romchecksum(rom, length)) {
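
[Editor's note] The probe_roms() change is one instance of a guard pattern this patch applies repeatedly: legacy hardware that only the initial domain may touch is skipped early in unprivileged domains. A generic, hedged sketch of the pattern (is_initial_xendomain() is the real helper used above; probe_hw() is a hypothetical stand-in):

/* Editor's sketch of the dom0-only guard used throughout this patch.
 * probe_hw() is a placeholder for the routine being guarded.
 */
void __init probe_hw(void)
{
#ifdef CONFIG_XEN
        if (!is_initial_xendomain())
                return;         /* unprivileged domains see no legacy ROMs/ports */
#endif
        /* ... touch legacy hardware here ... */
}
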
10604 --- sle11-2009-10-16.orig/arch/x86/kernel/process-xen.c 2009-03-16 16:38:05.000000000 +0100
10605 +++ sle11-2009-10-16/arch/x86/kernel/process-xen.c 2009-06-04 10:21:39.000000000 +0200
10606 @@ -6,6 +6,13 @@
10607 #include <linux/sched.h>
10608 #include <linux/module.h>
10609 #include <linux/pm.h>
10610 +#include <linux/clockchips.h>
10611 +#include <asm/system.h>
10612 +
10613 +unsigned long idle_halt;
10614 +EXPORT_SYMBOL(idle_halt);
10615 +unsigned long idle_nomwait;
10616 +EXPORT_SYMBOL(idle_nomwait);
10617
10618 struct kmem_cache *task_xstate_cachep;
10619
10620 @@ -45,6 +52,70 @@ void arch_task_cache_init(void)
10621 SLAB_PANIC, NULL);
10622 }
10623
10624 +/*
10625 + * Idle related variables and functions
10626 + */
10627 +unsigned long boot_option_idle_override = 0;
10628 +EXPORT_SYMBOL(boot_option_idle_override);
10629 +
10630 +/*
10631 + * Powermanagement idle function, if any..
10632 + */
10633 +void (*pm_idle)(void);
10634 +EXPORT_SYMBOL(pm_idle);
10635 +
10636 +#ifdef CONFIG_X86_32
10637 +/*
10638 + * This halt magic was a workaround for ancient floppy DMA
10639 + * wreckage. It should be safe to remove.
10640 + */
10641 +static int hlt_counter;
10642 +void disable_hlt(void)
10643 +{
10644 + hlt_counter++;
10645 +}
10646 +EXPORT_SYMBOL(disable_hlt);
10647 +
10648 +void enable_hlt(void)
10649 +{
10650 + hlt_counter--;
10651 +}
10652 +EXPORT_SYMBOL(enable_hlt);
10653 +
10654 +static inline int hlt_use_halt(void)
10655 +{
10656 + return (!hlt_counter && boot_cpu_data.hlt_works_ok);
10657 +}
10658 +#else
10659 +static inline int hlt_use_halt(void)
10660 +{
10661 + return 1;
10662 +}
10663 +#endif
10664 +
10665 +/*
10666 + * We use this if we don't have any better
10667 + * idle routine..
10668 + */
10669 +void xen_idle(void)
10670 +{
10671 + current_thread_info()->status &= ~TS_POLLING;
10672 + /*
10673 + * TS_POLLING-cleared state must be visible before we
10674 + * test NEED_RESCHED:
10675 + */
10676 + smp_mb();
10677 +
10678 + if (!need_resched())
10679 + safe_halt(); /* enables interrupts racelessly */
10680 + else
10681 + local_irq_enable();
10682 + current_thread_info()->status |= TS_POLLING;
10683 +}
10684 +#ifdef CONFIG_APM_MODULE
10685 +EXPORT_SYMBOL(default_idle);
10686 +#endif
10687 +
10688 static void do_nothing(void *unused)
10689 {
10690 }
10691 @@ -61,7 +132,7 @@ void cpu_idle_wait(void)
10692 {
10693 smp_mb();
10694 /* kick all the CPUs so that they exit out of pm_idle */
10695 - smp_call_function(do_nothing, NULL, 0, 1);
10696 + smp_call_function(do_nothing, NULL, 1);
10697 }
10698 EXPORT_SYMBOL_GPL(cpu_idle_wait);
10699
10700 @@ -125,60 +196,175 @@ static void poll_idle(void)
10701 *
10702 * idle=mwait overrides this decision and forces the usage of mwait.
10703 */
10704 +static int __cpuinitdata force_mwait;
10705 +
10706 +#define MWAIT_INFO 0x05
10707 +#define MWAIT_ECX_EXTENDED_INFO 0x01
10708 +#define MWAIT_EDX_C1 0xf0
10709 +
10710 static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
10711 {
10712 + u32 eax, ebx, ecx, edx;
10713 +
10714 if (force_mwait)
10715 return 1;
10716
10717 - if (c->x86_vendor == X86_VENDOR_AMD) {
10718 - switch(c->x86) {
10719 - case 0x10:
10720 - case 0x11:
10721 - return 0;
10722 + if (c->cpuid_level < MWAIT_INFO)
10723 + return 0;
10724 +
10725 + cpuid(MWAIT_INFO, &eax, &ebx, &ecx, &edx);
10726 + /* Check, whether EDX has extended info about MWAIT */
10727 + if (!(ecx & MWAIT_ECX_EXTENDED_INFO))
10728 + return 1;
10729 +
10730 + /*
10731 + * edx enumerates MONITOR/MWAIT extensions. Check, whether
10732 + * C1 supports MWAIT
10733 + */
10734 + return (edx & MWAIT_EDX_C1);
10735 +}
10736 +
10737 +/*
10738 + * Check for AMD CPUs, which have potentially C1E support
10739 + */
10740 +static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
10741 +{
10742 + if (c->x86_vendor != X86_VENDOR_AMD)
10743 + return 0;
10744 +
10745 + if (c->x86 < 0x0F)
10746 + return 0;
10747 +
10748 + /* Family 0x0f models < rev F do not have C1E */
10749 + if (c->x86 == 0x0f && c->x86_model < 0x40)
10750 + return 0;
10751 +
10752 + return 1;
10753 +}
10754 +
10755 +static cpumask_t c1e_mask = CPU_MASK_NONE;
10756 +static int c1e_detected;
10757 +
10758 +void c1e_remove_cpu(int cpu)
10759 +{
10760 + cpu_clear(cpu, c1e_mask);
10761 +}
10762 +
10763 +/*
10764 + * C1E aware idle routine. We check for C1E active in the interrupt
10765 + * pending message MSR. If we detect C1E, then we handle it the same
10766 + * way as C3 power states (local apic timer and TSC stop)
10767 + */
10768 +static void c1e_idle(void)
10769 +{
10770 + if (need_resched())
10771 + return;
10772 +
10773 + if (!c1e_detected) {
10774 + u32 lo, hi;
10775 +
10776 + rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
10777 + if (lo & K8_INTP_C1E_ACTIVE_MASK) {
10778 + c1e_detected = 1;
10779 + if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
10780 + mark_tsc_unstable("TSC halt in AMD C1E");
10781 + printk(KERN_INFO "System has AMD C1E enabled\n");
10782 + set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E);
10783 }
10784 }
10785 - return 1;
10786 +
10787 + if (c1e_detected) {
10788 + int cpu = smp_processor_id();
10789 +
10790 + if (!cpu_isset(cpu, c1e_mask)) {
10791 + cpu_set(cpu, c1e_mask);
10792 + /*
10793 + * Force broadcast so ACPI can not interfere. Needs
10794 + * to run with interrupts enabled as it uses
10795 + * smp_call_function.
10796 + */
10797 + local_irq_enable();
10798 + clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE,
10799 + &cpu);
10800 + printk(KERN_INFO "Switch to broadcast mode on CPU%d\n",
10801 + cpu);
10802 + local_irq_disable();
10803 + }
10804 + clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
10805 +
10806 + default_idle();
10807 +
10808 + /*
10809 + * The switch back from broadcast mode needs to be
10810 + * called with interrupts disabled.
10811 + */
10812 + local_irq_disable();
10813 + clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
10814 + local_irq_enable();
10815 + } else
10816 + default_idle();
10817 }
10818 #endif
10819
10820 void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
10821 {
10822 #ifndef CONFIG_XEN
10823 - static int selected;
10824 -
10825 - if (selected)
10826 - return;
10827 #ifdef CONFIG_X86_SMP
10828 if (pm_idle == poll_idle && smp_num_siblings > 1) {
10829 printk(KERN_WARNING "WARNING: polling idle and HT enabled,"
10830 " performance may degrade.\n");
10831 }
10832 #endif
10833 + if (pm_idle)
10834 + return;
10835 +
10836 if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) {
10837 /*
10838 - * Skip, if setup has overridden idle.
10839 * One CPU supports mwait => All CPUs supports mwait
10840 */
10841 - if (!pm_idle) {
10842 - printk(KERN_INFO "using mwait in idle threads.\n");
10843 - pm_idle = mwait_idle;
10844 - }
10845 - }
10846 - selected = 1;
10847 + printk(KERN_INFO "using mwait in idle threads.\n");
10848 + pm_idle = mwait_idle;
10849 + } else if (check_c1e_idle(c)) {
10850 + printk(KERN_INFO "using C1E aware idle routine\n");
10851 + pm_idle = c1e_idle;
10852 + } else
10853 + pm_idle = default_idle;
10854 #endif
10855 }
10856
10857 static int __init idle_setup(char *str)
10858 {
10859 + if (!str)
10860 + return -EINVAL;
10861 +
10862 if (!strcmp(str, "poll")) {
10863 printk("using polling idle threads.\n");
10864 pm_idle = poll_idle;
10865 - }
10866 #ifndef CONFIG_XEN
10867 - else if (!strcmp(str, "mwait"))
10868 + } else if (!strcmp(str, "mwait"))
10869 force_mwait = 1;
10870 + else if (!strcmp(str, "halt")) {
10871 + /*
10872 + * When the boot option of idle=halt is added, halt is
10873 + * forced to be used for CPU idle. In such case CPU C2/C3
10874 + * won't be used again.
10875 + * To continue to load the CPU idle driver, don't touch
10876 + * the boot_option_idle_override.
10877 + */
10878 + pm_idle = default_idle;
10879 + idle_halt = 1;
10880 + return 0;
10881 + } else if (!strcmp(str, "nomwait")) {
10882 + /*
10883 + * If the boot option of "idle=nomwait" is added,
10884 + * it means that mwait will be disabled for CPU C2/C3
10885 + * states. In such case it won't touch the variable
10886 + * of boot_option_idle_override.
10887 + */
10888 + idle_nomwait = 1;
10889 + return 0;
10890 #endif
10891 - else
10892 + } else
10893 return -1;
10894
10895 boot_option_idle_override = 1;
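
[Editor's note] The rewritten select_idle_routine() above chooses a single idle handler at boot instead of repeating the check on every CPU bring-up. A condensed sketch of the decision order it implements (function and variable names are taken from the hunk; the body is a paraphrase, not a quote):

/* Editor's sketch of the selection order in select_idle_routine(). */
void select_idle_routine_sketch(const struct cpuinfo_x86 *c)
{
        if (pm_idle)                    /* already overridden by an idle= option */
                return;
        if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c))
                pm_idle = mwait_idle;   /* MWAIT-based idle */
        else if (check_c1e_idle(c))
                pm_idle = c1e_idle;     /* AMD C1E aware idle (broadcast timer) */
        else
                pm_idle = default_idle; /* plain HLT */
}
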
10896 --- sle11-2009-10-16.orig/arch/x86/kernel/process_32-xen.c 2009-03-16 16:38:05.000000000 +0100
10897 +++ sle11-2009-10-16/arch/x86/kernel/process_32-xen.c 2009-06-04 10:21:39.000000000 +0200
10898 @@ -59,15 +59,11 @@
10899 #include <asm/tlbflush.h>
10900 #include <asm/cpu.h>
10901 #include <asm/kdebug.h>
10902 +#include <asm/idle.h>
10903
10904 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
10905 asmlinkage void cstar_ret_from_fork(void) __asm__("cstar_ret_from_fork");
10906
10907 -static int hlt_counter;
10908 -
10909 -unsigned long boot_option_idle_override = 0;
10910 -EXPORT_SYMBOL(boot_option_idle_override);
10911 -
10912 DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
10913 EXPORT_PER_CPU_SYMBOL(current_task);
10914
10915 @@ -82,46 +78,27 @@ unsigned long thread_saved_pc(struct tas
10916 return ((unsigned long *)tsk->thread.sp)[3];
10917 }
10918
10919 -/*
10920 - * Powermanagement idle function, if any..
10921 - */
10922 -void (*pm_idle)(void);
10923 -EXPORT_SYMBOL(pm_idle);
10924 +#ifdef CONFIG_HOTPLUG_CPU
10925 +#ifndef CONFIG_XEN
10926 +#include <asm/nmi.h>
10927
10928 -void disable_hlt(void)
10929 +static void cpu_exit_clear(void)
10930 {
10931 - hlt_counter++;
10932 -}
10933 + int cpu = raw_smp_processor_id();
10934
10935 -EXPORT_SYMBOL(disable_hlt);
10936 -
10937 -void enable_hlt(void)
10938 -{
10939 - hlt_counter--;
10940 -}
10941 + idle_task_exit();
10942
10943 -EXPORT_SYMBOL(enable_hlt);
10944 + cpu_uninit();
10945 + irq_ctx_exit(cpu);
10946
10947 -static void xen_idle(void)
10948 -{
10949 - current_thread_info()->status &= ~TS_POLLING;
10950 - /*
10951 - * TS_POLLING-cleared state must be visible before we
10952 - * test NEED_RESCHED:
10953 - */
10954 - smp_mb();
10955 + cpu_clear(cpu, cpu_callout_map);
10956 + cpu_clear(cpu, cpu_callin_map);
10957
10958 - if (!need_resched())
10959 - safe_halt(); /* enables interrupts racelessly */
10960 - else
10961 - local_irq_enable();
10962 - current_thread_info()->status |= TS_POLLING;
10963 + numa_remove_cpu(cpu);
10964 + c1e_remove_cpu(cpu);
10965 }
10966 -#ifdef CONFIG_APM_MODULE
10967 -EXPORT_SYMBOL(default_idle);
10968 #endif
10969
10970 -#ifdef CONFIG_HOTPLUG_CPU
10971 static inline void play_dead(void)
10972 {
10973 idle_task_exit();
10974 @@ -152,13 +129,11 @@ void cpu_idle(void)
10975
10976 /* endless idle loop with no priority at all */
10977 while (1) {
10978 - tick_nohz_stop_sched_tick();
10979 + tick_nohz_stop_sched_tick(1);
10980 while (!need_resched()) {
10981 - void (*idle)(void);
10982
10983 check_pgt_cache();
10984 rmb();
10985 - idle = xen_idle; /* no alternatives */
10986
10987 if (rcu_pending(cpu))
10988 rcu_check_callbacks(cpu, 0);
10989 @@ -168,7 +143,10 @@ void cpu_idle(void)
10990
10991 local_irq_disable();
10992 __get_cpu_var(irq_stat).idle_timestamp = jiffies;
10993 - idle();
10994 + /* Don't trace irqs off for idle */
10995 + stop_critical_timings();
10996 + xen_idle();
10997 + start_critical_timings();
10998 }
10999 tick_nohz_restart_sched_tick();
11000 preempt_enable_no_resched();
11001 --- sle11-2009-10-16.orig/arch/x86/kernel/process_64-xen.c 2009-03-16 16:38:05.000000000 +0100
11002 +++ sle11-2009-10-16/arch/x86/kernel/process_64-xen.c 2009-06-04 10:21:39.000000000 +0200
11003 @@ -65,15 +65,6 @@ asmlinkage extern void ret_from_fork(voi
11004
11005 unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
11006
11007 -unsigned long boot_option_idle_override = 0;
11008 -EXPORT_SYMBOL(boot_option_idle_override);
11009 -
11010 -/*
11011 - * Powermanagement idle function, if any..
11012 - */
11013 -void (*pm_idle)(void);
11014 -EXPORT_SYMBOL(pm_idle);
11015 -
11016 static ATOMIC_NOTIFIER_HEAD(idle_notifier);
11017
11018 void idle_notifier_register(struct notifier_block *n)
11019 @@ -103,25 +94,13 @@ void exit_idle(void)
11020 __exit_idle();
11021 }
11022
11023 -static void xen_idle(void)
11024 -{
11025 - current_thread_info()->status &= ~TS_POLLING;
11026 - /*
11027 - * TS_POLLING-cleared state must be visible before we
11028 - * test NEED_RESCHED:
11029 - */
11030 - smp_mb();
11031 - if (!need_resched())
11032 - safe_halt(); /* enables interrupts racelessly */
11033 - else
11034 - local_irq_enable();
11035 - current_thread_info()->status |= TS_POLLING;
11036 -}
11037 -
11038 #ifdef CONFIG_HOTPLUG_CPU
11039 static inline void play_dead(void)
11040 {
11041 idle_task_exit();
11042 +#ifndef CONFIG_XEN
11043 + c1e_remove_cpu(raw_smp_processor_id());
11044 +#endif
11045 local_irq_disable();
11046 cpu_clear(smp_processor_id(), cpu_initialized);
11047 preempt_enable_no_resched();
11048 @@ -146,12 +125,11 @@ void cpu_idle(void)
11049 current_thread_info()->status |= TS_POLLING;
11050 /* endless idle loop with no priority at all */
11051 while (1) {
11052 - tick_nohz_stop_sched_tick();
11053 + tick_nohz_stop_sched_tick(1);
11054 while (!need_resched()) {
11055 - void (*idle)(void);
11056
11057 rmb();
11058 - idle = xen_idle; /* no alternatives */
11059 +
11060 if (cpu_is_offline(smp_processor_id()))
11061 play_dead();
11062 /*
11063 @@ -161,7 +139,10 @@ void cpu_idle(void)
11064 */
11065 local_irq_disable();
11066 enter_idle();
11067 - idle();
11068 + /* Don't trace irqs off for idle */
11069 + stop_critical_timings();
11070 + xen_idle();
11071 + start_critical_timings();
11072 /* In many cases the interrupt that ended idle
11073 has already called exit_idle. But some idle
11074 loops can be woken up without interrupt. */
11075 @@ -271,7 +252,7 @@ void exit_thread(void)
11076 }
11077 }
11078
11079 -void load_gs_index(unsigned gs)
11080 +void xen_load_gs_index(unsigned gs)
11081 {
11082 WARN_ON(HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, gs));
11083 }
11084 @@ -373,10 +354,10 @@ int copy_thread(int nr, unsigned long cl
11085 p->thread.fs = me->thread.fs;
11086 p->thread.gs = me->thread.gs;
11087
11088 - asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
11089 - asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
11090 - asm("mov %%es,%0" : "=m" (p->thread.es));
11091 - asm("mov %%ds,%0" : "=m" (p->thread.ds));
11092 + savesegment(gs, p->thread.gsindex);
11093 + savesegment(fs, p->thread.fsindex);
11094 + savesegment(es, p->thread.es);
11095 + savesegment(ds, p->thread.ds);
11096
11097 if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
11098 p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
11099 @@ -417,7 +398,9 @@ out:
11100 void
11101 start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
11102 {
11103 - asm volatile("movl %0, %%fs; movl %0, %%es; movl %0, %%ds" :: "r"(0));
11104 + loadsegment(fs, 0);
11105 + loadsegment(es, 0);
11106 + loadsegment(ds, 0);
11107 load_gs_index(0);
11108 regs->ip = new_ip;
11109 regs->sp = new_sp;
11110 @@ -557,8 +540,8 @@ static inline void __switch_to_xtra(stru
11111 struct task_struct *
11112 __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
11113 {
11114 - struct thread_struct *prev = &prev_p->thread,
11115 - *next = &next_p->thread;
11116 + struct thread_struct *prev = &prev_p->thread;
11117 + struct thread_struct *next = &next_p->thread;
11118 int cpu = smp_processor_id();
11119 #ifndef CONFIG_X86_NO_TSS
11120 struct tss_struct *tss = &per_cpu(init_tss, cpu);
11121 @@ -661,12 +644,25 @@ __switch_to(struct task_struct *prev_p,
11122 */
11123 if (unlikely(next->es))
11124 loadsegment(es, next->es);
11125 -
11126 +
11127 if (unlikely(next->ds))
11128 loadsegment(ds, next->ds);
11129
11130 + /*
11131 + * Leave lazy mode, flushing any hypercalls made here.
11132 + * This must be done before restoring TLS segments so
11133 + * the GDT and LDT are properly updated, and must be
11134 + * done before math_state_restore, so the TS bit is up
11135 + * to date.
11136 + */
11137 + arch_leave_lazy_cpu_mode();
11138 +
11139 /*
11140 * Switch FS and GS.
11141 + *
11142 + * Segment register != 0 always requires a reload. Also
11143 + * reload when it has changed. When prev process used 64bit
11144 + * base always reload to avoid an information leak.
11145 */
11146 if (unlikely(next->fsindex))
11147 loadsegment(fs, next->fsindex);
11148 @@ -687,7 +683,8 @@ __switch_to(struct task_struct *prev_p,
11149 write_pda(oldrsp, next->usersp);
11150 write_pda(pcurrent, next_p);
11151 write_pda(kernelstack,
11152 - (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
11153 + (unsigned long)task_stack_page(next_p) +
11154 + THREAD_SIZE - PDA_STACKOFFSET);
11155 #ifdef CONFIG_CC_STACKPROTECTOR
11156 write_pda(stack_canary, next_p->stack_canary);
11157
11158 @@ -848,7 +845,7 @@ long do_arch_prctl(struct task_struct *t
11159 set_32bit_tls(task, FS_TLS, addr);
11160 if (doit) {
11161 load_TLS(&task->thread, cpu);
11162 - asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
11163 + loadsegment(fs, FS_TLS_SEL);
11164 }
11165 task->thread.fsindex = FS_TLS_SEL;
11166 task->thread.fs = 0;
11167 @@ -858,7 +855,7 @@ long do_arch_prctl(struct task_struct *t
11168 if (doit) {
11169 /* set the selector to 0 to not confuse
11170 __switch_to */
11171 - asm volatile("movl %0,%%fs" :: "r" (0));
11172 + loadsegment(fs, 0);
11173 ret = HYPERVISOR_set_segment_base(SEGBASE_FS,
11174 addr);
11175 }
11176 @@ -882,7 +879,7 @@ long do_arch_prctl(struct task_struct *t
11177 if (task->thread.gsindex == GS_TLS_SEL)
11178 base = read_32bit_tls(task, GS_TLS);
11179 else if (doit) {
11180 - asm("movl %%gs,%0" : "=r" (gsindex));
11181 + savesegment(gs, gsindex);
11182 if (gsindex)
11183 rdmsrl(MSR_KERNEL_GS_BASE, base);
11184 else
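
[Editor's note] The process_64-xen.c hunks swap open-coded segment moves for the savesegment()/loadsegment() helpers from <asm/system.h>. savesegment() is essentially the old inline asm wrapped in a macro; loadsegment() additionally recovers from a faulting selector load through an exception-table fixup, which the open-coded movl did not. A rough approximation of savesegment() (not a verbatim quote of the header):

/* Editor's approximation of the 2.6.27-era helper; the real macro lives
 * in include/asm-x86/system.h.  loadsegment() is omitted here because
 * its real body also carries .fixup/extable recovery for invalid
 * selectors.
 */
#define savesegment(seg, value) \
        asm("mov %%" #seg ",%0" : "=r" (value) : : "memory")
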
11185 --- sle11-2009-10-16.orig/arch/x86/kernel/quirks-xen.c 2009-03-16 16:33:40.000000000 +0100
11186 +++ sle11-2009-10-16/arch/x86/kernel/quirks-xen.c 2009-06-04 10:21:39.000000000 +0200
11187 @@ -63,6 +63,7 @@ static enum {
11188 ICH_FORCE_HPET_RESUME,
11189 VT8237_FORCE_HPET_RESUME,
11190 NVIDIA_FORCE_HPET_RESUME,
11191 + ATI_FORCE_HPET_RESUME,
11192 } force_hpet_resume_type;
11193
11194 static void __iomem *rcba_base;
11195 @@ -156,6 +157,8 @@ static void ich_force_enable_hpet(struct
11196
11197 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ESB2_0,
11198 ich_force_enable_hpet);
11199 +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH6_0,
11200 + ich_force_enable_hpet);
11201 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH6_1,
11202 ich_force_enable_hpet);
11203 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_0,
11204 @@ -172,6 +175,12 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_I
11205
11206 static struct pci_dev *cached_dev;
11207
11208 +static void hpet_print_force_info(void)
11209 +{
11210 + printk(KERN_INFO "HPET not enabled in BIOS. "
11211 + "You might try hpet=force boot option\n");
11212 +}
11213 +
11214 static void old_ich_force_hpet_resume(void)
11215 {
11216 u32 val;
11217 @@ -251,8 +260,12 @@ static void old_ich_force_enable_hpet_us
11218 {
11219 if (hpet_force_user)
11220 old_ich_force_enable_hpet(dev);
11221 + else
11222 + hpet_print_force_info();
11223 }
11224
11225 +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ESB_1,
11226 + old_ich_force_enable_hpet_user);
11227 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_0,
11228 old_ich_force_enable_hpet_user);
11229 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_12,
11230 @@ -288,9 +301,14 @@ static void vt8237_force_enable_hpet(str
11231 {
11232 u32 uninitialized_var(val);
11233
11234 - if (!hpet_force_user || hpet_address || force_hpet_address)
11235 + if (hpet_address || force_hpet_address)
11236 return;
11237
11238 + if (!hpet_force_user) {
11239 + hpet_print_force_info();
11240 + return;
11241 + }
11242 +
11243 pci_read_config_dword(dev, 0x68, &val);
11244 /*
11245 * Bit 7 is HPET enable bit.
11246 @@ -328,6 +346,36 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_V
11247 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8237,
11248 vt8237_force_enable_hpet);
11249
11250 +static void ati_force_hpet_resume(void)
11251 +{
11252 + pci_write_config_dword(cached_dev, 0x14, 0xfed00000);
11253 + printk(KERN_DEBUG "Force enabled HPET at resume\n");
11254 +}
11255 +
11256 +static void ati_force_enable_hpet(struct pci_dev *dev)
11257 +{
11258 + u32 uninitialized_var(val);
11259 +
11260 + if (hpet_address || force_hpet_address)
11261 + return;
11262 +
11263 + if (!hpet_force_user) {
11264 + hpet_print_force_info();
11265 + return;
11266 + }
11267 +
11268 + pci_write_config_dword(dev, 0x14, 0xfed00000);
11269 + pci_read_config_dword(dev, 0x14, &val);
11270 + force_hpet_address = val;
11271 + force_hpet_resume_type = ATI_FORCE_HPET_RESUME;
11272 + dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n",
11273 + force_hpet_address);
11274 + cached_dev = dev;
11275 + return;
11276 +}
11277 +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS,
11278 + ati_force_enable_hpet);
11279 +
11280 /*
11281 * Undocumented chipset feature taken from LinuxBIOS.
11282 */
11283 @@ -341,8 +389,13 @@ static void nvidia_force_enable_hpet(str
11284 {
11285 u32 uninitialized_var(val);
11286
11287 - if (!hpet_force_user || hpet_address || force_hpet_address)
11288 + if (hpet_address || force_hpet_address)
11289 + return;
11290 +
11291 + if (!hpet_force_user) {
11292 + hpet_print_force_info();
11293 return;
11294 + }
11295
11296 pci_write_config_dword(dev, 0x44, 0xfed00001);
11297 pci_read_config_dword(dev, 0x44, &val);
11298 @@ -395,6 +448,9 @@ void force_hpet_resume(void)
11299 case NVIDIA_FORCE_HPET_RESUME:
11300 nvidia_force_hpet_resume();
11301 return;
11302 + case ATI_FORCE_HPET_RESUME:
11303 + ati_force_hpet_resume();
11304 + return;
11305 default:
11306 break;
11307 }
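
[Editor's note] The quirks-xen.c hunks split the old single compound condition so that users who did not pass hpet=force get a hint instead of silence. The restructured guard, common to the VT8237 and NVIDIA quirks and the new ATI one, boils down to the following (sketch assembled from the hunks above):

/* Editor's sketch: the guard shape shared by the force-enable quirks
 * after this patch (vt8237/nvidia/ati_force_enable_hpet).
 */
static void force_enable_hpet_sketch(struct pci_dev *dev)
{
        if (hpet_address || force_hpet_address)
                return;                  /* HPET already found or already forced */

        if (!hpet_force_user) {
                hpet_print_force_info(); /* "You might try hpet=force" hint */
                return;
        }

        /* ... chipset-specific register poking follows ... */
}
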
11308 --- sle11-2009-10-16.orig/arch/x86/kernel/setup-xen.c 2009-03-16 16:38:05.000000000 +0100
11309 +++ sle11-2009-10-16/arch/x86/kernel/setup-xen.c 2009-06-04 10:21:39.000000000 +0200
11310 @@ -1,141 +1,1132 @@
11311 -#include <linux/kernel.h>
11312 +/*
11313 + * Copyright (C) 1995 Linus Torvalds
11314 + *
11315 + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
11316 + *
11317 + * Memory region support
11318 + * David Parsons <orc@pell.chi.il.us>, July-August 1999
11319 + *
11320 + * Added E820 sanitization routine (removes overlapping memory regions);
11321 + * Brian Moyle <bmoyle@mvista.com>, February 2001
11322 + *
11323 + * Moved CPU detection code to cpu/${cpu}.c
11324 + * Patrick Mochel <mochel@osdl.org>, March 2002
11325 + *
11326 + * Provisions for empty E820 memory regions (reported by certain BIOSes).
11327 + * Alex Achenbach <xela@slit.de>, December 2002.
11328 + *
11329 + */
11330 +
11331 +/*
11332 + * This file handles the architecture-dependent parts of initialization
11333 + */
11334 +
11335 +#include <linux/sched.h>
11336 +#include <linux/mm.h>
11337 +#include <linux/mmzone.h>
11338 +#include <linux/screen_info.h>
11339 +#include <linux/ioport.h>
11340 +#include <linux/acpi.h>
11341 +#include <linux/apm_bios.h>
11342 +#include <linux/initrd.h>
11343 +#include <linux/bootmem.h>
11344 +#include <linux/seq_file.h>
11345 +#include <linux/console.h>
11346 +#include <linux/mca.h>
11347 +#include <linux/root_dev.h>
11348 +#include <linux/highmem.h>
11349 #include <linux/module.h>
11350 +#include <linux/efi.h>
11351 #include <linux/init.h>
11352 -#include <linux/bootmem.h>
11353 +#include <linux/edd.h>
11354 +#include <linux/iscsi_ibft.h>
11355 +#include <linux/nodemask.h>
11356 +#include <linux/kexec.h>
11357 +#include <linux/dmi.h>
11358 +#include <linux/pfn.h>
11359 +#include <linux/pci.h>
11360 +#include <asm/pci-direct.h>
11361 +#include <linux/init_ohci1394_dma.h>
11362 +#include <linux/kvm_para.h>
11363 +
11364 +#include <linux/errno.h>
11365 +#include <linux/kernel.h>
11366 +#include <linux/stddef.h>
11367 +#include <linux/unistd.h>
11368 +#include <linux/ptrace.h>
11369 +#include <linux/slab.h>
11370 +#include <linux/user.h>
11371 +#include <linux/delay.h>
11372 +
11373 +#include <linux/kallsyms.h>
11374 +#include <linux/cpufreq.h>
11375 +#include <linux/dma-mapping.h>
11376 +#include <linux/ctype.h>
11377 +#include <linux/uaccess.h>
11378 +
11379 #include <linux/percpu.h>
11380 -#include <asm/smp.h>
11381 -#include <asm/percpu.h>
11382 +#include <linux/crash_dump.h>
11383 +
11384 +#include <video/edid.h>
11385 +
11386 +#include <asm/mtrr.h>
11387 +#include <asm/apic.h>
11388 +#include <asm/e820.h>
11389 +#include <asm/mpspec.h>
11390 +#include <asm/setup.h>
11391 +#include <asm/arch_hooks.h>
11392 +#include <asm/efi.h>
11393 #include <asm/sections.h>
11394 +#include <asm/dmi.h>
11395 +#include <asm/io_apic.h>
11396 +#include <asm/ist.h>
11397 +#include <asm/vmi.h>
11398 +#include <setup_arch.h>
11399 +#include <asm/bios_ebda.h>
11400 +#include <asm/cacheflush.h>
11401 #include <asm/processor.h>
11402 -#include <asm/setup.h>
11403 +#include <asm/bugs.h>
11404 +
11405 +#include <asm/system.h>
11406 +#include <asm/vsyscall.h>
11407 +#include <asm/smp.h>
11408 +#include <asm/desc.h>
11409 +#include <asm/dma.h>
11410 +#include <asm/iommu.h>
11411 +#include <asm/mmu_context.h>
11412 +#include <asm/proto.h>
11413 +
11414 +#include <mach_apic.h>
11415 +#include <asm/paravirt.h>
11416 +
11417 +#include <asm/percpu.h>
11418 #include <asm/topology.h>
11419 -#include <asm/mpspec.h>
11420 #include <asm/apicdef.h>
11421 +#ifdef CONFIG_X86_64
11422 +#include <asm/numa_64.h>
11423 +#endif
11424 +
11425 +#ifdef CONFIG_XEN
11426 +#include <asm/hypervisor.h>
11427 +#include <xen/interface/kexec.h>
11428 +#include <xen/interface/memory.h>
11429 +#include <xen/interface/nmi.h>
11430 +#include <xen/interface/physdev.h>
11431 +#include <xen/features.h>
11432 +#include <xen/firmware.h>
11433 +#include <xen/xencons.h>
11434 +
11435 +shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
11436 +EXPORT_SYMBOL(HYPERVISOR_shared_info);
11437
11438 -#ifdef CONFIG_X86_LOCAL_APIC
11439 -unsigned int num_processors;
11440 -unsigned disabled_cpus __cpuinitdata;
11441 -/* Processor that is doing the boot up */
11442 -unsigned int boot_cpu_physical_apicid = -1U;
11443 -EXPORT_SYMBOL(boot_cpu_physical_apicid);
11444 +static int xen_panic_event(struct notifier_block *, unsigned long, void *);
11445 +static struct notifier_block xen_panic_block = {
11446 + xen_panic_event, NULL, 0 /* try to go last */
11447 +};
11448
11449 -DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID;
11450 -EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid);
11451 +unsigned long *phys_to_machine_mapping;
11452 +EXPORT_SYMBOL(phys_to_machine_mapping);
11453
11454 -/* Bitmask of physically existing CPUs */
11455 -physid_mask_t phys_cpu_present_map;
11456 +unsigned long *pfn_to_mfn_frame_list_list,
11457 +#ifdef CONFIG_X86_64
11458 + *pfn_to_mfn_frame_list[512];
11459 +#else
11460 + *pfn_to_mfn_frame_list[128];
11461 +#endif
11462 +
11463 +/* Raw start-of-day parameters from the hypervisor. */
11464 +start_info_t *xen_start_info;
11465 +EXPORT_SYMBOL(xen_start_info);
11466 +#endif
11467 +
11468 +#ifndef ARCH_SETUP
11469 +#define ARCH_SETUP
11470 +#endif
11471 +
11472 +#ifndef CONFIG_XEN
11473 +#ifndef CONFIG_DEBUG_BOOT_PARAMS
11474 +struct boot_params __initdata boot_params;
11475 +#else
11476 +struct boot_params boot_params;
11477 +#endif
11478 #endif
11479
11480 -#if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_X86_SMP)
11481 /*
11482 - * Copy data used in early init routines from the initial arrays to the
11483 - * per cpu data areas. These arrays then become expendable and the
11484 - * *_early_ptr's are zeroed indicating that the static arrays are gone.
11485 + * Machine setup..
11486 */
11487 -static void __init setup_per_cpu_maps(void)
11488 +static struct resource data_resource = {
11489 + .name = "Kernel data",
11490 + .start = 0,
11491 + .end = 0,
11492 + .flags = IORESOURCE_BUSY | IORESOURCE_MEM
11493 +};
11494 +
11495 +static struct resource code_resource = {
11496 + .name = "Kernel code",
11497 + .start = 0,
11498 + .end = 0,
11499 + .flags = IORESOURCE_BUSY | IORESOURCE_MEM
11500 +};
11501 +
11502 +static struct resource bss_resource = {
11503 + .name = "Kernel bss",
11504 + .start = 0,
11505 + .end = 0,
11506 + .flags = IORESOURCE_BUSY | IORESOURCE_MEM
11507 +};
11508 +
11509 +
11510 +#ifdef CONFIG_X86_32
11511 +#ifndef CONFIG_XEN
11512 +/* This value is set up by the early boot code to point to the value
11513 + immediately after the boot time page tables. It contains a *physical*
11514 + address, and must not be in the .bss segment! */
11515 +unsigned long init_pg_tables_start __initdata = ~0UL;
11516 +unsigned long init_pg_tables_end __initdata = ~0UL;
11517 +#endif
11518 +
11519 +static struct resource video_ram_resource = {
11520 + .name = "Video RAM area",
11521 + .start = 0xa0000,
11522 + .end = 0xbffff,
11523 + .flags = IORESOURCE_BUSY | IORESOURCE_MEM
11524 +};
11525 +
11526 +/* cpu data as detected by the assembly code in head.S */
11527 +struct cpuinfo_x86 new_cpu_data __cpuinitdata = {0, 0, 0, 0, -1, 1, 0, 0, -1};
11528 +/* common cpu data for all cpus */
11529 +struct cpuinfo_x86 boot_cpu_data __read_mostly = {0, 0, 0, 0, -1, 1, 0, 0, -1};
11530 +EXPORT_SYMBOL(boot_cpu_data);
11531 +#ifndef CONFIG_XEN
11532 +static void set_mca_bus(int x)
11533 +{
11534 +#ifdef CONFIG_MCA
11535 + MCA_bus = x;
11536 +#endif
11537 +}
11538 +
11539 +unsigned int def_to_bigsmp;
11540 +
11541 +/* for MCA, but anyone else can use it if they want */
11542 +unsigned int machine_id;
11543 +unsigned int machine_submodel_id;
11544 +unsigned int BIOS_revision;
11545 +
11546 +struct apm_info apm_info;
11547 +EXPORT_SYMBOL(apm_info);
11548 +#endif
11549 +
11550 +#if defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
11551 +struct ist_info ist_info;
11552 +EXPORT_SYMBOL(ist_info);
11553 +#elif defined(CONFIG_X86_SPEEDSTEP_SMI)
11554 +struct ist_info ist_info;
11555 +#endif
11556 +
11557 +#else
11558 +struct cpuinfo_x86 boot_cpu_data __read_mostly;
11559 +EXPORT_SYMBOL(boot_cpu_data);
11560 +#endif
11561 +
11562 +
11563 +#if !defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64)
11564 +unsigned long mmu_cr4_features;
11565 +#else
11566 +unsigned long mmu_cr4_features = X86_CR4_PAE;
11567 +#endif
11568 +
11569 +/* Boot loader ID as an integer, for the benefit of proc_dointvec */
11570 +int bootloader_type;
11571 +
11572 +/*
11573 + * Early DMI memory
11574 + */
11575 +int dmi_alloc_index;
11576 +char dmi_alloc_data[DMI_MAX_DATA];
11577 +
11578 +/*
11579 + * Setup options
11580 + */
11581 +struct screen_info screen_info;
11582 +EXPORT_SYMBOL(screen_info);
11583 +struct edid_info edid_info;
11584 +EXPORT_SYMBOL_GPL(edid_info);
11585 +
11586 +extern int root_mountflags;
11587 +
11588 +unsigned long saved_video_mode;
11589 +
11590 +#define RAMDISK_IMAGE_START_MASK 0x07FF
11591 +#define RAMDISK_PROMPT_FLAG 0x8000
11592 +#define RAMDISK_LOAD_FLAG 0x4000
11593 +
11594 +static char __initdata command_line[COMMAND_LINE_SIZE];
11595 +
11596 +#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
11597 +struct edd edd;
11598 +#ifdef CONFIG_EDD_MODULE
11599 +EXPORT_SYMBOL(edd);
11600 +#endif
11601 +#ifndef CONFIG_XEN
11602 +/**
11603 + * copy_edd() - Copy the BIOS EDD information
11604 + * from boot_params into a safe place.
11605 + *
11606 + */
11607 +static inline void copy_edd(void)
11608 +{
11609 + memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer,
11610 + sizeof(edd.mbr_signature));
11611 + memcpy(edd.edd_info, boot_params.eddbuf, sizeof(edd.edd_info));
11612 + edd.mbr_signature_nr = boot_params.edd_mbr_sig_buf_entries;
11613 + edd.edd_info_nr = boot_params.eddbuf_entries;
11614 +}
11615 +#endif
11616 +#else
11617 +static inline void copy_edd(void)
11618 +{
11619 +}
11620 +#endif
11621 +
11622 +#ifdef CONFIG_BLK_DEV_INITRD
11623 +
11624 +#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
11625 +
11626 +#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
11627 +static void __init relocate_initrd(void)
11628 +{
11629 +
11630 + u64 ramdisk_image = boot_params.hdr.ramdisk_image;
11631 + u64 ramdisk_size = boot_params.hdr.ramdisk_size;
11632 + u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT;
11633 + u64 ramdisk_here;
11634 + unsigned long slop, clen, mapaddr;
11635 + char *p, *q;
11636 +
11637 + /* We need to move the initrd down into lowmem */
11638 + ramdisk_here = find_e820_area(0, end_of_lowmem, ramdisk_size,
11639 + PAGE_SIZE);
11640 +
11641 + if (ramdisk_here == -1ULL)
11642 + panic("Cannot find place for new RAMDISK of size %lld\n",
11643 + ramdisk_size);
11644 +
11645 + /* Note: this includes all the lowmem currently occupied by
11646 + the initrd, we rely on that fact to keep the data intact. */
11647 + reserve_early(ramdisk_here, ramdisk_here + ramdisk_size,
11648 + "NEW RAMDISK");
11649 + initrd_start = ramdisk_here + PAGE_OFFSET;
11650 + initrd_end = initrd_start + ramdisk_size;
11651 + printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n",
11652 + ramdisk_here, ramdisk_here + ramdisk_size);
11653 +
11654 + q = (char *)initrd_start;
11655 +
11656 + /* Copy any lowmem portion of the initrd */
11657 + if (ramdisk_image < end_of_lowmem) {
11658 + clen = end_of_lowmem - ramdisk_image;
11659 + p = (char *)__va(ramdisk_image);
11660 + memcpy(q, p, clen);
11661 + q += clen;
11662 + ramdisk_image += clen;
11663 + ramdisk_size -= clen;
11664 + }
11665 +
11666 + /* Copy the highmem portion of the initrd */
11667 + while (ramdisk_size) {
11668 + slop = ramdisk_image & ~PAGE_MASK;
11669 + clen = ramdisk_size;
11670 + if (clen > MAX_MAP_CHUNK-slop)
11671 + clen = MAX_MAP_CHUNK-slop;
11672 + mapaddr = ramdisk_image & PAGE_MASK;
11673 + p = early_ioremap(mapaddr, clen+slop);
11674 + memcpy(q, p+slop, clen);
11675 + early_iounmap(p, clen+slop);
11676 + q += clen;
11677 + ramdisk_image += clen;
11678 + ramdisk_size -= clen;
11679 + }
11680 + /* high pages is not converted by early_res_to_bootmem */
11681 + ramdisk_image = boot_params.hdr.ramdisk_image;
11682 + ramdisk_size = boot_params.hdr.ramdisk_size;
11683 + printk(KERN_INFO "Move RAMDISK from %016llx - %016llx to"
11684 + " %08llx - %08llx\n",
11685 + ramdisk_image, ramdisk_image + ramdisk_size - 1,
11686 + ramdisk_here, ramdisk_here + ramdisk_size - 1);
11687 +}
11688 +#endif
11689 +
11690 +static void __init reserve_initrd(void)
11691 {
11692 #ifndef CONFIG_XEN
11693 - int cpu;
11694 + u64 ramdisk_image = boot_params.hdr.ramdisk_image;
11695 + u64 ramdisk_size = boot_params.hdr.ramdisk_size;
11696 + u64 ramdisk_end = ramdisk_image + ramdisk_size;
11697 + u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT;
11698 +
11699 + if (!boot_params.hdr.type_of_loader ||
11700 + !ramdisk_image || !ramdisk_size)
11701 + return; /* No initrd provided by bootloader */
11702 +#else
11703 + unsigned long ramdisk_image = __pa(xen_start_info->mod_start);
11704 + unsigned long ramdisk_size = xen_start_info->mod_len;
11705 + unsigned long ramdisk_end = ramdisk_image + ramdisk_size;
11706 + unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
11707
11708 - for_each_possible_cpu(cpu) {
11709 - per_cpu(x86_cpu_to_apicid, cpu) = x86_cpu_to_apicid_init[cpu];
11710 - per_cpu(x86_bios_cpu_apicid, cpu) =
11711 - x86_bios_cpu_apicid_init[cpu];
11712 -#ifdef CONFIG_NUMA
11713 - per_cpu(x86_cpu_to_node_map, cpu) =
11714 - x86_cpu_to_node_map_init[cpu];
11715 + if (!xen_start_info->mod_start || !ramdisk_size)
11716 + return; /* No initrd provided by bootloader */
11717 #endif
11718 +
11719 + initrd_start = 0;
11720 +
11721 + if (ramdisk_size >= (end_of_lowmem>>1)) {
11722 + free_early(ramdisk_image, ramdisk_end);
11723 + printk(KERN_ERR "initrd too large to handle, "
11724 + "disabling initrd\n");
11725 + return;
11726 }
11727
11728 - /* indicate the early static arrays will soon be gone */
11729 - x86_cpu_to_apicid_early_ptr = NULL;
11730 - x86_bios_cpu_apicid_early_ptr = NULL;
11731 -#ifdef CONFIG_NUMA
11732 - x86_cpu_to_node_map_early_ptr = NULL;
11733 + printk(KERN_INFO "RAMDISK: %08lx - %08lx\n", ramdisk_image,
11734 + ramdisk_end);
11735 +
11736 +
11737 + if (ramdisk_end <= end_of_lowmem) {
11738 + /* All in lowmem, easy case */
11739 + /*
11740 + * don't need to reserve again, already reserved early
11741 + * in i386_start_kernel
11742 + */
11743 + initrd_start = ramdisk_image + PAGE_OFFSET;
11744 + initrd_end = initrd_start + ramdisk_size;
11745 +#ifdef CONFIG_X86_64_XEN
11746 + initrd_below_start_ok = 1;
11747 #endif
11748 + return;
11749 + }
11750 +
11751 +#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
11752 + relocate_initrd();
11753 +#else
11754 + printk(KERN_ERR "initrd extends beyond end of memory "
11755 + "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
11756 + ramdisk_end, end_of_lowmem);
11757 + initrd_start = 0;
11758 #endif
11759 + free_early(ramdisk_image, ramdisk_end);
11760 }
11761 +#else
11762 +static void __init reserve_initrd(void)
11763 +{
11764 +}
11765 +#endif /* CONFIG_BLK_DEV_INITRD */
11766 +
11767 +static void __init parse_setup_data(void)
11768 +{
11769 +#ifndef CONFIG_XEN
11770 + struct setup_data *data;
11771 + u64 pa_data;
11772
11773 -#ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP
11774 -cpumask_t *cpumask_of_cpu_map __read_mostly;
11775 -EXPORT_SYMBOL(cpumask_of_cpu_map);
11776 + if (boot_params.hdr.version < 0x0209)
11777 + return;
11778 + pa_data = boot_params.hdr.setup_data;
11779 + while (pa_data) {
11780 + data = early_ioremap(pa_data, PAGE_SIZE);
11781 + switch (data->type) {
11782 + case SETUP_E820_EXT:
11783 + parse_e820_ext(data, pa_data);
11784 + break;
11785 + default:
11786 + break;
11787 + }
11788 + pa_data = data->next;
11789 + early_iounmap(data, PAGE_SIZE);
11790 + }
11791 +#endif
11792 +}
11793
11794 -/* requires nr_cpu_ids to be initialized */
11795 -static void __init setup_cpumask_of_cpu(void)
11796 +static void __init e820_reserve_setup_data(void)
11797 {
11798 - int i;
11799 +#ifndef CONFIG_XEN
11800 + struct setup_data *data;
11801 + u64 pa_data;
11802 + int found = 0;
11803 +
11804 + if (boot_params.hdr.version < 0x0209)
11805 + return;
11806 + pa_data = boot_params.hdr.setup_data;
11807 + while (pa_data) {
11808 + data = early_ioremap(pa_data, sizeof(*data));
11809 + e820_update_range(pa_data, sizeof(*data)+data->len,
11810 + E820_RAM, E820_RESERVED_KERN);
11811 + found = 1;
11812 + pa_data = data->next;
11813 + early_iounmap(data, sizeof(*data));
11814 + }
11815 + if (!found)
11816 + return;
11817
11818 - /* alloc_bootmem zeroes memory */
11819 - cpumask_of_cpu_map = alloc_bootmem_low(sizeof(cpumask_t) * nr_cpu_ids);
11820 - for (i = 0; i < nr_cpu_ids; i++)
11821 - cpu_set(i, cpumask_of_cpu_map[i]);
11822 + sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
11823 + memcpy(&e820_saved, &e820, sizeof(struct e820map));
11824 + printk(KERN_INFO "extended physical RAM map:\n");
11825 + e820_print_map("reserve setup_data");
11826 +#endif
11827 }
11828 -#else
11829 -static inline void setup_cpumask_of_cpu(void) { }
11830 +
11831 +static void __init reserve_early_setup_data(void)
11832 +{
11833 +#ifndef CONFIG_XEN
11834 + struct setup_data *data;
11835 + u64 pa_data;
11836 + char buf[32];
11837 +
11838 + if (boot_params.hdr.version < 0x0209)
11839 + return;
11840 + pa_data = boot_params.hdr.setup_data;
11841 + while (pa_data) {
11842 + data = early_ioremap(pa_data, sizeof(*data));
11843 + sprintf(buf, "setup data %x", data->type);
11844 + reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf);
11845 + pa_data = data->next;
11846 + early_iounmap(data, sizeof(*data));
11847 + }
11848 #endif
11849 +}
11850
11851 -#ifdef CONFIG_X86_32
11852 /*
11853 - * Great future not-so-futuristic plan: make i386 and x86_64 do it
11854 - * the same way
11855 + * --------- Crashkernel reservation ------------------------------
11856 + */
11857 +
11858 +#ifdef CONFIG_KEXEC
11859 +
11860 +#ifndef CONFIG_XEN
11861 +/**
11862 + * Reserve @size bytes of crashkernel memory at any suitable offset.
11863 + *
11864 + * @size: Size of the crashkernel memory to reserve.
11865 + * Returns the base address on success, and -1ULL on failure.
11866 + */
11867 +unsigned long long __init find_and_reserve_crashkernel(unsigned long long size)
11868 +{
11869 + const unsigned long long alignment = 16<<20; /* 16M */
11870 + unsigned long long start = 0LL;
11871 +
11872 + while (1) {
11873 + int ret;
11874 +
11875 + start = find_e820_area(start, ULONG_MAX, size, alignment);
11876 + if (start == -1ULL)
11877 + return start;
11878 +
11879 + /* try to reserve it */
11880 + ret = reserve_bootmem_generic(start, size, BOOTMEM_EXCLUSIVE);
11881 + if (ret >= 0)
11882 + return start;
11883 +
11884 + start += alignment;
11885 + }
11886 +}
11887 +
11888 +static inline unsigned long long get_total_mem(void)
11889 +{
11890 + unsigned long long total;
11891 +
11892 + total = max_low_pfn - min_low_pfn;
11893 +#ifdef CONFIG_HIGHMEM
11894 + total += highend_pfn - highstart_pfn;
11895 +#endif
11896 +
11897 + return total << PAGE_SHIFT;
11898 +}
11899 +
11900 +static void __init reserve_crashkernel(void)
11901 +{
11902 + unsigned long long total_mem;
11903 + unsigned long long crash_size, crash_base;
11904 + int ret;
11905 +
11906 + total_mem = get_total_mem();
11907 +
11908 + ret = parse_crashkernel(boot_command_line, total_mem,
11909 + &crash_size, &crash_base);
11910 + if (ret != 0 || crash_size <= 0)
11911 + return;
11912 +
11913 + /* 0 means: find the address automatically */
11914 + if (crash_base <= 0) {
11915 + crash_base = find_and_reserve_crashkernel(crash_size);
11916 + if (crash_base == -1ULL) {
11917 + pr_info("crashkernel reservation failed. "
11918 + "No suitable area found.\n");
11919 + return;
11920 + }
11921 + } else {
11922 + ret = reserve_bootmem_generic(crash_base, crash_size,
11923 + BOOTMEM_EXCLUSIVE);
11924 + if (ret < 0) {
11925 + pr_info("crashkernel reservation failed - "
11926 + "memory is in use\n");
11927 + return;
11928 + }
11929 + }
11930 +
11931 + printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
11932 + "for crashkernel (System RAM: %ldMB)\n",
11933 + (unsigned long)(crash_size >> 20),
11934 + (unsigned long)(crash_base >> 20),
11935 + (unsigned long)(total_mem >> 20));
11936 +
11937 + crashk_res.start = crash_base;
11938 + crashk_res.end = crash_base + crash_size - 1;
11939 + insert_resource(&iomem_resource, &crashk_res);
11940 +}
11941 +#else
11942 +#define reserve_crashkernel xen_machine_kexec_setup_resources
11943 +#endif
11944 +#else
11945 +static void __init reserve_crashkernel(void)
11946 +{
11947 +}
11948 +#endif
11949 +
11950 +static struct resource standard_io_resources[] = {
11951 + { .name = "dma1", .start = 0x00, .end = 0x1f,
11952 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11953 + { .name = "pic1", .start = 0x20, .end = 0x21,
11954 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11955 + { .name = "timer0", .start = 0x40, .end = 0x43,
11956 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11957 + { .name = "timer1", .start = 0x50, .end = 0x53,
11958 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11959 + { .name = "keyboard", .start = 0x60, .end = 0x60,
11960 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11961 + { .name = "keyboard", .start = 0x64, .end = 0x64,
11962 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11963 + { .name = "dma page reg", .start = 0x80, .end = 0x8f,
11964 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11965 + { .name = "pic2", .start = 0xa0, .end = 0xa1,
11966 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11967 + { .name = "dma2", .start = 0xc0, .end = 0xdf,
11968 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
11969 + { .name = "fpu", .start = 0xf0, .end = 0xff,
11970 + .flags = IORESOURCE_BUSY | IORESOURCE_IO }
11971 +};
11972 +
11973 +static void __init reserve_standard_io_resources(void)
11974 +{
11975 + int i;
11976 +
11977 + /* Nothing to do if not running in dom0. */
11978 + if (!is_initial_xendomain())
11979 + return;
11980 +
11981 + /* request I/O space for devices used on all i[345]86 PCs */
11982 + for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
11983 + request_resource(&ioport_resource, &standard_io_resources[i]);
11984 +
11985 +}
11986 +
11987 +#ifdef CONFIG_PROC_VMCORE
11988 +/* elfcorehdr= specifies the location of elf core header
11989 + * stored by the crashed kernel. This option will be passed
11990 + * by kexec loader to the capture kernel.
11991 */
11992 -unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
11993 -EXPORT_SYMBOL(__per_cpu_offset);
11994 +static int __init setup_elfcorehdr(char *arg)
11995 +{
11996 + char *end;
11997 + if (!arg)
11998 + return -EINVAL;
11999 + elfcorehdr_addr = memparse(arg, &end);
12000 + return end > arg ? 0 : -EINVAL;
12001 +}
12002 +early_param("elfcorehdr", setup_elfcorehdr);
12003 #endif
12004
12005 +static struct x86_quirks default_x86_quirks __initdata;
12006 +
12007 +struct x86_quirks *x86_quirks __initdata = &default_x86_quirks;
12008 +
12009 +/*
12010 + * Determine if we were loaded by an EFI loader. If so, then we have also been
12011 + * passed the efi memmap, systab, etc., so we should use these data structures
12012 + * for initialization. Note, the efi init code path is determined by the
12013 + * global efi_enabled. This allows the same kernel image to be used on existing
12014 + * systems (with a traditional BIOS) as well as on EFI systems.
12015 + */
12016 /*
12017 - * Great future plan:
12018 - * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
12019 - * Always point %gs to its beginning
12020 + * setup_arch - architecture-specific boot-time initializations
12021 + *
12022 + * Note: On x86_64, fixmaps are ready for use even before this is called.
12023 */
12024 -void __init setup_per_cpu_areas(void)
12025 +
12026 +void __init setup_arch(char **cmdline_p)
12027 {
12028 - int i, highest_cpu = 0;
12029 - unsigned long size;
12030 +#ifdef CONFIG_XEN
12031 + unsigned int i;
12032 + unsigned long p2m_pages;
12033 + struct physdev_set_iopl set_iopl;
12034
12035 -#ifdef CONFIG_HOTPLUG_CPU
12036 - prefill_possible_map();
12037 +#ifdef CONFIG_X86_32
12038 + /* Force a quick death if the kernel panics (not domain 0). */
12039 + extern int panic_timeout;
12040 + if (!panic_timeout && !is_initial_xendomain())
12041 + panic_timeout = 1;
12042 #endif
12043
12044 - /* Copy section for each CPU (we discard the original) */
12045 - size = PERCPU_ENOUGH_ROOM;
12046 - printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n",
12047 - size);
12048 -
12049 - for_each_possible_cpu(i) {
12050 - char *ptr;
12051 -#ifndef CONFIG_NEED_MULTIPLE_NODES
12052 - ptr = alloc_bootmem_pages(size);
12053 -#else
12054 - int node = early_cpu_to_node(i);
12055 - if (!node_online(node) || !NODE_DATA(node)) {
12056 - ptr = alloc_bootmem_pages(size);
12057 - printk(KERN_INFO
12058 - "cpu %d has no node or node-local memory\n", i);
12059 - }
12060 - else
12061 - ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
12062 + /* Register a call for panic conditions. */
12063 + atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
12064 +
12065 + WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
12066 + VMASST_TYPE_writable_pagetables));
12067 +#ifdef CONFIG_X86_32
12068 + WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
12069 + VMASST_TYPE_4gb_segments));
12070 +#endif
12071 +#endif /* CONFIG_XEN */
12072 +
12073 +#ifdef CONFIG_X86_32
12074 + memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
12075 + visws_early_detect();
12076 + pre_setup_arch_hook();
12077 +#else
12078 + printk(KERN_INFO "Command line: %s\n", boot_command_line);
12079 +#endif
12080 +
12081 + early_cpu_init();
12082 + early_ioremap_init();
12083 +
12084 +#ifndef CONFIG_XEN
12085 + ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
12086 + screen_info = boot_params.screen_info;
12087 + edid_info = boot_params.edid_info;
12088 +#ifdef CONFIG_X86_32
12089 + apm_info.bios = boot_params.apm_bios_info;
12090 + ist_info = boot_params.ist_info;
12091 + if (boot_params.sys_desc_table.length != 0) {
12092 + set_mca_bus(boot_params.sys_desc_table.table[3] & 0x2);
12093 + machine_id = boot_params.sys_desc_table.table[0];
12094 + machine_submodel_id = boot_params.sys_desc_table.table[1];
12095 + BIOS_revision = boot_params.sys_desc_table.table[2];
12096 + }
12097 +#endif
12098 + saved_video_mode = boot_params.hdr.vid_mode;
12099 + bootloader_type = boot_params.hdr.type_of_loader;
12100 +
12101 +#ifdef CONFIG_BLK_DEV_RAM
12102 + rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
12103 + rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
12104 + rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
12105 +#endif
12106 +#ifdef CONFIG_EFI
12107 + if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
12108 +#ifdef CONFIG_X86_32
12109 + "EL32",
12110 +#else
12111 + "EL64",
12112 #endif
12113 - if (!ptr)
12114 - panic("Cannot allocate cpu data for CPU %d\n", i);
12115 + 4)) {
12116 + efi_enabled = 1;
12117 + efi_reserve_early();
12118 + }
12119 +#endif
12120 +#else /* CONFIG_XEN */
12121 +#ifdef CONFIG_X86_32
12122 + /* This must be initialized to UNNAMED_MAJOR for ipconfig to work
12123 + properly. Setting ROOT_DEV to default to /dev/ram0 breaks initrd.
12124 + */
12125 + ROOT_DEV = MKDEV(UNNAMED_MAJOR,0);
12126 +#else
12127 + ROOT_DEV = MKDEV(RAMDISK_MAJOR,0);
12128 +#endif
12129 + if (is_initial_xendomain()) {
12130 + const struct dom0_vga_console_info *info =
12131 + (void *)((char *)xen_start_info +
12132 + xen_start_info->console.dom0.info_off);
12133 +
12134 + dom0_init_screen_info(info,
12135 + xen_start_info->console.dom0.info_size);
12136 + xen_start_info->console.domU.mfn = 0;
12137 + xen_start_info->console.domU.evtchn = 0;
12138 + } else
12139 + screen_info.orig_video_isVGA = 0;
12140 + copy_edid();
12141 +#endif /* CONFIG_XEN */
12142 +
12143 + ARCH_SETUP
12144 +
12145 + setup_memory_map();
12146 + parse_setup_data();
12147 + /* update the e820_saved too */
12148 + e820_reserve_setup_data();
12149 +
12150 + copy_edd();
12151 +
12152 +#ifndef CONFIG_XEN
12153 + if (!boot_params.hdr.root_flags)
12154 + root_mountflags &= ~MS_RDONLY;
12155 +#endif
12156 + init_mm.start_code = (unsigned long) _text;
12157 + init_mm.end_code = (unsigned long) _etext;
12158 + init_mm.end_data = (unsigned long) _edata;
12159 +#ifdef CONFIG_X86_32
12160 +#ifndef CONFIG_XEN
12161 + init_mm.brk = init_pg_tables_end + PAGE_OFFSET;
12162 +#else
12163 + init_mm.brk = (PFN_UP(__pa(xen_start_info->pt_base)) +
12164 + xen_start_info->nr_pt_frames) << PAGE_SHIFT;
12165 +#endif
12166 +#else
12167 + init_mm.brk = (unsigned long) &_end;
12168 +#endif
12169 +
12170 + code_resource.start = virt_to_phys(_text);
12171 + code_resource.end = virt_to_phys(_etext)-1;
12172 + data_resource.start = virt_to_phys(_etext);
12173 + data_resource.end = virt_to_phys(_edata)-1;
12174 + bss_resource.start = virt_to_phys(&__bss_start);
12175 + bss_resource.end = virt_to_phys(&__bss_stop)-1;
12176 +
12177 + strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
12178 + *cmdline_p = command_line;
12179 +
12180 + parse_early_param();
12181 +
12182 #ifdef CONFIG_X86_64
12183 - cpu_pda(i)->data_offset = ptr - __per_cpu_start;
12184 + check_efer();
12185 +#endif
12186 +
12187 +#if defined(CONFIG_VMI) && defined(CONFIG_X86_32)
12188 + /*
12189 + * Must be before kernel pagetables are setup
12190 + * or fixmap area is touched.
12191 + */
12192 + vmi_init();
12193 +#endif
12194 +
12195 + /* after early param, so could get panic from serial */
12196 + reserve_early_setup_data();
12197 +
12198 + if (acpi_mps_check()) {
12199 +#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN)
12200 + disable_apic = 1;
12201 +#endif
12202 + setup_clear_cpu_cap(X86_FEATURE_APIC);
12203 + }
12204 +
12205 +#ifdef CONFIG_PCI
12206 + if (pci_early_dump_regs)
12207 + early_dump_pci_devices();
12208 +#endif
12209 +
12210 + finish_e820_parsing();
12211 +
12212 +#ifdef CONFIG_X86_32
12213 + probe_roms();
12214 +#endif
12215 +
12216 +#ifndef CONFIG_XEN
12217 + /* after parse_early_param, so could debug it */
12218 + insert_resource(&iomem_resource, &code_resource);
12219 + insert_resource(&iomem_resource, &data_resource);
12220 + insert_resource(&iomem_resource, &bss_resource);
12221 +
12222 + if (efi_enabled)
12223 + efi_init();
12224 +
12225 +#ifdef CONFIG_X86_32
12226 + if (ppro_with_ram_bug()) {
12227 + e820_update_range(0x70000000ULL, 0x40000ULL, E820_RAM,
12228 + E820_RESERVED);
12229 + sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
12230 + printk(KERN_INFO "fixed physical RAM map:\n");
12231 + e820_print_map("bad_ppro");
12232 + }
12233 #else
12234 - __per_cpu_offset[i] = ptr - __per_cpu_start;
12235 + early_gart_iommu_check();
12236 #endif
12237 - memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
12238 +#endif /* CONFIG_XEN */
12239
12240 - highest_cpu = i;
12241 + /*
12242 + * partially used pages are not usable - thus
12243 + * we are rounding upwards:
12244 + */
12245 + max_pfn = e820_end_of_ram_pfn();
12246 +
12247 + /* preallocate 4k for mptable mpc */
12248 + early_reserve_e820_mpc_new();
12249 + /* update e820 for memory not covered by WB MTRRs */
12250 + mtrr_bp_init();
12251 +#ifndef CONFIG_XEN
12252 + if (mtrr_trim_uncached_memory(max_pfn))
12253 + max_pfn = e820_end_of_ram_pfn();
12254 +#endif
12255 +
12256 +#ifdef CONFIG_X86_32
12257 + /* max_low_pfn get updated here */
12258 + find_low_pfn_range();
12259 +#else
12260 + num_physpages = max_pfn;
12261 + max_mapnr = max_pfn;
12262 +
12263 +
12264 + /* How many end-of-memory variables you have, grandma! */
12265 + /* need this before calling reserve_initrd */
12266 + if (max_pfn > (1UL<<(32 - PAGE_SHIFT)))
12267 + max_low_pfn = e820_end_of_low_ram_pfn();
12268 + else
12269 + max_low_pfn = max_pfn;
12270 +
12271 + high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
12272 +#endif
12273 +
12274 + /* max_pfn_mapped is updated here */
12275 + max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
12276 + max_pfn_mapped = max_low_pfn_mapped;
12277 +
12278 +#ifdef CONFIG_X86_64
12279 + if (max_pfn > max_low_pfn) {
12280 + max_pfn_mapped = init_memory_mapping(1UL<<32,
12281 + max_pfn<<PAGE_SHIFT);
12282 + /* can we preserve max_low_pfn? */
12283 + max_low_pfn = max_pfn;
12284 }
12285 +#endif
12286
12287 - nr_cpu_ids = highest_cpu + 1;
12288 - printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d\n", NR_CPUS, nr_cpu_ids);
12289 + /*
12290 + * NOTE: On x86-32, only from this point on, fixmaps are ready for use.
12291 + */
12292
12293 - /* Setup percpu data maps */
12294 - setup_per_cpu_maps();
12295 +#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
12296 + if (init_ohci1394_dma_early)
12297 + init_ohci1394_dma_on_all_controllers();
12298 +#endif
12299
12300 - /* Setup cpumask_of_cpu map */
12301 - setup_cpumask_of_cpu();
12302 -}
12303 + reserve_initrd();
12304 +
12305 +#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
12306 + vsmp_init();
12307 +#endif
12308 +
12309 + if (is_initial_xendomain())
12310 + dmi_scan_machine();
12311 +
12312 + io_delay_init();
12313 +
12314 +#ifdef CONFIG_ACPI
12315 + if (!is_initial_xendomain()) {
12316 + printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
12317 + disable_acpi();
12318 + }
12319 +#endif
12320 +
12321 + /*
12322 + * Parse the ACPI tables for possible boot-time SMP configuration.
12323 + */
12324 + acpi_boot_table_init();
12325 +
12326 +#ifdef CONFIG_ACPI_NUMA
12327 + /*
12328 + * Parse SRAT to discover nodes.
12329 + */
12330 + acpi_numa_init();
12331 +#endif
12332 +
12333 + initmem_init(0, max_pfn);
12334
12335 +#ifdef CONFIG_ACPI_SLEEP
12336 + /*
12337 + * Reserve low memory region for sleep support.
12338 + */
12339 + acpi_reserve_bootmem();
12340 #endif
12341 +#ifdef CONFIG_X86_FIND_SMP_CONFIG
12342 + /*
12343 + * Find and reserve possible boot-time SMP configuration:
12344 + */
12345 + find_smp_config();
12346 +#endif
12347 + reserve_crashkernel();
12348 +
12349 +#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
12350 + /*
12351 + * dma32_reserve_bootmem() allocates bootmem which may conflict
12352 + * with the crashkernel command line, so do that after
12353 + * reserve_crashkernel()
12354 + */
12355 + dma32_reserve_bootmem();
12356 +#endif
12357 +
12358 + reserve_ibft_region();
12359 +
12360 +#ifdef CONFIG_KVM_CLOCK
12361 + kvmclock_init();
12362 +#endif
12363 +
12364 + xen_pagetable_setup_start(swapper_pg_dir);
12365 + paging_init();
12366 + xen_pagetable_setup_done(swapper_pg_dir);
12367 + paravirt_post_allocator_init();
12368 +
12369 +#ifdef CONFIG_X86_64
12370 + map_vsyscall();
12371 +#endif
12372 +
12373 +#ifdef CONFIG_XEN
12374 + p2m_pages = max_pfn;
12375 + if (xen_start_info->nr_pages > max_pfn) {
12376 + /*
12377 + * the max_pfn was shrunk (probably by mem= or highmem=
12378 + * kernel parameter); shrink reservation with the HV
12379 + */
12380 + struct xen_memory_reservation reservation = {
12381 + .address_bits = 0,
12382 + .extent_order = 0,
12383 + .domid = DOMID_SELF
12384 + };
12385 + unsigned int difference;
12386 + int ret;
12387 +
12388 + difference = xen_start_info->nr_pages - max_pfn;
12389 +
12390 + set_xen_guest_handle(reservation.extent_start,
12391 + ((unsigned long *)xen_start_info->mfn_list) + max_pfn);
12392 + reservation.nr_extents = difference;
12393 + ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
12394 + &reservation);
12395 + BUG_ON(ret != difference);
12396 + }
12397 + else if (max_pfn > xen_start_info->nr_pages)
12398 + p2m_pages = xen_start_info->nr_pages;
12399 +
12400 + if (!xen_feature(XENFEAT_auto_translated_physmap)) {
12401 + unsigned long i, j;
12402 + unsigned int k, fpp;
12403 +
12404 + /* Make sure we have a large enough P->M table. */
12405 + phys_to_machine_mapping = alloc_bootmem_pages(
12406 + max_pfn * sizeof(unsigned long));
12407 + memset(phys_to_machine_mapping, ~0,
12408 + max_pfn * sizeof(unsigned long));
12409 + memcpy(phys_to_machine_mapping,
12410 + (unsigned long *)xen_start_info->mfn_list,
12411 + p2m_pages * sizeof(unsigned long));
12412 + free_bootmem(
12413 + __pa(xen_start_info->mfn_list),
12414 + PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
12415 + sizeof(unsigned long))));
12416 +
12417 + /*
12418 + * Initialise the list of the frames that specify the list of
12419 + * frames that make up the p2m table. Used by save/restore.
12420 + */
12421 + pfn_to_mfn_frame_list_list = alloc_bootmem_pages(PAGE_SIZE);
12422 +
12423 + fpp = PAGE_SIZE/sizeof(unsigned long);
12424 + for (i = j = 0, k = -1; i < max_pfn; i += fpp, j++) {
12425 + if (j == fpp)
12426 + j = 0;
12427 + if (j == 0) {
12428 + k++;
12429 + BUG_ON(k>=ARRAY_SIZE(pfn_to_mfn_frame_list));
12430 + pfn_to_mfn_frame_list[k] =
12431 + alloc_bootmem_pages(PAGE_SIZE);
12432 + pfn_to_mfn_frame_list_list[k] =
12433 + virt_to_mfn(pfn_to_mfn_frame_list[k]);
12434 + }
12435 + pfn_to_mfn_frame_list[k][j] =
12436 + virt_to_mfn(&phys_to_machine_mapping[i]);
12437 + }
12438 + HYPERVISOR_shared_info->arch.max_pfn = max_pfn;
12439 + HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
12440 + virt_to_mfn(pfn_to_mfn_frame_list_list);
12441 + }
12442 +
12443 + /* Mark all ISA DMA channels in-use - using them wouldn't work. */
12444 + for (i = 0; i < MAX_DMA_CHANNELS; ++i)
12445 + if (i != 4 && request_dma(i, "xen") != 0)
12446 + BUG();
12447 +#endif /* CONFIG_XEN */
12448 +
12449 +#ifdef CONFIG_X86_GENERICARCH
12450 + generic_apic_probe();
12451 +#endif
12452 +
12453 +#ifndef CONFIG_XEN
12454 + early_quirks();
12455 +#endif
12456 +
12457 + /*
12458 + * Read APIC and some other early information from ACPI tables.
12459 + */
12460 + acpi_boot_init();
12461 +
12462 +#if defined(CONFIG_X86_MPPARSE) || defined(CONFIG_X86_VISWS)
12463 + /*
12464 + * get boot-time SMP configuration:
12465 + */
12466 + if (smp_found_config)
12467 + get_smp_config();
12468 +#endif
12469 +
12470 + prefill_possible_map();
12471 +#ifdef CONFIG_X86_64
12472 + init_cpu_to_node();
12473 +#endif
12474 +
12475 +#ifndef CONFIG_XEN
12476 + init_apic_mappings();
12477 + ioapic_init_mappings();
12478 +
12479 + kvm_guest_init();
12480 +
12481 + e820_reserve_resources();
12482 + e820_mark_nosave_regions(max_low_pfn);
12483 +#else
12484 + if (is_initial_xendomain())
12485 + e820_reserve_resources();
12486 +#endif
12487 +
12488 +#ifdef CONFIG_X86_32
12489 + if (is_initial_xendomain())
12490 + request_resource(&iomem_resource, &video_ram_resource);
12491 +#endif
12492 + reserve_standard_io_resources();
12493 +
12494 +#ifndef CONFIG_XEN
12495 + e820_setup_gap();
12496 +
12497 +#ifdef CONFIG_VT
12498 +#if defined(CONFIG_VGA_CONSOLE)
12499 + if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
12500 + conswitchp = &vga_con;
12501 +#elif defined(CONFIG_DUMMY_CONSOLE)
12502 + conswitchp = &dummy_con;
12503 +#endif
12504 +#endif
12505 +#else /* CONFIG_XEN */
12506 + if (is_initial_xendomain())
12507 + e820_setup_gap();
12508 +
12509 + set_iopl.iopl = 1;
12510 + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
12511 +
12512 +#ifdef CONFIG_VT
12513 +#ifdef CONFIG_DUMMY_CONSOLE
12514 + conswitchp = &dummy_con;
12515 +#endif
12516 +#ifdef CONFIG_VGA_CONSOLE
12517 + if (is_initial_xendomain())
12518 + conswitchp = &vga_con;
12519 +#endif
12520 +#endif
12521 +#endif /* CONFIG_XEN */
12522 +}
12523 +
12524 +#ifdef CONFIG_XEN
12525 +static int
12526 +xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
12527 +{
12528 + HYPERVISOR_shutdown(SHUTDOWN_crash);
12529 + /* we're never actually going to get here... */
12530 + return NOTIFY_DONE;
12531 +}
12532 +#endif /* CONFIG_XEN */
12533 --- sle11-2009-10-16.orig/arch/x86/kernel/setup64-xen.c 2009-03-16 16:38:05.000000000 +0100
12534 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
12535 @@ -1,370 +0,0 @@
12536 -/*
12537 - * X86-64 specific CPU setup.
12538 - * Copyright (C) 1995 Linus Torvalds
12539 - * Copyright 2001, 2002, 2003 SuSE Labs / Andi Kleen.
12540 - * See setup.c for older changelog.
12541 - *
12542 - * Jun Nakajima <jun.nakajima@intel.com>
12543 - * Modified for Xen
12544 - *
12545 - */
12546 -#include <linux/init.h>
12547 -#include <linux/kernel.h>
12548 -#include <linux/sched.h>
12549 -#include <linux/string.h>
12550 -#include <linux/bootmem.h>
12551 -#include <linux/bitops.h>
12552 -#include <linux/module.h>
12553 -#include <linux/kgdb.h>
12554 -#include <asm/pda.h>
12555 -#include <asm/pgtable.h>
12556 -#include <asm/processor.h>
12557 -#include <asm/desc.h>
12558 -#include <asm/atomic.h>
12559 -#include <asm/mmu_context.h>
12560 -#include <asm/smp.h>
12561 -#include <asm/i387.h>
12562 -#include <asm/percpu.h>
12563 -#include <asm/proto.h>
12564 -#include <asm/sections.h>
12565 -#include <asm/setup.h>
12566 -#include <asm/genapic.h>
12567 -#ifdef CONFIG_XEN
12568 -#include <asm/hypervisor.h>
12569 -#endif
12570 -
12571 -#ifndef CONFIG_DEBUG_BOOT_PARAMS
12572 -struct boot_params __initdata boot_params;
12573 -#else
12574 -struct boot_params boot_params;
12575 -#endif
12576 -
12577 -cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
12578 -
12579 -struct x8664_pda *_cpu_pda[NR_CPUS] __read_mostly;
12580 -EXPORT_SYMBOL(_cpu_pda);
12581 -struct x8664_pda boot_cpu_pda[NR_CPUS] __cacheline_aligned;
12582 -
12583 -#ifndef CONFIG_X86_NO_IDT
12584 -struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
12585 -#endif
12586 -
12587 -char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned")));
12588 -
12589 -unsigned long __supported_pte_mask __read_mostly = ~0UL;
12590 -EXPORT_SYMBOL(__supported_pte_mask);
12591 -
12592 -static int do_not_nx __cpuinitdata = 0;
12593 -
12594 -/* noexec=on|off
12595 -Control non executable mappings for 64bit processes.
12596 -
12597 -on Enable(default)
12598 -off Disable
12599 -*/
12600 -static int __init nonx_setup(char *str)
12601 -{
12602 - if (!str)
12603 - return -EINVAL;
12604 - if (!strncmp(str, "on", 2)) {
12605 - __supported_pte_mask |= _PAGE_NX;
12606 - do_not_nx = 0;
12607 - } else if (!strncmp(str, "off", 3)) {
12608 - do_not_nx = 1;
12609 - __supported_pte_mask &= ~_PAGE_NX;
12610 - }
12611 - return 0;
12612 -}
12613 -early_param("noexec", nonx_setup);
12614 -
12615 -int force_personality32 = 0;
12616 -
12617 -/* noexec32=on|off
12618 -Control non executable heap for 32bit processes.
12619 -To control the stack too use noexec=off
12620 -
12621 -on PROT_READ does not imply PROT_EXEC for 32bit processes (default)
12622 -off PROT_READ implies PROT_EXEC
12623 -*/
12624 -static int __init nonx32_setup(char *str)
12625 -{
12626 - if (!strcmp(str, "on"))
12627 - force_personality32 &= ~READ_IMPLIES_EXEC;
12628 - else if (!strcmp(str, "off"))
12629 - force_personality32 |= READ_IMPLIES_EXEC;
12630 - return 1;
12631 -}
12632 -__setup("noexec32=", nonx32_setup);
12633 -
12634 -#ifdef CONFIG_XEN
12635 -static void __init_refok switch_pt(int cpu)
12636 -{
12637 - if (cpu == 0)
12638 - xen_init_pt();
12639 - xen_pt_switch(__pa_symbol(init_level4_pgt));
12640 - xen_new_user_pt(__pa_symbol(__user_pgd(init_level4_pgt)));
12641 -}
12642 -#define switch_pt() switch_pt(cpu)
12643 -
12644 -static void __cpuinit cpu_gdt_init(const struct desc_ptr *gdt_descr)
12645 -{
12646 - unsigned long frames[16];
12647 - unsigned long va;
12648 - int f;
12649 -
12650 - for (va = gdt_descr->address, f = 0;
12651 - va < gdt_descr->address + gdt_descr->size;
12652 - va += PAGE_SIZE, f++) {
12653 - frames[f] = virt_to_mfn(va);
12654 - make_page_readonly(
12655 - (void *)va, XENFEAT_writable_descriptor_tables);
12656 - }
12657 - if (HYPERVISOR_set_gdt(frames, (gdt_descr->size + 1) /
12658 - sizeof (struct desc_struct)))
12659 - BUG();
12660 -}
12661 -#else
12662 -static void switch_pt(void)
12663 -{
12664 - asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt)));
12665 -}
12666 -
12667 -static void __cpuinit cpu_gdt_init(const struct desc_ptr *gdt_descr)
12668 -{
12669 - load_gdt(gdt_descr);
12670 - load_idt(idt_descr);
12671 -}
12672 -#endif
12673 -
12674 -void pda_init(int cpu)
12675 -{
12676 - struct x8664_pda *pda = cpu_pda(cpu);
12677 -
12678 - /* Setup up data that may be needed in __get_free_pages early */
12679 - asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0));
12680 -#ifndef CONFIG_XEN
12681 - /* Memory clobbers used to order PDA accessed */
12682 - mb();
12683 - wrmsrl(MSR_GS_BASE, pda);
12684 - mb();
12685 -#else
12686 - if (HYPERVISOR_set_segment_base(SEGBASE_GS_KERNEL,
12687 - (unsigned long)pda))
12688 - BUG();
12689 -#endif
12690 - pda->cpunumber = cpu;
12691 - pda->irqcount = -1;
12692 - pda->kernelstack =
12693 - (unsigned long)stack_thread_info() - PDA_STACKOFFSET + THREAD_SIZE;
12694 - pda->active_mm = &init_mm;
12695 - pda->mmu_state = 0;
12696 -
12697 - if (cpu == 0) {
12698 - /* others are initialized in smpboot.c */
12699 - pda->pcurrent = &init_task;
12700 - pda->irqstackptr = boot_cpu_stack;
12701 - } else {
12702 - pda->irqstackptr = (char *)
12703 - __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
12704 - if (!pda->irqstackptr)
12705 - panic("cannot allocate irqstack for cpu %d", cpu);
12706 - }
12707 -
12708 - switch_pt();
12709 -
12710 - pda->irqstackptr += IRQSTACKSIZE-64;
12711 -}
12712 -
12713 -#ifndef CONFIG_X86_NO_TSS
12714 -char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]
12715 -__attribute__((section(".bss.page_aligned")));
12716 -#endif
12717 -
12718 -extern asmlinkage void ignore_sysret(void);
12719 -
12720 -/* May not be marked __init: used by software suspend */
12721 -void syscall_init(void)
12722 -{
12723 -#ifndef CONFIG_XEN
12724 - /*
12725 - * LSTAR and STAR live in a bit strange symbiosis.
12726 - * They both write to the same internal register. STAR allows to set CS/DS
12727 - * but only a 32bit target. LSTAR sets the 64bit rip.
12728 - */
12729 - wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
12730 - wrmsrl(MSR_LSTAR, system_call);
12731 - wrmsrl(MSR_CSTAR, ignore_sysret);
12732 -
12733 - /* Flags to clear on syscall */
12734 - wrmsrl(MSR_SYSCALL_MASK,
12735 - X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
12736 -#endif
12737 -#ifdef CONFIG_IA32_EMULATION
12738 - syscall32_cpu_init ();
12739 -#else
12740 - {
12741 - static const struct callback_register cstar = {
12742 - .type = CALLBACKTYPE_syscall32,
12743 - .address = (unsigned long)ignore_sysret
12744 - };
12745 - if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar))
12746 - printk(KERN_WARNING "Unable to register CSTAR callback\n");
12747 - }
12748 -#endif
12749 -}
12750 -
12751 -void __cpuinit check_efer(void)
12752 -{
12753 - unsigned long efer;
12754 -
12755 - rdmsrl(MSR_EFER, efer);
12756 - if (!(efer & EFER_NX) || do_not_nx) {
12757 - __supported_pte_mask &= ~_PAGE_NX;
12758 - }
12759 -}
12760 -
12761 -unsigned long kernel_eflags;
12762 -
12763 -#ifndef CONFIG_X86_NO_TSS
12764 -/*
12765 - * Copies of the original ist values from the tss are only accessed during
12766 - * debugging, no special alignment required.
12767 - */
12768 -DEFINE_PER_CPU(struct orig_ist, orig_ist);
12769 -#endif
12770 -
12771 -/*
12772 - * cpu_init() initializes state that is per-CPU. Some data is already
12773 - * initialized (naturally) in the bootstrap process, such as the GDT
12774 - * and IDT. We reload them nevertheless, this function acts as a
12775 - * 'CPU state barrier', nothing should get across.
12776 - * A lot of state is already set up in PDA init.
12777 - */
12778 -void __cpuinit cpu_init (void)
12779 -{
12780 - int cpu = stack_smp_processor_id();
12781 -#ifndef CONFIG_X86_NO_TSS
12782 - struct tss_struct *t = &per_cpu(init_tss, cpu);
12783 - struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
12784 - unsigned long v;
12785 - char *estacks = NULL;
12786 - unsigned i;
12787 -#endif
12788 - struct task_struct *me;
12789 -
12790 - /* CPU 0 is initialised in head64.c */
12791 - if (cpu != 0) {
12792 - pda_init(cpu);
12793 - }
12794 -#ifndef CONFIG_X86_NO_TSS
12795 - else
12796 - estacks = boot_exception_stacks;
12797 -#endif
12798 -
12799 - me = current;
12800 -
12801 - if (cpu_test_and_set(cpu, cpu_initialized))
12802 - panic("CPU#%d already initialized!\n", cpu);
12803 -
12804 - printk("Initializing CPU#%d\n", cpu);
12805 -
12806 - clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
12807 -
12808 - /*
12809 - * Initialize the per-CPU GDT with the boot GDT,
12810 - * and set up the GDT descriptor:
12811 - */
12812 -#ifndef CONFIG_XEN
12813 - if (cpu)
12814 - memcpy(get_cpu_gdt_table(cpu), cpu_gdt_table, GDT_SIZE);
12815 -#endif
12816 -
12817 - cpu_gdt_descr[cpu].size = GDT_SIZE;
12818 - cpu_gdt_init(&cpu_gdt_descr[cpu]);
12819 -
12820 - memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
12821 - syscall_init();
12822 -
12823 - wrmsrl(MSR_FS_BASE, 0);
12824 - wrmsrl(MSR_KERNEL_GS_BASE, 0);
12825 - barrier();
12826 -
12827 - check_efer();
12828 -
12829 -#ifndef CONFIG_X86_NO_TSS
12830 - /*
12831 - * set up and load the per-CPU TSS
12832 - */
12833 - for (v = 0; v < N_EXCEPTION_STACKS; v++) {
12834 - static const unsigned int order[N_EXCEPTION_STACKS] = {
12835 - [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
12836 - [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
12837 - };
12838 - if (cpu) {
12839 - estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
12840 - if (!estacks)
12841 - panic("Cannot allocate exception stack %ld %d\n",
12842 - v, cpu);
12843 - }
12844 - estacks += PAGE_SIZE << order[v];
12845 - orig_ist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks;
12846 - }
12847 -
12848 - t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
12849 - /*
12850 - * <= is required because the CPU will access up to
12851 - * 8 bits beyond the end of the IO permission bitmap.
12852 - */
12853 - for (i = 0; i <= IO_BITMAP_LONGS; i++)
12854 - t->io_bitmap[i] = ~0UL;
12855 -#endif
12856 -
12857 - atomic_inc(&init_mm.mm_count);
12858 - me->active_mm = &init_mm;
12859 - if (me->mm)
12860 - BUG();
12861 - enter_lazy_tlb(&init_mm, me);
12862 -
12863 -#ifndef CONFIG_X86_NO_TSS
12864 - set_tss_desc(cpu, t);
12865 -#endif
12866 -#ifndef CONFIG_XEN
12867 - load_TR_desc();
12868 -#endif
12869 - load_LDT(&init_mm.context);
12870 -
12871 -#ifdef CONFIG_KGDB
12872 - /*
12873 - * If the kgdb is connected no debug regs should be altered. This
12874 - * is only applicable when KGDB and a KGDB I/O module are built
12875 - * into the kernel and you are using early debugging with
12876 - * kgdbwait. KGDB will control the kernel HW breakpoint registers.
12877 - */
12878 - if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
12879 - arch_kgdb_ops.correct_hw_break();
12880 - else {
12881 -#endif
12882 - /*
12883 - * Clear all 6 debug registers:
12884 - */
12885 -
12886 - set_debugreg(0UL, 0);
12887 - set_debugreg(0UL, 1);
12888 - set_debugreg(0UL, 2);
12889 - set_debugreg(0UL, 3);
12890 - set_debugreg(0UL, 6);
12891 - set_debugreg(0UL, 7);
12892 -#ifdef CONFIG_KGDB
12893 - /* If the kgdb is connected no debug regs should be altered. */
12894 - }
12895 -#endif
12896 -
12897 - fpu_init();
12898 -
12899 - asm ("pushfq; popq %0" : "=rm" (kernel_eflags));
12900 - if (raw_irqs_disabled())
12901 - kernel_eflags &= ~X86_EFLAGS_IF;
12902 -
12903 - if (is_uv_system())
12904 - uv_cpu_init();
12905 -}
12906 --- sle11-2009-10-16.orig/arch/x86/kernel/setup_32-xen.c 2009-03-16 16:38:05.000000000 +0100
12907 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
12908 @@ -1,1151 +0,0 @@
12909 -/*
12910 - * Copyright (C) 1995 Linus Torvalds
12911 - *
12912 - * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
12913 - *
12914 - * Memory region support
12915 - * David Parsons <orc@pell.chi.il.us>, July-August 1999
12916 - *
12917 - * Added E820 sanitization routine (removes overlapping memory regions);
12918 - * Brian Moyle <bmoyle@mvista.com>, February 2001
12919 - *
12920 - * Moved CPU detection code to cpu/${cpu}.c
12921 - * Patrick Mochel <mochel@osdl.org>, March 2002
12922 - *
12923 - * Provisions for empty E820 memory regions (reported by certain BIOSes).
12924 - * Alex Achenbach <xela@slit.de>, December 2002.
12925 - *
12926 - */
12927 -
12928 -/*
12929 - * This file handles the architecture-dependent parts of initialization
12930 - */
12931 -
12932 -#include <linux/sched.h>
12933 -#include <linux/mm.h>
12934 -#include <linux/mmzone.h>
12935 -#include <linux/screen_info.h>
12936 -#include <linux/ioport.h>
12937 -#include <linux/acpi.h>
12938 -#include <linux/apm_bios.h>
12939 -#include <linux/initrd.h>
12940 -#include <linux/bootmem.h>
12941 -#include <linux/seq_file.h>
12942 -#include <linux/console.h>
12943 -#include <linux/mca.h>
12944 -#include <linux/root_dev.h>
12945 -#include <linux/highmem.h>
12946 -#include <linux/module.h>
12947 -#include <linux/efi.h>
12948 -#include <linux/init.h>
12949 -#include <linux/edd.h>
12950 -#include <linux/iscsi_ibft.h>
12951 -#include <linux/nodemask.h>
12952 -#include <linux/kernel.h>
12953 -#include <linux/percpu.h>
12954 -#include <linux/notifier.h>
12955 -#include <linux/kexec.h>
12956 -#include <linux/crash_dump.h>
12957 -#include <linux/dmi.h>
12958 -#include <linux/pfn.h>
12959 -#include <linux/pci.h>
12960 -#include <linux/init_ohci1394_dma.h>
12961 -#include <linux/kvm_para.h>
12962 -
12963 -#include <video/edid.h>
12964 -
12965 -#include <asm/mtrr.h>
12966 -#include <asm/apic.h>
12967 -#include <asm/e820.h>
12968 -#include <asm/mpspec.h>
12969 -#include <asm/mmzone.h>
12970 -#include <asm/setup.h>
12971 -#include <asm/arch_hooks.h>
12972 -#include <asm/sections.h>
12973 -#include <asm/io_apic.h>
12974 -#include <asm/ist.h>
12975 -#include <asm/io.h>
12976 -#include <asm/hypervisor.h>
12977 -#include <xen/interface/physdev.h>
12978 -#include <xen/interface/memory.h>
12979 -#include <xen/features.h>
12980 -#include <xen/firmware.h>
12981 -#include <xen/xencons.h>
12982 -#include <setup_arch.h>
12983 -#include <asm/bios_ebda.h>
12984 -#include <asm/cacheflush.h>
12985 -#include <asm/processor.h>
12986 -
12987 -#ifdef CONFIG_XEN
12988 -#include <xen/interface/kexec.h>
12989 -#endif
12990 -
12991 -static int xen_panic_event(struct notifier_block *, unsigned long, void *);
12992 -static struct notifier_block xen_panic_block = {
12993 - xen_panic_event, NULL, 0 /* try to go last */
12994 -};
12995 -
12996 -/*
12997 - * Machine setup..
12998 - */
12999 -static struct resource data_resource = {
13000 - .name = "Kernel data",
13001 - .start = 0,
13002 - .end = 0,
13003 - .flags = IORESOURCE_BUSY | IORESOURCE_MEM
13004 -};
13005 -
13006 -static struct resource code_resource = {
13007 - .name = "Kernel code",
13008 - .start = 0,
13009 - .end = 0,
13010 - .flags = IORESOURCE_BUSY | IORESOURCE_MEM
13011 -};
13012 -
13013 -static struct resource bss_resource = {
13014 - .name = "Kernel bss",
13015 - .start = 0,
13016 - .end = 0,
13017 - .flags = IORESOURCE_BUSY | IORESOURCE_MEM
13018 -};
13019 -
13020 -static struct resource video_ram_resource = {
13021 - .name = "Video RAM area",
13022 - .start = 0xa0000,
13023 - .end = 0xbffff,
13024 - .flags = IORESOURCE_BUSY | IORESOURCE_MEM
13025 -};
13026 -
13027 -static struct resource standard_io_resources[] = { {
13028 - .name = "dma1",
13029 - .start = 0x0000,
13030 - .end = 0x001f,
13031 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
13032 -}, {
13033 - .name = "pic1",
13034 - .start = 0x0020,
13035 - .end = 0x0021,
13036 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
13037 -}, {
13038 - .name = "timer0",
13039 - .start = 0x0040,
13040 - .end = 0x0043,
13041 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
13042 -}, {
13043 - .name = "timer1",
13044 - .start = 0x0050,
13045 - .end = 0x0053,
13046 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
13047 -}, {
13048 - .name = "keyboard",
13049 - .start = 0x0060,
13050 - .end = 0x0060,
13051 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
13052 -}, {
13053 - .name = "keyboard",
13054 - .start = 0x0064,
13055 - .end = 0x0064,
13056 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
13057 -}, {
13058 - .name = "dma page reg",
13059 - .start = 0x0080,
13060 - .end = 0x008f,
13061 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
13062 -}, {
13063 - .name = "pic2",
13064 - .start = 0x00a0,
13065 - .end = 0x00a1,
13066 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
13067 -}, {
13068 - .name = "dma2",
13069 - .start = 0x00c0,
13070 - .end = 0x00df,
13071 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
13072 -}, {
13073 - .name = "fpu",
13074 - .start = 0x00f0,
13075 - .end = 0x00ff,
13076 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
13077 -} };
13078 -
13079 -/* cpu data as detected by the assembly code in head.S */
13080 -struct cpuinfo_x86 new_cpu_data __cpuinitdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
13081 -/* common cpu data for all cpus */
13082 -struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
13083 -EXPORT_SYMBOL(boot_cpu_data);
13084 -
13085 -unsigned int def_to_bigsmp;
13086 -
13087 -#ifndef CONFIG_X86_PAE
13088 -unsigned long mmu_cr4_features;
13089 -#else
13090 -unsigned long mmu_cr4_features = X86_CR4_PAE;
13091 -#endif
13092 -
13093 -/* for MCA, but anyone else can use it if they want */
13094 -unsigned int machine_id;
13095 -unsigned int machine_submodel_id;
13096 -unsigned int BIOS_revision;
13097 -
13098 -/* Boot loader ID as an integer, for the benefit of proc_dointvec */
13099 -int bootloader_type;
13100 -
13101 -/* user-defined highmem size */
13102 -static unsigned int highmem_pages = -1;
13103 -
13104 -/*
13105 - * Setup options
13106 - */
13107 -struct screen_info screen_info;
13108 -EXPORT_SYMBOL(screen_info);
13109 -struct apm_info apm_info;
13110 -EXPORT_SYMBOL(apm_info);
13111 -struct edid_info edid_info;
13112 -EXPORT_SYMBOL_GPL(edid_info);
13113 -#ifndef CONFIG_XEN
13114 -#define copy_edid() (edid_info = boot_params.edid_info)
13115 -#endif
13116 -struct ist_info ist_info;
13117 -#if defined(CONFIG_X86_SPEEDSTEP_SMI) || \
13118 - defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
13119 -EXPORT_SYMBOL(ist_info);
13120 -#endif
13121 -
13122 -extern void early_cpu_init(void);
13123 -extern int root_mountflags;
13124 -
13125 -unsigned long saved_video_mode;
13126 -
13127 -#define RAMDISK_IMAGE_START_MASK 0x07FF
13128 -#define RAMDISK_PROMPT_FLAG 0x8000
13129 -#define RAMDISK_LOAD_FLAG 0x4000
13130 -
13131 -static char __initdata command_line[COMMAND_LINE_SIZE];
13132 -
13133 -#ifndef CONFIG_DEBUG_BOOT_PARAMS
13134 -struct boot_params __initdata boot_params;
13135 -#else
13136 -struct boot_params boot_params;
13137 -#endif
13138 -
13139 -/*
13140 - * Point at the empty zero page to start with. We map the real shared_info
13141 - * page as soon as fixmap is up and running.
13142 - */
13143 -shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
13144 -EXPORT_SYMBOL(HYPERVISOR_shared_info);
13145 -
13146 -unsigned long *phys_to_machine_mapping;
13147 -unsigned long *pfn_to_mfn_frame_list_list, *pfn_to_mfn_frame_list[16];
13148 -EXPORT_SYMBOL(phys_to_machine_mapping);
13149 -
13150 -/* Raw start-of-day parameters from the hypervisor. */
13151 -start_info_t *xen_start_info;
13152 -EXPORT_SYMBOL(xen_start_info);
13153 -
13154 -#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
13155 -struct edd edd;
13156 -#ifdef CONFIG_EDD_MODULE
13157 -EXPORT_SYMBOL(edd);
13158 -#endif
13159 -#ifndef CONFIG_XEN
13160 -/**
13161 - * copy_edd() - Copy the BIOS EDD information
13162 - * from boot_params into a safe place.
13163 - *
13164 - */
13165 -static inline void copy_edd(void)
13166 -{
13167 - memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer,
13168 - sizeof(edd.mbr_signature));
13169 - memcpy(edd.edd_info, boot_params.eddbuf, sizeof(edd.edd_info));
13170 - edd.mbr_signature_nr = boot_params.edd_mbr_sig_buf_entries;
13171 - edd.edd_info_nr = boot_params.eddbuf_entries;
13172 -}
13173 -#endif
13174 -#else
13175 -static inline void copy_edd(void)
13176 -{
13177 -}
13178 -#endif
13179 -
13180 -int __initdata user_defined_memmap;
13181 -
13182 -/*
13183 - * "mem=nopentium" disables the 4MB page tables.
13184 - * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM
13185 - * to <mem>, overriding the bios size.
13186 - * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from
13187 - * <start> to <start>+<mem>, overriding the bios size.
13188 - *
13189 - * HPA tells me bootloaders need to parse mem=, so no new
13190 - * option should be mem= [also see Documentation/i386/boot.txt]
13191 - */
13192 -static int __init parse_mem(char *arg)
13193 -{
13194 - if (!arg)
13195 - return -EINVAL;
13196 -
13197 - if (strcmp(arg, "nopentium") == 0) {
13198 - setup_clear_cpu_cap(X86_FEATURE_PSE);
13199 - } else {
13200 - /* If the user specifies memory size, we
13201 - * limit the BIOS-provided memory map to
13202 - * that size. exactmap can be used to specify
13203 - * the exact map. mem=number can be used to
13204 - * trim the existing memory map.
13205 - */
13206 - unsigned long long mem_size;
13207 -
13208 - mem_size = memparse(arg, &arg);
13209 - limit_regions(mem_size);
13210 - user_defined_memmap = 1;
13211 - }
13212 - return 0;
13213 -}
13214 -early_param("mem", parse_mem);
13215 -
13216 -#ifdef CONFIG_PROC_VMCORE
13217 -/* elfcorehdr= specifies the location of elf core header
13218 - * stored by the crashed kernel.
13219 - */
13220 -static int __init parse_elfcorehdr(char *arg)
13221 -{
13222 - if (!arg)
13223 - return -EINVAL;
13224 -
13225 - elfcorehdr_addr = memparse(arg, &arg);
13226 - return 0;
13227 -}
13228 -early_param("elfcorehdr", parse_elfcorehdr);
13229 -#endif /* CONFIG_PROC_VMCORE */
13230 -
13231 -/*
13232 - * highmem=size forces highmem to be exactly 'size' bytes.
13233 - * This works even on boxes that have no highmem otherwise.
13234 - * This also works to reduce highmem size on bigger boxes.
13235 - */
13236 -static int __init parse_highmem(char *arg)
13237 -{
13238 - if (!arg)
13239 - return -EINVAL;
13240 -
13241 - highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT;
13242 - return 0;
13243 -}
13244 -early_param("highmem", parse_highmem);
13245 -
13246 -/*
13247 - * vmalloc=size forces the vmalloc area to be exactly 'size'
13248 - * bytes. This can be used to increase (or decrease) the
13249 - * vmalloc area - the default is 128m.
13250 - */
13251 -static int __init parse_vmalloc(char *arg)
13252 -{
13253 - if (!arg)
13254 - return -EINVAL;
13255 -
13256 - __VMALLOC_RESERVE = memparse(arg, &arg);
13257 - return 0;
13258 -}
13259 -early_param("vmalloc", parse_vmalloc);
13260 -
13261 -#ifndef CONFIG_XEN
13262 -/*
13263 - * reservetop=size reserves a hole at the top of the kernel address space which
13264 - * a hypervisor can load into later. Needed for dynamically loaded hypervisors,
13265 - * so relocating the fixmap can be done before paging initialization.
13266 - */
13267 -static int __init parse_reservetop(char *arg)
13268 -{
13269 - unsigned long address;
13270 -
13271 - if (!arg)
13272 - return -EINVAL;
13273 -
13274 - address = memparse(arg, &arg);
13275 - reserve_top_address(address);
13276 - return 0;
13277 -}
13278 -early_param("reservetop", parse_reservetop);
13279 -#endif
13280 -
13281 -/*
13282 - * Determine low and high memory ranges:
13283 - */
13284 -unsigned long __init find_max_low_pfn(void)
13285 -{
13286 - unsigned long max_low_pfn;
13287 -
13288 - max_low_pfn = max_pfn;
13289 - if (max_low_pfn > MAXMEM_PFN) {
13290 - if (highmem_pages == -1)
13291 - highmem_pages = max_pfn - MAXMEM_PFN;
13292 - if (highmem_pages + MAXMEM_PFN < max_pfn)
13293 - max_pfn = MAXMEM_PFN + highmem_pages;
13294 - if (highmem_pages + MAXMEM_PFN > max_pfn) {
13295 - printk("only %luMB highmem pages available, ignoring highmem size of %uMB.\n", pages_to_mb(max_pfn - MAXMEM_PFN), pages_to_mb(highmem_pages));
13296 - highmem_pages = 0;
13297 - }
13298 - max_low_pfn = MAXMEM_PFN;
13299 -#ifndef CONFIG_HIGHMEM
13300 - /* Maximum memory usable is what is directly addressable */
13301 - printk(KERN_WARNING "Warning only %ldMB will be used.\n",
13302 - MAXMEM>>20);
13303 - if (max_pfn > MAX_NONPAE_PFN)
13304 - printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
13305 - else
13306 - printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
13307 - max_pfn = MAXMEM_PFN;
13308 -#else /* !CONFIG_HIGHMEM */
13309 -#ifndef CONFIG_HIGHMEM64G
13310 - if (max_pfn > MAX_NONPAE_PFN) {
13311 - max_pfn = MAX_NONPAE_PFN;
13312 - printk(KERN_WARNING "Warning only 4GB will be used.\n");
13313 - printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
13314 - }
13315 -#endif /* !CONFIG_HIGHMEM64G */
13316 -#endif /* !CONFIG_HIGHMEM */
13317 - } else {
13318 - if (highmem_pages == -1)
13319 - highmem_pages = 0;
13320 -#ifdef CONFIG_HIGHMEM
13321 - if (highmem_pages >= max_pfn) {
13322 - printk(KERN_ERR "highmem size specified (%uMB) is bigger than pages available (%luMB)!.\n", pages_to_mb(highmem_pages), pages_to_mb(max_pfn));
13323 - highmem_pages = 0;
13324 - }
13325 - if (highmem_pages) {
13326 - if (max_low_pfn-highmem_pages < 64*1024*1024/PAGE_SIZE){
13327 - printk(KERN_ERR "highmem size %uMB results in smaller than 64MB lowmem, ignoring it.\n", pages_to_mb(highmem_pages));
13328 - highmem_pages = 0;
13329 - }
13330 - max_low_pfn -= highmem_pages;
13331 - }
13332 -#else
13333 - if (highmem_pages)
13334 - printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n");
13335 -#endif
13336 - }
13337 - return max_low_pfn;
13338 -}
13339 -
13340 -#ifndef CONFIG_XEN
13341 -#define BIOS_LOWMEM_KILOBYTES 0x413
13342 -
13343 -/*
13344 - * The BIOS places the EBDA/XBDA at the top of conventional
13345 - * memory, and usually decreases the reported amount of
13346 - * conventional memory (int 0x12) too. This also contains a
13347 - * workaround for Dell systems that neglect to reserve EBDA.
13348 - * The same workaround also avoids a problem with the AMD768MPX
13349 - * chipset: reserve a page before VGA to prevent PCI prefetch
13350 - * into it (errata #56). Usually the page is reserved anyways,
13351 - * unless you have no PS/2 mouse plugged in.
13352 - */
13353 -static void __init reserve_ebda_region(void)
13354 -{
13355 - unsigned int lowmem, ebda_addr;
13356 -
13357 - /* To determine the position of the EBDA and the */
13358 - /* end of conventional memory, we need to look at */
13359 - /* the BIOS data area. In a paravirtual environment */
13360 - /* that area is absent. We'll just have to assume */
13361 - /* that the paravirt case can handle memory setup */
13362 - /* correctly, without our help. */
13363 - if (paravirt_enabled())
13364 - return;
13365 -
13366 - /* end of low (conventional) memory */
13367 - lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
13368 - lowmem <<= 10;
13369 -
13370 - /* start of EBDA area */
13371 - ebda_addr = get_bios_ebda();
13372 -
13373 - /* Fixup: bios puts an EBDA in the top 64K segment */
13374 - /* of conventional memory, but does not adjust lowmem. */
13375 - if ((lowmem - ebda_addr) <= 0x10000)
13376 - lowmem = ebda_addr;
13377 -
13378 - /* Fixup: bios does not report an EBDA at all. */
13379 - /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
13380 - if ((ebda_addr == 0) && (lowmem >= 0x9f000))
13381 - lowmem = 0x9f000;
13382 -
13383 - /* Paranoia: should never happen, but... */
13384 - if ((lowmem == 0) || (lowmem >= 0x100000))
13385 - lowmem = 0x9f000;
13386 -
13387 - /* reserve all memory between lowmem and the 1MB mark */
13388 - reserve_bootmem(lowmem, 0x100000 - lowmem, BOOTMEM_DEFAULT);
13389 -}
13390 -#endif
13391 -
13392 -#ifndef CONFIG_NEED_MULTIPLE_NODES
13393 -static void __init setup_bootmem_allocator(void);
13394 -static unsigned long __init setup_memory(void)
13395 -{
13396 - /*
13397 - * partially used pages are not usable - thus
13398 - * we are rounding upwards:
13399 - */
13400 - min_low_pfn = PFN_UP(__pa(xen_start_info->pt_base)) +
13401 - xen_start_info->nr_pt_frames;
13402 -
13403 - max_low_pfn = find_max_low_pfn();
13404 -
13405 -#ifdef CONFIG_HIGHMEM
13406 - highstart_pfn = highend_pfn = max_pfn;
13407 - if (max_pfn > max_low_pfn) {
13408 - highstart_pfn = max_low_pfn;
13409 - }
13410 - printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
13411 - pages_to_mb(highend_pfn - highstart_pfn));
13412 - num_physpages = highend_pfn;
13413 - high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
13414 -#else
13415 - num_physpages = max_low_pfn;
13416 - high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
13417 -#endif
13418 -#ifdef CONFIG_FLATMEM
13419 - max_mapnr = num_physpages;
13420 -#endif
13421 - printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
13422 - pages_to_mb(max_low_pfn));
13423 -
13424 - setup_bootmem_allocator();
13425 -
13426 - return max_low_pfn;
13427 -}
13428 -
13429 -static void __init zone_sizes_init(void)
13430 -{
13431 - unsigned long max_zone_pfns[MAX_NR_ZONES];
13432 - memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
13433 - max_zone_pfns[ZONE_DMA] =
13434 - virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
13435 - max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
13436 -#ifdef CONFIG_HIGHMEM
13437 - max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
13438 - add_active_range(0, 0, highend_pfn);
13439 -#else
13440 - add_active_range(0, 0, max_low_pfn);
13441 -#endif
13442 -
13443 - free_area_init_nodes(max_zone_pfns);
13444 -}
13445 -#else
13446 -extern unsigned long __init setup_memory(void);
13447 -extern void zone_sizes_init(void);
13448 -#endif /* !CONFIG_NEED_MULTIPLE_NODES */
13449 -
13450 -static inline unsigned long long get_total_mem(void)
13451 -{
13452 - unsigned long long total;
13453 -
13454 - total = max_low_pfn - min_low_pfn;
13455 -#ifdef CONFIG_HIGHMEM
13456 - total += highend_pfn - highstart_pfn;
13457 -#endif
13458 -
13459 - return total << PAGE_SHIFT;
13460 -}
13461 -
13462 -#ifdef CONFIG_KEXEC
13463 -#ifndef CONFIG_XEN
13464 -static void __init reserve_crashkernel(void)
13465 -{
13466 - unsigned long long total_mem;
13467 - unsigned long long crash_size, crash_base;
13468 - int ret;
13469 -
13470 - total_mem = get_total_mem();
13471 -
13472 - ret = parse_crashkernel(boot_command_line, total_mem,
13473 - &crash_size, &crash_base);
13474 - if (ret == 0 && crash_size > 0) {
13475 - if (crash_base > 0) {
13476 - printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
13477 - "for crashkernel (System RAM: %ldMB)\n",
13478 - (unsigned long)(crash_size >> 20),
13479 - (unsigned long)(crash_base >> 20),
13480 - (unsigned long)(total_mem >> 20));
13481 -
13482 - if (reserve_bootmem(crash_base, crash_size,
13483 - BOOTMEM_EXCLUSIVE) < 0) {
13484 - printk(KERN_INFO "crashkernel reservation "
13485 - "failed - memory is in use\n");
13486 - return;
13487 - }
13488 -
13489 - crashk_res.start = crash_base;
13490 - crashk_res.end = crash_base + crash_size - 1;
13491 - } else
13492 - printk(KERN_INFO "crashkernel reservation failed - "
13493 - "you have to specify a base address\n");
13494 - }
13495 -}
13496 -#else
13497 -#define reserve_crashkernel xen_machine_kexec_setup_resources
13498 -#endif
13499 -#else
13500 -static inline void __init reserve_crashkernel(void)
13501 -{}
13502 -#endif
13503 -
13504 -#ifdef CONFIG_BLK_DEV_INITRD
13505 -
13506 -static bool do_relocate_initrd = false;
13507 -
13508 -static void __init reserve_initrd(void)
13509 -{
13510 - unsigned long ramdisk_image = __pa(xen_start_info->mod_start);
13511 - unsigned long ramdisk_size = xen_start_info->mod_len;
13512 - unsigned long ramdisk_end = ramdisk_image + ramdisk_size;
13513 - unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
13514 - unsigned long ramdisk_here;
13515 -
13516 - initrd_start = 0;
13517 -
13518 - if (!xen_start_info->mod_start || !ramdisk_size)
13519 - return; /* No initrd provided by bootloader */
13520 -
13521 - if (ramdisk_end < ramdisk_image) {
13522 - printk(KERN_ERR "initrd wraps around end of memory, "
13523 - "disabling initrd\n");
13524 - return;
13525 - }
13526 - if (ramdisk_size >= end_of_lowmem/2) {
13527 - printk(KERN_ERR "initrd too large to handle, "
13528 - "disabling initrd\n");
13529 - return;
13530 - }
13531 - if (ramdisk_end <= end_of_lowmem) {
13532 - /* All in lowmem, easy case */
13533 - reserve_bootmem(ramdisk_image, ramdisk_size, BOOTMEM_DEFAULT);
13534 - initrd_start = ramdisk_image + PAGE_OFFSET;
13535 - initrd_end = initrd_start+ramdisk_size;
13536 - return;
13537 - }
13538 -
13539 - /* We need to move the initrd down into lowmem */
13540 - ramdisk_here = (end_of_lowmem - ramdisk_size) & PAGE_MASK;
13541 -
13542 - /* Note: this includes all the lowmem currently occupied by
13543 - the initrd, we rely on that fact to keep the data intact. */
13544 - reserve_bootmem(ramdisk_here, ramdisk_size, BOOTMEM_DEFAULT);
13545 - initrd_start = ramdisk_here + PAGE_OFFSET;
13546 - initrd_end = initrd_start + ramdisk_size;
13547 -
13548 - do_relocate_initrd = true;
13549 -}
13550 -
13551 -#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
13552 -
13553 -static void __init relocate_initrd(void)
13554 -{
13555 - unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
13556 - unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
13557 - unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
13558 - unsigned long ramdisk_here;
13559 - unsigned long slop, clen, mapaddr;
13560 - char *p, *q;
13561 -
13562 - if (!do_relocate_initrd)
13563 - return;
13564 -
13565 - ramdisk_here = initrd_start - PAGE_OFFSET;
13566 -
13567 - q = (char *)initrd_start;
13568 -
13569 - /* Copy any lowmem portion of the initrd */
13570 - if (ramdisk_image < end_of_lowmem) {
13571 - clen = end_of_lowmem - ramdisk_image;
13572 - p = (char *)__va(ramdisk_image);
13573 - memcpy(q, p, clen);
13574 - q += clen;
13575 - ramdisk_image += clen;
13576 - ramdisk_size -= clen;
13577 - }
13578 -
13579 - /* Copy the highmem portion of the initrd */
13580 - while (ramdisk_size) {
13581 - slop = ramdisk_image & ~PAGE_MASK;
13582 - clen = ramdisk_size;
13583 - if (clen > MAX_MAP_CHUNK-slop)
13584 - clen = MAX_MAP_CHUNK-slop;
13585 - mapaddr = ramdisk_image & PAGE_MASK;
13586 - p = early_ioremap(mapaddr, clen+slop);
13587 - memcpy(q, p+slop, clen);
13588 - early_iounmap(p, clen+slop);
13589 - q += clen;
13590 - ramdisk_image += clen;
13591 - ramdisk_size -= clen;
13592 - }
13593 -}
13594 -
13595 -#endif /* CONFIG_BLK_DEV_INITRD */
13596 -
13597 -void __init setup_bootmem_allocator(void)
13598 -{
13599 - unsigned long bootmap_size;
13600 - /*
13601 - * Initialize the boot-time allocator (with low memory only):
13602 - */
13603 - bootmap_size = init_bootmem(min_low_pfn, max_low_pfn);
13604 -
13605 - register_bootmem_low_pages(max_low_pfn);
13606 -
13607 - /*
13608 - * Reserve the bootmem bitmap itself as well. We do this in two
13609 - * steps (first step was init_bootmem()) because this catches
13610 - * the (very unlikely) case of us accidentally initializing the
13611 - * bootmem allocator with an invalid RAM area.
13612 - */
13613 - reserve_bootmem(__pa_symbol(_text), (PFN_PHYS(min_low_pfn) +
13614 - bootmap_size + PAGE_SIZE-1) - __pa_symbol(_text),
13615 - BOOTMEM_DEFAULT);
13616 -
13617 -#ifndef CONFIG_XEN
13618 - /*
13619 - * reserve physical page 0 - it's a special BIOS page on many boxes,
13620 - * enabling clean reboots, SMP operation, laptop functions.
13621 - */
13622 - reserve_bootmem(0, PAGE_SIZE, BOOTMEM_DEFAULT);
13623 -
13624 - /* reserve EBDA region */
13625 - reserve_ebda_region();
13626 -
13627 -#ifdef CONFIG_SMP
13628 - /*
13629 - * But first pinch a few for the stack/trampoline stuff
13630 - * FIXME: Don't need the extra page at 4K, but need to fix
13631 - * trampoline before removing it. (see the GDT stuff)
13632 - */
13633 - reserve_bootmem(PAGE_SIZE, PAGE_SIZE, BOOTMEM_DEFAULT);
13634 -#endif
13635 -#ifdef CONFIG_ACPI_SLEEP
13636 - /*
13637 - * Reserve low memory region for sleep support.
13638 - */
13639 - acpi_reserve_bootmem();
13640 -#endif
13641 -#endif /* !CONFIG_XEN */
13642 -
13643 -#ifdef CONFIG_BLK_DEV_INITRD
13644 - reserve_initrd();
13645 -#endif
13646 - numa_kva_reserve();
13647 - reserve_crashkernel();
13648 -
13649 - reserve_ibft_region();
13650 -}
13651 -
13652 -/*
13653 - * The node 0 pgdat is initialized before all of these because
13654 - * it's needed for bootmem. node>0 pgdats have their virtual
13655 - * space allocated before the pagetables are in place to access
13656 - * them, so they can't be cleared then.
13657 - *
13658 - * This should all compile down to nothing when NUMA is off.
13659 - */
13660 -static void __init remapped_pgdat_init(void)
13661 -{
13662 - int nid;
13663 -
13664 - for_each_online_node(nid) {
13665 - if (nid != 0)
13666 - memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
13667 - }
13668 -}
13669 -
13670 -#ifdef CONFIG_MCA
13671 -static void set_mca_bus(int x)
13672 -{
13673 - MCA_bus = x;
13674 -}
13675 -#else
13676 -static void set_mca_bus(int x) { }
13677 -#endif
13678 -
13679 -/* Overridden in paravirt.c if CONFIG_PARAVIRT */
13680 -char * __init __attribute__((weak)) memory_setup(void)
13681 -{
13682 - return machine_specific_memory_setup();
13683 -}
13684 -
13685 -#ifdef CONFIG_NUMA
13686 -/*
13687 - * In the golden day, when everything among i386 and x86_64 will be
13688 - * integrated, this will not live here
13689 - */
13690 -void *x86_cpu_to_node_map_early_ptr;
13691 -int x86_cpu_to_node_map_init[NR_CPUS] = {
13692 - [0 ... NR_CPUS-1] = NUMA_NO_NODE
13693 -};
13694 -DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE;
13695 -#endif
13696 -
13697 -/*
13698 - * Determine if we were loaded by an EFI loader. If so, then we have also been
13699 - * passed the efi memmap, systab, etc., so we should use these data structures
13700 - * for initialization. Note, the efi init code path is determined by the
13701 - * global efi_enabled. This allows the same kernel image to be used on existing
13702 - * systems (with a traditional BIOS) as well as on EFI systems.
13703 - */
13704 -void __init setup_arch(char **cmdline_p)
13705 -{
13706 - int i, j, k, fpp;
13707 - struct physdev_set_iopl set_iopl;
13708 - unsigned long max_low_pfn;
13709 - unsigned long p2m_pages;
13710 -
13711 - /* Force a quick death if the kernel panics (not domain 0). */
13712 - extern int panic_timeout;
13713 - if (!panic_timeout && !is_initial_xendomain())
13714 - panic_timeout = 1;
13715 -
13716 - /* Register a call for panic conditions. */
13717 - atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
13718 -
13719 - WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
13720 - VMASST_TYPE_4gb_segments));
13721 - WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
13722 - VMASST_TYPE_writable_pagetables));
13723 -
13724 - memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
13725 - pre_setup_arch_hook();
13726 - early_cpu_init();
13727 - early_ioremap_init();
13728 -#ifdef CONFIG_SMP
13729 - prefill_possible_map();
13730 -#endif
13731 -
13732 -#ifdef CONFIG_EFI
13733 - if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
13734 - "EL32", 4))
13735 - efi_enabled = 1;
13736 -#endif
13737 -
13738 - /* This must be initialized to UNNAMED_MAJOR for ipconfig to work
13739 - properly. Setting ROOT_DEV to default to /dev/ram0 breaks initrd.
13740 - */
13741 - ROOT_DEV = MKDEV(UNNAMED_MAJOR,0);
13742 - screen_info = boot_params.screen_info;
13743 - copy_edid();
13744 - apm_info.bios = boot_params.apm_bios_info;
13745 - ist_info = boot_params.ist_info;
13746 - saved_video_mode = boot_params.hdr.vid_mode;
13747 - if( boot_params.sys_desc_table.length != 0 ) {
13748 - set_mca_bus(boot_params.sys_desc_table.table[3] & 0x2);
13749 - machine_id = boot_params.sys_desc_table.table[0];
13750 - machine_submodel_id = boot_params.sys_desc_table.table[1];
13751 - BIOS_revision = boot_params.sys_desc_table.table[2];
13752 - }
13753 - bootloader_type = boot_params.hdr.type_of_loader;
13754 -
13755 - if (is_initial_xendomain()) {
13756 - const struct dom0_vga_console_info *info =
13757 - (void *)((char *)xen_start_info +
13758 - xen_start_info->console.dom0.info_off);
13759 -
13760 - dom0_init_screen_info(info,
13761 - xen_start_info->console.dom0.info_size);
13762 - xen_start_info->console.domU.mfn = 0;
13763 - xen_start_info->console.domU.evtchn = 0;
13764 - } else
13765 - screen_info.orig_video_isVGA = 0;
13766 -
13767 -#ifdef CONFIG_BLK_DEV_RAM
13768 - rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
13769 - rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
13770 - rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
13771 -#endif
13772 -
13773 - ARCH_SETUP
13774 -
13775 - printk(KERN_INFO "BIOS-provided physical RAM map:\n");
13776 - print_memory_map(memory_setup());
13777 -
13778 - copy_edd();
13779 -
13780 - if (!boot_params.hdr.root_flags)
13781 - root_mountflags &= ~MS_RDONLY;
13782 - init_mm.start_code = (unsigned long) _text;
13783 - init_mm.end_code = (unsigned long) _etext;
13784 - init_mm.end_data = (unsigned long) _edata;
13785 - init_mm.brk = (PFN_UP(__pa(xen_start_info->pt_base)) +
13786 - xen_start_info->nr_pt_frames) << PAGE_SHIFT;
13787 -
13788 - code_resource.start = virt_to_phys(_text);
13789 - code_resource.end = virt_to_phys(_etext)-1;
13790 - data_resource.start = virt_to_phys(_etext);
13791 - data_resource.end = virt_to_phys(_edata)-1;
13792 - bss_resource.start = virt_to_phys(&__bss_start);
13793 - bss_resource.end = virt_to_phys(&__bss_stop)-1;
13794 -
13795 - if ((i = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE)
13796 - i = COMMAND_LINE_SIZE;
13797 - memcpy(boot_command_line, xen_start_info->cmd_line, i);
13798 - boot_command_line[i - 1] = '\0';
13799 - parse_early_param();
13800 -
13801 - if (user_defined_memmap) {
13802 - printk(KERN_INFO "user-defined physical RAM map:\n");
13803 - print_memory_map("user");
13804 - }
13805 -
13806 - strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
13807 - *cmdline_p = command_line;
13808 -
13809 - if (efi_enabled)
13810 - efi_init();
13811 -
13812 - /* update e820 for memory not covered by WB MTRRs */
13813 - propagate_e820_map();
13814 - mtrr_bp_init();
13815 -#ifndef CONFIG_XEN
13816 - if (mtrr_trim_uncached_memory(max_pfn))
13817 - propagate_e820_map();
13818 -#endif
13819 -
13820 - max_low_pfn = setup_memory();
13821 -
13822 -#ifdef CONFIG_KVM_CLOCK
13823 - kvmclock_init();
13824 -#endif
13825 -
13826 -#ifdef CONFIG_VMI
13827 - /*
13828 - * Must be after max_low_pfn is determined, and before kernel
13829 - * pagetables are setup.
13830 - */
13831 - vmi_init();
13832 -#endif
13833 - kvm_guest_init();
13834 -
13835 - /*
13836 - * NOTE: before this point _nobody_ is allowed to allocate
13837 - * any memory using the bootmem allocator. Although the
13838 - * allocator is now initialised only the first 8Mb of the kernel
13839 - * virtual address space has been mapped. All allocations before
13840 - * paging_init() has completed must use the alloc_bootmem_low_pages()
13841 - * variant (which allocates DMA'able memory) and care must be taken
13842 - * not to exceed the 8Mb limit.
13843 - */
13844 -
13845 -#ifdef CONFIG_SMP
13846 - smp_alloc_memory(); /* AP processor realmode stacks in low memory*/
13847 -#endif
13848 - paging_init();
13849 -
13850 - /*
13851 - * NOTE: On x86-32, only from this point on, fixmaps are ready for use.
13852 - */
13853 -
13854 -#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
13855 - if (init_ohci1394_dma_early)
13856 - init_ohci1394_dma_on_all_controllers();
13857 -#endif
13858 -
13859 - remapped_pgdat_init();
13860 - sparse_init();
13861 - zone_sizes_init();
13862 -
13863 -#ifdef CONFIG_X86_FIND_SMP_CONFIG
13864 - /*
13865 - * Find and reserve possible boot-time SMP configuration:
13866 - */
13867 - find_smp_config();
13868 -#endif
13869 -
13870 - p2m_pages = max_pfn;
13871 - if (xen_start_info->nr_pages > max_pfn) {
13872 - /*
13873 - * the max_pfn was shrunk (probably by mem= or highmem=
13874 - * kernel parameter); shrink reservation with the HV
13875 - */
13876 - struct xen_memory_reservation reservation = {
13877 - .address_bits = 0,
13878 - .extent_order = 0,
13879 - .domid = DOMID_SELF
13880 - };
13881 - unsigned int difference;
13882 - int ret;
13883 -
13884 - difference = xen_start_info->nr_pages - max_pfn;
13885 -
13886 - set_xen_guest_handle(reservation.extent_start,
13887 - ((unsigned long *)xen_start_info->mfn_list) + max_pfn);
13888 - reservation.nr_extents = difference;
13889 - ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
13890 - &reservation);
13891 - BUG_ON (ret != difference);
13892 - }
13893 - else if (max_pfn > xen_start_info->nr_pages)
13894 - p2m_pages = xen_start_info->nr_pages;
13895 -
13896 - /* Make sure we have a correctly sized P->M table. */
13897 - if (!xen_feature(XENFEAT_auto_translated_physmap)) {
13898 - phys_to_machine_mapping = alloc_bootmem_low_pages(
13899 - max_pfn * sizeof(unsigned long));
13900 - memset(phys_to_machine_mapping, ~0,
13901 - max_pfn * sizeof(unsigned long));
13902 - memcpy(phys_to_machine_mapping,
13903 - (unsigned long *)xen_start_info->mfn_list,
13904 - p2m_pages * sizeof(unsigned long));
13905 - free_bootmem(
13906 - __pa(xen_start_info->mfn_list),
13907 - PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
13908 - sizeof(unsigned long))));
13909 -
13910 - /*
13911 - * Initialise the list of the frames that specify the list of
13912 - * frames that make up the p2m table. Used by save/restore
13913 - */
13914 - pfn_to_mfn_frame_list_list = alloc_bootmem_low_pages(PAGE_SIZE);
13915 -
13916 - fpp = PAGE_SIZE/sizeof(unsigned long);
13917 - for (i=0, j=0, k=-1; i< max_pfn; i+=fpp, j++) {
13918 - if ((j % fpp) == 0) {
13919 - k++;
13920 - BUG_ON(k>=16);
13921 - pfn_to_mfn_frame_list[k] =
13922 - alloc_bootmem_low_pages(PAGE_SIZE);
13923 - pfn_to_mfn_frame_list_list[k] =
13924 - virt_to_mfn(pfn_to_mfn_frame_list[k]);
13925 - j=0;
13926 - }
13927 - pfn_to_mfn_frame_list[k][j] =
13928 - virt_to_mfn(&phys_to_machine_mapping[i]);
13929 - }
13930 - HYPERVISOR_shared_info->arch.max_pfn = max_pfn;
13931 - HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
13932 - virt_to_mfn(pfn_to_mfn_frame_list_list);
13933 - }
13934 -
13935 - /* Mark all ISA DMA channels in-use - using them wouldn't work. */
13936 - for (i = 0; i < MAX_DMA_CHANNELS; ++i)
13937 - if (i != 4 && request_dma(i, "xen") != 0)
13938 - BUG();
13939 -
13940 - /*
13941 - * NOTE: at this point the bootmem allocator is fully available.
13942 - */
13943 -
13944 -#ifdef CONFIG_BLK_DEV_INITRD
13945 - relocate_initrd();
13946 -#endif
13947 -
13948 - paravirt_post_allocator_init();
13949 -
13950 - if (is_initial_xendomain())
13951 - dmi_scan_machine();
13952 -
13953 - io_delay_init();
13954 -
13955 -#if defined(CONFIG_X86_SMP) && !defined(CONFIG_XEN)
13956 - /*
13957 - * setup to use the early static init tables during kernel startup
13958 - * X86_SMP will exclude sub-arches that don't deal well with it.
13959 - */
13960 - x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init;
13961 - x86_bios_cpu_apicid_early_ptr = (void *)x86_bios_cpu_apicid_init;
13962 -#ifdef CONFIG_NUMA
13963 - x86_cpu_to_node_map_early_ptr = (void *)x86_cpu_to_node_map_init;
13964 -#endif
13965 -#endif
13966 -
13967 -#ifdef CONFIG_X86_GENERICARCH
13968 - generic_apic_probe();
13969 -#endif
13970 -
13971 - set_iopl.iopl = 1;
13972 - WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
13973 -
13974 -#ifdef CONFIG_ACPI
13975 - if (!is_initial_xendomain()) {
13976 - printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
13977 - acpi_disabled = 1;
13978 - acpi_ht = 0;
13979 - }
13980 -
13981 - /*
13982 - * Parse the ACPI tables for possible boot-time SMP configuration.
13983 - */
13984 - acpi_boot_table_init();
13985 -#endif
13986 -
13987 -#ifndef CONFIG_XEN
13988 - early_quirks();
13989 -#endif
13990 -
13991 -#ifdef CONFIG_ACPI
13992 - acpi_boot_init();
13993 -
13994 -#if defined(CONFIG_SMP) && defined(CONFIG_X86_PC)
13995 - if (def_to_bigsmp)
13996 - printk(KERN_WARNING "More than 8 CPUs detected and "
13997 - "CONFIG_X86_PC cannot handle it.\nUse "
13998 - "CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n");
13999 -#endif
14000 -#endif
14001 -#ifdef CONFIG_X86_LOCAL_APIC
14002 - if (smp_found_config)
14003 - get_smp_config();
14004 -#endif
14005 -
14006 - e820_register_memory();
14007 - e820_mark_nosave_regions();
14008 -
14009 - if (is_initial_xendomain()) {
14010 -#ifdef CONFIG_VT
14011 -#if defined(CONFIG_VGA_CONSOLE)
14012 - if (!efi_enabled ||
14013 - (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
14014 - conswitchp = &vga_con;
14015 -#elif defined(CONFIG_DUMMY_CONSOLE)
14016 - conswitchp = &dummy_con;
14017 -#endif
14018 -#endif
14019 - } else {
14020 -#if defined(CONFIG_VT) && defined(CONFIG_DUMMY_CONSOLE)
14021 - conswitchp = &dummy_con;
14022 -#endif
14023 - }
14024 -}
14025 -
14026 -static int
14027 -xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
14028 -{
14029 - HYPERVISOR_shutdown(SHUTDOWN_crash);
14030 - /* we're never actually going to get here... */
14031 - return NOTIFY_DONE;
14032 -}
14033 -
14034 -/*
14035 - * Request address space for all standard resources
14036 - *
14037 - * This is called just before pcibios_init(), which is also a
14038 - * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
14039 - */
14040 -static int __init request_standard_resources(void)
14041 -{
14042 - int i;
14043 -
14044 - /* Nothing to do if not running in dom0. */
14045 - if (!is_initial_xendomain())
14046 - return 0;
14047 -
14048 - printk(KERN_INFO "Setting up standard PCI resources\n");
14049 - init_iomem_resources(&code_resource, &data_resource, &bss_resource);
14050 -
14051 - request_resource(&iomem_resource, &video_ram_resource);
14052 -
14053 - /* request I/O space for devices used on all i[345]86 PCs */
14054 - for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
14055 - request_resource(&ioport_resource, &standard_io_resources[i]);
14056 - return 0;
14057 -}
14058 -
14059 -subsys_initcall(request_standard_resources);
14060 --- sle11-2009-10-16.orig/arch/x86/kernel/setup_64-xen.c 2009-03-16 16:38:05.000000000 +0100
14061 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
14062 @@ -1,1433 +0,0 @@
14063 -/*
14064 - * Copyright (C) 1995 Linus Torvalds
14065 - */
14066 -
14067 -/*
14068 - * This file handles the architecture-dependent parts of initialization
14069 - */
14070 -
14071 -#include <linux/errno.h>
14072 -#include <linux/sched.h>
14073 -#include <linux/kernel.h>
14074 -#include <linux/mm.h>
14075 -#include <linux/stddef.h>
14076 -#include <linux/unistd.h>
14077 -#include <linux/ptrace.h>
14078 -#include <linux/slab.h>
14079 -#include <linux/user.h>
14080 -#include <linux/screen_info.h>
14081 -#include <linux/ioport.h>
14082 -#include <linux/delay.h>
14083 -#include <linux/init.h>
14084 -#include <linux/initrd.h>
14085 -#include <linux/highmem.h>
14086 -#include <linux/bootmem.h>
14087 -#include <linux/module.h>
14088 -#include <asm/processor.h>
14089 -#include <linux/console.h>
14090 -#include <linux/seq_file.h>
14091 -#include <linux/crash_dump.h>
14092 -#include <linux/root_dev.h>
14093 -#include <linux/pci.h>
14094 -#include <asm/pci-direct.h>
14095 -#include <linux/efi.h>
14096 -#include <linux/acpi.h>
14097 -#include <linux/kallsyms.h>
14098 -#include <linux/edd.h>
14099 -#include <linux/iscsi_ibft.h>
14100 -#include <linux/mmzone.h>
14101 -#include <linux/kexec.h>
14102 -#include <linux/cpufreq.h>
14103 -#include <linux/dmi.h>
14104 -#include <linux/dma-mapping.h>
14105 -#include <linux/ctype.h>
14106 -#include <linux/sort.h>
14107 -#include <linux/uaccess.h>
14108 -#include <linux/init_ohci1394_dma.h>
14109 -#include <linux/kvm_para.h>
14110 -
14111 -#include <asm/mtrr.h>
14112 -#include <asm/uaccess.h>
14113 -#include <asm/system.h>
14114 -#include <asm/vsyscall.h>
14115 -#include <asm/io.h>
14116 -#include <asm/smp.h>
14117 -#include <asm/msr.h>
14118 -#include <asm/desc.h>
14119 -#include <video/edid.h>
14120 -#include <asm/e820.h>
14121 -#include <asm/dma.h>
14122 -#include <asm/gart.h>
14123 -#include <asm/mpspec.h>
14124 -#include <asm/mmu_context.h>
14125 -#include <asm/proto.h>
14126 -#include <asm/setup.h>
14127 -#include <asm/numa.h>
14128 -#include <asm/sections.h>
14129 -#include <asm/dmi.h>
14130 -#include <asm/cacheflush.h>
14131 -#include <asm/mce.h>
14132 -#include <asm/ds.h>
14133 -#include <asm/topology.h>
14134 -#include <asm/pat.h>
14135 -
14136 -#include <mach_apic.h>
14137 -#ifdef CONFIG_XEN
14138 -#include <linux/percpu.h>
14139 -#include <xen/interface/physdev.h>
14140 -#include "setup_arch_pre.h"
14141 -#include <asm/hypervisor.h>
14142 -#include <xen/interface/nmi.h>
14143 -#include <xen/features.h>
14144 -#include <xen/firmware.h>
14145 -#include <xen/xencons.h>
14146 -#define PFN_UP(x) (((x) + PAGE_SIZE-1) >> PAGE_SHIFT)
14147 -#define PFN_PHYS(x) ((x) << PAGE_SHIFT)
14148 -#include <asm/mach-xen/setup_arch_post.h>
14149 -#include <xen/interface/memory.h>
14150 -
14151 -#ifdef CONFIG_XEN
14152 -#include <xen/interface/kexec.h>
14153 -#endif
14154 -
14155 -extern unsigned long start_pfn;
14156 -extern struct edid_info edid_info;
14157 -
14158 -shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
14159 -EXPORT_SYMBOL(HYPERVISOR_shared_info);
14160 -
14161 -static int xen_panic_event(struct notifier_block *, unsigned long, void *);
14162 -static struct notifier_block xen_panic_block = {
14163 - xen_panic_event, NULL, 0 /* try to go last */
14164 -};
14165 -
14166 -unsigned long *phys_to_machine_mapping;
14167 -unsigned long *pfn_to_mfn_frame_list_list, *pfn_to_mfn_frame_list[512];
14168 -
14169 -EXPORT_SYMBOL(phys_to_machine_mapping);
14170 -
14171 -DEFINE_PER_CPU(multicall_entry_t, multicall_list[8]);
14172 -DEFINE_PER_CPU(int, nr_multicall_ents);
14173 -
14174 -/* Raw start-of-day parameters from the hypervisor. */
14175 -start_info_t *xen_start_info;
14176 -EXPORT_SYMBOL(xen_start_info);
14177 -#endif
14178 -
14179 -/*
14180 - * Machine setup..
14181 - */
14182 -
14183 -struct cpuinfo_x86 boot_cpu_data __read_mostly;
14184 -EXPORT_SYMBOL(boot_cpu_data);
14185 -
14186 -__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
14187 -
14188 -unsigned long mmu_cr4_features;
14189 -
14190 -/* Boot loader ID as an integer, for the benefit of proc_dointvec */
14191 -int bootloader_type;
14192 -
14193 -unsigned long saved_video_mode;
14194 -
14195 -int force_mwait __cpuinitdata;
14196 -
14197 -/*
14198 - * Early DMI memory
14199 - */
14200 -int dmi_alloc_index;
14201 -char dmi_alloc_data[DMI_MAX_DATA];
14202 -
14203 -/*
14204 - * Setup options
14205 - */
14206 -struct screen_info screen_info;
14207 -EXPORT_SYMBOL(screen_info);
14208 -struct sys_desc_table_struct {
14209 - unsigned short length;
14210 - unsigned char table[0];
14211 -};
14212 -
14213 -struct edid_info edid_info;
14214 -EXPORT_SYMBOL_GPL(edid_info);
14215 -
14216 -extern int root_mountflags;
14217 -
14218 -char __initdata command_line[COMMAND_LINE_SIZE];
14219 -
14220 -static struct resource standard_io_resources[] = {
14221 - { .name = "dma1", .start = 0x00, .end = 0x1f,
14222 - .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14223 - { .name = "pic1", .start = 0x20, .end = 0x21,
14224 - .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14225 - { .name = "timer0", .start = 0x40, .end = 0x43,
14226 - .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14227 - { .name = "timer1", .start = 0x50, .end = 0x53,
14228 - .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14229 - { .name = "keyboard", .start = 0x60, .end = 0x60,
14230 - .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14231 - { .name = "keyboard", .start = 0x64, .end = 0x64,
14232 - .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14233 - { .name = "dma page reg", .start = 0x80, .end = 0x8f,
14234 - .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14235 - { .name = "pic2", .start = 0xa0, .end = 0xa1,
14236 - .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14237 - { .name = "dma2", .start = 0xc0, .end = 0xdf,
14238 - .flags = IORESOURCE_BUSY | IORESOURCE_IO },
14239 - { .name = "fpu", .start = 0xf0, .end = 0xff,
14240 - .flags = IORESOURCE_BUSY | IORESOURCE_IO }
14241 -};
14242 -
14243 -#define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM)
14244 -
14245 -static struct resource data_resource = {
14246 - .name = "Kernel data",
14247 - .start = 0,
14248 - .end = 0,
14249 - .flags = IORESOURCE_RAM,
14250 -};
14251 -static struct resource code_resource = {
14252 - .name = "Kernel code",
14253 - .start = 0,
14254 - .end = 0,
14255 - .flags = IORESOURCE_RAM,
14256 -};
14257 -static struct resource bss_resource = {
14258 - .name = "Kernel bss",
14259 - .start = 0,
14260 - .end = 0,
14261 - .flags = IORESOURCE_RAM,
14262 -};
14263 -
14264 -static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c);
14265 -
14266 -#ifdef CONFIG_PROC_VMCORE
14267 -/* elfcorehdr= specifies the location of elf core header
14268 - * stored by the crashed kernel. This option will be passed
14269 - * by kexec loader to the capture kernel.
14270 - */
14271 -static int __init setup_elfcorehdr(char *arg)
14272 -{
14273 - char *end;
14274 - if (!arg)
14275 - return -EINVAL;
14276 - elfcorehdr_addr = memparse(arg, &end);
14277 - return end > arg ? 0 : -EINVAL;
14278 -}
14279 -early_param("elfcorehdr", setup_elfcorehdr);
14280 -#endif
14281 -
14282 -#ifndef CONFIG_NUMA
14283 -static void __init
14284 -contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
14285 -{
14286 - unsigned long bootmap_size, bootmap;
14287 -
14288 - bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
14289 - bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size,
14290 - PAGE_SIZE);
14291 - if (bootmap == -1L)
14292 - panic("Cannot find bootmem map of size %ld\n", bootmap_size);
14293 - bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
14294 - e820_register_active_regions(0, start_pfn, end_pfn);
14295 -#ifdef CONFIG_XEN
14296 - free_bootmem_with_active_regions(0, xen_start_info->nr_pages);
14297 - early_res_to_bootmem(0, xen_start_info->nr_pages<<PAGE_SHIFT);
14298 -#else
14299 - free_bootmem_with_active_regions(0, end_pfn);
14300 - early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
14301 -#endif
14302 - reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
14303 -}
14304 -#endif
14305 -
14306 -#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
14307 -struct edd edd;
14308 -#ifdef CONFIG_EDD_MODULE
14309 -EXPORT_SYMBOL(edd);
14310 -#endif
14311 -#ifndef CONFIG_XEN
14312 -/**
14313 - * copy_edd() - Copy the BIOS EDD information
14314 - * from boot_params into a safe place.
14315 - *
14316 - */
14317 -static inline void copy_edd(void)
14318 -{
14319 - memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer,
14320 - sizeof(edd.mbr_signature));
14321 - memcpy(edd.edd_info, boot_params.eddbuf, sizeof(edd.edd_info));
14322 - edd.mbr_signature_nr = boot_params.edd_mbr_sig_buf_entries;
14323 - edd.edd_info_nr = boot_params.eddbuf_entries;
14324 -}
14325 -#endif
14326 -#else
14327 -static inline void copy_edd(void)
14328 -{
14329 -}
14330 -#endif
14331 -
14332 -#ifdef CONFIG_KEXEC
14333 -#ifndef CONFIG_XEN
14334 -static void __init reserve_crashkernel(void)
14335 -{
14336 - unsigned long long total_mem;
14337 - unsigned long long crash_size, crash_base;
14338 - int ret;
14339 -
14340 - total_mem = ((unsigned long long)max_low_pfn - min_low_pfn) << PAGE_SHIFT;
14341 -
14342 - ret = parse_crashkernel(boot_command_line, total_mem,
14343 - &crash_size, &crash_base);
14344 - if (ret == 0 && crash_size) {
14345 - if (crash_base <= 0) {
14346 - printk(KERN_INFO "crashkernel reservation failed - "
14347 - "you have to specify a base address\n");
14348 - return;
14349 - }
14350 -
14351 - if (reserve_bootmem(crash_base, crash_size,
14352 - BOOTMEM_EXCLUSIVE) < 0) {
14353 - printk(KERN_INFO "crashkernel reservation failed - "
14354 - "memory is in use\n");
14355 - return;
14356 - }
14357 -
14358 - printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
14359 - "for crashkernel (System RAM: %ldMB)\n",
14360 - (unsigned long)(crash_size >> 20),
14361 - (unsigned long)(crash_base >> 20),
14362 - (unsigned long)(total_mem >> 20));
14363 - crashk_res.start = crash_base;
14364 - crashk_res.end = crash_base + crash_size - 1;
14365 - insert_resource(&iomem_resource, &crashk_res);
14366 - }
14367 -}
14368 -#else
14369 -#define reserve_crashkernel xen_machine_kexec_setup_resources
14370 -#endif
14371 -#else
14372 -static inline void __init reserve_crashkernel(void)
14373 -{}
14374 -#endif
14375 -
14376 -/* Overridden in paravirt.c if CONFIG_PARAVIRT */
14377 -void __attribute__((weak)) __init memory_setup(void)
14378 -{
14379 - machine_specific_memory_setup();
14380 -}
14381 -
14382 -static void __init parse_setup_data(void)
14383 -{
14384 - struct setup_data *data;
14385 - unsigned long pa_data;
14386 -
14387 - if (boot_params.hdr.version < 0x0209)
14388 - return;
14389 - pa_data = boot_params.hdr.setup_data;
14390 - while (pa_data) {
14391 - data = early_ioremap(pa_data, PAGE_SIZE);
14392 - switch (data->type) {
14393 - default:
14394 - break;
14395 - }
14396 -#ifndef CONFIG_DEBUG_BOOT_PARAMS
14397 - free_early(pa_data, pa_data+sizeof(*data)+data->len);
14398 -#endif
14399 - pa_data = data->next;
14400 - early_iounmap(data, PAGE_SIZE);
14401 - }
14402 -}
14403 -
14404 -#ifdef CONFIG_PCI_MMCONFIG
14405 -extern void __cpuinit fam10h_check_enable_mmcfg(void);
14406 -extern void __init check_enable_amd_mmconf_dmi(void);
14407 -#else
14408 -void __cpuinit fam10h_check_enable_mmcfg(void)
14409 -{
14410 -}
14411 -void __init check_enable_amd_mmconf_dmi(void)
14412 -{
14413 -}
14414 -#endif
14415 -
14416 -/*
14417 - * setup_arch - architecture-specific boot-time initializations
14418 - *
14419 - * Note: On x86_64, fixmaps are ready for use even before this is called.
14420 - */
14421 -void __init setup_arch(char **cmdline_p)
14422 -{
14423 - unsigned i;
14424 -
14425 -#ifdef CONFIG_XEN
14426 - extern struct e820map machine_e820;
14427 -
14428 - printk(KERN_INFO "Command line: %s\n", boot_command_line);
14429 -
14430 - /* Register a call for panic conditions. */
14431 - atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
14432 -
14433 - WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
14434 - VMASST_TYPE_writable_pagetables));
14435 -
14436 - early_ioremap_init();
14437 -
14438 - ROOT_DEV = MKDEV(RAMDISK_MAJOR,0);
14439 - screen_info = boot_params.screen_info;
14440 -
14441 - if (is_initial_xendomain()) {
14442 - const struct dom0_vga_console_info *info =
14443 - (void *)((char *)xen_start_info +
14444 - xen_start_info->console.dom0.info_off);
14445 -
14446 - dom0_init_screen_info(info,
14447 - xen_start_info->console.dom0.info_size);
14448 - xen_start_info->console.domU.mfn = 0;
14449 - xen_start_info->console.domU.evtchn = 0;
14450 - } else
14451 - screen_info.orig_video_isVGA = 0;
14452 -
14453 - copy_edid();
14454 -#else
14455 - printk(KERN_INFO "Command line: %s\n", boot_command_line);
14456 -
14457 - ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
14458 - screen_info = boot_params.screen_info;
14459 - edid_info = boot_params.edid_info;
14460 -#endif /* !CONFIG_XEN */
14461 - saved_video_mode = boot_params.hdr.vid_mode;
14462 - bootloader_type = boot_params.hdr.type_of_loader;
14463 -
14464 -#ifdef CONFIG_BLK_DEV_RAM
14465 - rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
14466 - rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
14467 - rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
14468 -#endif
14469 -#ifdef CONFIG_EFI
14470 - if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
14471 - "EL64", 4))
14472 - efi_enabled = 1;
14473 -#endif
14474 -
14475 - ARCH_SETUP
14476 -
14477 - memory_setup();
14478 - copy_edd();
14479 -
14480 - if (!boot_params.hdr.root_flags)
14481 - root_mountflags &= ~MS_RDONLY;
14482 - init_mm.start_code = (unsigned long) &_text;
14483 - init_mm.end_code = (unsigned long) &_etext;
14484 - init_mm.end_data = (unsigned long) &_edata;
14485 - init_mm.brk = (unsigned long) &_end;
14486 -
14487 - code_resource.start = virt_to_phys(&_text);
14488 - code_resource.end = virt_to_phys(&_etext)-1;
14489 - data_resource.start = virt_to_phys(&_etext);
14490 - data_resource.end = virt_to_phys(&_edata)-1;
14491 - bss_resource.start = virt_to_phys(&__bss_start);
14492 - bss_resource.end = virt_to_phys(&__bss_stop)-1;
14493 -
14494 - early_identify_cpu(&boot_cpu_data);
14495 -
14496 - strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
14497 - *cmdline_p = command_line;
14498 -
14499 - parse_setup_data();
14500 -
14501 - parse_early_param();
14502 -
14503 -#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
14504 - if (init_ohci1394_dma_early)
14505 - init_ohci1394_dma_on_all_controllers();
14506 -#endif
14507 -
14508 - finish_e820_parsing();
14509 -
14510 -#ifndef CONFIG_XEN
14511 - /* after parse_early_param, so could debug it */
14512 - insert_resource(&iomem_resource, &code_resource);
14513 - insert_resource(&iomem_resource, &data_resource);
14514 - insert_resource(&iomem_resource, &bss_resource);
14515 -#endif
14516 -
14517 - early_gart_iommu_check();
14518 -
14519 - e820_register_active_regions(0, 0, -1UL);
14520 - /*
14521 - * partially used pages are not usable - thus
14522 - * we are rounding upwards:
14523 - */
14524 - end_pfn = e820_end_of_ram();
14525 - /* update e820 for memory not covered by WB MTRRs */
14526 - mtrr_bp_init();
14527 -#ifndef CONFIG_XEN
14528 - if (mtrr_trim_uncached_memory(end_pfn)) {
14529 - e820_register_active_regions(0, 0, -1UL);
14530 - end_pfn = e820_end_of_ram();
14531 - }
14532 -#endif
14533 -
14534 - num_physpages = end_pfn;
14535 - max_mapnr = end_pfn;
14536 -
14537 - check_efer();
14538 -
14539 - max_pfn_mapped = init_memory_mapping(0, (max_pfn_mapped << PAGE_SHIFT));
14540 - if (efi_enabled)
14541 - efi_init();
14542 -
14543 -#ifndef CONFIG_XEN
14544 - vsmp_init();
14545 -#endif
14546 -
14547 - if (is_initial_xendomain())
14548 - dmi_scan_machine();
14549 -
14550 - io_delay_init();
14551 -
14552 -#ifdef CONFIG_KVM_CLOCK
14553 - kvmclock_init();
14554 -#endif
14555 -
14556 -#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
14557 - /* setup to use the early static init tables during kernel startup */
14558 - x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init;
14559 - x86_bios_cpu_apicid_early_ptr = (void *)x86_bios_cpu_apicid_init;
14560 -#ifdef CONFIG_NUMA
14561 - x86_cpu_to_node_map_early_ptr = (void *)x86_cpu_to_node_map_init;
14562 -#endif
14563 -#endif
14564 -
14565 - /* How many end-of-memory variables you have, grandma! */
14566 - max_low_pfn = end_pfn;
14567 - max_pfn = end_pfn;
14568 - high_memory = (void *)__va(end_pfn * PAGE_SIZE - 1) + 1;
14569 -
14570 - /* Remove active ranges so rediscovery with NUMA-awareness happens */
14571 - remove_all_active_ranges();
14572 -
14573 -#ifdef CONFIG_ACPI_NUMA
14574 - /*
14575 - * Parse SRAT to discover nodes.
14576 - */
14577 - acpi_numa_init();
14578 -#endif
14579 -
14580 -#ifdef CONFIG_NUMA
14581 - numa_initmem_init(0, end_pfn);
14582 -#else
14583 - contig_initmem_init(0, end_pfn);
14584 -#endif
14585 -
14586 -#ifndef CONFIG_XEN
14587 - dma32_reserve_bootmem();
14588 -
14589 -#ifdef CONFIG_ACPI_SLEEP
14590 - /*
14591 - * Reserve low memory region for sleep support.
14592 - */
14593 - acpi_reserve_bootmem();
14594 -#endif
14595 -
14596 - if (efi_enabled)
14597 - efi_reserve_bootmem();
14598 -#endif
14599 -
14600 -#ifdef CONFIG_BLK_DEV_INITRD
14601 -#ifdef CONFIG_XEN
14602 - if (xen_start_info->mod_start) {
14603 - unsigned long ramdisk_image = __pa(xen_start_info->mod_start);
14604 - unsigned long ramdisk_size = xen_start_info->mod_len;
14605 -#else
14606 - if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
14607 - unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
14608 - unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
14609 -#endif
14610 - unsigned long ramdisk_end = ramdisk_image + ramdisk_size;
14611 - unsigned long end_of_mem = end_pfn << PAGE_SHIFT;
14612 -
14613 - if (ramdisk_end <= end_of_mem) {
14614 - /*
14615 - * don't need to reserve again, already reserved early
14616 - * in x86_64_start_kernel, and early_res_to_bootmem
14617 - * convert that to reserved in bootmem
14618 - */
14619 - initrd_start = ramdisk_image + PAGE_OFFSET;
14620 - initrd_end = initrd_start+ramdisk_size;
14621 -#ifdef CONFIG_XEN
14622 - initrd_below_start_ok = 1;
14623 -#endif
14624 - } else {
14625 - free_bootmem(ramdisk_image, ramdisk_size);
14626 - printk(KERN_ERR "initrd extends beyond end of memory "
14627 - "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
14628 - ramdisk_end, end_of_mem);
14629 - initrd_start = 0;
14630 - }
14631 - }
14632 -#endif
14633 - reserve_crashkernel();
14634 -
14635 - reserve_ibft_region();
14636 -
14637 - paging_init();
14638 - map_vsyscall();
14639 -#ifdef CONFIG_X86_LOCAL_APIC
14640 - /*
14641 - * Find and reserve possible boot-time SMP configuration:
14642 - */
14643 - find_smp_config();
14644 -#endif
14645 -#ifdef CONFIG_XEN
14646 - {
14647 - int i, j, k, fpp;
14648 - unsigned long p2m_pages;
14649 -
14650 - p2m_pages = end_pfn;
14651 - if (xen_start_info->nr_pages > end_pfn) {
14652 - /*
14653 - * the end_pfn was shrunk (probably by mem= or highmem=
14654 - * kernel parameter); shrink reservation with the HV
14655 - */
14656 - struct xen_memory_reservation reservation = {
14657 - .address_bits = 0,
14658 - .extent_order = 0,
14659 - .domid = DOMID_SELF
14660 - };
14661 - unsigned int difference;
14662 - int ret;
14663 -
14664 - difference = xen_start_info->nr_pages - end_pfn;
14665 -
14666 - set_xen_guest_handle(reservation.extent_start,
14667 - ((unsigned long *)xen_start_info->mfn_list) + end_pfn);
14668 - reservation.nr_extents = difference;
14669 - ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
14670 - &reservation);
14671 - BUG_ON (ret != difference);
14672 - }
14673 - else if (end_pfn > xen_start_info->nr_pages)
14674 - p2m_pages = xen_start_info->nr_pages;
14675 -
14676 - if (!xen_feature(XENFEAT_auto_translated_physmap)) {
14677 - /* Make sure we have a large enough P->M table. */
14678 - phys_to_machine_mapping = alloc_bootmem_pages(
14679 - end_pfn * sizeof(unsigned long));
14680 - memset(phys_to_machine_mapping, ~0,
14681 - end_pfn * sizeof(unsigned long));
14682 - memcpy(phys_to_machine_mapping,
14683 - (unsigned long *)xen_start_info->mfn_list,
14684 - p2m_pages * sizeof(unsigned long));
14685 - free_bootmem(
14686 - __pa(xen_start_info->mfn_list),
14687 - PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
14688 - sizeof(unsigned long))));
14689 -
14690 - /*
14691 - * Initialise the list of the frames that specify the
14692 - * list of frames that make up the p2m table. Used by
14693 - * save/restore.
14694 - */
14695 - pfn_to_mfn_frame_list_list = alloc_bootmem_pages(PAGE_SIZE);
14696 -
14697 - fpp = PAGE_SIZE/sizeof(unsigned long);
14698 - for (i=0, j=0, k=-1; i< end_pfn; i+=fpp, j++) {
14699 - if ((j % fpp) == 0) {
14700 - k++;
14701 - BUG_ON(k>=fpp);
14702 - pfn_to_mfn_frame_list[k] =
14703 - alloc_bootmem_pages(PAGE_SIZE);
14704 - pfn_to_mfn_frame_list_list[k] =
14705 - virt_to_mfn(pfn_to_mfn_frame_list[k]);
14706 - j=0;
14707 - }
14708 - pfn_to_mfn_frame_list[k][j] =
14709 - virt_to_mfn(&phys_to_machine_mapping[i]);
14710 - }
14711 - HYPERVISOR_shared_info->arch.max_pfn = end_pfn;
14712 - HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
14713 - virt_to_mfn(pfn_to_mfn_frame_list_list);
14714 - }
14715 -
14716 - /* Mark all ISA DMA channels in-use - using them wouldn't work. */
14717 - for (i = 0; i < MAX_DMA_CHANNELS; ++i)
14718 - if (i != 4 && request_dma(i, "xen") != 0)
14719 - BUG();
14720 - }
14721 -
14722 -#ifdef CONFIG_ACPI
14723 - if (!is_initial_xendomain()) {
14724 - acpi_disabled = 1;
14725 - acpi_ht = 0;
14726 - }
14727 -#endif
14728 -#endif
14729 -
14730 -#ifndef CONFIG_XEN
14731 - early_quirks();
14732 -#endif
14733 -
14734 -#ifdef CONFIG_ACPI
14735 - /*
14736 - * Initialize the ACPI boot-time table parser (gets the RSDP and SDT).
14737 - * Call this early for SRAT node setup.
14738 - */
14739 - acpi_boot_table_init();
14740 -
14741 - /*
14742 - * Read APIC and some other early information from ACPI tables.
14743 - */
14744 - acpi_boot_init();
14745 -#endif
14746 -
14747 - init_cpu_to_node();
14748 -
14749 -#ifdef CONFIG_X86_LOCAL_APIC
14750 - /*
14751 - * get boot-time SMP configuration:
14752 - */
14753 - if (smp_found_config)
14754 - get_smp_config();
14755 -#ifndef CONFIG_XEN
14756 - init_apic_mappings();
14757 - ioapic_init_mappings();
14758 -#endif
14759 -#endif
14760 -#if defined(CONFIG_XEN) && defined(CONFIG_SMP) && !defined(CONFIG_HOTPLUG_CPU)
14761 - prefill_possible_map();
14762 -#endif
14763 -
14764 - kvm_guest_init();
14765 -
14766 - /*
14767 - * We trust e820 completely. No explicit ROM probing in memory.
14768 - */
14769 -#ifdef CONFIG_XEN
14770 - if (is_initial_xendomain())
14771 - e820_reserve_resources(machine_e820.map, machine_e820.nr_map);
14772 -#else
14773 - e820_reserve_resources(e820.map, e820.nr_map);
14774 - e820_mark_nosave_regions();
14775 -#endif
14776 -
14777 - /* request I/O space for devices used on all i[345]86 PCs */
14778 - for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
14779 - request_resource(&ioport_resource, &standard_io_resources[i]);
14780 -
14781 -#ifdef CONFIG_XEN
14782 - if (is_initial_xendomain())
14783 - e820_setup_gap(machine_e820.map, machine_e820.nr_map);
14784 -#else
14785 - e820_setup_gap(e820.map, e820.nr_map);
14786 -#endif
14787 -
14788 -#ifdef CONFIG_XEN
14789 - {
14790 - struct physdev_set_iopl set_iopl;
14791 -
14792 - set_iopl.iopl = 1;
14793 - WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
14794 -
14795 - if (is_initial_xendomain()) {
14796 -#ifdef CONFIG_VT
14797 -#if defined(CONFIG_VGA_CONSOLE)
14798 - conswitchp = &vga_con;
14799 -#elif defined(CONFIG_DUMMY_CONSOLE)
14800 - conswitchp = &dummy_con;
14801 -#endif
14802 -#endif
14803 - } else {
14804 -#if defined(CONFIG_VT) && defined(CONFIG_DUMMY_CONSOLE)
14805 - conswitchp = &dummy_con;
14806 -#endif
14807 - }
14808 - }
14809 -#else /* CONFIG_XEN */
14810 -
14811 -#ifdef CONFIG_VT
14812 -#if defined(CONFIG_VGA_CONSOLE)
14813 - if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
14814 - conswitchp = &vga_con;
14815 -#elif defined(CONFIG_DUMMY_CONSOLE)
14816 - conswitchp = &dummy_con;
14817 -#endif
14818 -#endif
14819 -
14820 -#endif /* !CONFIG_XEN */
14821 -
14822 - /* do this before identify_cpu for boot cpu */
14823 - check_enable_amd_mmconf_dmi();
14824 -}
14825 -
14826 -#ifdef CONFIG_XEN
14827 -static int
14828 -xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
14829 -{
14830 - HYPERVISOR_shutdown(SHUTDOWN_crash);
14831 - /* we're never actually going to get here... */
14832 - return NOTIFY_DONE;
14833 -}
14834 -#endif /* !CONFIG_XEN */
14835 -
14836 -
14837 -static int __cpuinit get_model_name(struct cpuinfo_x86 *c)
14838 -{
14839 - unsigned int *v;
14840 -
14841 - if (c->extended_cpuid_level < 0x80000004)
14842 - return 0;
14843 -
14844 - v = (unsigned int *) c->x86_model_id;
14845 - cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
14846 - cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
14847 - cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
14848 - c->x86_model_id[48] = 0;
14849 - return 1;
14850 -}
14851 -
14852 -
14853 -static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
14854 -{
14855 - unsigned int n, dummy, eax, ebx, ecx, edx;
14856 -
14857 - n = c->extended_cpuid_level;
14858 -
14859 - if (n >= 0x80000005) {
14860 - cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
14861 - printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), "
14862 - "D cache %dK (%d bytes/line)\n",
14863 - edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
14864 - c->x86_cache_size = (ecx>>24) + (edx>>24);
14865 - /* On K8 L1 TLB is inclusive, so don't count it */
14866 - c->x86_tlbsize = 0;
14867 - }
14868 -
14869 - if (n >= 0x80000006) {
14870 - cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
14871 - ecx = cpuid_ecx(0x80000006);
14872 - c->x86_cache_size = ecx >> 16;
14873 - c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
14874 -
14875 - printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
14876 - c->x86_cache_size, ecx & 0xFF);
14877 - }
14878 - if (n >= 0x80000008) {
14879 - cpuid(0x80000008, &eax, &dummy, &dummy, &dummy);
14880 - c->x86_virt_bits = (eax >> 8) & 0xff;
14881 - c->x86_phys_bits = eax & 0xff;
14882 - }
14883 -}
14884 -
14885 -#ifdef CONFIG_NUMA
14886 -static int __cpuinit nearby_node(int apicid)
14887 -{
14888 - int i, node;
14889 -
14890 - for (i = apicid - 1; i >= 0; i--) {
14891 - node = apicid_to_node[i];
14892 - if (node != NUMA_NO_NODE && node_online(node))
14893 - return node;
14894 - }
14895 - for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
14896 - node = apicid_to_node[i];
14897 - if (node != NUMA_NO_NODE && node_online(node))
14898 - return node;
14899 - }
14900 - return first_node(node_online_map); /* Shouldn't happen */
14901 -}
14902 -#endif
14903 -
14904 -/*
14905 - * On a AMD dual core setup the lower bits of the APIC id distingush the cores.
14906 - * Assumes number of cores is a power of two.
14907 - */
14908 -static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
14909 -{
14910 -#ifdef CONFIG_SMP
14911 - unsigned bits;
14912 -#ifdef CONFIG_NUMA
14913 - int cpu = smp_processor_id();
14914 - int node = 0;
14915 - unsigned apicid = hard_smp_processor_id();
14916 -#endif
14917 - bits = c->x86_coreid_bits;
14918 -
14919 - /* Low order bits define the core id (index of core in socket) */
14920 - c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
14921 - /* Convert the initial APIC ID into the socket ID */
14922 - c->phys_proc_id = c->initial_apicid >> bits;
14923 -
14924 -#ifdef CONFIG_NUMA
14925 - node = c->phys_proc_id;
14926 - if (apicid_to_node[apicid] != NUMA_NO_NODE)
14927 - node = apicid_to_node[apicid];
14928 - if (!node_online(node)) {
14929 - /* Two possibilities here:
14930 - - The CPU is missing memory and no node was created.
14931 - In that case try picking one from a nearby CPU
14932 - - The APIC IDs differ from the HyperTransport node IDs
14933 - which the K8 northbridge parsing fills in.
14934 - Assume they are all increased by a constant offset,
14935 - but in the same order as the HT nodeids.
14936 - If that doesn't result in a usable node fall back to the
14937 - path for the previous case. */
14938 -
14939 - int ht_nodeid = c->initial_apicid;
14940 -
14941 - if (ht_nodeid >= 0 &&
14942 - apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
14943 - node = apicid_to_node[ht_nodeid];
14944 - /* Pick a nearby node */
14945 - if (!node_online(node))
14946 - node = nearby_node(apicid);
14947 - }
14948 - numa_set_node(cpu, node);
14949 -
14950 - printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
14951 -#endif
14952 -#endif
14953 -}
14954 -
14955 -static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c)
14956 -{
14957 -#ifdef CONFIG_SMP
14958 - unsigned bits, ecx;
14959 -
14960 - /* Multi core CPU? */
14961 - if (c->extended_cpuid_level < 0x80000008)
14962 - return;
14963 -
14964 - ecx = cpuid_ecx(0x80000008);
14965 -
14966 - c->x86_max_cores = (ecx & 0xff) + 1;
14967 -
14968 - /* CPU telling us the core id bits shift? */
14969 - bits = (ecx >> 12) & 0xF;
14970 -
14971 - /* Otherwise recompute */
14972 - if (bits == 0) {
14973 - while ((1 << bits) < c->x86_max_cores)
14974 - bits++;
14975 - }
14976 -
14977 - c->x86_coreid_bits = bits;
14978 -
14979 -#endif
14980 -}
14981 -
14982 -#define ENABLE_C1E_MASK 0x18000000
14983 -#define CPUID_PROCESSOR_SIGNATURE 1
14984 -#define CPUID_XFAM 0x0ff00000
14985 -#define CPUID_XFAM_K8 0x00000000
14986 -#define CPUID_XFAM_10H 0x00100000
14987 -#define CPUID_XFAM_11H 0x00200000
14988 -#define CPUID_XMOD 0x000f0000
14989 -#define CPUID_XMOD_REV_F 0x00040000
14990 -
14991 -#ifndef CONFIG_XEN
14992 -/* AMD systems with C1E don't have a working lAPIC timer. Check for that. */
14993 -static __cpuinit int amd_apic_timer_broken(void)
14994 -{
14995 - u32 lo, hi, eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
14996 -
14997 - switch (eax & CPUID_XFAM) {
14998 - case CPUID_XFAM_K8:
14999 - if ((eax & CPUID_XMOD) < CPUID_XMOD_REV_F)
15000 - break;
15001 - case CPUID_XFAM_10H:
15002 - case CPUID_XFAM_11H:
15003 - rdmsr(MSR_K8_ENABLE_C1E, lo, hi);
15004 - if (lo & ENABLE_C1E_MASK)
15005 - return 1;
15006 - break;
15007 - default:
15008 - /* err on the side of caution */
15009 - return 1;
15010 - }
15011 - return 0;
15012 -}
15013 -#endif
15014 -
15015 -static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
15016 -{
15017 - early_init_amd_mc(c);
15018 -
15019 - /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
15020 - if (c->x86_power & (1<<8))
15021 - set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
15022 -}
15023 -
15024 -static void __cpuinit init_amd(struct cpuinfo_x86 *c)
15025 -{
15026 - unsigned level;
15027 -
15028 -#ifdef CONFIG_SMP
15029 - unsigned long value;
15030 -
15031 - /*
15032 - * Disable TLB flush filter by setting HWCR.FFDIS on K8
15033 - * bit 6 of msr C001_0015
15034 - *
15035 - * Errata 63 for SH-B3 steppings
15036 - * Errata 122 for all steppings (F+ have it disabled by default)
15037 - */
15038 - if (c->x86 == 15) {
15039 - rdmsrl(MSR_K8_HWCR, value);
15040 - value |= 1 << 6;
15041 - wrmsrl(MSR_K8_HWCR, value);
15042 - }
15043 -#endif
15044 -
15045 - /* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
15046 - 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
15047 - clear_cpu_cap(c, 0*32+31);
15048 -
15049 - /* On C+ stepping K8 rep microcode works well for copy/memset */
15050 - level = cpuid_eax(1);
15051 - if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) ||
15052 - level >= 0x0f58))
15053 - set_cpu_cap(c, X86_FEATURE_REP_GOOD);
15054 - if (c->x86 == 0x10 || c->x86 == 0x11)
15055 - set_cpu_cap(c, X86_FEATURE_REP_GOOD);
15056 -
15057 - /* Enable workaround for FXSAVE leak */
15058 - if (c->x86 >= 6)
15059 - set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
15060 -
15061 - level = get_model_name(c);
15062 - if (!level) {
15063 - switch (c->x86) {
15064 - case 15:
15065 - /* Should distinguish Models here, but this is only
15066 - a fallback anyways. */
15067 - strcpy(c->x86_model_id, "Hammer");
15068 - break;
15069 - }
15070 - }
15071 - display_cacheinfo(c);
15072 -
15073 - /* Multi core CPU? */
15074 - if (c->extended_cpuid_level >= 0x80000008)
15075 - amd_detect_cmp(c);
15076 -
15077 - if (c->extended_cpuid_level >= 0x80000006 &&
15078 - (cpuid_edx(0x80000006) & 0xf000))
15079 - num_cache_leaves = 4;
15080 - else
15081 - num_cache_leaves = 3;
15082 -
15083 - if (c->x86 == 0xf || c->x86 == 0x10 || c->x86 == 0x11)
15084 - set_cpu_cap(c, X86_FEATURE_K8);
15085 -
15086 - /* MFENCE stops RDTSC speculation */
15087 - set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
15088 -
15089 - if (c->x86 == 0x10)
15090 - fam10h_check_enable_mmcfg();
15091 -
15092 -#ifndef CONFIG_XEN
15093 - if (amd_apic_timer_broken())
15094 - disable_apic_timer = 1;
15095 -
15096 - if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
15097 - unsigned long long tseg;
15098 -
15099 - /*
15100 - * Split up direct mapping around the TSEG SMM area.
15101 - * Don't do it for gbpages because there seems very little
15102 - * benefit in doing so.
15103 - */
15104 - if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg) &&
15105 - (tseg >> PMD_SHIFT) < (max_pfn_mapped >> (PMD_SHIFT-PAGE_SHIFT)))
15106 - set_memory_4k((unsigned long)__va(tseg), 1);
15107 - }
15108 -#endif
15109 -}
15110 -
15111 -void __cpuinit detect_ht(struct cpuinfo_x86 *c)
15112 -{
15113 -#ifdef CONFIG_SMP
15114 - u32 eax, ebx, ecx, edx;
15115 - int index_msb, core_bits;
15116 -
15117 - cpuid(1, &eax, &ebx, &ecx, &edx);
15118 -
15119 -
15120 - if (!cpu_has(c, X86_FEATURE_HT))
15121 - return;
15122 - if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
15123 - goto out;
15124 -
15125 - smp_num_siblings = (ebx & 0xff0000) >> 16;
15126 -
15127 - if (smp_num_siblings == 1) {
15128 - printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
15129 - } else if (smp_num_siblings > 1) {
15130 -
15131 - if (smp_num_siblings > NR_CPUS) {
15132 - printk(KERN_WARNING "CPU: Unsupported number of "
15133 - "siblings %d", smp_num_siblings);
15134 - smp_num_siblings = 1;
15135 - return;
15136 - }
15137 -
15138 - index_msb = get_count_order(smp_num_siblings);
15139 - c->phys_proc_id = phys_pkg_id(index_msb);
15140 -
15141 - smp_num_siblings = smp_num_siblings / c->x86_max_cores;
15142 -
15143 - index_msb = get_count_order(smp_num_siblings);
15144 -
15145 - core_bits = get_count_order(c->x86_max_cores);
15146 -
15147 - c->cpu_core_id = phys_pkg_id(index_msb) &
15148 - ((1 << core_bits) - 1);
15149 - }
15150 -out:
15151 - if ((c->x86_max_cores * smp_num_siblings) > 1) {
15152 - printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
15153 - c->phys_proc_id);
15154 - printk(KERN_INFO "CPU: Processor Core ID: %d\n",
15155 - c->cpu_core_id);
15156 - }
15157 -
15158 -#endif
15159 -}
15160 -
15161 -/*
15162 - * find out the number of processor cores on the die
15163 - */
15164 -static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
15165 -{
15166 - unsigned int eax, t;
15167 -
15168 - if (c->cpuid_level < 4)
15169 - return 1;
15170 -
15171 - cpuid_count(4, 0, &eax, &t, &t, &t);
15172 -
15173 - if (eax & 0x1f)
15174 - return ((eax >> 26) + 1);
15175 - else
15176 - return 1;
15177 -}
15178 -
15179 -static void __cpuinit srat_detect_node(void)
15180 -{
15181 -#ifdef CONFIG_NUMA
15182 - unsigned node;
15183 - int cpu = smp_processor_id();
15184 - int apicid = hard_smp_processor_id();
15185 -
15186 - /* Don't do the funky fallback heuristics the AMD version employs
15187 - for now. */
15188 - node = apicid_to_node[apicid];
15189 - if (node == NUMA_NO_NODE || !node_online(node))
15190 - node = first_node(node_online_map);
15191 - numa_set_node(cpu, node);
15192 -
15193 - printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
15194 -#endif
15195 -}
15196 -
15197 -static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
15198 -{
15199 - if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
15200 - (c->x86 == 0x6 && c->x86_model >= 0x0e))
15201 - set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
15202 -}
15203 -
15204 -static void __cpuinit init_intel(struct cpuinfo_x86 *c)
15205 -{
15206 - /* Cache sizes */
15207 - unsigned n;
15208 -
15209 - init_intel_cacheinfo(c);
15210 - if (c->cpuid_level > 9) {
15211 - unsigned eax = cpuid_eax(10);
15212 - /* Check for version and the number of counters */
15213 - if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
15214 - set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
15215 - }
15216 -
15217 - if (cpu_has_ds) {
15218 - unsigned int l1, l2;
15219 - rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
15220 - if (!(l1 & (1<<11)))
15221 - set_cpu_cap(c, X86_FEATURE_BTS);
15222 - if (!(l1 & (1<<12)))
15223 - set_cpu_cap(c, X86_FEATURE_PEBS);
15224 - }
15225 -
15226 -
15227 - if (cpu_has_bts)
15228 - ds_init_intel(c);
15229 -
15230 - n = c->extended_cpuid_level;
15231 - if (n >= 0x80000008) {
15232 - unsigned eax = cpuid_eax(0x80000008);
15233 - c->x86_virt_bits = (eax >> 8) & 0xff;
15234 - c->x86_phys_bits = eax & 0xff;
15235 - /* CPUID workaround for Intel 0F34 CPU */
15236 - if (c->x86_vendor == X86_VENDOR_INTEL &&
15237 - c->x86 == 0xF && c->x86_model == 0x3 &&
15238 - c->x86_mask == 0x4)
15239 - c->x86_phys_bits = 36;
15240 - }
15241 -
15242 - if (c->x86 == 15)
15243 - c->x86_cache_alignment = c->x86_clflush_size * 2;
15244 - if (c->x86 == 6)
15245 - set_cpu_cap(c, X86_FEATURE_REP_GOOD);
15246 - set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
15247 - c->x86_max_cores = intel_num_cpu_cores(c);
15248 -
15249 - srat_detect_node();
15250 -}
15251 -
15252 -static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
15253 -{
15254 - if (c->x86 == 0x6 && c->x86_model >= 0xf)
15255 - set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
15256 -}
15257 -
15258 -static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
15259 -{
15260 - /* Cache sizes */
15261 - unsigned n;
15262 -
15263 - n = c->extended_cpuid_level;
15264 - if (n >= 0x80000008) {
15265 - unsigned eax = cpuid_eax(0x80000008);
15266 - c->x86_virt_bits = (eax >> 8) & 0xff;
15267 - c->x86_phys_bits = eax & 0xff;
15268 - }
15269 -
15270 - if (c->x86 == 0x6 && c->x86_model >= 0xf) {
15271 - c->x86_cache_alignment = c->x86_clflush_size * 2;
15272 - set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
15273 - set_cpu_cap(c, X86_FEATURE_REP_GOOD);
15274 - }
15275 - set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
15276 -}
15277 -
15278 -static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
15279 -{
15280 - char *v = c->x86_vendor_id;
15281 -
15282 - if (!strcmp(v, "AuthenticAMD"))
15283 - c->x86_vendor = X86_VENDOR_AMD;
15284 - else if (!strcmp(v, "GenuineIntel"))
15285 - c->x86_vendor = X86_VENDOR_INTEL;
15286 - else if (!strcmp(v, "CentaurHauls"))
15287 - c->x86_vendor = X86_VENDOR_CENTAUR;
15288 - else
15289 - c->x86_vendor = X86_VENDOR_UNKNOWN;
15290 -}
15291 -
15292 -/* Do some early cpuid on the boot CPU to get some parameter that are
15293 - needed before check_bugs. Everything advanced is in identify_cpu
15294 - below. */
15295 -static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
15296 -{
15297 - u32 tfms, xlvl;
15298 -
15299 - c->loops_per_jiffy = loops_per_jiffy;
15300 - c->x86_cache_size = -1;
15301 - c->x86_vendor = X86_VENDOR_UNKNOWN;
15302 - c->x86_model = c->x86_mask = 0; /* So far unknown... */
15303 - c->x86_vendor_id[0] = '\0'; /* Unset */
15304 - c->x86_model_id[0] = '\0'; /* Unset */
15305 - c->x86_clflush_size = 64;
15306 - c->x86_cache_alignment = c->x86_clflush_size;
15307 - c->x86_max_cores = 1;
15308 - c->x86_coreid_bits = 0;
15309 - c->extended_cpuid_level = 0;
15310 - memset(&c->x86_capability, 0, sizeof c->x86_capability);
15311 -
15312 - /* Get vendor name */
15313 - cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
15314 - (unsigned int *)&c->x86_vendor_id[0],
15315 - (unsigned int *)&c->x86_vendor_id[8],
15316 - (unsigned int *)&c->x86_vendor_id[4]);
15317 -
15318 - get_cpu_vendor(c);
15319 -
15320 - /* Initialize the standard set of capabilities */
15321 - /* Note that the vendor-specific code below might override */
15322 -
15323 - /* Intel-defined flags: level 0x00000001 */
15324 - if (c->cpuid_level >= 0x00000001) {
15325 - __u32 misc;
15326 - cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4],
15327 - &c->x86_capability[0]);
15328 - c->x86 = (tfms >> 8) & 0xf;
15329 - c->x86_model = (tfms >> 4) & 0xf;
15330 - c->x86_mask = tfms & 0xf;
15331 - if (c->x86 == 0xf)
15332 - c->x86 += (tfms >> 20) & 0xff;
15333 - if (c->x86 >= 0x6)
15334 - c->x86_model += ((tfms >> 16) & 0xF) << 4;
15335 - if (test_cpu_cap(c, X86_FEATURE_CLFLSH))
15336 - c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
15337 - } else {
15338 - /* Have CPUID level 0 only - unheard of */
15339 - c->x86 = 4;
15340 - }
15341 -
15342 - c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xff;
15343 -#ifdef CONFIG_SMP
15344 - c->phys_proc_id = c->initial_apicid;
15345 -#endif
15346 - /* AMD-defined flags: level 0x80000001 */
15347 - xlvl = cpuid_eax(0x80000000);
15348 - c->extended_cpuid_level = xlvl;
15349 - if ((xlvl & 0xffff0000) == 0x80000000) {
15350 - if (xlvl >= 0x80000001) {
15351 - c->x86_capability[1] = cpuid_edx(0x80000001);
15352 - c->x86_capability[6] = cpuid_ecx(0x80000001);
15353 - }
15354 - if (xlvl >= 0x80000004)
15355 - get_model_name(c); /* Default name */
15356 - }
15357 -
15358 - /* Transmeta-defined flags: level 0x80860001 */
15359 - xlvl = cpuid_eax(0x80860000);
15360 - if ((xlvl & 0xffff0000) == 0x80860000) {
15361 - /* Don't set x86_cpuid_level here for now to not confuse. */
15362 - if (xlvl >= 0x80860001)
15363 - c->x86_capability[2] = cpuid_edx(0x80860001);
15364 - }
15365 -
15366 - c->extended_cpuid_level = cpuid_eax(0x80000000);
15367 - if (c->extended_cpuid_level >= 0x80000007)
15368 - c->x86_power = cpuid_edx(0x80000007);
15369 -
15370 - switch (c->x86_vendor) {
15371 - case X86_VENDOR_AMD:
15372 - early_init_amd(c);
15373 - break;
15374 - case X86_VENDOR_INTEL:
15375 - early_init_intel(c);
15376 - break;
15377 - case X86_VENDOR_CENTAUR:
15378 - early_init_centaur(c);
15379 - break;
15380 - }
15381 -
15382 - validate_pat_support(c);
15383 -}
15384 -
15385 -/*
15386 - * This does the hard work of actually picking apart the CPU stuff...
15387 - */
15388 -void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
15389 -{
15390 - int i;
15391 -
15392 - early_identify_cpu(c);
15393 -
15394 - init_scattered_cpuid_features(c);
15395 -
15396 - c->apicid = phys_pkg_id(0);
15397 -
15398 - /*
15399 - * Vendor-specific initialization. In this section we
15400 - * canonicalize the feature flags, meaning if there are
15401 - * features a certain CPU supports which CPUID doesn't
15402 - * tell us, CPUID claiming incorrect flags, or other bugs,
15403 - * we handle them here.
15404 - *
15405 - * At the end of this section, c->x86_capability better
15406 - * indicate the features this CPU genuinely supports!
15407 - */
15408 - switch (c->x86_vendor) {
15409 - case X86_VENDOR_AMD:
15410 - init_amd(c);
15411 - break;
15412 -
15413 - case X86_VENDOR_INTEL:
15414 - init_intel(c);
15415 - break;
15416 -
15417 - case X86_VENDOR_CENTAUR:
15418 - init_centaur(c);
15419 - break;
15420 -
15421 - case X86_VENDOR_UNKNOWN:
15422 - default:
15423 - display_cacheinfo(c);
15424 - break;
15425 - }
15426 -
15427 - detect_ht(c);
15428 -
15429 - /*
15430 - * On SMP, boot_cpu_data holds the common feature set between
15431 - * all CPUs; so make sure that we indicate which features are
15432 - * common between the CPUs. The first time this routine gets
15433 - * executed, c == &boot_cpu_data.
15434 - */
15435 - if (c != &boot_cpu_data) {
15436 - /* AND the already accumulated flags with these */
15437 - for (i = 0; i < NCAPINTS; i++)
15438 - boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
15439 - }
15440 -
15441 - /* Clear all flags overriden by options */
15442 - for (i = 0; i < NCAPINTS; i++)
15443 - c->x86_capability[i] &= ~cleared_cpu_caps[i];
15444 -
15445 -#ifdef CONFIG_X86_MCE
15446 - mcheck_init(c);
15447 -#endif
15448 - select_idle_routine(c);
15449 -
15450 -#ifdef CONFIG_NUMA
15451 - numa_add_cpu(smp_processor_id());
15452 -#endif
15453 -
15454 -}
15455 -
15456 -void __cpuinit identify_boot_cpu(void)
15457 -{
15458 - identify_cpu(&boot_cpu_data);
15459 -}
15460 -
15461 -void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
15462 -{
15463 - BUG_ON(c == &boot_cpu_data);
15464 - identify_cpu(c);
15465 - mtrr_ap_init();
15466 -}
15467 -
15468 -static __init int setup_noclflush(char *arg)
15469 -{
15470 - setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
15471 - return 1;
15472 -}
15473 -__setup("noclflush", setup_noclflush);
15474 -
15475 -void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
15476 -{
15477 - if (c->x86_model_id[0])
15478 - printk(KERN_CONT "%s", c->x86_model_id);
15479 -
15480 - if (c->x86_mask || c->cpuid_level >= 0)
15481 - printk(KERN_CONT " stepping %02x\n", c->x86_mask);
15482 - else
15483 - printk(KERN_CONT "\n");
15484 -}
15485 -
15486 -static __init int setup_disablecpuid(char *arg)
15487 -{
15488 - int bit;
15489 - if (get_option(&arg, &bit) && bit < NCAPINTS*32)
15490 - setup_clear_cpu_cap(bit);
15491 - else
15492 - return 0;
15493 - return 1;
15494 -}
15495 -__setup("clearcpuid=", setup_disablecpuid);
15496 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
15497 +++ sle11-2009-10-16/arch/x86/kernel/setup_percpu-xen.c 2009-06-04 10:21:39.000000000 +0200
15498 @@ -0,0 +1,385 @@
15499 +#include <linux/kernel.h>
15500 +#include <linux/module.h>
15501 +#include <linux/init.h>
15502 +#include <linux/bootmem.h>
15503 +#include <linux/percpu.h>
15504 +#include <linux/kexec.h>
15505 +#include <linux/crash_dump.h>
15506 +#include <asm/smp.h>
15507 +#include <asm/percpu.h>
15508 +#include <asm/sections.h>
15509 +#include <asm/processor.h>
15510 +#include <asm/setup.h>
15511 +#include <asm/topology.h>
15512 +#include <asm/mpspec.h>
15513 +#include <asm/apicdef.h>
15514 +#include <asm/highmem.h>
15515 +
15516 +#ifdef CONFIG_X86_LOCAL_APIC
15517 +unsigned int num_processors;
15518 +unsigned disabled_cpus __cpuinitdata;
15519 +/* Processor that is doing the boot up */
15520 +unsigned int boot_cpu_physical_apicid = -1U;
15521 +unsigned int max_physical_apicid;
15522 +EXPORT_SYMBOL(boot_cpu_physical_apicid);
15523 +
15524 +/* Bitmask of physically existing CPUs */
15525 +physid_mask_t phys_cpu_present_map;
15526 +#endif
15527 +
15528 +/* map cpu index to physical APIC ID */
15529 +#ifndef CONFIG_XEN
15530 +DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID);
15531 +DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID);
15532 +EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
15533 +EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
15534 +#else
15535 +DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID;
15536 +EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid);
15537 +#endif
15538 +
15539 +#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
15540 +#define X86_64_NUMA 1
15541 +
15542 +/* map cpu index to node index */
15543 +DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
15544 +EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
15545 +
15546 +/* which logical CPUs are on which nodes */
15547 +cpumask_t *node_to_cpumask_map;
15548 +EXPORT_SYMBOL(node_to_cpumask_map);
15549 +
15550 +/* setup node_to_cpumask_map */
15551 +static void __init setup_node_to_cpumask_map(void);
15552 +
15553 +#else
15554 +static inline void setup_node_to_cpumask_map(void) { }
15555 +#endif
15556 +
15557 +#if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_X86_SMP)
15558 +/*
15559 + * Copy data used in early init routines from the initial arrays to the
15560 + * per cpu data areas. These arrays then become expendable and the
15561 + * *_early_ptr's are zeroed indicating that the static arrays are gone.
15562 + */
15563 +static void __init setup_per_cpu_maps(void)
15564 +{
15565 +#ifndef CONFIG_XEN
15566 + int cpu;
15567 +
15568 + for_each_possible_cpu(cpu) {
15569 + per_cpu(x86_cpu_to_apicid, cpu) =
15570 + early_per_cpu_map(x86_cpu_to_apicid, cpu);
15571 + per_cpu(x86_bios_cpu_apicid, cpu) =
15572 + early_per_cpu_map(x86_bios_cpu_apicid, cpu);
15573 +#ifdef X86_64_NUMA
15574 + per_cpu(x86_cpu_to_node_map, cpu) =
15575 + early_per_cpu_map(x86_cpu_to_node_map, cpu);
15576 +#endif
15577 + }
15578 +
15579 + /* indicate the early static arrays will soon be gone */
15580 + early_per_cpu_ptr(x86_cpu_to_apicid) = NULL;
15581 + early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
15582 +#ifdef X86_64_NUMA
15583 + early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
15584 +#endif
15585 +#endif
15586 +}
15587 +
15588 +#ifdef CONFIG_X86_32
15589 +/*
15590 + * Great future not-so-futuristic plan: make i386 and x86_64 do it
15591 + * the same way
15592 + */
15593 +unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
15594 +EXPORT_SYMBOL(__per_cpu_offset);
15595 +static inline void setup_cpu_pda_map(void) { }
15596 +
15597 +#elif !defined(CONFIG_SMP)
15598 +static inline void setup_cpu_pda_map(void) { }
15599 +
15600 +#else /* CONFIG_SMP && CONFIG_X86_64 */
15601 +
15602 +/*
15603 + * Allocate cpu_pda pointer table and array via alloc_bootmem.
15604 + */
15605 +static void __init setup_cpu_pda_map(void)
15606 +{
15607 + char *pda;
15608 + struct x8664_pda **new_cpu_pda;
15609 + unsigned long size;
15610 + int cpu;
15611 +
15612 + size = roundup(sizeof(struct x8664_pda), cache_line_size());
15613 +
15614 + /* allocate cpu_pda array and pointer table */
15615 + {
15616 + unsigned long tsize = nr_cpu_ids * sizeof(void *);
15617 + unsigned long asize = size * (nr_cpu_ids - 1);
15618 +
15619 + tsize = roundup(tsize, cache_line_size());
15620 + new_cpu_pda = alloc_bootmem(tsize + asize);
15621 + pda = (char *)new_cpu_pda + tsize;
15622 + }
15623 +
15624 + /* initialize pointer table to static pda's */
15625 + for_each_possible_cpu(cpu) {
15626 + if (cpu == 0) {
15627 + /* leave boot cpu pda in place */
15628 + new_cpu_pda[0] = cpu_pda(0);
15629 + continue;
15630 + }
15631 + new_cpu_pda[cpu] = (struct x8664_pda *)pda;
15632 + new_cpu_pda[cpu]->in_bootmem = 1;
15633 + pda += size;
15634 + }
15635 +
15636 + /* point to new pointer table */
15637 + _cpu_pda = new_cpu_pda;
15638 +}
15639 +#endif
15640 +
15641 +/*
15642 + * Great future plan:
15643 + * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
15644 + * Always point %gs to its beginning
15645 + */
15646 +void __init setup_per_cpu_areas(void)
15647 +{
15648 + ssize_t size = PERCPU_ENOUGH_ROOM;
15649 + char *ptr;
15650 + int cpu;
15651 +
15652 + /* Setup cpu_pda map */
15653 + setup_cpu_pda_map();
15654 +
15655 + /* Copy section for each CPU (we discard the original) */
15656 + size = PERCPU_ENOUGH_ROOM;
15657 + printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n",
15658 + size);
15659 +
15660 + for_each_possible_cpu(cpu) {
15661 +#ifndef CONFIG_NEED_MULTIPLE_NODES
15662 + ptr = alloc_bootmem_pages(size);
15663 +#else
15664 + int node = early_cpu_to_node(cpu);
15665 + if (!node_online(node) || !NODE_DATA(node)) {
15666 + ptr = alloc_bootmem_pages(size);
15667 + printk(KERN_INFO
15668 + "cpu %d has no node %d or node-local memory\n",
15669 + cpu, node);
15670 + }
15671 + else
15672 + ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
15673 +#endif
15674 + per_cpu_offset(cpu) = ptr - __per_cpu_start;
15675 + memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
15676 +
15677 + }
15678 +
15679 + printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n",
15680 + NR_CPUS, nr_cpu_ids, nr_node_ids);
15681 +
15682 + /* Setup percpu data maps */
15683 + setup_per_cpu_maps();
15684 +
15685 + /* Setup node to cpumask map */
15686 + setup_node_to_cpumask_map();
15687 +}
15688 +
15689 +#endif
15690 +
15691 +#ifdef X86_64_NUMA
15692 +
15693 +/*
15694 + * Allocate node_to_cpumask_map based on number of available nodes
15695 + * Requires node_possible_map to be valid.
15696 + *
15697 + * Note: node_to_cpumask() is not valid until after this is done.
15698 + */
15699 +static void __init setup_node_to_cpumask_map(void)
15700 +{
15701 + unsigned int node, num = 0;
15702 + cpumask_t *map;
15703 +
15704 + /* setup nr_node_ids if not done yet */
15705 + if (nr_node_ids == MAX_NUMNODES) {
15706 + for_each_node_mask(node, node_possible_map)
15707 + num = node;
15708 + nr_node_ids = num + 1;
15709 + }
15710 +
15711 + /* allocate the map */
15712 + map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t));
15713 +
15714 + pr_debug(KERN_DEBUG "Node to cpumask map at %p for %d nodes\n",
15715 + map, nr_node_ids);
15716 +
15717 + /* node_to_cpumask() will now work */
15718 + node_to_cpumask_map = map;
15719 +}
15720 +
15721 +void __cpuinit numa_set_node(int cpu, int node)
15722 +{
15723 + int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
15724 +
15725 + if (cpu_pda(cpu) && node != NUMA_NO_NODE)
15726 + cpu_pda(cpu)->nodenumber = node;
15727 +
15728 + if (cpu_to_node_map)
15729 + cpu_to_node_map[cpu] = node;
15730 +
15731 + else if (per_cpu_offset(cpu))
15732 + per_cpu(x86_cpu_to_node_map, cpu) = node;
15733 +
15734 + else
15735 + pr_debug("Setting node for non-present cpu %d\n", cpu);
15736 +}
15737 +
15738 +void __cpuinit numa_clear_node(int cpu)
15739 +{
15740 + numa_set_node(cpu, NUMA_NO_NODE);
15741 +}
15742 +
15743 +#ifndef CONFIG_DEBUG_PER_CPU_MAPS
15744 +
15745 +void __cpuinit numa_add_cpu(int cpu)
15746 +{
15747 + cpu_set(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
15748 +}
15749 +
15750 +void __cpuinit numa_remove_cpu(int cpu)
15751 +{
15752 + cpu_clear(cpu, node_to_cpumask_map[cpu_to_node(cpu)]);
15753 +}
15754 +
15755 +#else /* CONFIG_DEBUG_PER_CPU_MAPS */
15756 +
15757 +/*
15758 + * --------- debug versions of the numa functions ---------
15759 + */
15760 +static void __cpuinit numa_set_cpumask(int cpu, int enable)
15761 +{
15762 + int node = cpu_to_node(cpu);
15763 + cpumask_t *mask;
15764 + char buf[64];
15765 +
15766 + if (node_to_cpumask_map == NULL) {
15767 + printk(KERN_ERR "node_to_cpumask_map NULL\n");
15768 + dump_stack();
15769 + return;
15770 + }
15771 +
15772 + mask = &node_to_cpumask_map[node];
15773 + if (enable)
15774 + cpu_set(cpu, *mask);
15775 + else
15776 + cpu_clear(cpu, *mask);
15777 +
15778 + cpulist_scnprintf(buf, sizeof(buf), *mask);
15779 + printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
15780 + enable? "numa_add_cpu":"numa_remove_cpu", cpu, node, buf);
15781 + }
15782 +
15783 +void __cpuinit numa_add_cpu(int cpu)
15784 +{
15785 + numa_set_cpumask(cpu, 1);
15786 +}
15787 +
15788 +void __cpuinit numa_remove_cpu(int cpu)
15789 +{
15790 + numa_set_cpumask(cpu, 0);
15791 +}
15792 +
15793 +int cpu_to_node(int cpu)
15794 +{
15795 + if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
15796 + printk(KERN_WARNING
15797 + "cpu_to_node(%d): usage too early!\n", cpu);
15798 + dump_stack();
15799 + return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
15800 + }
15801 + return per_cpu(x86_cpu_to_node_map, cpu);
15802 +}
15803 +EXPORT_SYMBOL(cpu_to_node);
15804 +
15805 +/*
15806 + * Same function as cpu_to_node() but used if called before the
15807 + * per_cpu areas are setup.
15808 + */
15809 +int early_cpu_to_node(int cpu)
15810 +{
15811 + if (early_per_cpu_ptr(x86_cpu_to_node_map))
15812 + return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
15813 +
15814 + if (!per_cpu_offset(cpu)) {
15815 + printk(KERN_WARNING
15816 + "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
15817 + dump_stack();
15818 + return NUMA_NO_NODE;
15819 + }
15820 + return per_cpu(x86_cpu_to_node_map, cpu);
15821 +}
15822 +
15823 +
15824 +/* empty cpumask */
15825 +static const cpumask_t cpu_mask_none;
15826 +
15827 +/*
15828 + * Returns a pointer to the bitmask of CPUs on Node 'node'.
15829 + */
15830 +const cpumask_t *_node_to_cpumask_ptr(int node)
15831 +{
15832 + if (node_to_cpumask_map == NULL) {
15833 + printk(KERN_WARNING
15834 + "_node_to_cpumask_ptr(%d): no node_to_cpumask_map!\n",
15835 + node);
15836 + dump_stack();
15837 + return (const cpumask_t *)&cpu_online_map;
15838 + }
15839 + if (node >= nr_node_ids) {
15840 + printk(KERN_WARNING
15841 + "_node_to_cpumask_ptr(%d): node > nr_node_ids(%d)\n",
15842 + node, nr_node_ids);
15843 + dump_stack();
15844 + return &cpu_mask_none;
15845 + }
15846 + return &node_to_cpumask_map[node];
15847 +}
15848 +EXPORT_SYMBOL(_node_to_cpumask_ptr);
15849 +
15850 +/*
15851 + * Returns a bitmask of CPUs on Node 'node'.
15852 + *
15853 + * Side note: this function creates the returned cpumask on the stack
15854 + * so with a high NR_CPUS count, excessive stack space is used. The
15855 + * node_to_cpumask_ptr function should be used whenever possible.
15856 + */
15857 +cpumask_t node_to_cpumask(int node)
15858 +{
15859 + if (node_to_cpumask_map == NULL) {
15860 + printk(KERN_WARNING
15861 + "node_to_cpumask(%d): no node_to_cpumask_map!\n", node);
15862 + dump_stack();
15863 + return cpu_online_map;
15864 + }
15865 + if (node >= nr_node_ids) {
15866 + printk(KERN_WARNING
15867 + "node_to_cpumask(%d): node > nr_node_ids(%d)\n",
15868 + node, nr_node_ids);
15869 + dump_stack();
15870 + return cpu_mask_none;
15871 + }
15872 + return node_to_cpumask_map[node];
15873 +}
15874 +EXPORT_SYMBOL(node_to_cpumask);
15875 +
15876 +/*
15877 + * --------- end of debug versions of the numa functions ---------
15878 + */
15879 +
15880 +#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
15881 +
15882 +#endif /* X86_64_NUMA */
15883 +
15884 --- sle11-2009-10-16.orig/arch/x86/kernel/smp-xen.c 2009-03-16 16:38:05.000000000 +0100
15885 +++ sle11-2009-10-16/arch/x86/kernel/smp-xen.c 2009-06-04 10:21:39.000000000 +0200
15886 @@ -121,132 +121,14 @@ void xen_smp_send_reschedule(int cpu)
15887 send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
15888 }
15889
15890 -/*
15891 - * Structure and data for smp_call_function(). This is designed to minimise
15892 - * static memory requirements. It also looks cleaner.
15893 - */
15894 -static DEFINE_SPINLOCK(call_lock);
15895 -
15896 -struct call_data_struct {
15897 - void (*func) (void *info);
15898 - void *info;
15899 - atomic_t started;
15900 - atomic_t finished;
15901 - int wait;
15902 -};
15903 -
15904 -void lock_ipi_call_lock(void)
15905 +void xen_send_call_func_single_ipi(int cpu)
15906 {
15907 - spin_lock_irq(&call_lock);
15908 + send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNC_SINGLE_VECTOR);
15909 }
15910
15911 -void unlock_ipi_call_lock(void)
15912 +void xen_send_call_func_ipi(cpumask_t mask)
15913 {
15914 - spin_unlock_irq(&call_lock);
15915 -}
15916 -
15917 -static struct call_data_struct *call_data;
15918 -
15919 -static void __smp_call_function(void (*func) (void *info), void *info,
15920 - int nonatomic, int wait)
15921 -{
15922 - struct call_data_struct data;
15923 - int cpus = num_online_cpus() - 1;
15924 -
15925 - if (!cpus)
15926 - return;
15927 -
15928 - data.func = func;
15929 - data.info = info;
15930 - atomic_set(&data.started, 0);
15931 - data.wait = wait;
15932 - if (wait)
15933 - atomic_set(&data.finished, 0);
15934 -
15935 - call_data = &data;
15936 - mb();
15937 -
15938 - /* Send a message to all other CPUs and wait for them to respond */
15939 - send_IPI_allbutself(CALL_FUNCTION_VECTOR);
15940 -
15941 - /* Wait for response */
15942 - while (atomic_read(&data.started) != cpus)
15943 - cpu_relax();
15944 -
15945 - if (wait)
15946 - while (atomic_read(&data.finished) != cpus)
15947 - cpu_relax();
15948 -}
15949 -
15950 -
15951 -/**
15952 - * smp_call_function_mask(): Run a function on a set of other CPUs.
15953 - * @mask: The set of cpus to run on. Must not include the current cpu.
15954 - * @func: The function to run. This must be fast and non-blocking.
15955 - * @info: An arbitrary pointer to pass to the function.
15956 - * @wait: If true, wait (atomically) until function has completed on other CPUs.
15957 - *
15958 - * Returns 0 on success, else a negative status code.
15959 - *
15960 - * If @wait is true, then returns once @func has returned; otherwise
15961 - * it returns just before the target cpu calls @func.
15962 - *
15963 - * You must not call this function with disabled interrupts or from a
15964 - * hardware interrupt handler or from a bottom half handler.
15965 - */
15966 -int
15967 -xen_smp_call_function_mask(cpumask_t mask,
15968 - void (*func)(void *), void *info,
15969 - int wait)
15970 -{
15971 - struct call_data_struct data;
15972 - cpumask_t allbutself;
15973 - int cpus;
15974 -
15975 - /* Can deadlock when called with interrupts disabled */
15976 - WARN_ON(irqs_disabled());
15977 -
15978 - /* Holding any lock stops cpus from going down. */
15979 - spin_lock(&call_lock);
15980 -
15981 - allbutself = cpu_online_map;
15982 - cpu_clear(smp_processor_id(), allbutself);
15983 -
15984 - cpus_and(mask, mask, allbutself);
15985 - cpus = cpus_weight(mask);
15986 -
15987 - if (!cpus) {
15988 - spin_unlock(&call_lock);
15989 - return 0;
15990 - }
15991 -
15992 - data.func = func;
15993 - data.info = info;
15994 - atomic_set(&data.started, 0);
15995 - data.wait = wait;
15996 - if (wait)
15997 - atomic_set(&data.finished, 0);
15998 -
15999 - call_data = &data;
16000 - wmb();
16001 -
16002 - /* Send a message to other CPUs */
16003 - if (cpus_equal(mask, allbutself) &&
16004 - cpus_equal(cpu_online_map, cpu_callout_map))
16005 - send_IPI_allbutself(CALL_FUNCTION_VECTOR);
16006 - else
16007 - send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
16008 -
16009 - /* Wait for response */
16010 - while (atomic_read(&data.started) != cpus)
16011 - cpu_relax();
16012 -
16013 - if (wait)
16014 - while (atomic_read(&data.finished) != cpus)
16015 - cpu_relax();
16016 - spin_unlock(&call_lock);
16017 -
16018 - return 0;
16019 + send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
16020 }
16021
16022 static void stop_this_cpu(void *dummy)
16023 @@ -268,15 +150,10 @@ static void stop_this_cpu(void *dummy)
16024
16025 void xen_smp_send_stop(void)
16026 {
16027 - int nolock;
16028 unsigned long flags;
16029
16030 - /* Don't deadlock on the call lock in panic */
16031 - nolock = !spin_trylock(&call_lock);
16032 + smp_call_function(stop_this_cpu, NULL, 0);
16033 local_irq_save(flags);
16034 - __smp_call_function(stop_this_cpu, NULL, 0, 0);
16035 - if (!nolock)
16036 - spin_unlock(&call_lock);
16037 disable_all_local_evtchn();
16038 local_irq_restore(flags);
16039 }
16040 @@ -298,21 +175,8 @@ irqreturn_t smp_reschedule_interrupt(int
16041
16042 irqreturn_t smp_call_function_interrupt(int irq, void *dev_id)
16043 {
16044 - void (*func) (void *info) = call_data->func;
16045 - void *info = call_data->info;
16046 - int wait = call_data->wait;
16047 -
16048 - /*
16049 - * Notify initiating CPU that I've grabbed the data and am
16050 - * about to execute the function
16051 - */
16052 - mb();
16053 - atomic_inc(&call_data->started);
16054 - /*
16055 - * At this point the info structure may be out of scope unless wait==1
16056 - */
16057 irq_enter();
16058 - (*func)(info);
16059 + generic_smp_call_function_interrupt();
16060 #ifdef CONFIG_X86_32
16061 __get_cpu_var(irq_stat).irq_call_count++;
16062 #else
16063 @@ -320,10 +184,19 @@ irqreturn_t smp_call_function_interrupt(
16064 #endif
16065 irq_exit();
16066
16067 - if (wait) {
16068 - mb();
16069 - atomic_inc(&call_data->finished);
16070 - }
16071 + return IRQ_HANDLED;
16072 +}
16073 +
16074 +irqreturn_t smp_call_function_single_interrupt(int irq, void *dev_id)
16075 +{
16076 + irq_enter();
16077 + generic_smp_call_function_single_interrupt();
16078 +#ifdef CONFIG_X86_32
16079 + __get_cpu_var(irq_stat).irq_call_count++;
16080 +#else
16081 + add_pda(irq_call_count, 1);
16082 +#endif
16083 + irq_exit();
16084
16085 return IRQ_HANDLED;
16086 }
16087 --- sle11-2009-10-16.orig/arch/x86/kernel/time_32-xen.c 2009-10-28 14:58:12.000000000 +0100
16088 +++ sle11-2009-10-16/arch/x86/kernel/time_32-xen.c 2009-10-28 14:58:19.000000000 +0100
16089 @@ -468,7 +468,7 @@ irqreturn_t timer_interrupt(int irq, voi
16090
16091 /* Keep nmi watchdog up to date */
16092 #ifdef __i386__
16093 - per_cpu(irq_stat, smp_processor_id()).irq0_irqs++;
16094 + x86_add_percpu(irq_stat.irq0_irqs, 1);
16095 #else
16096 add_pda(irq0_irqs, 1);
16097 #endif
16098 @@ -747,9 +747,7 @@ void __init time_init(void)
16099
16100 update_wallclock();
16101
16102 -#ifndef CONFIG_X86_64
16103 use_tsc_delay();
16104 -#endif
16105
16106 /* Cannot request_irq() until kmem is initialised. */
16107 late_time_init = setup_cpu0_timer_irq;
16108 @@ -806,7 +804,8 @@ static void stop_hz_timer(void)
16109
16110 /* Leave ourselves in tick mode if rcu or softirq or timer pending. */
16111 if (rcu_needs_cpu(cpu) || local_softirq_pending() ||
16112 - (j = next_timer_interrupt(), time_before_eq(j, jiffies))) {
16113 + (j = get_next_timer_interrupt(jiffies),
16114 + time_before_eq(j, jiffies))) {
16115 cpu_clear(cpu, nohz_cpu_mask);
16116 j = jiffies + 1;
16117 }
16118 --- sle11-2009-10-16.orig/arch/x86/kernel/traps_32-xen.c 2009-03-16 16:38:05.000000000 +0100
16119 +++ sle11-2009-10-16/arch/x86/kernel/traps_32-xen.c 2009-06-04 10:21:39.000000000 +0200
16120 @@ -1,5 +1,6 @@
16121 /*
16122 * Copyright (C) 1991, 1992 Linus Torvalds
16123 + * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
16124 *
16125 * Pentium III FXSR, SSE support
16126 * Gareth Hughes <gareth@valinux.com>, May 2000
16127 @@ -57,11 +58,10 @@
16128 #include <asm/nmi.h>
16129 #include <asm/smp.h>
16130 #include <asm/io.h>
16131 +#include <asm/traps.h>
16132
16133 #include "mach_traps.h"
16134
16135 -int panic_on_unrecovered_nmi;
16136 -
16137 #ifndef CONFIG_XEN
16138 DECLARE_BITMAP(used_vectors, NR_VECTORS);
16139 EXPORT_SYMBOL_GPL(used_vectors);
16140 @@ -82,43 +82,22 @@ gate_desc idt_table[256]
16141 __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, };
16142 #endif
16143
16144 -asmlinkage void divide_error(void);
16145 -asmlinkage void debug(void);
16146 -asmlinkage void nmi(void);
16147 -asmlinkage void int3(void);
16148 -asmlinkage void overflow(void);
16149 -asmlinkage void bounds(void);
16150 -asmlinkage void invalid_op(void);
16151 -asmlinkage void device_not_available(void);
16152 -asmlinkage void coprocessor_segment_overrun(void);
16153 -asmlinkage void invalid_TSS(void);
16154 -asmlinkage void segment_not_present(void);
16155 -asmlinkage void stack_segment(void);
16156 -asmlinkage void general_protection(void);
16157 -asmlinkage void page_fault(void);
16158 -asmlinkage void coprocessor_error(void);
16159 -asmlinkage void simd_coprocessor_error(void);
16160 -asmlinkage void alignment_check(void);
16161 -#ifndef CONFIG_XEN
16162 -asmlinkage void spurious_interrupt_bug(void);
16163 -#else
16164 -asmlinkage void fixup_4gb_segment(void);
16165 -#endif
16166 -asmlinkage void machine_check(void);
16167 -
16168 +int panic_on_unrecovered_nmi;
16169 int kstack_depth_to_print = 24;
16170 static unsigned int code_bytes = 64;
16171 +static int ignore_nmis;
16172 +static int die_counter;
16173
16174 void printk_address(unsigned long address, int reliable)
16175 {
16176 #ifdef CONFIG_KALLSYMS
16177 - char namebuf[KSYM_NAME_LEN];
16178 unsigned long offset = 0;
16179 unsigned long symsize;
16180 const char *symname;
16181 - char reliab[4] = "";
16182 - char *delim = ":";
16183 char *modname;
16184 + char *delim = ":";
16185 + char namebuf[KSYM_NAME_LEN];
16186 + char reliab[4] = "";
16187
16188 symname = kallsyms_lookup(address, &symsize, &offset,
16189 &modname, namebuf);
16190 @@ -138,22 +117,23 @@ void printk_address(unsigned long addres
16191 #endif
16192 }
16193
16194 -static inline int valid_stack_ptr(struct thread_info *tinfo, void *p, unsigned size)
16195 +static inline int valid_stack_ptr(struct thread_info *tinfo,
16196 + void *p, unsigned int size)
16197 {
16198 - return p > (void *)tinfo &&
16199 - p <= (void *)tinfo + THREAD_SIZE - size;
16200 + void *t = tinfo;
16201 + return p > t && p <= t + THREAD_SIZE - size;
16202 }
16203
16204 /* The form of the top of the frame on the stack */
16205 struct stack_frame {
16206 - struct stack_frame *next_frame;
16207 - unsigned long return_address;
16208 + struct stack_frame *next_frame;
16209 + unsigned long return_address;
16210 };
16211
16212 static inline unsigned long
16213 print_context_stack(struct thread_info *tinfo,
16214 - unsigned long *stack, unsigned long bp,
16215 - const struct stacktrace_ops *ops, void *data)
16216 + unsigned long *stack, unsigned long bp,
16217 + const struct stacktrace_ops *ops, void *data)
16218 {
16219 struct stack_frame *frame = (struct stack_frame *)bp;
16220
16221 @@ -175,8 +155,6 @@ print_context_stack(struct thread_info *
16222 return bp;
16223 }
16224
16225 -#define MSG(msg) ops->warning(data, msg)
16226 -
16227 void dump_trace(struct task_struct *task, struct pt_regs *regs,
16228 unsigned long *stack, unsigned long bp,
16229 const struct stacktrace_ops *ops, void *data)
16230 @@ -186,7 +164,6 @@ void dump_trace(struct task_struct *task
16231
16232 if (!stack) {
16233 unsigned long dummy;
16234 -
16235 stack = &dummy;
16236 if (task != current)
16237 stack = (unsigned long *)task->thread.sp;
16238 @@ -204,7 +181,7 @@ void dump_trace(struct task_struct *task
16239 }
16240 #endif
16241
16242 - while (1) {
16243 + for (;;) {
16244 struct thread_info *context;
16245
16246 context = (struct thread_info *)
16247 @@ -256,15 +233,15 @@ static void print_trace_address(void *da
16248 }
16249
16250 static const struct stacktrace_ops print_trace_ops = {
16251 - .warning = print_trace_warning,
16252 - .warning_symbol = print_trace_warning_symbol,
16253 - .stack = print_trace_stack,
16254 - .address = print_trace_address,
16255 + .warning = print_trace_warning,
16256 + .warning_symbol = print_trace_warning_symbol,
16257 + .stack = print_trace_stack,
16258 + .address = print_trace_address,
16259 };
16260
16261 static void
16262 show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
16263 - unsigned long *stack, unsigned long bp, char *log_lvl)
16264 + unsigned long *stack, unsigned long bp, char *log_lvl)
16265 {
16266 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
16267 printk("%s =======================\n", log_lvl);
16268 @@ -359,15 +336,14 @@ void show_registers(struct pt_regs *regs
16269 printk(KERN_EMERG "Code: ");
16270
16271 ip = (u8 *)regs->ip - code_prologue;
16272 - if (ip < (u8 *)PAGE_OFFSET ||
16273 - probe_kernel_address(ip, c)) {
16274 + if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
16275 /* try starting at EIP */
16276 ip = (u8 *)regs->ip;
16277 code_len = code_len - code_prologue + 1;
16278 }
16279 for (i = 0; i < code_len; i++, ip++) {
16280 if (ip < (u8 *)PAGE_OFFSET ||
16281 - probe_kernel_address(ip, c)) {
16282 + probe_kernel_address(ip, c)) {
16283 printk(" Bad EIP value.");
16284 break;
16285 }
16286 @@ -392,7 +368,53 @@ int is_valid_bugaddr(unsigned long ip)
16287 return ud2 == 0x0b0f;
16288 }
16289
16290 -static int die_counter;
16291 +static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
16292 +static int die_owner = -1;
16293 +static unsigned int die_nest_count;
16294 +
16295 +unsigned __kprobes long oops_begin(void)
16296 +{
16297 + unsigned long flags;
16298 +
16299 + oops_enter();
16300 +
16301 + if (die_owner != raw_smp_processor_id()) {
16302 + console_verbose();
16303 + raw_local_irq_save(flags);
16304 + __raw_spin_lock(&die_lock);
16305 + die_owner = smp_processor_id();
16306 + die_nest_count = 0;
16307 + bust_spinlocks(1);
16308 + } else {
16309 + raw_local_irq_save(flags);
16310 + }
16311 + die_nest_count++;
16312 + return flags;
16313 +}
16314 +
16315 +void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
16316 +{
16317 + bust_spinlocks(0);
16318 + die_owner = -1;
16319 + add_taint(TAINT_DIE);
16320 + __raw_spin_unlock(&die_lock);
16321 + raw_local_irq_restore(flags);
16322 +
16323 + if (!regs)
16324 + return;
16325 +
16326 + if (kexec_should_crash(current))
16327 + crash_kexec(regs);
16328 +
16329 + if (in_interrupt())
16330 + panic("Fatal exception in interrupt");
16331 +
16332 + if (panic_on_oops)
16333 + panic("Fatal exception");
16334 +
16335 + oops_exit();
16336 + do_exit(signr);
16337 +}
16338
16339 int __kprobes __die(const char *str, struct pt_regs *regs, long err)
16340 {
16341 @@ -410,26 +432,22 @@ int __kprobes __die(const char *str, str
16342 printk("DEBUG_PAGEALLOC");
16343 #endif
16344 printk("\n");
16345 -
16346 if (notify_die(DIE_OOPS, str, regs, err,
16347 - current->thread.trap_no, SIGSEGV) != NOTIFY_STOP) {
16348 + current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
16349 + return 1;
16350
16351 - show_registers(regs);
16352 - /* Executive summary in case the oops scrolled away */
16353 - sp = (unsigned long) (&regs->sp);
16354 - savesegment(ss, ss);
16355 - if (user_mode(regs)) {
16356 - sp = regs->sp;
16357 - ss = regs->ss & 0xffff;
16358 - }
16359 - printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
16360 - print_symbol("%s", regs->ip);
16361 - printk(" SS:ESP %04x:%08lx\n", ss, sp);
16362 -
16363 - return 0;
16364 - }
16365 -
16366 - return 1;
16367 + show_registers(regs);
16368 + /* Executive summary in case the oops scrolled away */
16369 + sp = (unsigned long) (&regs->sp);
16370 + savesegment(ss, ss);
16371 + if (user_mode(regs)) {
16372 + sp = regs->sp;
16373 + ss = regs->ss & 0xffff;
16374 + }
16375 + printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
16376 + print_symbol("%s", regs->ip);
16377 + printk(" SS:ESP %04x:%08lx\n", ss, sp);
16378 + return 0;
16379 }
16380
16381 /*
16382 @@ -438,31 +456,9 @@ int __kprobes __die(const char *str, str
16383 */
16384 void die(const char *str, struct pt_regs *regs, long err)
16385 {
16386 - static struct {
16387 - raw_spinlock_t lock;
16388 - u32 lock_owner;
16389 - int lock_owner_depth;
16390 - } die = {
16391 - .lock = __RAW_SPIN_LOCK_UNLOCKED,
16392 - .lock_owner = -1,
16393 - .lock_owner_depth = 0
16394 - };
16395 - unsigned long flags;
16396 -
16397 - oops_enter();
16398 + unsigned long flags = oops_begin();
16399
16400 - if (die.lock_owner != raw_smp_processor_id()) {
16401 - console_verbose();
16402 - raw_local_irq_save(flags);
16403 - __raw_spin_lock(&die.lock);
16404 - die.lock_owner = smp_processor_id();
16405 - die.lock_owner_depth = 0;
16406 - bust_spinlocks(1);
16407 - } else {
16408 - raw_local_irq_save(flags);
16409 - }
16410 -
16411 - if (++die.lock_owner_depth < 3) {
16412 + if (die_nest_count < 3) {
16413 report_bug(regs->ip, regs);
16414
16415 if (__die(str, regs, err))
16416 @@ -471,26 +467,7 @@ void die(const char *str, struct pt_regs
16417 printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
16418 }
16419
16420 - bust_spinlocks(0);
16421 - die.lock_owner = -1;
16422 - add_taint(TAINT_DIE);
16423 - __raw_spin_unlock(&die.lock);
16424 - raw_local_irq_restore(flags);
16425 -
16426 - if (!regs)
16427 - return;
16428 -
16429 - if (kexec_should_crash(current))
16430 - crash_kexec(regs);
16431 -
16432 - if (in_interrupt())
16433 - panic("Fatal exception in interrupt");
16434 -
16435 - if (panic_on_oops)
16436 - panic("Fatal exception");
16437 -
16438 - oops_exit();
16439 - do_exit(SIGSEGV);
16440 + oops_end(flags, regs, SIGSEGV);
16441 }
16442
16443 static inline void
16444 @@ -554,7 +531,7 @@ void do_##name(struct pt_regs *regs, lon
16445 { \
16446 trace_hardirqs_fixup(); \
16447 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
16448 - == NOTIFY_STOP) \
16449 + == NOTIFY_STOP) \
16450 return; \
16451 do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \
16452 }
16453 @@ -570,7 +547,7 @@ void do_##name(struct pt_regs *regs, lon
16454 info.si_code = sicode; \
16455 info.si_addr = (void __user *)siaddr; \
16456 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
16457 - == NOTIFY_STOP) \
16458 + == NOTIFY_STOP) \
16459 return; \
16460 do_trap(trapnr, signr, str, 0, regs, error_code, &info); \
16461 }
16462 @@ -579,7 +556,7 @@ void do_##name(struct pt_regs *regs, lon
16463 void do_##name(struct pt_regs *regs, long error_code) \
16464 { \
16465 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
16466 - == NOTIFY_STOP) \
16467 + == NOTIFY_STOP) \
16468 return; \
16469 do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \
16470 }
16471 @@ -594,28 +571,29 @@ void do_##name(struct pt_regs *regs, lon
16472 info.si_addr = (void __user *)siaddr; \
16473 trace_hardirqs_fixup(); \
16474 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
16475 - == NOTIFY_STOP) \
16476 + == NOTIFY_STOP) \
16477 return; \
16478 do_trap(trapnr, signr, str, 1, regs, error_code, &info); \
16479 }
16480
16481 -DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
16482 +DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
16483 #ifndef CONFIG_KPROBES
16484 DO_VM86_ERROR(3, SIGTRAP, "int3", int3)
16485 #endif
16486 DO_VM86_ERROR(4, SIGSEGV, "overflow", overflow)
16487 DO_VM86_ERROR(5, SIGSEGV, "bounds", bounds)
16488 -DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0)
16489 -DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
16490 +DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0)
16491 +DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
16492 DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
16493 -DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
16494 -DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
16495 +DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
16496 +DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
16497 DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0)
16498 DO_ERROR_INFO(32, SIGILL, "iret exception", iret_error, ILL_BADSTK, 0, 1)
16499
16500 -void __kprobes do_general_protection(struct pt_regs * regs,
16501 - long error_code)
16502 +void __kprobes
16503 +do_general_protection(struct pt_regs *regs, long error_code)
16504 {
16505 + struct task_struct *tsk;
16506 struct thread_struct *thread;
16507
16508 thread = &current->thread;
16509 @@ -623,23 +601,24 @@ void __kprobes do_general_protection(str
16510 if (regs->flags & X86_VM_MASK)
16511 goto gp_in_vm86;
16512
16513 + tsk = current;
16514 if (!user_mode(regs))
16515 goto gp_in_kernel;
16516
16517 - current->thread.error_code = error_code;
16518 - current->thread.trap_no = 13;
16519 + tsk->thread.error_code = error_code;
16520 + tsk->thread.trap_no = 13;
16521
16522 - if (show_unhandled_signals && unhandled_signal(current, SIGSEGV) &&
16523 - printk_ratelimit()) {
16524 + if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
16525 + printk_ratelimit()) {
16526 printk(KERN_INFO
16527 - "%s[%d] general protection ip:%lx sp:%lx error:%lx",
16528 - current->comm, task_pid_nr(current),
16529 - regs->ip, regs->sp, error_code);
16530 + "%s[%d] general protection ip:%lx sp:%lx error:%lx",
16531 + tsk->comm, task_pid_nr(tsk),
16532 + regs->ip, regs->sp, error_code);
16533 print_vma_addr(" in ", regs->ip);
16534 printk("\n");
16535 }
16536
16537 - force_sig(SIGSEGV, current);
16538 + force_sig(SIGSEGV, tsk);
16539 return;
16540
16541 gp_in_vm86:
16542 @@ -648,14 +627,15 @@ gp_in_vm86:
16543 return;
16544
16545 gp_in_kernel:
16546 - if (!fixup_exception(regs)) {
16547 - current->thread.error_code = error_code;
16548 - current->thread.trap_no = 13;
16549 - if (notify_die(DIE_GPF, "general protection fault", regs,
16550 + if (fixup_exception(regs))
16551 + return;
16552 +
16553 + tsk->thread.error_code = error_code;
16554 + tsk->thread.trap_no = 13;
16555 + if (notify_die(DIE_GPF, "general protection fault", regs,
16556 error_code, 13, SIGSEGV) == NOTIFY_STOP)
16557 - return;
16558 - die("general protection fault", regs, error_code);
16559 - }
16560 + return;
16561 + die("general protection fault", regs, error_code);
16562 }
16563
16564 static notrace __kprobes void
16565 @@ -722,9 +702,9 @@ unknown_nmi_error(unsigned char reason,
16566
16567 static DEFINE_SPINLOCK(nmi_print_lock);
16568
16569 -void notrace __kprobes die_nmi(struct pt_regs *regs, const char *msg)
16570 +void notrace __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic)
16571 {
16572 - if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 2, SIGINT) == NOTIFY_STOP)
16573 + if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
16574 return;
16575
16576 spin_lock(&nmi_print_lock);
16577 @@ -733,10 +713,12 @@ void notrace __kprobes die_nmi(struct pt
16578 * to get a message out:
16579 */
16580 bust_spinlocks(1);
16581 - printk(KERN_EMERG "%s", msg);
16582 + printk(KERN_EMERG "%s", str);
16583 printk(" on CPU%d, ip %08lx, registers:\n",
16584 smp_processor_id(), regs->ip);
16585 show_registers(regs);
16586 + if (do_panic)
16587 + panic("Non maskable interrupt");
16588 console_silent();
16589 spin_unlock(&nmi_print_lock);
16590 bust_spinlocks(0);
16591 @@ -756,14 +738,17 @@ void notrace __kprobes die_nmi(struct pt
16592 static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
16593 {
16594 unsigned char reason = 0;
16595 + int cpu;
16596
16597 - /* Only the BSP gets external NMIs from the system: */
16598 - if (!smp_processor_id())
16599 + cpu = smp_processor_id();
16600 +
16601 + /* Only the BSP gets external NMIs from the system. */
16602 + if (!cpu)
16603 reason = get_nmi_reason();
16604
16605 if (!(reason & 0xc0)) {
16606 if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
16607 - == NOTIFY_STOP)
16608 + == NOTIFY_STOP)
16609 return;
16610 #ifdef CONFIG_X86_LOCAL_APIC
16611 /*
16612 @@ -772,7 +757,7 @@ static notrace __kprobes void default_do
16613 */
16614 if (nmi_watchdog_tick(regs, reason))
16615 return;
16616 - if (!do_nmi_callback(regs, smp_processor_id()))
16617 + if (!do_nmi_callback(regs, cpu))
16618 unknown_nmi_error(reason, regs);
16619 #else
16620 unknown_nmi_error(reason, regs);
16621 @@ -782,6 +767,8 @@ static notrace __kprobes void default_do
16622 }
16623 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
16624 return;
16625 +
16626 + /* AK: following checks seem to be broken on modern chipsets. FIXME */
16627 if (reason & 0x80)
16628 mem_parity_error(reason, regs);
16629 if (reason & 0x40)
16630 @@ -793,8 +780,6 @@ static notrace __kprobes void default_do
16631 reassert_nmi();
16632 }
16633
16634 -static int ignore_nmis;
16635 -
16636 notrace __kprobes void do_nmi(struct pt_regs *regs, long error_code)
16637 {
16638 int cpu;
16639 @@ -879,7 +864,7 @@ void __kprobes do_debug(struct pt_regs *
16640 tsk->thread.debugctlmsr = 0;
16641
16642 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
16643 - SIGTRAP) == NOTIFY_STOP)
16644 + SIGTRAP) == NOTIFY_STOP)
16645 return;
16646 /* It's safe to allow irq's after DR6 has been saved */
16647 if (regs->flags & X86_EFLAGS_IF)
16648 @@ -940,9 +925,8 @@ clear_TF_reenable:
16649 void math_error(void __user *ip)
16650 {
16651 struct task_struct *task;
16652 - unsigned short cwd;
16653 - unsigned short swd;
16654 siginfo_t info;
16655 + unsigned short cwd, swd;
16656
16657 /*
16658 * Save the info for the exception handler and clear the error.
16659 @@ -961,7 +945,7 @@ void math_error(void __user *ip)
16660 * C1 reg you need in case of a stack fault, 0x040 is the stack
16661 * fault bit. We should only be taking one exception at a time,
16662 * so if this combination doesn't produce any single exception,
16663 - * then we have a bad program that isn't syncronizing its FPU usage
16664 + * then we have a bad program that isn't synchronizing its FPU usage
16665 * and it will suffer the consequences since we won't be able to
16666 * fully reproduce the context of the exception
16667 */
16668 @@ -970,7 +954,7 @@ void math_error(void __user *ip)
16669 switch (swd & ~cwd & 0x3f) {
16670 case 0x000: /* No unmasked exception */
16671 return;
16672 - default: /* Multiple exceptions */
16673 + default: /* Multiple exceptions */
16674 break;
16675 case 0x001: /* Invalid Op */
16676 /*
16677 @@ -1006,8 +990,8 @@ void do_coprocessor_error(struct pt_regs
16678 static void simd_math_error(void __user *ip)
16679 {
16680 struct task_struct *task;
16681 - unsigned short mxcsr;
16682 siginfo_t info;
16683 + unsigned short mxcsr;
16684
16685 /*
16686 * Save the info for the exception handler and clear the error.
16687 @@ -1084,7 +1068,7 @@ void do_spurious_interrupt_bug(struct pt
16688
16689 unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp)
16690 {
16691 - struct desc_struct *gdt = __get_cpu_var(gdt_page).gdt;
16692 + struct desc_struct *gdt = get_cpu_gdt_table(smp_processor_id());
16693 unsigned long base = (kesp - uesp) & -THREAD_SIZE;
16694 unsigned long new_kesp = kesp - base;
16695 unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT;
16696 --- sle11-2009-10-16.orig/arch/x86/kernel/traps_64-xen.c 2009-03-16 16:38:05.000000000 +0100
16697 +++ sle11-2009-10-16/arch/x86/kernel/traps_64-xen.c 2009-06-04 10:21:39.000000000 +0200
16698 @@ -10,73 +10,56 @@
16699 * 'Traps.c' handles hardware traps and faults after we have saved some
16700 * state in 'entry.S'.
16701 */
16702 -#include <linux/sched.h>
16703 +#include <linux/moduleparam.h>
16704 +#include <linux/interrupt.h>
16705 +#include <linux/kallsyms.h>
16706 +#include <linux/spinlock.h>
16707 +#include <linux/kprobes.h>
16708 +#include <linux/uaccess.h>
16709 +#include <linux/utsname.h>
16710 +#include <linux/kdebug.h>
16711 #include <linux/kernel.h>
16712 +#include <linux/module.h>
16713 +#include <linux/ptrace.h>
16714 #include <linux/string.h>
16715 +#include <linux/unwind.h>
16716 +#include <linux/delay.h>
16717 #include <linux/errno.h>
16718 -#include <linux/ptrace.h>
16719 +#include <linux/kexec.h>
16720 +#include <linux/sched.h>
16721 #include <linux/timer.h>
16722 -#include <linux/mm.h>
16723 #include <linux/init.h>
16724 -#include <linux/delay.h>
16725 -#include <linux/spinlock.h>
16726 -#include <linux/interrupt.h>
16727 -#include <linux/kallsyms.h>
16728 -#include <linux/module.h>
16729 -#include <linux/moduleparam.h>
16730 -#include <linux/nmi.h>
16731 -#include <linux/kprobes.h>
16732 -#include <linux/kexec.h>
16733 -#include <linux/unwind.h>
16734 -#include <linux/uaccess.h>
16735 #include <linux/bug.h>
16736 -#include <linux/kdebug.h>
16737 -#include <linux/utsname.h>
16738 -
16739 -#include <mach_traps.h>
16740 +#include <linux/nmi.h>
16741 +#include <linux/mm.h>
16742
16743 #if defined(CONFIG_EDAC)
16744 #include <linux/edac.h>
16745 #endif
16746
16747 -#include <asm/system.h>
16748 -#include <asm/io.h>
16749 -#include <asm/atomic.h>
16750 +#include <asm/stacktrace.h>
16751 +#include <asm/processor.h>
16752 #include <asm/debugreg.h>
16753 +#include <asm/atomic.h>
16754 +#include <asm/system.h>
16755 +#include <asm/unwind.h>
16756 #include <asm/desc.h>
16757 #include <asm/i387.h>
16758 -#include <asm/processor.h>
16759 -#include <asm/unwind.h>
16760 +#include <asm/nmi.h>
16761 #include <asm/smp.h>
16762 +#include <asm/io.h>
16763 #include <asm/pgalloc.h>
16764 -#include <asm/pda.h>
16765 #include <asm/proto.h>
16766 -#include <asm/nmi.h>
16767 -#include <asm/stacktrace.h>
16768 +#include <asm/pda.h>
16769 +#include <asm/traps.h>
16770
16771 -asmlinkage void divide_error(void);
16772 -asmlinkage void debug(void);
16773 -asmlinkage void nmi(void);
16774 -asmlinkage void int3(void);
16775 -asmlinkage void overflow(void);
16776 -asmlinkage void bounds(void);
16777 -asmlinkage void invalid_op(void);
16778 -asmlinkage void device_not_available(void);
16779 -asmlinkage void double_fault(void);
16780 -asmlinkage void coprocessor_segment_overrun(void);
16781 -asmlinkage void invalid_TSS(void);
16782 -asmlinkage void segment_not_present(void);
16783 -asmlinkage void stack_segment(void);
16784 -asmlinkage void general_protection(void);
16785 -asmlinkage void page_fault(void);
16786 -asmlinkage void coprocessor_error(void);
16787 -asmlinkage void simd_coprocessor_error(void);
16788 -asmlinkage void reserved(void);
16789 -asmlinkage void alignment_check(void);
16790 -asmlinkage void machine_check(void);
16791 -asmlinkage void spurious_interrupt_bug(void);
16792 +#include <mach_traps.h>
16793
16794 +int panic_on_unrecovered_nmi;
16795 +int kstack_depth_to_print = 12;
16796 static unsigned int code_bytes = 64;
16797 +static int ignore_nmis;
16798 +static int die_counter;
16799
16800 static inline void conditional_sti(struct pt_regs *regs)
16801 {
16802 @@ -100,34 +83,9 @@ static inline void preempt_conditional_c
16803 dec_preempt_count();
16804 }
16805
16806 -int kstack_depth_to_print = 12;
16807 -
16808 void printk_address(unsigned long address, int reliable)
16809 {
16810 -#ifdef CONFIG_KALLSYMS
16811 - unsigned long offset = 0, symsize;
16812 - const char *symname;
16813 - char *modname;
16814 - char *delim = ":";
16815 - char namebuf[KSYM_NAME_LEN];
16816 - char reliab[4] = "";
16817 -
16818 - symname = kallsyms_lookup(address, &symsize, &offset,
16819 - &modname, namebuf);
16820 - if (!symname) {
16821 - printk(" [<%016lx>]\n", address);
16822 - return;
16823 - }
16824 - if (!reliable)
16825 - strcpy(reliab, "? ");
16826 -
16827 - if (!modname)
16828 - modname = delim = "";
16829 - printk(" [<%016lx>] %s%s%s%s%s+0x%lx/0x%lx\n",
16830 - address, reliab, delim, modname, delim, symname, offset, symsize);
16831 -#else
16832 - printk(" [<%016lx>]\n", address);
16833 -#endif
16834 + printk(" [<%016lx>] %s%pS\n", address, reliable ? "": "? ", (void *) address);
16835 }
16836
16837 static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
16838 @@ -206,8 +164,6 @@ static unsigned long *in_exception_stack
16839 return NULL;
16840 }
16841
16842 -#define MSG(txt) ops->warning(data, txt)
16843 -
16844 /*
16845 * x86-64 can have up to three kernel stacks:
16846 * process stack
16847 @@ -234,11 +190,11 @@ struct stack_frame {
16848 unsigned long return_address;
16849 };
16850
16851 -
16852 -static inline unsigned long print_context_stack(struct thread_info *tinfo,
16853 - unsigned long *stack, unsigned long bp,
16854 - const struct stacktrace_ops *ops, void *data,
16855 - unsigned long *end)
16856 +static inline unsigned long
16857 +print_context_stack(struct thread_info *tinfo,
16858 + unsigned long *stack, unsigned long bp,
16859 + const struct stacktrace_ops *ops, void *data,
16860 + unsigned long *end)
16861 {
16862 struct stack_frame *frame = (struct stack_frame *)bp;
16863
16864 @@ -260,7 +216,7 @@ static inline unsigned long print_contex
16865 return bp;
16866 }
16867
16868 -void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
16869 +void dump_trace(struct task_struct *task, struct pt_regs *regs,
16870 unsigned long *stack, unsigned long bp,
16871 const struct stacktrace_ops *ops, void *data)
16872 {
16873 @@ -269,36 +225,34 @@ void dump_trace(struct task_struct *tsk,
16874 unsigned used = 0;
16875 struct thread_info *tinfo;
16876
16877 - if (!tsk)
16878 - tsk = current;
16879 - tinfo = task_thread_info(tsk);
16880 + if (!task)
16881 + task = current;
16882
16883 if (!stack) {
16884 unsigned long dummy;
16885 stack = &dummy;
16886 - if (tsk && tsk != current)
16887 - stack = (unsigned long *)tsk->thread.sp;
16888 + if (task && task != current)
16889 + stack = (unsigned long *)task->thread.sp;
16890 }
16891
16892 #ifdef CONFIG_FRAME_POINTER
16893 if (!bp) {
16894 - if (tsk == current) {
16895 + if (task == current) {
16896 /* Grab bp right from our regs */
16897 - asm("movq %%rbp, %0" : "=r" (bp):);
16898 + asm("movq %%rbp, %0" : "=r" (bp) :);
16899 } else {
16900 /* bp is the last reg pushed by switch_to */
16901 - bp = *(unsigned long *) tsk->thread.sp;
16902 + bp = *(unsigned long *) task->thread.sp;
16903 }
16904 }
16905 #endif
16906
16907 -
16908 -
16909 /*
16910 * Print function call entries in all stacks, starting at the
16911 * current stack address. If the stacks consist of nested
16912 * exceptions
16913 */
16914 + tinfo = task_thread_info(task);
16915 for (;;) {
16916 char *id;
16917 unsigned long *estack_end;
16918 @@ -383,18 +337,24 @@ static const struct stacktrace_ops print
16919 .address = print_trace_address,
16920 };
16921
16922 -void
16923 -show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack,
16924 - unsigned long bp)
16925 +static void
16926 +show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
16927 + unsigned long *stack, unsigned long bp, char *log_lvl)
16928 {
16929 printk("\nCall Trace:\n");
16930 - dump_trace(tsk, regs, stack, bp, &print_trace_ops, NULL);
16931 + dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
16932 printk("\n");
16933 }
16934
16935 +void show_trace(struct task_struct *task, struct pt_regs *regs,
16936 + unsigned long *stack, unsigned long bp)
16937 +{
16938 + show_trace_log_lvl(task, regs, stack, bp, "");
16939 +}
16940 +
16941 static void
16942 -_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *sp,
16943 - unsigned long bp)
16944 +show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
16945 + unsigned long *sp, unsigned long bp, char *log_lvl)
16946 {
16947 unsigned long *stack;
16948 int i;
16949 @@ -406,14 +366,14 @@ _show_stack(struct task_struct *tsk, str
16950 // back trace for this cpu.
16951
16952 if (sp == NULL) {
16953 - if (tsk)
16954 - sp = (unsigned long *)tsk->thread.sp;
16955 + if (task)
16956 + sp = (unsigned long *)task->thread.sp;
16957 else
16958 sp = (unsigned long *)&sp;
16959 }
16960
16961 stack = sp;
16962 - for(i=0; i < kstack_depth_to_print; i++) {
16963 + for (i = 0; i < kstack_depth_to_print; i++) {
16964 if (stack >= irqstack && stack <= irqstack_end) {
16965 if (stack == irqstack_end) {
16966 stack = (unsigned long *) (irqstack_end[-1]);
16967 @@ -428,12 +388,12 @@ _show_stack(struct task_struct *tsk, str
16968 printk(" %016lx", *stack++);
16969 touch_nmi_watchdog();
16970 }
16971 - show_trace(tsk, regs, sp, bp);
16972 + show_trace_log_lvl(task, regs, sp, bp, log_lvl);
16973 }
16974
16975 -void show_stack(struct task_struct *tsk, unsigned long * sp)
16976 +void show_stack(struct task_struct *task, unsigned long *sp)
16977 {
16978 - _show_stack(tsk, NULL, sp, 0);
16979 + show_stack_log_lvl(task, NULL, sp, 0, "");
16980 }
16981
16982 /*
16983 @@ -441,8 +401,8 @@ void show_stack(struct task_struct *tsk,
16984 */
16985 void dump_stack(void)
16986 {
16987 - unsigned long dummy;
16988 unsigned long bp = 0;
16989 + unsigned long stack;
16990
16991 #ifdef CONFIG_FRAME_POINTER
16992 if (!bp)
16993 @@ -454,7 +414,7 @@ void dump_stack(void)
16994 init_utsname()->release,
16995 (int)strcspn(init_utsname()->version, " "),
16996 init_utsname()->version);
16997 - show_trace(NULL, NULL, &dummy, bp);
16998 + show_trace(NULL, NULL, &stack, bp);
16999 }
17000
17001 EXPORT_SYMBOL(dump_stack);
17002 @@ -465,12 +425,8 @@ void show_registers(struct pt_regs *regs
17003 unsigned long sp;
17004 const int cpu = smp_processor_id();
17005 struct task_struct *cur = cpu_pda(cpu)->pcurrent;
17006 - u8 *ip;
17007 - unsigned int code_prologue = code_bytes * 43 / 64;
17008 - unsigned int code_len = code_bytes;
17009
17010 sp = regs->sp;
17011 - ip = (u8 *) regs->ip - code_prologue;
17012 printk("CPU %d ", cpu);
17013 __show_regs(regs);
17014 printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
17015 @@ -481,15 +437,22 @@ void show_registers(struct pt_regs *regs
17016 * time of the fault..
17017 */
17018 if (!user_mode(regs)) {
17019 + unsigned int code_prologue = code_bytes * 43 / 64;
17020 + unsigned int code_len = code_bytes;
17021 unsigned char c;
17022 + u8 *ip;
17023 +
17024 printk("Stack: ");
17025 - _show_stack(NULL, regs, (unsigned long *)sp, regs->bp);
17026 + show_stack_log_lvl(NULL, regs, (unsigned long *)sp,
17027 + regs->bp, "");
17028 printk("\n");
17029
17030 printk(KERN_EMERG "Code: ");
17031 +
17032 + ip = (u8 *)regs->ip - code_prologue;
17033 if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
17034 /* try starting at RIP */
17035 - ip = (u8 *) regs->ip;
17036 + ip = (u8 *)regs->ip;
17037 code_len = code_len - code_prologue + 1;
17038 }
17039 for (i = 0; i < code_len; i++, ip++) {
17040 @@ -505,7 +468,7 @@ void show_registers(struct pt_regs *regs
17041 }
17042 }
17043 printk("\n");
17044 -}
17045 +}
17046
17047 int is_valid_bugaddr(unsigned long ip)
17048 {
17049 @@ -545,7 +508,7 @@ unsigned __kprobes long oops_begin(void)
17050 }
17051
17052 void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
17053 -{
17054 +{
17055 die_owner = -1;
17056 bust_spinlocks(0);
17057 die_nest_count--;
17058 @@ -563,10 +526,9 @@ void __kprobes oops_end(unsigned long fl
17059 do_exit(signr);
17060 }
17061
17062 -int __kprobes __die(const char * str, struct pt_regs * regs, long err)
17063 +int __kprobes __die(const char *str, struct pt_regs *regs, long err)
17064 {
17065 - static int die_counter;
17066 - printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter);
17067 + printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff, ++die_counter);
17068 #ifdef CONFIG_PREEMPT
17069 printk("PREEMPT ");
17070 #endif
17071 @@ -577,8 +539,10 @@ int __kprobes __die(const char * str, st
17072 printk("DEBUG_PAGEALLOC");
17073 #endif
17074 printk("\n");
17075 - if (notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
17076 + if (notify_die(DIE_OOPS, str, regs, err,
17077 + current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
17078 return 1;
17079 +
17080 show_registers(regs);
17081 add_taint(TAINT_DIE);
17082 /* Executive summary in case the oops scrolled away */
17083 @@ -590,7 +554,7 @@ int __kprobes __die(const char * str, st
17084 return 0;
17085 }
17086
17087 -void die(const char * str, struct pt_regs * regs, long err)
17088 +void die(const char *str, struct pt_regs *regs, long err)
17089 {
17090 unsigned long flags = oops_begin();
17091
17092 @@ -608,8 +572,7 @@ die_nmi(char *str, struct pt_regs *regs,
17093 {
17094 unsigned long flags;
17095
17096 - if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) ==
17097 - NOTIFY_STOP)
17098 + if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
17099 return;
17100
17101 flags = oops_begin();
17102 @@ -617,7 +580,9 @@ die_nmi(char *str, struct pt_regs *regs,
17103 * We are in trouble anyway, lets at least try
17104 * to get a message out.
17105 */
17106 - printk(str, smp_processor_id());
17107 + printk(KERN_EMERG "%s", str);
17108 + printk(" on CPU%d, ip %08lx, registers:\n",
17109 + smp_processor_id(), regs->ip);
17110 show_registers(regs);
17111 if (kexec_should_crash(current))
17112 crash_kexec(regs);
17113 @@ -630,44 +595,44 @@ die_nmi(char *str, struct pt_regs *regs,
17114 }
17115 #endif
17116
17117 -static void __kprobes do_trap(int trapnr, int signr, char *str,
17118 - struct pt_regs * regs, long error_code,
17119 - siginfo_t *info)
17120 +static void __kprobes
17121 +do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
17122 + long error_code, siginfo_t *info)
17123 {
17124 struct task_struct *tsk = current;
17125
17126 - if (user_mode(regs)) {
17127 - /*
17128 - * We want error_code and trap_no set for userspace
17129 - * faults and kernelspace faults which result in
17130 - * die(), but not kernelspace faults which are fixed
17131 - * up. die() gives the process no chance to handle
17132 - * the signal and notice the kernel fault information,
17133 - * so that won't result in polluting the information
17134 - * about previously queued, but not yet delivered,
17135 - * faults. See also do_general_protection below.
17136 - */
17137 - tsk->thread.error_code = error_code;
17138 - tsk->thread.trap_no = trapnr;
17139 + if (!user_mode(regs))
17140 + goto kernel_trap;
17141
17142 - if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
17143 - printk_ratelimit()) {
17144 - printk(KERN_INFO
17145 - "%s[%d] trap %s ip:%lx sp:%lx error:%lx",
17146 - tsk->comm, tsk->pid, str,
17147 - regs->ip, regs->sp, error_code);
17148 - print_vma_addr(" in ", regs->ip);
17149 - printk("\n");
17150 - }
17151 + /*
17152 + * We want error_code and trap_no set for userspace faults and
17153 + * kernelspace faults which result in die(), but not
17154 + * kernelspace faults which are fixed up. die() gives the
17155 + * process no chance to handle the signal and notice the
17156 + * kernel fault information, so that won't result in polluting
17157 + * the information about previously queued, but not yet
17158 + * delivered, faults. See also do_general_protection below.
17159 + */
17160 + tsk->thread.error_code = error_code;
17161 + tsk->thread.trap_no = trapnr;
17162
17163 - if (info)
17164 - force_sig_info(signr, info, tsk);
17165 - else
17166 - force_sig(signr, tsk);
17167 - return;
17168 + if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
17169 + printk_ratelimit()) {
17170 + printk(KERN_INFO
17171 + "%s[%d] trap %s ip:%lx sp:%lx error:%lx",
17172 + tsk->comm, tsk->pid, str,
17173 + regs->ip, regs->sp, error_code);
17174 + print_vma_addr(" in ", regs->ip);
17175 + printk("\n");
17176 }
17177
17178 + if (info)
17179 + force_sig_info(signr, info, tsk);
17180 + else
17181 + force_sig(signr, tsk);
17182 + return;
17183
17184 +kernel_trap:
17185 if (!fixup_exception(regs)) {
17186 tsk->thread.error_code = error_code;
17187 tsk->thread.trap_no = trapnr;
17188 @@ -677,41 +642,39 @@ static void __kprobes do_trap(int trapnr
17189 }
17190
17191 #define DO_ERROR(trapnr, signr, str, name) \
17192 -asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
17193 -{ \
17194 - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
17195 - == NOTIFY_STOP) \
17196 - return; \
17197 +asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
17198 +{ \
17199 + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
17200 + == NOTIFY_STOP) \
17201 + return; \
17202 conditional_sti(regs); \
17203 - do_trap(trapnr, signr, str, regs, error_code, NULL); \
17204 + do_trap(trapnr, signr, str, regs, error_code, NULL); \
17205 }
17206
17207 -#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
17208 -asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
17209 -{ \
17210 - siginfo_t info; \
17211 - info.si_signo = signr; \
17212 - info.si_errno = 0; \
17213 - info.si_code = sicode; \
17214 - info.si_addr = (void __user *)siaddr; \
17215 - trace_hardirqs_fixup(); \
17216 - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
17217 - == NOTIFY_STOP) \
17218 - return; \
17219 +#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
17220 +asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
17221 +{ \
17222 + siginfo_t info; \
17223 + info.si_signo = signr; \
17224 + info.si_errno = 0; \
17225 + info.si_code = sicode; \
17226 + info.si_addr = (void __user *)siaddr; \
17227 + trace_hardirqs_fixup(); \
17228 + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
17229 + == NOTIFY_STOP) \
17230 + return; \
17231 conditional_sti(regs); \
17232 - do_trap(trapnr, signr, str, regs, error_code, &info); \
17233 + do_trap(trapnr, signr, str, regs, error_code, &info); \
17234 }
17235
17236 -DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
17237 -DO_ERROR( 4, SIGSEGV, "overflow", overflow)
17238 -DO_ERROR( 5, SIGSEGV, "bounds", bounds)
17239 -DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip)
17240 -DO_ERROR( 7, SIGSEGV, "device not available", device_not_available)
17241 -DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
17242 +DO_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
17243 +DO_ERROR(4, SIGSEGV, "overflow", overflow)
17244 +DO_ERROR(5, SIGSEGV, "bounds", bounds)
17245 +DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip)
17246 +DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
17247 DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
17248 -DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
17249 +DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
17250 DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
17251 -DO_ERROR(18, SIGSEGV, "reserved", reserved)
17252
17253 /* Runs on IST stack */
17254 asmlinkage void do_stack_segment(struct pt_regs *regs, long error_code)
17255 @@ -741,31 +704,34 @@ asmlinkage void do_double_fault(struct p
17256 die(str, regs, error_code);
17257 }
17258
17259 -asmlinkage void __kprobes do_general_protection(struct pt_regs * regs,
17260 - long error_code)
17261 +asmlinkage void __kprobes
17262 +do_general_protection(struct pt_regs *regs, long error_code)
17263 {
17264 - struct task_struct *tsk = current;
17265 + struct task_struct *tsk;
17266
17267 conditional_sti(regs);
17268
17269 - if (user_mode(regs)) {
17270 - tsk->thread.error_code = error_code;
17271 - tsk->thread.trap_no = 13;
17272 + tsk = current;
17273 + if (!user_mode(regs))
17274 + goto gp_in_kernel;
17275
17276 - if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
17277 - printk_ratelimit()) {
17278 - printk(KERN_INFO
17279 - "%s[%d] general protection ip:%lx sp:%lx error:%lx",
17280 - tsk->comm, tsk->pid,
17281 - regs->ip, regs->sp, error_code);
17282 - print_vma_addr(" in ", regs->ip);
17283 - printk("\n");
17284 - }
17285 + tsk->thread.error_code = error_code;
17286 + tsk->thread.trap_no = 13;
17287
17288 - force_sig(SIGSEGV, tsk);
17289 - return;
17290 - }
17291 + if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
17292 + printk_ratelimit()) {
17293 + printk(KERN_INFO
17294 + "%s[%d] general protection ip:%lx sp:%lx error:%lx",
17295 + tsk->comm, tsk->pid,
17296 + regs->ip, regs->sp, error_code);
17297 + print_vma_addr(" in ", regs->ip);
17298 + printk("\n");
17299 + }
17300
17301 + force_sig(SIGSEGV, tsk);
17302 + return;
17303 +
17304 +gp_in_kernel:
17305 if (fixup_exception(regs))
17306 return;
17307
17308 @@ -778,14 +744,14 @@ asmlinkage void __kprobes do_general_pro
17309 }
17310
17311 static notrace __kprobes void
17312 -mem_parity_error(unsigned char reason, struct pt_regs * regs)
17313 +mem_parity_error(unsigned char reason, struct pt_regs *regs)
17314 {
17315 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
17316 reason);
17317 printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
17318
17319 #if defined(CONFIG_EDAC)
17320 - if(edac_handler_set()) {
17321 + if (edac_handler_set()) {
17322 edac_atomic_assert_error();
17323 return;
17324 }
17325 @@ -801,7 +767,7 @@ mem_parity_error(unsigned char reason, s
17326 }
17327
17328 static notrace __kprobes void
17329 -io_check_error(unsigned char reason, struct pt_regs * regs)
17330 +io_check_error(unsigned char reason, struct pt_regs *regs)
17331 {
17332 printk("NMI: IOCK error (debug interrupt?)\n");
17333 show_registers(regs);
17334 @@ -827,14 +793,14 @@ unknown_nmi_error(unsigned char reason,
17335
17336 /* Runs on IST stack. This code must keep interrupts off all the time.
17337 Nested NMIs are prevented by the CPU. */
17338 -asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs)
17339 +asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs)
17340 {
17341 unsigned char reason = 0;
17342 int cpu;
17343
17344 cpu = smp_processor_id();
17345
17346 - /* Only the BSP gets external NMIs from the system. */
17347 + /* Only the BSP gets external NMIs from the system. */
17348 if (!cpu)
17349 reason = get_nmi_reason();
17350
17351 @@ -847,33 +813,58 @@ asmlinkage notrace __kprobes void defau
17352 * Ok, so this is none of the documented NMI sources,
17353 * so it must be the NMI watchdog.
17354 */
17355 - if (nmi_watchdog_tick(regs,reason))
17356 + if (nmi_watchdog_tick(regs, reason))
17357 return;
17358 #endif
17359 - if (!do_nmi_callback(regs,cpu))
17360 + if (!do_nmi_callback(regs, cpu))
17361 unknown_nmi_error(reason, regs);
17362
17363 return;
17364 }
17365 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
17366 - return;
17367 + return;
17368
17369 /* AK: following checks seem to be broken on modern chipsets. FIXME */
17370 -
17371 if (reason & 0x80)
17372 mem_parity_error(reason, regs);
17373 if (reason & 0x40)
17374 io_check_error(reason, regs);
17375 }
17376
17377 +asmlinkage notrace __kprobes void
17378 +do_nmi(struct pt_regs *regs, long error_code)
17379 +{
17380 + nmi_enter();
17381 +
17382 + add_pda(__nmi_count, 1);
17383 +
17384 + if (!ignore_nmis)
17385 + default_do_nmi(regs);
17386 +
17387 + nmi_exit();
17388 +}
17389 +
17390 +void stop_nmi(void)
17391 +{
17392 + acpi_nmi_disable();
17393 + ignore_nmis++;
17394 +}
17395 +
17396 +void restart_nmi(void)
17397 +{
17398 + ignore_nmis--;
17399 + acpi_nmi_enable();
17400 +}
17401 +
17402 /* runs on IST stack. */
17403 asmlinkage void __kprobes do_int3(struct pt_regs * regs, long error_code)
17404 {
17405 trace_hardirqs_fixup();
17406
17407 - if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) {
17408 + if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
17409 + == NOTIFY_STOP)
17410 return;
17411 - }
17412 +
17413 preempt_conditional_sti(regs);
17414 do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
17415 preempt_conditional_cli(regs);
17416 @@ -904,8 +895,8 @@ asmlinkage __kprobes struct pt_regs *syn
17417 asmlinkage void __kprobes do_debug(struct pt_regs * regs,
17418 unsigned long error_code)
17419 {
17420 - unsigned long condition;
17421 struct task_struct *tsk = current;
17422 + unsigned long condition;
17423 siginfo_t info;
17424
17425 trace_hardirqs_fixup();
17426 @@ -926,21 +917,19 @@ asmlinkage void __kprobes do_debug(struc
17427
17428 /* Mask out spurious debug traps due to lazy DR7 setting */
17429 if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
17430 - if (!tsk->thread.debugreg7) {
17431 + if (!tsk->thread.debugreg7)
17432 goto clear_dr7;
17433 - }
17434 }
17435
17436 tsk->thread.debugreg6 = condition;
17437
17438 -
17439 /*
17440 * Single-stepping through TF: make sure we ignore any events in
17441 * kernel space (but re-enable TF when returning to user mode).
17442 */
17443 if (condition & DR_STEP) {
17444 - if (!user_mode(regs))
17445 - goto clear_TF_reenable;
17446 + if (!user_mode(regs))
17447 + goto clear_TF_reenable;
17448 }
17449
17450 /* Ok, finally something we can handle */
17451 @@ -953,7 +942,7 @@ asmlinkage void __kprobes do_debug(struc
17452 force_sig_info(SIGTRAP, &info, tsk);
17453
17454 clear_dr7:
17455 - set_debugreg(0UL, 7);
17456 + set_debugreg(0, 7);
17457 preempt_conditional_cli(regs);
17458 return;
17459
17460 @@ -961,6 +950,7 @@ clear_TF_reenable:
17461 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
17462 regs->flags &= ~X86_EFLAGS_TF;
17463 preempt_conditional_cli(regs);
17464 + return;
17465 }
17466
17467 static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
17468 @@ -983,7 +973,7 @@ static int kernel_math_error(struct pt_r
17469 asmlinkage void do_coprocessor_error(struct pt_regs *regs)
17470 {
17471 void __user *ip = (void __user *)(regs->ip);
17472 - struct task_struct * task;
17473 + struct task_struct *task;
17474 siginfo_t info;
17475 unsigned short cwd, swd;
17476
17477 @@ -1016,30 +1006,30 @@ asmlinkage void do_coprocessor_error(str
17478 cwd = get_fpu_cwd(task);
17479 swd = get_fpu_swd(task);
17480 switch (swd & ~cwd & 0x3f) {
17481 - case 0x000:
17482 - default:
17483 - break;
17484 - case 0x001: /* Invalid Op */
17485 - /*
17486 - * swd & 0x240 == 0x040: Stack Underflow
17487 - * swd & 0x240 == 0x240: Stack Overflow
17488 - * User must clear the SF bit (0x40) if set
17489 - */
17490 - info.si_code = FPE_FLTINV;
17491 - break;
17492 - case 0x002: /* Denormalize */
17493 - case 0x010: /* Underflow */
17494 - info.si_code = FPE_FLTUND;
17495 - break;
17496 - case 0x004: /* Zero Divide */
17497 - info.si_code = FPE_FLTDIV;
17498 - break;
17499 - case 0x008: /* Overflow */
17500 - info.si_code = FPE_FLTOVF;
17501 - break;
17502 - case 0x020: /* Precision */
17503 - info.si_code = FPE_FLTRES;
17504 - break;
17505 + case 0x000: /* No unmasked exception */
17506 + default: /* Multiple exceptions */
17507 + break;
17508 + case 0x001: /* Invalid Op */
17509 + /*
17510 + * swd & 0x240 == 0x040: Stack Underflow
17511 + * swd & 0x240 == 0x240: Stack Overflow
17512 + * User must clear the SF bit (0x40) if set
17513 + */
17514 + info.si_code = FPE_FLTINV;
17515 + break;
17516 + case 0x002: /* Denormalize */
17517 + case 0x010: /* Underflow */
17518 + info.si_code = FPE_FLTUND;
17519 + break;
17520 + case 0x004: /* Zero Divide */
17521 + info.si_code = FPE_FLTDIV;
17522 + break;
17523 + case 0x008: /* Overflow */
17524 + info.si_code = FPE_FLTOVF;
17525 + break;
17526 + case 0x020: /* Precision */
17527 + info.si_code = FPE_FLTRES;
17528 + break;
17529 }
17530 force_sig_info(SIGFPE, &info, task);
17531 }
17532 @@ -1052,7 +1042,7 @@ asmlinkage void bad_intr(void)
17533 asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
17534 {
17535 void __user *ip = (void __user *)(regs->ip);
17536 - struct task_struct * task;
17537 + struct task_struct *task;
17538 siginfo_t info;
17539 unsigned short mxcsr;
17540
17541 @@ -1080,25 +1070,25 @@ asmlinkage void do_simd_coprocessor_erro
17542 */
17543 mxcsr = get_fpu_mxcsr(task);
17544 switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
17545 - case 0x000:
17546 - default:
17547 - break;
17548 - case 0x001: /* Invalid Op */
17549 - info.si_code = FPE_FLTINV;
17550 - break;
17551 - case 0x002: /* Denormalize */
17552 - case 0x010: /* Underflow */
17553 - info.si_code = FPE_FLTUND;
17554 - break;
17555 - case 0x004: /* Zero Divide */
17556 - info.si_code = FPE_FLTDIV;
17557 - break;
17558 - case 0x008: /* Overflow */
17559 - info.si_code = FPE_FLTOVF;
17560 - break;
17561 - case 0x020: /* Precision */
17562 - info.si_code = FPE_FLTRES;
17563 - break;
17564 + case 0x000:
17565 + default:
17566 + break;
17567 + case 0x001: /* Invalid Op */
17568 + info.si_code = FPE_FLTINV;
17569 + break;
17570 + case 0x002: /* Denormalize */
17571 + case 0x010: /* Underflow */
17572 + info.si_code = FPE_FLTUND;
17573 + break;
17574 + case 0x004: /* Zero Divide */
17575 + info.si_code = FPE_FLTDIV;
17576 + break;
17577 + case 0x008: /* Overflow */
17578 + info.si_code = FPE_FLTOVF;
17579 + break;
17580 + case 0x020: /* Precision */
17581 + info.si_code = FPE_FLTRES;
17582 + break;
17583 }
17584 force_sig_info(SIGFPE, &info, task);
17585 }
17586 @@ -1118,7 +1108,7 @@ asmlinkage void __attribute__((weak)) mc
17587 }
17588
17589 /*
17590 - * 'math_state_restore()' saves the current math information in the
17591 + * 'math_state_restore()' saves the current math information in the
17592 * old math state array, and gets the new ones from the current task
17593 *
17594 * Careful.. There are problems with IBM-designed IRQ13 behaviour.
17595 @@ -1145,7 +1135,14 @@ asmlinkage void math_state_restore(void)
17596
17597 /* clts(); */ /* 'clts' is done for us by Xen during virtual trap. */
17598
17599 - restore_fpu_checking(&me->thread.xstate->fxsave);
17600 + /*
17601 + * Paranoid restore. send a SIGSEGV if we fail to restore the state.
17602 + */
17603 + if (unlikely(restore_fpu_checking(&me->thread.xstate->fxsave))) {
17604 + stts();
17605 + force_sig(SIGSEGV, me);
17606 + return;
17607 + }
17608 task_thread_info(me)->status |= TS_USEDFPU;
17609 me->fpu_counter++;
17610 }
17611 @@ -1190,13 +1187,12 @@ void __init trap_init(void)
17612 ret = HYPERVISOR_set_trap_table(trap_table);
17613 if (ret)
17614 printk("HYPERVISOR_set_trap_table failed: error %d\n", ret);
17615 -
17616 /*
17617 * initialize the per thread extended state:
17618 */
17619 - init_thread_xstate();
17620 + init_thread_xstate();
17621 /*
17622 - * Should be a barrier for any external CPU state.
17623 + * Should be a barrier for any external CPU state:
17624 */
17625 cpu_init();
17626 }
17627 @@ -1212,27 +1208,25 @@ void __cpuinit smp_trap_init(trap_info_t
17628 }
17629 }
17630
17631 -
17632 static int __init oops_setup(char *s)
17633 -{
17634 +{
17635 if (!s)
17636 return -EINVAL;
17637 if (!strcmp(s, "panic"))
17638 panic_on_oops = 1;
17639 return 0;
17640 -}
17641 +}
17642 early_param("oops", oops_setup);
17643
17644 static int __init kstack_setup(char *s)
17645 {
17646 if (!s)
17647 return -EINVAL;
17648 - kstack_depth_to_print = simple_strtoul(s,NULL,0);
17649 + kstack_depth_to_print = simple_strtoul(s, NULL, 0);
17650 return 0;
17651 }
17652 early_param("kstack", kstack_setup);
17653
17654 -
17655 static int __init code_bytes_setup(char *s)
17656 {
17657 code_bytes = simple_strtoul(s, NULL, 0);
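
The traps hunks above are largely cosmetic: they reformat the switches in do_coprocessor_error() and do_simd_coprocessor_error() that turn the unmasked x87/SSE status bits — swd & ~cwd & 0x3f — into a SIGFPE si_code, and make math_state_restore() raise SIGSEGV when the saved FPU state cannot be restored. The following is a minimal user-space sketch of the same x87 decoding, handy for checking a control/status word pair by hand; it is not part of the patch and the sample values are illustrative only.

#include <stdio.h>

/* Same mapping as the switch in do_coprocessor_error(): only exception
 * bits that are flagged in the status word (swd) *and* unmasked in the
 * control word (cwd) are considered. */
static const char *x87_exception_name(unsigned short cwd, unsigned short swd)
{
	switch (swd & ~cwd & 0x3f) {
	case 0x000: return "no unmasked exception";
	case 0x001: return "invalid operation (FPE_FLTINV)";
	case 0x002: /* denormalized operand */
	case 0x010: return "underflow (FPE_FLTUND)";
	case 0x004: return "zero divide (FPE_FLTDIV)";
	case 0x008: return "overflow (FPE_FLTOVF)";
	case 0x020: return "precision (FPE_FLTRES)";
	default:    return "multiple unmasked exceptions";
	}
}

int main(void)
{
	/* cwd 0x037b leaves only the zero-divide exception unmasked,
	 * swd 0x0004 reports that it fired. */
	printf("%s\n", x87_exception_name(0x037b, 0x0004));
	return 0;
}
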
17658 --- sle11-2009-10-16.orig/arch/x86/kernel/vsyscall_64-xen.c 2009-03-16 16:38:05.000000000 +0100
17659 +++ sle11-2009-10-16/arch/x86/kernel/vsyscall_64-xen.c 2009-06-04 10:21:39.000000000 +0200
17660 @@ -42,7 +42,8 @@
17661 #include <asm/topology.h>
17662 #include <asm/vgtod.h>
17663
17664 -#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
17665 +#define __vsyscall(nr) \
17666 + __attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace
17667 #define __syscall_clobber "r11","cx","memory"
17668
17669 /*
17670 @@ -264,10 +265,7 @@ static void __cpuinit vsyscall_set_cpu(i
17671 d |= cpu;
17672 d |= (node & 0xf) << 12;
17673 d |= (node >> 4) << 48;
17674 - if (HYPERVISOR_update_descriptor(virt_to_machine(get_cpu_gdt_table(cpu)
17675 - + GDT_ENTRY_PER_CPU),
17676 - d))
17677 - BUG();
17678 + write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
17679 }
17680
17681 static void __cpuinit cpu_vsyscall_init(void *arg)
17682 @@ -281,7 +279,7 @@ cpu_vsyscall_notifier(struct notifier_bl
17683 {
17684 long cpu = (long)arg;
17685 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
17686 - smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 0, 1);
17687 + smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1);
17688 return NOTIFY_DONE;
17689 }
17690
17691 @@ -311,7 +309,7 @@ static int __init vsyscall_init(void)
17692 #ifdef CONFIG_SYSCTL
17693 register_sysctl_table(kernel_root_table2);
17694 #endif
17695 - on_each_cpu(cpu_vsyscall_init, NULL, 0, 1);
17696 + on_each_cpu(cpu_vsyscall_init, NULL, 1);
17697 hotcpu_notifier(cpu_vsyscall_notifier, 0);
17698 return 0;
17699 }
17700 --- sle11-2009-10-16.orig/arch/x86/mach-xen/setup.c 2009-03-16 16:33:40.000000000 +0100
17701 +++ sle11-2009-10-16/arch/x86/mach-xen/setup.c 2009-06-04 10:21:39.000000000 +0200
17702 @@ -17,6 +17,8 @@
17703 #include <xen/interface/callback.h>
17704 #include <xen/interface/memory.h>
17705
17706 +#ifdef CONFIG_X86_32
17707 +
17708 #ifdef CONFIG_HOTPLUG_CPU
17709 #define DEFAULT_SEND_IPI (1)
17710 #else
17711 @@ -44,51 +46,6 @@ static int __init print_ipi_mode(void)
17712
17713 late_initcall(print_ipi_mode);
17714
17715 -/**
17716 - * machine_specific_memory_setup - Hook for machine specific memory setup.
17717 - *
17718 - * Description:
17719 - * This is included late in kernel/setup.c so that it can make
17720 - * use of all of the static functions.
17721 - **/
17722 -
17723 -char * __init machine_specific_memory_setup(void)
17724 -{
17725 - int rc;
17726 - struct xen_memory_map memmap;
17727 - /*
17728 - * This is rather large for a stack variable but this early in
17729 - * the boot process we know we have plenty slack space.
17730 - */
17731 - struct e820entry map[E820MAX];
17732 -
17733 - memmap.nr_entries = E820MAX;
17734 - set_xen_guest_handle(memmap.buffer, map);
17735 -
17736 - rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
17737 - if ( rc == -ENOSYS ) {
17738 - memmap.nr_entries = 1;
17739 - map[0].addr = 0ULL;
17740 - map[0].size = PFN_PHYS((unsigned long long)xen_start_info->nr_pages);
17741 - /* 8MB slack (to balance backend allocations). */
17742 - map[0].size += 8ULL << 20;
17743 - map[0].type = E820_RAM;
17744 - rc = 0;
17745 - }
17746 - BUG_ON(rc);
17747 -
17748 - sanitize_e820_map(map, (char *)&memmap.nr_entries);
17749 -
17750 - BUG_ON(copy_e820_map(map, (char)memmap.nr_entries) < 0);
17751 -
17752 - return "Xen";
17753 -}
17754 -
17755 -
17756 -extern void hypervisor_callback(void);
17757 -extern void failsafe_callback(void);
17758 -extern void nmi(void);
17759 -
17760 unsigned long *machine_to_phys_mapping = (void *)MACH2PHYS_VIRT_START;
17761 EXPORT_SYMBOL(machine_to_phys_mapping);
17762 unsigned int machine_to_phys_order;
17763 @@ -121,33 +78,66 @@ void __init pre_setup_arch_hook(void)
17764 (unsigned long *)xen_start_info->mfn_list;
17765 }
17766
17767 +#endif /* CONFIG_X86_32 */
17768 +
17769 +extern void hypervisor_callback(void);
17770 +extern void failsafe_callback(void);
17771 +extern void nmi(void);
17772 +
17773 +#ifdef CONFIG_X86_64
17774 +#include <asm/proto.h>
17775 +#define CALLBACK_ADDR(fn) ((unsigned long)(fn))
17776 +#else
17777 +#define CALLBACK_ADDR(fn) { __KERNEL_CS, (unsigned long)(fn) }
17778 +#endif
17779 +
17780 void __init machine_specific_arch_setup(void)
17781 {
17782 int ret;
17783 static struct callback_register __initdata event = {
17784 .type = CALLBACKTYPE_event,
17785 - .address = { __KERNEL_CS, (unsigned long)hypervisor_callback },
17786 + .address = CALLBACK_ADDR(hypervisor_callback)
17787 };
17788 static struct callback_register __initdata failsafe = {
17789 .type = CALLBACKTYPE_failsafe,
17790 - .address = { __KERNEL_CS, (unsigned long)failsafe_callback },
17791 + .address = CALLBACK_ADDR(failsafe_callback)
17792 + };
17793 +#ifdef CONFIG_X86_64
17794 + static struct callback_register __initdata syscall = {
17795 + .type = CALLBACKTYPE_syscall,
17796 + .address = CALLBACK_ADDR(system_call)
17797 };
17798 +#endif
17799 +#if defined(CONFIG_X86_LOCAL_APIC) || defined(CONFIG_X86_32)
17800 static struct callback_register __initdata nmi_cb = {
17801 .type = CALLBACKTYPE_nmi,
17802 - .address = { __KERNEL_CS, (unsigned long)nmi },
17803 + .address = CALLBACK_ADDR(nmi)
17804 };
17805 +#endif
17806
17807 ret = HYPERVISOR_callback_op(CALLBACKOP_register, &event);
17808 if (ret == 0)
17809 ret = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe);
17810 +#ifdef CONFIG_X86_64
17811 + if (ret == 0)
17812 + ret = HYPERVISOR_callback_op(CALLBACKOP_register, &syscall);
17813 +#endif
17814 #if CONFIG_XEN_COMPAT <= 0x030002
17815 +#ifdef CONFIG_X86_32
17816 if (ret == -ENOSYS)
17817 ret = HYPERVISOR_set_callbacks(
17818 event.address.cs, event.address.eip,
17819 failsafe.address.cs, failsafe.address.eip);
17820 +#else
17821 + ret = HYPERVISOR_set_callbacks(
17822 + event.address,
17823 + failsafe.address,
17824 + syscall.address);
17825 +#endif
17826 #endif
17827 BUG_ON(ret);
17828
17829 +#if defined(CONFIG_X86_LOCAL_APIC) || defined(CONFIG_X86_32)
17830 ret = HYPERVISOR_callback_op(CALLBACKOP_register, &nmi_cb);
17831 #if CONFIG_XEN_COMPAT <= 0x030002
17832 if (ret == -ENOSYS) {
17833 @@ -158,15 +148,43 @@ void __init machine_specific_arch_setup(
17834 HYPERVISOR_nmi_op(XENNMI_register_callback, &cb);
17835 }
17836 #endif
17837 +#endif
17838
17839 +#ifdef CONFIG_X86_32
17840 /* Do an early initialization of the fixmap area */
17841 {
17842 extern pte_t swapper_pg_fixmap[PTRS_PER_PTE];
17843 unsigned long addr = __fix_to_virt(FIX_EARLYCON_MEM_BASE);
17844 pud_t *pud = pud_offset(swapper_pg_dir + pgd_index(addr), addr);
17845 pmd_t *pmd = pmd_offset(pud, addr);
17846 + unsigned int i;
17847
17848 make_lowmem_page_readonly(swapper_pg_fixmap, XENFEAT_writable_page_tables);
17849 set_pmd(pmd, __pmd(__pa_symbol(swapper_pg_fixmap) | _PAGE_TABLE));
17850 +
17851 +#define __FIXADDR_TOP (-PAGE_SIZE)
17852 +#define FIX_BUG_ON(fix) BUILD_BUG_ON(pmd_index(__fix_to_virt(FIX_##fix)) \
17853 + != pmd_index(__fix_to_virt(FIX_EARLYCON_MEM_BASE)))
17854 + FIX_BUG_ON(SHARED_INFO);
17855 + FIX_BUG_ON(ISAMAP_BEGIN);
17856 + FIX_BUG_ON(ISAMAP_END);
17857 +#undef __FIXADDR_TOP
17858 + BUG_ON(pte_index(hypervisor_virt_start));
17859 +
17860 + /* Switch to the real shared_info page, and clear the
17861 + * dummy page. */
17862 + set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
17863 + HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
17864 + memset(empty_zero_page, 0, sizeof(empty_zero_page));
17865 +
17866 + /* Setup mapping of lower 1st MB */
17867 + for (i = 0; i < NR_FIX_ISAMAPS; i++)
17868 + if (is_initial_xendomain())
17869 + set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE);
17870 + else
17871 + __set_fixmap(FIX_ISAMAP_BEGIN - i,
17872 + virt_to_machine(empty_zero_page),
17873 + PAGE_KERNEL_RO);
17874 }
17875 +#endif
17876 }
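
The setup.c changes above fold the 32- and 64-bit callback registration into one machine_specific_arch_setup() by hiding the difference behind a CALLBACK_ADDR() macro: on 64-bit a Xen callback address is a plain unsigned long, on 32-bit it is a {selector, offset} pair. A stand-alone sketch of that macro pattern follows; all types and names here are toy stand-ins, not hypervisor code.

#include <stdio.h>

#ifdef TOY_64BIT
typedef unsigned long toy_callback_addr_t;
#define TOY_CALLBACK_ADDR(fn)	((unsigned long)(fn))
#else
typedef struct { unsigned short cs; unsigned long offset; } toy_callback_addr_t;
#define TOY_KERNEL_CS		0x61	/* made-up selector value */
#define TOY_CALLBACK_ADDR(fn)	{ TOY_KERNEL_CS, (unsigned long)(fn) }
#endif

struct toy_callback_register {
	int type;
	toy_callback_addr_t address;
};

static void toy_event_handler(void)
{
}

int main(void)
{
	/* The same initializer works for both layouts because the macro
	 * expands either to a scalar or to a brace-enclosed pair. */
	struct toy_callback_register event = {
		.type	 = 1,
		.address = TOY_CALLBACK_ADDR(toy_event_handler),
	};

	printf("callback payload is %zu bytes\n", sizeof(event.address));
	return 0;
}
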
17877 --- sle11-2009-10-16.orig/arch/x86/mm/fault-xen.c 2009-03-16 16:38:05.000000000 +0100
17878 +++ sle11-2009-10-16/arch/x86/mm/fault-xen.c 2009-06-04 10:21:39.000000000 +0200
17879 @@ -10,6 +10,7 @@
17880 #include <linux/string.h>
17881 #include <linux/types.h>
17882 #include <linux/ptrace.h>
17883 +#include <linux/mmiotrace.h>
17884 #include <linux/mman.h>
17885 #include <linux/mm.h>
17886 #include <linux/smp.h>
17887 @@ -49,17 +50,23 @@
17888 #define PF_RSVD (1<<3)
17889 #define PF_INSTR (1<<4)
17890
17891 +static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
17892 +{
17893 +#ifdef CONFIG_MMIOTRACE_HOOKS
17894 + if (unlikely(is_kmmio_active()))
17895 + if (kmmio_handler(regs, addr) == 1)
17896 + return -1;
17897 +#endif
17898 + return 0;
17899 +}
17900 +
17901 static inline int notify_page_fault(struct pt_regs *regs)
17902 {
17903 #ifdef CONFIG_KPROBES
17904 int ret = 0;
17905
17906 /* kprobe_running() needs smp_processor_id() */
17907 -#ifdef CONFIG_X86_32
17908 if (!user_mode_vm(regs)) {
17909 -#else
17910 - if (!user_mode(regs)) {
17911 -#endif
17912 preempt_disable();
17913 if (kprobe_running() && kprobe_fault_handler(regs, 14))
17914 ret = 1;
17915 @@ -409,11 +416,7 @@ static void show_fault_oops(struct pt_re
17916 printk(KERN_CONT "NULL pointer dereference");
17917 else
17918 printk(KERN_CONT "paging request");
17919 -#ifdef CONFIG_X86_32
17920 - printk(KERN_CONT " at %08lx\n", address);
17921 -#else
17922 - printk(KERN_CONT " at %016lx\n", address);
17923 -#endif
17924 + printk(KERN_CONT " at %p\n", (void *) address);
17925 printk(KERN_ALERT "IP:");
17926 printk_address(regs->ip, 1);
17927 dump_pagetable(address);
17928 @@ -628,6 +631,8 @@ void __kprobes do_page_fault(struct pt_r
17929
17930 if (notify_page_fault(regs))
17931 return;
17932 + if (unlikely(kmmio_fault(regs, address)))
17933 + return;
17934
17935 /*
17936 * We fault-in kernel-space virtual memory on-demand. The
17937 @@ -831,14 +836,10 @@ bad_area_nosemaphore:
17938 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
17939 printk_ratelimit()) {
17940 printk(
17941 -#ifdef CONFIG_X86_32
17942 - "%s%s[%d]: segfault at %lx ip %08lx sp %08lx error %lx",
17943 -#else
17944 - "%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx",
17945 -#endif
17946 + "%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
17947 task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
17948 - tsk->comm, task_pid_nr(tsk), address, regs->ip,
17949 - regs->sp, error_code);
17950 + tsk->comm, task_pid_nr(tsk), address,
17951 + (void *) regs->ip, (void *) regs->sp, error_code);
17952 print_vma_addr(" in ", regs->ip);
17953 printk("\n");
17954 }
17955 @@ -946,81 +947,45 @@ LIST_HEAD(pgd_list);
17956 void vmalloc_sync_all(void)
17957 {
17958 #ifdef CONFIG_X86_32
17959 - /*
17960 - * Note that races in the updates of insync and start aren't
17961 - * problematic: insync can only get set bits added, and updates to
17962 - * start are only improving performance (without affecting correctness
17963 - * if undone).
17964 - * XEN: To work on PAE, we need to iterate over PMDs rather than PGDs.
17965 - * This change works just fine with 2-level paging too.
17966 - */
17967 -#define sync_index(a) ((a) >> PMD_SHIFT)
17968 - static DECLARE_BITMAP(insync, PTRS_PER_PGD*PTRS_PER_PMD);
17969 - static unsigned long start = TASK_SIZE;
17970 - unsigned long address;
17971 + unsigned long address = VMALLOC_START & PGDIR_MASK;
17972
17973 if (SHARED_KERNEL_PMD)
17974 return;
17975
17976 BUILD_BUG_ON(TASK_SIZE & ~PMD_MASK);
17977 - for (address = start;
17978 - address < hypervisor_virt_start;
17979 - address += PMD_SIZE) {
17980 - if (!test_bit(sync_index(address), insync)) {
17981 - unsigned long flags;
17982 - struct page *page;
17983 -
17984 - spin_lock_irqsave(&pgd_lock, flags);
17985 - /* XEN: failure path assumes non-empty pgd_list. */
17986 - if (unlikely(list_empty(&pgd_list))) {
17987 - spin_unlock_irqrestore(&pgd_lock, flags);
17988 - return;
17989 - }
17990 - list_for_each_entry(page, &pgd_list, lru) {
17991 - if (!vmalloc_sync_one(page_address(page),
17992 - address))
17993 - break;
17994 - }
17995 - spin_unlock_irqrestore(&pgd_lock, flags);
17996 - if (!page)
17997 - set_bit(sync_index(address), insync);
17998 + for (; address < hypervisor_virt_start; address += PMD_SIZE) {
17999 + unsigned long flags;
18000 + struct page *page;
18001 +
18002 + spin_lock_irqsave(&pgd_lock, flags);
18003 + list_for_each_entry(page, &pgd_list, lru) {
18004 + if (!vmalloc_sync_one(page_address(page),
18005 + address))
18006 + break;
18007 }
18008 - if (address == start && test_bit(sync_index(address), insync))
18009 - start = address + PMD_SIZE;
18010 + spin_unlock_irqrestore(&pgd_lock, flags);
18011 }
18012 #else /* CONFIG_X86_64 */
18013 - /*
18014 - * Note that races in the updates of insync and start aren't
18015 - * problematic: insync can only get set bits added, and updates to
18016 - * start are only improving performance (without affecting correctness
18017 - * if undone).
18018 - */
18019 - static DECLARE_BITMAP(insync, PTRS_PER_PGD);
18020 - static unsigned long start = VMALLOC_START & PGDIR_MASK;
18021 + unsigned long start = VMALLOC_START & PGDIR_MASK;
18022 unsigned long address;
18023
18024 for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
18025 - if (!test_bit(pgd_index(address), insync)) {
18026 - const pgd_t *pgd_ref = pgd_offset_k(address);
18027 - unsigned long flags;
18028 - struct page *page;
18029 -
18030 - if (pgd_none(*pgd_ref))
18031 - continue;
18032 - spin_lock_irqsave(&pgd_lock, flags);
18033 - list_for_each_entry(page, &pgd_list, lru) {
18034 - pgd_t *pgd;
18035 - pgd = (pgd_t *)page_address(page) + pgd_index(address);
18036 - if (pgd_none(*pgd))
18037 - set_pgd(pgd, *pgd_ref);
18038 - else
18039 - BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
18040 - }
18041 - spin_unlock_irqrestore(&pgd_lock, flags);
18042 - set_bit(pgd_index(address), insync);
18043 + const pgd_t *pgd_ref = pgd_offset_k(address);
18044 + unsigned long flags;
18045 + struct page *page;
18046 +
18047 + if (pgd_none(*pgd_ref))
18048 + continue;
18049 + spin_lock_irqsave(&pgd_lock, flags);
18050 + list_for_each_entry(page, &pgd_list, lru) {
18051 + pgd_t *pgd;
18052 + pgd = (pgd_t *)page_address(page) + pgd_index(address);
18053 + if (pgd_none(*pgd))
18054 + set_pgd(pgd, *pgd_ref);
18055 + else
18056 + BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
18057 }
18058 - if (address == start)
18059 - start = address + PGDIR_SIZE;
18060 + spin_unlock_irqrestore(&pgd_lock, flags);
18061 }
18062 #endif
18063 }
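
Several of the fault-path hunks above exist only to drop CONFIG_X86_32/CONFIG_X86_64 printf-format ifdefs: printing the faulting address through %p with a (void *) cast gives the natural width on either word size. A trivial user-space illustration of that idea, with an arbitrary address value:

#include <stdio.h>

int main(void)
{
	unsigned long address = 0xdeadbeefUL;

	/* Old style: the field width had to be picked per word size
	 * (%08lx on 32-bit, %016lx on 64-bit). */
	printf("at %0*lx\n", (int)(2 * sizeof(long)), address);

	/* New style: %p sizes itself to the pointer width. */
	printf("at %p\n", (void *)address);
	return 0;
}
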
18064 --- sle11-2009-10-16.orig/arch/x86/mm/hypervisor.c 2009-05-14 11:18:39.000000000 +0200
18065 +++ sle11-2009-10-16/arch/x86/mm/hypervisor.c 2009-06-04 10:21:39.000000000 +0200
18066 @@ -709,6 +709,72 @@ void xen_destroy_contiguous_region(unsig
18067 }
18068 EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region);
18069
18070 +int __init early_create_contiguous_region(unsigned long pfn,
18071 + unsigned int order,
18072 + unsigned int address_bits)
18073 +{
18074 + unsigned long *in_frames = discontig_frames, out_frame = pfn;
18075 + unsigned int i;
18076 + int rc, success;
18077 + struct xen_memory_exchange exchange = {
18078 + .in = {
18079 + .nr_extents = 1UL << order,
18080 + .extent_order = 0,
18081 + .domid = DOMID_SELF
18082 + },
18083 + .out = {
18084 + .nr_extents = 1,
18085 + .extent_order = order,
18086 + .address_bits = address_bits,
18087 + .domid = DOMID_SELF
18088 + }
18089 + };
18090 +
18091 + if (xen_feature(XENFEAT_auto_translated_physmap))
18092 + return 0;
18093 +
18094 + if (unlikely(order > MAX_CONTIG_ORDER))
18095 + return -ENOMEM;
18096 +
18097 + for (i = 0; i < (1U << order); ++i) {
18098 + in_frames[i] = pfn_to_mfn(pfn + i);
18099 + set_phys_to_machine(pfn + i, INVALID_P2M_ENTRY);
18100 + }
18101 +
18102 + set_xen_guest_handle(exchange.in.extent_start, in_frames);
18103 + set_xen_guest_handle(exchange.out.extent_start, &out_frame);
18104 +
18105 + rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
18106 + success = (exchange.nr_exchanged == (1UL << order));
18107 + BUG_ON(!success && (exchange.nr_exchanged || !rc));
18108 + BUG_ON(success && rc);
18109 +#if CONFIG_XEN_COMPAT <= 0x030002
18110 + if (unlikely(rc == -ENOSYS)) {
18111 + /* Compatibility when XENMEM_exchange is unavailable. */
18112 + if (HYPERVISOR_memory_op(XENMEM_decrease_reservation,
18113 + &exchange.in) != (1UL << order))
18114 + BUG();
18115 + success = (HYPERVISOR_memory_op(XENMEM_populate_physmap,
18116 + &exchange.out) == 1);
18117 + if (!success) {
18118 + for (i = 0; i < (1U << order); ++i)
18119 + in_frames[i] = pfn + i;
18120 + if (HYPERVISOR_memory_op(XENMEM_populate_physmap,
18121 + &exchange.in) != (1UL << order))
18122 + BUG();
18123 + }
18124 + }
18125 +#endif
18126 +
18127 + for (i = 0; i < (1U << order); ++i, ++out_frame) {
18128 + if (!success)
18129 + out_frame = in_frames[i];
18130 + set_phys_to_machine(pfn + i, out_frame);
18131 + }
18132 +
18133 + return success ? 0 : -ENOMEM;
18134 +}
18135 +
18136 static void undo_limit_pages(struct page *pages, unsigned int order)
18137 {
18138 BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
18139 @@ -875,42 +941,9 @@ int write_ldt_entry(struct desc_struct *
18140 return HYPERVISOR_update_descriptor(mach_lp, *(const u64*)desc);
18141 }
18142
18143 -#define MAX_BATCHED_FULL_PTES 32
18144 -
18145 -int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
18146 - unsigned long addr, unsigned long end, pgprot_t newprot,
18147 - int dirty_accountable)
18148 +int write_gdt_entry(struct desc_struct *gdt, int entry, const void *desc,
18149 + int type)
18150 {
18151 - int rc = 0, i = 0;
18152 - mmu_update_t u[MAX_BATCHED_FULL_PTES];
18153 - pte_t *pte;
18154 - spinlock_t *ptl;
18155 -
18156 - if (!xen_feature(XENFEAT_mmu_pt_update_preserve_ad))
18157 - return 0;
18158 -
18159 - pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
18160 - do {
18161 - if (pte_present(*pte)) {
18162 - pte_t ptent = pte_modify(*pte, newprot);
18163 -
18164 - if (dirty_accountable && pte_dirty(ptent))
18165 - ptent = pte_mkwrite(ptent);
18166 - u[i].ptr = (__pmd_val(*pmd) & PHYSICAL_PAGE_MASK)
18167 - | ((unsigned long)pte & ~PAGE_MASK)
18168 - | MMU_PT_UPDATE_PRESERVE_AD;
18169 - u[i].val = __pte_val(ptent);
18170 - if (++i == MAX_BATCHED_FULL_PTES) {
18171 - if ((rc = HYPERVISOR_mmu_update(
18172 - &u[0], i, NULL, DOMID_SELF)) != 0)
18173 - break;
18174 - i = 0;
18175 - }
18176 - }
18177 - } while (pte++, addr += PAGE_SIZE, addr != end);
18178 - if (i)
18179 - rc = HYPERVISOR_mmu_update( &u[0], i, NULL, DOMID_SELF);
18180 - pte_unmap_unlock(pte - 1, ptl);
18181 - BUG_ON(rc && rc != -ENOSYS);
18182 - return !rc;
18183 + maddr_t mach_gp = virt_to_machine(gdt + entry);
18184 + return HYPERVISOR_update_descriptor(mach_gp, *(const u64*)desc);
18185 }
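
The new early_create_contiguous_region() above trades 2^order scattered machine frames for one machine-contiguous extent via XENMEM_exchange and then rewrites the phys-to-machine map — or, if the exchange failed, puts the original frames back. Below is a user-space model of just that final bookkeeping loop, using toy arrays and values and no hypercalls.

#include <stdio.h>

#define ORDER		2
#define NR_FRAMES	(1U << ORDER)

static unsigned long p2m[NR_FRAMES];		/* toy phys-to-machine map */

/* Mirror of the final loop in early_create_contiguous_region(): on success
 * the pfns map to consecutive machine frames starting at out_frame, on
 * failure each pfn gets its original machine frame back. */
static void update_p2m(unsigned long pfn, const unsigned long *in_frames,
		       unsigned long out_frame, int success)
{
	unsigned int i;

	for (i = 0; i < NR_FRAMES; ++i, ++out_frame) {
		if (!success)
			out_frame = in_frames[i];
		p2m[pfn + i] = out_frame;
	}
}

int main(void)
{
	unsigned long in_frames[NR_FRAMES] = { 830, 17, 402, 95 };	/* scattered */
	unsigned int i;

	update_p2m(0, in_frames, 512, 1);	/* pretend the exchange worked */
	for (i = 0; i < NR_FRAMES; ++i)
		printf("pfn %u -> mfn %lu\n", i, p2m[i]);

	update_p2m(0, in_frames, 512, 0);	/* pretend it failed */
	for (i = 0; i < NR_FRAMES; ++i)
		printf("pfn %u -> mfn %lu (restored)\n", i, p2m[i]);
	return 0;
}
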
18186 --- sle11-2009-10-16.orig/arch/x86/mm/init_32-xen.c 2009-03-16 16:38:05.000000000 +0100
18187 +++ sle11-2009-10-16/arch/x86/mm/init_32-xen.c 2009-06-04 10:21:39.000000000 +0200
18188 @@ -54,6 +54,7 @@
18189
18190 unsigned int __VMALLOC_RESERVE = 128 << 20;
18191
18192 +unsigned long max_low_pfn_mapped;
18193 unsigned long max_pfn_mapped;
18194
18195 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
18196 @@ -61,6 +62,27 @@ unsigned long highstart_pfn, highend_pfn
18197
18198 static noinline int do_test_wp_bit(void);
18199
18200 +
18201 +static unsigned long __initdata table_start;
18202 +static unsigned long __initdata table_end;
18203 +static unsigned long __initdata table_top;
18204 +
18205 +static int __initdata after_init_bootmem;
18206 +
18207 +static __init void *alloc_low_page(unsigned long *phys)
18208 +{
18209 + unsigned long pfn = table_end++;
18210 + void *adr;
18211 +
18212 + if (pfn >= table_top)
18213 + panic("alloc_low_page: ran out of memory");
18214 +
18215 + adr = __va(pfn * PAGE_SIZE);
18216 + memset(adr, 0, PAGE_SIZE);
18217 + *phys = pfn * PAGE_SIZE;
18218 + return adr;
18219 +}
18220 +
18221 /*
18222 * Creates a middle page table and puts a pointer to it in the
18223 * given global directory entry. This only returns the gd entry
18224 @@ -72,9 +94,12 @@ static pmd_t * __init one_md_table_init(
18225 pmd_t *pmd_table;
18226
18227 #ifdef CONFIG_X86_PAE
18228 + unsigned long phys;
18229 if (!(__pgd_val(*pgd) & _PAGE_PRESENT)) {
18230 - pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
18231 -
18232 + if (after_init_bootmem)
18233 + pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
18234 + else
18235 + pmd_table = (pmd_t *)alloc_low_page(&phys);
18236 paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
18237 make_lowmem_page_readonly(pmd_table, XENFEAT_writable_page_tables);
18238 set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
18239 @@ -101,12 +126,16 @@ static pte_t * __init one_page_table_ini
18240 #endif
18241 pte_t *page_table = NULL;
18242
18243 + if (after_init_bootmem) {
18244 #ifdef CONFIG_DEBUG_PAGEALLOC
18245 - page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
18246 + page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
18247 #endif
18248 - if (!page_table) {
18249 - page_table =
18250 + if (!page_table)
18251 + page_table =
18252 (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE);
18253 + } else {
18254 + unsigned long phys;
18255 + page_table = (pte_t *)alloc_low_page(&phys);
18256 }
18257
18258 paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT);
18259 @@ -167,24 +196,24 @@ static inline int is_kernel_text(unsigne
18260 * of max_low_pfn pages, by creating page tables starting from address
18261 * PAGE_OFFSET:
18262 */
18263 -static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
18264 +static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
18265 + unsigned long start_pfn,
18266 + unsigned long end_pfn,
18267 + int use_pse)
18268 {
18269 int pgd_idx, pmd_idx, pte_ofs;
18270 unsigned long pfn;
18271 pgd_t *pgd;
18272 pmd_t *pmd;
18273 pte_t *pte;
18274 + unsigned pages_2m = 0, pages_4k = 0;
18275
18276 - unsigned long max_ram_pfn = xen_start_info->nr_pages;
18277 - if (max_ram_pfn > max_low_pfn)
18278 - max_ram_pfn = max_low_pfn;
18279 + if (!cpu_has_pse)
18280 + use_pse = 0;
18281
18282 - pgd_idx = pgd_index(PAGE_OFFSET);
18283 + pfn = start_pfn;
18284 + pgd_idx = pgd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
18285 pgd = pgd_base + pgd_idx;
18286 - pfn = 0;
18287 - pmd_idx = pmd_index(PAGE_OFFSET);
18288 - pte_ofs = pte_index(PAGE_OFFSET);
18289 -
18290 for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
18291 #ifdef CONFIG_XEN
18292 /*
18293 @@ -198,10 +227,16 @@ static void __init kernel_physical_mappi
18294 #else
18295 pmd = one_md_table_init(pgd);
18296 #endif
18297 - if (pfn >= max_low_pfn)
18298 +
18299 + if (pfn >= end_pfn)
18300 continue;
18301 +#ifdef CONFIG_X86_PAE
18302 + pmd_idx = pmd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
18303 pmd += pmd_idx;
18304 - for (; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn;
18305 +#else
18306 + pmd_idx = 0;
18307 +#endif
18308 + for (; pmd_idx < PTRS_PER_PMD && pfn < end_pfn;
18309 pmd++, pmd_idx++) {
18310 unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET;
18311
18312 @@ -211,13 +246,8 @@ static void __init kernel_physical_mappi
18313 /*
18314 * Map with big pages if possible, otherwise
18315 * create normal page tables:
18316 - *
18317 - * Don't use a large page for the first 2/4MB of memory
18318 - * because there are often fixed size MTRRs in there
18319 - * and overlapping MTRRs into large pages can cause
18320 - * slowdowns.
18321 */
18322 - if (cpu_has_pse && !(pgd_idx == 0 && pmd_idx == 0)) {
18323 + if (use_pse) {
18324 unsigned int addr2;
18325 pgprot_t prot = PAGE_KERNEL_LARGE;
18326
18327 @@ -228,49 +258,35 @@ static void __init kernel_physical_mappi
18328 is_kernel_text(addr2))
18329 prot = PAGE_KERNEL_LARGE_EXEC;
18330
18331 + pages_2m++;
18332 set_pmd(pmd, pfn_pmd(pfn, prot));
18333
18334 pfn += PTRS_PER_PTE;
18335 - max_pfn_mapped = pfn;
18336 continue;
18337 }
18338 pte = one_page_table_init(pmd);
18339
18340 - for (pte += pte_ofs;
18341 - pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn;
18342 + pte_ofs = pte_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
18343 + pte += pte_ofs;
18344 + for (; pte_ofs < PTRS_PER_PTE && pfn < end_pfn;
18345 pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) {
18346 pgprot_t prot = PAGE_KERNEL;
18347
18348 /* XEN: Only map initial RAM allocation. */
18349 - if ((pfn >= max_ram_pfn) || pte_present(*pte))
18350 + if (pfn >= xen_start_info->nr_pages || pte_present(*pte))
18351 continue;
18352 if (is_kernel_text(addr))
18353 prot = PAGE_KERNEL_EXEC;
18354
18355 + pages_4k++;
18356 set_pte(pte, pfn_pte(pfn, prot));
18357 }
18358 - max_pfn_mapped = pfn;
18359 - pte_ofs = 0;
18360 }
18361 - pmd_idx = 0;
18362 }
18363 + update_page_count(PG_LEVEL_2M, pages_2m);
18364 + update_page_count(PG_LEVEL_4K, pages_4k);
18365 }
18366
18367 -#ifndef CONFIG_XEN
18368 -
18369 -static inline int page_kills_ppro(unsigned long pagenr)
18370 -{
18371 - if (pagenr >= 0x70000 && pagenr <= 0x7003F)
18372 - return 1;
18373 - return 0;
18374 -}
18375 -
18376 -#else
18377 -
18378 -#define page_kills_ppro(p) 0
18379 -
18380 -#endif
18381 -
18382 /*
18383 * devmem_is_allowed() checks to see if /dev/mem access to a certain address
18384 * is valid. The argument is a physical page number.
18385 @@ -331,30 +347,63 @@ static void __init permanent_kmaps_init(
18386 pkmap_page_table = pte;
18387 }
18388
18389 -void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro)
18390 +static void __init add_one_highpage_init(struct page *page, int pfn)
18391 {
18392 - if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) {
18393 - ClearPageReserved(page);
18394 - init_page_count(page);
18395 - if (pfn < xen_start_info->nr_pages)
18396 - __free_page(page);
18397 - totalhigh_pages++;
18398 - } else
18399 - SetPageReserved(page);
18400 + ClearPageReserved(page);
18401 + init_page_count(page);
18402 + if (pfn < xen_start_info->nr_pages)
18403 + __free_page(page);
18404 + totalhigh_pages++;
18405 +}
18406 +
18407 +struct add_highpages_data {
18408 + unsigned long start_pfn;
18409 + unsigned long end_pfn;
18410 +};
18411 +
18412 +static int __init add_highpages_work_fn(unsigned long start_pfn,
18413 + unsigned long end_pfn, void *datax)
18414 +{
18415 + int node_pfn;
18416 + struct page *page;
18417 + unsigned long final_start_pfn, final_end_pfn;
18418 + struct add_highpages_data *data;
18419 +
18420 + data = (struct add_highpages_data *)datax;
18421 +
18422 + final_start_pfn = max(start_pfn, data->start_pfn);
18423 + final_end_pfn = min(end_pfn, data->end_pfn);
18424 + if (final_start_pfn >= final_end_pfn)
18425 + return 0;
18426 +
18427 + for (node_pfn = final_start_pfn; node_pfn < final_end_pfn;
18428 + node_pfn++) {
18429 + if (!pfn_valid(node_pfn))
18430 + continue;
18431 + page = pfn_to_page(node_pfn);
18432 + add_one_highpage_init(page, node_pfn);
18433 + }
18434 +
18435 + return 0;
18436 +
18437 +}
18438 +
18439 +void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn,
18440 + unsigned long end_pfn)
18441 +{
18442 + struct add_highpages_data data;
18443 +
18444 + data.start_pfn = start_pfn;
18445 + data.end_pfn = end_pfn;
18446 +
18447 + work_with_active_regions(nid, add_highpages_work_fn, &data);
18448 }
18449
18450 #ifndef CONFIG_NUMA
18451 -static void __init set_highmem_pages_init(int bad_ppro)
18452 +static void __init set_highmem_pages_init(void)
18453 {
18454 - int pfn;
18455 + add_highpages_with_active_regions(0, highstart_pfn, highend_pfn);
18456
18457 - for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) {
18458 - /*
18459 - * Holes under sparsemem might not have no mem_map[]:
18460 - */
18461 - if (pfn_valid(pfn))
18462 - add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro);
18463 - }
18464 totalram_pages += totalhigh_pages;
18465 }
18466 #endif /* !CONFIG_NUMA */
18467 @@ -362,24 +411,11 @@ static void __init set_highmem_pages_ini
18468 #else
18469 # define kmap_init() do { } while (0)
18470 # define permanent_kmaps_init(pgd_base) do { } while (0)
18471 -# define set_highmem_pages_init(bad_ppro) do { } while (0)
18472 +# define set_highmem_pages_init() do { } while (0)
18473 #endif /* CONFIG_HIGHMEM */
18474
18475 -pteval_t __PAGE_KERNEL = _PAGE_KERNEL;
18476 -EXPORT_SYMBOL(__PAGE_KERNEL);
18477 -
18478 -pteval_t __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
18479 -
18480 pgd_t *swapper_pg_dir;
18481
18482 -static void __init xen_pagetable_setup_start(pgd_t *base)
18483 -{
18484 -}
18485 -
18486 -static void __init xen_pagetable_setup_done(pgd_t *base)
18487 -{
18488 -}
18489 -
18490 /*
18491 * Build a proper pagetable for the kernel mappings. Up until this
18492 * point, we've been running on some set of pagetables constructed by
18493 @@ -399,27 +435,10 @@ static void __init xen_pagetable_setup_d
18494 * be partially populated, and so it avoids stomping on any existing
18495 * mappings.
18496 */
18497 -static void __init pagetable_init(void)
18498 +static void __init early_ioremap_page_table_range_init(pgd_t *pgd_base)
18499 {
18500 - pgd_t *pgd_base = (pgd_t *)xen_start_info->pt_base;
18501 unsigned long vaddr, end;
18502
18503 - xen_pagetable_setup_start(pgd_base);
18504 -
18505 - /* Enable PSE if available */
18506 - if (cpu_has_pse)
18507 - set_in_cr4(X86_CR4_PSE);
18508 -
18509 - /* Enable PGE if available */
18510 - if (cpu_has_pge) {
18511 - set_in_cr4(X86_CR4_PGE);
18512 - __PAGE_KERNEL |= _PAGE_GLOBAL;
18513 - __PAGE_KERNEL_EXEC |= _PAGE_GLOBAL;
18514 - }
18515 -
18516 - kernel_physical_mapping_init(pgd_base);
18517 - remap_numa_kva();
18518 -
18519 /*
18520 * Fixed mappings, only the page table structure has to be
18521 * created - mappings will be set by set_fixmap():
18522 @@ -429,10 +448,13 @@ static void __init pagetable_init(void)
18523 end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
18524 page_table_range_init(vaddr, end, pgd_base);
18525 early_ioremap_reset();
18526 +}
18527
18528 - permanent_kmaps_init(pgd_base);
18529 +static void __init pagetable_init(void)
18530 +{
18531 + pgd_t *pgd_base = (pgd_t *)xen_start_info->pt_base;
18532
18533 - xen_pagetable_setup_done(pgd_base);
18534 + permanent_kmaps_init(pgd_base);
18535 }
18536
18537 #if defined(CONFIG_ACPI_SLEEP) && !defined(CONFIG_XEN)
18538 @@ -475,7 +497,7 @@ void zap_low_mappings(void)
18539
18540 int nx_enabled;
18541
18542 -pteval_t __supported_pte_mask __read_mostly = ~_PAGE_NX;
18543 +pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL);
18544 EXPORT_SYMBOL_GPL(__supported_pte_mask);
18545
18546 #ifdef CONFIG_X86_PAE
18547 @@ -528,42 +550,364 @@ static void __init set_nx(void)
18548 }
18549 #endif
18550
18551 +/* user-defined highmem size */
18552 +static unsigned int highmem_pages = -1;
18553 +
18554 /*
18555 - * paging_init() sets up the page tables - note that the first 8MB are
18556 - * already mapped by head.S.
18557 - *
18558 - * This routines also unmaps the page at virtual kernel address 0, so
18559 - * that we can trap those pesky NULL-reference errors in the kernel.
18560 + * highmem=size forces highmem to be exactly 'size' bytes.
18561 + * This works even on boxes that have no highmem otherwise.
18562 + * This also works to reduce highmem size on bigger boxes.
18563 */
18564 -void __init paging_init(void)
18565 +static int __init parse_highmem(char *arg)
18566 +{
18567 + if (!arg)
18568 + return -EINVAL;
18569 +
18570 + highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT;
18571 + return 0;
18572 +}
18573 +early_param("highmem", parse_highmem);
18574 +
18575 +/*
18576 + * Determine low and high memory ranges:
18577 + */
18578 +void __init find_low_pfn_range(void)
18579 +{
18580 + /* it could update max_pfn */
18581 +
18582 + /* max_low_pfn is 0, we already have early_res support */
18583 +
18584 + max_low_pfn = max_pfn;
18585 + if (max_low_pfn > MAXMEM_PFN) {
18586 + if (highmem_pages == -1)
18587 + highmem_pages = max_pfn - MAXMEM_PFN;
18588 + if (highmem_pages + MAXMEM_PFN < max_pfn)
18589 + max_pfn = MAXMEM_PFN + highmem_pages;
18590 + if (highmem_pages + MAXMEM_PFN > max_pfn) {
18591 + printk(KERN_WARNING "only %luMB highmem pages "
18592 + "available, ignoring highmem size of %uMB.\n",
18593 + pages_to_mb(max_pfn - MAXMEM_PFN),
18594 + pages_to_mb(highmem_pages));
18595 + highmem_pages = 0;
18596 + }
18597 + max_low_pfn = MAXMEM_PFN;
18598 +#ifndef CONFIG_HIGHMEM
18599 + /* Maximum memory usable is what is directly addressable */
18600 + printk(KERN_WARNING "Warning only %ldMB will be used.\n",
18601 + MAXMEM>>20);
18602 + if (max_pfn > MAX_NONPAE_PFN)
18603 + printk(KERN_WARNING
18604 + "Use a HIGHMEM64G enabled kernel.\n");
18605 + else
18606 + printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
18607 + max_pfn = MAXMEM_PFN;
18608 +#else /* !CONFIG_HIGHMEM */
18609 +#ifndef CONFIG_HIGHMEM64G
18610 + if (max_pfn > MAX_NONPAE_PFN) {
18611 + max_pfn = MAX_NONPAE_PFN;
18612 + printk(KERN_WARNING "Warning only 4GB will be used."
18613 + "Use a HIGHMEM64G enabled kernel.\n");
18614 + }
18615 +#endif /* !CONFIG_HIGHMEM64G */
18616 +#endif /* !CONFIG_HIGHMEM */
18617 + } else {
18618 + if (highmem_pages == -1)
18619 + highmem_pages = 0;
18620 +#ifdef CONFIG_HIGHMEM
18621 + if (highmem_pages >= max_pfn) {
18622 + printk(KERN_ERR "highmem size specified (%uMB) is "
18623 + "bigger than pages available (%luMB)!.\n",
18624 + pages_to_mb(highmem_pages),
18625 + pages_to_mb(max_pfn));
18626 + highmem_pages = 0;
18627 + }
18628 + if (highmem_pages) {
18629 + if (max_low_pfn - highmem_pages <
18630 + 64*1024*1024/PAGE_SIZE){
18631 + printk(KERN_ERR "highmem size %uMB results in "
18632 + "smaller than 64MB lowmem, ignoring it.\n"
18633 + , pages_to_mb(highmem_pages));
18634 + highmem_pages = 0;
18635 + }
18636 + max_low_pfn -= highmem_pages;
18637 + }
18638 +#else
18639 + if (highmem_pages)
18640 + printk(KERN_ERR "ignoring highmem size on non-highmem"
18641 + " kernel!\n");
18642 +#endif
18643 + }
18644 +}
18645 +
18646 +#ifndef CONFIG_NEED_MULTIPLE_NODES
18647 +void __init initmem_init(unsigned long start_pfn,
18648 + unsigned long end_pfn)
18649 +{
18650 +#ifdef CONFIG_HIGHMEM
18651 + highstart_pfn = highend_pfn = max_pfn;
18652 + if (max_pfn > max_low_pfn)
18653 + highstart_pfn = max_low_pfn;
18654 + memory_present(0, 0, highend_pfn);
18655 + e820_register_active_regions(0, 0, highend_pfn);
18656 + printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
18657 + pages_to_mb(highend_pfn - highstart_pfn));
18658 + num_physpages = highend_pfn;
18659 + high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
18660 +#else
18661 + memory_present(0, 0, max_low_pfn);
18662 + e820_register_active_regions(0, 0, max_low_pfn);
18663 + num_physpages = max_low_pfn;
18664 + high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
18665 +#endif
18666 +#ifdef CONFIG_FLATMEM
18667 + max_mapnr = num_physpages;
18668 +#endif
18669 + printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
18670 + pages_to_mb(max_low_pfn));
18671 +
18672 + setup_bootmem_allocator();
18673 +}
18674 +#endif /* !CONFIG_NEED_MULTIPLE_NODES */
18675 +
18676 +static void __init zone_sizes_init(void)
18677 +{
18678 + unsigned long max_zone_pfns[MAX_NR_ZONES];
18679 + memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
18680 + max_zone_pfns[ZONE_DMA] =
18681 + virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
18682 + max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
18683 +#ifdef CONFIG_HIGHMEM
18684 + max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
18685 +#endif
18686 +
18687 + free_area_init_nodes(max_zone_pfns);
18688 +}
18689 +
18690 +void __init setup_bootmem_allocator(void)
18691 {
18692 int i;
18693 + unsigned long bootmap_size, bootmap;
18694 + unsigned long end_pfn = min(max_low_pfn, xen_start_info->nr_pages);
18695 +
18696 + /*
18697 + * Initialize the boot-time allocator (with low memory only):
18698 + */
18699 + bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
18700 + bootmap = find_e820_area(min_low_pfn<<PAGE_SHIFT,
18701 + min(max_pfn_mapped, xen_start_info->nr_pages)<<PAGE_SHIFT,
18702 + bootmap_size, PAGE_SIZE);
18703 + if (bootmap == -1L)
18704 + panic("Cannot find bootmem map of size %ld\n", bootmap_size);
18705 + reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
18706 +
18707 + /* don't touch min_low_pfn */
18708 + bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
18709 + min_low_pfn, end_pfn);
18710 + printk(KERN_INFO " mapped low ram: 0 - %08lx\n",
18711 + max_pfn_mapped<<PAGE_SHIFT);
18712 + printk(KERN_INFO " low ram: %08lx - %08lx\n",
18713 + min_low_pfn<<PAGE_SHIFT, max_low_pfn<<PAGE_SHIFT);
18714 + printk(KERN_INFO " bootmap %08lx - %08lx\n",
18715 + bootmap, bootmap + bootmap_size);
18716 + for_each_online_node(i)
18717 + free_bootmem_with_active_regions(i, end_pfn);
18718 + early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
18719 +
18720 + after_init_bootmem = 1;
18721 +}
18722 +
18723 +static unsigned long __init extend_init_mapping(unsigned long tables_space)
18724 +{
18725 + unsigned long start_pfn = (__pa(xen_start_info->pt_base) >> PAGE_SHIFT)
18726 + + xen_start_info->nr_pt_frames;
18727 + unsigned long start = start_pfn, va = (unsigned long)&_text;
18728 + pgd_t *pgd;
18729 + pud_t *pud;
18730 + pmd_t *pmd;
18731 + pte_t *pte;
18732 +
18733 + /* Ensure init mappings cover kernel text/data and initial tables. */
18734 + while (va < PAGE_OFFSET + (start_pfn << PAGE_SHIFT) + tables_space) {
18735 + pgd = pgd_offset_k(va);
18736 + pud = pud_offset(pgd, va);
18737 + pmd = pmd_offset(pud, va);
18738 + if (pmd_none(*pmd)) {
18739 + unsigned long pa = start_pfn++ << PAGE_SHIFT;
18740 +
18741 + memset(__va(pa), 0, PAGE_SIZE);
18742 + make_lowmem_page_readonly(__va(pa),
18743 + XENFEAT_writable_page_tables);
18744 + xen_l2_entry_update(pmd, __pmd(pa | _KERNPG_TABLE));
18745 + }
18746 + pte = pte_offset_kernel(pmd, va);
18747 + if (pte_none(*pte)) {
18748 + pte_t new_pte = __pte(__pa(va) | _KERNPG_TABLE);
18749 +
18750 + if (HYPERVISOR_update_va_mapping(va, new_pte, 0))
18751 + BUG();
18752 + }
18753 + va += PAGE_SIZE;
18754 + }
18755 +
18756 + /* Finally, blow away any spurious initial mappings. */
18757 + while (1) {
18758 + pgd = pgd_offset_k(va);
18759 + pud = pud_offset(pgd, va);
18760 + pmd = pmd_offset(pud, va);
18761 + if (pmd_none(*pmd))
18762 + break;
18763 + if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
18764 + BUG();
18765 + va += PAGE_SIZE;
18766 + }
18767 +
18768 + if (start_pfn > start)
18769 + reserve_early(start << PAGE_SHIFT,
18770 + start_pfn << PAGE_SHIFT, "INITMAP");
18771 +
18772 + return start_pfn;
18773 +}
18774 +
18775 +static void __init find_early_table_space(unsigned long end)
18776 +{
18777 + unsigned long puds, pmds, ptes, tables;
18778 +
18779 + puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
18780 + tables = PAGE_ALIGN(puds * sizeof(pud_t));
18781 +
18782 + pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
18783 + tables += PAGE_ALIGN(pmds * sizeof(pmd_t));
18784 +
18785 + if (cpu_has_pse) {
18786 + unsigned long extra;
18787 +
18788 + extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
18789 + extra += PMD_SIZE;
18790 + ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
18791 + } else
18792 + ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
18793 +
18794 + tables += PAGE_ALIGN(ptes * sizeof(pte_t));
18795 +
18796 + /* for fixmap */
18797 + tables += PAGE_SIZE
18798 + * ((((FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK)
18799 + - (__fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK))
18800 + >> PMD_SHIFT);
18801 +
18802 + table_start = extend_init_mapping(tables);
18803 +
18804 + table_end = table_start;
18805 + table_top = table_start + (tables>>PAGE_SHIFT);
18806 +
18807 + printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
18808 + end, table_start << PAGE_SHIFT,
18809 + (table_start << PAGE_SHIFT) + tables);
18810 +}
18811 +
18812 +unsigned long __init_refok init_memory_mapping(unsigned long start,
18813 + unsigned long end)
18814 +{
18815 + pgd_t *pgd_base = swapper_pg_dir;
18816 + unsigned long start_pfn, end_pfn;
18817 + unsigned long big_page_start;
18818 +
18819 + /*
18820 + * Find space for the kernel direct mapping tables.
18821 + */
18822 + if (!after_init_bootmem)
18823 + find_early_table_space(end);
18824
18825 #ifdef CONFIG_X86_PAE
18826 set_nx();
18827 if (nx_enabled)
18828 printk(KERN_INFO "NX (Execute Disable) protection: active\n");
18829 #endif
18830 +
18831 + /* Enable PSE if available */
18832 + if (cpu_has_pse)
18833 + set_in_cr4(X86_CR4_PSE);
18834 +
18835 + /* Enable PGE if available */
18836 + if (cpu_has_pge) {
18837 + set_in_cr4(X86_CR4_PGE);
18838 + __supported_pte_mask |= _PAGE_GLOBAL;
18839 + }
18840 +
18841 + /*
18842 + * Don't use a large page for the first 2/4MB of memory
18843 + * because there are often fixed size MTRRs in there
18844 + * and overlapping MTRRs into large pages can cause
18845 + * slowdowns.
18846 + */
18847 + big_page_start = PMD_SIZE;
18848 +
18849 + if (start < big_page_start) {
18850 + start_pfn = start >> PAGE_SHIFT;
18851 + end_pfn = min(big_page_start>>PAGE_SHIFT, end>>PAGE_SHIFT);
18852 + } else {
18853 + /* head is not big page alignment ? */
18854 + start_pfn = start >> PAGE_SHIFT;
18855 + end_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
18856 + << (PMD_SHIFT - PAGE_SHIFT);
18857 + }
18858 + if (start_pfn < end_pfn)
18859 + kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn, 0);
18860 +
18861 + /* big page range */
18862 + start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
18863 + << (PMD_SHIFT - PAGE_SHIFT);
18864 + if (start_pfn < (big_page_start >> PAGE_SHIFT))
18865 + start_pfn = big_page_start >> PAGE_SHIFT;
18866 + end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
18867 + if (start_pfn < end_pfn)
18868 + kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn,
18869 + cpu_has_pse);
18870 +
18871 + /* tail is not big page alignment ? */
18872 + start_pfn = end_pfn;
18873 + if (start_pfn > (big_page_start>>PAGE_SHIFT)) {
18874 + end_pfn = end >> PAGE_SHIFT;
18875 + if (start_pfn < end_pfn)
18876 + kernel_physical_mapping_init(pgd_base, start_pfn,
18877 + end_pfn, 0);
18878 + }
18879 +
18880 + early_ioremap_page_table_range_init(pgd_base);
18881 +
18882 + __flush_tlb_all();
18883 +
18884 + if (!after_init_bootmem)
18885 + reserve_early(table_start << PAGE_SHIFT,
18886 + table_end << PAGE_SHIFT, "PGTABLE");
18887 +
18888 + if (!after_init_bootmem)
18889 + early_memtest(start, end);
18890 +
18891 + return end >> PAGE_SHIFT;
18892 +}
18893 +
18894 +
18895 +/*
18896 + * paging_init() sets up the page tables - note that the first 8MB are
18897 + * already mapped by head.S.
18898 + *
18899 + * This routines also unmaps the page at virtual kernel address 0, so
18900 + * that we can trap those pesky NULL-reference errors in the kernel.
18901 + */
18902 +void __init paging_init(void)
18903 +{
18904 pagetable_init();
18905
18906 __flush_tlb_all();
18907
18908 kmap_init();
18909
18910 - /* Switch to the real shared_info page, and clear the
18911 - * dummy page. */
18912 - set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
18913 - HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
18914 - memset(empty_zero_page, 0, sizeof(empty_zero_page));
18915 -
18916 - /* Setup mapping of lower 1st MB */
18917 - for (i = 0; i < NR_FIX_ISAMAPS; i++)
18918 - if (is_initial_xendomain())
18919 - set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE);
18920 - else
18921 - __set_fixmap(FIX_ISAMAP_BEGIN - i,
18922 - virt_to_machine(empty_zero_page),
18923 - PAGE_KERNEL_RO);
18924 + /*
18925 + * NOTE: at this point the bootmem allocator is fully available.
18926 + */
18927 + sparse_init();
18928 + zone_sizes_init();
18929 }
18930
18931 /*
18932 @@ -598,7 +942,7 @@ static struct kcore_list kcore_mem, kcor
18933 void __init mem_init(void)
18934 {
18935 int codesize, reservedpages, datasize, initsize;
18936 - int tmp, bad_ppro;
18937 + int tmp;
18938 unsigned long pfn;
18939
18940 pci_iommu_alloc();
18941 @@ -606,19 +950,6 @@ void __init mem_init(void)
18942 #ifdef CONFIG_FLATMEM
18943 BUG_ON(!mem_map);
18944 #endif
18945 - bad_ppro = ppro_with_ram_bug();
18946 -
18947 -#ifdef CONFIG_HIGHMEM
18948 - /* check that fixmap and pkmap do not overlap */
18949 - if (PKMAP_BASE + LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
18950 - printk(KERN_ERR
18951 - "fixmap and kmap areas overlap - this will crash\n");
18952 - printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n",
18953 - PKMAP_BASE, PKMAP_BASE + LAST_PKMAP*PAGE_SIZE,
18954 - FIXADDR_START);
18955 - BUG();
18956 - }
18957 -#endif
18958 /* this will put all low memory onto the freelists */
18959 totalram_pages += free_all_bootmem();
18960 /* XEN: init and count low-mem pages outside initial allocation. */
18961 @@ -636,7 +967,7 @@ void __init mem_init(void)
18962 if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
18963 reservedpages++;
18964
18965 - set_highmem_pages_init(bad_ppro);
18966 + set_highmem_pages_init();
18967
18968 codesize = (unsigned long) &_etext - (unsigned long) &_text;
18969 datasize = (unsigned long) &_edata - (unsigned long) &_etext;
18970 @@ -657,7 +988,6 @@ void __init mem_init(void)
18971 (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10))
18972 );
18973
18974 -#if 1 /* double-sanity-check paranoia */
18975 printk(KERN_INFO "virtual kernel memory layout:\n"
18976 " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
18977 #ifdef CONFIG_HIGHMEM
18978 @@ -698,7 +1028,6 @@ void __init mem_init(void)
18979 #endif
18980 BUG_ON(VMALLOC_START > VMALLOC_END);
18981 BUG_ON((unsigned long)high_memory > VMALLOC_START);
18982 -#endif /* double-sanity-check paranoia */
18983
18984 if (boot_cpu_data.wp_works_ok < 0)
18985 test_wp_bit();
18986 @@ -755,6 +1084,8 @@ void mark_rodata_ro(void)
18987 unsigned long start = PFN_ALIGN(_text);
18988 unsigned long size = PFN_ALIGN(_etext) - start;
18989
18990 +#ifndef CONFIG_DYNAMIC_FTRACE
18991 + /* Dynamic tracing modifies the kernel text section */
18992 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
18993 printk(KERN_INFO "Write protecting the kernel text: %luk\n",
18994 size >> 10);
18995 @@ -767,6 +1098,8 @@ void mark_rodata_ro(void)
18996 printk(KERN_INFO "Testing CPA: write protecting again\n");
18997 set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
18998 #endif
18999 +#endif /* CONFIG_DYNAMIC_FTRACE */
19000 +
19001 start += size;
19002 size = (unsigned long)__end_rodata - start;
19003 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
19004 @@ -829,3 +1162,9 @@ void free_initrd_mem(unsigned long start
19005 free_init_pages("initrd memory", start, end);
19006 }
19007 #endif
19008 +
19009 +int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
19010 + int flags)
19011 +{
19012 + return reserve_bootmem(phys, len, flags);
19013 +}
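
The find_early_table_space() added above sizes the early page-table reservation from the amount of memory to be mapped: one pud/pmd entry per covered region, plus pte pages for whatever cannot be mapped with large pages. The sketch below redoes the same arithmetic in user space, assuming 4 KB pages, 2 MB PMDs, 1 GB PUDs and 8-byte PAE entries, and ignoring the extra fixmap allowance the kernel adds.

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PMD_SHIFT	21			/* 2 MB large pages (PAE) */
#define PMD_SIZE	(1UL << PMD_SHIFT)
#define PUD_SHIFT	30			/* 1 GB per pud entry (PAE) */
#define PUD_SIZE	(1UL << PUD_SHIFT)
#define ENTRY_SIZE	8			/* pud_t/pmd_t/pte_t with PAE */

#define PAGE_ALIGN(x)	(((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

static unsigned long tables_for(unsigned long end, int use_pse)
{
	unsigned long puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
	unsigned long pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
	unsigned long ptes, tables;

	tables  = PAGE_ALIGN(puds * ENTRY_SIZE);
	tables += PAGE_ALIGN(pmds * ENTRY_SIZE);

	if (use_pse) {
		/* the head and any 2 MB-unaligned tail still need pte pages */
		unsigned long extra = end - ((end >> PMD_SHIFT) << PMD_SHIFT);

		extra += PMD_SIZE;
		ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
	} else {
		ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
	}
	tables += PAGE_ALIGN(ptes * ENTRY_SIZE);

	return tables;
}

int main(void)
{
	unsigned long end = 896UL << 20;	/* typical 32-bit lowmem limit */

	printf("mapping %lu MB with PSE:    %lu KiB of tables\n",
	       end >> 20, tables_for(end, 1) >> 10);
	printf("mapping %lu MB without PSE: %lu KiB of tables\n",
	       end >> 20, tables_for(end, 0) >> 10);
	return 0;
}
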
19014 --- sle11-2009-10-16.orig/arch/x86/mm/init_64-xen.c 2009-03-16 16:38:05.000000000 +0100
19015 +++ sle11-2009-10-16/arch/x86/mm/init_64-xen.c 2009-06-04 10:21:39.000000000 +0200
19016 @@ -21,6 +21,7 @@
19017 #include <linux/swap.h>
19018 #include <linux/smp.h>
19019 #include <linux/init.h>
19020 +#include <linux/initrd.h>
19021 #include <linux/pagemap.h>
19022 #include <linux/bootmem.h>
19023 #include <linux/proc_fs.h>
19024 @@ -52,6 +53,14 @@
19025
19026 #include <xen/features.h>
19027
19028 +/*
19029 + * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
19030 + * The direct mapping extends to max_pfn_mapped, so that we can directly access
19031 + * apertures, ACPI and other tables without having to play with fixmaps.
19032 + */
19033 +unsigned long max_low_pfn_mapped;
19034 +unsigned long max_pfn_mapped;
19035 +
19036 #if CONFIG_XEN_COMPAT <= 0x030002
19037 unsigned int __kernel_page_user;
19038 EXPORT_SYMBOL(__kernel_page_user);
19039 @@ -60,13 +69,12 @@ EXPORT_SYMBOL(__kernel_page_user);
19040 int after_bootmem;
19041
19042 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
19043 -extern unsigned long start_pfn;
19044
19045 extern pmd_t level2_fixmap_pgt[PTRS_PER_PMD];
19046 extern pte_t level1_fixmap_pgt[PTRS_PER_PTE];
19047
19048 #ifndef CONFIG_XEN
19049 -int direct_gbpages __meminitdata
19050 +int direct_gbpages
19051 #ifdef CONFIG_DIRECT_GBPAGES
19052 = 1
19053 #endif
19054 @@ -145,55 +153,23 @@ void __meminit early_make_page_readonly(
19055 * around without checking the pgd every time.
19056 */
19057
19058 -void show_mem(void)
19059 -{
19060 - long i, total = 0, reserved = 0;
19061 - long shared = 0, cached = 0;
19062 - struct page *page;
19063 - pg_data_t *pgdat;
19064 -
19065 - printk(KERN_INFO "Mem-info:\n");
19066 - show_free_areas();
19067 - for_each_online_pgdat(pgdat) {
19068 - for (i = 0; i < pgdat->node_spanned_pages; ++i) {
19069 - /*
19070 - * This loop can take a while with 256 GB and
19071 - * 4k pages so defer the NMI watchdog:
19072 - */
19073 - if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
19074 - touch_nmi_watchdog();
19075 -
19076 - if (!pfn_valid(pgdat->node_start_pfn + i))
19077 - continue;
19078 -
19079 - page = pfn_to_page(pgdat->node_start_pfn + i);
19080 - total++;
19081 - if (PageReserved(page))
19082 - reserved++;
19083 - else if (PageSwapCache(page))
19084 - cached++;
19085 - else if (page_count(page))
19086 - shared += page_count(page) - 1;
19087 - }
19088 - }
19089 - printk(KERN_INFO "%lu pages of RAM\n", total);
19090 - printk(KERN_INFO "%lu reserved pages\n", reserved);
19091 - printk(KERN_INFO "%lu pages shared\n", shared);
19092 - printk(KERN_INFO "%lu pages swap cached\n", cached);
19093 -}
19094 -
19095 static unsigned long __meminitdata table_start;
19096 -static unsigned long __meminitdata table_end;
19097 +static unsigned long __meminitdata table_cur;
19098 +static unsigned long __meminitdata table_top;
19099
19100 -static __init void *spp_getpage(void)
19101 +/*
19102 + * NOTE: This function is marked __ref because it calls __init function
19103 + * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
19104 + */
19105 +static __ref void *spp_getpage(void)
19106 {
19107 void *ptr;
19108
19109 if (after_bootmem)
19110 ptr = (void *) get_zeroed_page(GFP_ATOMIC);
19111 - else if (start_pfn < table_end) {
19112 - ptr = __va(start_pfn << PAGE_SHIFT);
19113 - start_pfn++;
19114 + else if (table_cur < table_top) {
19115 + ptr = __va(table_cur << PAGE_SHIFT);
19116 + table_cur++;
19117 memset(ptr, 0, PAGE_SIZE);
19118 } else
19119 ptr = alloc_bootmem_pages(PAGE_SIZE);
19120 @@ -208,30 +184,18 @@ static __init void *spp_getpage(void)
19121 return ptr;
19122 }
19123
19124 -#define pgd_offset_u(address) (__user_pgd(init_level4_pgt) + pgd_index(address))
19125 -#define pud_offset_u(address) (level3_user_pgt + pud_index(address))
19126 -
19127 -static __init void
19128 -set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot, int user_mode)
19129 +void
19130 +set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
19131 {
19132 - pgd_t *pgd;
19133 pud_t *pud;
19134 pmd_t *pmd;
19135 - pte_t *pte, new_pte;
19136 -
19137 - pr_debug("set_pte_phys %lx to %lx\n", vaddr, phys);
19138 + pte_t *pte;
19139
19140 - pgd = (user_mode ? pgd_offset_u(vaddr) : pgd_offset_k(vaddr));
19141 - if (pgd_none(*pgd)) {
19142 - printk(KERN_ERR
19143 - "PGD FIXMAP MISSING, it should be setup in head.S!\n");
19144 - return;
19145 - }
19146 - pud = (user_mode ? pud_offset_u(vaddr) : pud_offset(pgd, vaddr));
19147 + pud = pud_page + pud_index(vaddr);
19148 if (pud_none(*pud)) {
19149 pmd = (pmd_t *) spp_getpage();
19150 make_page_readonly(pmd, XENFEAT_writable_page_tables);
19151 - set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
19152 + pud_populate(&init_mm, pud, pmd);
19153 if (pmd != pmd_offset(pud, 0)) {
19154 printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
19155 pmd, pmd_offset(pud, 0));
19156 @@ -242,19 +206,20 @@ set_pte_phys(unsigned long vaddr, unsign
19157 if (pmd_none(*pmd)) {
19158 pte = (pte_t *) spp_getpage();
19159 make_page_readonly(pte, XENFEAT_writable_page_tables);
19160 - set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
19161 + pmd_populate_kernel(&init_mm, pmd, pte);
19162 if (pte != pte_offset_kernel(pmd, 0)) {
19163 printk(KERN_ERR "PAGETABLE BUG #02!\n");
19164 return;
19165 }
19166 }
19167 - if (pgprot_val(prot))
19168 - new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);
19169 - else
19170 - new_pte = __pte(0);
19171
19172 pte = pte_offset_kernel(pmd, vaddr);
19173 if (!pte_none(*pte) && __pte_val(new_pte) &&
19174 +#ifdef CONFIG_ACPI
19175 + /* __acpi_map_table() fails to properly call clear_fixmap() */
19176 + (vaddr < __fix_to_virt(FIX_ACPI_END) ||
19177 + vaddr > __fix_to_virt(FIX_ACPI_BEGIN)) &&
19178 +#endif
19179 __pte_val(*pte) != (__pte_val(new_pte) & __supported_pte_mask))
19180 pte_ERROR(*pte);
19181 set_pte(pte, new_pte);
19182 @@ -266,15 +231,13 @@ set_pte_phys(unsigned long vaddr, unsign
19183 __flush_tlb_one(vaddr);
19184 }
19185
19186 -static __init void
19187 -set_pte_phys_ma(unsigned long vaddr, unsigned long phys, pgprot_t prot)
19188 +void
19189 +set_pte_vaddr(unsigned long vaddr, pte_t pteval)
19190 {
19191 pgd_t *pgd;
19192 - pud_t *pud;
19193 - pmd_t *pmd;
19194 - pte_t *pte, new_pte;
19195 + pud_t *pud_page;
19196
19197 - pr_debug("set_pte_phys_ma %lx to %lx\n", vaddr, phys);
19198 + pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, __pte_val(pteval));
19199
19200 pgd = pgd_offset_k(vaddr);
19201 if (pgd_none(*pgd)) {
19202 @@ -282,47 +245,51 @@ set_pte_phys_ma(unsigned long vaddr, uns
19203 "PGD FIXMAP MISSING, it should be setup in head.S!\n");
19204 return;
19205 }
19206 - pud = pud_offset(pgd, vaddr);
19207 - if (pud_none(*pud)) {
19208 - pmd = (pmd_t *) spp_getpage();
19209 - make_page_readonly(pmd, XENFEAT_writable_page_tables);
19210 - set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
19211 - if (pmd != pmd_offset(pud, 0)) {
19212 - printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
19213 - pmd, pmd_offset(pud, 0));
19214 + pud_page = (pud_t*)pgd_page_vaddr(*pgd);
19215 + set_pte_vaddr_pud(pud_page, vaddr, pteval);
19216 +}
19217 +
19218 +#ifndef CONFIG_XEN
19219 +/*
19220 + * Create large page table mappings for a range of physical addresses.
19221 + */
19222 +static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
19223 + pgprot_t prot)
19224 +{
19225 + pgd_t *pgd;
19226 + pud_t *pud;
19227 + pmd_t *pmd;
19228 +
19229 + BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK));
19230 + for (; size; phys += PMD_SIZE, size -= PMD_SIZE) {
19231 + pgd = pgd_offset_k((unsigned long)__va(phys));
19232 + if (pgd_none(*pgd)) {
19233 + pud = (pud_t *) spp_getpage();
19234 + set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE |
19235 + _PAGE_USER));
19236 }
19237 - }
19238 - pmd = pmd_offset(pud, vaddr);
19239 - if (pmd_none(*pmd)) {
19240 - pte = (pte_t *) spp_getpage();
19241 - make_page_readonly(pte, XENFEAT_writable_page_tables);
19242 - set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
19243 - if (pte != pte_offset_kernel(pmd, 0)) {
19244 - printk(KERN_ERR "PAGETABLE BUG #02!\n");
19245 - return;
19246 + pud = pud_offset(pgd, (unsigned long)__va(phys));
19247 + if (pud_none(*pud)) {
19248 + pmd = (pmd_t *) spp_getpage();
19249 + set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE |
19250 + _PAGE_USER));
19251 }
19252 + pmd = pmd_offset(pud, phys);
19253 + BUG_ON(!pmd_none(*pmd));
19254 + set_pmd(pmd, __pmd(phys | pgprot_val(prot)));
19255 }
19256 - new_pte = pfn_pte_ma(phys >> PAGE_SHIFT, prot);
19257 +}
19258
19259 - pte = pte_offset_kernel(pmd, vaddr);
19260 - if (!pte_none(*pte) && __pte_val(new_pte) &&
19261 -#ifdef CONFIG_ACPI
19262 - /* __acpi_map_table() fails to properly call clear_fixmap() */
19263 - (vaddr < __fix_to_virt(FIX_ACPI_END) ||
19264 - vaddr > __fix_to_virt(FIX_ACPI_BEGIN)) &&
19265 -#endif
19266 - __pte_val(*pte) != (__pte_val(new_pte) & __supported_pte_mask))
19267 - pte_ERROR(*pte);
19268 - set_pte(pte, new_pte);
19269 +void __init init_extra_mapping_wb(unsigned long phys, unsigned long size)
19270 +{
19271 + __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE);
19272 +}
19273
19274 - /*
19275 - * It's enough to flush this one mapping.
19276 - * (PGE mappings get flushed as well)
19277 - */
19278 - __flush_tlb_one(vaddr);
19279 +void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
19280 +{
19281 + __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE_NOCACHE);
19282 }
19283
19284 -#ifndef CONFIG_XEN
19285 /*
19286 * The head.S code sets up the kernel high mapping:
19287 *
19288 @@ -352,63 +319,52 @@ void __init cleanup_highmap(void)
19289 }
19290 #endif
19291
19292 -/* NOTE: this is meant to be run only at boot */
19293 -void __init __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
19294 -{
19295 - unsigned long address = __fix_to_virt(idx);
19296 -
19297 - if (idx >= __end_of_fixed_addresses) {
19298 - printk(KERN_ERR "Invalid __set_fixmap\n");
19299 - return;
19300 - }
19301 - switch (idx) {
19302 - case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
19303 - set_pte_phys(address, phys, prot, 0);
19304 - set_pte_phys(address, phys, prot, 1);
19305 - break;
19306 - case FIX_EARLYCON_MEM_BASE:
19307 - xen_l1_entry_update(level1_fixmap_pgt + pte_index(address),
19308 - pfn_pte_ma(phys >> PAGE_SHIFT, prot));
19309 - break;
19310 - default:
19311 - set_pte_phys_ma(address, phys, prot);
19312 - break;
19313 - }
19314 -}
19315 -
19316 -static __meminit void *alloc_static_page(unsigned long *phys)
19317 +static __ref void *alloc_low_page(unsigned long *phys)
19318 {
19319 - unsigned long va = (start_pfn << PAGE_SHIFT) + __START_KERNEL_map;
19320 + unsigned long pfn;
19321 + void *adr;
19322
19323 if (after_bootmem) {
19324 - void *adr = (void *)get_zeroed_page(GFP_ATOMIC);
19325 + adr = (void *)get_zeroed_page(GFP_ATOMIC);
19326 *phys = __pa(adr);
19327
19328 return adr;
19329 }
19330
19331 - *phys = start_pfn << PAGE_SHIFT;
19332 - start_pfn++;
19333 - memset((void *)va, 0, PAGE_SIZE);
19334 - return (void *)va;
19335 + BUG_ON(!table_cur);
19336 + pfn = table_cur++;
19337 + if (pfn >= table_top)
19338 + panic("alloc_low_page: ran out of memory");
19339 +
19340 + adr = early_ioremap(pfn_to_mfn(pfn) * PAGE_SIZE, PAGE_SIZE);
19341 + memset(adr, 0, PAGE_SIZE);
19342 + *phys = pfn * PAGE_SIZE;
19343 + return adr;
19344 }
19345
19346 -#define PTE_SIZE PAGE_SIZE
19347 +static __ref void unmap_low_page(void *adr)
19348 +{
19349 + if (after_bootmem)
19350 + return;
19351 +
19352 + early_iounmap(adr, PAGE_SIZE);
19353 +}
19354
19355 static inline int __meminit make_readonly(unsigned long paddr)
19356 {
19357 extern char __vsyscall_0;
19358 int readonly = 0;
19359
19360 - /* Make new page tables read-only. */
19361 + /* Make new page tables read-only on the first pass. */
19362 if (!xen_feature(XENFEAT_writable_page_tables)
19363 + && !max_pfn_mapped
19364 && (paddr >= (table_start << PAGE_SHIFT))
19365 - && (paddr < (table_end << PAGE_SHIFT)))
19366 + && (paddr < (table_top << PAGE_SHIFT)))
19367 readonly = 1;
19368 /* Make old page tables read-only. */
19369 if (!xen_feature(XENFEAT_writable_page_tables)
19370 && (paddr >= (xen_start_info->pt_base - __START_KERNEL_map))
19371 - && (paddr < (start_pfn << PAGE_SHIFT)))
19372 + && (paddr < (table_cur << PAGE_SHIFT)))
19373 readonly = 1;
19374
19375 /*
19376 @@ -425,118 +381,131 @@ static inline int __meminit make_readonl
19377 return readonly;
19378 }
19379
19380 -#ifndef CONFIG_XEN
19381 -/* Must run before zap_low_mappings */
19382 -__meminit void *early_ioremap(unsigned long addr, unsigned long size)
19383 +static unsigned long __meminit
19384 +phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end)
19385 {
19386 - pmd_t *pmd, *last_pmd;
19387 - unsigned long vaddr;
19388 - int i, pmds;
19389 -
19390 - pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
19391 - vaddr = __START_KERNEL_map;
19392 - pmd = level2_kernel_pgt;
19393 - last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1;
19394 -
19395 - for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) {
19396 - for (i = 0; i < pmds; i++) {
19397 - if (pmd_present(pmd[i]))
19398 - goto continue_outer_loop;
19399 - }
19400 - vaddr += addr & ~PMD_MASK;
19401 - addr &= PMD_MASK;
19402 + unsigned pages = 0;
19403 + unsigned long last_map_addr = end;
19404 + int i;
19405 +
19406 + pte_t *pte = pte_page + pte_index(addr);
19407 +
19408 + for(i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) {
19409 + unsigned long pteval = addr | __PAGE_KERNEL;
19410
19411 - for (i = 0; i < pmds; i++, addr += PMD_SIZE)
19412 - set_pmd(pmd+i, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
19413 - __flush_tlb_all();
19414 -
19415 - return (void *)vaddr;
19416 -continue_outer_loop:
19417 - ;
19418 + if (addr >= end ||
19419 + (!after_bootmem &&
19420 + (addr >> PAGE_SHIFT) >= xen_start_info->nr_pages))
19421 + break;
19422 +
19423 + if (__pte_val(*pte))
19424 + continue;
19425 +
19426 + if (make_readonly(addr))
19427 + pteval &= ~_PAGE_RW;
19428 + if (0)
19429 + printk(" pte=%p addr=%lx pte=%016lx\n",
19430 + pte, addr, pteval);
19431 + if (!after_bootmem)
19432 + *pte = __pte(pteval & __supported_pte_mask);
19433 + else
19434 + set_pte(pte, __pte(pteval & __supported_pte_mask));
19435 + last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
19436 + pages++;
19437 }
19438 - printk("early_ioremap(0x%lx, %lu) failed\n", addr, size);
19439 - return NULL;
19440 + update_page_count(PG_LEVEL_4K, pages);
19441 +
19442 + return last_map_addr;
19443 }
19444
19445 -/*
19446 - * To avoid virtual aliases later:
19447 - */
19448 -__meminit void early_iounmap(void *addr, unsigned long size)
19449 +static unsigned long __meminit
19450 +phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end)
19451 {
19452 - unsigned long vaddr;
19453 - pmd_t *pmd;
19454 - int i, pmds;
19455 -
19456 - vaddr = (unsigned long)addr;
19457 - pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
19458 - pmd = level2_kernel_pgt + pmd_index(vaddr);
19459 -
19460 - for (i = 0; i < pmds; i++)
19461 - pmd_clear(pmd + i);
19462 + pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd);
19463
19464 - __flush_tlb_all();
19465 + BUG_ON(!max_pfn_mapped);
19466 + return phys_pte_init(pte, address, end);
19467 }
19468 -#endif
19469
19470 static unsigned long __meminit
19471 -phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
19472 +phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
19473 + unsigned long page_size_mask)
19474 {
19475 + unsigned long pages = 0;
19476 + unsigned long last_map_addr = end;
19477 + unsigned long start = address;
19478 +
19479 int i = pmd_index(address);
19480
19481 - for (; i < PTRS_PER_PMD; i++) {
19482 + for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
19483 unsigned long pte_phys;
19484 - pmd_t *pmd = pmd_page + i;
19485 - pte_t *pte, *pte_save;
19486 - int k;
19487 + pmd_t *pmd = pmd_page + pmd_index(address);
19488 + pte_t *pte;
19489
19490 if (address >= end)
19491 break;
19492
19493 if (__pmd_val(*pmd)) {
19494 - address += PMD_SIZE;
19495 + if (!pmd_large(*pmd)) {
19496 + spin_lock(&init_mm.page_table_lock);
19497 + last_map_addr = phys_pte_update(pmd, address,
19498 + end);
19499 + spin_unlock(&init_mm.page_table_lock);
19500 + }
19501 + /* Count entries we're using from level2_ident_pgt */
19502 + if (start == 0)
19503 + pages++;
19504 continue;
19505 }
19506
19507 - pte = alloc_static_page(&pte_phys);
19508 - pte_save = pte;
19509 - for (k = 0; k < PTRS_PER_PTE; pte++, k++, address += PTE_SIZE) {
19510 - unsigned long pteval = address | _PAGE_NX | _KERNPG_TABLE;
19511 -
19512 - if (address >= (after_bootmem
19513 - ? end
19514 - : xen_start_info->nr_pages << PAGE_SHIFT))
19515 - pteval = 0;
19516 - else if (make_readonly(address))
19517 - pteval &= ~_PAGE_RW;
19518 - set_pte(pte, __pte(pteval & __supported_pte_mask));
19519 + if (page_size_mask & (1<<PG_LEVEL_2M)) {
19520 + pages++;
19521 + spin_lock(&init_mm.page_table_lock);
19522 + set_pte((pte_t *)pmd,
19523 + pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
19524 + spin_unlock(&init_mm.page_table_lock);
19525 + last_map_addr = (address & PMD_MASK) + PMD_SIZE;
19526 + continue;
19527 }
19528 +
19529 + pte = alloc_low_page(&pte_phys);
19530 + last_map_addr = phys_pte_init(pte, address, end);
19531 + unmap_low_page(pte);
19532 +
19533 if (!after_bootmem) {
19534 - early_make_page_readonly(pte_save, XENFEAT_writable_page_tables);
19535 - *pmd = __pmd(pte_phys | _KERNPG_TABLE);
19536 + if (max_pfn_mapped)
19537 + make_page_readonly(__va(pte_phys),
19538 + XENFEAT_writable_page_tables);
19539 + *pmd = __pmd(pte_phys | _PAGE_TABLE);
19540 } else {
19541 - make_page_readonly(pte_save, XENFEAT_writable_page_tables);
19542 - set_pmd(pmd, __pmd(pte_phys | _KERNPG_TABLE));
19543 + make_page_readonly(pte, XENFEAT_writable_page_tables);
19544 + spin_lock(&init_mm.page_table_lock);
19545 + pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
19546 + spin_unlock(&init_mm.page_table_lock);
19547 }
19548 }
19549 - return address;
19550 + update_page_count(PG_LEVEL_2M, pages);
19551 + return last_map_addr;
19552 }
19553
19554 static unsigned long __meminit
19555 -phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
19556 +phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end,
19557 + unsigned long page_size_mask)
19558 {
19559 pmd_t *pmd = pmd_offset(pud, 0);
19560 unsigned long last_map_addr;
19561
19562 - spin_lock(&init_mm.page_table_lock);
19563 - last_map_addr = phys_pmd_init(pmd, address, end);
19564 - spin_unlock(&init_mm.page_table_lock);
19565 + BUG_ON(!max_pfn_mapped);
19566 + last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask);
19567 __flush_tlb_all();
19568 return last_map_addr;
19569 }
19570
19571 static unsigned long __meminit
19572 -phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
19573 +phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
19574 + unsigned long page_size_mask)
19575 {
19576 + unsigned long pages = 0;
19577 unsigned long last_map_addr = end;
19578 int i = pud_index(addr);
19579
19580 @@ -550,29 +519,55 @@ phys_pud_init(pud_t *pud_page, unsigned
19581
19582 if (__pud_val(*pud)) {
19583 if (!pud_large(*pud))
19584 - last_map_addr = phys_pmd_update(pud, addr, end);
19585 + last_map_addr = phys_pmd_update(pud, addr, end,
19586 + page_size_mask);
19587 continue;
19588 }
19589
19590 - if (direct_gbpages) {
19591 + if (page_size_mask & (1<<PG_LEVEL_1G)) {
19592 + pages++;
19593 + spin_lock(&init_mm.page_table_lock);
19594 set_pte((pte_t *)pud,
19595 pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
19596 + spin_unlock(&init_mm.page_table_lock);
19597 last_map_addr = (addr & PUD_MASK) + PUD_SIZE;
19598 continue;
19599 }
19600
19601 - pmd = alloc_static_page(&pmd_phys);
19602 -
19603 - spin_lock(&init_mm.page_table_lock);
19604 - *pud = __pud(pmd_phys | _KERNPG_TABLE);
19605 - last_map_addr = phys_pmd_init(pmd, addr, end);
19606 - spin_unlock(&init_mm.page_table_lock);
19607 + pmd = alloc_low_page(&pmd_phys);
19608 + last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask);
19609 + unmap_low_page(pmd);
19610
19611 - early_make_page_readonly(pmd, XENFEAT_writable_page_tables);
19612 + if (!after_bootmem) {
19613 + if (max_pfn_mapped)
19614 + make_page_readonly(__va(pmd_phys),
19615 + XENFEAT_writable_page_tables);
19616 + if (page_size_mask & (1 << PG_LEVEL_NUM))
19617 + xen_l3_entry_update(pud, __pud(pmd_phys | _PAGE_TABLE));
19618 + else
19619 + *pud = __pud(pmd_phys | _PAGE_TABLE);
19620 + } else {
19621 + make_page_readonly(pmd, XENFEAT_writable_page_tables);
19622 + spin_lock(&init_mm.page_table_lock);
19623 + pud_populate(&init_mm, pud, __va(pmd_phys));
19624 + spin_unlock(&init_mm.page_table_lock);
19625 + }
19626 }
19627 __flush_tlb_all();
19628 + update_page_count(PG_LEVEL_1G, pages);
19629
19630 - return last_map_addr >> PAGE_SHIFT;
19631 + return last_map_addr;
19632 +}
19633 +
19634 +static unsigned long __meminit
19635 +phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
19636 + unsigned long page_size_mask)
19637 +{
19638 + pud_t *pud;
19639 +
19640 + pud = (pud_t *)pgd_page_vaddr(*pgd);
19641 +
19642 + return phys_pud_init(pud, addr, end, page_size_mask | (1 << PG_LEVEL_NUM));
19643 }
19644
19645 void __init xen_init_pt(void)
19646 @@ -651,86 +646,36 @@ void __init xen_init_pt(void)
19647 }
19648 }
19649
19650 -static void __init extend_init_mapping(unsigned long tables_space)
19651 -{
19652 - unsigned long va = __START_KERNEL_map;
19653 - unsigned long start = start_pfn;
19654 - unsigned long phys, addr, *pte_page;
19655 - pmd_t *pmd;
19656 - pte_t *pte, new_pte;
19657 - unsigned long *page = (unsigned long *)init_level4_pgt;
19658 -
19659 - addr = page[pgd_index(va)];
19660 - addr_to_page(addr, page);
19661 - addr = page[pud_index(va)];
19662 - addr_to_page(addr, page);
19663 -
19664 - /* Kill mapping of low 1MB. */
19665 - while (va < (unsigned long)&_text) {
19666 - if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
19667 - BUG();
19668 - va += PAGE_SIZE;
19669 - }
19670 -
19671 - /* Ensure init mappings cover kernel text/data and initial tables. */
19672 - while (va < (__START_KERNEL_map
19673 - + (start_pfn << PAGE_SHIFT)
19674 - + tables_space)) {
19675 - pmd = (pmd_t *)&page[pmd_index(va)];
19676 - if (pmd_none(*pmd)) {
19677 - pte_page = alloc_static_page(&phys);
19678 - early_make_page_readonly(
19679 - pte_page, XENFEAT_writable_page_tables);
19680 - set_pmd(pmd, __pmd(phys | _KERNPG_TABLE));
19681 - } else {
19682 - addr = page[pmd_index(va)];
19683 - addr_to_page(addr, pte_page);
19684 - }
19685 - pte = (pte_t *)&pte_page[pte_index(va)];
19686 - if (pte_none(*pte)) {
19687 - new_pte = pfn_pte(
19688 - (va - __START_KERNEL_map) >> PAGE_SHIFT,
19689 - __pgprot(_KERNPG_TABLE));
19690 - xen_l1_entry_update(pte, new_pte);
19691 - }
19692 - va += PAGE_SIZE;
19693 - }
19694 -
19695 - /* Finally, blow away any spurious initial mappings. */
19696 - while (1) {
19697 - pmd = (pmd_t *)&page[pmd_index(va)];
19698 - if (pmd_none(*pmd))
19699 - break;
19700 - if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
19701 - BUG();
19702 - va += PAGE_SIZE;
19703 - }
19704 -
19705 - if (start_pfn > start)
19706 - reserve_early(start << PAGE_SHIFT,
19707 - start_pfn << PAGE_SHIFT, "INITMAP");
19708 -}
19709 -
19710 static void __init find_early_table_space(unsigned long end)
19711 {
19712 unsigned long puds, pmds, ptes, tables;
19713
19714 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
19715 + tables = round_up(puds * sizeof(pud_t), PAGE_SIZE);
19716 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
19717 - ptes = (end + PTE_SIZE - 1) >> PAGE_SHIFT;
19718 + tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE);
19719
19720 - tables = round_up(puds * 8, PAGE_SIZE) +
19721 - round_up(pmds * 8, PAGE_SIZE) +
19722 - round_up(ptes * 8, PAGE_SIZE);
19723 + ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
19724 + tables += round_up(ptes * sizeof(pte_t), PAGE_SIZE);
19725
19726 - extend_init_mapping(tables);
19727 + if (!table_top) {
19728 + table_start = (__pa(xen_start_info->pt_base) >> PAGE_SHIFT) +
19729 + xen_start_info->nr_pt_frames;
19730 + table_cur = table_start;
19731 + } else {
19732 + /*
19733 + * [table_start, table_top) gets passed to reserve_early(),
19734 + * so we must not use table_cur here, despite continuing
19735 + * to allocate from there. table_cur possibly being below
19736 + * table_start is otoh not a problem.
19737 + */
19738 + table_start = table_top;
19739 + }
19740
19741 - table_start = start_pfn;
19742 - table_end = table_start + (tables>>PAGE_SHIFT);
19743 + table_top = table_cur + (tables >> PAGE_SHIFT);
19744
19745 - early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
19746 - end, table_start << PAGE_SHIFT,
19747 - (table_start << PAGE_SHIFT) + tables);
19748 + printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
19749 + end, table_cur << PAGE_SHIFT, table_top << PAGE_SHIFT);
19750 }
19751
19752 static void __init xen_finish_init_mapping(void)
19753 @@ -752,18 +697,18 @@ static void __init xen_finish_init_mappi
19754 xen_start_info->mod_start = (unsigned long)
19755 __va(__pa(xen_start_info->mod_start));
19756
19757 - /* Destroy the Xen-created mappings beyond the kernel image as
19758 - * well as the temporary mappings created above. Prevents
19759 - * overlap with modules area (if init mapping is very big).
19760 - */
19761 + /* Destroy the Xen-created mappings beyond the kernel image. */
19762 start = PAGE_ALIGN((unsigned long)_end);
19763 - end = __START_KERNEL_map + (table_end << PAGE_SHIFT);
19764 + end = __START_KERNEL_map + (table_start << PAGE_SHIFT);
19765 for (; start < end; start += PAGE_SIZE)
19766 if (HYPERVISOR_update_va_mapping(start, __pte_ma(0), 0))
19767 BUG();
19768
19769 - /* Allocate pte's for initial fixmaps from 'start_pfn' allocator. */
19770 - table_end = ~0UL;
19771 + /* Allocate pte's for initial fixmaps from 'table_cur' allocator. */
19772 + start = table_top;
19773 + WARN(table_cur != start, "start=%lx cur=%lx top=%lx\n",
19774 + table_start, table_cur, start);
19775 + table_top = ~0UL;
19776
19777 /* Switch to the real shared_info page, and clear the dummy page. */
19778 set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
19779 @@ -780,8 +725,7 @@ static void __init xen_finish_init_mappi
19780 << PAGE_SHIFT,
19781 PAGE_KERNEL_RO);
19782
19783 - /* Disable the 'start_pfn' allocator. */
19784 - table_end = start_pfn;
19785 + table_top = max(table_cur, start);
19786 }
19787
19788 static void __init init_gbpages(void)
19789 @@ -794,126 +738,91 @@ static void __init init_gbpages(void)
19790 #endif
19791 }
19792
19793 -#ifdef CONFIG_MEMTEST_BOOTPARAM
19794 -
19795 -static void __init memtest(unsigned long start_phys, unsigned long size,
19796 - unsigned pattern)
19797 +static unsigned long __init kernel_physical_mapping_init(unsigned long start,
19798 + unsigned long end,
19799 + unsigned long page_size_mask)
19800 {
19801 - unsigned long i;
19802 - unsigned long *start;
19803 - unsigned long start_bad;
19804 - unsigned long last_bad;
19805 - unsigned long val;
19806 - unsigned long start_phys_aligned;
19807 - unsigned long count;
19808 - unsigned long incr;
19809 -
19810 - switch (pattern) {
19811 - case 0:
19812 - val = 0UL;
19813 - break;
19814 - case 1:
19815 - val = -1UL;
19816 - break;
19817 - case 2:
19818 - val = 0x5555555555555555UL;
19819 - break;
19820 - case 3:
19821 - val = 0xaaaaaaaaaaaaaaaaUL;
19822 - break;
19823 - default:
19824 - return;
19825 - }
19826 -
19827 - incr = sizeof(unsigned long);
19828 - start_phys_aligned = ALIGN(start_phys, incr);
19829 - count = (size - (start_phys_aligned - start_phys))/incr;
19830 - start = __va(start_phys_aligned);
19831 - start_bad = 0;
19832 - last_bad = 0;
19833 -
19834 - for (i = 0; i < count; i++)
19835 - start[i] = val;
19836 - for (i = 0; i < count; i++, start++, start_phys_aligned += incr) {
19837 - if (*start != val) {
19838 - if (start_phys_aligned == last_bad + incr) {
19839 - last_bad += incr;
19840 - } else {
19841 - if (start_bad) {
19842 - printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved",
19843 - val, start_bad, last_bad + incr);
19844 - reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
19845 - }
19846 - start_bad = last_bad = start_phys_aligned;
19847 - }
19848 - }
19849 - }
19850 - if (start_bad) {
19851 - printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved",
19852 - val, start_bad, last_bad + incr);
19853 - reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
19854 - }
19855
19856 -}
19857 + unsigned long next, last_map_addr = end;
19858
19859 -static int memtest_pattern __initdata = CONFIG_MEMTEST_BOOTPARAM_VALUE;
19860 + start = (unsigned long)__va(start);
19861 + end = (unsigned long)__va(end);
19862
19863 -static int __init parse_memtest(char *arg)
19864 -{
19865 - if (arg)
19866 - memtest_pattern = simple_strtoul(arg, NULL, 0);
19867 - return 0;
19868 -}
19869 + for (; start < end; start = next) {
19870 + pgd_t *pgd = pgd_offset_k(start);
19871 + unsigned long pud_phys;
19872 + pud_t *pud;
19873
19874 -early_param("memtest", parse_memtest);
19875 + next = (start + PGDIR_SIZE) & PGDIR_MASK;
19876 + if (next > end)
19877 + next = end;
19878
19879 -static void __init early_memtest(unsigned long start, unsigned long end)
19880 -{
19881 - u64 t_start, t_size;
19882 - unsigned pattern;
19883 + if (__pgd_val(*pgd)) {
19884 + last_map_addr = phys_pud_update(pgd, __pa(start),
19885 + __pa(end), page_size_mask);
19886 + continue;
19887 + }
19888
19889 - if (!memtest_pattern)
19890 - return;
19891 + pud = alloc_low_page(&pud_phys);
19892 + last_map_addr = phys_pud_init(pud, __pa(start), __pa(next),
19893 + page_size_mask);
19894 + unmap_low_page(pud);
19895 +
19896 + if(!after_bootmem) {
19897 + if (max_pfn_mapped)
19898 + make_page_readonly(__va(pud_phys),
19899 + XENFEAT_writable_page_tables);
19900 + xen_l4_entry_update(pgd, __pgd(pud_phys | _PAGE_TABLE));
19901 + } else {
19902 + make_page_readonly(pud, XENFEAT_writable_page_tables);
19903 + spin_lock(&init_mm.page_table_lock);
19904 + pgd_populate(&init_mm, pgd, __va(pud_phys));
19905 + spin_unlock(&init_mm.page_table_lock);
19906 + }
19907 + }
19908
19909 - printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern);
19910 - for (pattern = 0; pattern < memtest_pattern; pattern++) {
19911 - t_start = start;
19912 - t_size = 0;
19913 - while (t_start < end) {
19914 - t_start = find_e820_area_size(t_start, &t_size, 1);
19915 + return last_map_addr;
19916 +}
19917
19918 - /* done ? */
19919 - if (t_start >= end)
19920 - break;
19921 - if (t_start + t_size > end)
19922 - t_size = end - t_start;
19923 +struct map_range {
19924 + unsigned long start;
19925 + unsigned long end;
19926 + unsigned page_size_mask;
19927 +};
19928
19929 - printk(KERN_CONT "\n %016llx - %016llx pattern %d",
19930 - (unsigned long long)t_start,
19931 - (unsigned long long)t_start + t_size, pattern);
19932 +#define NR_RANGE_MR 5
19933
19934 - memtest(t_start, t_size, pattern);
19935 +static int save_mr(struct map_range *mr, int nr_range,
19936 + unsigned long start_pfn, unsigned long end_pfn,
19937 + unsigned long page_size_mask)
19938 +{
19939
19940 - t_start += t_size;
19941 - }
19942 + if (start_pfn < end_pfn) {
19943 + if (nr_range >= NR_RANGE_MR)
19944 + panic("run out of range for init_memory_mapping\n");
19945 + mr[nr_range].start = start_pfn<<PAGE_SHIFT;
19946 + mr[nr_range].end = end_pfn<<PAGE_SHIFT;
19947 + mr[nr_range].page_size_mask = page_size_mask;
19948 + nr_range++;
19949 }
19950 - printk(KERN_CONT "\n");
19951 -}
19952 -#else
19953 -static void __init early_memtest(unsigned long start, unsigned long end)
19954 -{
19955 +
19956 + return nr_range;
19957 }
19958 -#endif
19959
19960 /*
19961 * Setup the direct mapping of the physical memory at PAGE_OFFSET.
19962 * This runs before bootmem is initialized and gets pages directly from
19963 * the physical memory. To access them they are temporarily mapped.
19964 */
19965 -unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned long end)
19966 +unsigned long __init_refok init_memory_mapping(unsigned long start,
19967 + unsigned long end)
19968 {
19969 - unsigned long next, last_map_addr = end;
19970 - unsigned long start_phys = start, end_phys = end;
19971 + unsigned long last_map_addr = 0;
19972 + unsigned long page_size_mask = 0;
19973 + unsigned long start_pfn, end_pfn;
19974 +
19975 + struct map_range mr[NR_RANGE_MR];
19976 + int nr_range, i;
19977
19978 printk(KERN_INFO "init_memory_mapping\n");
19979
19980 @@ -924,51 +833,150 @@ unsigned long __init_refok init_memory_m
19981 * memory mapped. Unfortunately this is done currently before the
19982 * nodes are discovered.
19983 */
19984 - if (!after_bootmem) {
19985 + if (!after_bootmem)
19986 init_gbpages();
19987 - find_early_table_space(end);
19988 +
19989 + if (direct_gbpages)
19990 + page_size_mask |= 1 << PG_LEVEL_1G;
19991 + if (cpu_has_pse)
19992 + page_size_mask |= 1 << PG_LEVEL_2M;
19993 +
19994 + memset(mr, 0, sizeof(mr));
19995 + nr_range = 0;
19996 +
19997 + /* head if not big page alignment ?*/
19998 + start_pfn = start >> PAGE_SHIFT;
19999 + end_pfn = ((start + (PMD_SIZE - 1)) >> PMD_SHIFT)
20000 + << (PMD_SHIFT - PAGE_SHIFT);
20001 + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
20002 +
20003 + /* big page (2M) range*/
20004 + start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
20005 + << (PMD_SHIFT - PAGE_SHIFT);
20006 + end_pfn = ((start + (PUD_SIZE - 1))>>PUD_SHIFT)
20007 + << (PUD_SHIFT - PAGE_SHIFT);
20008 + if (end_pfn > ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT)))
20009 + end_pfn = ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT));
20010 + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
20011 + page_size_mask & (1<<PG_LEVEL_2M));
20012 +
20013 + /* big page (1G) range */
20014 + start_pfn = end_pfn;
20015 + end_pfn = (end>>PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
20016 + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
20017 + page_size_mask &
20018 + ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
20019 +
20020 + /* tail is not big page (1G) alignment */
20021 + start_pfn = end_pfn;
20022 + end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
20023 + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
20024 + page_size_mask & (1<<PG_LEVEL_2M));
20025 +
20026 + /* tail is not big page (2M) alignment */
20027 + start_pfn = end_pfn;
20028 + end_pfn = end>>PAGE_SHIFT;
20029 + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
20030 +
20031 + /* try to merge same page size and continuous */
20032 + for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
20033 + unsigned long old_start;
20034 + if (mr[i].end != mr[i+1].start ||
20035 + mr[i].page_size_mask != mr[i+1].page_size_mask)
20036 + continue;
20037 + /* move it */
20038 + old_start = mr[i].start;
20039 + memmove(&mr[i], &mr[i+1],
20040 + (nr_range - 1 - i) * sizeof (struct map_range));
20041 + mr[i--].start = old_start;
20042 + nr_range--;
20043 }
20044
20045 - start = (unsigned long)__va(start);
20046 - end = (unsigned long)__va(end);
20047 + for (i = 0; i < nr_range; i++)
20048 + printk(KERN_DEBUG " %010lx - %010lx page %s\n",
20049 + mr[i].start, mr[i].end,
20050 + (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
20051 + (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
20052
20053 - for (; start < end; start = next) {
20054 - pgd_t *pgd = pgd_offset_k(start);
20055 - unsigned long pud_phys;
20056 - pud_t *pud;
20057 + if (!after_bootmem)
20058 + find_early_table_space(end);
20059
20060 - if (after_bootmem)
20061 - pud = pud_offset(pgd, start & PGDIR_MASK);
20062 - else
20063 - pud = alloc_static_page(&pud_phys);
20064 - next = start + PGDIR_SIZE;
20065 - if (next > end)
20066 - next = end;
20067 - last_map_addr = phys_pud_init(pud, __pa(start), __pa(next));
20068 - if (!after_bootmem) {
20069 - early_make_page_readonly(pud, XENFEAT_writable_page_tables);
20070 - set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
20071 + if (!start) {
20072 + unsigned long addr, va = __START_KERNEL_map;
20073 + unsigned long *page = (unsigned long *)init_level4_pgt;
20074 +
20075 + /* Kill mapping of memory below _text. */
20076 + while (va < (unsigned long)&_text) {
20077 + if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
20078 + BUG();
20079 + va += PAGE_SIZE;
20080 + }
20081 +
20082 + /* Blow away any spurious initial mappings. */
20083 + va = __START_KERNEL_map + (table_start << PAGE_SHIFT);
20084 + addr = page[pgd_index(va)];
20085 + addr_to_page(addr, page);
20086 + addr = page[pud_index(va)];
20087 + addr_to_page(addr, page);
20088 + while (pmd_index(va) | pte_index(va)) {
20089 + if (pmd_none(*(pmd_t *)&page[pmd_index(va)]))
20090 + break;
20091 + if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
20092 + BUG();
20093 + va += PAGE_SIZE;
20094 }
20095 }
20096
20097 - if (!after_bootmem) {
20098 - BUG_ON(start_pfn != table_end);
20099 + for (i = 0; i < nr_range; i++)
20100 + last_map_addr = kernel_physical_mapping_init(
20101 + mr[i].start, mr[i].end,
20102 + mr[i].page_size_mask);
20103 +
20104 + BUG_ON(table_cur > table_top);
20105 + if (!start)
20106 xen_finish_init_mapping();
20107 - }
20108 + else if (table_cur < table_top)
20109 + /* Disable the 'table_cur' allocator. */
20110 + table_top = table_cur;
20111
20112 __flush_tlb_all();
20113
20114 - if (!after_bootmem)
20115 + if (!after_bootmem && table_top > table_start)
20116 reserve_early(table_start << PAGE_SHIFT,
20117 - table_end << PAGE_SHIFT, "PGTABLE");
20118 + table_top << PAGE_SHIFT, "PGTABLE");
20119 +
20120 + printk(KERN_INFO "last_map_addr: %lx end: %lx\n",
20121 + last_map_addr, end);
20122
20123 if (!after_bootmem)
20124 - early_memtest(start_phys, end_phys);
20125 + early_memtest(start, end);
20126
20127 - return last_map_addr;
20128 + return last_map_addr >> PAGE_SHIFT;
20129 }
20130
20131 #ifndef CONFIG_NUMA
20132 +void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn)
20133 +{
20134 + unsigned long bootmap_size, bootmap;
20135 +
20136 + e820_register_active_regions(0, start_pfn, end_pfn);
20137 +#ifdef CONFIG_XEN
20138 + if (end_pfn > xen_start_info->nr_pages)
20139 + end_pfn = xen_start_info->nr_pages;
20140 +#endif
20141 + bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
20142 + bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size,
20143 + PAGE_SIZE);
20144 + if (bootmap == -1L)
20145 + panic("Cannot find bootmem map of size %ld\n", bootmap_size);
20146 + /* don't touch min_low_pfn */
20147 + bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
20148 + 0, end_pfn);
20149 + free_bootmem_with_active_regions(0, end_pfn);
20150 + early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
20151 + reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
20152 +}
20153 +
20154 void __init paging_init(void)
20155 {
20156 unsigned long max_zone_pfns[MAX_NR_ZONES];
20157 @@ -976,9 +984,9 @@ void __init paging_init(void)
20158 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
20159 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
20160 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
20161 - max_zone_pfns[ZONE_NORMAL] = end_pfn;
20162 + max_zone_pfns[ZONE_NORMAL] = max_pfn;
20163
20164 - memory_present(0, 0, end_pfn);
20165 + memory_present(0, 0, max_pfn);
20166 sparse_init();
20167 free_area_init_nodes(max_zone_pfns);
20168
20169 @@ -1069,8 +1077,8 @@ void __init mem_init(void)
20170 init_page_count(pfn_to_page(pfn));
20171 totalram_pages++;
20172 }
20173 - reservedpages = end_pfn - totalram_pages -
20174 - absent_pages_in_range(0, end_pfn);
20175 + reservedpages = max_pfn - totalram_pages -
20176 + absent_pages_in_range(0, max_pfn);
20177 after_bootmem = 1;
20178
20179 codesize = (unsigned long) &_etext - (unsigned long) &_text;
20180 @@ -1089,7 +1097,7 @@ void __init mem_init(void)
20181 printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
20182 "%ldk reserved, %ldk data, %ldk init)\n",
20183 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
20184 - end_pfn << (PAGE_SHIFT-10),
20185 + max_pfn << (PAGE_SHIFT-10),
20186 codesize >> 10,
20187 reservedpages << (PAGE_SHIFT-10),
20188 datasize >> 10,
20189 @@ -1152,6 +1160,13 @@ EXPORT_SYMBOL_GPL(rodata_test_data);
20190 void mark_rodata_ro(void)
20191 {
20192 unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata);
20193 + unsigned long rodata_start =
20194 + ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
20195 +
20196 +#ifdef CONFIG_DYNAMIC_FTRACE
20197 + /* Dynamic tracing modifies the kernel text section */
20198 + start = rodata_start;
20199 +#endif
20200
20201 printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
20202 (end - start) >> 10);
20203 @@ -1161,8 +1176,7 @@ void mark_rodata_ro(void)
20204 * The rodata section (but not the kernel text!) should also be
20205 * not-executable.
20206 */
20207 - start = ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
20208 - set_memory_nx(start, (end - start) >> PAGE_SHIFT);
20209 + set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT);
20210
20211 rodata_test();
20212
20213 @@ -1184,24 +1198,26 @@ void free_initrd_mem(unsigned long start
20214 }
20215 #endif
20216
20217 -void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
20218 +int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
20219 + int flags)
20220 {
20221 #ifdef CONFIG_NUMA
20222 int nid, next_nid;
20223 + int ret;
20224 #endif
20225 unsigned long pfn = phys >> PAGE_SHIFT;
20226
20227 - if (pfn >= end_pfn) {
20228 + if (pfn >= max_pfn) {
20229 /*
20230 * This can happen with kdump kernels when accessing
20231 * firmware tables:
20232 */
20233 if (pfn < max_pfn_mapped)
20234 - return;
20235 + return -EFAULT;
20236
20237 - printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
20238 + printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %lu\n",
20239 phys, len);
20240 - return;
20241 + return -EFAULT;
20242 }
20243
20244 /* Should check here against the e820 map to avoid double free */
20245 @@ -1209,9 +1225,13 @@ void __init reserve_bootmem_generic(unsi
20246 nid = phys_to_nid(phys);
20247 next_nid = phys_to_nid(phys + len - 1);
20248 if (nid == next_nid)
20249 - reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT);
20250 + ret = reserve_bootmem_node(NODE_DATA(nid), phys, len, flags);
20251 else
20252 - reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
20253 + ret = reserve_bootmem(phys, len, flags);
20254 +
20255 + if (ret != 0)
20256 + return ret;
20257 +
20258 #else
20259 reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
20260 #endif
20261 @@ -1222,6 +1242,8 @@ void __init reserve_bootmem_generic(unsi
20262 set_dma_reserve(dma_reserve);
20263 }
20264 #endif
20265 +
20266 + return 0;
20267 }
20268
20269 int kern_addr_valid(unsigned long addr)
20270 @@ -1326,7 +1348,7 @@ vmemmap_populate(struct page *start_page
20271 pmd_t *pmd;
20272
20273 for (; addr < end; addr = next) {
20274 - next = pmd_addr_end(addr, end);
20275 + void *p = NULL;
20276
20277 pgd = vmemmap_pgd_populate(addr, node);
20278 if (!pgd)
20279 @@ -1336,33 +1358,51 @@ vmemmap_populate(struct page *start_page
20280 if (!pud)
20281 return -ENOMEM;
20282
20283 - pmd = pmd_offset(pud, addr);
20284 - if (pmd_none(*pmd)) {
20285 - pte_t entry;
20286 - void *p;
20287 + if (!cpu_has_pse) {
20288 + next = (addr + PAGE_SIZE) & PAGE_MASK;
20289 + pmd = vmemmap_pmd_populate(pud, addr, node);
20290 +
20291 + if (!pmd)
20292 + return -ENOMEM;
20293 +
20294 + p = vmemmap_pte_populate(pmd, addr, node);
20295
20296 - p = vmemmap_alloc_block(PMD_SIZE, node);
20297 if (!p)
20298 return -ENOMEM;
20299
20300 - entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
20301 - PAGE_KERNEL_LARGE);
20302 - set_pmd(pmd, __pmd_ma(__pte_val(entry)));
20303 -
20304 - /* check to see if we have contiguous blocks */
20305 - if (p_end != p || node_start != node) {
20306 - if (p_start)
20307 - printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
20308 - addr_start, addr_end-1, p_start, p_end-1, node_start);
20309 - addr_start = addr;
20310 - node_start = node;
20311 - p_start = p;
20312 - }
20313 - addr_end = addr + PMD_SIZE;
20314 - p_end = p + PMD_SIZE;
20315 + addr_end = addr + PAGE_SIZE;
20316 + p_end = p + PAGE_SIZE;
20317 } else {
20318 - vmemmap_verify((pte_t *)pmd, node, addr, next);
20319 + next = pmd_addr_end(addr, end);
20320 +
20321 + pmd = pmd_offset(pud, addr);
20322 + if (pmd_none(*pmd)) {
20323 + pte_t entry;
20324 +
20325 + p = vmemmap_alloc_block(PMD_SIZE, node);
20326 + if (!p)
20327 + return -ENOMEM;
20328 +
20329 + entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
20330 + PAGE_KERNEL_LARGE);
20331 + set_pmd(pmd, __pmd_ma(__pte_val(entry)));
20332 +
20333 + /* check to see if we have contiguous blocks */
20334 + if (p_end != p || node_start != node) {
20335 + if (p_start)
20336 + printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
20337 + addr_start, addr_end-1, p_start, p_end-1, node_start);
20338 + addr_start = addr;
20339 + node_start = node;
20340 + p_start = p;
20341 + }
20342 +
20343 + addr_end = addr + PMD_SIZE;
20344 + p_end = p + PMD_SIZE;
20345 + } else
20346 + vmemmap_verify((pte_t *)pmd, node, addr, next);
20347 }
20348 +
20349 }
20350 return 0;
20351 }
20352 --- sle11-2009-10-16.orig/arch/x86/mm/ioremap-xen.c 2009-03-16 16:38:05.000000000 +0100
20353 +++ sle11-2009-10-16/arch/x86/mm/ioremap-xen.c 2009-06-04 10:21:39.000000000 +0200
20354 @@ -13,6 +13,7 @@
20355 #include <linux/pfn.h>
20356 #include <linux/slab.h>
20357 #include <linux/vmalloc.h>
20358 +#include <linux/mmiotrace.h>
20359
20360 #include <asm/cacheflush.h>
20361 #include <asm/e820.h>
20362 @@ -274,7 +275,8 @@ int ioremap_check_change_attr(unsigned l
20363 for (sz = rc = 0; sz < size && !rc; ++mfn, sz += PAGE_SIZE) {
20364 unsigned long pfn = mfn_to_local_pfn(mfn);
20365
20366 - if (pfn >= max_pfn_mapped)
20367 + if (pfn >= max_low_pfn_mapped &&
20368 + (pfn < (1UL<<(32 - PAGE_SHIFT)) || pfn >= max_pfn_mapped))
20369 continue;
20370 rc = ioremap_change_attr((unsigned long)__va(pfn << PAGE_SHIFT),
20371 PAGE_SIZE, prot_val);
20372 @@ -297,11 +299,14 @@ static void __iomem *__ioremap_caller(re
20373 {
20374 unsigned long mfn, offset, vaddr;
20375 resource_size_t last_addr;
20376 + const resource_size_t unaligned_phys_addr = phys_addr;
20377 + const unsigned long unaligned_size = size;
20378 struct vm_struct *area;
20379 unsigned long new_prot_val;
20380 pgprot_t prot;
20381 int retval;
20382 domid_t domid = DOMID_IO;
20383 + void __iomem *ret_addr;
20384
20385 /* Don't allow wraparound or zero size */
20386 last_addr = phys_addr + size - 1;
20387 @@ -318,7 +323,7 @@ static void __iomem *__ioremap_caller(re
20388 /*
20389 * Don't remap the low PCI/ISA area, it's always mapped..
20390 */
20391 - if (is_initial_xendomain() && last_addr < ISA_END_ADDRESS)
20392 + if (is_initial_xendomain() && is_ISA_range(phys_addr, last_addr))
20393 return (__force void __iomem *)isa_bus_to_virt((unsigned long)phys_addr);
20394
20395 /*
20396 @@ -342,7 +347,7 @@ static void __iomem *__ioremap_caller(re
20397 phys_addr &= PAGE_MASK;
20398 size = PAGE_ALIGN(last_addr+1) - phys_addr;
20399
20400 - retval = reserve_memtype(phys_addr, phys_addr + size,
20401 + retval = reserve_memtype(phys_addr, (u64)phys_addr + size,
20402 prot_val, &new_prot_val);
20403 if (retval) {
20404 pr_debug("Warning: reserve_memtype returned %d\n", retval);
20405 @@ -410,7 +415,10 @@ static void __iomem *__ioremap_caller(re
20406 return NULL;
20407 }
20408
20409 - return (void __iomem *) (vaddr + offset);
20410 + ret_addr = (void __iomem *) (vaddr + offset);
20411 + mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr);
20412 +
20413 + return ret_addr;
20414 }
20415
20416 /**
20417 @@ -438,7 +446,7 @@ void __iomem *ioremap_nocache(resource_s
20418 {
20419 /*
20420 * Ideally, this should be:
20421 - * pat_wc_enabled ? _PAGE_CACHE_UC : _PAGE_CACHE_UC_MINUS;
20422 + * pat_enabled ? _PAGE_CACHE_UC : _PAGE_CACHE_UC_MINUS;
20423 *
20424 * Till we fix all X drivers to use ioremap_wc(), we will use
20425 * UC MINUS.
20426 @@ -462,7 +470,7 @@ EXPORT_SYMBOL(ioremap_nocache);
20427 */
20428 void __iomem *ioremap_wc(unsigned long phys_addr, unsigned long size)
20429 {
20430 - if (pat_wc_enabled)
20431 + if (pat_enabled)
20432 return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC,
20433 __builtin_return_address(0));
20434 else
20435 @@ -502,6 +510,14 @@ static void __iomem *ioremap_default(res
20436 }
20437 #endif
20438
20439 +void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
20440 + unsigned long prot_val)
20441 +{
20442 + return __ioremap_caller(phys_addr, size, (prot_val & _PAGE_CACHE_MASK),
20443 + __builtin_return_address(0));
20444 +}
20445 +EXPORT_SYMBOL(ioremap_prot);
20446 +
20447 /**
20448 * iounmap - Free a IO remapping
20449 * @addr: virtual address from ioremap_*
20450 @@ -526,6 +542,8 @@ void iounmap(volatile void __iomem *addr
20451 addr = (volatile void __iomem *)
20452 (PAGE_MASK & (unsigned long __force)addr);
20453
20454 + mmiotrace_iounmap(addr);
20455 +
20456 /* Use the vm area unlocked, assuming the caller
20457 ensures there isn't another iounmap for the same address
20458 in parallel. Reuse of the virtual address is prevented by
20459 @@ -533,7 +551,7 @@ void iounmap(volatile void __iomem *addr
20460 cpa takes care of the direct mappings. */
20461 read_lock(&vmlist_lock);
20462 for (p = vmlist; p; p = p->next) {
20463 - if (p->addr == addr)
20464 + if (p->addr == (void __force *)addr)
20465 break;
20466 }
20467 read_unlock(&vmlist_lock);
20468 @@ -547,7 +565,7 @@ void iounmap(volatile void __iomem *addr
20469 free_memtype(p->phys_addr, p->phys_addr + get_vm_area_size(p));
20470
20471 /* Finally remove it */
20472 - o = remove_vm_area((void *)addr);
20473 + o = remove_vm_area((void __force *)addr);
20474 BUG_ON(p != o || o == NULL);
20475 kfree(p);
20476 }
20477 @@ -567,7 +585,7 @@ void *xlate_dev_mem_ptr(unsigned long ph
20478 if (page_is_ram(start >> PAGE_SHIFT))
20479 return __va(phys);
20480
20481 - addr = (void *)ioremap_default(start, PAGE_SIZE);
20482 + addr = (void __force *)ioremap_default(start, PAGE_SIZE);
20483 if (addr)
20484 addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK));
20485
20486 @@ -595,8 +613,7 @@ static int __init early_ioremap_debug_se
20487 early_param("early_ioremap_debug", early_ioremap_debug_setup);
20488
20489 static __initdata int after_paging_init;
20490 -static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)]
20491 - __section(.bss.page_aligned);
20492 +static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss;
20493
20494 #ifdef CONFIG_X86_32
20495 static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
20496 @@ -695,10 +712,11 @@ static void __init __early_set_fixmap(en
20497 return;
20498 }
20499 pte = early_ioremap_pte(addr);
20500 +
20501 if (pgprot_val(flags))
20502 set_pte(pte, pfn_pte_ma(phys >> PAGE_SHIFT, flags));
20503 else
20504 - pte_clear(NULL, addr, pte);
20505 + pte_clear(&init_mm, addr, pte);
20506 __flush_tlb_one(addr);
20507 }
20508
20509 @@ -726,13 +744,11 @@ static int __init check_early_ioremap_le
20510 {
20511 if (!early_ioremap_nested)
20512 return 0;
20513 -
20514 - printk(KERN_WARNING
20515 + WARN(1, KERN_WARNING
20516 "Debug warning: early ioremap leak of %d areas detected.\n",
20517 - early_ioremap_nested);
20518 + early_ioremap_nested);
20519 printk(KERN_WARNING
20520 - "please boot with early_ioremap_debug and report the dmesg.\n");
20521 - WARN_ON(1);
20522 + "please boot with early_ioremap_debug and report the dmesg.\n");
20523
20524 return 1;
20525 }
20526 --- sle11-2009-10-16.orig/arch/x86/mm/pageattr-xen.c 2009-03-16 16:38:05.000000000 +0100
20527 +++ sle11-2009-10-16/arch/x86/mm/pageattr-xen.c 2009-06-04 10:21:39.000000000 +0200
20528 @@ -34,6 +34,47 @@ struct cpa_data {
20529 unsigned force_split : 1;
20530 };
20531
20532 +#ifdef CONFIG_PROC_FS
20533 +static unsigned long direct_pages_count[PG_LEVEL_NUM];
20534 +
20535 +void update_page_count(int level, unsigned long pages)
20536 +{
20537 + unsigned long flags;
20538 +
20539 + /* Protect against CPA */
20540 + spin_lock_irqsave(&pgd_lock, flags);
20541 + direct_pages_count[level] += pages;
20542 + spin_unlock_irqrestore(&pgd_lock, flags);
20543 +}
20544 +
20545 +static void split_page_count(int level)
20546 +{
20547 + direct_pages_count[level]--;
20548 + direct_pages_count[level - 1] += PTRS_PER_PTE;
20549 +}
20550 +
20551 +int arch_report_meminfo(char *page)
20552 +{
20553 + int n = sprintf(page, "DirectMap4k: %8lu kB\n",
20554 + direct_pages_count[PG_LEVEL_4K] << 2);
20555 +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
20556 + n += sprintf(page + n, "DirectMap2M: %8lu kB\n",
20557 + direct_pages_count[PG_LEVEL_2M] << 11);
20558 +#else
20559 + n += sprintf(page + n, "DirectMap4M: %8lu kB\n",
20560 + direct_pages_count[PG_LEVEL_2M] << 12);
20561 +#endif
20562 +#ifdef CONFIG_X86_64
20563 + if (direct_gbpages)
20564 + n += sprintf(page + n, "DirectMap1G: %8lu kB\n",
20565 + direct_pages_count[PG_LEVEL_1G] << 20);
20566 +#endif
20567 + return n;
20568 +}
20569 +#else
20570 +static inline void split_page_count(int level) { }
20571 +#endif
20572 +
20573 #ifdef CONFIG_X86_64
20574
20575 static inline unsigned long highmap_start_pfn(void)
20576 @@ -106,7 +147,7 @@ static void cpa_flush_all(unsigned long
20577 {
20578 BUG_ON(irqs_disabled());
20579
20580 - on_each_cpu(__cpa_flush_all, (void *) cache, 1, 1);
20581 + on_each_cpu(__cpa_flush_all, (void *) cache, 1);
20582 }
20583
20584 static void __cpa_flush_range(void *arg)
20585 @@ -127,7 +168,7 @@ static void cpa_flush_range(unsigned lon
20586 BUG_ON(irqs_disabled());
20587 WARN_ON(PAGE_ALIGN(start) != start);
20588
20589 - on_each_cpu(__cpa_flush_range, NULL, 1, 1);
20590 + on_each_cpu(__cpa_flush_range, NULL, 1);
20591
20592 if (!cache)
20593 return;
20594 @@ -229,6 +270,7 @@ pte_t *lookup_address(unsigned long addr
20595
20596 return pte_offset_kernel(pmd, address);
20597 }
20598 +EXPORT_SYMBOL_GPL(lookup_address);
20599
20600 /*
20601 * Set the new pmd in all the pgds we know about:
20602 @@ -509,6 +551,16 @@ static int split_large_page(pte_t *kpte,
20603 }
20604 #endif
20605
20606 + if (address >= (unsigned long)__va(0) &&
20607 + address < (unsigned long)__va(max_low_pfn_mapped << PAGE_SHIFT))
20608 + split_page_count(level);
20609 +
20610 +#ifdef CONFIG_X86_64
20611 + if (address >= (unsigned long)__va(1UL<<32) &&
20612 + address < (unsigned long)__va(max_pfn_mapped << PAGE_SHIFT))
20613 + split_page_count(level);
20614 +#endif
20615 +
20616 /*
20617 * Get the target mfn from the original entry:
20618 */
20619 @@ -566,10 +618,9 @@ repeat:
20620 if (!__pte_val(old_pte)) {
20621 if (!primary)
20622 return 0;
20623 - printk(KERN_WARNING "CPA: called for zero pte. "
20624 + WARN(1, KERN_WARNING "CPA: called for zero pte. "
20625 "vaddr = %lx cpa->vaddr = %lx\n", address,
20626 cpa->vaddr);
20627 - WARN_ON(1);
20628 return -EINVAL;
20629 }
20630
20631 @@ -634,15 +685,24 @@ static int cpa_process_alias(struct cpa_
20632 struct cpa_data alias_cpa;
20633 int ret = 0;
20634
20635 - if (cpa->pfn > max_pfn_mapped)
20636 + if (cpa->pfn >= max_pfn_mapped)
20637 return 0;
20638
20639 +#ifdef CONFIG_X86_64
20640 + if (cpa->pfn >= max_low_pfn_mapped && cpa->pfn < (1UL<<(32-PAGE_SHIFT)))
20641 + return 0;
20642 +#endif
20643 /*
20644 * No need to redo, when the primary call touched the direct
20645 * mapping already:
20646 */
20647 - if (!within(cpa->vaddr, PAGE_OFFSET,
20648 - PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) {
20649 + if (!(within(cpa->vaddr, PAGE_OFFSET,
20650 + PAGE_OFFSET + (max_low_pfn_mapped << PAGE_SHIFT))
20651 +#ifdef CONFIG_X86_64
20652 + || within(cpa->vaddr, PAGE_OFFSET + (1UL<<32),
20653 + PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))
20654 +#endif
20655 + )) {
20656
20657 alias_cpa = *cpa;
20658 alias_cpa.vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);
20659 @@ -796,6 +856,51 @@ static inline int change_page_attr_clear
20660 return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0);
20661 }
20662
20663 +#ifdef CONFIG_XEN
20664 +static void _free_memtype(u64 pstart, u64 pend)
20665 +{
20666 + u64 pa = pstart &= __PHYSICAL_MASK;
20667 + u64 ma = phys_to_machine(pa);
20668 +
20669 + while ((pa += PAGE_SIZE) < pend) {
20670 + if (phys_to_machine(pa) != ma + (pa - pstart)) {
20671 + free_memtype(ma, ma + (pa - pstart));
20672 + pstart = pa;
20673 + ma = phys_to_machine(pa);
20674 + }
20675 + }
20676 + free_memtype(ma, ma + (pend - pstart));
20677 +}
20678 +#define free_memtype _free_memtype
20679 +
20680 +static int _reserve_memtype(u64 pstart, u64 pend, unsigned long req_type)
20681 +{
20682 + u64 pcur = pstart &= __PHYSICAL_MASK, pa = pcur;
20683 + u64 ma = phys_to_machine(pa);
20684 + int rc = 0;
20685 +
20686 + while ((pa += PAGE_SIZE) < pend) {
20687 + if (phys_to_machine(pa) != ma + (pa - pcur)) {
20688 + rc = reserve_memtype(ma, ma + (pa - pcur),
20689 + req_type, NULL);
20690 + if (rc)
20691 + break;
20692 + pcur = pa;
20693 + ma = phys_to_machine(pa);
20694 + }
20695 + }
20696 + if (likely(!rc))
20697 + rc = reserve_memtype(ma, ma + (pend - pcur), req_type, NULL);
20698 +
20699 + if (unlikely(!rc) && pstart < pcur)
20700 + _free_memtype(pstart, pcur);
20701 +
20702 + return rc;
20703 +}
20704 +#define reserve_memtype(s, e, r, n) \
20705 + _reserve_memtype(s, e, BUILD_BUG_ON_ZERO(n) ?: (r))
20706 +#endif
20707 +
20708 int _set_memory_uc(unsigned long addr, int numpages)
20709 {
20710 /*
20711 @@ -810,7 +915,7 @@ int set_memory_uc(unsigned long addr, in
20712 /*
20713 * for now UC MINUS. see comments in ioremap_nocache()
20714 */
20715 - if (reserve_memtype(addr, addr + numpages * PAGE_SIZE,
20716 + if (reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
20717 _PAGE_CACHE_UC_MINUS, NULL))
20718 return -EINVAL;
20719
20720 @@ -826,10 +931,10 @@ int _set_memory_wc(unsigned long addr, i
20721
20722 int set_memory_wc(unsigned long addr, int numpages)
20723 {
20724 - if (!pat_wc_enabled)
20725 + if (!pat_enabled)
20726 return set_memory_uc(addr, numpages);
20727
20728 - if (reserve_memtype(addr, addr + numpages * PAGE_SIZE,
20729 + if (reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
20730 _PAGE_CACHE_WC, NULL))
20731 return -EINVAL;
20732
20733 @@ -845,7 +950,7 @@ int _set_memory_wb(unsigned long addr, i
20734
20735 int set_memory_wb(unsigned long addr, int numpages)
20736 {
20737 - free_memtype(addr, addr + numpages * PAGE_SIZE);
20738 + free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
20739
20740 return _set_memory_wb(addr, numpages);
20741 }
20742 --- sle11-2009-10-16.orig/arch/x86/mm/pat-xen.c 2009-03-16 16:38:05.000000000 +0100
20743 +++ sle11-2009-10-16/arch/x86/mm/pat-xen.c 2009-06-04 10:21:39.000000000 +0200
20744 @@ -12,6 +12,8 @@
20745 #include <linux/gfp.h>
20746 #include <linux/fs.h>
20747 #include <linux/bootmem.h>
20748 +#include <linux/debugfs.h>
20749 +#include <linux/seq_file.h>
20750
20751 #include <asm/msr.h>
20752 #include <asm/tlbflush.h>
20753 @@ -26,11 +28,11 @@
20754 #include <asm/io.h>
20755
20756 #ifdef CONFIG_X86_PAT
20757 -int __read_mostly pat_wc_enabled = 1;
20758 +int __read_mostly pat_enabled = 1;
20759
20760 void __cpuinit pat_disable(char *reason)
20761 {
20762 - pat_wc_enabled = 0;
20763 + pat_enabled = 0;
20764 printk(KERN_INFO "%s\n", reason);
20765 }
20766
20767 @@ -42,6 +44,19 @@ static int __init nopat(char *str)
20768 early_param("nopat", nopat);
20769 #endif
20770
20771 +
20772 +static int debug_enable;
20773 +static int __init pat_debug_setup(char *str)
20774 +{
20775 + debug_enable = 1;
20776 + return 0;
20777 +}
20778 +__setup("debugpat", pat_debug_setup);
20779 +
20780 +#define dprintk(fmt, arg...) \
20781 + do { if (debug_enable) printk(KERN_INFO fmt, ##arg); } while (0)
20782 +
20783 +
20784 static u64 __read_mostly boot_pat_state;
20785
20786 enum {
20787 @@ -53,24 +68,25 @@ enum {
20788 PAT_UC_MINUS = 7, /* UC, but can be overriden by MTRR */
20789 };
20790
20791 -#define PAT(x,y) ((u64)PAT_ ## y << ((x)*8))
20792 +#define PAT(x, y) ((u64)PAT_ ## y << ((x)*8))
20793
20794 void pat_init(void)
20795 {
20796 u64 pat;
20797
20798 - if (!pat_wc_enabled)
20799 + if (!pat_enabled)
20800 return;
20801
20802 /* Paranoia check. */
20803 - if (!cpu_has_pat) {
20804 - printk(KERN_ERR "PAT enabled, but CPU feature cleared\n");
20805 + if (!cpu_has_pat && boot_pat_state) {
20806 /*
20807 - * Panic if this happens on the secondary CPU, and we
20808 + * If this happens we are on a secondary CPU, but
20809 * switched to PAT on the boot CPU. We have no way to
20810 * undo PAT.
20811 - */
20812 - BUG_ON(boot_pat_state);
20813 + */
20814 + printk(KERN_ERR "PAT enabled, "
20815 + "but not supported by secondary CPU\n");
20816 + BUG();
20817 }
20818
20819 #ifndef CONFIG_XEN
20820 @@ -87,8 +103,8 @@ void pat_init(void)
20821 * 011 UC _PAGE_CACHE_UC
20822 * PAT bit unused
20823 */
20824 - pat = PAT(0,WB) | PAT(1,WC) | PAT(2,UC_MINUS) | PAT(3,UC) |
20825 - PAT(4,WB) | PAT(5,WC) | PAT(6,UC_MINUS) | PAT(7,UC);
20826 + pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
20827 + PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC);
20828
20829 /* Boot CPU check */
20830 if (!boot_pat_state)
20831 @@ -113,13 +129,13 @@ void pat_init(void)
20832 static char *cattr_name(unsigned long flags)
20833 {
20834 switch (flags & _PAGE_CACHE_MASK) {
20835 - case _PAGE_CACHE_UC: return "uncached";
20836 - case _PAGE_CACHE_UC_MINUS: return "uncached-minus";
20837 - case _PAGE_CACHE_WB: return "write-back";
20838 - case _PAGE_CACHE_WC: return "write-combining";
20839 - case _PAGE_CACHE_WP: return "write-protected";
20840 - case _PAGE_CACHE_WT: return "write-through";
20841 - default: return "broken";
20842 + case _PAGE_CACHE_UC: return "uncached";
20843 + case _PAGE_CACHE_UC_MINUS: return "uncached-minus";
20844 + case _PAGE_CACHE_WB: return "write-back";
20845 + case _PAGE_CACHE_WC: return "write-combining";
20846 + case _PAGE_CACHE_WP: return "write-protected";
20847 + case _PAGE_CACHE_WT: return "write-through";
20848 + default: return "broken";
20849 }
20850 }
20851
20852 @@ -157,49 +173,55 @@ static DEFINE_SPINLOCK(memtype_lock); /
20853 * The intersection is based on "Effective Memory Type" tables in IA-32
20854 * SDM vol 3a
20855 */
20856 -static int pat_x_mtrr_type(u64 start, u64 end, unsigned long prot,
20857 - unsigned long *ret_prot)
20858 +static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type)
20859 {
20860 - unsigned long pat_type;
20861 - u8 mtrr_type;
20862 -
20863 - pat_type = prot & _PAGE_CACHE_MASK;
20864 - prot &= (~_PAGE_CACHE_MASK);
20865 -
20866 - /*
20867 - * We return the PAT request directly for types where PAT takes
20868 - * precedence with respect to MTRR and for UC_MINUS.
20869 - * Consistency checks with other PAT requests is done later
20870 - * while going through memtype list.
20871 - */
20872 - if (pat_type == _PAGE_CACHE_WC) {
20873 - *ret_prot = prot | _PAGE_CACHE_WC;
20874 - return 0;
20875 - } else if (pat_type == _PAGE_CACHE_UC_MINUS) {
20876 - *ret_prot = prot | _PAGE_CACHE_UC_MINUS;
20877 - return 0;
20878 - } else if (pat_type == _PAGE_CACHE_UC) {
20879 - *ret_prot = prot | _PAGE_CACHE_UC;
20880 - return 0;
20881 - }
20882 -
20883 /*
20884 * Look for MTRR hint to get the effective type in case where PAT
20885 * request is for WB.
20886 */
20887 - mtrr_type = mtrr_type_lookup(start, end);
20888 + if (req_type == _PAGE_CACHE_WB) {
20889 + u8 mtrr_type;
20890
20891 - if (mtrr_type == MTRR_TYPE_UNCACHABLE) {
20892 - *ret_prot = prot | _PAGE_CACHE_UC;
20893 - } else if (mtrr_type == MTRR_TYPE_WRCOMB) {
20894 - *ret_prot = prot | _PAGE_CACHE_WC;
20895 - } else {
20896 - *ret_prot = prot | _PAGE_CACHE_WB;
20897 + mtrr_type = mtrr_type_lookup(start, end);
20898 + if (mtrr_type == MTRR_TYPE_UNCACHABLE)
20899 + return _PAGE_CACHE_UC;
20900 + if (mtrr_type == MTRR_TYPE_WRCOMB)
20901 + return _PAGE_CACHE_WC;
20902 + }
20903 +
20904 + return req_type;
20905 +}
20906 +
20907 +static int chk_conflict(struct memtype *new, struct memtype *entry,
20908 + unsigned long *type)
20909 +{
20910 + if (new->type != entry->type) {
20911 + if (type) {
20912 + new->type = entry->type;
20913 + *type = entry->type;
20914 + } else
20915 + goto conflict;
20916 }
20917
20918 + /* check overlaps with more than one entry in the list */
20919 + list_for_each_entry_continue(entry, &memtype_list, nd) {
20920 + if (new->end <= entry->start)
20921 + break;
20922 + else if (new->type != entry->type)
20923 + goto conflict;
20924 + }
20925 return 0;
20926 +
20927 + conflict:
20928 + printk(KERN_INFO "%s:%d conflicting memory types "
20929 + "%Lx-%Lx %s<->%s\n", current->comm, current->pid, new->start,
20930 + new->end, cattr_name(new->type), cattr_name(entry->type));
20931 + return -EBUSY;
20932 }
20933
20934 +static struct memtype *cached_entry;
20935 +static u64 cached_start;
20936 +
20937 /*
20938 * req_type typically has one of the:
20939 * - _PAGE_CACHE_WB
20940 @@ -210,37 +232,36 @@ static int pat_x_mtrr_type(u64 start, u6
20941 * req_type will have a special case value '-1', when requester want to inherit
20942 * the memory type from mtrr (if WB), existing PAT, defaulting to UC_MINUS.
20943 *
20944 - * If ret_type is NULL, function will return an error if it cannot reserve the
20945 - * region with req_type. If ret_type is non-null, function will return
20946 - * available type in ret_type in case of no error. In case of any error
20947 + * If new_type is NULL, function will return an error if it cannot reserve the
20948 + * region with req_type. If new_type is non-NULL, function will return
20949 + * available type in new_type in case of no error. In case of any error
20950 * it will return a negative return value.
20951 */
20952 int reserve_memtype(u64 start, u64 end, unsigned long req_type,
20953 - unsigned long *ret_type)
20954 + unsigned long *new_type)
20955 {
20956 - struct memtype *new_entry = NULL;
20957 - struct memtype *parse;
20958 + struct memtype *new, *entry;
20959 unsigned long actual_type;
20960 + struct list_head *where;
20961 int err = 0;
20962
20963 - /* Only track when pat_wc_enabled */
20964 - if (!pat_wc_enabled) {
20965 + BUG_ON(start >= end); /* end is exclusive */
20966 +
20967 + if (!pat_enabled) {
20968 /* This is identical to page table setting without PAT */
20969 - if (ret_type) {
20970 - if (req_type == -1) {
20971 - *ret_type = _PAGE_CACHE_WB;
20972 - } else {
20973 - *ret_type = req_type;
20974 - }
20975 + if (new_type) {
20976 + if (req_type == -1)
20977 + *new_type = _PAGE_CACHE_WB;
20978 + else
20979 + *new_type = req_type & _PAGE_CACHE_MASK;
20980 }
20981 return 0;
20982 }
20983
20984 /* Low ISA region is always mapped WB in page table. No need to track */
20985 - if (start >= ISA_START_ADDRESS && (end - 1) <= ISA_END_ADDRESS) {
20986 - if (ret_type)
20987 - *ret_type = _PAGE_CACHE_WB;
20988 -
20989 + if (is_ISA_range(start, end - 1)) {
20990 + if (new_type)
20991 + *new_type = _PAGE_CACHE_WB;
20992 return 0;
20993 }
20994
20995 @@ -253,206 +274,118 @@ int reserve_memtype(u64 start, u64 end,
20996 */
20997 u8 mtrr_type = mtrr_type_lookup(start, end);
20998
20999 - if (mtrr_type == MTRR_TYPE_WRBACK) {
21000 - req_type = _PAGE_CACHE_WB;
21001 + if (mtrr_type == MTRR_TYPE_WRBACK)
21002 actual_type = _PAGE_CACHE_WB;
21003 - } else {
21004 - req_type = _PAGE_CACHE_UC_MINUS;
21005 + else
21006 actual_type = _PAGE_CACHE_UC_MINUS;
21007 - }
21008 - } else {
21009 - req_type &= _PAGE_CACHE_MASK;
21010 - err = pat_x_mtrr_type(start, end, req_type, &actual_type);
21011 - }
21012 -
21013 - if (err) {
21014 - if (ret_type)
21015 - *ret_type = actual_type;
21016 + } else
21017 + actual_type = pat_x_mtrr_type(start, end,
21018 + req_type & _PAGE_CACHE_MASK);
21019
21020 - return -EINVAL;
21021 - }
21022 -
21023 - new_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL);
21024 - if (!new_entry)
21025 + new = kmalloc(sizeof(struct memtype), GFP_KERNEL);
21026 + if (!new)
21027 return -ENOMEM;
21028
21029 - new_entry->start = start;
21030 - new_entry->end = end;
21031 - new_entry->type = actual_type;
21032 + new->start = start;
21033 + new->end = end;
21034 + new->type = actual_type;
21035
21036 - if (ret_type)
21037 - *ret_type = actual_type;
21038 + if (new_type)
21039 + *new_type = actual_type;
21040
21041 spin_lock(&memtype_lock);
21042
21043 - /* Search for existing mapping that overlaps the current range */
21044 - list_for_each_entry(parse, &memtype_list, nd) {
21045 - struct memtype *saved_ptr;
21046 + if (cached_entry && start >= cached_start)
21047 + entry = cached_entry;
21048 + else
21049 + entry = list_entry(&memtype_list, struct memtype, nd);
21050
21051 - if (parse->start >= end) {
21052 - pr_debug("New Entry\n");
21053 - list_add(&new_entry->nd, parse->nd.prev);
21054 - new_entry = NULL;
21055 + /* Search for existing mapping that overlaps the current range */
21056 + where = NULL;
21057 + list_for_each_entry_continue(entry, &memtype_list, nd) {
21058 + if (end <= entry->start) {
21059 + where = entry->nd.prev;
21060 + cached_entry = list_entry(where, struct memtype, nd);
21061 break;
21062 - }
21063 -
21064 - if (start <= parse->start && end >= parse->start) {
21065 - if (actual_type != parse->type && ret_type) {
21066 - actual_type = parse->type;
21067 - *ret_type = actual_type;
21068 - new_entry->type = actual_type;
21069 - }
21070 -
21071 - if (actual_type != parse->type) {
21072 - printk(
21073 - KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
21074 - current->comm, current->pid,
21075 - start, end,
21076 - cattr_name(actual_type),
21077 - cattr_name(parse->type));
21078 - err = -EBUSY;
21079 - break;
21080 - }
21081 -
21082 - saved_ptr = parse;
21083 - /*
21084 - * Check to see whether the request overlaps more
21085 - * than one entry in the list
21086 - */
21087 - list_for_each_entry_continue(parse, &memtype_list, nd) {
21088 - if (end <= parse->start) {
21089 - break;
21090 - }
21091 -
21092 - if (actual_type != parse->type) {
21093 - printk(
21094 - KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
21095 - current->comm, current->pid,
21096 - start, end,
21097 - cattr_name(actual_type),
21098 - cattr_name(parse->type));
21099 - err = -EBUSY;
21100 - break;
21101 - }
21102 - }
21103 -
21104 - if (err) {
21105 - break;
21106 + } else if (start <= entry->start) { /* end > entry->start */
21107 + err = chk_conflict(new, entry, new_type);
21108 + if (!err) {
21109 + dprintk("Overlap at 0x%Lx-0x%Lx\n",
21110 + entry->start, entry->end);
21111 + where = entry->nd.prev;
21112 + cached_entry = list_entry(where,
21113 + struct memtype, nd);
21114 }
21115 -
21116 - pr_debug("Overlap at 0x%Lx-0x%Lx\n",
21117 - saved_ptr->start, saved_ptr->end);
21118 - /* No conflict. Go ahead and add this new entry */
21119 - list_add(&new_entry->nd, saved_ptr->nd.prev);
21120 - new_entry = NULL;
21121 break;
21122 - }
21123 -
21124 - if (start < parse->end) {
21125 - if (actual_type != parse->type && ret_type) {
21126 - actual_type = parse->type;
21127 - *ret_type = actual_type;
21128 - new_entry->type = actual_type;
21129 - }
21130 -
21131 - if (actual_type != parse->type) {
21132 - printk(
21133 - KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
21134 - current->comm, current->pid,
21135 - start, end,
21136 - cattr_name(actual_type),
21137 - cattr_name(parse->type));
21138 - err = -EBUSY;
21139 - break;
21140 - }
21141 -
21142 - saved_ptr = parse;
21143 - /*
21144 - * Check to see whether the request overlaps more
21145 - * than one entry in the list
21146 - */
21147 - list_for_each_entry_continue(parse, &memtype_list, nd) {
21148 - if (end <= parse->start) {
21149 - break;
21150 - }
21151 -
21152 - if (actual_type != parse->type) {
21153 - printk(
21154 - KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
21155 - current->comm, current->pid,
21156 - start, end,
21157 - cattr_name(actual_type),
21158 - cattr_name(parse->type));
21159 - err = -EBUSY;
21160 - break;
21161 + } else if (start < entry->end) { /* start > entry->start */
21162 + err = chk_conflict(new, entry, new_type);
21163 + if (!err) {
21164 + dprintk("Overlap at 0x%Lx-0x%Lx\n",
21165 + entry->start, entry->end);
21166 + cached_entry = list_entry(entry->nd.prev,
21167 + struct memtype, nd);
21168 +
21169 + /*
21170 + * Move to right position in the linked
21171 + * list to add this new entry
21172 + */
21173 + list_for_each_entry_continue(entry,
21174 + &memtype_list, nd) {
21175 + if (start <= entry->start) {
21176 + where = entry->nd.prev;
21177 + break;
21178 + }
21179 }
21180 }
21181 -
21182 - if (err) {
21183 - break;
21184 - }
21185 -
21186 - pr_debug(KERN_INFO "Overlap at 0x%Lx-0x%Lx\n",
21187 - saved_ptr->start, saved_ptr->end);
21188 - /* No conflict. Go ahead and add this new entry */
21189 - list_add(&new_entry->nd, &saved_ptr->nd);
21190 - new_entry = NULL;
21191 break;
21192 }
21193 }
21194
21195 if (err) {
21196 - printk(KERN_INFO
21197 - "reserve_memtype failed 0x%Lx-0x%Lx, track %s, req %s\n",
21198 - start, end, cattr_name(new_entry->type),
21199 - cattr_name(req_type));
21200 - kfree(new_entry);
21201 + printk(KERN_INFO "reserve_memtype failed 0x%Lx-0x%Lx, "
21202 + "track %s, req %s\n",
21203 + start, end, cattr_name(new->type), cattr_name(req_type));
21204 + kfree(new);
21205 spin_unlock(&memtype_lock);
21206 return err;
21207 }
21208
21209 - if (new_entry) {
21210 - /* No conflict. Not yet added to the list. Add to the tail */
21211 - list_add_tail(&new_entry->nd, &memtype_list);
21212 - pr_debug("New Entry\n");
21213 - }
21214 + cached_start = start;
21215
21216 - if (ret_type) {
21217 - pr_debug(
21218 - "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n",
21219 - start, end, cattr_name(actual_type),
21220 - cattr_name(req_type), cattr_name(*ret_type));
21221 - } else {
21222 - pr_debug(
21223 - "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s\n",
21224 - start, end, cattr_name(actual_type),
21225 - cattr_name(req_type));
21226 - }
21227 + if (where)
21228 + list_add(&new->nd, where);
21229 + else
21230 + list_add_tail(&new->nd, &memtype_list);
21231
21232 spin_unlock(&memtype_lock);
21233 +
21234 + dprintk("reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n",
21235 + start, end, cattr_name(new->type), cattr_name(req_type),
21236 + new_type ? cattr_name(*new_type) : "-");
21237 +
21238 return err;
21239 }
21240
21241 int free_memtype(u64 start, u64 end)
21242 {
21243 - struct memtype *ml;
21244 + struct memtype *entry;
21245 int err = -EINVAL;
21246
21247 - /* Only track when pat_wc_enabled */
21248 - if (!pat_wc_enabled) {
21249 + if (!pat_enabled)
21250 return 0;
21251 - }
21252
21253 /* Low ISA region is always mapped WB. No need to track */
21254 - if (start >= ISA_START_ADDRESS && end <= ISA_END_ADDRESS) {
21255 + if (is_ISA_range(start, end - 1))
21256 return 0;
21257 - }
21258
21259 spin_lock(&memtype_lock);
21260 - list_for_each_entry(ml, &memtype_list, nd) {
21261 - if (ml->start == start && ml->end == end) {
21262 - list_del(&ml->nd);
21263 - kfree(ml);
21264 + list_for_each_entry(entry, &memtype_list, nd) {
21265 + if (entry->start == start && entry->end == end) {
21266 + if (cached_entry == entry || cached_start == start)
21267 + cached_entry = NULL;
21268 +
21269 + list_del(&entry->nd);
21270 + kfree(entry);
21271 err = 0;
21272 break;
21273 }
21274 @@ -464,27 +397,19 @@ int free_memtype(u64 start, u64 end)
21275 current->comm, current->pid, start, end);
21276 }
21277
21278 - pr_debug("free_memtype request 0x%Lx-0x%Lx\n", start, end);
21279 + dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end);
21280 return err;
21281 }
21282
21283
21284 -/*
21285 - * /dev/mem mmap interface. The memtype used for mapping varies:
21286 - * - Use UC for mappings with O_SYNC flag
21287 - * - Without O_SYNC flag, if there is any conflict in reserve_memtype,
21288 - * inherit the memtype from existing mapping.
21289 - * - Else use UC_MINUS memtype (for backward compatibility with existing
21290 - * X drivers.
21291 - */
21292 pgprot_t phys_mem_access_prot(struct file *file, unsigned long mfn,
21293 unsigned long size, pgprot_t vma_prot)
21294 {
21295 return vma_prot;
21296 }
21297
21298 -#ifdef CONFIG_NONPROMISC_DEVMEM
21299 -/* This check is done in drivers/char/mem.c in case of NONPROMISC_DEVMEM*/
21300 +#ifdef CONFIG_STRICT_DEVMEM
21301 +/* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM*/
21302 static inline int range_is_allowed(unsigned long mfn, unsigned long size)
21303 {
21304 return 1;
21305 @@ -508,20 +433,20 @@ static inline int range_is_allowed(unsig
21306 }
21307 return 1;
21308 }
21309 -#endif /* CONFIG_NONPROMISC_DEVMEM */
21310 +#endif /* CONFIG_STRICT_DEVMEM */
21311
21312 int phys_mem_access_prot_allowed(struct file *file, unsigned long mfn,
21313 unsigned long size, pgprot_t *vma_prot)
21314 {
21315 u64 addr = (u64)mfn << PAGE_SHIFT;
21316 - unsigned long flags = _PAGE_CACHE_UC_MINUS;
21317 + unsigned long flags = -1;
21318 int retval;
21319
21320 if (!range_is_allowed(mfn, size))
21321 return 0;
21322
21323 if (file->f_flags & O_SYNC) {
21324 - flags = _PAGE_CACHE_UC;
21325 + flags = _PAGE_CACHE_UC_MINUS;
21326 }
21327
21328 #ifndef CONFIG_X86_32
21329 @@ -534,25 +459,26 @@ int phys_mem_access_prot_allowed(struct
21330 * caching for the high addresses through the KEN pin, but
21331 * we maintain the tradition of paranoia in this code.
21332 */
21333 - if (!pat_wc_enabled &&
21334 - ! ( test_bit(X86_FEATURE_MTRR, boot_cpu_data.x86_capability) ||
21335 - test_bit(X86_FEATURE_K6_MTRR, boot_cpu_data.x86_capability) ||
21336 - test_bit(X86_FEATURE_CYRIX_ARR, boot_cpu_data.x86_capability) ||
21337 - test_bit(X86_FEATURE_CENTAUR_MCR, boot_cpu_data.x86_capability)) &&
21338 - (pfn << PAGE_SHIFT) >= __pa(high_memory)) {
21339 + if (!pat_enabled &&
21340 + !(boot_cpu_has(X86_FEATURE_MTRR) ||
21341 + boot_cpu_has(X86_FEATURE_K6_MTRR) ||
21342 + boot_cpu_has(X86_FEATURE_CYRIX_ARR) ||
21343 + boot_cpu_has(X86_FEATURE_CENTAUR_MCR)) &&
21344 + (pfn << PAGE_SHIFT) >= __pa(high_memory)) {
21345 flags = _PAGE_CACHE_UC;
21346 }
21347 #endif
21348 #endif
21349
21350 /*
21351 - * With O_SYNC, we can only take UC mapping. Fail if we cannot.
21352 + * With O_SYNC, we can only take UC_MINUS mapping. Fail if we cannot.
21353 + *
21354 * Without O_SYNC, we want to get
21355 * - WB for WB-able memory and no other conflicting mappings
21356 * - UC_MINUS for non-WB-able memory with no other conflicting mappings
21357 * - Inherit from confliting mappings otherwise
21358 */
21359 - if (flags != _PAGE_CACHE_UC_MINUS) {
21360 + if (flags != -1) {
21361 retval = reserve_memtype(addr, addr + size, flags, NULL);
21362 } else {
21363 retval = reserve_memtype(addr, addr + size, -1, &flags);
21364 @@ -600,3 +526,88 @@ void unmap_devmem(unsigned long mfn, uns
21365 free_memtype(addr, addr + size);
21366 }
21367
21368 +#if defined(CONFIG_DEBUG_FS)
21369 +
21370 +/* get Nth element of the linked list */
21371 +static struct memtype *memtype_get_idx(loff_t pos)
21372 +{
21373 + struct memtype *list_node, *print_entry;
21374 + int i = 1;
21375 +
21376 + print_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL);
21377 + if (!print_entry)
21378 + return NULL;
21379 +
21380 + spin_lock(&memtype_lock);
21381 + list_for_each_entry(list_node, &memtype_list, nd) {
21382 + if (pos == i) {
21383 + *print_entry = *list_node;
21384 + spin_unlock(&memtype_lock);
21385 + return print_entry;
21386 + }
21387 + ++i;
21388 + }
21389 + spin_unlock(&memtype_lock);
21390 + kfree(print_entry);
21391 + return NULL;
21392 +}
21393 +
21394 +static void *memtype_seq_start(struct seq_file *seq, loff_t *pos)
21395 +{
21396 + if (*pos == 0) {
21397 + ++*pos;
21398 + seq_printf(seq, "PAT memtype list:\n");
21399 + }
21400 +
21401 + return memtype_get_idx(*pos);
21402 +}
21403 +
21404 +static void *memtype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
21405 +{
21406 + ++*pos;
21407 + return memtype_get_idx(*pos);
21408 +}
21409 +
21410 +static void memtype_seq_stop(struct seq_file *seq, void *v)
21411 +{
21412 +}
21413 +
21414 +static int memtype_seq_show(struct seq_file *seq, void *v)
21415 +{
21416 + struct memtype *print_entry = (struct memtype *)v;
21417 +
21418 + seq_printf(seq, "%s @ 0x%Lx-0x%Lx\n", cattr_name(print_entry->type),
21419 + print_entry->start, print_entry->end);
21420 + kfree(print_entry);
21421 + return 0;
21422 +}
21423 +
21424 +static struct seq_operations memtype_seq_ops = {
21425 + .start = memtype_seq_start,
21426 + .next = memtype_seq_next,
21427 + .stop = memtype_seq_stop,
21428 + .show = memtype_seq_show,
21429 +};
21430 +
21431 +static int memtype_seq_open(struct inode *inode, struct file *file)
21432 +{
21433 + return seq_open(file, &memtype_seq_ops);
21434 +}
21435 +
21436 +static const struct file_operations memtype_fops = {
21437 + .open = memtype_seq_open,
21438 + .read = seq_read,
21439 + .llseek = seq_lseek,
21440 + .release = seq_release,
21441 +};
21442 +
21443 +static int __init pat_memtype_list_init(void)
21444 +{
21445 + debugfs_create_file("pat_memtype_list", S_IRUSR, arch_debugfs_dir,
21446 + NULL, &memtype_fops);
21447 + return 0;
21448 +}
21449 +
21450 +late_initcall(pat_memtype_list_init);
21451 +
21452 +#endif /* CONFIG_DEBUG_FS */
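
The reworked PAT code above keeps memtype_list sorted by start address, routes all attribute comparisons through chk_conflict(), caches the last lookup position in cached_entry/cached_start, and (with CONFIG_DEBUG_FS) exposes the list through a pat_memtype_list file in the x86 debugfs directory. As a rough illustration of the overlap check only, the following standalone C sketch models a sorted, non-overlapping reservation list; struct region, check_conflict() and the array representation are invented for the example and merely stand in for the kernel's struct memtype list walk under memtype_lock.

/* Editorial sketch, not part of the patch: a minimal userspace model of the
 * overlap/conflict test that chk_conflict() performs on the sorted list. */
#include <stdio.h>

struct region { unsigned long long start, end; int type; };

/* Return 0 if [start,end) may take 'type' given already-reserved, sorted,
 * non-overlapping regions; return -1 on a conflicting cache attribute. */
static int check_conflict(const struct region *res, int n,
                          unsigned long long start, unsigned long long end,
                          int type)
{
        int i;

        for (i = 0; i < n; i++) {
                if (end <= res[i].start)        /* everything later starts even later */
                        break;
                if (start < res[i].end && type != res[i].type)
                        return -1;              /* overlap with a different type */
        }
        return 0;
}

int main(void)
{
        struct region reserved[] = {
                { 0x1000, 0x2000, 0 },          /* e.g. write-back */
                { 0x3000, 0x4000, 1 },          /* e.g. uncached */
        };

        printf("%d\n", check_conflict(reserved, 2, 0x1800, 0x2800, 0)); /* 0: same type */
        printf("%d\n", check_conflict(reserved, 2, 0x3800, 0x5000, 0)); /* -1: conflict */
        return 0;
}

On a kernel built with this support the reservations can typically be inspected, once debugfs is mounted, via /sys/kernel/debug/x86/pat_memtype_list.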
21453 --- sle11-2009-10-16.orig/arch/x86/mm/pgtable-xen.c 2009-03-16 16:38:05.000000000 +0100
21454 +++ sle11-2009-10-16/arch/x86/mm/pgtable-xen.c 2009-06-04 10:21:39.000000000 +0200
21455 @@ -4,6 +4,7 @@
21456 #include <asm/pgalloc.h>
21457 #include <asm/pgtable.h>
21458 #include <asm/tlb.h>
21459 +#include <asm/fixmap.h>
21460 #include <asm/hypervisor.h>
21461 #include <asm/mmu_context.h>
21462
21463 @@ -410,15 +411,9 @@ static inline void pgd_list_del(pgd_t *p
21464 static void pgd_ctor(void *p)
21465 {
21466 pgd_t *pgd = p;
21467 - unsigned long flags;
21468
21469 pgd_test_and_unpin(pgd);
21470
21471 - /* Clear usermode parts of PGD */
21472 - memset(pgd, 0, KERNEL_PGD_BOUNDARY*sizeof(pgd_t));
21473 -
21474 - spin_lock_irqsave(&pgd_lock, flags);
21475 -
21476 /* If the pgd points to a shared pagetable level (either the
21477 ptes in non-PAE, or shared PMD in PAE), then just copy the
21478 references from swapper_pg_dir. */
21479 @@ -440,13 +435,9 @@ static void pgd_ctor(void *p)
21480 __pgd(__pa_symbol(level3_user_pgt) | _PAGE_TABLE);
21481 #endif
21482
21483 -#ifndef CONFIG_X86_PAE
21484 /* list required to sync kernel mapping updates */
21485 if (!SHARED_KERNEL_PMD)
21486 pgd_list_add(pgd);
21487 -#endif
21488 -
21489 - spin_unlock_irqrestore(&pgd_lock, flags);
21490 }
21491
21492 static void pgd_dtor(void *pgd)
21493 @@ -475,33 +466,6 @@ static void pgd_dtor(void *pgd)
21494
21495 #ifdef CONFIG_X86_PAE
21496 /*
21497 - * Mop up any pmd pages which may still be attached to the pgd.
21498 - * Normally they will be freed by munmap/exit_mmap, but any pmd we
21499 - * preallocate which never got a corresponding vma will need to be
21500 - * freed manually.
21501 - */
21502 -static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
21503 -{
21504 - int i;
21505 -
21506 - for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) {
21507 - pgd_t pgd = pgdp[i];
21508 -
21509 - if (__pgd_val(pgd) != 0) {
21510 - pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
21511 -
21512 - pgdp[i] = xen_make_pgd(0);
21513 -
21514 - paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
21515 - pmd_free(mm, pmd);
21516 - }
21517 - }
21518 -
21519 - if (!xen_feature(XENFEAT_pae_pgdir_above_4gb))
21520 - xen_destroy_contiguous_region((unsigned long)pgdp, 0);
21521 -}
21522 -
21523 -/*
21524 * In PAE mode, we need to do a cr3 reload (=tlb flush) when
21525 * updating the top-level pagetable entries to guarantee the
21526 * processor notices the update. Since this is expensive, and
21527 @@ -512,61 +476,7 @@ static void pgd_mop_up_pmds(struct mm_st
21528 * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
21529 * and initialize the kernel pmds here.
21530 */
21531 -static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
21532 -{
21533 - pud_t *pud;
21534 - pmd_t *pmds[UNSHARED_PTRS_PER_PGD];
21535 - unsigned long addr, flags;
21536 - int i;
21537 -
21538 - /*
21539 - * We can race save/restore (if we sleep during a GFP_KERNEL memory
21540 - * allocation). We therefore store virtual addresses of pmds as they
21541 - * do not change across save/restore, and poke the machine addresses
21542 - * into the pgdir under the pgd_lock.
21543 - */
21544 - for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD; i++, addr += PUD_SIZE) {
21545 - pmds[i] = pmd_alloc_one(mm, addr);
21546 - if (!pmds[i])
21547 - goto out_oom;
21548 - }
21549 -
21550 - spin_lock_irqsave(&pgd_lock, flags);
21551 -
21552 - /* Protect against save/restore: move below 4GB under pgd_lock. */
21553 - if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)
21554 - && xen_create_contiguous_region((unsigned long)pgd, 0, 32)) {
21555 - spin_unlock_irqrestore(&pgd_lock, flags);
21556 -out_oom:
21557 - while (i--)
21558 - pmd_free(mm, pmds[i]);
21559 - return 0;
21560 - }
21561 -
21562 - /* Copy kernel pmd contents and write-protect the new pmds. */
21563 - pud = pud_offset(pgd, 0);
21564 - for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD;
21565 - i++, pud++, addr += PUD_SIZE) {
21566 - if (i >= KERNEL_PGD_BOUNDARY) {
21567 - memcpy(pmds[i],
21568 - (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
21569 - sizeof(pmd_t) * PTRS_PER_PMD);
21570 - make_lowmem_page_readonly(
21571 - pmds[i], XENFEAT_writable_page_tables);
21572 - }
21573 -
21574 - /* It is safe to poke machine addresses of pmds under the pgd_lock. */
21575 - pud_populate(mm, pud, pmds[i]);
21576 - }
21577 -
21578 - /* List required to sync kernel mapping updates and
21579 - * to pin/unpin on save/restore. */
21580 - pgd_list_add(pgd);
21581 -
21582 - spin_unlock_irqrestore(&pgd_lock, flags);
21583 -
21584 - return 1;
21585 -}
21586 +#define PREALLOCATED_PMDS UNSHARED_PTRS_PER_PGD
21587
21588 void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
21589 {
21590 @@ -596,16 +506,101 @@ void pud_populate(struct mm_struct *mm,
21591 xen_tlb_flush();
21592 }
21593 #else /* !CONFIG_X86_PAE */
21594 +
21595 /* No need to prepopulate any pagetable entries in non-PAE modes. */
21596 -static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
21597 +#define PREALLOCATED_PMDS 0
21598 +
21599 +#endif /* CONFIG_X86_PAE */
21600 +
21601 +static void free_pmds(pmd_t *pmds[], struct mm_struct *mm, bool contig)
21602 {
21603 - return 1;
21604 + int i;
21605 +
21606 +#ifdef CONFIG_X86_PAE
21607 + if (contig)
21608 + xen_destroy_contiguous_region((unsigned long)mm->pgd, 0);
21609 +#endif
21610 +
21611 + for(i = 0; i < PREALLOCATED_PMDS; i++)
21612 + if (pmds[i])
21613 + pmd_free(mm, pmds[i]);
21614 }
21615
21616 -static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgd)
21617 +static int preallocate_pmds(pmd_t *pmds[], struct mm_struct *mm)
21618 {
21619 + int i;
21620 + bool failed = false;
21621 +
21622 + for(i = 0; i < PREALLOCATED_PMDS; i++) {
21623 + pmd_t *pmd = pmd_alloc_one(mm, i << PUD_SHIFT);
21624 + if (pmd == NULL)
21625 + failed = true;
21626 + pmds[i] = pmd;
21627 + }
21628 +
21629 + if (failed) {
21630 + free_pmds(pmds, mm, false);
21631 + return -ENOMEM;
21632 + }
21633 +
21634 + return 0;
21635 +}
21636 +
21637 +/*
21638 + * Mop up any pmd pages which may still be attached to the pgd.
21639 + * Normally they will be freed by munmap/exit_mmap, but any pmd we
21640 + * preallocate which never got a corresponding vma will need to be
21641 + * freed manually.
21642 + */
21643 +static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
21644 +{
21645 + int i;
21646 +
21647 + for(i = 0; i < PREALLOCATED_PMDS; i++) {
21648 + pgd_t pgd = pgdp[i];
21649 +
21650 + if (__pgd_val(pgd) != 0) {
21651 + pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
21652 +
21653 + pgdp[i] = xen_make_pgd(0);
21654 +
21655 + paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
21656 + pmd_free(mm, pmd);
21657 + }
21658 + }
21659 +
21660 +#ifdef CONFIG_X86_PAE
21661 + if (!xen_feature(XENFEAT_pae_pgdir_above_4gb))
21662 + xen_destroy_contiguous_region((unsigned long)pgdp, 0);
21663 +#endif
21664 +}
21665 +
21666 +static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
21667 +{
21668 + pud_t *pud;
21669 + unsigned long addr;
21670 + int i;
21671 +
21672 + if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */
21673 + return;
21674 +
21675 + pud = pud_offset(pgd, 0);
21676 + for (addr = i = 0; i < PREALLOCATED_PMDS;
21677 + i++, pud++, addr += PUD_SIZE) {
21678 + pmd_t *pmd = pmds[i];
21679 +
21680 + if (i >= KERNEL_PGD_BOUNDARY) {
21681 + memcpy(pmd,
21682 + (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
21683 + sizeof(pmd_t) * PTRS_PER_PMD);
21684 + make_lowmem_page_readonly(
21685 + pmd, XENFEAT_writable_page_tables);
21686 + }
21687 +
21688 + /* It is safe to poke machine addresses of pmds under the pgd_lock. */
21689 + pud_populate(mm, pud, pmd);
21690 + }
21691 }
21692 -#endif /* CONFIG_X86_PAE */
21693
21694 #ifdef CONFIG_X86_64
21695 /* We allocate two contiguous pages for kernel and user. */
21696 @@ -616,19 +611,52 @@ static void pgd_mop_up_pmds(struct mm_st
21697
21698 pgd_t *pgd_alloc(struct mm_struct *mm)
21699 {
21700 - pgd_t *pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, PGD_ORDER);
21701 + pgd_t *pgd;
21702 + pmd_t *pmds[PREALLOCATED_PMDS];
21703 + unsigned long flags;
21704 +
21705 + pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, PGD_ORDER);
21706 +
21707 + if (pgd == NULL)
21708 + goto out;
21709
21710 - /* so that alloc_pd can use it */
21711 mm->pgd = pgd;
21712 - if (pgd)
21713 - pgd_ctor(pgd);
21714
21715 - if (pgd && !pgd_prepopulate_pmd(mm, pgd)) {
21716 - free_pages((unsigned long)pgd, PGD_ORDER);
21717 - pgd = NULL;
21718 + if (preallocate_pmds(pmds, mm) != 0)
21719 + goto out_free_pgd;
21720 +
21721 + if (paravirt_pgd_alloc(mm) != 0)
21722 + goto out_free_pmds;
21723 +
21724 + /*
21725 + * Make sure that pre-populating the pmds is atomic with
21726 + * respect to anything walking the pgd_list, so that they
21727 + * never see a partially populated pgd.
21728 + */
21729 + spin_lock_irqsave(&pgd_lock, flags);
21730 +
21731 +#ifdef CONFIG_X86_PAE
21732 + /* Protect against save/restore: move below 4GB under pgd_lock. */
21733 + if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)
21734 + && xen_create_contiguous_region((unsigned long)pgd, 0, 32)) {
21735 + spin_unlock_irqrestore(&pgd_lock, flags);
21736 + goto out_free_pmds;
21737 }
21738 +#endif
21739 +
21740 + pgd_ctor(pgd);
21741 + pgd_prepopulate_pmd(mm, pgd, pmds);
21742 +
21743 + spin_unlock_irqrestore(&pgd_lock, flags);
21744
21745 return pgd;
21746 +
21747 +out_free_pmds:
21748 + free_pmds(pmds, mm, !xen_feature(XENFEAT_pae_pgdir_above_4gb));
21749 +out_free_pgd:
21750 + free_pages((unsigned long)pgd, PGD_ORDER);
21751 +out:
21752 + return NULL;
21753 }
21754
21755 void pgd_free(struct mm_struct *mm, pgd_t *pgd)
21756 @@ -644,6 +672,7 @@ void pgd_free(struct mm_struct *mm, pgd_
21757 pgd_dtor(pgd);
21758
21759 pgd_mop_up_pmds(mm, pgd);
21760 + paravirt_pgd_free(mm, pgd);
21761 free_pages((unsigned long)pgd, PGD_ORDER);
21762 }
21763
21764 @@ -685,7 +714,7 @@ int ptep_test_and_clear_young(struct vm_
21765
21766 if (pte_young(*ptep))
21767 ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
21768 - &ptep->pte);
21769 + (unsigned long *) &ptep->pte);
21770
21771 if (ret)
21772 pte_update(vma->vm_mm, addr, ptep);
21773 @@ -707,3 +736,42 @@ int ptep_clear_flush_young(struct vm_are
21774
21775 return young;
21776 }
21777 +
21778 +int fixmaps_set;
21779 +
21780 +void xen_set_fixmap(enum fixed_addresses idx, maddr_t phys, pgprot_t flags)
21781 +{
21782 + unsigned long address = __fix_to_virt(idx);
21783 + pte_t pte;
21784 +
21785 + if (idx >= __end_of_fixed_addresses) {
21786 + BUG();
21787 + return;
21788 + }
21789 +
21790 + switch (idx) {
21791 +#ifdef CONFIG_X86_64
21792 + extern pte_t level1_fixmap_pgt[PTRS_PER_PTE];
21793 +
21794 + case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
21795 + pte = pfn_pte(phys >> PAGE_SHIFT, flags);
21796 + set_pte_vaddr_pud(level3_user_pgt, address, pte);
21797 + break;
21798 + case FIX_EARLYCON_MEM_BASE:
21799 + xen_l1_entry_update(level1_fixmap_pgt + pte_index(address),
21800 + pfn_pte_ma(phys >> PAGE_SHIFT, flags));
21801 + fixmaps_set++;
21802 + return;
21803 +#else
21804 + case FIX_WP_TEST:
21805 + case FIX_VDSO:
21806 + pte = pfn_pte(phys >> PAGE_SHIFT, flags);
21807 + break;
21808 +#endif
21809 + default:
21810 + pte = pfn_pte_ma(phys >> PAGE_SHIFT, flags);
21811 + break;
21812 + }
21813 + set_pte_vaddr(address, pte);
21814 + fixmaps_set++;
21815 +}
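
The pgd_alloc() rework above follows an allocate-first, publish-under-the-lock pattern: the pmd pages are allocated (possibly sleeping) before pgd_lock is taken, the pgd is then constructed and prepopulated while the lock is held so pgd_list walkers never observe a partially populated pgd, and failures unwind through goto labels. The standalone sketch below models only that control flow; table_alloc(), struct table and the pthread mutex are illustrative stand-ins, not kernel interfaces.

/* Editorial sketch: allocate outside the lock, commit under it, unwind on error. */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define NPREALLOC 4

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;

struct table { void *sub[NPREALLOC]; };

static struct table *table_alloc(void)
{
        struct table *t;
        void *sub[NPREALLOC] = { NULL };
        int i;

        t = calloc(1, sizeof(*t));
        if (!t)
                goto out;

        /* Possibly-sleeping allocations happen before taking the lock. */
        for (i = 0; i < NPREALLOC; i++) {
                sub[i] = malloc(64);
                if (!sub[i])
                        goto out_free_sub;
        }

        /* Publish all sub-tables while holding the lock, so concurrent
         * walkers never see a half-populated table. */
        pthread_mutex_lock(&table_lock);
        for (i = 0; i < NPREALLOC; i++)
                t->sub[i] = sub[i];
        pthread_mutex_unlock(&table_lock);

        return t;

out_free_sub:
        for (i = 0; i < NPREALLOC; i++)
                free(sub[i]);               /* free(NULL) is a no-op */
        free(t);
out:
        return NULL;
}

int main(void)
{
        struct table *t = table_alloc();
        printf("alloc %s\n", t ? "ok" : "failed");
        return 0;
}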
21816 --- sle11-2009-10-16.orig/arch/x86/mm/pgtable_32-xen.c 2009-03-16 16:38:05.000000000 +0100
21817 +++ sle11-2009-10-16/arch/x86/mm/pgtable_32-xen.c 2009-06-04 10:21:39.000000000 +0200
21818 @@ -25,51 +25,49 @@
21819 #include <xen/features.h>
21820 #include <asm/hypervisor.h>
21821
21822 -void show_mem(void)
21823 +/*
21824 + * Associate a virtual page frame with a given physical page frame
21825 + * and protection flags for that frame.
21826 + */
21827 +void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
21828 {
21829 - int total = 0, reserved = 0;
21830 - int shared = 0, cached = 0;
21831 - int highmem = 0;
21832 - struct page *page;
21833 - pg_data_t *pgdat;
21834 - unsigned long i;
21835 - unsigned long flags;
21836 -
21837 - printk(KERN_INFO "Mem-info:\n");
21838 - show_free_areas();
21839 - for_each_online_pgdat(pgdat) {
21840 - pgdat_resize_lock(pgdat, &flags);
21841 - for (i = 0; i < pgdat->node_spanned_pages; ++i) {
21842 - if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
21843 - touch_nmi_watchdog();
21844 - page = pgdat_page_nr(pgdat, i);
21845 - total++;
21846 - if (PageHighMem(page))
21847 - highmem++;
21848 - if (PageReserved(page))
21849 - reserved++;
21850 - else if (PageSwapCache(page))
21851 - cached++;
21852 - else if (page_count(page))
21853 - shared += page_count(page) - 1;
21854 - }
21855 - pgdat_resize_unlock(pgdat, &flags);
21856 - }
21857 - printk(KERN_INFO "%d pages of RAM\n", total);
21858 - printk(KERN_INFO "%d pages of HIGHMEM\n", highmem);
21859 - printk(KERN_INFO "%d reserved pages\n", reserved);
21860 - printk(KERN_INFO "%d pages shared\n", shared);
21861 - printk(KERN_INFO "%d pages swap cached\n", cached);
21862 -
21863 - printk(KERN_INFO "%lu pages dirty\n", global_page_state(NR_FILE_DIRTY));
21864 - printk(KERN_INFO "%lu pages writeback\n",
21865 - global_page_state(NR_WRITEBACK));
21866 - printk(KERN_INFO "%lu pages mapped\n", global_page_state(NR_FILE_MAPPED));
21867 - printk(KERN_INFO "%lu pages slab\n",
21868 - global_page_state(NR_SLAB_RECLAIMABLE) +
21869 - global_page_state(NR_SLAB_UNRECLAIMABLE));
21870 - printk(KERN_INFO "%lu pages pagetables\n",
21871 - global_page_state(NR_PAGETABLE));
21872 +#ifndef CONFIG_XEN
21873 + pgd_t *pgd;
21874 + pud_t *pud;
21875 + pmd_t *pmd;
21876 + pte_t *pte;
21877 +
21878 + pgd = swapper_pg_dir + pgd_index(vaddr);
21879 + if (pgd_none(*pgd)) {
21880 + BUG();
21881 + return;
21882 + }
21883 + pud = pud_offset(pgd, vaddr);
21884 + if (pud_none(*pud)) {
21885 + BUG();
21886 + return;
21887 + }
21888 + pmd = pmd_offset(pud, vaddr);
21889 + if (pmd_none(*pmd)) {
21890 + BUG();
21891 + return;
21892 + }
21893 + pte = pte_offset_kernel(pmd, vaddr);
21894 + if (pte_val(pteval))
21895 + set_pte_present(&init_mm, vaddr, pte, pteval);
21896 + else
21897 + pte_clear(&init_mm, vaddr, pte);
21898 +
21899 + /*
21900 + * It's enough to flush this one mapping.
21901 + * (PGE mappings get flushed as well)
21902 + */
21903 + __flush_tlb_one(vaddr);
21904 +#else
21905 + if (HYPERVISOR_update_va_mapping(vaddr, pteval,
21906 + UVMF_INVLPG|UVMF_ALL))
21907 + BUG();
21908 +#endif
21909 }
21910
21911 /*
21912 @@ -107,35 +105,10 @@ void set_pmd_pfn(unsigned long vaddr, un
21913 __flush_tlb_one(vaddr);
21914 }
21915
21916 -static int fixmaps;
21917 unsigned long hypervisor_virt_start = HYPERVISOR_VIRT_START;
21918 unsigned long __FIXADDR_TOP = (HYPERVISOR_VIRT_START - PAGE_SIZE);
21919 EXPORT_SYMBOL(__FIXADDR_TOP);
21920
21921 -void __set_fixmap (enum fixed_addresses idx, maddr_t phys, pgprot_t flags)
21922 -{
21923 - unsigned long address = __fix_to_virt(idx);
21924 - pte_t pte;
21925 -
21926 - if (idx >= __end_of_fixed_addresses) {
21927 - BUG();
21928 - return;
21929 - }
21930 - switch (idx) {
21931 - case FIX_WP_TEST:
21932 - case FIX_VDSO:
21933 - pte = pfn_pte(phys >> PAGE_SHIFT, flags);
21934 - break;
21935 - default:
21936 - pte = pfn_pte_ma(phys >> PAGE_SHIFT, flags);
21937 - break;
21938 - }
21939 - if (HYPERVISOR_update_va_mapping(address, pte,
21940 - UVMF_INVLPG|UVMF_ALL))
21941 - BUG();
21942 - fixmaps++;
21943 -}
21944 -
21945 /**
21946 * reserve_top_address - reserves a hole in the top of kernel address space
21947 * @reserve - size of hole to reserve
21948 @@ -145,13 +118,48 @@ void __set_fixmap (enum fixed_addresses
21949 */
21950 void __init reserve_top_address(unsigned long reserve)
21951 {
21952 - BUG_ON(fixmaps > 0);
21953 + BUG_ON(fixmaps_set > 0);
21954 printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
21955 (int)-reserve);
21956 __FIXADDR_TOP = -reserve - PAGE_SIZE;
21957 __VMALLOC_RESERVE += reserve;
21958 }
21959
21960 +/*
21961 + * vmalloc=size forces the vmalloc area to be exactly 'size'
21962 + * bytes. This can be used to increase (or decrease) the
21963 + * vmalloc area - the default is 128m.
21964 + */
21965 +static int __init parse_vmalloc(char *arg)
21966 +{
21967 + if (!arg)
21968 + return -EINVAL;
21969 +
21970 + __VMALLOC_RESERVE = memparse(arg, &arg);
21971 + return 0;
21972 +}
21973 +early_param("vmalloc", parse_vmalloc);
21974 +
21975 +#ifndef CONFIG_XEN
21976 +/*
21977 + * reservetop=size reserves a hole at the top of the kernel address space which
21978 + * a hypervisor can load into later. Needed for dynamically loaded hypervisors,
21979 + * so relocating the fixmap can be done before paging initialization.
21980 + */
21981 +static int __init parse_reservetop(char *arg)
21982 +{
21983 + unsigned long address;
21984 +
21985 + if (!arg)
21986 + return -EINVAL;
21987 +
21988 + address = memparse(arg, &arg);
21989 + reserve_top_address(address);
21990 + return 0;
21991 +}
21992 +early_param("reservetop", parse_reservetop);
21993 +#endif
21994 +
21995 void make_lowmem_page_readonly(void *va, unsigned int feature)
21996 {
21997 pte_t *pte;
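
Both early parameters added above (vmalloc= and reservetop=) hand their argument to memparse(), which accepts plain decimal or hex numbers as well as k/m/g size suffixes, so a command line such as vmalloc=192m enlarges the vmalloc area from its 128m default. The short userspace sketch below, with an invented parse_size() helper, approximates that parsing; the real memparse() in lib/cmdline.c is the authoritative version.

/* Editorial sketch: simplified stand-in for memparse() size parsing. */
#include <stdio.h>
#include <stdlib.h>

static unsigned long long parse_size(const char *arg)
{
        char *end;
        unsigned long long val = strtoull(arg, &end, 0);

        switch (*end) {
        case 'g': case 'G': val <<= 10; /* fall through */
        case 'm': case 'M': val <<= 10; /* fall through */
        case 'k': case 'K': val <<= 10; break;
        default: break;
        }
        return val;
}

int main(void)
{
        printf("vmalloc=192m      -> %llu bytes\n", parse_size("192m"));
        printf("reservetop=0x100000 -> %llu bytes\n", parse_size("0x100000"));
        return 0;
}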
21998 --- sle11-2009-10-16.orig/arch/x86/pci/amd_bus.c 2009-10-28 14:55:02.000000000 +0100
21999 +++ sle11-2009-10-16/arch/x86/pci/amd_bus.c 2009-06-04 10:21:39.000000000 +0200
22000 @@ -607,6 +607,14 @@ static int __init pci_io_ecs_init(void)
22001 for_each_online_cpu(cpu)
22002 amd_cpu_notify(&amd_cpu_notifier, (unsigned long)CPU_ONLINE,
22003 (void *)(long)cpu);
22004 +#ifdef CONFIG_XEN
22005 + {
22006 + u64 reg;
22007 + rdmsrl(MSR_AMD64_NB_CFG, reg);
22008 + if (!(reg & ENABLE_CF8_EXT_CFG))
22009 + return 0;
22010 + }
22011 +#endif
22012 pci_probe |= PCI_HAS_IO_ECS;
22013
22014 return 0;
22015 @@ -614,6 +622,10 @@ static int __init pci_io_ecs_init(void)
22016
22017 static int __init amd_postcore_init(void)
22018 {
22019 +#ifdef CONFIG_XEN
22020 + if (!is_initial_xendomain())
22021 + return 0;
22022 +#endif
22023 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
22024 return 0;
22025
22026 --- sle11-2009-10-16.orig/arch/x86/pci/irq-xen.c 2009-03-16 16:38:05.000000000 +0100
22027 +++ sle11-2009-10-16/arch/x86/pci/irq-xen.c 2009-06-04 10:21:39.000000000 +0200
22028 @@ -11,8 +11,8 @@
22029 #include <linux/slab.h>
22030 #include <linux/interrupt.h>
22031 #include <linux/dmi.h>
22032 -#include <asm/io.h>
22033 -#include <asm/smp.h>
22034 +#include <linux/io.h>
22035 +#include <linux/smp.h>
22036 #include <asm/io_apic.h>
22037 #include <linux/irq.h>
22038 #include <linux/acpi.h>
22039 @@ -45,7 +45,8 @@ struct irq_router {
22040 char *name;
22041 u16 vendor, device;
22042 int (*get)(struct pci_dev *router, struct pci_dev *dev, int pirq);
22043 - int (*set)(struct pci_dev *router, struct pci_dev *dev, int pirq, int new);
22044 + int (*set)(struct pci_dev *router, struct pci_dev *dev, int pirq,
22045 + int new);
22046 };
22047
22048 struct irq_router_handler {
22049 @@ -61,7 +62,7 @@ void (*pcibios_disable_irq)(struct pci_d
22050 * and perform checksum verification.
22051 */
22052
22053 -static inline struct irq_routing_table * pirq_check_routing_table(u8 *addr)
22054 +static inline struct irq_routing_table *pirq_check_routing_table(u8 *addr)
22055 {
22056 struct irq_routing_table *rt;
22057 int i;
22058 @@ -74,10 +75,11 @@ static inline struct irq_routing_table *
22059 rt->size < sizeof(struct irq_routing_table))
22060 return NULL;
22061 sum = 0;
22062 - for (i=0; i < rt->size; i++)
22063 + for (i = 0; i < rt->size; i++)
22064 sum += addr[i];
22065 if (!sum) {
22066 - DBG(KERN_DEBUG "PCI: Interrupt Routing Table found at 0x%p\n", rt);
22067 + DBG(KERN_DEBUG "PCI: Interrupt Routing Table found at 0x%p\n",
22068 + rt);
22069 return rt;
22070 }
22071 return NULL;
22072 @@ -104,7 +106,9 @@ static struct irq_routing_table * __init
22073 return rt;
22074 printk(KERN_WARNING "PCI: PIRQ table NOT found at pirqaddr\n");
22075 }
22076 - for(addr = (u8 *) isa_bus_to_virt(0xf0000); addr < (u8 *) isa_bus_to_virt(0x100000); addr += 16) {
22077 + for (addr = (u8 *) isa_bus_to_virt(0xf0000);
22078 + addr < (u8 *) isa_bus_to_virt(0x100000);
22079 + addr += 16) {
22080 rt = pirq_check_routing_table(addr);
22081 if (rt)
22082 return rt;
22083 @@ -126,20 +130,20 @@ static void __init pirq_peer_trick(void)
22084 struct irq_info *e;
22085
22086 memset(busmap, 0, sizeof(busmap));
22087 - for(i=0; i < (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info); i++) {
22088 + for (i = 0; i < (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info); i++) {
22089 e = &rt->slots[i];
22090 #ifdef DEBUG
22091 {
22092 int j;
22093 DBG(KERN_DEBUG "%02x:%02x slot=%02x", e->bus, e->devfn/8, e->slot);
22094 - for(j=0; j<4; j++)
22095 + for (j = 0; j < 4; j++)
22096 DBG(" %d:%02x/%04x", j, e->irq[j].link, e->irq[j].bitmap);
22097 DBG("\n");
22098 }
22099 #endif
22100 busmap[e->bus] = 1;
22101 }
22102 - for(i = 1; i < 256; i++) {
22103 + for (i = 1; i < 256; i++) {
22104 int node;
22105 if (!busmap[i] || pci_find_bus(0, i))
22106 continue;
22107 @@ -187,7 +191,8 @@ static unsigned int read_config_nybble(s
22108 return (nr & 1) ? (x >> 4) : (x & 0xf);
22109 }
22110
22111 -static void write_config_nybble(struct pci_dev *router, unsigned offset, unsigned nr, unsigned int val)
22112 +static void write_config_nybble(struct pci_dev *router, unsigned offset,
22113 + unsigned nr, unsigned int val)
22114 {
22115 u8 x;
22116 unsigned reg = offset + (nr >> 1);
22117 @@ -289,7 +294,7 @@ static int pirq_ite_get(struct pci_dev *
22118 static const unsigned char pirqmap[4] = { 1, 0, 2, 3 };
22119
22120 WARN_ON_ONCE(pirq > 4);
22121 - return read_config_nybble(router,0x43, pirqmap[pirq-1]);
22122 + return read_config_nybble(router, 0x43, pirqmap[pirq-1]);
22123 }
22124
22125 static int pirq_ite_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
22126 @@ -318,7 +323,7 @@ static int pirq_opti_set(struct pci_dev
22127
22128 /*
22129 * Cyrix: nibble offset 0x5C
22130 - * 0x5C bits 7:4 is INTB bits 3:0 is INTA
22131 + * 0x5C bits 7:4 is INTB bits 3:0 is INTA
22132 * 0x5D bits 7:4 is INTD bits 3:0 is INTC
22133 */
22134 static int pirq_cyrix_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
22135 @@ -354,7 +359,7 @@ static int pirq_cyrix_set(struct pci_dev
22136 * Apparently there are systems implementing PCI routing table using
22137 * link values 0x01-0x04 and others using 0x41-0x44 for PCI INTA..D.
22138 * We try our best to handle both link mappings.
22139 - *
22140 + *
22141 * Currently (2003-05-21) it appears most SiS chipsets follow the
22142 * definition of routing registers from the SiS-5595 southbridge.
22143 * According to the SiS 5595 datasheets the revision id's of the
22144 @@ -374,7 +379,7 @@ static int pirq_cyrix_set(struct pci_dev
22145 *
22146 * 0x62: USBIRQ:
22147 * bit 6 OHCI function disabled (0), enabled (1)
22148 - *
22149 + *
22150 * 0x6a: ACPI/SCI IRQ: bits 4-6 reserved
22151 *
22152 * 0x7e: Data Acq. Module IRQ - bits 4-6 reserved
22153 @@ -437,7 +442,7 @@ static int pirq_vlsi_get(struct pci_dev
22154 {
22155 WARN_ON_ONCE(pirq >= 9);
22156 if (pirq > 8) {
22157 - printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
22158 + dev_info(&dev->dev, "VLSI router PIRQ escape (%d)\n", pirq);
22159 return 0;
22160 }
22161 return read_config_nybble(router, 0x74, pirq-1);
22162 @@ -447,7 +452,7 @@ static int pirq_vlsi_set(struct pci_dev
22163 {
22164 WARN_ON_ONCE(pirq >= 9);
22165 if (pirq > 8) {
22166 - printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
22167 + dev_info(&dev->dev, "VLSI router PIRQ escape (%d)\n", pirq);
22168 return 0;
22169 }
22170 write_config_nybble(router, 0x74, pirq-1, irq);
22171 @@ -471,7 +476,8 @@ static int pirq_serverworks_get(struct p
22172 return inb(0xc01) & 0xf;
22173 }
22174
22175 -static int pirq_serverworks_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
22176 +static int pirq_serverworks_set(struct pci_dev *router, struct pci_dev *dev,
22177 + int pirq, int irq)
22178 {
22179 outb(pirq, 0xc00);
22180 outb(irq, 0xc01);
22181 @@ -491,22 +497,20 @@ static int pirq_amd756_get(struct pci_de
22182 u8 irq;
22183 irq = 0;
22184 if (pirq <= 4)
22185 - {
22186 irq = read_config_nybble(router, 0x56, pirq - 1);
22187 - }
22188 - printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d get irq : %2d\n",
22189 - dev->vendor, dev->device, pirq, irq);
22190 + dev_info(&dev->dev,
22191 + "AMD756: dev [%04x/%04x], router PIRQ %d get IRQ %d\n",
22192 + dev->vendor, dev->device, pirq, irq);
22193 return irq;
22194 }
22195
22196 static int pirq_amd756_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
22197 {
22198 - printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d SET irq : %2d\n",
22199 - dev->vendor, dev->device, pirq, irq);
22200 + dev_info(&dev->dev,
22201 + "AMD756: dev [%04x/%04x], router PIRQ %d set IRQ %d\n",
22202 + dev->vendor, dev->device, pirq, irq);
22203 if (pirq <= 4)
22204 - {
22205 write_config_nybble(router, 0x56, pirq - 1, irq);
22206 - }
22207 return 1;
22208 }
22209
22210 @@ -553,50 +557,51 @@ static __init int intel_router_probe(str
22211 if (pci_dev_present(pirq_440gx))
22212 return 0;
22213
22214 - switch(device)
22215 - {
22216 - case PCI_DEVICE_ID_INTEL_82371FB_0:
22217 - case PCI_DEVICE_ID_INTEL_82371SB_0:
22218 - case PCI_DEVICE_ID_INTEL_82371AB_0:
22219 - case PCI_DEVICE_ID_INTEL_82371MX:
22220 - case PCI_DEVICE_ID_INTEL_82443MX_0:
22221 - case PCI_DEVICE_ID_INTEL_82801AA_0:
22222 - case PCI_DEVICE_ID_INTEL_82801AB_0:
22223 - case PCI_DEVICE_ID_INTEL_82801BA_0:
22224 - case PCI_DEVICE_ID_INTEL_82801BA_10:
22225 - case PCI_DEVICE_ID_INTEL_82801CA_0:
22226 - case PCI_DEVICE_ID_INTEL_82801CA_12:
22227 - case PCI_DEVICE_ID_INTEL_82801DB_0:
22228 - case PCI_DEVICE_ID_INTEL_82801E_0:
22229 - case PCI_DEVICE_ID_INTEL_82801EB_0:
22230 - case PCI_DEVICE_ID_INTEL_ESB_1:
22231 - case PCI_DEVICE_ID_INTEL_ICH6_0:
22232 - case PCI_DEVICE_ID_INTEL_ICH6_1:
22233 - case PCI_DEVICE_ID_INTEL_ICH7_0:
22234 - case PCI_DEVICE_ID_INTEL_ICH7_1:
22235 - case PCI_DEVICE_ID_INTEL_ICH7_30:
22236 - case PCI_DEVICE_ID_INTEL_ICH7_31:
22237 - case PCI_DEVICE_ID_INTEL_ESB2_0:
22238 - case PCI_DEVICE_ID_INTEL_ICH8_0:
22239 - case PCI_DEVICE_ID_INTEL_ICH8_1:
22240 - case PCI_DEVICE_ID_INTEL_ICH8_2:
22241 - case PCI_DEVICE_ID_INTEL_ICH8_3:
22242 - case PCI_DEVICE_ID_INTEL_ICH8_4:
22243 - case PCI_DEVICE_ID_INTEL_ICH9_0:
22244 - case PCI_DEVICE_ID_INTEL_ICH9_1:
22245 - case PCI_DEVICE_ID_INTEL_ICH9_2:
22246 - case PCI_DEVICE_ID_INTEL_ICH9_3:
22247 - case PCI_DEVICE_ID_INTEL_ICH9_4:
22248 - case PCI_DEVICE_ID_INTEL_ICH9_5:
22249 - case PCI_DEVICE_ID_INTEL_TOLAPAI_0:
22250 - case PCI_DEVICE_ID_INTEL_ICH10_0:
22251 - case PCI_DEVICE_ID_INTEL_ICH10_1:
22252 - case PCI_DEVICE_ID_INTEL_ICH10_2:
22253 - case PCI_DEVICE_ID_INTEL_ICH10_3:
22254 - r->name = "PIIX/ICH";
22255 - r->get = pirq_piix_get;
22256 - r->set = pirq_piix_set;
22257 - return 1;
22258 + switch (device) {
22259 + case PCI_DEVICE_ID_INTEL_82371FB_0:
22260 + case PCI_DEVICE_ID_INTEL_82371SB_0:
22261 + case PCI_DEVICE_ID_INTEL_82371AB_0:
22262 + case PCI_DEVICE_ID_INTEL_82371MX:
22263 + case PCI_DEVICE_ID_INTEL_82443MX_0:
22264 + case PCI_DEVICE_ID_INTEL_82801AA_0:
22265 + case PCI_DEVICE_ID_INTEL_82801AB_0:
22266 + case PCI_DEVICE_ID_INTEL_82801BA_0:
22267 + case PCI_DEVICE_ID_INTEL_82801BA_10:
22268 + case PCI_DEVICE_ID_INTEL_82801CA_0:
22269 + case PCI_DEVICE_ID_INTEL_82801CA_12:
22270 + case PCI_DEVICE_ID_INTEL_82801DB_0:
22271 + case PCI_DEVICE_ID_INTEL_82801E_0:
22272 + case PCI_DEVICE_ID_INTEL_82801EB_0:
22273 + case PCI_DEVICE_ID_INTEL_ESB_1:
22274 + case PCI_DEVICE_ID_INTEL_ICH6_0:
22275 + case PCI_DEVICE_ID_INTEL_ICH6_1:
22276 + case PCI_DEVICE_ID_INTEL_ICH7_0:
22277 + case PCI_DEVICE_ID_INTEL_ICH7_1:
22278 + case PCI_DEVICE_ID_INTEL_ICH7_30:
22279 + case PCI_DEVICE_ID_INTEL_ICH7_31:
22280 + case PCI_DEVICE_ID_INTEL_ESB2_0:
22281 + case PCI_DEVICE_ID_INTEL_ICH8_0:
22282 + case PCI_DEVICE_ID_INTEL_ICH8_1:
22283 + case PCI_DEVICE_ID_INTEL_ICH8_2:
22284 + case PCI_DEVICE_ID_INTEL_ICH8_3:
22285 + case PCI_DEVICE_ID_INTEL_ICH8_4:
22286 + case PCI_DEVICE_ID_INTEL_ICH9_0:
22287 + case PCI_DEVICE_ID_INTEL_ICH9_1:
22288 + case PCI_DEVICE_ID_INTEL_ICH9_2:
22289 + case PCI_DEVICE_ID_INTEL_ICH9_3:
22290 + case PCI_DEVICE_ID_INTEL_ICH9_4:
22291 + case PCI_DEVICE_ID_INTEL_ICH9_5:
22292 + case PCI_DEVICE_ID_INTEL_TOLAPAI_0:
22293 + case PCI_DEVICE_ID_INTEL_ICH10_0:
22294 + case PCI_DEVICE_ID_INTEL_ICH10_1:
22295 + case PCI_DEVICE_ID_INTEL_ICH10_2:
22296 + case PCI_DEVICE_ID_INTEL_ICH10_3:
22297 + case PCI_DEVICE_ID_INTEL_PCH_0:
22298 + case PCI_DEVICE_ID_INTEL_PCH_1:
22299 + r->name = "PIIX/ICH";
22300 + r->get = pirq_piix_get;
22301 + r->set = pirq_piix_set;
22302 + return 1;
22303 }
22304 return 0;
22305 }
22306 @@ -610,7 +615,7 @@ static __init int via_router_probe(struc
22307 * workarounds for some buggy BIOSes
22308 */
22309 if (device == PCI_DEVICE_ID_VIA_82C586_0) {
22310 - switch(router->device) {
22311 + switch (router->device) {
22312 case PCI_DEVICE_ID_VIA_82C686:
22313 /*
22314 * Asus k7m bios wrongly reports 82C686A
22315 @@ -635,7 +640,7 @@ static __init int via_router_probe(struc
22316 }
22317 }
22318
22319 - switch(device) {
22320 + switch (device) {
22321 case PCI_DEVICE_ID_VIA_82C586_0:
22322 r->name = "VIA";
22323 r->get = pirq_via586_get;
22324 @@ -658,28 +663,27 @@ static __init int via_router_probe(struc
22325
22326 static __init int vlsi_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
22327 {
22328 - switch(device)
22329 - {
22330 - case PCI_DEVICE_ID_VLSI_82C534:
22331 - r->name = "VLSI 82C534";
22332 - r->get = pirq_vlsi_get;
22333 - r->set = pirq_vlsi_set;
22334 - return 1;
22335 + switch (device) {
22336 + case PCI_DEVICE_ID_VLSI_82C534:
22337 + r->name = "VLSI 82C534";
22338 + r->get = pirq_vlsi_get;
22339 + r->set = pirq_vlsi_set;
22340 + return 1;
22341 }
22342 return 0;
22343 }
22344
22345
22346 -static __init int serverworks_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
22347 +static __init int serverworks_router_probe(struct irq_router *r,
22348 + struct pci_dev *router, u16 device)
22349 {
22350 - switch(device)
22351 - {
22352 - case PCI_DEVICE_ID_SERVERWORKS_OSB4:
22353 - case PCI_DEVICE_ID_SERVERWORKS_CSB5:
22354 - r->name = "ServerWorks";
22355 - r->get = pirq_serverworks_get;
22356 - r->set = pirq_serverworks_set;
22357 - return 1;
22358 + switch (device) {
22359 + case PCI_DEVICE_ID_SERVERWORKS_OSB4:
22360 + case PCI_DEVICE_ID_SERVERWORKS_CSB5:
22361 + r->name = "ServerWorks";
22362 + r->get = pirq_serverworks_get;
22363 + r->set = pirq_serverworks_set;
22364 + return 1;
22365 }
22366 return 0;
22367 }
22368 @@ -688,7 +692,7 @@ static __init int sis_router_probe(struc
22369 {
22370 if (device != PCI_DEVICE_ID_SI_503)
22371 return 0;
22372 -
22373 +
22374 r->name = "SIS";
22375 r->get = pirq_sis_get;
22376 r->set = pirq_sis_set;
22377 @@ -697,50 +701,45 @@ static __init int sis_router_probe(struc
22378
22379 static __init int cyrix_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
22380 {
22381 - switch(device)
22382 - {
22383 - case PCI_DEVICE_ID_CYRIX_5520:
22384 - r->name = "NatSemi";
22385 - r->get = pirq_cyrix_get;
22386 - r->set = pirq_cyrix_set;
22387 - return 1;
22388 + switch (device) {
22389 + case PCI_DEVICE_ID_CYRIX_5520:
22390 + r->name = "NatSemi";
22391 + r->get = pirq_cyrix_get;
22392 + r->set = pirq_cyrix_set;
22393 + return 1;
22394 }
22395 return 0;
22396 }
22397
22398 static __init int opti_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
22399 {
22400 - switch(device)
22401 - {
22402 - case PCI_DEVICE_ID_OPTI_82C700:
22403 - r->name = "OPTI";
22404 - r->get = pirq_opti_get;
22405 - r->set = pirq_opti_set;
22406 - return 1;
22407 + switch (device) {
22408 + case PCI_DEVICE_ID_OPTI_82C700:
22409 + r->name = "OPTI";
22410 + r->get = pirq_opti_get;
22411 + r->set = pirq_opti_set;
22412 + return 1;
22413 }
22414 return 0;
22415 }
22416
22417 static __init int ite_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
22418 {
22419 - switch(device)
22420 - {
22421 - case PCI_DEVICE_ID_ITE_IT8330G_0:
22422 - r->name = "ITE";
22423 - r->get = pirq_ite_get;
22424 - r->set = pirq_ite_set;
22425 - return 1;
22426 + switch (device) {
22427 + case PCI_DEVICE_ID_ITE_IT8330G_0:
22428 + r->name = "ITE";
22429 + r->get = pirq_ite_get;
22430 + r->set = pirq_ite_set;
22431 + return 1;
22432 }
22433 return 0;
22434 }
22435
22436 static __init int ali_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
22437 {
22438 - switch(device)
22439 - {
22440 + switch (device) {
22441 case PCI_DEVICE_ID_AL_M1533:
22442 case PCI_DEVICE_ID_AL_M1563:
22443 - printk(KERN_DEBUG "PCI: Using ALI IRQ Router\n");
22444 r->name = "ALI";
22445 r->get = pirq_ali_get;
22446 r->set = pirq_ali_set;
22447 @@ -751,25 +750,24 @@ static __init int ali_router_probe(struc
22448
22449 static __init int amd_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
22450 {
22451 - switch(device)
22452 - {
22453 - case PCI_DEVICE_ID_AMD_VIPER_740B:
22454 - r->name = "AMD756";
22455 - break;
22456 - case PCI_DEVICE_ID_AMD_VIPER_7413:
22457 - r->name = "AMD766";
22458 - break;
22459 - case PCI_DEVICE_ID_AMD_VIPER_7443:
22460 - r->name = "AMD768";
22461 - break;
22462 - default:
22463 - return 0;
22464 + switch (device) {
22465 + case PCI_DEVICE_ID_AMD_VIPER_740B:
22466 + r->name = "AMD756";
22467 + break;
22468 + case PCI_DEVICE_ID_AMD_VIPER_7413:
22469 + r->name = "AMD766";
22470 + break;
22471 + case PCI_DEVICE_ID_AMD_VIPER_7443:
22472 + r->name = "AMD768";
22473 + break;
22474 + default:
22475 + return 0;
22476 }
22477 r->get = pirq_amd756_get;
22478 r->set = pirq_amd756_set;
22479 return 1;
22480 }
22481 -
22482 +
22483 static __init int pico_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
22484 {
22485 switch (device) {
22486 @@ -811,7 +809,7 @@ static struct pci_dev *pirq_router_dev;
22487 * FIXME: should we have an option to say "generic for
22488 * chipset" ?
22489 */
22490 -
22491 +
22492 static void __init pirq_find_router(struct irq_router *r)
22493 {
22494 struct irq_routing_table *rt = pirq_table;
22495 @@ -830,7 +828,7 @@ static void __init pirq_find_router(stru
22496 r->name = "default";
22497 r->get = NULL;
22498 r->set = NULL;
22499 -
22500 +
22501 DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for %04x:%04x\n",
22502 rt->rtr_vendor, rt->rtr_device);
22503
22504 @@ -841,19 +839,19 @@ static void __init pirq_find_router(stru
22505 return;
22506 }
22507
22508 - for( h = pirq_routers; h->vendor; h++) {
22509 + for (h = pirq_routers; h->vendor; h++) {
22510 /* First look for a router match */
22511 - if (rt->rtr_vendor == h->vendor && h->probe(r, pirq_router_dev, rt->rtr_device))
22512 + if (rt->rtr_vendor == h->vendor &&
22513 + h->probe(r, pirq_router_dev, rt->rtr_device))
22514 break;
22515 /* Fall back to a device match */
22516 - if (pirq_router_dev->vendor == h->vendor && h->probe(r, pirq_router_dev, pirq_router_dev->device))
22517 + if (pirq_router_dev->vendor == h->vendor &&
22518 + h->probe(r, pirq_router_dev, pirq_router_dev->device))
22519 break;
22520 }
22521 - printk(KERN_INFO "PCI: Using IRQ router %s [%04x/%04x] at %s\n",
22522 - pirq_router.name,
22523 - pirq_router_dev->vendor,
22524 - pirq_router_dev->device,
22525 - pci_name(pirq_router_dev));
22526 + dev_info(&pirq_router_dev->dev, "%s IRQ router [%04x/%04x]\n",
22527 + pirq_router.name,
22528 + pirq_router_dev->vendor, pirq_router_dev->device);
22529
22530 /* The device remains referenced for the kernel lifetime */
22531 }
22532 @@ -861,11 +859,13 @@ static void __init pirq_find_router(stru
22533 static struct irq_info *pirq_get_info(struct pci_dev *dev)
22534 {
22535 struct irq_routing_table *rt = pirq_table;
22536 - int entries = (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info);
22537 + int entries = (rt->size - sizeof(struct irq_routing_table)) /
22538 + sizeof(struct irq_info);
22539 struct irq_info *info;
22540
22541 for (info = rt->slots; entries--; info++)
22542 - if (info->bus == dev->bus->number && PCI_SLOT(info->devfn) == PCI_SLOT(dev->devfn))
22543 + if (info->bus == dev->bus->number &&
22544 + PCI_SLOT(info->devfn) == PCI_SLOT(dev->devfn))
22545 return info;
22546 return NULL;
22547 }
22548 @@ -884,7 +884,7 @@ static int pcibios_lookup_irq(struct pci
22549 /* Find IRQ pin */
22550 pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
22551 if (!pin) {
22552 - DBG(KERN_DEBUG " -> no interrupt pin\n");
22553 + dev_dbg(&dev->dev, "no interrupt pin\n");
22554 return 0;
22555 }
22556 pin = pin - 1;
22557 @@ -893,20 +893,21 @@ static int pcibios_lookup_irq(struct pci
22558
22559 if (!pirq_table)
22560 return 0;
22561 -
22562 - DBG(KERN_DEBUG "IRQ for %s[%c]", pci_name(dev), 'A' + pin);
22563 +
22564 info = pirq_get_info(dev);
22565 if (!info) {
22566 - DBG(" -> not found in routing table\n" KERN_DEBUG);
22567 + dev_dbg(&dev->dev, "PCI INT %c not found in routing table\n",
22568 + 'A' + pin);
22569 return 0;
22570 }
22571 pirq = info->irq[pin].link;
22572 mask = info->irq[pin].bitmap;
22573 if (!pirq) {
22574 - DBG(" -> not routed\n" KERN_DEBUG);
22575 + dev_dbg(&dev->dev, "PCI INT %c not routed\n", 'A' + pin);
22576 return 0;
22577 }
22578 - DBG(" -> PIRQ %02x, mask %04x, excl %04x", pirq, mask, pirq_table->exclusive_irqs);
22579 + dev_dbg(&dev->dev, "PCI INT %c -> PIRQ %02x, mask %04x, excl %04x",
22580 + 'A' + pin, pirq, mask, pirq_table->exclusive_irqs);
22581 mask &= pcibios_irq_mask;
22582
22583 /* Work around broken HP Pavilion Notebooks which assign USB to
22584 @@ -919,7 +920,8 @@ static int pcibios_lookup_irq(struct pci
22585 }
22586
22587 /* same for Acer Travelmate 360, but with CB and irq 11 -> 10 */
22588 - if (acer_tm360_irqrouting && dev->irq == 11 && dev->vendor == PCI_VENDOR_ID_O2) {
22589 + if (acer_tm360_irqrouting && dev->irq == 11 &&
22590 + dev->vendor == PCI_VENDOR_ID_O2) {
22591 pirq = 0x68;
22592 mask = 0x400;
22593 dev->irq = r->get(pirq_router_dev, dev, pirq);
22594 @@ -932,51 +934,50 @@ static int pcibios_lookup_irq(struct pci
22595 */
22596 newirq = dev->irq;
22597 if (newirq && !((1 << newirq) & mask)) {
22598 - if ( pci_probe & PCI_USE_PIRQ_MASK) newirq = 0;
22599 - else printk("\n" KERN_WARNING
22600 - "PCI: IRQ %i for device %s doesn't match PIRQ mask "
22601 - "- try pci=usepirqmask\n" KERN_DEBUG, newirq,
22602 - pci_name(dev));
22603 + if (pci_probe & PCI_USE_PIRQ_MASK)
22604 + newirq = 0;
22605 + else
22606 + dev_warn(&dev->dev, "IRQ %d doesn't match PIRQ mask "
22607 + "%#x; try pci=usepirqmask\n", newirq, mask);
22608 }
22609 if (!newirq && assign) {
22610 for (i = 0; i < 16; i++) {
22611 if (!(mask & (1 << i)))
22612 continue;
22613 - if (pirq_penalty[i] < pirq_penalty[newirq] && can_request_irq(i, IRQF_SHARED))
22614 + if (pirq_penalty[i] < pirq_penalty[newirq] &&
22615 + can_request_irq(i, IRQF_SHARED))
22616 newirq = i;
22617 }
22618 }
22619 - DBG(" -> newirq=%d", newirq);
22620 + dev_dbg(&dev->dev, "PCI INT %c -> newirq %d", 'A' + pin, newirq);
22621
22622 /* Check if it is hardcoded */
22623 if ((pirq & 0xf0) == 0xf0) {
22624 irq = pirq & 0xf;
22625 - DBG(" -> hardcoded IRQ %d\n", irq);
22626 - msg = "Hardcoded";
22627 - } else if ( r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \
22628 - ((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask)) ) {
22629 - DBG(" -> got IRQ %d\n", irq);
22630 - msg = "Found";
22631 + msg = "hardcoded";
22632 + } else if (r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \
22633 + ((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask))) {
22634 + msg = "found";
22635 eisa_set_level_irq(irq);
22636 - } else if (newirq && r->set && (dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) {
22637 - DBG(" -> assigning IRQ %d", newirq);
22638 + } else if (newirq && r->set &&
22639 + (dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) {
22640 if (r->set(pirq_router_dev, dev, pirq, newirq)) {
22641 eisa_set_level_irq(newirq);
22642 - DBG(" ... OK\n");
22643 - msg = "Assigned";
22644 + msg = "assigned";
22645 irq = newirq;
22646 }
22647 }
22648
22649 if (!irq) {
22650 - DBG(" ... failed\n");
22651 if (newirq && mask == (1 << newirq)) {
22652 - msg = "Guessed";
22653 + msg = "guessed";
22654 irq = newirq;
22655 - } else
22656 + } else {
22657 + dev_dbg(&dev->dev, "can't route interrupt\n");
22658 return 0;
22659 + }
22660 }
22661 - printk(KERN_INFO "PCI: %s IRQ %d for device %s\n", msg, irq, pci_name(dev));
22662 + dev_info(&dev->dev, "%s PCI INT %c -> IRQ %d\n", msg, 'A' + pin, irq);
22663
22664 /* Update IRQ for all devices with the same pirq value */
22665 while ((dev2 = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev2)) != NULL) {
22666 @@ -988,20 +989,25 @@ static int pcibios_lookup_irq(struct pci
22667 if (!info)
22668 continue;
22669 if (info->irq[pin].link == pirq) {
22670 - /* We refuse to override the dev->irq information. Give a warning! */
22671 - if ( dev2->irq && dev2->irq != irq && \
22672 + /*
22673 + * We refuse to override the dev->irq
22674 + * information. Give a warning!
22675 + */
22676 + if (dev2->irq && dev2->irq != irq && \
22677 (!(pci_probe & PCI_USE_PIRQ_MASK) || \
22678 - ((1 << dev2->irq) & mask)) ) {
22679 + ((1 << dev2->irq) & mask))) {
22680 #ifndef CONFIG_PCI_MSI
22681 - printk(KERN_INFO "IRQ routing conflict for %s, have irq %d, want irq %d\n",
22682 - pci_name(dev2), dev2->irq, irq);
22683 + dev_info(&dev2->dev, "IRQ routing conflict: "
22684 + "have IRQ %d, want IRQ %d\n",
22685 + dev2->irq, irq);
22686 #endif
22687 - continue;
22688 - }
22689 + continue;
22690 + }
22691 dev2->irq = irq;
22692 pirq_penalty[irq]++;
22693 if (dev != dev2)
22694 - printk(KERN_INFO "PCI: Sharing IRQ %d with %s\n", irq, pci_name(dev2));
22695 + dev_info(&dev->dev, "sharing IRQ %d with %s\n",
22696 + irq, pci_name(dev2));
22697 }
22698 }
22699 return 1;
22700 @@ -1015,15 +1021,20 @@ static void __init pcibios_fixup_irqs(vo
22701 DBG(KERN_DEBUG "PCI: IRQ fixup\n");
22702 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
22703 /*
22704 - * If the BIOS has set an out of range IRQ number, just ignore it.
22705 - * Also keep track of which IRQ's are already in use.
22706 + * If the BIOS has set an out of range IRQ number, just
22707 + * ignore it. Also keep track of which IRQ's are
22708 + * already in use.
22709 */
22710 if (dev->irq >= 16) {
22711 - DBG(KERN_DEBUG "%s: ignoring bogus IRQ %d\n", pci_name(dev), dev->irq);
22712 + dev_dbg(&dev->dev, "ignoring bogus IRQ %d\n", dev->irq);
22713 dev->irq = 0;
22714 }
22715 - /* If the IRQ is already assigned to a PCI device, ignore its ISA use penalty */
22716 - if (pirq_penalty[dev->irq] >= 100 && pirq_penalty[dev->irq] < 100000)
22717 + /*
22718 + * If the IRQ is already assigned to a PCI device,
22719 + * ignore its ISA use penalty
22720 + */
22721 + if (pirq_penalty[dev->irq] >= 100 &&
22722 + pirq_penalty[dev->irq] < 100000)
22723 pirq_penalty[dev->irq] = 0;
22724 pirq_penalty[dev->irq]++;
22725 }
22726 @@ -1035,13 +1046,17 @@ static void __init pcibios_fixup_irqs(vo
22727 /*
22728 * Recalculate IRQ numbers if we use the I/O APIC.
22729 */
22730 - if (io_apic_assign_pci_irqs)
22731 - {
22732 + if (io_apic_assign_pci_irqs) {
22733 int irq;
22734
22735 if (pin) {
22736 - pin--; /* interrupt pins are numbered starting from 1 */
22737 - irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin);
22738 + /*
22739 + * interrupt pins are numbered starting
22740 + * from 1
22741 + */
22742 + pin--;
22743 + irq = IO_APIC_get_PCI_irq_vector(dev->bus->number,
22744 + PCI_SLOT(dev->devfn), pin);
22745 /*
22746 * Busses behind bridges are typically not listed in the MP-table.
22747 * In this case we have to look up the IRQ based on the parent bus,
22748 @@ -1049,18 +1064,18 @@ static void __init pcibios_fixup_irqs(vo
22749 * busses itself so we should get into this branch reliably.
22750 */
22751 if (irq < 0 && dev->bus->parent) { /* go back to the bridge */
22752 - struct pci_dev * bridge = dev->bus->self;
22753 + struct pci_dev *bridge = dev->bus->self;
22754
22755 pin = (pin + PCI_SLOT(dev->devfn)) % 4;
22756 - irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
22757 + irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
22758 PCI_SLOT(bridge->devfn), pin);
22759 if (irq >= 0)
22760 - printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n",
22761 - pci_name(bridge), 'A' + pin, irq);
22762 + dev_warn(&dev->dev, "using bridge %s INT %c to get IRQ %d\n",
22763 + pci_name(bridge),
22764 + 'A' + pin, irq);
22765 }
22766 if (irq >= 0) {
22767 - printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n",
22768 - pci_name(dev), 'A' + pin, irq);
22769 + dev_info(&dev->dev, "PCI->APIC IRQ transform: INT %c -> IRQ %d\n", 'A' + pin, irq);
22770 dev->irq = irq;
22771 }
22772 }
22773 @@ -1082,7 +1097,8 @@ static int __init fix_broken_hp_bios_irq
22774 {
22775 if (!broken_hp_bios_irq9) {
22776 broken_hp_bios_irq9 = 1;
22777 - printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", d->ident);
22778 + printk(KERN_INFO "%s detected - fixing broken IRQ routing\n",
22779 + d->ident);
22780 }
22781 return 0;
22782 }
22783 @@ -1095,7 +1111,8 @@ static int __init fix_acer_tm360_irqrout
22784 {
22785 if (!acer_tm360_irqrouting) {
22786 acer_tm360_irqrouting = 1;
22787 - printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", d->ident);
22788 + printk(KERN_INFO "%s detected - fixing broken IRQ routing\n",
22789 + d->ident);
22790 }
22791 return 0;
22792 }
22793 @@ -1107,7 +1124,8 @@ static struct dmi_system_id __initdata p
22794 .matches = {
22795 DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
22796 DMI_MATCH(DMI_BIOS_VERSION, "GE.M1.03"),
22797 - DMI_MATCH(DMI_PRODUCT_VERSION, "HP Pavilion Notebook Model GE"),
22798 + DMI_MATCH(DMI_PRODUCT_VERSION,
22799 + "HP Pavilion Notebook Model GE"),
22800 DMI_MATCH(DMI_BOARD_VERSION, "OmniBook N32N-736"),
22801 },
22802 },
22803 @@ -1122,7 +1140,7 @@ static struct dmi_system_id __initdata p
22804 { }
22805 };
22806
22807 -static int __init pcibios_irq_init(void)
22808 +int __init pcibios_irq_init(void)
22809 {
22810 DBG(KERN_DEBUG "PCI: IRQ init\n");
22811
22812 @@ -1142,11 +1160,14 @@ static int __init pcibios_irq_init(void)
22813 pirq_find_router(&pirq_router);
22814 if (pirq_table->exclusive_irqs) {
22815 int i;
22816 - for (i=0; i<16; i++)
22817 + for (i = 0; i < 16; i++)
22818 if (!(pirq_table->exclusive_irqs & (1 << i)))
22819 pirq_penalty[i] += 100;
22820 }
22821 - /* If we're using the I/O APIC, avoid using the PCI IRQ routing table */
22822 + /*
22823 + * If we're using the I/O APIC, avoid using the PCI IRQ
22824 + * routing table
22825 + */
22826 if (io_apic_assign_pci_irqs)
22827 pirq_table = NULL;
22828 }
22829 @@ -1157,9 +1178,6 @@ static int __init pcibios_irq_init(void)
22830 return 0;
22831 }
22832
22833 -subsys_initcall(pcibios_irq_init);
22834 -
22835 -
22836 static void pirq_penalize_isa_irq(int irq, int active)
22837 {
22838 /*
22839 @@ -1193,7 +1211,7 @@ static int pirq_enable_irq(struct pci_de
22840 if (pin && !pcibios_lookup_irq(dev, 1) && !dev->irq) {
22841 char *msg = "";
22842
22843 - pin--; /* interrupt pins are numbered starting from 1 */
22844 + pin--; /* interrupt pins are numbered starting from 1 */
22845
22846 if (io_apic_assign_pci_irqs) {
22847 int irq;
22848 @@ -1207,35 +1225,41 @@ static int pirq_enable_irq(struct pci_de
22849 */
22850 temp_dev = dev;
22851 while (irq < 0 && dev->bus->parent) { /* go back to the bridge */
22852 - struct pci_dev * bridge = dev->bus->self;
22853 + struct pci_dev *bridge = dev->bus->self;
22854
22855 pin = (pin + PCI_SLOT(dev->devfn)) % 4;
22856 - irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
22857 + irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
22858 PCI_SLOT(bridge->devfn), pin);
22859 if (irq >= 0)
22860 - printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n",
22861 - pci_name(bridge), 'A' + pin, irq);
22862 + dev_warn(&dev->dev, "using bridge %s "
22863 + "INT %c to get IRQ %d\n",
22864 + pci_name(bridge), 'A' + pin,
22865 + irq);
22866 dev = bridge;
22867 }
22868 dev = temp_dev;
22869 if (irq >= 0) {
22870 - printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n",
22871 - pci_name(dev), 'A' + pin, irq);
22872 + dev_info(&dev->dev, "PCI->APIC IRQ transform: "
22873 + "INT %c -> IRQ %d\n", 'A' + pin, irq);
22874 dev->irq = irq;
22875 return 0;
22876 } else
22877 - msg = " Probably buggy MP table.";
22878 + msg = "; probably buggy MP table";
22879 } else if (pci_probe & PCI_BIOS_IRQ_SCAN)
22880 msg = "";
22881 else
22882 - msg = " Please try using pci=biosirq.";
22883 + msg = "; please try using pci=biosirq";
22884
22885 - /* With IDE legacy devices the IRQ lookup failure is not a problem.. */
22886 - if (dev->class >> 8 == PCI_CLASS_STORAGE_IDE && !(dev->class & 0x5))
22887 + /*
22888 + * With IDE legacy devices the IRQ lookup failure is not
22889 + * a problem..
22890 + */
22891 + if (dev->class >> 8 == PCI_CLASS_STORAGE_IDE &&
22892 + !(dev->class & 0x5))
22893 return 0;
22894
22895 - printk(KERN_WARNING "PCI: No IRQ known for interrupt pin %c of device %s.%s\n",
22896 - 'A' + pin, pci_name(dev), msg);
22897 + dev_warn(&dev->dev, "can't find IRQ for PCI INT %c%s\n",
22898 + 'A' + pin, msg);
22899 }
22900 return 0;
22901 }
22902 --- sle11-2009-10-16.orig/arch/x86/vdso/Makefile 2009-03-16 16:33:40.000000000 +0100
22903 +++ sle11-2009-10-16/arch/x86/vdso/Makefile 2009-06-04 10:21:39.000000000 +0200
22904 @@ -65,9 +65,7 @@ obj-$(VDSO32-y) += vdso32-syms.lds
22905 vdso32.so-$(VDSO32-y) += int80
22906 vdso32.so-$(CONFIG_COMPAT) += syscall
22907 vdso32.so-$(VDSO32-y) += sysenter
22908 -xen-vdso32-$(subst 1,$(CONFIG_COMPAT),$(shell expr $(CONFIG_XEN_COMPAT)0 '<' 0x0302000)) += int80
22909 -xen-vdso32-$(CONFIG_X86_32) += syscall
22910 -vdso32.so-$(CONFIG_XEN) += $(xen-vdso32-y)
22911 +vdso32.so-$(CONFIG_X86_XEN) += syscall
22912
22913 vdso32-images = $(vdso32.so-y:%=vdso32-%.so)
22914
22915 --- sle11-2009-10-16.orig/arch/x86/vdso/vdso32.S 2009-03-16 16:33:40.000000000 +0100
22916 +++ sle11-2009-10-16/arch/x86/vdso/vdso32.S 2009-06-04 10:21:39.000000000 +0200
22917 @@ -9,7 +9,7 @@ vdso32_int80_end:
22918
22919 .globl vdso32_syscall_start, vdso32_syscall_end
22920 vdso32_syscall_start:
22921 -#ifdef CONFIG_COMPAT
22922 +#if defined(CONFIG_COMPAT) || defined(CONFIG_X86_XEN)
22923 .incbin "arch/x86/vdso/vdso32-syscall.so"
22924 #endif
22925 vdso32_syscall_end:
22926 @@ -19,16 +19,4 @@ vdso32_sysenter_start:
22927 .incbin "arch/x86/vdso/vdso32-sysenter.so"
22928 vdso32_sysenter_end:
22929
22930 -#if defined(CONFIG_X86_64_XEN) && CONFIG_XEN_COMPAT < 0x030200
22931 - .globl vdso32_int80_start, vdso32_int80_end
22932 -vdso32_int80_start:
22933 - .incbin "arch/x86/vdso/vdso32-int80.so"
22934 -vdso32_int80_end:
22935 -#elif defined(CONFIG_X86_XEN)
22936 - .globl vdso32_syscall_start, vdso32_syscall_end
22937 -vdso32_syscall_start:
22938 - .incbin "arch/x86/vdso/vdso32-syscall.so"
22939 -vdso32_syscall_end:
22940 -#endif
22941 -
22942 __FINIT
22943 --- sle11-2009-10-16.orig/arch/x86/vdso/vdso32-setup-xen.c 2009-03-16 16:38:05.000000000 +0100
22944 +++ sle11-2009-10-16/arch/x86/vdso/vdso32-setup-xen.c 2009-06-04 10:21:39.000000000 +0200
22945 @@ -195,50 +195,28 @@ static __init void relocate_vdso(Elf32_E
22946 }
22947 }
22948
22949 -/*
22950 - * These symbols are defined by vdso32.S to mark the bounds
22951 - * of the ELF DSO images included therein.
22952 - */
22953 -extern const char vdso32_default_start, vdso32_default_end;
22954 -extern const char vdso32_sysenter_start, vdso32_sysenter_end;
22955 static struct page *vdso32_pages[1];
22956
22957 #ifdef CONFIG_X86_64
22958
22959 -#if CONFIG_XEN_COMPAT < 0x030200
22960 -static int use_int80 = 1;
22961 -#endif
22962 -static int use_sysenter __read_mostly = -1;
22963 -
22964 -#define vdso32_sysenter() (use_sysenter > 0)
22965 +#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SYSENTER32))
22966 +#define vdso32_syscall() (boot_cpu_has(X86_FEATURE_SYSCALL32))
22967
22968 -/* May not be __init: called during resume */
22969 -void syscall32_cpu_init(void)
22970 +void __cpuinit syscall32_cpu_init(void)
22971 {
22972 - static const struct callback_register cstar = {
22973 + static const struct callback_register __cpuinitconst cstar = {
22974 .type = CALLBACKTYPE_syscall32,
22975 .address = (unsigned long)ia32_cstar_target
22976 };
22977 - static const struct callback_register sysenter = {
22978 + static const struct callback_register __cpuinitconst sysenter = {
22979 .type = CALLBACKTYPE_sysenter,
22980 .address = (unsigned long)ia32_sysenter_target
22981 };
22982
22983 - if ((HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) < 0) ||
22984 - (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) < 0))
22985 -#if CONFIG_XEN_COMPAT < 0x030200
22986 - return;
22987 - use_int80 = 0;
22988 -#else
22989 - BUG();
22990 -#endif
22991 -
22992 - if (use_sysenter < 0) {
22993 - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
22994 - use_sysenter = 1;
22995 - if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR)
22996 - use_sysenter = 1;
22997 - }
22998 + if (HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) < 0)
22999 + setup_clear_cpu_cap(X86_FEATURE_SYSENTER32);
23000 + if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) < 0)
23001 + setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
23002 }
23003
23004 #define compat_uses_vma 1
23005 @@ -250,6 +228,7 @@ static inline void map_compat_vdso(int m
23006 #else /* CONFIG_X86_32 */
23007
23008 #define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SEP))
23009 +#define vdso32_syscall() (boot_cpu_has(X86_FEATURE_SYSCALL32))
23010
23011 extern asmlinkage void ia32pv_cstar_target(void);
23012 static const struct callback_register __cpuinitconst cstar = {
23013 @@ -265,13 +244,13 @@ void __cpuinit enable_sep_cpu(void)
23014 .address = { __KERNEL_CS, (unsigned long)ia32pv_sysenter_target },
23015 };
23016
23017 - if (boot_cpu_has(X86_FEATURE_SYSCALL)) {
23018 + if (vdso32_syscall()) {
23019 if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) != 0)
23020 BUG();
23021 return;
23022 }
23023
23024 - if (!boot_cpu_has(X86_FEATURE_SEP))
23025 + if (!vdso32_sysenter())
23026 return;
23027
23028 if (xen_feature(XENFEAT_supervisor_mode_kernel))
23029 @@ -341,34 +320,26 @@ int __init sysenter_setup(void)
23030
23031 #ifdef CONFIG_X86_32
23032 gate_vma_init();
23033 -#endif
23034
23035 -#if defined(CONFIG_X86_64) && CONFIG_XEN_COMPAT < 0x030200
23036 - if (use_int80) {
23037 - extern const char vdso32_int80_start, vdso32_int80_end;
23038 -
23039 - vsyscall = &vdso32_int80_start;
23040 - vsyscall_len = &vdso32_int80_end - &vdso32_int80_start;
23041 - } else
23042 -#elif defined(CONFIG_X86_32)
23043 - if (boot_cpu_has(X86_FEATURE_SYSCALL)
23044 - && (boot_cpu_data.x86_vendor != X86_VENDOR_AMD
23045 - || HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) != 0))
23046 - setup_clear_cpu_cap(X86_FEATURE_SYSCALL);
23047 - barrier(); /* until clear_bit()'s constraints are correct ... */
23048 if (boot_cpu_has(X86_FEATURE_SYSCALL)) {
23049 - extern const char vdso32_syscall_start, vdso32_syscall_end;
23050 -
23051 + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD
23052 + && HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) == 0)
23053 + setup_force_cpu_cap(X86_FEATURE_SYSCALL32);
23054 + else {
23055 + setup_clear_cpu_cap(X86_FEATURE_SYSCALL);
23056 + setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
23057 + }
23058 + }
23059 +#endif
23060 + if (vdso32_syscall()) {
23061 vsyscall = &vdso32_syscall_start;
23062 vsyscall_len = &vdso32_syscall_end - &vdso32_syscall_start;
23063 - } else
23064 -#endif
23065 - if (!vdso32_sysenter()) {
23066 - vsyscall = &vdso32_default_start;
23067 - vsyscall_len = &vdso32_default_end - &vdso32_default_start;
23068 - } else {
23069 + } else if (vdso32_sysenter()){
23070 vsyscall = &vdso32_sysenter_start;
23071 vsyscall_len = &vdso32_sysenter_end - &vdso32_sysenter_start;
23072 + } else {
23073 + vsyscall = &vdso32_int80_start;
23074 + vsyscall_len = &vdso32_int80_end - &vdso32_int80_start;
23075 }
23076
23077 memcpy(syscall_page, vsyscall, vsyscall_len);
23078 --- sle11-2009-10-16.orig/arch/x86/xen/Kconfig 2009-02-16 16:17:21.000000000 +0100
23079 +++ sle11-2009-10-16/arch/x86/xen/Kconfig 2009-06-04 10:21:39.000000000 +0200
23080 @@ -17,7 +17,7 @@ config XEN_MAX_DOMAIN_MEMORY
23081 int "Maximum allowed size of a domain in gigabytes"
23082 default 8 if X86_32
23083 default 32 if X86_64
23084 - depends on XEN
23085 + depends on PARAVIRT_XEN
23086 help
23087 The pseudo-physical to machine address array is sized
23088 according to the maximum possible memory size of a Xen
23089 @@ -26,5 +26,5 @@ config XEN_MAX_DOMAIN_MEMORY
23090
23091 config XEN_SAVE_RESTORE
23092 bool
23093 - depends on PM
23094 + depends on PARAVIRT_XEN && PM
23095 default y
23096 \ No newline at end of file
23097 --- sle11-2009-10-16.orig/drivers/acpi/processor_core.c 2009-08-26 11:54:44.000000000 +0200
23098 +++ sle11-2009-10-16/drivers/acpi/processor_core.c 2009-08-26 12:04:00.000000000 +0200
23099 @@ -730,9 +730,11 @@ static int __cpuinit acpi_processor_star
23100 if (result)
23101 goto end;
23102
23103 - sysdev = get_cpu_sysdev(pr->id);
23104 - if (sysfs_create_link(&device->dev.kobj, &sysdev->kobj, "sysdev"))
23105 - return -EFAULT;
23106 + if (pr->id != -1) {
23107 + sysdev = get_cpu_sysdev(pr->id);
23108 + if (sysfs_create_link(&device->dev.kobj, &sysdev->kobj, "sysdev"))
23109 + return -EFAULT;
23110 + }
23111
23112 status = acpi_install_notify_handler(pr->handle, ACPI_DEVICE_NOTIFY,
23113 acpi_processor_notify, pr);
23114 @@ -904,7 +906,8 @@ static int acpi_processor_remove(struct
23115 status = acpi_remove_notify_handler(pr->handle, ACPI_DEVICE_NOTIFY,
23116 acpi_processor_notify);
23117
23118 - sysfs_remove_link(&device->dev.kobj, "sysdev");
23119 + if (pr->id != -1)
23120 + sysfs_remove_link(&device->dev.kobj, "sysdev");
23121
23122 acpi_processor_remove_fs(device);
23123
23124 --- sle11-2009-10-16.orig/drivers/char/tpm/tpm_vtpm.c 2009-08-26 11:52:33.000000000 +0200
23125 +++ sle11-2009-10-16/drivers/char/tpm/tpm_vtpm.c 2009-06-04 10:21:39.000000000 +0200
23126 @@ -347,7 +347,7 @@ static int _vtpm_send_queued(struct tpm_
23127 {
23128 int rc;
23129 int error = 0;
23130 - long flags;
23131 + unsigned long flags;
23132 unsigned char buffer[1];
23133 struct vtpm_state *vtpms;
23134 vtpms = (struct vtpm_state *)chip_get_private(chip);
23135 --- sle11-2009-10-16.orig/drivers/misc/Kconfig 2009-10-28 14:55:02.000000000 +0100
23136 +++ sle11-2009-10-16/drivers/misc/Kconfig 2009-08-26 12:04:11.000000000 +0200
23137 @@ -440,7 +440,7 @@ config ENCLOSURE_SERVICES
23138 config SGI_XP
23139 tristate "Support communication between SGI SSIs"
23140 depends on NET
23141 - depends on (IA64_GENERIC || IA64_SGI_SN2 || IA64_SGI_UV || X86_64) && SMP
23142 + depends on (IA64_GENERIC || IA64_SGI_SN2 || IA64_SGI_UV || X86_64) && SMP && !XEN
23143 select IA64_UNCACHED_ALLOCATOR if IA64_GENERIC || IA64_SGI_SN2
23144 select GENERIC_ALLOCATOR if IA64_GENERIC || IA64_SGI_SN2
23145 select SGI_GRU if (IA64_GENERIC || IA64_SGI_UV || X86_64) && SMP
23146 @@ -467,7 +467,7 @@ config HP_ILO
23147
23148 config SGI_GRU
23149 tristate "SGI GRU driver"
23150 - depends on (X86_64 || IA64_SGI_UV || IA64_GENERIC) && SMP
23151 + depends on (X86_64 || IA64_SGI_UV || IA64_GENERIC) && SMP && !XEN
23152 default n
23153 select MMU_NOTIFIER
23154 ---help---
23155 --- sle11-2009-10-16.orig/drivers/pci/msi-xen.c 2009-03-16 16:38:05.000000000 +0100
23156 +++ sle11-2009-10-16/drivers/pci/msi-xen.c 2009-06-04 10:21:39.000000000 +0200
23157 @@ -90,12 +90,10 @@ arch_teardown_msi_irqs(struct pci_dev *d
23158 }
23159 #endif
23160
23161 -static void msi_set_enable(struct pci_dev *dev, int enable)
23162 +static void __msi_set_enable(struct pci_dev *dev, int pos, int enable)
23163 {
23164 - int pos;
23165 u16 control;
23166
23167 - pos = pci_find_capability(dev, PCI_CAP_ID_MSI);
23168 if (pos) {
23169 pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &control);
23170 control &= ~PCI_MSI_FLAGS_ENABLE;
23171 @@ -105,6 +103,11 @@ static void msi_set_enable(struct pci_de
23172 }
23173 }
23174
23175 +static void msi_set_enable(struct pci_dev *dev, int enable)
23176 +{
23177 + __msi_set_enable(dev, pci_find_capability(dev, PCI_CAP_ID_MSI), enable);
23178 +}
23179 +
23180 static void msix_set_enable(struct pci_dev *dev, int enable)
23181 {
23182 int pos;
23183 @@ -568,9 +571,8 @@ int pci_enable_msi(struct pci_dev* dev)
23184
23185 /* Check whether driver already requested for MSI-X irqs */
23186 if (dev->msix_enabled) {
23187 - printk(KERN_INFO "PCI: %s: Can't enable MSI. "
23188 - "Device already has MSI-X enabled\n",
23189 - pci_name(dev));
23190 + dev_info(&dev->dev, "can't enable MSI "
23191 + "(MSI-X already enabled)\n");
23192 return -EINVAL;
23193 }
23194
23195 @@ -702,9 +704,8 @@ int pci_enable_msix(struct pci_dev* dev,
23196 temp = dev->irq;
23197 /* Check whether driver already requested for MSI vector */
23198 if (dev->msi_enabled) {
23199 - printk(KERN_INFO "PCI: %s: Can't enable MSI-X. "
23200 - "Device already has an MSI irq assigned\n",
23201 - pci_name(dev));
23202 + dev_info(&dev->dev, "can't enable MSI-X "
23203 + "(MSI IRQ already assigned)\n");
23204 return -EINVAL;
23205 }
23206
23207 --- sle11-2009-10-16.orig/drivers/pci/quirks.c 2009-10-28 14:55:02.000000000 +0100
23208 +++ sle11-2009-10-16/drivers/pci/quirks.c 2009-06-04 10:21:39.000000000 +0200
23209 @@ -44,9 +44,8 @@ static void __devinit quirk_release_reso
23210 /* PCI Host Bridge isn't a target device */
23211 return;
23212 }
23213 - printk(KERN_INFO
23214 - "PCI: Disable memory decoding and release memory resources [%s].\n",
23215 - pci_name(dev));
23216 + dev_info(&dev->dev,
23217 + "disable memory decoding and release memory resources\n");
23218 pci_read_config_word(dev, PCI_COMMAND, &command);
23219 command &= ~PCI_COMMAND_MEMORY;
23220 pci_write_config_word(dev, PCI_COMMAND, command);
23221 --- sle11-2009-10-16.orig/drivers/pci/setup-res.c 2009-10-28 14:55:02.000000000 +0100
23222 +++ sle11-2009-10-16/drivers/pci/setup-res.c 2009-06-04 10:21:39.000000000 +0200
23223 @@ -129,7 +129,7 @@ int pci_claim_resource(struct pci_dev *d
23224 #ifdef CONFIG_PCI_REASSIGN
23225 void pci_disable_bridge_window(struct pci_dev *dev)
23226 {
23227 - printk(KERN_DEBUG "PCI: Disable bridge window on %s\n", pci_name(dev));
23228 + dev_dbg(&dev->dev, "disable bridge window\n");
23229
23230 /* MMIO Base/Limit */
23231 pci_write_config_dword(dev, PCI_MEMORY_BASE, 0x0000fff0);
23232 @@ -190,8 +190,8 @@ int pci_assign_resource(struct pci_dev *
23233 res->flags &= ~IORESOURCE_STARTALIGN;
23234 if (resno < PCI_BRIDGE_RESOURCES) {
23235 #ifdef CONFIG_PCI_REASSIGN
23236 - printk(KERN_DEBUG "PCI: Assign resource(%d) on %s "
23237 - "%016llx - %016llx\n", resno, pci_name(dev),
23238 + dev_dbg(&dev->dev, "assign resource(%d) "
23239 + "%016llx - %016llx\n", resno,
23240 (unsigned long long)res->start,
23241 (unsigned long long)res->end);
23242 #endif
23243 @@ -235,8 +235,8 @@ int pci_assign_resource_fixed(struct pci
23244 (unsigned long long)res->end);
23245 } else if (resno < PCI_BRIDGE_RESOURCES) {
23246 #ifdef CONFIG_PCI_REASSIGN
23247 - printk(KERN_DEBUG "PCI: Assign resource(%d) on %s "
23248 - "%016llx - %016llx\n", resno, pci_name(dev),
23249 + dev_dbg(&dev->dev, "assign resource(%d) "
23250 + "%016llx - %016llx\n", resno,
23251 (unsigned long long)res->start,
23252 (unsigned long long)res->end);
23253 #endif
23254 --- sle11-2009-10-16.orig/drivers/xen/Makefile 2009-03-16 16:38:05.000000000 +0100
23255 +++ sle11-2009-10-16/drivers/xen/Makefile 2009-06-04 10:21:39.000000000 +0200
23256 @@ -1,4 +1,4 @@
23257 -obj-$(CONFIG_PARAVIRT_XEN) += grant-table.o features.o events.o
23258 +obj-$(CONFIG_PARAVIRT_XEN) += grant-table.o features.o events.o manage.o
23259 xen-xencomm-$(CONFIG_PARAVIRT_XEN) := xencomm.o
23260 xen-balloon-$(CONFIG_PARAVIRT_XEN) := balloon.o
23261
23262 --- sle11-2009-10-16.orig/drivers/xen/balloon/balloon.c 2009-06-29 15:28:36.000000000 +0200
23263 +++ sle11-2009-10-16/drivers/xen/balloon/balloon.c 2009-06-29 15:30:29.000000000 +0200
23264 @@ -84,7 +84,7 @@ static unsigned long frame_list[PAGE_SIZ
23265 /* VM /proc information for memory */
23266 extern unsigned long totalram_pages;
23267
23268 -#if !defined(MODULE) && defined(CONFIG_HIGHMEM)
23269 +#ifdef CONFIG_HIGHMEM
23270 extern unsigned long totalhigh_pages;
23271 #define inc_totalhigh_pages() (totalhigh_pages++)
23272 #define dec_totalhigh_pages() (totalhigh_pages--)
23273 --- sle11-2009-10-16.orig/drivers/xen/balloon/sysfs.c 2009-06-29 15:29:24.000000000 +0200
23274 +++ sle11-2009-10-16/drivers/xen/balloon/sysfs.c 2009-06-29 15:31:06.000000000 +0200
23275 @@ -45,6 +45,7 @@
23276
23277 #define BALLOON_SHOW(name, format, args...) \
23278 static ssize_t show_##name(struct sys_device *dev, \
23279 + struct sysdev_attribute *attr, \
23280 char *buf) \
23281 { \
23282 return sprintf(buf, format, ##args); \
23283 @@ -56,14 +57,15 @@ BALLOON_SHOW(low_kb, "%lu\n", PAGES2KB(b
23284 BALLOON_SHOW(high_kb, "%lu\n", PAGES2KB(bs.balloon_high));
23285 BALLOON_SHOW(driver_kb, "%lu\n", PAGES2KB(bs.driver_pages));
23286
23287 -static ssize_t show_target_kb(struct sys_device *dev, char *buf)
23288 +static ssize_t show_target_kb(struct sys_device *dev,
23289 + struct sysdev_attribute *attr, char *buf)
23290 {
23291 return sprintf(buf, "%lu\n", PAGES2KB(bs.target_pages));
23292 }
23293
23294 static ssize_t store_target_kb(struct sys_device *dev,
23295 - const char *buf,
23296 - size_t count)
23297 + struct sysdev_attribute *attr,
23298 + const char *buf, size_t count)
23299 {
23300 char memstring[64], *endchar;
23301 unsigned long long target_bytes;
23302 --- sle11-2009-10-16.orig/drivers/xen/blktap/blktap.c 2009-04-20 11:40:14.000000000 +0200
23303 +++ sle11-2009-10-16/drivers/xen/blktap/blktap.c 2009-06-04 10:21:39.000000000 +0200
23304 @@ -54,6 +54,7 @@
23305 #include <linux/gfp.h>
23306 #include <linux/poll.h>
23307 #include <linux/delay.h>
23308 +#include <linux/nsproxy.h>
23309 #include <asm/tlbflush.h>
23310
23311 #define MAX_TAP_DEV 256 /*the maximum number of tapdisk ring devices */
23312 @@ -498,7 +499,7 @@ found:
23313
23314 if ((class = get_xen_class()) != NULL)
23315 device_create(class, NULL, MKDEV(blktap_major, minor),
23316 - "blktap%d", minor);
23317 + NULL, "blktap%d", minor);
23318 }
23319
23320 out:
23321 @@ -1683,7 +1684,8 @@ static int __init blkif_init(void)
23322 * We only create the device when a request of a new device is
23323 * made.
23324 */
23325 - device_create(class, NULL, MKDEV(blktap_major, 0), "blktap0");
23326 + device_create(class, NULL, MKDEV(blktap_major, 0), NULL,
23327 + "blktap0");
23328 } else {
23329 /* this is bad, but not fatal */
23330 WPRINTK("blktap: sysfs xen_class not created\n");
23331 --- sle11-2009-10-16.orig/drivers/xen/char/mem.c 2009-03-16 16:38:05.000000000 +0100
23332 +++ sle11-2009-10-16/drivers/xen/char/mem.c 2009-06-04 10:21:39.000000000 +0200
23333 @@ -35,7 +35,7 @@ static inline int uncached_access(struct
23334
23335 static inline int range_is_allowed(unsigned long pfn, unsigned long size)
23336 {
23337 -#ifdef CONFIG_NONPROMISC_DEVMEM
23338 +#ifdef CONFIG_STRICT_DEVMEM
23339 u64 from = ((u64)pfn) << PAGE_SHIFT;
23340 u64 to = from + size;
23341 u64 cursor = from;
23342 @@ -172,7 +172,10 @@ static void mmap_mem_close(struct vm_are
23343
23344 static struct vm_operations_struct mmap_mem_ops = {
23345 .open = mmap_mem_open,
23346 - .close = mmap_mem_close
23347 + .close = mmap_mem_close,
23348 +#ifdef CONFIG_HAVE_IOREMAP_PROT
23349 + .access = generic_access_phys
23350 +#endif
23351 };
23352
23353 static int xen_mmap_mem(struct file * file, struct vm_area_struct * vma)
23354 --- sle11-2009-10-16.orig/drivers/xen/console/console.c 2009-03-16 16:38:05.000000000 +0100
23355 +++ sle11-2009-10-16/drivers/xen/console/console.c 2009-06-04 10:21:39.000000000 +0200
23356 @@ -432,9 +432,7 @@ static void __xencons_tx_flush(void)
23357
23358 if (work_done && (xencons_tty != NULL)) {
23359 wake_up_interruptible(&xencons_tty->write_wait);
23360 - if ((xencons_tty->flags & (1 << TTY_DO_WRITE_WAKEUP)) &&
23361 - (xencons_tty->ldisc.write_wakeup != NULL))
23362 - (xencons_tty->ldisc.write_wakeup)(xencons_tty);
23363 + tty_wakeup(xencons_tty);
23364 }
23365 }
23366
23367 @@ -635,8 +633,8 @@ static void xencons_close(struct tty_str
23368 tty->closing = 1;
23369 tty_wait_until_sent(tty, 0);
23370 tty_driver_flush_buffer(tty);
23371 - if (tty->ldisc.flush_buffer != NULL)
23372 - tty->ldisc.flush_buffer(tty);
23373 + if (tty->ldisc.ops->flush_buffer != NULL)
23374 + tty->ldisc.ops->flush_buffer(tty);
23375 tty->closing = 0;
23376 spin_lock_irqsave(&xencons_lock, flags);
23377 xencons_tty = NULL;
23378 --- sle11-2009-10-16.orig/drivers/xen/core/evtchn.c 2009-03-16 16:33:40.000000000 +0100
23379 +++ sle11-2009-10-16/drivers/xen/core/evtchn.c 2009-06-04 10:21:39.000000000 +0200
23380 @@ -746,8 +746,9 @@ static struct irq_chip dynirq_chip = {
23381 };
23382
23383 /* Bitmap indicating which PIRQs require Xen to be notified on unmask. */
23384 -static int pirq_eoi_does_unmask;
23385 +static bool pirq_eoi_does_unmask;
23386 static unsigned long *pirq_needs_eoi;
23387 +static DECLARE_BITMAP(probing_pirq, NR_PIRQS);
23388
23389 static void pirq_unmask_and_notify(unsigned int evtchn, unsigned int irq)
23390 {
23391 @@ -794,25 +795,31 @@ static inline void pirq_query_unmask(int
23392 set_bit(irq - PIRQ_BASE, pirq_needs_eoi);
23393 }
23394
23395 -/*
23396 - * On startup, if there is no action associated with the IRQ then we are
23397 - * probing. In this case we should not share with others as it will confuse us.
23398 - */
23399 -#define probing_irq(_irq) (irq_desc[(_irq)].action == NULL)
23400 +static int set_type_pirq(unsigned int irq, unsigned int type)
23401 +{
23402 + if (type != IRQ_TYPE_PROBE)
23403 + return -EINVAL;
23404 + set_bit(irq - PIRQ_BASE, probing_pirq);
23405 + return 0;
23406 +}
23407
23408 static unsigned int startup_pirq(unsigned int irq)
23409 {
23410 struct evtchn_bind_pirq bind_pirq;
23411 int evtchn = evtchn_from_irq(irq);
23412
23413 - if (VALID_EVTCHN(evtchn))
23414 + if (VALID_EVTCHN(evtchn)) {
23415 + clear_bit(irq - PIRQ_BASE, probing_pirq);
23416 goto out;
23417 + }
23418
23419 bind_pirq.pirq = evtchn_get_xen_pirq(irq);
23420 /* NB. We are happy to share unless we are probing. */
23421 - bind_pirq.flags = probing_irq(irq) ? 0 : BIND_PIRQ__WILL_SHARE;
23422 + bind_pirq.flags = test_and_clear_bit(irq - PIRQ_BASE, probing_pirq)
23423 + || (irq_desc[irq].status & IRQ_AUTODETECT)
23424 + ? 0 : BIND_PIRQ__WILL_SHARE;
23425 if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind_pirq) != 0) {
23426 - if (!probing_irq(irq))
23427 + if (bind_pirq.flags)
23428 printk(KERN_INFO "Failed to obtain physical IRQ %d\n",
23429 irq);
23430 return 0;
23431 @@ -891,6 +898,7 @@ static struct irq_chip pirq_chip = {
23432 .mask_ack = ack_pirq,
23433 .ack = ack_pirq,
23434 .end = end_pirq,
23435 + .set_type = set_type_pirq,
23436 #ifdef CONFIG_SMP
23437 .set_affinity = set_affinity_irq,
23438 #endif
23439 @@ -1003,6 +1011,7 @@ void xen_poll_irq(int irq)
23440 BUG();
23441 }
23442
23443 +#ifdef CONFIG_PM_SLEEP
23444 static void restore_cpu_virqs(unsigned int cpu)
23445 {
23446 struct evtchn_bind_virq bind_virq;
23447 @@ -1095,6 +1104,7 @@ void irq_resume(void)
23448 }
23449
23450 }
23451 +#endif
23452
23453 #if defined(CONFIG_X86_IO_APIC)
23454 #define identity_mapped_irq(irq) (!IO_APIC_IRQ((irq) - PIRQ_BASE))
23455 @@ -1177,7 +1187,7 @@ void __init xen_init_IRQ(void)
23456 * BITS_TO_LONGS(ALIGN(NR_PIRQS, PAGE_SIZE * 8)));
23457 eoi_gmfn.gmfn = virt_to_machine(pirq_needs_eoi) >> PAGE_SHIFT;
23458 if (HYPERVISOR_physdev_op(PHYSDEVOP_pirq_eoi_gmfn, &eoi_gmfn) == 0)
23459 - pirq_eoi_does_unmask = 1;
23460 + pirq_eoi_does_unmask = true;
23461
23462 /* No event channels are 'live' right now. */
23463 for (i = 0; i < NR_EVENT_CHANNELS; i++)
23464 --- sle11-2009-10-16.orig/drivers/xen/core/gnttab.c 2008-12-01 11:25:57.000000000 +0100
23465 +++ sle11-2009-10-16/drivers/xen/core/gnttab.c 2009-06-04 10:21:39.000000000 +0200
23466 @@ -449,6 +449,7 @@ static int map_pte_fn(pte_t *pte, struct
23467 return 0;
23468 }
23469
23470 +#ifdef CONFIG_PM_SLEEP
23471 static int unmap_pte_fn(pte_t *pte, struct page *pmd_page,
23472 unsigned long addr, void *data)
23473 {
23474 @@ -456,6 +457,7 @@ static int unmap_pte_fn(pte_t *pte, stru
23475 set_pte_at(&init_mm, addr, pte, __pte(0));
23476 return 0;
23477 }
23478 +#endif
23479
23480 void *arch_gnttab_alloc_shared(unsigned long *frames)
23481 {
23482 @@ -633,6 +635,75 @@ void __gnttab_dma_map_page(struct page *
23483 } while (unlikely(read_seqretry(&gnttab_dma_lock, seq)));
23484 }
23485
23486 +#ifdef __HAVE_ARCH_PTE_SPECIAL
23487 +
23488 +static unsigned int GNTMAP_pte_special;
23489 +
23490 +bool gnttab_pre_map_adjust(unsigned int cmd, struct gnttab_map_grant_ref *map,
23491 + unsigned int count)
23492 +{
23493 + unsigned int i;
23494 +
23495 + if (unlikely(cmd != GNTTABOP_map_grant_ref))
23496 + count = 0;
23497 +
23498 + for (i = 0; i < count; ++i, ++map) {
23499 + if (!(map->flags & GNTMAP_host_map)
23500 + || !(map->flags & GNTMAP_application_map))
23501 + continue;
23502 + if (GNTMAP_pte_special)
23503 + map->flags |= GNTMAP_pte_special;
23504 + else {
23505 + BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
23506 + return true;
23507 + }
23508 + }
23509 +
23510 + return false;
23511 +}
23512 +EXPORT_SYMBOL(gnttab_pre_map_adjust);
23513 +
23514 +#if CONFIG_XEN_COMPAT < 0x030400
23515 +int gnttab_post_map_adjust(const struct gnttab_map_grant_ref *map, unsigned int count)
23516 +{
23517 + unsigned int i;
23518 + int rc = 0;
23519 +
23520 + for (i = 0; i < count && rc == 0; ++i, ++map) {
23521 + pte_t pte;
23522 +
23523 + if (!(map->flags & GNTMAP_host_map)
23524 + || !(map->flags & GNTMAP_application_map))
23525 + continue;
23526 +
23527 +#ifdef CONFIG_X86
23528 + pte = __pte_ma((map->dev_bus_addr | _PAGE_PRESENT | _PAGE_USER
23529 + | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_NX
23530 + | _PAGE_SPECIAL)
23531 + & __supported_pte_mask);
23532 +#else
23533 +#error Architecture not yet supported.
23534 +#endif
23535 + if (!(map->flags & GNTMAP_readonly))
23536 + pte = pte_mkwrite(pte);
23537 +
23538 + if (map->flags & GNTMAP_contains_pte) {
23539 + mmu_update_t u;
23540 +
23541 + u.ptr = map->host_addr;
23542 + u.val = __pte_val(pte);
23543 + rc = HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF);
23544 + } else
23545 + rc = HYPERVISOR_update_va_mapping(map->host_addr, pte, 0);
23546 + }
23547 +
23548 + return rc;
23549 +}
23550 +EXPORT_SYMBOL(gnttab_post_map_adjust);
23551 +#endif
23552 +
23553 +#endif /* __HAVE_ARCH_PTE_SPECIAL */
23554 +
23555 int gnttab_resume(void)
23556 {
23557 if (max_nr_grant_frames() < nr_grant_frames)
23558 @@ -640,6 +711,7 @@ int gnttab_resume(void)
23559 return gnttab_map(0, nr_grant_frames - 1);
23560 }
23561
23562 +#ifdef CONFIG_PM_SLEEP
23563 int gnttab_suspend(void)
23564 {
23565 #ifdef CONFIG_X86
23566 @@ -649,6 +721,7 @@ int gnttab_suspend(void)
23567 #endif
23568 return 0;
23569 }
23570 +#endif
23571
23572 #else /* !CONFIG_XEN */
23573
23574 @@ -759,6 +832,18 @@ int __devinit gnttab_init(void)
23575 gnttab_free_count = nr_init_grefs - NR_RESERVED_ENTRIES;
23576 gnttab_free_head = NR_RESERVED_ENTRIES;
23577
23578 +#if defined(CONFIG_XEN) && defined(__HAVE_ARCH_PTE_SPECIAL)
23579 + if (!xen_feature(XENFEAT_auto_translated_physmap)
23580 + && xen_feature(XENFEAT_gnttab_map_avail_bits)) {
23581 +#ifdef CONFIG_X86
23582 + GNTMAP_pte_special = (__pte_val(pte_mkspecial(__pte_ma(0)))
23583 + >> _PAGE_BIT_UNUSED1) << _GNTMAP_guest_avail0;
23584 +#else
23585 +#error Architecture not yet supported.
23586 +#endif
23587 + }
23588 +#endif
23589 +
23590 return 0;
23591
23592 ini_nomem:
23593 --- sle11-2009-10-16.orig/drivers/xen/core/machine_kexec.c 2009-03-16 16:38:05.000000000 +0100
23594 +++ sle11-2009-10-16/drivers/xen/core/machine_kexec.c 2009-06-04 10:21:39.000000000 +0200
23595 @@ -91,7 +91,7 @@ void __init xen_machine_kexec_setup_reso
23596 xen_hypervisor_res.start = range.start;
23597 xen_hypervisor_res.end = range.start + range.size - 1;
23598 xen_hypervisor_res.flags = IORESOURCE_BUSY | IORESOURCE_MEM;
23599 -#ifdef CONFIG_X86_64
23600 +#ifdef CONFIG_X86
23601 insert_resource(&iomem_resource, &xen_hypervisor_res);
23602 #endif
23603
23604 @@ -106,7 +106,7 @@ void __init xen_machine_kexec_setup_reso
23605 if (range.size) {
23606 crashk_res.start = range.start;
23607 crashk_res.end = range.start + range.size - 1;
23608 -#ifdef CONFIG_X86_64
23609 +#ifdef CONFIG_X86
23610 insert_resource(&iomem_resource, &crashk_res);
23611 #endif
23612 }
23613 @@ -160,7 +160,7 @@ void __init xen_machine_kexec_setup_reso
23614 return;
23615 }
23616
23617 -#ifndef CONFIG_X86_64
23618 +#ifndef CONFIG_X86
23619 void __init xen_machine_kexec_register_resources(struct resource *res)
23620 {
23621 request_resource(res, &xen_hypervisor_res);
23622 --- sle11-2009-10-16.orig/drivers/xen/core/machine_reboot.c 2009-10-28 14:55:02.000000000 +0100
23623 +++ sle11-2009-10-16/drivers/xen/core/machine_reboot.c 2009-06-04 10:21:39.000000000 +0200
23624 @@ -57,6 +57,7 @@ EXPORT_SYMBOL(machine_restart);
23625 EXPORT_SYMBOL(machine_halt);
23626 EXPORT_SYMBOL(machine_power_off);
23627
23628 +#ifdef CONFIG_PM_SLEEP
23629 static void pre_suspend(void)
23630 {
23631 HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
23632 @@ -111,6 +112,7 @@ static void post_suspend(int suspend_can
23633 HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
23634 virt_to_mfn(pfn_to_mfn_frame_list_list);
23635 }
23636 +#endif
23637
23638 #else /* !(defined(__i386__) || defined(__x86_64__)) */
23639
23640 @@ -129,6 +131,7 @@ static void post_suspend(int suspend_can
23641
23642 #endif
23643
23644 +#ifdef CONFIG_PM_SLEEP
23645 struct suspend {
23646 int fast_suspend;
23647 void (*resume_notifier)(int);
23648 @@ -222,7 +225,8 @@ int __xen_suspend(int fast_suspend, void
23649
23650 if (fast_suspend) {
23651 xenbus_suspend();
23652 - err = stop_machine_run(take_machine_down, &suspend, 0);
23653 + err = stop_machine(take_machine_down, &suspend,
23654 + &cpumask_of_cpu(0));
23655 if (err < 0)
23656 xenbus_suspend_cancel();
23657 } else {
23658 @@ -245,3 +249,4 @@ int __xen_suspend(int fast_suspend, void
23659
23660 return 0;
23661 }
23662 +#endif
23663 --- sle11-2009-10-16.orig/drivers/xen/core/reboot.c 2009-02-16 16:17:21.000000000 +0100
23664 +++ sle11-2009-10-16/drivers/xen/core/reboot.c 2009-06-04 10:21:39.000000000 +0200
23665 @@ -29,17 +29,12 @@ MODULE_LICENSE("Dual BSD/GPL");
23666 /* Ignore multiple shutdown requests. */
23667 static int shutting_down = SHUTDOWN_INVALID;
23668
23669 -/* Was last suspend request cancelled? */
23670 -static int suspend_cancelled;
23671 -
23672 /* Can we leave APs online when we suspend? */
23673 static int fast_suspend;
23674
23675 static void __shutdown_handler(struct work_struct *unused);
23676 static DECLARE_DELAYED_WORK(shutdown_work, __shutdown_handler);
23677
23678 -static int setup_suspend_evtchn(void);
23679 -
23680 int __xen_suspend(int fast_suspend, void (*resume_notifier)(int));
23681
23682 static int shutdown_process(void *__unused)
23683 @@ -69,6 +64,13 @@ static int shutdown_process(void *__unus
23684 return 0;
23685 }
23686
23687 +#ifdef CONFIG_PM_SLEEP
23688 +
23689 +static int setup_suspend_evtchn(void);
23690 +
23691 +/* Was last suspend request cancelled? */
23692 +static int suspend_cancelled;
23693 +
23694 static void xen_resume_notifier(int _suspend_cancelled)
23695 {
23696 int old_state = xchg(&shutting_down, SHUTDOWN_RESUMING);
23697 @@ -118,6 +120,10 @@ static int xen_suspend(void *__unused)
23698 return 0;
23699 }
23700
23701 +#else
23702 +# define xen_suspend NULL
23703 +#endif
23704 +
23705 static void switch_shutdown_state(int new_state)
23706 {
23707 int prev_state, old_state = SHUTDOWN_INVALID;
23708 @@ -194,8 +200,10 @@ static void shutdown_handler(struct xenb
23709 new_state = SHUTDOWN_POWEROFF;
23710 else if (strcmp(str, "reboot") == 0)
23711 ctrl_alt_del();
23712 +#ifdef CONFIG_PM_SLEEP
23713 else if (strcmp(str, "suspend") == 0)
23714 new_state = SHUTDOWN_SUSPEND;
23715 +#endif
23716 else if (strcmp(str, "halt") == 0)
23717 new_state = SHUTDOWN_HALT;
23718 else
23719 @@ -247,6 +255,7 @@ static struct xenbus_watch sysrq_watch =
23720 .callback = sysrq_handler
23721 };
23722
23723 +#ifdef CONFIG_PM_SLEEP
23724 static irqreturn_t suspend_int(int irq, void* dev_id)
23725 {
23726 switch_shutdown_state(SHUTDOWN_SUSPEND);
23727 @@ -274,6 +283,9 @@ static int setup_suspend_evtchn(void)
23728
23729 return 0;
23730 }
23731 +#else
23732 +#define setup_suspend_evtchn() 0
23733 +#endif
23734
23735 static int setup_shutdown_watcher(void)
23736 {
23737 --- sle11-2009-10-16.orig/drivers/xen/core/smpboot.c 2009-03-16 16:38:05.000000000 +0100
23738 +++ sle11-2009-10-16/drivers/xen/core/smpboot.c 2009-06-04 10:21:39.000000000 +0200
23739 @@ -27,6 +27,7 @@
23740
23741 extern irqreturn_t smp_reschedule_interrupt(int, void *);
23742 extern irqreturn_t smp_call_function_interrupt(int, void *);
23743 +extern irqreturn_t smp_call_function_single_interrupt(int, void *);
23744
23745 extern int local_setup_timer(unsigned int cpu);
23746 extern void local_teardown_timer(unsigned int cpu);
23747 @@ -50,8 +51,10 @@ EXPORT_PER_CPU_SYMBOL(cpu_info);
23748
23749 static DEFINE_PER_CPU(int, resched_irq);
23750 static DEFINE_PER_CPU(int, callfunc_irq);
23751 +static DEFINE_PER_CPU(int, call1func_irq);
23752 static char resched_name[NR_CPUS][15];
23753 static char callfunc_name[NR_CPUS][15];
23754 +static char call1func_name[NR_CPUS][15];
23755
23756 #ifdef CONFIG_X86_LOCAL_APIC
23757 #define set_cpu_to_apicid(cpu, apicid) (per_cpu(x86_cpu_to_apicid, cpu) = (apicid))
23758 @@ -73,15 +76,13 @@ void __init prefill_possible_map(void)
23759
23760 for (i = 0; i < NR_CPUS; i++) {
23761 rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
23762 - if (rc >= 0)
23763 + if (rc >= 0) {
23764 cpu_set(i, cpu_possible_map);
23765 + nr_cpu_ids = i + 1;
23766 + }
23767 }
23768 }
23769
23770 -void __init smp_alloc_memory(void)
23771 -{
23772 -}
23773 -
23774 static inline void
23775 set_cpu_sibling_map(unsigned int cpu)
23776 {
23777 @@ -110,7 +111,8 @@ static int __cpuinit xen_smp_intr_init(u
23778 {
23779 int rc;
23780
23781 - per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1;
23782 + per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) =
23783 + per_cpu(call1func_irq, cpu) = -1;
23784
23785 sprintf(resched_name[cpu], "resched%u", cpu);
23786 rc = bind_ipi_to_irqhandler(RESCHEDULE_VECTOR,
23787 @@ -134,6 +136,17 @@ static int __cpuinit xen_smp_intr_init(u
23788 goto fail;
23789 per_cpu(callfunc_irq, cpu) = rc;
23790
23791 + sprintf(call1func_name[cpu], "call1func%u", cpu);
23792 + rc = bind_ipi_to_irqhandler(CALL_FUNC_SINGLE_VECTOR,
23793 + cpu,
23794 + smp_call_function_single_interrupt,
23795 + IRQF_DISABLED|IRQF_NOBALANCING,
23796 + call1func_name[cpu],
23797 + NULL);
23798 + if (rc < 0)
23799 + goto fail;
23800 + per_cpu(call1func_irq, cpu) = rc;
23801 +
23802 rc = xen_spinlock_init(cpu);
23803 if (rc < 0)
23804 goto fail;
23805 @@ -148,6 +161,8 @@ static int __cpuinit xen_smp_intr_init(u
23806 unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
23807 if (per_cpu(callfunc_irq, cpu) >= 0)
23808 unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
23809 + if (per_cpu(call1func_irq, cpu) >= 0)
23810 + unbind_from_irqhandler(per_cpu(call1func_irq, cpu), NULL);
23811 xen_spinlock_cleanup(cpu);
23812 return rc;
23813 }
23814 @@ -160,6 +175,7 @@ static void __cpuexit xen_smp_intr_exit(
23815
23816 unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
23817 unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
23818 + unbind_from_irqhandler(per_cpu(call1func_irq, cpu), NULL);
23819 xen_spinlock_cleanup(cpu);
23820 }
23821 #endif
23822 @@ -167,11 +183,7 @@ static void __cpuexit xen_smp_intr_exit(
23823 void __cpuinit cpu_bringup(void)
23824 {
23825 cpu_init();
23826 -#ifdef __i386__
23827 identify_secondary_cpu(&current_cpu_data);
23828 -#else
23829 - identify_cpu(&current_cpu_data);
23830 -#endif
23831 touch_softlockup_watchdog();
23832 preempt_disable();
23833 local_irq_enable();
23834 @@ -251,9 +263,6 @@ void __init smp_prepare_cpus(unsigned in
23835 struct task_struct *idle;
23836 int apicid;
23837 struct vcpu_get_physid cpu_id;
23838 -#ifdef __x86_64__
23839 - struct desc_ptr *gdt_descr;
23840 -#endif
23841 void *gdt_addr;
23842
23843 apicid = 0;
23844 @@ -266,7 +275,7 @@ void __init smp_prepare_cpus(unsigned in
23845
23846 current_thread_info()->cpu = 0;
23847
23848 - for (cpu = 0; cpu < NR_CPUS; cpu++) {
23849 + for_each_possible_cpu (cpu) {
23850 cpus_clear(per_cpu(cpu_sibling_map, cpu));
23851 cpus_clear(per_cpu(cpu_core_map, cpu));
23852 }
23853 @@ -293,21 +302,10 @@ void __init smp_prepare_cpus(unsigned in
23854 if (IS_ERR(idle))
23855 panic("failed fork for CPU %d", cpu);
23856
23857 -#ifdef __x86_64__
23858 - gdt_descr = &cpu_gdt_descr[cpu];
23859 - gdt_descr->address = get_zeroed_page(GFP_KERNEL);
23860 - if (unlikely(!gdt_descr->address)) {
23861 - printk(KERN_CRIT "CPU%d failed to allocate GDT\n",
23862 - cpu);
23863 - continue;
23864 - }
23865 - gdt_descr->size = GDT_SIZE;
23866 - memcpy((void *)gdt_descr->address, cpu_gdt_table, GDT_SIZE);
23867 - gdt_addr = (void *)gdt_descr->address;
23868 -#else
23869 +#ifdef __i386__
23870 init_gdt(cpu);
23871 - gdt_addr = get_cpu_gdt_table(cpu);
23872 #endif
23873 + gdt_addr = get_cpu_gdt_table(cpu);
23874 make_page_readonly(gdt_addr, XENFEAT_writable_descriptor_tables);
23875
23876 apicid = cpu;
23877 @@ -353,8 +351,8 @@ void __init smp_prepare_boot_cpu(void)
23878 {
23879 #ifdef __i386__
23880 init_gdt(smp_processor_id());
23881 - switch_to_new_gdt();
23882 #endif
23883 + switch_to_new_gdt();
23884 prefill_possible_map();
23885 }
23886
23887 --- sle11-2009-10-16.orig/drivers/xen/core/spinlock.c 2009-03-16 16:33:40.000000000 +0100
23888 +++ sle11-2009-10-16/drivers/xen/core/spinlock.c 2009-06-04 10:36:24.000000000 +0200
23889 @@ -5,6 +5,8 @@
23890 * portions of this file.
23891 */
23892
23893 +#if CONFIG_XEN_COMPAT >= 0x030200
23894 +
23895 #include <linux/init.h>
23896 #include <linux/irq.h>
23897 #include <linux/kernel.h>
23898 @@ -73,9 +75,9 @@ int xen_spin_wait(raw_spinlock_t *lock,
23899 /* announce we're spinning */
23900 spinning.ticket = token;
23901 spinning.lock = lock;
23902 - spinning.prev = __get_cpu_var(spinning);
23903 + spinning.prev = x86_read_percpu(spinning);
23904 smp_wmb();
23905 - __get_cpu_var(spinning) = &spinning;
23906 + x86_write_percpu(spinning, &spinning);
23907
23908 /* clear pending */
23909 xen_clear_irq_pending(irq);
23910 @@ -102,7 +104,7 @@ int xen_spin_wait(raw_spinlock_t *lock,
23911 kstat_this_cpu.irqs[irq] += !rc;
23912
23913 /* announce we're done */
23914 - __get_cpu_var(spinning) = spinning.prev;
23915 + x86_write_percpu(spinning, spinning.prev);
23916 rm_lock = &__get_cpu_var(spinning_rm_lock);
23917 raw_local_irq_save(flags);
23918 __raw_write_lock(rm_lock);
23919 @@ -159,3 +161,5 @@ void xen_spin_kick(raw_spinlock_t *lock,
23920 }
23921 }
23922 EXPORT_SYMBOL(xen_spin_kick);
23923 +
23924 +#endif /* CONFIG_XEN_COMPAT >= 0x030200 */
23925 --- sle11-2009-10-16.orig/drivers/xen/fbfront/xenfb.c 2009-03-16 16:38:05.000000000 +0100
23926 +++ sle11-2009-10-16/drivers/xen/fbfront/xenfb.c 2009-06-04 10:21:39.000000000 +0200
23927 @@ -18,6 +18,7 @@
23928 * frame buffer.
23929 */
23930
23931 +#include <linux/console.h>
23932 #include <linux/kernel.h>
23933 #include <linux/errno.h>
23934 #include <linux/fb.h>
23935 @@ -545,6 +546,28 @@ static unsigned long vmalloc_to_mfn(void
23936 return pfn_to_mfn(vmalloc_to_pfn(address));
23937 }
23938
23939 +static __devinit void
23940 +xenfb_make_preferred_console(void)
23941 +{
23942 + struct console *c;
23943 +
23944 + if (console_set_on_cmdline)
23945 + return;
23946 +
23947 + acquire_console_sem();
23948 + for (c = console_drivers; c; c = c->next) {
23949 + if (!strcmp(c->name, "tty") && c->index == 0)
23950 + break;
23951 + }
23952 + release_console_sem();
23953 + if (c) {
23954 + unregister_console(c);
23955 + c->flags |= CON_CONSDEV;
23956 + c->flags &= ~CON_PRINTBUFFER; /* don't print again */
23957 + register_console(c);
23958 + }
23959 +}
23960 +
23961 static int __devinit xenfb_probe(struct xenbus_device *dev,
23962 const struct xenbus_device_id *id)
23963 {
23964 @@ -665,6 +688,7 @@ static int __devinit xenfb_probe(struct
23965 if (ret < 0)
23966 goto error;
23967
23968 + xenfb_make_preferred_console();
23969 return 0;
23970
23971 error_nomem:
23972 @@ -882,4 +906,5 @@ static void __exit xenfb_cleanup(void)
23973 module_init(xenfb_init);
23974 module_exit(xenfb_cleanup);
23975
23976 +MODULE_DESCRIPTION("Xen virtual framebuffer device frontend");
23977 MODULE_LICENSE("GPL");
23978 --- sle11-2009-10-16.orig/drivers/xen/fbfront/xenkbd.c 2009-03-04 11:25:55.000000000 +0100
23979 +++ sle11-2009-10-16/drivers/xen/fbfront/xenkbd.c 2009-06-04 10:21:39.000000000 +0200
23980 @@ -350,4 +350,5 @@ static void __exit xenkbd_cleanup(void)
23981 module_init(xenkbd_init);
23982 module_exit(xenkbd_cleanup);
23983
23984 +MODULE_DESCRIPTION("Xen virtual keyboard/pointer device frontend");
23985 MODULE_LICENSE("GPL");
23986 --- sle11-2009-10-16.orig/drivers/xen/gntdev/gntdev.c 2009-03-16 16:38:05.000000000 +0100
23987 +++ sle11-2009-10-16/drivers/xen/gntdev/gntdev.c 2009-06-04 10:21:39.000000000 +0200
23988 @@ -418,7 +418,7 @@ static int __init gntdev_init(void)
23989 }
23990
23991 device = device_create(class, NULL, MKDEV(gntdev_major, 0),
23992 - GNTDEV_NAME);
23993 + NULL, GNTDEV_NAME);
23994 if (IS_ERR(device)) {
23995 printk(KERN_ERR "Error creating gntdev device in xen_class\n");
23996 printk(KERN_ERR "gntdev created with major number = %d\n",
23997 --- sle11-2009-10-16.orig/drivers/xen/netfront/accel.c 2009-03-30 16:39:19.000000000 +0200
23998 +++ sle11-2009-10-16/drivers/xen/netfront/accel.c 2009-06-04 10:21:39.000000000 +0200
23999 @@ -28,6 +28,7 @@
24000 * IN THE SOFTWARE.
24001 */
24002
24003 +#include <linux/version.h>
24004 #include <linux/netdevice.h>
24005 #include <linux/skbuff.h>
24006 #include <linux/list.h>
24007 --- sle11-2009-10-16.orig/drivers/xen/netfront/netfront.c 2009-03-30 16:40:17.000000000 +0200
24008 +++ sle11-2009-10-16/drivers/xen/netfront/netfront.c 2009-06-04 10:21:39.000000000 +0200
24009 @@ -640,7 +640,7 @@ static int network_open(struct net_devic
24010 }
24011 spin_unlock_bh(&np->rx_lock);
24012
24013 - network_maybe_wake_tx(dev);
24014 + netif_start_queue(dev);
24015
24016 return 0;
24017 }
24018 --- sle11-2009-10-16.orig/drivers/xen/sfc_netback/accel.h 2009-03-30 16:00:09.000000000 +0200
24019 +++ sle11-2009-10-16/drivers/xen/sfc_netback/accel.h 2009-06-04 10:21:39.000000000 +0200
24020 @@ -25,6 +25,7 @@
24021 #ifndef NETBACK_ACCEL_H
24022 #define NETBACK_ACCEL_H
24023
24024 +#include <linux/version.h>
24025 #include <linux/slab.h>
24026 #include <linux/ip.h>
24027 #include <linux/tcp.h>
24028 --- sle11-2009-10-16.orig/drivers/xen/sfc_netfront/accel.h 2009-03-30 16:34:56.000000000 +0200
24029 +++ sle11-2009-10-16/drivers/xen/sfc_netfront/accel.h 2009-06-04 10:21:39.000000000 +0200
24030 @@ -35,6 +35,7 @@
24031 #include <xen/evtchn.h>
24032
24033 #include <linux/kernel.h>
24034 +#include <linux/version.h>
24035 #include <linux/list.h>
24036
24037 enum netfront_accel_post_status {
24038 --- sle11-2009-10-16.orig/drivers/xen/xenbus/xenbus_client.c 2009-03-24 10:13:17.000000000 +0100
24039 +++ sle11-2009-10-16/drivers/xen/xenbus/xenbus_client.c 2009-06-04 10:21:39.000000000 +0200
24040 @@ -150,7 +150,7 @@ int xenbus_watch_pathfmt(struct xenbus_d
24041 char *path;
24042
24043 va_start(ap, pathfmt);
24044 - path = kvasprintf(GFP_KERNEL, pathfmt, ap);
24045 + path = kvasprintf(GFP_NOIO | __GFP_HIGH, pathfmt, ap);
24046 va_end(ap);
24047
24048 if (!path) {
24049 --- sle11-2009-10-16.orig/drivers/xen/xenbus/xenbus_comms.c 2009-02-16 16:17:21.000000000 +0100
24050 +++ sle11-2009-10-16/drivers/xen/xenbus/xenbus_comms.c 2009-06-04 10:21:39.000000000 +0200
24051 @@ -228,14 +228,11 @@ int xb_init_comms(void)
24052 intf->rsp_cons = intf->rsp_prod;
24053 }
24054
24055 +#if defined(CONFIG_XEN) || defined(MODULE)
24056 if (xenbus_irq)
24057 unbind_from_irqhandler(xenbus_irq, &xb_waitq);
24058
24059 -#if defined(CONFIG_XEN) || defined(MODULE)
24060 err = bind_caller_port_to_irqhandler(
24061 -#else
24062 - err = bind_evtchn_to_irqhandler(
24063 -#endif
24064 xen_store_evtchn, wake_waiting,
24065 0, "xenbus", &xb_waitq);
24066 if (err <= 0) {
24067 @@ -244,6 +241,20 @@ int xb_init_comms(void)
24068 }
24069
24070 xenbus_irq = err;
24071 +#else
24072 + if (xenbus_irq) {
24073 + /* Already have an irq; assume we're resuming */
24074 + rebind_evtchn_irq(xen_store_evtchn, xenbus_irq);
24075 + } else {
24076 + err = bind_evtchn_to_irqhandler(xen_store_evtchn, wake_waiting,
24077 + 0, "xenbus", &xb_waitq);
24078 + if (err <= 0) {
24079 + printk(KERN_ERR "XENBUS request irq failed %i\n", err);
24080 + return err;
24081 + }
24082 + xenbus_irq = err;
24083 + }
24084 +#endif
24085
24086 return 0;
24087 }
24088 --- sle11-2009-10-16.orig/drivers/xen/xenbus/xenbus_probe.c 2009-03-16 16:38:05.000000000 +0100
24089 +++ sle11-2009-10-16/drivers/xen/xenbus/xenbus_probe.c 2009-06-04 10:21:39.000000000 +0200
24090 @@ -36,6 +36,7 @@
24091 __FUNCTION__, __LINE__, ##args)
24092
24093 #include <linux/kernel.h>
24094 +#include <linux/version.h>
24095 #include <linux/err.h>
24096 #include <linux/string.h>
24097 #include <linux/ctype.h>
24098 --- sle11-2009-10-16.orig/fs/aio.c 2009-03-24 10:13:25.000000000 +0100
24099 +++ sle11-2009-10-16/fs/aio.c 2009-06-04 10:21:39.000000000 +0200
24100 @@ -1335,7 +1335,7 @@ static int make_aio_fd(struct kioctx *io
24101 int fd;
24102 struct file *file;
24103
24104 - fd = anon_inode_getfd("[aioq]", &aioq_fops, ioctx);
24105 + fd = anon_inode_getfd("[aioq]", &aioq_fops, ioctx, 0);
24106 if (fd < 0)
24107 return fd;
24108
24109 --- sle11-2009-10-16.orig/include/asm-generic/pgtable.h 2009-03-04 11:28:34.000000000 +0100
24110 +++ sle11-2009-10-16/include/asm-generic/pgtable.h 2009-06-04 10:21:39.000000000 +0200
24111 @@ -99,10 +99,6 @@ static inline void ptep_set_wrprotect(st
24112 }
24113 #endif
24114
24115 -#ifndef arch_change_pte_range
24116 -#define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) 0
24117 -#endif
24118 -
24119 #ifndef __HAVE_ARCH_PTE_SAME
24120 #define pte_same(A,B) (pte_val(A) == pte_val(B))
24121 #endif
24122 --- sle11-2009-10-16.orig/include/asm-x86/dma-mapping.h 2009-03-16 16:38:05.000000000 +0100
24123 +++ sle11-2009-10-16/include/asm-x86/dma-mapping.h 2009-06-04 10:21:39.000000000 +0200
24124 @@ -74,7 +74,7 @@ static inline struct dma_mapping_ops *ge
24125 /* Make sure we keep the same behaviour */
24126 static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
24127 {
24128 -#ifdef CONFIG_X86_32
24129 +#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
24130 return 0;
24131 #else
24132 struct dma_mapping_ops *ops = get_dma_ops(dev);
24133 --- sle11-2009-10-16.orig/include/asm-x86/kexec.h 2008-12-01 11:11:08.000000000 +0100
24134 +++ sle11-2009-10-16/include/asm-x86/kexec.h 2009-06-04 10:21:39.000000000 +0200
24135 @@ -10,6 +10,7 @@
24136 # define VA_PTE_0 5
24137 # define PA_PTE_1 6
24138 # define VA_PTE_1 7
24139 +# ifndef CONFIG_XEN
24140 # define PA_SWAP_PAGE 8
24141 # ifdef CONFIG_X86_PAE
24142 # define PA_PMD_0 9
24143 @@ -20,6 +21,18 @@
24144 # else
24145 # define PAGES_NR 9
24146 # endif
24147 +# else /* CONFIG_XEN */
24148 +/*
24149 + * The hypervisor interface implicitly requires that all entries (except
24150 + * for possibly the final one) are arranged in matching PA_/VA_ pairs.
24151 + */
24152 +# define PA_PMD_0 8
24153 +# define VA_PMD_0 9
24154 +# define PA_PMD_1 10
24155 +# define VA_PMD_1 11
24156 +# define PA_SWAP_PAGE 12
24157 +# define PAGES_NR 13
24158 +# endif /* CONFIG_XEN */
24159 #else
24160 # define PA_CONTROL_PAGE 0
24161 # define VA_CONTROL_PAGE 1
24162 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/desc.h 2009-03-16 16:38:05.000000000 +0100
24163 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/desc.h 2009-06-04 10:21:39.000000000 +0200
24164 @@ -31,11 +31,17 @@ extern struct desc_ptr idt_descr;
24165 extern gate_desc idt_table[];
24166 #endif
24167
24168 +struct gdt_page {
24169 + struct desc_struct gdt[GDT_ENTRIES];
24170 +} __attribute__((aligned(PAGE_SIZE)));
24171 +DECLARE_PER_CPU(struct gdt_page, gdt_page);
24172 +
24173 +static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
24174 +{
24175 + return per_cpu(gdt_page, cpu).gdt;
24176 +}
24177 +
24178 #ifdef CONFIG_X86_64
24179 -extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
24180 -extern struct desc_ptr cpu_gdt_descr[];
24181 -/* the cpu gdt accessor */
24182 -#define get_cpu_gdt_table(x) ((struct desc_struct *)cpu_gdt_descr[x].address)
24183
24184 static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func,
24185 unsigned dpl, unsigned ist, unsigned seg)
24186 @@ -53,16 +59,6 @@ static inline void pack_gate(gate_desc *
24187 }
24188
24189 #else
24190 -struct gdt_page {
24191 - struct desc_struct gdt[GDT_ENTRIES];
24192 -} __attribute__((aligned(PAGE_SIZE)));
24193 -DECLARE_PER_CPU(struct gdt_page, gdt_page);
24194 -
24195 -static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
24196 -{
24197 - return per_cpu(gdt_page, cpu).gdt;
24198 -}
24199 -
24200 static inline void pack_gate(gate_desc *gate, unsigned char type,
24201 unsigned long base, unsigned dpl, unsigned flags,
24202 unsigned short seg)
24203 @@ -333,6 +329,28 @@ static inline void set_intr_gate(unsigne
24204 _set_gate(n, GATE_INTERRUPT, addr, 0, 0, __KERNEL_CS);
24205 }
24206
24207 +#define SYS_VECTOR_FREE 0
24208 +#define SYS_VECTOR_ALLOCED 1
24209 +
24210 +extern int first_system_vector;
24211 +extern char system_vectors[];
24212 +
24213 +static inline void alloc_system_vector(int vector)
24214 +{
24215 + if (system_vectors[vector] == SYS_VECTOR_FREE) {
24216 + system_vectors[vector] = SYS_VECTOR_ALLOCED;
24217 + if (first_system_vector > vector)
24218 + first_system_vector = vector;
24219 + } else
24220 + BUG();
24221 +}
24222 +
24223 +static inline void alloc_intr_gate(unsigned int n, void *addr)
24224 +{
24225 + alloc_system_vector(n);
24226 + set_intr_gate(n, addr);
24227 +}
24228 +
24229 /*
24230 * This routine sets up an interrupt gate at directory privilege level 3.
24231 */
24232 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/fixmap.h 2009-03-16 16:38:05.000000000 +0100
24233 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/fixmap.h 2009-06-04 10:21:39.000000000 +0200
24234 @@ -7,7 +7,58 @@
24235 # include "fixmap_64.h"
24236 #endif
24237
24238 +extern int fixmaps_set;
24239 +
24240 +void xen_set_fixmap(enum fixed_addresses, maddr_t, pgprot_t);
24241 +
24242 +static inline void __set_fixmap(enum fixed_addresses idx,
24243 + maddr_t phys, pgprot_t flags)
24244 +{
24245 + xen_set_fixmap(idx, phys, flags);
24246 +}
24247 +
24248 +#define set_fixmap(idx, phys) \
24249 + __set_fixmap(idx, phys, PAGE_KERNEL)
24250 +
24251 +/*
24252 + * Some hardware wants to get fixmapped without caching.
24253 + */
24254 +#define set_fixmap_nocache(idx, phys) \
24255 + __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
24256 +
24257 #define clear_fixmap(idx) \
24258 __set_fixmap(idx, 0, __pgprot(0))
24259
24260 +#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
24261 +#define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
24262 +
24263 +extern void __this_fixmap_does_not_exist(void);
24264 +
24265 +/*
24266 + * 'index to address' translation. If anyone tries to use the idx
24267 + * directly without translation, we catch the bug with a NULL-deference
24268 + * kernel oops. Illegal ranges of incoming indices are caught too.
24269 + */
24270 +static __always_inline unsigned long fix_to_virt(const unsigned int idx)
24271 +{
24272 + /*
24273 + * this branch gets completely eliminated after inlining,
24274 + * except when someone tries to use fixaddr indices in an
24275 + * illegal way. (such as mixing up address types or using
24276 + * out-of-range indices).
24277 + *
24278 + * If it doesn't get removed, the linker will complain
24279 + * loudly with a reasonably clear error message..
24280 + */
24281 + if (idx >= __end_of_fixed_addresses)
24282 + __this_fixmap_does_not_exist();
24283 +
24284 + return __fix_to_virt(idx);
24285 +}
24286 +
24287 +static inline unsigned long virt_to_fix(const unsigned long vaddr)
24288 +{
24289 + BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
24290 + return __virt_to_fix(vaddr);
24291 +}
24292 #endif
24293 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/fixmap_32.h 2009-03-16 16:38:05.000000000 +0100
24294 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/fixmap_32.h 2009-06-04 10:21:39.000000000 +0200
24295 @@ -58,10 +58,17 @@ enum fixed_addresses {
24296 #ifdef CONFIG_X86_LOCAL_APIC
24297 FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */
24298 #endif
24299 -#if defined(CONFIG_X86_IO_APIC) && !defined(CONFIG_XEN)
24300 +#ifndef CONFIG_XEN
24301 +#ifdef CONFIG_X86_IO_APIC
24302 FIX_IO_APIC_BASE_0,
24303 FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1,
24304 #endif
24305 +#else
24306 + FIX_SHARED_INFO,
24307 +#define NR_FIX_ISAMAPS 256
24308 + FIX_ISAMAP_END,
24309 + FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
24310 +#endif
24311 #ifdef CONFIG_X86_VISWS_APIC
24312 FIX_CO_CPU, /* Cobalt timer */
24313 FIX_CO_APIC, /* Cobalt APIC Redirection Table */
24314 @@ -78,51 +85,38 @@ enum fixed_addresses {
24315 FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
24316 FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
24317 #endif
24318 -#ifdef CONFIG_ACPI
24319 - FIX_ACPI_BEGIN,
24320 - FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
24321 -#endif
24322 #ifdef CONFIG_PCI_MMCONFIG
24323 FIX_PCIE_MCFG,
24324 #endif
24325 #ifdef CONFIG_PARAVIRT
24326 FIX_PARAVIRT_BOOTMAP,
24327 #endif
24328 - FIX_SHARED_INFO,
24329 -#define NR_FIX_ISAMAPS 256
24330 - FIX_ISAMAP_END,
24331 - FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
24332 __end_of_permanent_fixed_addresses,
24333 /*
24334 * 256 temporary boot-time mappings, used by early_ioremap(),
24335 * before ioremap() is functional.
24336 *
24337 - * We round it up to the next 512 pages boundary so that we
24338 + * We round it up to the next 256 pages boundary so that we
24339 * can have a single pgd entry and a single pte table:
24340 */
24341 #define NR_FIX_BTMAPS 64
24342 #define FIX_BTMAPS_NESTING 4
24343 - FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 512 -
24344 - (__end_of_permanent_fixed_addresses & 511),
24345 + FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 -
24346 + (__end_of_permanent_fixed_addresses & 255),
24347 FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_NESTING - 1,
24348 FIX_WP_TEST,
24349 +#ifdef CONFIG_ACPI
24350 + FIX_ACPI_BEGIN,
24351 + FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
24352 +#endif
24353 #ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
24354 FIX_OHCI1394_BASE,
24355 #endif
24356 __end_of_fixed_addresses
24357 };
24358
24359 -extern void __set_fixmap(enum fixed_addresses idx,
24360 - maddr_t phys, pgprot_t flags);
24361 extern void reserve_top_address(unsigned long reserve);
24362
24363 -#define set_fixmap(idx, phys) \
24364 - __set_fixmap(idx, phys, PAGE_KERNEL)
24365 -/*
24366 - * Some hardware wants to get fixmapped without caching.
24367 - */
24368 -#define set_fixmap_nocache(idx, phys) \
24369 - __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
24370
24371 #define FIXADDR_TOP ((unsigned long)__FIXADDR_TOP)
24372
24373 @@ -131,38 +125,5 @@ extern void reserve_top_address(unsigned
24374 #define FIXADDR_START (FIXADDR_TOP - __FIXADDR_SIZE)
24375 #define FIXADDR_BOOT_START (FIXADDR_TOP - __FIXADDR_BOOT_SIZE)
24376
24377 -#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
24378 -#define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
24379 -
24380 -extern void __this_fixmap_does_not_exist(void);
24381 -
24382 -/*
24383 - * 'index to address' translation. If anyone tries to use the idx
24384 - * directly without tranlation, we catch the bug with a NULL-deference
24385 - * kernel oops. Illegal ranges of incoming indices are caught too.
24386 - */
24387 -static __always_inline unsigned long fix_to_virt(const unsigned int idx)
24388 -{
24389 - /*
24390 - * this branch gets completely eliminated after inlining,
24391 - * except when someone tries to use fixaddr indices in an
24392 - * illegal way. (such as mixing up address types or using
24393 - * out-of-range indices).
24394 - *
24395 - * If it doesn't get removed, the linker will complain
24396 - * loudly with a reasonably clear error message..
24397 - */
24398 - if (idx >= __end_of_fixed_addresses)
24399 - __this_fixmap_does_not_exist();
24400 -
24401 - return __fix_to_virt(idx);
24402 -}
24403 -
24404 -static inline unsigned long virt_to_fix(const unsigned long vaddr)
24405 -{
24406 - BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
24407 - return __virt_to_fix(vaddr);
24408 -}
24409 -
24410 #endif /* !__ASSEMBLY__ */
24411 #endif
24412 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/fixmap_64.h 2009-03-16 16:38:05.000000000 +0100
24413 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/fixmap_64.h 2009-06-04 10:21:39.000000000 +0200
24414 @@ -12,6 +12,7 @@
24415 #define _ASM_FIXMAP_64_H
24416
24417 #include <linux/kernel.h>
24418 +#include <asm/acpi.h>
24419 #include <asm/apicdef.h>
24420 #include <asm/page.h>
24421 #include <asm/vsyscall.h>
24422 @@ -40,7 +41,6 @@ enum fixed_addresses {
24423 VSYSCALL_HPET,
24424 FIX_DBGP_BASE,
24425 FIX_EARLYCON_MEM_BASE,
24426 - FIX_HPET_BASE,
24427 #ifdef CONFIG_X86_LOCAL_APIC
24428 FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */
24429 #endif
24430 @@ -53,14 +53,21 @@ enum fixed_addresses {
24431 FIX_EFI_IO_MAP_FIRST_PAGE = FIX_EFI_IO_MAP_LAST_PAGE
24432 + MAX_EFI_IO_PAGES - 1,
24433 #endif
24434 +#ifdef CONFIG_PARAVIRT
24435 + FIX_PARAVIRT_BOOTMAP,
24436 +#else
24437 + FIX_SHARED_INFO,
24438 +#endif
24439 #ifdef CONFIG_ACPI
24440 FIX_ACPI_BEGIN,
24441 FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
24442 #endif
24443 - FIX_SHARED_INFO,
24444 #define NR_FIX_ISAMAPS 256
24445 FIX_ISAMAP_END,
24446 FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
24447 +#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
24448 + FIX_OHCI1394_BASE,
24449 +#endif
24450 __end_of_permanent_fixed_addresses,
24451 /*
24452 * 256 temporary boot-time mappings, used by early_ioremap(),
24453 @@ -71,27 +78,12 @@ enum fixed_addresses {
24454 */
24455 #define NR_FIX_BTMAPS 64
24456 #define FIX_BTMAPS_NESTING 4
24457 - FIX_BTMAP_END =
24458 - __end_of_permanent_fixed_addresses + 512 -
24459 + FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 512 -
24460 (__end_of_permanent_fixed_addresses & 511),
24461 FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_NESTING - 1,
24462 -#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
24463 - FIX_OHCI1394_BASE,
24464 -#endif
24465 __end_of_fixed_addresses
24466 };
24467
24468 -extern void __set_fixmap(enum fixed_addresses idx,
24469 - unsigned long phys, pgprot_t flags);
24470 -
24471 -#define set_fixmap(idx, phys) \
24472 - __set_fixmap(idx, phys, PAGE_KERNEL)
24473 -/*
24474 - * Some hardware wants to get fixmapped without caching.
24475 - */
24476 -#define set_fixmap_nocache(idx, phys) \
24477 - __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
24478 -
24479 #define FIXADDR_TOP (VSYSCALL_END-PAGE_SIZE)
24480 #define FIXADDR_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
24481 #define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE)
24482 @@ -100,30 +92,4 @@ extern void __set_fixmap(enum fixed_addr
24483 #define FIXADDR_USER_START ((unsigned long)VSYSCALL32_VSYSCALL)
24484 #define FIXADDR_USER_END (FIXADDR_USER_START + PAGE_SIZE)
24485
24486 -#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
24487 -
24488 -extern void __this_fixmap_does_not_exist(void);
24489 -
24490 -/*
24491 - * 'index to address' translation. If anyone tries to use the idx
24492 - * directly without translation, we catch the bug with a NULL-deference
24493 - * kernel oops. Illegal ranges of incoming indices are caught too.
24494 - */
24495 -static __always_inline unsigned long fix_to_virt(const unsigned int idx)
24496 -{
24497 - /*
24498 - * this branch gets completely eliminated after inlining,
24499 - * except when someone tries to use fixaddr indices in an
24500 - * illegal way. (such as mixing up address types or using
24501 - * out-of-range indices).
24502 - *
24503 - * If it doesn't get removed, the linker will complain
24504 - * loudly with a reasonably clear error message..
24505 - */
24506 - if (idx >= __end_of_fixed_addresses)
24507 - __this_fixmap_does_not_exist();
24508 -
24509 - return __fix_to_virt(idx);
24510 -}
24511 -
24512 #endif
24513 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/highmem.h 2009-03-16 16:38:05.000000000 +0100
24514 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/highmem.h 2009-06-04 10:21:39.000000000 +0200
24515 @@ -74,6 +74,9 @@ struct page *kmap_atomic_to_page(void *p
24516
24517 #define flush_cache_kmaps() do { } while (0)
24518
24519 +extern void add_highpages_with_active_regions(int nid, unsigned long start_pfn,
24520 + unsigned long end_pfn);
24521 +
24522 void clear_highpage(struct page *);
24523 static inline void clear_user_highpage(struct page *page, unsigned long vaddr)
24524 {
24525 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/hypercall.h 2009-02-16 16:18:36.000000000 +0100
24526 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/hypercall.h 2009-06-04 10:21:39.000000000 +0200
24527 @@ -323,9 +323,19 @@ static inline int __must_check
24528 HYPERVISOR_grant_table_op(
24529 unsigned int cmd, void *uop, unsigned int count)
24530 {
24531 + bool fixup = false;
24532 + int rc;
24533 +
24534 if (arch_use_lazy_mmu_mode())
24535 xen_multicall_flush(false);
24536 - return _hypercall3(int, grant_table_op, cmd, uop, count);
24537 +#ifdef GNTTABOP_map_grant_ref
24538 + if (cmd == GNTTABOP_map_grant_ref)
24539 +#endif
24540 + fixup = gnttab_pre_map_adjust(cmd, uop, count);
24541 + rc = _hypercall3(int, grant_table_op, cmd, uop, count);
24542 + if (rc == 0 && fixup)
24543 + rc = gnttab_post_map_adjust(uop, count);
24544 + return rc;
24545 }
24546
24547 static inline int __must_check
24548 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/hypervisor.h 2009-03-16 16:33:40.000000000 +0100
24549 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/hypervisor.h 2009-06-04 10:21:39.000000000 +0200
24550 @@ -35,7 +35,6 @@
24551
24552 #include <linux/types.h>
24553 #include <linux/kernel.h>
24554 -#include <linux/version.h>
24555 #include <linux/errno.h>
24556 #include <xen/interface/xen.h>
24557 #include <xen/interface/platform.h>
24558 @@ -112,6 +111,8 @@ int xen_create_contiguous_region(
24559 unsigned long vstart, unsigned int order, unsigned int address_bits);
24560 void xen_destroy_contiguous_region(
24561 unsigned long vstart, unsigned int order);
24562 +int early_create_contiguous_region(unsigned long pfn, unsigned int order,
24563 + unsigned int address_bits);
24564
24565 struct page;
24566
24567 @@ -181,6 +182,29 @@ static inline void xen_multicall_flush(b
24568
24569 #endif /* CONFIG_XEN && !MODULE */
24570
24571 +#ifdef CONFIG_XEN
24572 +
24573 +struct gnttab_map_grant_ref;
24574 +bool gnttab_pre_map_adjust(unsigned int cmd, struct gnttab_map_grant_ref *,
24575 + unsigned int count);
24576 +#if CONFIG_XEN_COMPAT < 0x030400
24577 +int gnttab_post_map_adjust(const struct gnttab_map_grant_ref *, unsigned int);
24578 +#else
24579 +static inline int gnttab_post_map_adjust(const struct gnttab_map_grant_ref *m,
24580 + unsigned int count)
24581 +{
24582 + BUG();
24583 + return -ENOSYS;
24584 +}
24585 +#endif
24586 +
24587 +#else /* !CONFIG_XEN */
24588 +
24589 +#define gnttab_pre_map_adjust(...) false
24590 +#define gnttab_post_map_adjust(...) ({ BUG(); -ENOSYS; })
24591 +
24592 +#endif /* CONFIG_XEN */
24593 +
24594 #if defined(CONFIG_X86_64)
24595 #define MULTI_UVMFLAGS_INDEX 2
24596 #define MULTI_UVMDOMID_INDEX 3
24597 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/io.h 2009-03-16 16:38:05.000000000 +0100
24598 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/io.h 2009-09-24 11:02:00.000000000 +0200
24599 @@ -3,20 +3,140 @@
24600
24601 #define ARCH_HAS_IOREMAP_WC
24602
24603 +#include <linux/compiler.h>
24604 +
24605 +/*
24606 + * early_ioremap() and early_iounmap() are for temporary early boot-time
24607 + * mappings, before the real ioremap() is functional.
24608 + * A boot-time mapping is currently limited to at most 16 pages.
24609 + */
24610 +#ifndef __ASSEMBLY__
24611 +extern void early_ioremap_init(void);
24612 +extern void early_ioremap_clear(void);
24613 +extern void early_ioremap_reset(void);
24614 +extern void *early_ioremap(unsigned long offset, unsigned long size);
24615 +extern void early_iounmap(void *addr, unsigned long size);
24616 +extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
24617 +#endif
24618 +
24619 +#define build_mmio_read(name, size, type, reg, barrier) \
24620 +static inline type name(const volatile void __iomem *addr) \
24621 +{ type ret; asm volatile("mov" size " %1,%0":reg (ret) \
24622 +:"m" (*(volatile type __force *)addr) barrier); return ret; }
24623 +
24624 +#define build_mmio_write(name, size, type, reg, barrier) \
24625 +static inline void name(type val, volatile void __iomem *addr) \
24626 +{ asm volatile("mov" size " %0,%1": :reg (val), \
24627 +"m" (*(volatile type __force *)addr) barrier); }
24628 +
24629 +build_mmio_read(readb, "b", unsigned char, "=q", :"memory")
24630 +build_mmio_read(readw, "w", unsigned short, "=r", :"memory")
24631 +build_mmio_read(readl, "l", unsigned int, "=r", :"memory")
24632 +
24633 +build_mmio_read(__readb, "b", unsigned char, "=q", )
24634 +build_mmio_read(__readw, "w", unsigned short, "=r", )
24635 +build_mmio_read(__readl, "l", unsigned int, "=r", )
24636 +
24637 +build_mmio_write(writeb, "b", unsigned char, "q", :"memory")
24638 +build_mmio_write(writew, "w", unsigned short, "r", :"memory")
24639 +build_mmio_write(writel, "l", unsigned int, "r", :"memory")
24640 +
24641 +build_mmio_write(__writeb, "b", unsigned char, "q", )
24642 +build_mmio_write(__writew, "w", unsigned short, "r", )
24643 +build_mmio_write(__writel, "l", unsigned int, "r", )
24644 +
24645 +#define readb_relaxed(a) __readb(a)
24646 +#define readw_relaxed(a) __readw(a)
24647 +#define readl_relaxed(a) __readl(a)
24648 +#define __raw_readb __readb
24649 +#define __raw_readw __readw
24650 +#define __raw_readl __readl
24651 +
24652 +#define __raw_writeb __writeb
24653 +#define __raw_writew __writew
24654 +#define __raw_writel __writel
24655 +
24656 +#define mmiowb() barrier()
24657 +
24658 +#ifdef CONFIG_X86_64
24659 +build_mmio_read(readq, "q", unsigned long, "=r", :"memory")
24660 +build_mmio_read(__readq, "q", unsigned long, "=r", )
24661 +build_mmio_write(writeq, "q", unsigned long, "r", :"memory")
24662 +build_mmio_write(__writeq, "q", unsigned long, "r", )
24663 +
24664 +#define readq_relaxed(a) __readq(a)
24665 +#define __raw_readq __readq
24666 +#define __raw_writeq writeq
24667 +
24668 +/* Let people know we have them */
24669 +#define readq readq
24670 +#define writeq writeq
24671 +#endif
24672 +
24673 +#define native_io_delay xen_io_delay
24674 +
24675 #ifdef CONFIG_X86_32
24676 -# include "io_32.h"
24677 +# include "../../io_32.h"
24678 #else
24679 -# include "io_64.h"
24680 +# include "../../io_64.h"
24681 +#endif
24682 +
24683 +#if defined(__KERNEL__) && !defined(__ASSEMBLY__)
24684 +
24685 +/* We will be supplying our own /dev/mem implementation */
24686 +#define ARCH_HAS_DEV_MEM
24687 +
24688 +#define page_to_pseudophys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT)
24689 +#undef page_to_phys
24690 +#define page_to_phys(page) (phys_to_machine(page_to_pseudophys(page)))
24691 +#define page_to_bus(page) (phys_to_machine(page_to_pseudophys(page)))
24692 +
24693 +#define bvec_to_pseudophys(bv) (page_to_pseudophys((bv)->bv_page) + \
24694 + (unsigned long) (bv)->bv_offset)
24695 +
24696 +#define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \
24697 + (bvec_to_phys(vec1) + (vec1)->bv_len == bvec_to_phys(vec2) \
24698 + && bvec_to_pseudophys(vec1) + (vec1)->bv_len \
24699 + == bvec_to_pseudophys(vec2))
24700 +
24701 +#undef virt_to_bus
24702 +#undef bus_to_virt
24703 +#define virt_to_bus(_x) phys_to_machine(__pa(_x))
24704 +#define bus_to_virt(_x) __va(machine_to_phys(_x))
24705 +
24706 +#include <asm/fixmap.h>
24707 +
24708 +#undef __ISA_IO_base
24709 +#undef isa_virt_to_bus
24710 +#undef isa_page_to_bus
24711 +#undef isa_bus_to_virt
24712 +#define isa_virt_to_bus(_x) ({ \
24713 + unsigned long _va_ = (unsigned long)(_x); \
24714 + _va_ - fix_to_virt(FIX_ISAMAP_BEGIN) < (NR_FIX_ISAMAPS << PAGE_SHIFT) \
24715 + ? _va_ - fix_to_virt(FIX_ISAMAP_BEGIN) \
24716 + : ({ BUG(); (unsigned long)virt_to_bus(_va_); }); })
24717 +#define isa_bus_to_virt(_x) ((void *)fix_to_virt(FIX_ISAMAP_BEGIN) + (_x))
24718 +
24719 #endif
24720
24721 extern void *xlate_dev_mem_ptr(unsigned long phys);
24722 extern void unxlate_dev_mem_ptr(unsigned long phys, void *addr);
24723
24724 -extern void map_devmem(unsigned long pfn, unsigned long len, pgprot_t);
24725 -extern void unmap_devmem(unsigned long pfn, unsigned long len, pgprot_t);
24726 -
24727 extern int ioremap_check_change_attr(unsigned long mfn, unsigned long size,
24728 unsigned long prot_val);
24729 extern void __iomem *ioremap_wc(unsigned long offset, unsigned long size);
24730
24731 +/*
24732 + * early_ioremap() and early_iounmap() are for temporary early boot-time
24733 + * mappings, before the real ioremap() is functional.
24734 + * A boot-time mapping is currently limited to at most 16 pages.
24735 + */
24736 +extern void early_ioremap_init(void);
24737 +extern void early_ioremap_clear(void);
24738 +extern void early_ioremap_reset(void);
24739 +extern void *early_ioremap(unsigned long offset, unsigned long size);
24740 +extern void early_iounmap(void *addr, unsigned long size);
24741 +extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
24742 +
24743 +
24744 #endif /* _ASM_X86_IO_H */
24745 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
24746 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/irq_vectors.h 2009-06-04 10:21:39.000000000 +0200
24747 @@ -0,0 +1,52 @@
24748 +#ifndef _ASM_IRQ_VECTORS_H
24749 +#define _ASM_IRQ_VECTORS_H
24750 +
24751 +#ifdef CONFIG_X86_32
24752 +# define SYSCALL_VECTOR 0x80
24753 +#else
24754 +# define IA32_SYSCALL_VECTOR 0x80
24755 +#endif
24756 +
24757 +#define RESCHEDULE_VECTOR 0
24758 +#define CALL_FUNCTION_VECTOR 1
24759 +#define CALL_FUNC_SINGLE_VECTOR 2
24760 +#define SPIN_UNLOCK_VECTOR 3
24761 +#define NR_IPIS 4
24762 +
24763 +/*
24764 + * The maximum number of vectors supported by i386 processors
24765 + * is limited to 256. For processors other than i386, NR_VECTORS
24766 + * should be changed accordingly.
24767 + */
24768 +#define NR_VECTORS 256
24769 +
24770 +#define FIRST_VM86_IRQ 3
24771 +#define LAST_VM86_IRQ 15
24772 +#define invalid_vm86_irq(irq) ((irq) < 3 || (irq) > 15)
24773 +
24774 +/*
24775 + * The flat IRQ space is divided into two regions:
24776 + * 1. A one-to-one mapping of real physical IRQs. This space is only used
24777 + * if we have physical device-access privilege. This region is at the
24778 + * start of the IRQ space so that existing device drivers do not need
24779 + * to be modified to translate physical IRQ numbers into our IRQ space.
24780 + * 2. A dynamic mapping of inter-domain and Xen-sourced virtual IRQs. These
24781 + * are bound using the provided bind/unbind functions.
24782 + */
24783 +
24784 +#define PIRQ_BASE 0
24785 +#if defined(NR_CPUS) && defined(MAX_IO_APICS)
24786 +# if NR_CPUS < MAX_IO_APICS
24787 +# define NR_PIRQS (NR_VECTORS + 32 * NR_CPUS)
24788 +# else
24789 +# define NR_PIRQS (NR_VECTORS + 32 * MAX_IO_APICS)
24790 +# endif
24791 +#endif
24792 +
24793 +#define DYNIRQ_BASE (PIRQ_BASE + NR_PIRQS)
24794 +#define NR_DYNIRQS 256
24795 +
24796 +#define NR_IRQS (NR_PIRQS + NR_DYNIRQS)
24797 +#define NR_IRQ_VECTORS NR_IRQS
24798 +
24799 +#endif /* _ASM_IRQ_VECTORS_H */
24800 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/irqflags.h 2009-03-16 16:38:05.000000000 +0100
24801 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/irqflags.h 2009-06-04 10:21:39.000000000 +0200
24802 @@ -118,7 +118,7 @@ static inline void halt(void)
24803
24804 #ifndef CONFIG_X86_64
24805 #define INTERRUPT_RETURN iret
24806 -#define ENABLE_INTERRUPTS_SYSCALL_RET __ENABLE_INTERRUPTS ; \
24807 +#define ENABLE_INTERRUPTS_SYSEXIT __ENABLE_INTERRUPTS ; \
24808 sysexit_scrit: /**** START OF SYSEXIT CRITICAL REGION ****/ ; \
24809 __TEST_PENDING ; \
24810 jnz 14f /* process more events if necessary... */ ; \
24811 @@ -175,18 +175,6 @@ static inline void trace_hardirqs_fixup_
24812 #else
24813
24814 #ifdef CONFIG_X86_64
24815 -/*
24816 - * Currently paravirt can't handle swapgs nicely when we
24817 - * don't have a stack we can rely on (such as a user space
24818 - * stack). So we either find a way around these or just fault
24819 - * and emulate if a guest tries to call swapgs directly.
24820 - *
24821 - * Either way, this is a good way to document that we don't
24822 - * have a reliable stack. x86_64 only.
24823 - */
24824 -#define SWAPGS_UNSAFE_STACK swapgs
24825 -#define ARCH_TRACE_IRQS_ON call trace_hardirqs_on_thunk
24826 -#define ARCH_TRACE_IRQS_OFF call trace_hardirqs_off_thunk
24827 #define ARCH_LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk
24828 #define ARCH_LOCKDEP_SYS_EXIT_IRQ \
24829 TRACE_IRQS_ON; \
24830 @@ -198,24 +186,6 @@ static inline void trace_hardirqs_fixup_
24831 TRACE_IRQS_OFF;
24832
24833 #else
24834 -#define ARCH_TRACE_IRQS_ON \
24835 - pushl %eax; \
24836 - pushl %ecx; \
24837 - pushl %edx; \
24838 - call trace_hardirqs_on; \
24839 - popl %edx; \
24840 - popl %ecx; \
24841 - popl %eax;
24842 -
24843 -#define ARCH_TRACE_IRQS_OFF \
24844 - pushl %eax; \
24845 - pushl %ecx; \
24846 - pushl %edx; \
24847 - call trace_hardirqs_off; \
24848 - popl %edx; \
24849 - popl %ecx; \
24850 - popl %eax;
24851 -
24852 #define ARCH_LOCKDEP_SYS_EXIT \
24853 pushl %eax; \
24854 pushl %ecx; \
24855 @@ -229,8 +199,8 @@ static inline void trace_hardirqs_fixup_
24856 #endif
24857
24858 #ifdef CONFIG_TRACE_IRQFLAGS
24859 -# define TRACE_IRQS_ON ARCH_TRACE_IRQS_ON
24860 -# define TRACE_IRQS_OFF ARCH_TRACE_IRQS_OFF
24861 +# define TRACE_IRQS_ON call trace_hardirqs_on_thunk;
24862 +# define TRACE_IRQS_OFF call trace_hardirqs_off_thunk;
24863 #else
24864 # define TRACE_IRQS_ON
24865 # define TRACE_IRQS_OFF
24866 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/mmu_context.h 2009-02-16 16:18:36.000000000 +0100
24867 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/mmu_context.h 2009-06-04 10:21:39.000000000 +0200
24868 @@ -1,5 +1,42 @@
24869 +#ifndef __ASM_X86_MMU_CONTEXT_H
24870 +#define __ASM_X86_MMU_CONTEXT_H
24871 +
24872 +#include <asm/desc.h>
24873 +#include <asm/atomic.h>
24874 +#include <asm/pgalloc.h>
24875 +#include <asm/tlbflush.h>
24876 +
24877 +void arch_exit_mmap(struct mm_struct *mm);
24878 +void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
24879 +
24880 +void mm_pin(struct mm_struct *mm);
24881 +void mm_unpin(struct mm_struct *mm);
24882 +void mm_pin_all(void);
24883 +
24884 +static inline void xen_activate_mm(struct mm_struct *prev,
24885 + struct mm_struct *next)
24886 +{
24887 + if (!PagePinned(virt_to_page(next->pgd)))
24888 + mm_pin(next);
24889 +}
24890 +
24891 +/*
24892 + * Used for LDT copy/destruction.
24893 + */
24894 +int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
24895 +void destroy_context(struct mm_struct *mm);
24896 +
24897 #ifdef CONFIG_X86_32
24898 # include "mmu_context_32.h"
24899 #else
24900 # include "mmu_context_64.h"
24901 #endif
24902 +
24903 +#define activate_mm(prev, next) \
24904 +do { \
24905 + xen_activate_mm(prev, next); \
24906 + switch_mm((prev), (next), NULL); \
24907 +} while (0);
24908 +
24909 +
24910 +#endif /* __ASM_X86_MMU_CONTEXT_H */
24911 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/mmu_context_32.h 2009-03-16 16:38:05.000000000 +0100
24912 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/mmu_context_32.h 2009-06-04 10:21:39.000000000 +0200
24913 @@ -1,32 +1,6 @@
24914 #ifndef __I386_SCHED_H
24915 #define __I386_SCHED_H
24916
24917 -#include <asm/desc.h>
24918 -#include <asm/atomic.h>
24919 -#include <asm/pgalloc.h>
24920 -#include <asm/tlbflush.h>
24921 -
24922 -void arch_exit_mmap(struct mm_struct *mm);
24923 -void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
24924 -
24925 -void mm_pin(struct mm_struct *mm);
24926 -void mm_unpin(struct mm_struct *mm);
24927 -void mm_pin_all(void);
24928 -
24929 -static inline void xen_activate_mm(struct mm_struct *prev,
24930 - struct mm_struct *next)
24931 -{
24932 - if (!PagePinned(virt_to_page(next->pgd)))
24933 - mm_pin(next);
24934 -}
24935 -
24936 -/*
24937 - * Used for LDT copy/destruction.
24938 - */
24939 -int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
24940 -void destroy_context(struct mm_struct *mm);
24941 -
24942 -
24943 static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
24944 {
24945 #if 0 /* XEN: no lazy tlb */
24946 @@ -107,10 +81,4 @@ static inline void switch_mm(struct mm_s
24947 #define deactivate_mm(tsk, mm) \
24948 asm("movl %0,%%gs": :"r" (0));
24949
24950 -#define activate_mm(prev, next) \
24951 -do { \
24952 - xen_activate_mm(prev, next); \
24953 - switch_mm((prev), (next), NULL); \
24954 -} while (0)
24955 -
24956 #endif
24957 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/mmu_context_64.h 2009-03-16 16:38:05.000000000 +0100
24958 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/mmu_context_64.h 2009-06-04 10:21:39.000000000 +0200
24959 @@ -1,23 +1,6 @@
24960 #ifndef __X86_64_MMU_CONTEXT_H
24961 #define __X86_64_MMU_CONTEXT_H
24962
24963 -#include <asm/desc.h>
24964 -#include <asm/atomic.h>
24965 -#include <asm/pgalloc.h>
24966 -#include <asm/page.h>
24967 -#include <asm/pda.h>
24968 -#include <asm/pgtable.h>
24969 -#include <asm/tlbflush.h>
24970 -
24971 -void arch_exit_mmap(struct mm_struct *mm);
24972 -void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
24973 -
24974 -/*
24975 - * possibly do the LDT unload here?
24976 - */
24977 -int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
24978 -void destroy_context(struct mm_struct *mm);
24979 -
24980 static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
24981 {
24982 #if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
24983 @@ -58,10 +41,6 @@ static inline void __prepare_arch_switch
24984 }
24985 }
24986
24987 -extern void mm_pin(struct mm_struct *mm);
24988 -extern void mm_unpin(struct mm_struct *mm);
24989 -void mm_pin_all(void);
24990 -
24991 static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
24992 struct task_struct *tsk)
24993 {
24994 @@ -124,11 +103,4 @@ do { \
24995 asm volatile("movl %0,%%fs"::"r"(0)); \
24996 } while (0)
24997
24998 -static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next)
24999 -{
25000 - if (!PagePinned(virt_to_page(next->pgd)))
25001 - mm_pin(next);
25002 - switch_mm(prev, next, NULL);
25003 -}
25004 -
25005 #endif
25006 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/page.h 2009-03-16 16:38:05.000000000 +0100
25007 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/page.h 2009-06-04 10:21:39.000000000 +0200
25008 @@ -16,9 +16,9 @@
25009 * below. The preprocessor will warn if the two definitions aren't identical.
25010 */
25011 #define _PAGE_BIT_PRESENT 0
25012 -#define _PAGE_PRESENT (_AC(1, L)<<_PAGE_BIT_PRESENT)
25013 -#define _PAGE_BIT_IO 9
25014 -#define _PAGE_IO (_AC(1, L)<<_PAGE_BIT_IO)
25015 +#define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT)
25016 +#define _PAGE_BIT_IO 11
25017 +#define _PAGE_IO (_AT(pteval_t, 1) << _PAGE_BIT_IO)
25018
25019 #define __PHYSICAL_MASK ((phys_addr_t)(1ULL << __PHYSICAL_MASK_SHIFT) - 1)
25020 #define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1)
25021 @@ -28,8 +28,11 @@
25022 (ie, 32-bit PAE). */
25023 #define PHYSICAL_PAGE_MASK (((signed long)PAGE_MASK) & __PHYSICAL_MASK)
25024
25025 -/* PTE_MASK extracts the PFN from a (pte|pmd|pud|pgd)val_t */
25026 -#define PTE_MASK ((pteval_t)PHYSICAL_PAGE_MASK)
25027 +/* PTE_PFN_MASK extracts the PFN from a (pte|pmd|pud|pgd)val_t */
25028 +#define PTE_PFN_MASK ((pteval_t)PHYSICAL_PAGE_MASK)
25029 +
25030 +/* PTE_FLAGS_MASK extracts the flags from a (pte|pmd|pud|pgd)val_t */
25031 +#define PTE_FLAGS_MASK (~PTE_PFN_MASK)
25032
25033 #define PMD_PAGE_SIZE (_AC(1, UL) << PMD_SHIFT)
25034 #define PMD_PAGE_MASK (~(PMD_PAGE_SIZE-1))
25035 @@ -39,8 +42,7 @@
25036 #define HPAGE_MASK (~(HPAGE_SIZE - 1))
25037 #define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
25038
25039 -/* to align the pointer to the (next) page boundary */
25040 -#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK)
25041 +#define HUGE_MAX_HSTATE 2
25042
25043 #ifndef __ASSEMBLY__
25044 #include <linux/types.h>
25045 @@ -61,9 +63,17 @@
25046
25047 #ifndef __ASSEMBLY__
25048
25049 +typedef struct { pgdval_t pgd; } pgd_t;
25050 +typedef struct { pgprotval_t pgprot; } pgprot_t;
25051 +
25052 extern int page_is_ram(unsigned long pagenr);
25053 extern int devmem_is_allowed(unsigned long pagenr);
25054 +extern void map_devmem(unsigned long pfn, unsigned long size,
25055 + pgprot_t vma_prot);
25056 +extern void unmap_devmem(unsigned long pfn, unsigned long size,
25057 + pgprot_t vma_prot);
25058
25059 +extern unsigned long max_low_pfn_mapped;
25060 extern unsigned long max_pfn_mapped;
25061
25062 struct page;
25063 @@ -84,15 +94,11 @@ static inline void copy_user_page(void *
25064 alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr)
25065 #define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
25066
25067 -typedef struct { pgprotval_t pgprot; } pgprot_t;
25068 -
25069 #define pgprot_val(x) ((x).pgprot)
25070 #define __pgprot(x) ((pgprot_t) { (x) } )
25071
25072 #include <asm/maddr.h>
25073
25074 -typedef struct { pgdval_t pgd; } pgd_t;
25075 -
25076 #define __pgd_ma(x) ((pgd_t) { (x) } )
25077 static inline pgd_t xen_make_pgd(pgdval_t val)
25078 {
25079 @@ -196,6 +202,11 @@ static inline pteval_t xen_pte_val(pte_t
25080 return ret;
25081 }
25082
25083 +static inline pteval_t xen_pte_flags(pte_t pte)
25084 +{
25085 + return __pte_val(pte) & PTE_FLAGS_MASK;
25086 +}
25087 +
25088 #define pgd_val(x) xen_pgd_val(x)
25089 #define __pgd(x) xen_make_pgd(x)
25090
25091 @@ -210,6 +221,7 @@ static inline pteval_t xen_pte_val(pte_t
25092 #endif
25093
25094 #define pte_val(x) xen_pte_val(x)
25095 +#define pte_flags(x) xen_pte_flags(x)
25096 #define __pte(x) xen_make_pte(x)
25097
25098 #define __pa(x) __phys_addr((unsigned long)(x))
25099 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/page_64.h 2009-03-16 16:38:05.000000000 +0100
25100 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/page_64.h 2009-06-04 10:21:39.000000000 +0200
25101 @@ -26,6 +26,12 @@
25102 #define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT)
25103 #define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1))
25104
25105 +/*
25106 + * Set __PAGE_OFFSET to the most negative possible address +
25107 + * PGDIR_SIZE*16 (pgd slot 272). The gap is to allow a space for a
25108 + * hypervisor to fit. Choosing 16 slots here is arbitrary, but it's
25109 + * what Xen requires.
25110 + */
25111 #define __PAGE_OFFSET _AC(0xffff880000000000, UL)
25112
25113 #define __PHYSICAL_START CONFIG_PHYSICAL_START
25114 @@ -63,7 +69,8 @@
25115 void clear_page(void *page);
25116 void copy_page(void *to, void *from);
25117
25118 -extern unsigned long end_pfn;
25119 +/* duplicate of the one in bootmem.h */
25120 +extern unsigned long max_pfn;
25121
25122 static inline unsigned long __phys_addr(unsigned long x)
25123 {
25124 @@ -91,6 +98,11 @@ typedef union { pteval_t pte; unsigned i
25125 extern unsigned long init_memory_mapping(unsigned long start,
25126 unsigned long end);
25127
25128 +extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn);
25129 +
25130 +extern void init_extra_mapping_uc(unsigned long phys, unsigned long size);
25131 +extern void init_extra_mapping_wb(unsigned long phys, unsigned long size);
25132 +
25133 #endif /* !__ASSEMBLY__ */
25134
25135 #ifdef CONFIG_FLATMEM
25136 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/pci.h 2009-03-16 16:38:05.000000000 +0100
25137 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/pci.h 2009-06-04 10:21:39.000000000 +0200
25138 @@ -21,6 +21,8 @@ struct pci_sysdata {
25139 #endif
25140 };
25141
25142 +extern int pci_routeirq;
25143 +
25144 /* scan a bus after allocating a pci_sysdata for it */
25145 extern struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops,
25146 int node);
25147 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/pci_32.h 2009-02-16 16:18:36.000000000 +0100
25148 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/pci_32.h 2009-06-04 10:21:39.000000000 +0200
25149 @@ -38,12 +38,14 @@ struct pci_dev;
25150 #define PCI_DMA_BUS_IS_PHYS (1)
25151
25152 /* pci_unmap_{page,single} is a nop so... */
25153 -#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME)
25154 -#define DECLARE_PCI_UNMAP_LEN(LEN_NAME)
25155 -#define pci_unmap_addr(PTR, ADDR_NAME) (0)
25156 -#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) do { } while (0)
25157 -#define pci_unmap_len(PTR, LEN_NAME) (0)
25158 -#define pci_unmap_len_set(PTR, LEN_NAME, VAL) do { } while (0)
25159 +#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) dma_addr_t ADDR_NAME[0];
25160 +#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) unsigned LEN_NAME[0];
25161 +#define pci_unmap_addr(PTR, ADDR_NAME) sizeof((PTR)->ADDR_NAME)
25162 +#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \
25163 + do { break; } while (pci_unmap_addr(PTR, ADDR_NAME))
25164 +#define pci_unmap_len(PTR, LEN_NAME) sizeof((PTR)->LEN_NAME)
25165 +#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \
25166 + do { break; } while (pci_unmap_len(PTR, LEN_NAME))
25167
25168 #endif
25169
25170 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/pgalloc.h 2009-03-16 16:38:05.000000000 +0100
25171 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/pgalloc.h 2009-06-04 10:21:39.000000000 +0200
25172 @@ -7,6 +7,9 @@
25173
25174 #include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */
25175
25176 +static inline int paravirt_pgd_alloc(struct mm_struct *mm) { return 0; }
25177 +static inline void paravirt_pgd_free(struct mm_struct *mm, pgd_t *pgd) {}
25178 +
25179 static inline void paravirt_alloc_pte(struct mm_struct *mm, unsigned long pfn) {}
25180 static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn) {}
25181 static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long clonepfn,
25182 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/pgtable.h 2009-03-16 16:38:05.000000000 +0100
25183 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/pgtable.h 2009-06-04 10:21:39.000000000 +0200
25184 @@ -13,11 +13,12 @@
25185 #define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */
25186 #define _PAGE_BIT_PAT 7 /* on 4KB pages */
25187 #define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
25188 -#define _PAGE_BIT_IO 9 /* Mapped page is I/O or foreign and
25189 +#define _PAGE_BIT_UNUSED1 9 /* available for programmer */
25190 +#define _PAGE_BIT_UNUSED2 10
25191 +#define _PAGE_BIT_IO 11 /* Mapped page is I/O or foreign and
25192 * has no associated page struct. */
25193 -#define _PAGE_BIT_UNUSED2 10 /* available for programmer */
25194 -#define _PAGE_BIT_UNUSED3 11
25195 #define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
25196 +#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1
25197 #define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
25198
25199 /* If _PAGE_BIT_PRESENT is clear, we use these: */
25200 @@ -28,34 +29,31 @@
25201 /* if the user mapped it with PROT_NONE; pte_present gives true */
25202 #define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL
25203
25204 -/*
25205 - * Note: we use _AC(1, L) instead of _AC(1, UL) so that we get a
25206 - * sign-extended value on 32-bit with all 1's in the upper word,
25207 - * which preserves the upper pte values on 64-bit ptes:
25208 - */
25209 -#define _PAGE_PRESENT (_AC(1, L)<<_PAGE_BIT_PRESENT)
25210 -#define _PAGE_RW (_AC(1, L)<<_PAGE_BIT_RW)
25211 -#define _PAGE_USER (_AC(1, L)<<_PAGE_BIT_USER)
25212 -#define _PAGE_PWT (_AC(1, L)<<_PAGE_BIT_PWT)
25213 -#define _PAGE_PCD (_AC(1, L)<<_PAGE_BIT_PCD)
25214 -#define _PAGE_ACCESSED (_AC(1, L)<<_PAGE_BIT_ACCESSED)
25215 -#define _PAGE_DIRTY (_AC(1, L)<<_PAGE_BIT_DIRTY)
25216 -#define _PAGE_PSE (_AC(1, L)<<_PAGE_BIT_PSE) /* 2MB page */
25217 -#define _PAGE_GLOBAL (_AC(1, L)<<_PAGE_BIT_GLOBAL) /* Global TLB entry */
25218 -#define _PAGE_IO (_AC(1, L)<<_PAGE_BIT_IO)
25219 -#define _PAGE_UNUSED2 (_AC(1, L)<<_PAGE_BIT_UNUSED2)
25220 -#define _PAGE_UNUSED3 (_AC(1, L)<<_PAGE_BIT_UNUSED3)
25221 -#define _PAGE_PAT (_AC(1, L)<<_PAGE_BIT_PAT)
25222 -#define _PAGE_PAT_LARGE (_AC(1, L)<<_PAGE_BIT_PAT_LARGE)
25223 +#define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT)
25224 +#define _PAGE_RW (_AT(pteval_t, 1) << _PAGE_BIT_RW)
25225 +#define _PAGE_USER (_AT(pteval_t, 1) << _PAGE_BIT_USER)
25226 +#define _PAGE_PWT (_AT(pteval_t, 1) << _PAGE_BIT_PWT)
25227 +#define _PAGE_PCD (_AT(pteval_t, 1) << _PAGE_BIT_PCD)
25228 +#define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED)
25229 +#define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
25230 +#define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE)
25231 +#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
25232 +#define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1)
25233 +#define _PAGE_UNUSED2 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED2)
25234 +#define _PAGE_IO (_AT(pteval_t, 1) << _PAGE_BIT_IO)
25235 +#define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
25236 +#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
25237 +#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
25238 +#define __HAVE_ARCH_PTE_SPECIAL
25239
25240 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
25241 -#define _PAGE_NX (_AC(1, ULL) << _PAGE_BIT_NX)
25242 +#define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX)
25243 #else
25244 -#define _PAGE_NX 0
25245 +#define _PAGE_NX (_AT(pteval_t, 0))
25246 #endif
25247
25248 -#define _PAGE_FILE (_AC(1, L)<<_PAGE_BIT_FILE)
25249 -#define _PAGE_PROTNONE (_AC(1, L)<<_PAGE_BIT_PROTNONE)
25250 +#define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE)
25251 +#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
25252
25253 #ifndef __ASSEMBLY__
25254 #if defined(CONFIG_X86_64) && CONFIG_XEN_COMPAT <= 0x030002
25255 @@ -71,8 +69,8 @@ extern unsigned int __kernel_page_user;
25256 _PAGE_DIRTY | __kernel_page_user)
25257
25258 /* Set of bits not changed in pte_modify */
25259 -#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_CACHE_MASK | _PAGE_IO | \
25260 - _PAGE_ACCESSED | _PAGE_DIRTY)
25261 +#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_CACHE_MASK | _PAGE_IO | \
25262 + _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY)
25263
25264 /*
25265 * PAT settings are part of the hypervisor interface, which sets the
25266 @@ -102,19 +100,9 @@ extern unsigned int __kernel_page_user;
25267 #define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
25268 _PAGE_ACCESSED)
25269
25270 -#ifdef CONFIG_X86_32
25271 -#define _PAGE_KERNEL_EXEC \
25272 - (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED)
25273 -#define _PAGE_KERNEL (_PAGE_KERNEL_EXEC | _PAGE_NX)
25274 -
25275 -#ifndef __ASSEMBLY__
25276 -extern pteval_t __PAGE_KERNEL, __PAGE_KERNEL_EXEC;
25277 -#endif /* __ASSEMBLY__ */
25278 -#else
25279 #define __PAGE_KERNEL_EXEC \
25280 (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | __kernel_page_user)
25281 #define __PAGE_KERNEL (__PAGE_KERNEL_EXEC | _PAGE_NX)
25282 -#endif
25283
25284 #define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW)
25285 #define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW)
25286 @@ -125,25 +113,22 @@ extern pteval_t __PAGE_KERNEL, __PAGE_KE
25287 #define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER)
25288 #define __PAGE_KERNEL_VSYSCALL_NOCACHE (__PAGE_KERNEL_VSYSCALL | _PAGE_PCD | _PAGE_PWT)
25289 #define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE)
25290 +#define __PAGE_KERNEL_LARGE_NOCACHE (__PAGE_KERNEL | _PAGE_CACHE_UC | _PAGE_PSE)
25291 #define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE)
25292
25293 -/*
25294 - * We don't support GLOBAL page in xenolinux64
25295 - */
25296 -#define MAKE_GLOBAL(x) __pgprot((x))
25297 -
25298 -#define PAGE_KERNEL MAKE_GLOBAL(__PAGE_KERNEL)
25299 -#define PAGE_KERNEL_RO MAKE_GLOBAL(__PAGE_KERNEL_RO)
25300 -#define PAGE_KERNEL_EXEC MAKE_GLOBAL(__PAGE_KERNEL_EXEC)
25301 -#define PAGE_KERNEL_RX MAKE_GLOBAL(__PAGE_KERNEL_RX)
25302 -#define PAGE_KERNEL_WC MAKE_GLOBAL(__PAGE_KERNEL_WC)
25303 -#define PAGE_KERNEL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_NOCACHE)
25304 -#define PAGE_KERNEL_UC_MINUS MAKE_GLOBAL(__PAGE_KERNEL_UC_MINUS)
25305 -#define PAGE_KERNEL_EXEC_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_EXEC_NOCACHE)
25306 -#define PAGE_KERNEL_LARGE MAKE_GLOBAL(__PAGE_KERNEL_LARGE)
25307 -#define PAGE_KERNEL_LARGE_EXEC MAKE_GLOBAL(__PAGE_KERNEL_LARGE_EXEC)
25308 -#define PAGE_KERNEL_VSYSCALL MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL)
25309 -#define PAGE_KERNEL_VSYSCALL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL_NOCACHE)
25310 +#define PAGE_KERNEL __pgprot(__PAGE_KERNEL)
25311 +#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO)
25312 +#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC)
25313 +#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX)
25314 +#define PAGE_KERNEL_WC __pgprot(__PAGE_KERNEL_WC)
25315 +#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE)
25316 +#define PAGE_KERNEL_UC_MINUS __pgprot(__PAGE_KERNEL_UC_MINUS)
25317 +#define PAGE_KERNEL_EXEC_NOCACHE __pgprot(__PAGE_KERNEL_EXEC_NOCACHE)
25318 +#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE)
25319 +#define PAGE_KERNEL_LARGE_NOCACHE __pgprot(__PAGE_KERNEL_LARGE_NOCACHE)
25320 +#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC)
25321 +#define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL)
25322 +#define PAGE_KERNEL_VSYSCALL_NOCACHE __pgprot(__PAGE_KERNEL_VSYSCALL_NOCACHE)
25323
25324 /* xwr */
25325 #define __P000 PAGE_NONE
25326 @@ -182,27 +167,27 @@ extern struct list_head pgd_list;
25327 */
25328 static inline int pte_dirty(pte_t pte)
25329 {
25330 - return __pte_val(pte) & _PAGE_DIRTY;
25331 + return pte_flags(pte) & _PAGE_DIRTY;
25332 }
25333
25334 static inline int pte_young(pte_t pte)
25335 {
25336 - return __pte_val(pte) & _PAGE_ACCESSED;
25337 + return pte_flags(pte) & _PAGE_ACCESSED;
25338 }
25339
25340 static inline int pte_write(pte_t pte)
25341 {
25342 - return __pte_val(pte) & _PAGE_RW;
25343 + return pte_flags(pte) & _PAGE_RW;
25344 }
25345
25346 static inline int pte_file(pte_t pte)
25347 {
25348 - return __pte_val(pte) & _PAGE_FILE;
25349 + return pte_flags(pte) & _PAGE_FILE;
25350 }
25351
25352 static inline int pte_huge(pte_t pte)
25353 {
25354 - return __pte_val(pte) & _PAGE_PSE;
25355 + return pte_flags(pte) & _PAGE_PSE;
25356 }
25357
25358 static inline int pte_global(pte_t pte)
25359 @@ -212,12 +197,12 @@ static inline int pte_global(pte_t pte)
25360
25361 static inline int pte_exec(pte_t pte)
25362 {
25363 - return !(__pte_val(pte) & _PAGE_NX);
25364 + return !(pte_flags(pte) & _PAGE_NX);
25365 }
25366
25367 static inline int pte_special(pte_t pte)
25368 {
25369 - return 0;
25370 + return pte_flags(pte) & _PAGE_SPECIAL;
25371 }
25372
25373 static inline int pmd_large(pmd_t pte)
25374 @@ -228,22 +213,22 @@ static inline int pmd_large(pmd_t pte)
25375
25376 static inline pte_t pte_mkclean(pte_t pte)
25377 {
25378 - return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_DIRTY);
25379 + return __pte_ma(__pte_val(pte) & ~_PAGE_DIRTY);
25380 }
25381
25382 static inline pte_t pte_mkold(pte_t pte)
25383 {
25384 - return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_ACCESSED);
25385 + return __pte_ma(__pte_val(pte) & ~_PAGE_ACCESSED);
25386 }
25387
25388 static inline pte_t pte_wrprotect(pte_t pte)
25389 {
25390 - return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_RW);
25391 + return __pte_ma(__pte_val(pte) & ~_PAGE_RW);
25392 }
25393
25394 static inline pte_t pte_mkexec(pte_t pte)
25395 {
25396 - return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_NX);
25397 + return __pte_ma(__pte_val(pte) & ~_PAGE_NX);
25398 }
25399
25400 static inline pte_t pte_mkdirty(pte_t pte)
25401 @@ -268,7 +253,7 @@ static inline pte_t pte_mkhuge(pte_t pte
25402
25403 static inline pte_t pte_clrhuge(pte_t pte)
25404 {
25405 - return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_PSE);
25406 + return __pte_ma(__pte_val(pte) & ~_PAGE_PSE);
25407 }
25408
25409 static inline pte_t pte_mkglobal(pte_t pte)
25410 @@ -283,35 +268,46 @@ static inline pte_t pte_clrglobal(pte_t
25411
25412 static inline pte_t pte_mkspecial(pte_t pte)
25413 {
25414 - return pte;
25415 + return __pte_ma(__pte_val(pte) | _PAGE_SPECIAL);
25416 }
25417
25418 extern pteval_t __supported_pte_mask;
25419
25420 static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
25421 {
25422 - return __pte((((phys_addr_t)page_nr << PAGE_SHIFT) |
25423 - pgprot_val(pgprot)) & __supported_pte_mask);
25424 + pgprotval_t prot = pgprot_val(pgprot);
25425 +
25426 + if (prot & _PAGE_PRESENT)
25427 + prot &= __supported_pte_mask;
25428 + return __pte(((phys_addr_t)page_nr << PAGE_SHIFT) | prot);
25429 }
25430
25431 static inline pte_t pfn_pte_ma(unsigned long page_nr, pgprot_t pgprot)
25432 {
25433 - return __pte_ma((((phys_addr_t)page_nr << PAGE_SHIFT) |
25434 - pgprot_val(pgprot)) & __supported_pte_mask);
25435 + pgprotval_t prot = pgprot_val(pgprot);
25436 +
25437 + if (prot & _PAGE_PRESENT)
25438 + prot &= __supported_pte_mask;
25439 + return __pte_ma(((phys_addr_t)page_nr << PAGE_SHIFT) | prot);
25440 }
25441
25442 static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
25443 {
25444 - return __pmd((((phys_addr_t)page_nr << PAGE_SHIFT) |
25445 - pgprot_val(pgprot)) & __supported_pte_mask);
25446 + pgprotval_t prot = pgprot_val(pgprot);
25447 +
25448 + if (prot & _PAGE_PRESENT)
25449 + prot &= __supported_pte_mask;
25450 + return __pmd(((phys_addr_t)page_nr << PAGE_SHIFT) | prot);
25451 }
25452
25453 static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
25454 {
25455 - pteval_t val = pte_val(pte);
25456 + pgprotval_t prot = pgprot_val(newprot);
25457 + pteval_t val = pte_val(pte) & _PAGE_CHG_MASK;
25458
25459 - val &= _PAGE_CHG_MASK;
25460 - val |= pgprot_val(newprot) & (~_PAGE_CHG_MASK) & __supported_pte_mask;
25461 + if (prot & _PAGE_PRESENT)
25462 + prot &= __supported_pte_mask;
25463 + val |= prot & ~_PAGE_CHG_MASK;
25464
25465 return __pte(val);
25466 }
25467 @@ -325,9 +321,11 @@ static inline pgprot_t pgprot_modify(pgp
25468 return __pgprot(preservebits | addbits);
25469 }
25470
25471 -#define pte_pgprot(x) __pgprot(__pte_val(x) & ~PTE_MASK)
25472 +#define pte_pgprot(x) __pgprot(pte_flags(x) & PTE_FLAGS_MASK)
25473
25474 -#define canon_pgprot(p) __pgprot(pgprot_val(p) & __supported_pte_mask)
25475 +#define canon_pgprot(p) __pgprot(pgprot_val(p) & _PAGE_PRESENT \
25476 + ? pgprot_val(p) & __supported_pte_mask \
25477 + : pgprot_val(p))
25478
25479 #ifndef __ASSEMBLY__
25480 #define __HAVE_PHYS_MEM_ACCESS_PROT
25481 @@ -338,6 +336,17 @@ int phys_mem_access_prot_allowed(struct
25482 unsigned long size, pgprot_t *vma_prot);
25483 #endif
25484
25485 +/* Install a pte for a particular vaddr in kernel space. */
25486 +void set_pte_vaddr(unsigned long vaddr, pte_t pte);
25487 +
25488 +#ifndef CONFIG_XEN
25489 +extern void native_pagetable_setup_start(pgd_t *base);
25490 +extern void native_pagetable_setup_done(pgd_t *base);
25491 +#else
25492 +static inline void xen_pagetable_setup_start(pgd_t *base) {}
25493 +static inline void xen_pagetable_setup_done(pgd_t *base) {}
25494 +#endif
25495 +
25496 #define set_pte(ptep, pte) xen_set_pte(ptep, pte)
25497 #define set_pte_at(mm, addr, ptep, pte) xen_set_pte_at(mm, addr, ptep, pte)
25498
25499 @@ -373,6 +382,26 @@ int phys_mem_access_prot_allowed(struct
25500 # include "pgtable_64.h"
25501 #endif
25502
25503 +/*
25504 + * the pgd page can be thought of an array like this: pgd_t[PTRS_PER_PGD]
25505 + *
25506 + * this macro returns the index of the entry in the pgd page which would
25507 + * control the given virtual address
25508 + */
25509 +#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
25510 +
25511 +/*
25512 + * pgd_offset() returns a (pgd_t *)
25513 + * pgd_index() is used to get the offset into the pgd page's array of pgd_t's;
25514 + */
25515 +#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address)))
25516 +/*
25517 + * a shortcut which implies the use of the kernel's pgd, instead
25518 + * of a process's
25519 + */
25520 +#define pgd_offset_k(address) pgd_offset(&init_mm, (address))
25521 +
25522 +
25523 #define KERNEL_PGD_BOUNDARY pgd_index(PAGE_OFFSET)
25524 #define KERNEL_PGD_PTRS (PTRS_PER_PGD - KERNEL_PGD_BOUNDARY)
25525
25526 @@ -383,8 +412,15 @@ enum {
25527 PG_LEVEL_4K,
25528 PG_LEVEL_2M,
25529 PG_LEVEL_1G,
25530 + PG_LEVEL_NUM
25531 };
25532
25533 +#ifdef CONFIG_PROC_FS
25534 +extern void update_page_count(int level, unsigned long pages);
25535 +#else
25536 +static inline void update_page_count(int level, unsigned long pages) { }
25537 +#endif
25538 +
25539 /*
25540 * Helper function that returns the kernel pagetable entry controlling
25541 * the virtual address 'address'. NULL means no pagetable entry present.
25542 @@ -441,6 +477,8 @@ static inline void xen_pte_clear(struct
25543 * race with other CPU's that might be updating the dirty
25544 * bit at the same time.
25545 */
25546 +struct vm_area_struct;
25547 +
25548 #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
25549 extern int ptep_set_access_flags(struct vm_area_struct *vma,
25550 unsigned long address, pte_t *ptep,
25551 @@ -523,9 +561,6 @@ static inline void clone_pgd_range(pgd_t
25552 memcpy(dst, src, count * sizeof(pgd_t));
25553 }
25554
25555 -#define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) \
25556 - xen_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable)
25557 -
25558 #define arbitrary_virt_to_machine(va) \
25559 ({ \
25560 unsigned int __lvl; \
25561 @@ -548,6 +583,34 @@ struct page *kmap_atomic_to_page(void *)
25562 #define ptep_to_machine(ptep) virt_to_machine(ptep)
25563 #endif
25564
25565 +#define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
25566 +static inline pte_t ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr,
25567 + pte_t *ptep)
25568 +{
25569 +#if CONFIG_XEN_COMPAT < 0x030300
25570 + if (unlikely(!xen_feature(XENFEAT_mmu_pt_update_preserve_ad)))
25571 + return ptep_get_and_clear(mm, addr, ptep);
25572 +#endif
25573 + return *ptep;
25574 +}
25575 +
25576 +static inline void ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
25577 + pte_t *ptep, pte_t pte)
25578 +{
25579 + mmu_update_t u;
25580 +
25581 +#if CONFIG_XEN_COMPAT < 0x030300
25582 + if (unlikely(!xen_feature(XENFEAT_mmu_pt_update_preserve_ad))) {
25583 + set_pte_at(mm, addr, ptep, pte);
25584 + return;
25585 + }
25586 +#endif
25587 + u.ptr = ptep_to_machine(ptep) | MMU_PT_UPDATE_PRESERVE_AD;
25588 + u.val = __pte_val(pte);
25589 + if (HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF))
25590 + BUG();
25591 +}
25592 +
25593 #include <asm-generic/pgtable.h>
25594
25595 #include <xen/features.h>
25596 @@ -576,10 +639,6 @@ int touch_pte_range(struct mm_struct *mm
25597 unsigned long address,
25598 unsigned long size);
25599
25600 -int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
25601 - unsigned long addr, unsigned long end, pgprot_t newprot,
25602 - int dirty_accountable);
25603 -
25604 #endif /* __ASSEMBLY__ */
25605
25606 #endif /* _ASM_X86_PGTABLE_H */
25607 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/pgtable-3level.h 2009-03-16 16:38:05.000000000 +0100
25608 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/pgtable-3level.h 2009-06-04 10:21:39.000000000 +0200
25609 @@ -14,11 +14,11 @@
25610 #define pmd_ERROR(e) \
25611 printk("%s:%d: bad pmd %p(%016Lx pfn %08Lx).\n", \
25612 __FILE__, __LINE__, &(e), __pmd_val(e), \
25613 - (pmd_val(e) & PTE_MASK) >> PAGE_SHIFT)
25614 + (pmd_val(e) & PTE_PFN_MASK) >> PAGE_SHIFT)
25615 #define pgd_ERROR(e) \
25616 printk("%s:%d: bad pgd %p(%016Lx pfn %08Lx).\n", \
25617 __FILE__, __LINE__, &(e), __pgd_val(e), \
25618 - (pgd_val(e) & PTE_MASK) >> PAGE_SHIFT)
25619 + (pgd_val(e) & PTE_PFN_MASK) >> PAGE_SHIFT)
25620
25621 static inline int pud_none(pud_t pud)
25622 {
25623 @@ -27,7 +27,7 @@ static inline int pud_none(pud_t pud)
25624 }
25625 static inline int pud_bad(pud_t pud)
25626 {
25627 - return (__pud_val(pud) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER)) != 0;
25628 + return (__pud_val(pud) & ~(PTE_PFN_MASK | _KERNPG_TABLE | _PAGE_USER)) != 0;
25629 }
25630
25631 static inline int pud_present(pud_t pud)
25632 @@ -102,9 +102,9 @@ static inline void pud_clear(pud_t *pudp
25633 xen_tlb_flush();
25634 }
25635
25636 -#define pud_page(pud) ((struct page *) __va(pud_val(pud) & PTE_MASK))
25637 +#define pud_page(pud) ((struct page *) __va(pud_val(pud) & PTE_PFN_MASK))
25638
25639 -#define pud_page_vaddr(pud) ((unsigned long) __va(pud_val(pud) & PTE_MASK))
25640 +#define pud_page_vaddr(pud) ((unsigned long) __va(pud_val(pud) & PTE_PFN_MASK))
25641
25642
25643 /* Find an entry in the second-level page table.. */
25644 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/pgtable_32.h 2009-03-16 16:38:05.000000000 +0100
25645 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/pgtable_32.h 2009-06-04 10:21:39.000000000 +0200
25646 @@ -89,10 +89,10 @@ extern unsigned long pg0[];
25647 /* pmd_present doesn't just test the _PAGE_PRESENT bit since wr.p.t.
25648 can temporarily clear it. */
25649 #define pmd_present(x) (__pmd_val(x))
25650 -#define pmd_bad(x) ((__pmd_val(x) & (~PTE_MASK & ~_PAGE_USER & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT))
25651 +#define pmd_bad(x) ((__pmd_val(x) & (PTE_FLAGS_MASK & ~_PAGE_USER & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT))
25652 #else
25653 #define pmd_present(x) (__pmd_val(x) & _PAGE_PRESENT)
25654 -#define pmd_bad(x) ((__pmd_val(x) & (~PTE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
25655 +#define pmd_bad(x) ((__pmd_val(x) & (PTE_FLAGS_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
25656 #endif
25657
25658
25659 @@ -119,26 +119,6 @@ extern unsigned long pg0[];
25660 */
25661 #define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
25662
25663 -/*
25664 - * the pgd page can be thought of an array like this: pgd_t[PTRS_PER_PGD]
25665 - *
25666 - * this macro returns the index of the entry in the pgd page which would
25667 - * control the given virtual address
25668 - */
25669 -#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
25670 -#define pgd_index_k(addr) pgd_index((addr))
25671 -
25672 -/*
25673 - * pgd_offset() returns a (pgd_t *)
25674 - * pgd_index() is used get the offset into the pgd page's array of pgd_t's;
25675 - */
25676 -#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address)))
25677 -
25678 -/*
25679 - * a shortcut which implies the use of the kernel's pgd, instead
25680 - * of a process's
25681 - */
25682 -#define pgd_offset_k(address) pgd_offset(&init_mm, (address))
25683
25684 static inline int pud_large(pud_t pud) { return 0; }
25685
25686 @@ -165,7 +145,7 @@ static inline int pud_large(pud_t pud) {
25687 #define pmd_page(pmd) (pfn_to_page(pmd_val((pmd)) >> PAGE_SHIFT))
25688
25689 #define pmd_page_vaddr(pmd) \
25690 - ((unsigned long)__va(pmd_val((pmd)) & PTE_MASK))
25691 + ((unsigned long)__va(pmd_val((pmd)) & PTE_PFN_MASK))
25692
25693 #if defined(CONFIG_HIGHPTE)
25694 #define pte_offset_map(dir, address) \
25695 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/pgtable_64.h 2009-03-16 16:38:05.000000000 +0100
25696 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/pgtable_64.h 2009-06-04 10:21:39.000000000 +0200
25697 @@ -23,6 +23,8 @@ extern void xen_init_pt(void);
25698 extern pud_t level3_kernel_pgt[512];
25699 extern pud_t level3_ident_pgt[512];
25700 extern pmd_t level2_kernel_pgt[512];
25701 +extern pmd_t level2_fixmap_pgt[512];
25702 +extern pmd_t level2_ident_pgt[512];
25703 extern pgd_t init_level4_pgt[];
25704
25705 #define swapper_pg_dir init_level4_pgt
25706 @@ -79,6 +81,9 @@ extern void paging_init(void);
25707
25708 struct mm_struct;
25709
25710 +void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte);
25711 +
25712 +
25713 #define __xen_pte_clear(ptep) xen_set_pte(ptep, __pte(0))
25714
25715 static inline void xen_set_pte(pte_t *ptep, pte_t pte)
25716 @@ -145,29 +150,29 @@ static inline void xen_pgd_clear(pgd_t *
25717 #define PGDIR_MASK (~(PGDIR_SIZE - 1))
25718
25719
25720 -#define MAXMEM _AC(0x00003fffffffffff, UL)
25721 +#define MAXMEM _AC(0x000004ffffffffff, UL)
25722 #define VMALLOC_START _AC(0xffffc20000000000, UL)
25723 #define VMALLOC_END _AC(0xffffe1ffffffffff, UL)
25724 #define VMEMMAP_START _AC(0xffffe20000000000, UL)
25725 #define MODULES_VADDR _AC(0xffffffffa0000000, UL)
25726 -#define MODULES_END _AC(0xfffffffffff00000, UL)
25727 +#define MODULES_END _AC(0xffffffffff000000, UL)
25728 #define MODULES_LEN (MODULES_END - MODULES_VADDR)
25729
25730 #ifndef __ASSEMBLY__
25731
25732 static inline int pgd_bad(pgd_t pgd)
25733 {
25734 - return (__pgd_val(pgd) & ~(PTE_MASK | _PAGE_USER)) != _KERNPG_TABLE;
25735 + return (__pgd_val(pgd) & ~(PTE_PFN_MASK | _PAGE_USER)) != _KERNPG_TABLE;
25736 }
25737
25738 static inline int pud_bad(pud_t pud)
25739 {
25740 - return (__pud_val(pud) & ~(PTE_MASK | _PAGE_USER)) != _KERNPG_TABLE;
25741 + return (__pud_val(pud) & ~(PTE_PFN_MASK | _PAGE_USER)) != _KERNPG_TABLE;
25742 }
25743
25744 static inline int pmd_bad(pmd_t pmd)
25745 {
25746 - return (__pmd_val(pmd) & ~(PTE_MASK | _PAGE_USER)) != _KERNPG_TABLE;
25747 + return (__pmd_val(pmd) & ~(PTE_PFN_MASK | _PAGE_USER)) != _KERNPG_TABLE;
25748 }
25749
25750 #define pte_none(x) (!(x).pte)
25751 @@ -175,7 +180,7 @@ static inline int pmd_bad(pmd_t pmd)
25752
25753 #define pages_to_mb(x) ((x) >> (20 - PAGE_SHIFT)) /* FIXME: is this right? */
25754
25755 -#define __pte_mfn(_pte) (((_pte).pte & PTE_MASK) >> PAGE_SHIFT)
25756 +#define __pte_mfn(_pte) (((_pte).pte & PTE_PFN_MASK) >> PAGE_SHIFT)
25757 #define pte_mfn(_pte) ((_pte).pte & _PAGE_PRESENT ? \
25758 __pte_mfn(_pte) : pfn_to_mfn(__pte_mfn(_pte)))
25759 #define pte_pfn(_pte) ((_pte).pte & _PAGE_IO ? max_mapnr : \
25760 @@ -200,11 +205,8 @@ static inline int pmd_bad(pmd_t pmd)
25761 * Level 4 access.
25762 */
25763 #define pgd_page_vaddr(pgd) \
25764 - ((unsigned long)__va((unsigned long)pgd_val((pgd)) & PTE_MASK))
25765 + ((unsigned long)__va((unsigned long)pgd_val((pgd)) & PTE_PFN_MASK))
25766 #define pgd_page(pgd) (pfn_to_page(pgd_val((pgd)) >> PAGE_SHIFT))
25767 -#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
25768 -#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address)))
25769 -#define pgd_offset_k(address) (init_level4_pgt + pgd_index((address)))
25770 #define pgd_present(pgd) (__pgd_val(pgd) & _PAGE_PRESENT)
25771 static inline int pgd_large(pgd_t pgd) { return 0; }
25772 #define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE)
25773 @@ -226,7 +228,7 @@ static inline int pud_large(pud_t pte)
25774 }
25775
25776 /* PMD - Level 2 access */
25777 -#define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val((pmd)) & PTE_MASK))
25778 +#define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val((pmd)) & PTE_PFN_MASK))
25779 #define pmd_page(pmd) (pfn_to_page(pmd_val((pmd)) >> PAGE_SHIFT))
25780
25781 #define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))
25782 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/processor.h 2009-03-16 16:38:05.000000000 +0100
25783 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/processor.h 2009-06-04 10:21:39.000000000 +0200
25784 @@ -134,7 +134,7 @@ extern __u32 cleared_cpu_caps[NCAPINTS
25785 #ifdef CONFIG_SMP
25786 DECLARE_PER_CPU(struct cpuinfo_x86, cpu_info);
25787 #define cpu_data(cpu) per_cpu(cpu_info, cpu)
25788 -#define current_cpu_data cpu_data(smp_processor_id())
25789 +#define current_cpu_data __get_cpu_var(cpu_info)
25790 #else
25791 #define cpu_data(cpu) boot_cpu_data
25792 #define current_cpu_data boot_cpu_data
25793 @@ -153,7 +153,7 @@ static inline int hlt_works(int cpu)
25794
25795 extern void cpu_detect(struct cpuinfo_x86 *c);
25796
25797 -extern void identify_cpu(struct cpuinfo_x86 *);
25798 +extern void early_cpu_init(void);
25799 extern void identify_boot_cpu(void);
25800 extern void identify_secondary_cpu(struct cpuinfo_x86 *);
25801 extern void print_cpu_info(struct cpuinfo_x86 *);
25802 @@ -267,15 +267,11 @@ struct tss_struct {
25803 struct thread_struct *io_bitmap_owner;
25804
25805 /*
25806 - * Pad the TSS to be cacheline-aligned (size is 0x100):
25807 - */
25808 - unsigned long __cacheline_filler[35];
25809 - /*
25810 * .. and then another 0x100 bytes for the emergency kernel stack:
25811 */
25812 unsigned long stack[64];
25813
25814 -} __attribute__((packed));
25815 +} ____cacheline_aligned;
25816
25817 DECLARE_PER_CPU(struct tss_struct, init_tss);
25818
25819 @@ -668,11 +664,36 @@ static inline void __sti_mwait(unsigned
25820
25821 extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
25822
25823 -extern int force_mwait;
25824 -
25825 extern void select_idle_routine(const struct cpuinfo_x86 *c);
25826
25827 extern unsigned long boot_option_idle_override;
25828 +extern unsigned long idle_halt;
25829 +extern unsigned long idle_nomwait;
25830 +
25831 +#ifndef CONFIG_XEN
25832 +/*
25833 + * on systems with caches, caches must be flushed as the absolute
25834 + * last instruction before going into a suspended halt. Otherwise,
25835 + * dirty data can linger in the cache and become stale on resume,
25836 + * leading to strange errors.
25837 + *
25838 + * perform a variety of operations to guarantee that the compiler
25839 + * will not reorder instructions. wbinvd itself is serializing
25840 + * so the processor will not reorder.
25841 + *
25842 + * Systems without cache can just go into halt.
25843 + */
25844 +static inline void wbinvd_halt(void)
25845 +{
25846 + mb();
25847 + /* check for clflush to determine if wbinvd is legal */
25848 + if (cpu_has_clflush)
25849 + asm volatile("cli; wbinvd; 1: hlt; jmp 1b" : : : "memory");
25850 + else
25851 + while (1)
25852 + halt();
25853 +}
25854 +#endif
25855
25856 extern void enable_sep_cpu(void);
25857 extern int sysenter_setup(void);
25858 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/segment.h 2009-03-16 16:38:05.000000000 +0100
25859 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/segment.h 2009-06-04 10:21:39.000000000 +0200
25860 @@ -1,6 +1,15 @@
25861 #ifndef _ASM_X86_SEGMENT_H_
25862 #define _ASM_X86_SEGMENT_H_
25863
25864 +/* Constructor for a conventional segment GDT (or LDT) entry */
25865 +/* This is a macro so it can be used in initializers */
25866 +#define GDT_ENTRY(flags, base, limit) \
25867 + ((((base) & 0xff000000ULL) << (56-24)) | \
25868 + (((flags) & 0x0000f0ffULL) << 40) | \
25869 + (((limit) & 0x000f0000ULL) << (48-16)) | \
25870 + (((base) & 0x00ffffffULL) << 16) | \
25871 + (((limit) & 0x0000ffffULL)))
25872 +
25873 /* Simple and small GDT entries for booting only */
25874
25875 #define GDT_ENTRY_BOOT_CS 2
25876 @@ -61,18 +70,14 @@
25877 #define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
25878
25879 #define GDT_ENTRY_DEFAULT_USER_CS 14
25880 -#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS * 8 + 3)
25881
25882 #define GDT_ENTRY_DEFAULT_USER_DS 15
25883 -#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS * 8 + 3)
25884
25885 #define GDT_ENTRY_KERNEL_BASE 12
25886
25887 #define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE + 0)
25888 -#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8)
25889
25890 #define GDT_ENTRY_KERNEL_DS (GDT_ENTRY_KERNEL_BASE + 1)
25891 -#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8)
25892
25893 #define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE + 4)
25894 #define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE + 5)
25895 @@ -143,10 +148,11 @@
25896 #else
25897 #include <asm/cache.h>
25898
25899 -#define __KERNEL_CS 0x10
25900 -#define __KERNEL_DS 0x18
25901 +#define GDT_ENTRY_KERNEL32_CS 1
25902 +#define GDT_ENTRY_KERNEL_CS 2
25903 +#define GDT_ENTRY_KERNEL_DS 3
25904
25905 -#define __KERNEL32_CS 0x08
25906 +#define __KERNEL32_CS (GDT_ENTRY_KERNEL32_CS * 8)
25907
25908 /*
25909 * we cannot use the same code segment descriptor for user and kernel
25910 @@ -154,10 +160,10 @@
25911 * The segment offset needs to contain a RPL. Grr. -AK
25912 * GDT layout to get 64bit syscall right (sysret hardcodes gdt offsets)
25913 */
25914 -
25915 -#define __USER32_CS 0x23 /* 4*8+3 */
25916 -#define __USER_DS 0x2b /* 5*8+3 */
25917 -#define __USER_CS 0x33 /* 6*8+3 */
25918 +#define GDT_ENTRY_DEFAULT_USER32_CS 4
25919 +#define GDT_ENTRY_DEFAULT_USER_DS 5
25920 +#define GDT_ENTRY_DEFAULT_USER_CS 6
25921 +#define __USER32_CS (GDT_ENTRY_DEFAULT_USER32_CS * 8 + 3)
25922 #define __USER32_DS __USER_DS
25923
25924 #define GDT_ENTRY_TSS 8 /* needs two entries */
25925 @@ -179,6 +185,11 @@
25926
25927 #endif
25928
25929 +#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8)
25930 +#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8)
25931 +#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS* 8 + 3)
25932 +#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS* 8 + 3)
25933 +
25934 /* User mode is privilege level 3 */
25935 #define USER_RPL 0x3
25936 /* LDT segment has TI set, GDT has it cleared */
25937 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/smp.h 2009-03-16 16:38:05.000000000 +0100
25938 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/smp.h 2009-06-04 10:21:39.000000000 +0200
25939 @@ -25,23 +25,16 @@ extern cpumask_t cpu_initialized;
25940 extern void (*mtrr_hook)(void);
25941 extern void zap_low_mappings(void);
25942
25943 +extern int __cpuinit get_local_pda(int cpu);
25944 +
25945 extern int smp_num_siblings;
25946 extern unsigned int num_processors;
25947 extern cpumask_t cpu_initialized;
25948
25949 -#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
25950 -extern u16 x86_cpu_to_apicid_init[];
25951 -extern u16 x86_bios_cpu_apicid_init[];
25952 -extern void *x86_cpu_to_apicid_early_ptr;
25953 -extern void *x86_bios_cpu_apicid_early_ptr;
25954 -#else
25955 -#define x86_cpu_to_apicid_early_ptr NULL
25956 -#define x86_bios_cpu_apicid_early_ptr NULL
25957 -#endif
25958 -
25959 DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
25960 DECLARE_PER_CPU(cpumask_t, cpu_core_map);
25961 DECLARE_PER_CPU(u16, cpu_llc_id);
25962 +
25963 DECLARE_PER_CPU(u16, x86_cpu_to_apicid);
25964 DECLARE_PER_CPU(u16, x86_bios_cpu_apicid);
25965
25966 @@ -63,9 +56,9 @@ struct smp_ops {
25967
25968 void (*smp_send_stop)(void);
25969 void (*smp_send_reschedule)(int cpu);
25970 - int (*smp_call_function_mask)(cpumask_t mask,
25971 - void (*func)(void *info), void *info,
25972 - int wait);
25973 +
25974 + void (*send_call_func_ipi)(cpumask_t mask);
25975 + void (*send_call_func_single_ipi)(int cpu);
25976 };
25977
25978 /* Globals due to paravirt */
25979 @@ -106,11 +99,14 @@ static inline void smp_send_reschedule(i
25980 smp_ops.smp_send_reschedule(cpu);
25981 }
25982
25983 -static inline int smp_call_function_mask(cpumask_t mask,
25984 - void (*func) (void *info), void *info,
25985 - int wait)
25986 +static inline void arch_send_call_function_single_ipi(int cpu)
25987 {
25988 - return smp_ops.smp_call_function_mask(mask, func, info, wait);
25989 + smp_ops.send_call_func_single_ipi(cpu);
25990 +}
25991 +
25992 +static inline void arch_send_call_function_ipi(cpumask_t mask)
25993 +{
25994 + smp_ops.send_call_func_ipi(mask);
25995 }
25996
25997 void native_smp_prepare_boot_cpu(void);
25998 @@ -122,23 +118,19 @@ int native_cpu_up(unsigned int cpunum);
25999
26000 void xen_smp_send_stop(void);
26001 void xen_smp_send_reschedule(int cpu);
26002 -int xen_smp_call_function_mask(cpumask_t mask,
26003 - void (*func) (void *info), void *info,
26004 - int wait);
26005 +void xen_send_call_func_ipi(cpumask_t mask);
26006 +void xen_send_call_func_single_ipi(int cpu);
26007
26008 #define smp_send_stop xen_smp_send_stop
26009 #define smp_send_reschedule xen_smp_send_reschedule
26010 -#define smp_call_function_mask xen_smp_call_function_mask
26011 -
26012 -extern void prefill_possible_map(void);
26013 +#define arch_send_call_function_single_ipi xen_send_call_func_single_ipi
26014 +#define arch_send_call_function_ipi xen_send_call_func_ipi
26015
26016 #endif /* CONFIG_XEN */
26017
26018 extern int __cpu_disable(void);
26019 extern void __cpu_die(unsigned int cpu);
26020
26021 -extern void prefill_possible_map(void);
26022 -
26023 void smp_store_cpu_info(int id);
26024 #define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
26025
26026 @@ -149,6 +141,14 @@ static inline int num_booting_cpus(void)
26027 }
26028 #endif /* CONFIG_SMP */
26029
26030 +#if defined(CONFIG_SMP) && (defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_XEN))
26031 +extern void prefill_possible_map(void);
26032 +#else
26033 +static inline void prefill_possible_map(void)
26034 +{
26035 +}
26036 +#endif
26037 +
26038 extern unsigned disabled_cpus __cpuinitdata;
26039
26040 #ifdef CONFIG_X86_32_SMP
26041 @@ -216,12 +216,8 @@ static inline int hard_smp_processor_id(
26042 #endif /* CONFIG_X86_LOCAL_APIC */
26043
26044 #ifdef CONFIG_HOTPLUG_CPU
26045 -extern void cpu_exit_clear(void);
26046 extern void cpu_uninit(void);
26047 #endif
26048
26049 -extern void smp_alloc_memory(void);
26050 -extern void lock_ipi_call_lock(void);
26051 -extern void unlock_ipi_call_lock(void);
26052 #endif /* __ASSEMBLY__ */
26053 #endif
26054 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/spinlock.h 2009-03-16 16:38:05.000000000 +0100
26055 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/spinlock.h 2009-06-04 11:09:05.000000000 +0200
26056 @@ -38,6 +38,11 @@
26057 # define UNLOCK_LOCK_PREFIX
26058 #endif
26059
26060 +/*
26061 + * Xen versions prior to 3.2.x have a race condition with HYPERVISOR_poll().
26062 + */
26063 +#if CONFIG_XEN_COMPAT >= 0x030200
26064 +
26065 int xen_spinlock_init(unsigned int cpu);
26066 void xen_spinlock_cleanup(unsigned int cpu);
26067 extern int xen_spin_wait(raw_spinlock_t *, unsigned int token);
26068 @@ -65,14 +70,14 @@ extern void xen_spin_kick(raw_spinlock_t
26069 */
26070 #if (NR_CPUS < 256)
26071 #define TICKET_SHIFT 8
26072 -#define __raw_spin_lock_preamble \
26073 +#define __ticket_spin_lock_preamble \
26074 asm(LOCK_PREFIX "xaddw %w0, %2\n\t" \
26075 "cmpb %h0, %b0\n\t" \
26076 "sete %1" \
26077 : "=&Q" (token), "=qm" (free), "+m" (lock->slock) \
26078 : "0" (0x0100) \
26079 : "memory", "cc")
26080 -#define __raw_spin_lock_body \
26081 +#define __ticket_spin_lock_body \
26082 asm("1:\t" \
26083 "cmpb %h0, %b0\n\t" \
26084 "je 2f\n\t" \
26085 @@ -88,7 +93,7 @@ extern void xen_spin_kick(raw_spinlock_t
26086 : "memory", "cc")
26087
26088
26089 -static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock)
26090 +static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock)
26091 {
26092 int tmp, new;
26093
26094 @@ -107,7 +112,7 @@ static __always_inline int __raw_spin_tr
26095 return tmp;
26096 }
26097
26098 -static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock)
26099 +static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock)
26100 {
26101 unsigned int token;
26102 unsigned char kick;
26103 @@ -124,7 +129,7 @@ static __always_inline void __raw_spin_u
26104 }
26105 #else
26106 #define TICKET_SHIFT 16
26107 -#define __raw_spin_lock_preamble \
26108 +#define __ticket_spin_lock_preamble \
26109 do { \
26110 unsigned int tmp; \
26111 asm(LOCK_PREFIX "xaddl %0, %2\n\t" \
26112 @@ -136,7 +141,7 @@ static __always_inline void __raw_spin_u
26113 : "0" (0x00010000) \
26114 : "memory", "cc"); \
26115 } while (0)
26116 -#define __raw_spin_lock_body \
26117 +#define __ticket_spin_lock_body \
26118 do { \
26119 unsigned int tmp; \
26120 asm("shldl $16, %0, %2\n" \
26121 @@ -155,7 +160,7 @@ static __always_inline void __raw_spin_u
26122 : "memory", "cc"); \
26123 } while (0)
26124
26125 -static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock)
26126 +static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock)
26127 {
26128 int tmp;
26129 int new;
26130 @@ -177,7 +182,7 @@ static __always_inline int __raw_spin_tr
26131 return tmp;
26132 }
26133
26134 -static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock)
26135 +static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock)
26136 {
26137 unsigned int token, tmp;
26138 bool kick;
26139 @@ -195,49 +200,161 @@ static __always_inline void __raw_spin_u
26140 }
26141 #endif
26142
26143 -static inline int __raw_spin_is_locked(raw_spinlock_t *lock)
26144 +static inline int __ticket_spin_is_locked(raw_spinlock_t *lock)
26145 {
26146 int tmp = ACCESS_ONCE(lock->slock);
26147
26148 return !!(((tmp >> TICKET_SHIFT) ^ tmp) & ((1 << TICKET_SHIFT) - 1));
26149 }
26150
26151 -static inline int __raw_spin_is_contended(raw_spinlock_t *lock)
26152 +static inline int __ticket_spin_is_contended(raw_spinlock_t *lock)
26153 {
26154 int tmp = ACCESS_ONCE(lock->slock);
26155
26156 return (((tmp >> TICKET_SHIFT) - tmp) & ((1 << TICKET_SHIFT) - 1)) > 1;
26157 }
26158
26159 -static __always_inline void __raw_spin_lock(raw_spinlock_t *lock)
26160 +static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock)
26161 {
26162 unsigned int token, count;
26163 bool free;
26164
26165 - __raw_spin_lock_preamble;
26166 + __ticket_spin_lock_preamble;
26167 if (unlikely(!free))
26168 token = xen_spin_adjust(lock, token);
26169 do {
26170 count = 1 << 10;
26171 - __raw_spin_lock_body;
26172 + __ticket_spin_lock_body;
26173 } while (unlikely(!count) && !xen_spin_wait(lock, token));
26174 }
26175
26176 -static __always_inline void __raw_spin_lock_flags(raw_spinlock_t *lock,
26177 - unsigned long flags)
26178 +static __always_inline void __ticket_spin_lock_flags(raw_spinlock_t *lock,
26179 + unsigned long flags)
26180 {
26181 unsigned int token, count;
26182 bool free;
26183
26184 - __raw_spin_lock_preamble;
26185 + __ticket_spin_lock_preamble;
26186 if (unlikely(!free))
26187 token = xen_spin_adjust(lock, token);
26188 do {
26189 count = 1 << 10;
26190 - __raw_spin_lock_body;
26191 + __ticket_spin_lock_body;
26192 } while (unlikely(!count) && !xen_spin_wait_flags(lock, &token, flags));
26193 }
26194
26195 +#define __raw_spin(n) __ticket_spin_##n
26196 +
26197 +#else /* CONFIG_XEN_COMPAT < 0x030200 */
26198 +/*
26199 + * Define virtualization-friendly old-style lock byte lock, for use in
26200 + * pv_lock_ops if desired.
26201 + *
26202 + * This differs from the pre-2.6.24 spinlock by always using xchgb
26203 + * rather than decb to take the lock; this allows it to use a
26204 + * zero-initialized lock structure. It also maintains a 1-byte
26205 + * contention counter, so that we can implement
26206 + * __byte_spin_is_contended.
26207 + */
26208 +struct __byte_spinlock {
26209 + u8 lock;
26210 +#if NR_CPUS < 256
26211 + u8 spinners;
26212 +#else
26213 +#error NR_CPUS >= 256 support not implemented
26214 +#endif
26215 +};
26216 +
26217 +static inline int xen_spinlock_init(unsigned int cpu) { return 0; }
26218 +static inline void xen_spinlock_cleanup(unsigned int cpu) {}
26219 +
26220 +static inline int __byte_spin_is_locked(raw_spinlock_t *lock)
26221 +{
26222 + struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
26223 + return bl->lock != 0;
26224 +}
26225 +
26226 +static inline int __byte_spin_is_contended(raw_spinlock_t *lock)
26227 +{
26228 + struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
26229 + return bl->spinners != 0;
26230 +}
26231 +
26232 +static inline void __byte_spin_lock(raw_spinlock_t *lock)
26233 +{
26234 + struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
26235 + s8 val = 1;
26236 +
26237 + asm("1: xchgb %1, %0\n"
26238 + " test %1,%1\n"
26239 + " jz 3f\n"
26240 + " " LOCK_PREFIX "incb %2\n"
26241 + "2: rep;nop\n"
26242 + " cmpb $1, %0\n"
26243 + " je 2b\n"
26244 + " " LOCK_PREFIX "decb %2\n"
26245 + " jmp 1b\n"
26246 + "3:"
26247 + : "+m" (bl->lock), "+q" (val), "+m" (bl->spinners): : "memory");
26248 +}
26249 +
26250 +#define __byte_spin_lock_flags(lock, flags) __byte_spin_lock(lock)
26251 +
26252 +static inline int __byte_spin_trylock(raw_spinlock_t *lock)
26253 +{
26254 + struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
26255 + u8 old = 1;
26256 +
26257 + asm("xchgb %1,%0"
26258 + : "+m" (bl->lock), "+q" (old) : : "memory");
26259 +
26260 + return old == 0;
26261 +}
26262 +
26263 +static inline void __byte_spin_unlock(raw_spinlock_t *lock)
26264 +{
26265 + struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
26266 + smp_wmb();
26267 + bl->lock = 0;
26268 +}
26269 +
26270 +#define __raw_spin(n) __byte_spin_##n
26271 +
26272 +#endif /* CONFIG_XEN_COMPAT */
26273 +
26274 +static inline int __raw_spin_is_locked(raw_spinlock_t *lock)
26275 +{
26276 + return __raw_spin(is_locked)(lock);
26277 +}
26278 +
26279 +static inline int __raw_spin_is_contended(raw_spinlock_t *lock)
26280 +{
26281 + return __raw_spin(is_contended)(lock);
26282 +}
26283 +
26284 +static __always_inline void __raw_spin_lock(raw_spinlock_t *lock)
26285 +{
26286 + __raw_spin(lock)(lock);
26287 +}
26288 +
26289 +static __always_inline void __raw_spin_lock_flags(raw_spinlock_t *lock,
26290 + unsigned long flags)
26291 +{
26292 + __raw_spin(lock_flags)(lock, flags);
26293 +}
26294 +
26295 +static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock)
26296 +{
26297 + return __raw_spin(trylock)(lock);
26298 +}
26299 +
26300 +static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock)
26301 +{
26302 + __raw_spin(unlock)(lock);
26303 +}
26304 +
26305 +#undef __raw_spin
26306 +
26307 static inline void __raw_spin_unlock_wait(raw_spinlock_t *lock)
26308 {
26309 while (__raw_spin_is_locked(lock))
26310 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/system.h 2009-03-16 16:38:05.000000000 +0100
26311 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/system.h 2009-06-04 10:21:39.000000000 +0200
26312 @@ -137,7 +137,7 @@ __asm__ __volatile__ ("movw %%dx,%1\n\t"
26313 #define set_base(ldt, base) _set_base(((char *)&(ldt)) , (base))
26314 #define set_limit(ldt, limit) _set_limit(((char *)&(ldt)) , ((limit)-1))
26315
26316 -extern void load_gs_index(unsigned);
26317 +extern void xen_load_gs_index(unsigned);
26318
26319 /*
26320 * Load a segment. Fall back on loading the zero
26321 @@ -154,14 +154,14 @@ extern void load_gs_index(unsigned);
26322 "jmp 2b\n" \
26323 ".previous\n" \
26324 _ASM_EXTABLE(1b,3b) \
26325 - : :"r" (value), "r" (0))
26326 + : :"r" (value), "r" (0) : "memory")
26327
26328
26329 /*
26330 * Save a segment register away
26331 */
26332 #define savesegment(seg, value) \
26333 - asm volatile("mov %%" #seg ",%0":"=rm" (value))
26334 + asm("mov %%" #seg ",%0":"=r" (value) : : "memory")
26335
26336 static inline unsigned long get_limit(unsigned long segment)
26337 {
26338 @@ -269,6 +269,7 @@ static inline void xen_wbinvd(void)
26339 #ifdef CONFIG_X86_64
26340 #define read_cr8() (xen_read_cr8())
26341 #define write_cr8(x) (xen_write_cr8(x))
26342 +#define load_gs_index xen_load_gs_index
26343 #endif
26344
26345 /* Clear the 'TS' bit */
26346 @@ -287,13 +288,12 @@ static inline void clflush(volatile void
26347 void disable_hlt(void);
26348 void enable_hlt(void);
26349
26350 -extern int es7000_plat;
26351 void cpu_idle_wait(void);
26352
26353 extern unsigned long arch_align_stack(unsigned long sp);
26354 extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
26355
26356 -void default_idle(void);
26357 +void xen_idle(void);
26358
26359 /*
26360 * Force strict CPU ordering.
26361 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/xor_64.h 2009-03-16 16:38:05.000000000 +0100
26362 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/xor_64.h 2009-06-04 10:21:39.000000000 +0200
26363 @@ -1,3 +1,6 @@
26364 +#ifndef ASM_X86__XOR_64_H
26365 +#define ASM_X86__XOR_64_H
26366 +
26367 /*
26368 * x86-64 changes / gcc fixes from Andi Kleen.
26369 * Copyright 2002 Andi Kleen, SuSE Labs.
26370 @@ -330,3 +333,5 @@ do { \
26371 We may also be able to load into the L1 only depending on how the cpu
26372 deals with a load to a line that is being prefetched. */
26373 #define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse)
26374 +
26375 +#endif /* ASM_X86__XOR_64_H */
26376 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/irq_vectors.h 2009-03-16 16:33:40.000000000 +0100
26377 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
26378 @@ -1,126 +0,0 @@
26379 -/*
26380 - * This file should contain #defines for all of the interrupt vector
26381 - * numbers used by this architecture.
26382 - *
26383 - * In addition, there are some standard defines:
26384 - *
26385 - * FIRST_EXTERNAL_VECTOR:
26386 - * The first free place for external interrupts
26387 - *
26388 - * SYSCALL_VECTOR:
26389 - * The IRQ vector a syscall makes the user to kernel transition
26390 - * under.
26391 - *
26392 - * TIMER_IRQ:
26393 - * The IRQ number the timer interrupt comes in at.
26394 - *
26395 - * NR_IRQS:
26396 - * The total number of interrupt vectors (including all the
26397 - * architecture specific interrupts) needed.
26398 - *
26399 - */
26400 -#ifndef _ASM_IRQ_VECTORS_H
26401 -#define _ASM_IRQ_VECTORS_H
26402 -
26403 -/*
26404 - * IDT vectors usable for external interrupt sources start
26405 - * at 0x20:
26406 - */
26407 -#define FIRST_EXTERNAL_VECTOR 0x20
26408 -
26409 -#define SYSCALL_VECTOR 0x80
26410 -
26411 -/*
26412 - * Vectors 0x20-0x2f are used for ISA interrupts.
26413 - */
26414 -
26415 -#if 0
26416 -/*
26417 - * Special IRQ vectors used by the SMP architecture, 0xf0-0xff
26418 - *
26419 - * some of the following vectors are 'rare', they are merged
26420 - * into a single vector (CALL_FUNCTION_VECTOR) to save vector space.
26421 - * TLB, reschedule and local APIC vectors are performance-critical.
26422 - *
26423 - * Vectors 0xf0-0xfa are free (reserved for future Linux use).
26424 - */
26425 -#define SPURIOUS_APIC_VECTOR 0xff
26426 -#define ERROR_APIC_VECTOR 0xfe
26427 -#define INVALIDATE_TLB_VECTOR 0xfd
26428 -#define RESCHEDULE_VECTOR 0xfc
26429 -#define CALL_FUNCTION_VECTOR 0xfb
26430 -
26431 -#define THERMAL_APIC_VECTOR 0xf0
26432 -/*
26433 - * Local APIC timer IRQ vector is on a different priority level,
26434 - * to work around the 'lost local interrupt if more than 2 IRQ
26435 - * sources per level' errata.
26436 - */
26437 -#define LOCAL_TIMER_VECTOR 0xef
26438 -#endif
26439 -
26440 -#define SPURIOUS_APIC_VECTOR 0xff
26441 -#define ERROR_APIC_VECTOR 0xfe
26442 -
26443 -/*
26444 - * First APIC vector available to drivers: (vectors 0x30-0xee)
26445 - * we start at 0x31 to spread out vectors evenly between priority
26446 - * levels. (0x80 is the syscall vector)
26447 - */
26448 -#define FIRST_DEVICE_VECTOR 0x31
26449 -#define FIRST_SYSTEM_VECTOR 0xef
26450 -
26451 -/*
26452 - * 16 8259A IRQ's, 208 potential APIC interrupt sources.
26453 - * Right now the APIC is mostly only used for SMP.
26454 - * 256 vectors is an architectural limit. (we can have
26455 - * more than 256 devices theoretically, but they will
26456 - * have to use shared interrupts)
26457 - * Since vectors 0x00-0x1f are used/reserved for the CPU,
26458 - * the usable vector space is 0x20-0xff (224 vectors)
26459 - */
26460 -
26461 -#define RESCHEDULE_VECTOR 0
26462 -#define CALL_FUNCTION_VECTOR 1
26463 -#define SPIN_UNLOCK_VECTOR 2
26464 -#define NR_IPIS 3
26465 -
26466 -/*
26467 - * The maximum number of vectors supported by i386 processors
26468 - * is limited to 256. For processors other than i386, NR_VECTORS
26469 - * should be changed accordingly.
26470 - */
26471 -#define NR_VECTORS 256
26472 -
26473 -#define FPU_IRQ 13
26474 -
26475 -#define FIRST_VM86_IRQ 3
26476 -#define LAST_VM86_IRQ 15
26477 -#define invalid_vm86_irq(irq) ((irq) < 3 || (irq) > 15)
26478 -
26479 -/*
26480 - * The flat IRQ space is divided into two regions:
26481 - * 1. A one-to-one mapping of real physical IRQs. This space is only used
26482 - * if we have physical device-access privilege. This region is at the
26483 - * start of the IRQ space so that existing device drivers do not need
26484 - * to be modified to translate physical IRQ numbers into our IRQ space.
26485 - * 3. A dynamic mapping of inter-domain and Xen-sourced virtual IRQs. These
26486 - * are bound using the provided bind/unbind functions.
26487 - */
26488 -
26489 -#define PIRQ_BASE 0
26490 -#if !defined(MAX_IO_APICS)
26491 -# define NR_PIRQS (NR_VECTORS + 32 * NR_CPUS)
26492 -#elif NR_CPUS < MAX_IO_APICS
26493 -# define NR_PIRQS (NR_VECTORS + 32 * NR_CPUS)
26494 -#else
26495 -# define NR_PIRQS (NR_VECTORS + 32 * MAX_IO_APICS)
26496 -#endif
26497 -
26498 -#define DYNIRQ_BASE (PIRQ_BASE + NR_PIRQS)
26499 -#define NR_DYNIRQS 256
26500 -
26501 -#define NR_IRQS (NR_PIRQS + NR_DYNIRQS)
26502 -#define NR_IRQ_VECTORS NR_IRQS
26503 -
26504 -#endif /* _ASM_IRQ_VECTORS_H */
26505 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/setup_arch_post.h 2009-10-28 14:55:02.000000000 +0100
26506 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
26507 @@ -1,63 +0,0 @@
26508 -/**
26509 - * machine_specific_* - Hooks for machine specific setup.
26510 - *
26511 - * Description:
26512 - * This is included late in kernel/setup.c so that it can make
26513 - * use of all of the static functions.
26514 - **/
26515 -
26516 -#include <xen/interface/callback.h>
26517 -
26518 -extern void hypervisor_callback(void);
26519 -extern void failsafe_callback(void);
26520 -extern void nmi(void);
26521 -
26522 -static void __init machine_specific_arch_setup(void)
26523 -{
26524 - int ret;
26525 - static struct callback_register __initdata event = {
26526 - .type = CALLBACKTYPE_event,
26527 - .address = (unsigned long) hypervisor_callback,
26528 - };
26529 - static struct callback_register __initdata failsafe = {
26530 - .type = CALLBACKTYPE_failsafe,
26531 - .address = (unsigned long)failsafe_callback,
26532 - };
26533 - static struct callback_register __initdata syscall = {
26534 - .type = CALLBACKTYPE_syscall,
26535 - .address = (unsigned long)system_call,
26536 - };
26537 -#ifdef CONFIG_X86_LOCAL_APIC
26538 - static struct callback_register __initdata nmi_cb = {
26539 - .type = CALLBACKTYPE_nmi,
26540 - .address = (unsigned long)nmi,
26541 - };
26542 -#endif
26543 -
26544 - ret = HYPERVISOR_callback_op(CALLBACKOP_register, &event);
26545 - if (ret == 0)
26546 - ret = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe);
26547 - if (ret == 0)
26548 - ret = HYPERVISOR_callback_op(CALLBACKOP_register, &syscall);
26549 -#if CONFIG_XEN_COMPAT <= 0x030002
26550 - if (ret == -ENOSYS)
26551 - ret = HYPERVISOR_set_callbacks(
26552 - event.address,
26553 - failsafe.address,
26554 - syscall.address);
26555 -#endif
26556 - BUG_ON(ret);
26557 -
26558 -#ifdef CONFIG_X86_LOCAL_APIC
26559 - ret = HYPERVISOR_callback_op(CALLBACKOP_register, &nmi_cb);
26560 -#if CONFIG_XEN_COMPAT <= 0x030002
26561 - if (ret == -ENOSYS) {
26562 - static struct xennmi_callback __initdata cb = {
26563 - .handler_address = (unsigned long)nmi
26564 - };
26565 -
26566 - HYPERVISOR_nmi_op(XENNMI_register_callback, &cb);
26567 - }
26568 -#endif
26569 -#endif
26570 -}
26571 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/setup_arch_pre.h 2009-10-28 14:55:02.000000000 +0100
26572 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
26573 @@ -1,5 +0,0 @@
26574 -/* Hook to call BIOS initialisation function */
26575 -
26576 -#define ARCH_SETUP machine_specific_arch_setup();
26577 -
26578 -static void __init machine_specific_arch_setup(void);
26579 --- sle11-2009-10-16.orig/include/asm-x86/traps.h 2009-10-28 14:55:02.000000000 +0100
26580 +++ sle11-2009-10-16/include/asm-x86/traps.h 2009-06-04 10:21:39.000000000 +0200
26581 @@ -23,6 +23,9 @@ asmlinkage void spurious_interrupt_bug(v
26582 #ifdef CONFIG_X86_MCE
26583 asmlinkage void machine_check(void);
26584 #endif /* CONFIG_X86_MCE */
26585 +#ifdef CONFIG_X86_XEN
26586 +asmlinkage void fixup_4gb_segment(void);
26587 +#endif
26588
26589 void do_divide_error(struct pt_regs *, long);
26590 void do_overflow(struct pt_regs *, long);
26591 @@ -48,6 +51,9 @@ void math_error(void __user *);
26592 void do_coprocessor_error(struct pt_regs *, long);
26593 void do_simd_coprocessor_error(struct pt_regs *, long);
26594 void do_spurious_interrupt_bug(struct pt_regs *, long);
26595 +#ifdef CONFIG_XEN
26596 +void do_fixup_4gb_segment(struct pt_regs *, long);
26597 +#endif
26598 unsigned long patch_espfix_desc(unsigned long, unsigned long);
26599 asmlinkage void math_emulate(long);
26600
26601 --- sle11-2009-10-16.orig/include/asm-x86/xen/interface_64.h 2009-10-28 14:55:02.000000000 +0100
26602 +++ sle11-2009-10-16/include/asm-x86/xen/interface_64.h 2009-06-04 10:21:39.000000000 +0200
26603 @@ -136,7 +136,7 @@ struct cpu_user_regs {
26604 uint16_t fs, _pad5[3]; /* Non-zero => takes precedence over fs_base. */
26605 uint16_t gs, _pad6[3]; /* Non-zero => takes precedence over gs_base_usr. */
26606 };
26607 -DEFINE_GUEST_HANDLE_STRUCT(cpu_user_regs);
26608 +DEFINE_XEN_GUEST_HANDLE_STRUCT(cpu_user_regs);
26609
26610 #undef __DECL_REG
26611
26612 --- sle11-2009-10-16.orig/include/linux/page-flags.h 2009-03-16 16:38:05.000000000 +0100
26613 +++ sle11-2009-10-16/include/linux/page-flags.h 2009-06-04 10:21:39.000000000 +0200
26614 @@ -110,9 +110,11 @@ enum pageflags {
26615 /* Filesystems */
26616 PG_checked = PG_owner_priv_1,
26617
26618 +#ifdef CONFIG_PARAVIRT_XEN
26619 /* XEN */
26620 PG_pinned = PG_owner_priv_1,
26621 PG_savepinned = PG_dirty,
26622 +#endif
26623
26624 /* SLOB */
26625 PG_slob_page = PG_active,
26626 @@ -187,8 +189,12 @@ PAGEFLAG(LRU, lru) __CLEARPAGEFLAG(LRU,
26627 PAGEFLAG(Active, active) __CLEARPAGEFLAG(Active, active)
26628 __PAGEFLAG(Slab, slab)
26629 PAGEFLAG(Checked, checked) /* Used by some filesystems */
26630 +#if defined(CONFIG_XEN) || defined(CONFIG_PARAVIRT_XEN)
26631 PAGEFLAG(Pinned, pinned) TESTSCFLAG(Pinned, pinned) /* Xen */
26632 +#endif
26633 +#ifdef CONFIG_PARAVIRT_XEN
26634 PAGEFLAG(SavePinned, savepinned); /* Xen */
26635 +#endif
26636 PAGEFLAG(Reserved, reserved) __CLEARPAGEFLAG(Reserved, reserved)
26637 PAGEFLAG(Private, private) __CLEARPAGEFLAG(Private, private)
26638 __SETPAGEFLAG(Private, private)
26639 --- sle11-2009-10-16.orig/include/xen/interface/memory.h 2009-03-16 16:38:05.000000000 +0100
26640 +++ sle11-2009-10-16/include/xen/interface/memory.h 2009-06-04 10:21:39.000000000 +0200
26641 @@ -82,6 +82,7 @@ struct xen_memory_reservation {
26642 domid_t domid;
26643
26644 };
26645 +DEFINE_XEN_GUEST_HANDLE_STRUCT(xen_memory_reservation);
26646 typedef struct xen_memory_reservation xen_memory_reservation_t;
26647 DEFINE_XEN_GUEST_HANDLE(xen_memory_reservation_t);
26648
26649 @@ -167,11 +168,7 @@ struct xen_machphys_mfn_list {
26650 * any large discontiguities in the machine address space, 2MB gaps in
26651 * the machphys table will be represented by an MFN base of zero.
26652 */
26653 -#ifndef CONFIG_PARAVIRT_XEN
26654 XEN_GUEST_HANDLE(xen_pfn_t) extent_start;
26655 -#else
26656 - ulong extent_start;
26657 -#endif
26658
26659 /*
26660 * Number of extents written to the above array. This will be smaller
26661 @@ -179,6 +176,7 @@ struct xen_machphys_mfn_list {
26662 */
26663 unsigned int nr_extents;
26664 };
26665 +DEFINE_XEN_GUEST_HANDLE_STRUCT(xen_machphys_mfn_list);
26666 typedef struct xen_machphys_mfn_list xen_machphys_mfn_list_t;
26667 DEFINE_XEN_GUEST_HANDLE(xen_machphys_mfn_list_t);
26668
26669 @@ -218,6 +216,7 @@ struct xen_add_to_physmap {
26670 /* GPFN where the source mapping page should appear. */
26671 xen_pfn_t gpfn;
26672 };
26673 +DEFINE_XEN_GUEST_HANDLE_STRUCT(xen_add_to_physmap);
26674 typedef struct xen_add_to_physmap xen_add_to_physmap_t;
26675 DEFINE_XEN_GUEST_HANDLE(xen_add_to_physmap_t);
26676
26677 @@ -250,21 +249,13 @@ struct xen_translate_gpfn_list {
26678 xen_ulong_t nr_gpfns;
26679
26680 /* List of GPFNs to translate. */
26681 -#ifndef CONFIG_PARAVIRT_XEN
26682 XEN_GUEST_HANDLE(xen_pfn_t) gpfn_list;
26683 -#else
26684 - ulong gpfn_list;
26685 -#endif
26686
26687 /*
26688 * Output list to contain MFN translations. May be the same as the input
26689 * list (in which case each input GPFN is overwritten with the output MFN).
26690 */
26691 -#ifndef CONFIG_PARAVIRT_XEN
26692 XEN_GUEST_HANDLE(xen_pfn_t) mfn_list;
26693 -#else
26694 - ulong mfn_list;
26695 -#endif
26696 };
26697 DEFINE_XEN_GUEST_HANDLE_STRUCT(xen_translate_gpfn_list);
26698 typedef struct xen_translate_gpfn_list xen_translate_gpfn_list_t;
26699 --- sle11-2009-10-16.orig/kernel/hrtimer.c 2009-10-28 14:55:02.000000000 +0100
26700 +++ sle11-2009-10-16/kernel/hrtimer.c 2009-06-04 10:21:39.000000000 +0200
26701 @@ -1084,7 +1084,7 @@ ktime_t hrtimer_get_remaining(const stru
26702 }
26703 EXPORT_SYMBOL_GPL(hrtimer_get_remaining);
26704
26705 -#ifdef CONFIG_NO_HZ
26706 +#if defined(CONFIG_NO_HZ) || defined(CONFIG_NO_IDLE_HZ)
26707 /**
26708 * hrtimer_get_next_event - get the time until next expiry event
26709 *
26710 --- sle11-2009-10-16.orig/kernel/kexec.c 2009-02-17 12:38:20.000000000 +0100
26711 +++ sle11-2009-10-16/kernel/kexec.c 2009-06-04 10:21:39.000000000 +0200
26712 @@ -54,7 +54,7 @@ int dump_after_notifier;
26713 unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
26714 u32
26715 #if defined(CONFIG_XEN) && defined(CONFIG_X86)
26716 -__attribute__((__section__(".bss.page_aligned"), __aligned__(PAGE_SIZE)))
26717 +__page_aligned_bss
26718 #endif
26719 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
26720 size_t vmcoreinfo_size;
26721 --- sle11-2009-10-16.orig/kernel/timer.c 2009-10-28 14:55:02.000000000 +0100
26722 +++ sle11-2009-10-16/kernel/timer.c 2009-06-04 10:21:39.000000000 +0200
26723 @@ -884,7 +884,7 @@ static inline void __run_timers(struct t
26724 spin_unlock_irq(&base->lock);
26725 }
26726
26727 -#ifdef CONFIG_NO_HZ
26728 +#if defined(CONFIG_NO_HZ) || defined(CONFIG_NO_IDLE_HZ)
26729 /*
26730 * Find out when the next timer event is due to happen. This
26731 * is used on S/390 to stop all activity when a cpus is idle.
26732 --- sle11-2009-10-16.orig/lib/swiotlb-xen.c 2009-03-16 16:38:05.000000000 +0100
26733 +++ sle11-2009-10-16/lib/swiotlb-xen.c 2009-06-04 10:21:39.000000000 +0200
26734 @@ -750,7 +750,7 @@ swiotlb_sync_sg_for_device(struct device
26735 }
26736
26737 int
26738 -swiotlb_dma_mapping_error(dma_addr_t dma_addr)
26739 +swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr)
26740 {
26741 return (dma_addr == virt_to_bus(io_tlb_overflow_buffer));
26742 }
26743 --- sle11-2009-10-16.orig/mm/mprotect.c 2009-03-04 11:28:34.000000000 +0100
26744 +++ sle11-2009-10-16/mm/mprotect.c 2009-06-04 10:21:39.000000000 +0200
26745 @@ -92,8 +92,6 @@ static inline void change_pmd_range(stru
26746 next = pmd_addr_end(addr, end);
26747 if (pmd_none_or_clear_bad(pmd))
26748 continue;
26749 - if (arch_change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable))
26750 - continue;
26751 change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable);
26752 } while (pmd++, addr = next, addr != end);
26753 }