From: kernel.org
Subject: 2.6.26
Patch-mainline: 2.6.26

Acked-by: Jeff Mahoney <jeffm@suse.com>
Automatically created from "patches.kernel.org/patch-2.6.26" by xen-port-patches.py

8--- sle11-2009-05-14.orig/arch/x86/Kconfig 2009-03-16 16:33:40.000000000 +0100
9+++ sle11-2009-05-14/arch/x86/Kconfig 2009-03-16 16:38:05.000000000 +0100
10@@ -28,7 +28,7 @@ config X86
11 select HAVE_DYNAMIC_FTRACE
12 select HAVE_FTRACE
13 select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) && !XEN
14- select HAVE_ARCH_KGDB if !X86_VOYAGER
15+ select HAVE_ARCH_KGDB if !X86_VOYAGER && !XEN
16 select HAVE_ARCH_TRACEHOOK
17 select HAVE_GENERIC_DMA_COHERENT if X86_32
18 select HAVE_EFFICIENT_UNALIGNED_ACCESS
19@@ -486,6 +486,7 @@ config PARAVIRT_DEBUG
20
21 config MEMTEST
22 bool "Memtest"
23+ depends on !XEN
24 help
25 This option adds a kernel parameter 'memtest', which allows memtest
26 to be set.
27@@ -1007,7 +1008,7 @@ config X86_PAE
28 config DIRECT_GBPAGES
29 bool "Enable 1GB pages for kernel pagetables" if EMBEDDED
30 default y
31- depends on X86_64
32+ depends on X86_64 && !XEN
33 help
34 Allow the kernel linear mapping to use 1GB pages on CPUs that
35 support it. This can improve the kernel's performance a tiny bit by
36@@ -1349,8 +1350,7 @@ source kernel/Kconfig.hz
37
38 config KEXEC
39 bool "kexec system call"
40- depends on X86_BIOS_REBOOT
41- depends on !XEN_UNPRIVILEGED_GUEST
42+ depends on X86_BIOS_REBOOT || (XEN && !XEN_UNPRIVILEGED_GUEST)
43 help
44 kexec is a system call that implements the ability to shutdown your
45 current kernel, and to start another kernel. It is like a reboot
46@@ -1948,6 +1948,4 @@ source "crypto/Kconfig"
47
48 source "arch/x86/kvm/Kconfig"
49
50-source "drivers/xen/Kconfig"
51-
52 source "lib/Kconfig"
53--- sle11-2009-05-14.orig/arch/x86/ia32/ia32entry-xen.S 2009-03-16 16:33:40.000000000 +0100
54+++ sle11-2009-05-14/arch/x86/ia32/ia32entry-xen.S 2009-03-16 16:38:05.000000000 +0100
55@@ -129,12 +129,14 @@ sysenter_tracesys:
56 SAVE_REST
57 CLEAR_RREGS
58 movq %r9,R9(%rsp)
59- movq $-ENOSYS,RAX(%rsp) /* really needed? */
60+ movq $-ENOSYS,RAX(%rsp)/* ptrace can change this for a bad syscall */
61 movq %rsp,%rdi /* &pt_regs -> arg1 */
62 call syscall_trace_enter
63 LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */
64 RESTORE_REST
65 xchgl %ebp,%r9d
66+ cmpl $(IA32_NR_syscalls-1),%eax
67+ ja int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */
68 jmp sysenter_do_call
69 CFI_ENDPROC
70 ENDPROC(ia32_sysenter_target)
71@@ -200,13 +202,15 @@ cstar_tracesys:
72 SAVE_REST
73 CLEAR_RREGS
74 movq %r9,R9(%rsp)
75- movq $-ENOSYS,RAX(%rsp) /* really needed? */
76+ movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
77 movq %rsp,%rdi /* &pt_regs -> arg1 */
78 call syscall_trace_enter
79 LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */
80 RESTORE_REST
81 xchgl %ebp,%r9d
82 movl RSP-ARGOFFSET(%rsp), %r8d
83+ cmpl $(IA32_NR_syscalls-1),%eax
84+ ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */
85 jmp cstar_do_call
86 END(ia32_cstar_target)
87
88@@ -264,7 +268,7 @@ ENTRY(ia32_syscall)
89 jnz ia32_tracesys
90 ia32_do_syscall:
91 cmpl $(IA32_NR_syscalls-1),%eax
92- ja ia32_badsys
93+ ja int_ret_from_sys_call /* ia32_tracesys has set RAX(%rsp) */
94 IA32_ARG_FIXUP
95 call *ia32_sys_call_table(,%rax,8) # xxx: rip relative
96 ia32_sysret:
97@@ -274,7 +278,7 @@ ia32_sysret:
98 ia32_tracesys:
99 SAVE_REST
100 CLEAR_RREGS
101- movq $-ENOSYS,RAX(%rsp) /* really needed? */
102+ movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
103 movq %rsp,%rdi /* &pt_regs -> arg1 */
104 call syscall_trace_enter
105 LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */
106@@ -365,7 +369,7 @@ ia32_sys_call_table:
107 .quad sys_setuid16
108 .quad sys_getuid16
109 .quad compat_sys_stime /* stime */ /* 25 */
110- .quad sys32_ptrace /* ptrace */
111+ .quad compat_sys_ptrace /* ptrace */
112 .quad sys_alarm
113 .quad sys_fstat /* (old)fstat */
114 .quad sys_pause
115--- sle11-2009-05-14.orig/arch/x86/kernel/Makefile 2009-03-16 16:33:40.000000000 +0100
116+++ sle11-2009-05-14/arch/x86/kernel/Makefile 2009-03-16 16:38:05.000000000 +0100
117@@ -122,8 +122,7 @@ ifeq ($(CONFIG_X86_64),y)
118
119 obj-$(CONFIG_XEN) += nmi_64.o
120 time_64-$(CONFIG_XEN) += time_32.o
121- pci-dma_64-$(CONFIG_XEN) += pci-dma_32.o
122 endif
123
124-disabled-obj-$(CONFIG_XEN) := early-quirks.o hpet.o i8253.o i8259_$(BITS).o reboot.o \
125- smpboot_$(BITS).o tsc_$(BITS).o tsc_sync.o
126+disabled-obj-$(CONFIG_XEN) := crash.o early-quirks.o hpet.o i8253.o i8259_$(BITS).o \
127+ pci-swiotlb_64.o reboot.o smpboot.o tlb_$(BITS).o tsc_$(BITS).o tsc_sync.o vsmp_64.o
128--- sle11-2009-05-14.orig/arch/x86/kernel/acpi/Makefile 2008-12-01 11:11:08.000000000 +0100
129+++ sle11-2009-05-14/arch/x86/kernel/acpi/Makefile 2009-03-16 16:38:05.000000000 +0100
130@@ -15,4 +15,4 @@ $(obj)/wakeup_rm.o: $(obj)/realmode/w
131 $(obj)/realmode/wakeup.bin: FORCE
132 $(Q)$(MAKE) $(build)=$(obj)/realmode
133
134-disabled-obj-$(CONFIG_XEN) := cstate.o wakeup_$(BITS).o
135+disabled-obj-$(CONFIG_XEN) := cstate.o wakeup_%.o
136--- sle11-2009-05-14.orig/arch/x86/kernel/acpi/boot.c 2009-03-16 16:33:40.000000000 +0100
137+++ sle11-2009-05-14/arch/x86/kernel/acpi/boot.c 2009-03-16 16:38:05.000000000 +0100
138@@ -251,19 +251,23 @@ static int __init acpi_parse_madt(struct
139
140 static void __cpuinit acpi_register_lapic(int id, u8 enabled)
141 {
142+#ifndef CONFIG_XEN
143 unsigned int ver = 0;
144+#endif
145
146 if (!enabled) {
147 ++disabled_cpus;
148 return;
149 }
150
151+#ifndef CONFIG_XEN
152 #ifdef CONFIG_X86_32
153 if (boot_cpu_physical_apicid != -1U)
154 ver = apic_version[boot_cpu_physical_apicid];
155 #endif
156
157 generic_processor_info(id, ver);
158+#endif
159 }
160
161 static int __init
162@@ -774,6 +778,7 @@ static int __init acpi_parse_fadt(struct
163 * returns 0 on success, < 0 on error
164 */
165
166+#ifndef CONFIG_XEN
167 static void __init acpi_register_lapic_address(unsigned long address)
168 {
169 mp_lapic_addr = address;
170@@ -787,6 +792,9 @@ static void __init acpi_register_lapic_a
171 #endif
172 }
173 }
174+#else
175+#define acpi_register_lapic_address(address)
176+#endif
177
178 static int __init early_acpi_parse_madt_lapic_addr_ovr(void)
179 {
180--- sle11-2009-05-14.orig/arch/x86/kernel/acpi/sleep-xen.c 2009-03-16 16:33:40.000000000 +0100
181+++ sle11-2009-05-14/arch/x86/kernel/acpi/sleep-xen.c 2009-03-16 16:38:05.000000000 +0100
182@@ -10,15 +10,19 @@
183 #include <linux/dmi.h>
184 #include <linux/cpumask.h>
185
186-#include <asm/smp.h>
187+#include "realmode/wakeup.h"
188+#include "sleep.h"
189
190 #ifndef CONFIG_ACPI_PV_SLEEP
191-/* address in low memory of the wakeup routine. */
192-unsigned long acpi_wakeup_address = 0;
193+unsigned long acpi_wakeup_address;
194 unsigned long acpi_realmode_flags;
195-extern char wakeup_start, wakeup_end;
196
197-extern unsigned long acpi_copy_wakeup_routine(unsigned long);
198+/* address in low memory of the wakeup routine. */
199+static unsigned long acpi_realmode;
200+
201+#ifdef CONFIG_64BIT
202+static char temp_stack[10240];
203+#endif
204 #endif
205
206 /**
207@@ -26,17 +30,69 @@ extern unsigned long acpi_copy_wakeup_ro
208 *
209 * Create an identity mapped page table and copy the wakeup routine to
210 * low memory.
211+ *
212+ * Note that this is too late to change acpi_wakeup_address.
213 */
214 int acpi_save_state_mem(void)
215 {
216 #ifndef CONFIG_ACPI_PV_SLEEP
217- if (!acpi_wakeup_address) {
218- printk(KERN_ERR "Could not allocate memory during boot, S3 disabled\n");
219+ struct wakeup_header *header;
220+
221+ if (!acpi_realmode) {
222+ printk(KERN_ERR "Could not allocate memory during boot, "
223+ "S3 disabled\n");
224 return -ENOMEM;
225 }
226- memcpy((void *)acpi_wakeup_address, &wakeup_start,
227- &wakeup_end - &wakeup_start);
228- acpi_copy_wakeup_routine(acpi_wakeup_address);
229+ memcpy((void *)acpi_realmode, &wakeup_code_start, WAKEUP_SIZE);
230+
231+ header = (struct wakeup_header *)(acpi_realmode + HEADER_OFFSET);
232+ if (header->signature != 0x51ee1111) {
233+ printk(KERN_ERR "wakeup header does not match\n");
234+ return -EINVAL;
235+ }
236+
237+ header->video_mode = saved_video_mode;
238+
239+ header->wakeup_jmp_seg = acpi_wakeup_address >> 4;
240+ /* GDT[0]: GDT self-pointer */
241+ header->wakeup_gdt[0] =
242+ (u64)(sizeof(header->wakeup_gdt) - 1) +
243+ ((u64)(acpi_wakeup_address +
244+ ((char *)&header->wakeup_gdt - (char *)acpi_realmode))
245+ << 16);
246+ /* GDT[1]: real-mode-like code segment */
247+ header->wakeup_gdt[1] = (0x009bULL << 40) +
248+ ((u64)acpi_wakeup_address << 16) + 0xffff;
249+ /* GDT[2]: real-mode-like data segment */
250+ header->wakeup_gdt[2] = (0x0093ULL << 40) +
251+ ((u64)acpi_wakeup_address << 16) + 0xffff;
252+
253+#ifndef CONFIG_64BIT
254+ store_gdt((struct desc_ptr *)&header->pmode_gdt);
255+
256+ header->pmode_efer_low = nx_enabled;
257+ if (header->pmode_efer_low & 1) {
258+ /* This is strange, why not save efer, always? */
259+ rdmsr(MSR_EFER, header->pmode_efer_low,
260+ header->pmode_efer_high);
261+ }
262+#endif /* !CONFIG_64BIT */
263+
264+ header->pmode_cr0 = read_cr0();
265+ header->pmode_cr4 = read_cr4();
266+ header->realmode_flags = acpi_realmode_flags;
267+ header->real_magic = 0x12345678;
268+
269+#ifndef CONFIG_64BIT
270+ header->pmode_entry = (u32)&wakeup_pmode_return;
271+ header->pmode_cr3 = (u32)(swsusp_pg_dir - __PAGE_OFFSET);
272+ saved_magic = 0x12345678;
273+#else /* CONFIG_64BIT */
274+ header->trampoline_segment = setup_trampoline() >> 4;
275+ init_rsp = (unsigned long)temp_stack + 4096;
276+ initial_code = (unsigned long)wakeup_long64;
277+ saved_magic = 0x123456789abcdef0;
278+#endif /* CONFIG_64BIT */
279 #endif
280
281 return 0;
282@@ -61,15 +117,20 @@ void acpi_restore_state_mem(void)
283 void __init acpi_reserve_bootmem(void)
284 {
285 #ifndef CONFIG_ACPI_PV_SLEEP
286- if ((&wakeup_end - &wakeup_start) > PAGE_SIZE*2) {
287+ if ((&wakeup_code_end - &wakeup_code_start) > WAKEUP_SIZE) {
288 printk(KERN_ERR
289 "ACPI: Wakeup code way too big, S3 disabled.\n");
290 return;
291 }
292
293- acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE*2);
294- if (!acpi_wakeup_address)
295+ acpi_realmode = (unsigned long)alloc_bootmem_low(WAKEUP_SIZE);
296+
297+ if (!acpi_realmode) {
298 printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n");
299+ return;
300+ }
301+
302+ acpi_wakeup_address = virt_to_phys((void *)acpi_realmode);
303 #endif
304 }
305
306--- sle11-2009-05-14.orig/arch/x86/kernel/cpu/common-xen.c 2009-03-16 16:33:40.000000000 +0100
307+++ sle11-2009-05-14/arch/x86/kernel/cpu/common-xen.c 2009-03-16 16:38:05.000000000 +0100
308@@ -5,7 +5,6 @@
309 #include <linux/module.h>
310 #include <linux/percpu.h>
311 #include <linux/bootmem.h>
312-#include <asm/semaphore.h>
313 #include <asm/processor.h>
314 #include <asm/i387.h>
315 #include <asm/msr.h>
316@@ -13,6 +12,7 @@
317 #include <asm/mmu_context.h>
318 #include <asm/mtrr.h>
319 #include <asm/mce.h>
320+#include <asm/pat.h>
321 #ifdef CONFIG_X86_LOCAL_APIC
322 #include <asm/mpspec.h>
323 #include <asm/apic.h>
324@@ -69,9 +69,9 @@ __u32 cleared_cpu_caps[NCAPINTS] __cpuin
325 static int cachesize_override __cpuinitdata = -1;
326 static int disable_x86_serial_nr __cpuinitdata = 1;
327
328-struct cpu_dev * cpu_devs[X86_VENDOR_NUM] = {};
329+struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
330
331-static void __cpuinit default_init(struct cpuinfo_x86 * c)
332+static void __cpuinit default_init(struct cpuinfo_x86 *c)
333 {
334 /* Not much we can do here... */
335 /* Check if at least it has cpuid */
336@@ -88,11 +88,11 @@ static struct cpu_dev __cpuinitdata defa
337 .c_init = default_init,
338 .c_vendor = "Unknown",
339 };
340-static struct cpu_dev * this_cpu __cpuinitdata = &default_cpu;
341+static struct cpu_dev *this_cpu __cpuinitdata = &default_cpu;
342
343 static int __init cachesize_setup(char *str)
344 {
345- get_option (&str, &cachesize_override);
346+ get_option(&str, &cachesize_override);
347 return 1;
348 }
349 __setup("cachesize=", cachesize_setup);
350@@ -114,12 +114,12 @@ int __cpuinit get_model_name(struct cpui
351 /* Intel chips right-justify this string for some dumb reason;
352 undo that brain damage */
353 p = q = &c->x86_model_id[0];
354- while ( *p == ' ' )
355+ while (*p == ' ')
356 p++;
357- if ( p != q ) {
358- while ( *p )
359+ if (p != q) {
360+ while (*p)
361 *q++ = *p++;
362- while ( q <= &c->x86_model_id[48] )
363+ while (q <= &c->x86_model_id[48])
364 *q++ = '\0'; /* Zero-pad the rest */
365 }
366
367@@ -137,7 +137,7 @@ void __cpuinit display_cacheinfo(struct
368 cpuid(0x80000005, &dummy, &dummy, &ecx, &edx);
369 printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
370 edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
371- c->x86_cache_size=(ecx>>24)+(edx>>24);
372+ c->x86_cache_size = (ecx>>24)+(edx>>24);
373 }
374
375 if (n < 0x80000006) /* Some chips just has a large L1. */
376@@ -145,16 +145,16 @@ void __cpuinit display_cacheinfo(struct
377
378 ecx = cpuid_ecx(0x80000006);
379 l2size = ecx >> 16;
380-
381+
382 /* do processor-specific cache resizing */
383 if (this_cpu->c_size_cache)
384- l2size = this_cpu->c_size_cache(c,l2size);
385+ l2size = this_cpu->c_size_cache(c, l2size);
386
387 /* Allow user to override all this if necessary. */
388 if (cachesize_override != -1)
389 l2size = cachesize_override;
390
391- if ( l2size == 0 )
392+ if (l2size == 0)
393 return; /* Again, no L2 cache is possible */
394
395 c->x86_cache_size = l2size;
396@@ -163,16 +163,19 @@ void __cpuinit display_cacheinfo(struct
397 l2size, ecx & 0xFF);
398 }
399
400-/* Naming convention should be: <Name> [(<Codename>)] */
401-/* This table only is used unless init_<vendor>() below doesn't set it; */
402-/* in particular, if CPUID levels 0x80000002..4 are supported, this isn't used */
403+/*
404+ * Naming convention should be: <Name> [(<Codename>)]
405+ * This table only is used unless init_<vendor>() below doesn't set it;
406+ * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used
407+ *
408+ */
409
410 /* Look up CPU names by table lookup. */
411 static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c)
412 {
413 struct cpu_model_info *info;
414
415- if ( c->x86_model >= 16 )
416+ if (c->x86_model >= 16)
417 return NULL; /* Range check */
418
419 if (!this_cpu)
420@@ -197,9 +200,9 @@ static void __cpuinit get_cpu_vendor(str
421
422 for (i = 0; i < X86_VENDOR_NUM; i++) {
423 if (cpu_devs[i]) {
424- if (!strcmp(v,cpu_devs[i]->c_ident[0]) ||
425- (cpu_devs[i]->c_ident[1] &&
426- !strcmp(v,cpu_devs[i]->c_ident[1]))) {
427+ if (!strcmp(v, cpu_devs[i]->c_ident[0]) ||
428+ (cpu_devs[i]->c_ident[1] &&
429+ !strcmp(v, cpu_devs[i]->c_ident[1]))) {
430 c->x86_vendor = i;
431 if (!early)
432 this_cpu = cpu_devs[i];
433@@ -217,7 +220,7 @@ static void __cpuinit get_cpu_vendor(str
434 }
435
436
437-static int __init x86_fxsr_setup(char * s)
438+static int __init x86_fxsr_setup(char *s)
439 {
440 setup_clear_cpu_cap(X86_FEATURE_FXSR);
441 setup_clear_cpu_cap(X86_FEATURE_XMM);
442@@ -226,7 +229,7 @@ static int __init x86_fxsr_setup(char *
443 __setup("nofxsr", x86_fxsr_setup);
444
445
446-static int __init x86_sep_setup(char * s)
447+static int __init x86_sep_setup(char *s)
448 {
449 setup_clear_cpu_cap(X86_FEATURE_SEP);
450 return 1;
451@@ -315,12 +318,15 @@ static void __cpuinit early_get_cap(stru
452
453 }
454
455-/* Do minimum CPU detection early.
456- Fields really needed: vendor, cpuid_level, family, model, mask, cache alignment.
457- The others are not touched to avoid unwanted side effects.
458-
459- WARNING: this function is only called on the BP. Don't add code here
460- that is supposed to run on all CPUs. */
461+/*
462+ * Do minimum CPU detection early.
463+ * Fields really needed: vendor, cpuid_level, family, model, mask,
464+ * cache alignment.
465+ * The others are not touched to avoid unwanted side effects.
466+ *
467+ * WARNING: this function is only called on the BP. Don't add code here
468+ * that is supposed to run on all CPUs.
469+ */
470 static void __init early_cpu_detect(void)
471 {
472 struct cpuinfo_x86 *c = &boot_cpu_data;
473@@ -335,19 +341,14 @@ static void __init early_cpu_detect(void
474
475 get_cpu_vendor(c, 1);
476
477- switch (c->x86_vendor) {
478- case X86_VENDOR_AMD:
479- early_init_amd(c);
480- break;
481- case X86_VENDOR_INTEL:
482- early_init_intel(c);
483- break;
484- }
485+ if (c->x86_vendor != X86_VENDOR_UNKNOWN &&
486+ cpu_devs[c->x86_vendor]->c_early_init)
487+ cpu_devs[c->x86_vendor]->c_early_init(c);
488
489 early_get_cap(c);
490 }
491
492-static void __cpuinit generic_identify(struct cpuinfo_x86 * c)
493+static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
494 {
495 u32 tfms, xlvl;
496 unsigned int ebx;
497@@ -358,13 +359,12 @@ static void __cpuinit generic_identify(s
498 (unsigned int *)&c->x86_vendor_id[0],
499 (unsigned int *)&c->x86_vendor_id[8],
500 (unsigned int *)&c->x86_vendor_id[4]);
501-
502+
503 get_cpu_vendor(c, 0);
504 /* Initialize the standard set of capabilities */
505 /* Note that the vendor-specific code below might override */
506-
507 /* Intel-defined flags: level 0x00000001 */
508- if ( c->cpuid_level >= 0x00000001 ) {
509+ if (c->cpuid_level >= 0x00000001) {
510 u32 capability, excap;
511 cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
512 c->x86_capability[0] = capability;
513@@ -376,12 +376,14 @@ static void __cpuinit generic_identify(s
514 if (c->x86 >= 0x6)
515 c->x86_model += ((tfms >> 16) & 0xF) << 4;
516 c->x86_mask = tfms & 15;
517+ c->initial_apicid = (ebx >> 24) & 0xFF;
518 #ifdef CONFIG_X86_HT
519- c->apicid = phys_pkg_id((ebx >> 24) & 0xFF, 0);
520+ c->apicid = phys_pkg_id(c->initial_apicid, 0);
521+ c->phys_proc_id = c->initial_apicid;
522 #else
523- c->apicid = (ebx >> 24) & 0xFF;
524+ c->apicid = c->initial_apicid;
525 #endif
526- if (c->x86_capability[0] & (1<<19))
527+ if (test_cpu_cap(c, X86_FEATURE_CLFLSH))
528 c->x86_clflush_size = ((ebx >> 8) & 0xff) * 8;
529 } else {
530 /* Have CPUID level 0 only - unheard of */
531@@ -390,33 +392,30 @@ static void __cpuinit generic_identify(s
532
533 /* AMD-defined flags: level 0x80000001 */
534 xlvl = cpuid_eax(0x80000000);
535- if ( (xlvl & 0xffff0000) == 0x80000000 ) {
536- if ( xlvl >= 0x80000001 ) {
537+ if ((xlvl & 0xffff0000) == 0x80000000) {
538+ if (xlvl >= 0x80000001) {
539 c->x86_capability[1] = cpuid_edx(0x80000001);
540 c->x86_capability[6] = cpuid_ecx(0x80000001);
541 }
542- if ( xlvl >= 0x80000004 )
543+ if (xlvl >= 0x80000004)
544 get_model_name(c); /* Default name */
545 }
546
547 init_scattered_cpuid_features(c);
548 }
549
550-#ifdef CONFIG_X86_HT
551- c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff;
552-#endif
553 }
554
555 static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
556 {
557- if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr ) {
558+ if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr) {
559 /* Disable processor serial number */
560- unsigned long lo,hi;
561- rdmsr(MSR_IA32_BBL_CR_CTL,lo,hi);
562+ unsigned long lo, hi;
563+ rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
564 lo |= 0x200000;
565- wrmsr(MSR_IA32_BBL_CR_CTL,lo,hi);
566+ wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
567 printk(KERN_NOTICE "CPU serial number disabled.\n");
568- clear_bit(X86_FEATURE_PN, c->x86_capability);
569+ clear_cpu_cap(c, X86_FEATURE_PN);
570
571 /* Disabling the serial number may affect the cpuid level */
572 c->cpuid_level = cpuid_eax(0);
573@@ -451,9 +450,11 @@ void __cpuinit identify_cpu(struct cpuin
574 memset(&c->x86_capability, 0, sizeof c->x86_capability);
575
576 if (!have_cpuid_p()) {
577- /* First of all, decide if this is a 486 or higher */
578- /* It's a 486 if we can modify the AC flag */
579- if ( flag_is_changeable_p(X86_EFLAGS_AC) )
580+ /*
581+ * First of all, decide if this is a 486 or higher
582+ * It's a 486 if we can modify the AC flag
583+ */
584+ if (flag_is_changeable_p(X86_EFLAGS_AC))
585 c->x86 = 4;
586 else
587 c->x86 = 3;
588@@ -486,10 +487,10 @@ void __cpuinit identify_cpu(struct cpuin
589 */
590
591 /* If the model name is still unset, do table lookup. */
592- if ( !c->x86_model_id[0] ) {
593+ if (!c->x86_model_id[0]) {
594 char *p;
595 p = table_lookup_model(c);
596- if ( p )
597+ if (p)
598 strcpy(c->x86_model_id, p);
599 else
600 /* Last resort... */
601@@ -503,9 +504,9 @@ void __cpuinit identify_cpu(struct cpuin
602 * common between the CPUs. The first time this routine gets
603 * executed, c == &boot_cpu_data.
604 */
605- if ( c != &boot_cpu_data ) {
606+ if (c != &boot_cpu_data) {
607 /* AND the already accumulated flags with these */
608- for ( i = 0 ; i < NCAPINTS ; i++ )
609+ for (i = 0 ; i < NCAPINTS ; i++)
610 boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
611 }
612
613@@ -549,7 +550,7 @@ void __cpuinit detect_ht(struct cpuinfo_
614
615 if (smp_num_siblings == 1) {
616 printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
617- } else if (smp_num_siblings > 1 ) {
618+ } else if (smp_num_siblings > 1) {
619
620 if (smp_num_siblings > NR_CPUS) {
621 printk(KERN_WARNING "CPU: Unsupported number of the "
622@@ -559,7 +560,7 @@ void __cpuinit detect_ht(struct cpuinfo_
623 }
624
625 index_msb = get_count_order(smp_num_siblings);
626- c->phys_proc_id = phys_pkg_id((ebx >> 24) & 0xFF, index_msb);
627+ c->phys_proc_id = phys_pkg_id(c->initial_apicid, index_msb);
628
629 printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
630 c->phys_proc_id);
631@@ -570,7 +571,7 @@ void __cpuinit detect_ht(struct cpuinfo_
632
633 core_bits = get_count_order(c->x86_max_cores);
634
635- c->cpu_core_id = phys_pkg_id((ebx >> 24) & 0xFF, index_msb) &
636+ c->cpu_core_id = phys_pkg_id(c->initial_apicid, index_msb) &
637 ((1 << core_bits) - 1);
638
639 if (c->x86_max_cores > 1)
640@@ -604,7 +605,7 @@ void __cpuinit print_cpu_info(struct cpu
641 else
642 printk("%s", c->x86_model_id);
643
644- if (c->x86_mask || c->cpuid_level >= 0)
645+ if (c->x86_mask || c->cpuid_level >= 0)
646 printk(" stepping %02x\n", c->x86_mask);
647 else
648 printk("\n");
649@@ -623,24 +624,17 @@ __setup("clearcpuid=", setup_disablecpui
650
651 cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
652
653-/* This is hacky. :)
654- * We're emulating future behavior.
655- * In the future, the cpu-specific init functions will be called implicitly
656- * via the magic of initcalls.
657- * They will insert themselves into the cpu_devs structure.
658- * Then, when cpu_init() is called, we can just iterate over that array.
659- */
660 void __init early_cpu_init(void)
661 {
662- intel_cpu_init();
663- cyrix_init_cpu();
664- nsc_init_cpu();
665- amd_init_cpu();
666- centaur_init_cpu();
667- transmeta_init_cpu();
668- nexgen_init_cpu();
669- umc_init_cpu();
670+ struct cpu_vendor_dev *cvdev;
671+
672+ for (cvdev = __x86cpuvendor_start ;
673+ cvdev < __x86cpuvendor_end ;
674+ cvdev++)
675+ cpu_devs[cvdev->vendor] = cvdev->cpu_dev;
676+
677 early_cpu_detect();
678+ validate_pat_support(&boot_cpu_data);
679 }
680
681 /* Make sure %fs is initialized properly in idle threads */
682@@ -685,7 +679,7 @@ void __cpuinit cpu_init(void)
683 int cpu = smp_processor_id();
684 struct task_struct *curr = current;
685 #ifndef CONFIG_X86_NO_TSS
686- struct tss_struct * t = &per_cpu(init_tss, cpu);
687+ struct tss_struct *t = &per_cpu(init_tss, cpu);
688 #endif
689 struct thread_struct *thread = &curr->thread;
690
691@@ -738,7 +732,7 @@ void __cpuinit cpu_init(void)
692 mxcsr_feature_mask_init();
693 }
694
695-#ifdef CONFIG_HOTPLUG_CPU
696+#if defined(CONFIG_HOTPLUG_CPU) && !defined(CONFIG_XEN)
697 void __cpuinit cpu_uninit(void)
698 {
699 int cpu = raw_smp_processor_id();
700--- sle11-2009-05-14.orig/arch/x86/kernel/cpu/mtrr/main-xen.c 2009-03-16 16:33:40.000000000 +0100
701+++ sle11-2009-05-14/arch/x86/kernel/cpu/mtrr/main-xen.c 2009-03-16 16:38:05.000000000 +0100
702@@ -35,6 +35,8 @@ struct mtrr_ops *mtrr_if = &generic_mtrr
703 unsigned int num_var_ranges;
704 unsigned int mtrr_usage_table[MAX_VAR_RANGES];
705
706+static u64 tom2;
707+
708 static void __init set_num_var_ranges(void)
709 {
710 struct xen_platform_op op;
711@@ -162,8 +164,144 @@ mtrr_del(int reg, unsigned long base, un
712 EXPORT_SYMBOL(mtrr_add);
713 EXPORT_SYMBOL(mtrr_del);
714
715+/*
716+ * Returns the effective MTRR type for the region
717+ * Error returns:
718+ * - 0xFE - when the range is "not entirely covered" by _any_ var range MTRR
719+ * - 0xFF - when MTRR is not enabled
720+ */
721+u8 mtrr_type_lookup(u64 start, u64 end)
722+{
723+ int i, error;
724+ u64 start_mfn, end_mfn, base_mfn, top_mfn;
725+ u8 prev_match, curr_match;
726+ struct xen_platform_op op;
727+
728+ if (!is_initial_xendomain())
729+ return MTRR_TYPE_WRBACK;
730+
731+ if (!num_var_ranges)
732+ return 0xFF;
733+
734+ start_mfn = start >> PAGE_SHIFT;
735+ /* Make end inclusive end, instead of exclusive */
736+ end_mfn = --end >> PAGE_SHIFT;
737+
738+ /* Look in fixed ranges. Just return the type as per start */
739+ if (start_mfn < 0x100) {
740+#if 0//todo
741+ op.cmd = XENPF_read_memtype;
742+ op.u.read_memtype.reg = ???;
743+ error = HYPERVISOR_platform_op(&op);
744+ if (!error)
745+ return op.u.read_memtype.type;
746+#endif
747+ return MTRR_TYPE_UNCACHABLE;
748+ }
749+
750+ /*
751+ * Look in variable ranges
752+ * Look of multiple ranges matching this address and pick type
753+ * as per MTRR precedence
754+ */
755+ prev_match = 0xFF;
756+ for (i = 0; i < num_var_ranges; ++i) {
757+ op.cmd = XENPF_read_memtype;
758+ op.u.read_memtype.reg = i;
759+ error = HYPERVISOR_platform_op(&op);
760+
761+ if (error || !op.u.read_memtype.nr_mfns)
762+ continue;
763+
764+ base_mfn = op.u.read_memtype.mfn;
765+ top_mfn = base_mfn + op.u.read_memtype.nr_mfns - 1;
766+
767+ if (base_mfn > end_mfn || start_mfn > top_mfn) {
768+ continue;
769+ }
770+
771+ if (base_mfn > start_mfn || end_mfn > top_mfn) {
772+ return 0xFE;
773+ }
774+
775+ curr_match = op.u.read_memtype.type;
776+ if (prev_match == 0xFF) {
777+ prev_match = curr_match;
778+ continue;
779+ }
780+
781+ if (prev_match == MTRR_TYPE_UNCACHABLE ||
782+ curr_match == MTRR_TYPE_UNCACHABLE) {
783+ return MTRR_TYPE_UNCACHABLE;
784+ }
785+
786+ if ((prev_match == MTRR_TYPE_WRBACK &&
787+ curr_match == MTRR_TYPE_WRTHROUGH) ||
788+ (prev_match == MTRR_TYPE_WRTHROUGH &&
789+ curr_match == MTRR_TYPE_WRBACK)) {
790+ prev_match = MTRR_TYPE_WRTHROUGH;
791+ curr_match = MTRR_TYPE_WRTHROUGH;
792+ }
793+
794+ if (prev_match != curr_match) {
795+ return MTRR_TYPE_UNCACHABLE;
796+ }
797+ }
798+
799+ if (tom2) {
800+ if (start >= (1ULL<<32) && (end < tom2))
801+ return MTRR_TYPE_WRBACK;
802+ }
803+
804+ if (prev_match != 0xFF)
805+ return prev_match;
806+
807+#if 0//todo
808+ op.cmd = XENPF_read_def_memtype;
809+ error = HYPERVISOR_platform_op(&op);
810+ if (!error)
811+ return op.u.read_def_memtype.type;
812+#endif
813+ return MTRR_TYPE_UNCACHABLE;
814+}
815+
816+/*
817+ * Newer AMD K8s and later CPUs have a special magic MSR way to force WB
818+ * for memory >4GB. Check for that here.
819+ * Note this won't check if the MTRRs < 4GB where the magic bit doesn't
820+ * apply to are wrong, but so far we don't know of any such case in the wild.
821+ */
822+#define Tom2Enabled (1U << 21)
823+#define Tom2ForceMemTypeWB (1U << 22)
824+
825+int __init amd_special_default_mtrr(void)
826+{
827+ u32 l, h;
828+
829+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
830+ return 0;
831+ if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11)
832+ return 0;
833+ /* In case some hypervisor doesn't pass SYSCFG through */
834+ if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0)
835+ return 0;
836+ /*
837+ * Memory between 4GB and top of mem is forced WB by this magic bit.
838+ * Reserved before K8RevF, but should be zero there.
839+ */
840+ if ((l & (Tom2Enabled | Tom2ForceMemTypeWB)) ==
841+ (Tom2Enabled | Tom2ForceMemTypeWB))
842+ return 1;
843+ return 0;
844+}
845+
846 void __init mtrr_bp_init(void)
847 {
848+ if (amd_special_default_mtrr()) {
849+ /* TOP_MEM2 */
850+ rdmsrl(MSR_K8_TOP_MEM2, tom2);
851+ tom2 &= 0xffffff8000000ULL;
852+ }
853 }
854
855 void mtrr_ap_init(void)
856--- sle11-2009-05-14.orig/arch/x86/kernel/e820_32-xen.c 2009-03-16 16:33:40.000000000 +0100
857+++ sle11-2009-05-14/arch/x86/kernel/e820_32-xen.c 2009-03-16 16:38:05.000000000 +0100
858@@ -469,7 +469,7 @@ int __init sanitize_e820_map(struct e820
859 * thinkpad 560x, for example, does not cooperate with the memory
860 * detection code.)
861 */
862-int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
863+int __init copy_e820_map(struct e820entry *biosmap, int nr_map)
864 {
865 #ifndef CONFIG_XEN
866 /* Only one memory region (or negative)? Ignore it */
867@@ -480,33 +480,17 @@ int __init copy_e820_map(struct e820entr
868 #endif
869
870 do {
871- unsigned long long start = biosmap->addr;
872- unsigned long long size = biosmap->size;
873- unsigned long long end = start + size;
874- unsigned long type = biosmap->type;
875+ u64 start = biosmap->addr;
876+ u64 size = biosmap->size;
877+ u64 end = start + size;
878+ u32 type = biosmap->type;
879
880 /* Overflow in 64 bits? Ignore the memory map. */
881 if (start > end)
882 return -1;
883
884-#ifndef CONFIG_XEN
885- /*
886- * Some BIOSes claim RAM in the 640k - 1M region.
887- * Not right. Fix it up.
888- */
889- if (type == E820_RAM) {
890- if (start < 0x100000ULL && end > 0xA0000ULL) {
891- if (start < 0xA0000ULL)
892- add_memory_region(start, 0xA0000ULL-start, type);
893- if (end <= 0x100000ULL)
894- continue;
895- start = 0x100000ULL;
896- size = end - start;
897- }
898- }
899-#endif
900 add_memory_region(start, size, type);
901- } while (biosmap++,--nr_map);
902+ } while (biosmap++, --nr_map);
903
904 #ifdef CONFIG_XEN
905 if (is_initial_xendomain()) {
906@@ -528,7 +512,7 @@ int __init copy_e820_map(struct e820entr
907 /*
908 * Find the highest page frame number we have available
909 */
910-void __init find_max_pfn(void)
911+void __init propagate_e820_map(void)
912 {
913 int i;
914
915@@ -814,7 +798,7 @@ static int __init parse_memmap(char *arg
916 * size before original memory map is
917 * reset.
918 */
919- find_max_pfn();
920+ propagate_e820_map();
921 saved_max_pfn = max_pfn;
922 #endif
923 e820.nr_map = 0;
924--- sle11-2009-05-14.orig/arch/x86/kernel/e820_64-xen.c 2009-03-16 16:33:40.000000000 +0100
925+++ sle11-2009-05-14/arch/x86/kernel/e820_64-xen.c 2009-03-16 16:38:05.000000000 +0100
926@@ -40,11 +40,11 @@ struct e820map machine_e820;
927 unsigned long end_pfn;
928
929 /*
930- * end_pfn only includes RAM, while end_pfn_map includes all e820 entries.
931- * The direct mapping extends to end_pfn_map, so that we can directly access
932+ * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
933+ * The direct mapping extends to max_pfn_mapped, so that we can directly access
934 * apertures, ACPI and other tables without having to play with fixmaps.
935 */
936-unsigned long end_pfn_map;
937+unsigned long max_pfn_mapped;
938
939 /*
940 * Last pfn which the user wants to use.
941@@ -63,8 +63,8 @@ struct early_res {
942 static struct early_res early_res[MAX_EARLY_RES] __initdata = {
943 #ifndef CONFIG_XEN
944 { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */
945-#ifdef CONFIG_SMP
946- { SMP_TRAMPOLINE_BASE, SMP_TRAMPOLINE_BASE + 2*PAGE_SIZE, "SMP_TRAMPOLINE" },
947+#ifdef CONFIG_X86_TRAMPOLINE
948+ { TRAMPOLINE_BASE, TRAMPOLINE_BASE + 2 * PAGE_SIZE, "TRAMPOLINE" },
949 #endif
950 #endif
951 {}
952@@ -89,19 +89,47 @@ void __init reserve_early(unsigned long
953 strncpy(r->name, name, sizeof(r->name) - 1);
954 }
955
956-void __init early_res_to_bootmem(void)
957+void __init free_early(unsigned long start, unsigned long end)
958+{
959+ struct early_res *r;
960+ int i, j;
961+
962+ for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
963+ r = &early_res[i];
964+ if (start == r->start && end == r->end)
965+ break;
966+ }
967+ if (i >= MAX_EARLY_RES || !early_res[i].end)
968+ panic("free_early on not reserved area: %lx-%lx!", start, end);
969+
970+ for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++)
971+ ;
972+
973+ memmove(&early_res[i], &early_res[i + 1],
974+ (j - 1 - i) * sizeof(struct early_res));
975+
976+ early_res[j - 1].end = 0;
977+}
978+
979+void __init early_res_to_bootmem(unsigned long start, unsigned long end)
980 {
981 int i;
982+ unsigned long final_start, final_end;
983 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
984 struct early_res *r = &early_res[i];
985- printk(KERN_INFO "early res: %d [%lx-%lx] %s\n", i,
986- r->start, r->end - 1, r->name);
987- reserve_bootmem_generic(r->start, r->end - r->start);
988+ final_start = max(start, r->start);
989+ final_end = min(end, r->end);
990+ if (final_start >= final_end)
991+ continue;
992+ printk(KERN_INFO " early res: %d [%lx-%lx] %s\n", i,
993+ final_start, final_end - 1, r->name);
994+ reserve_bootmem_generic(final_start, final_end - final_start);
995 }
996 }
997
998 /* Check for already reserved areas */
999-static inline int bad_addr(unsigned long *addrp, unsigned long size)
1000+static inline int __init
1001+bad_addr(unsigned long *addrp, unsigned long size, unsigned long align)
1002 {
1003 int i;
1004 unsigned long addr = *addrp, last;
1005@@ -111,7 +139,7 @@ again:
1006 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
1007 struct early_res *r = &early_res[i];
1008 if (last >= r->start && addr < r->end) {
1009- *addrp = addr = r->end;
1010+ *addrp = addr = round_up(r->end, align);
1011 changed = 1;
1012 goto again;
1013 }
1014@@ -119,6 +147,40 @@ again:
1015 return changed;
1016 }
1017
1018+/* Check for already reserved areas */
1019+static inline int __init
1020+bad_addr_size(unsigned long *addrp, unsigned long *sizep, unsigned long align)
1021+{
1022+ int i;
1023+ unsigned long addr = *addrp, last;
1024+ unsigned long size = *sizep;
1025+ int changed = 0;
1026+again:
1027+ last = addr + size;
1028+ for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
1029+ struct early_res *r = &early_res[i];
1030+ if (last > r->start && addr < r->start) {
1031+ size = r->start - addr;
1032+ changed = 1;
1033+ goto again;
1034+ }
1035+ if (last > r->end && addr < r->end) {
1036+ addr = round_up(r->end, align);
1037+ size = last - addr;
1038+ changed = 1;
1039+ goto again;
1040+ }
1041+ if (last <= r->end && addr >= r->start) {
1042+ (*sizep)++;
1043+ return 0;
1044+ }
1045+ }
1046+ if (changed) {
1047+ *addrp = addr;
1048+ *sizep = size;
1049+ }
1050+ return changed;
1051+}
1052 /*
1053 * This function checks if any part of the range <start,end> is mapped
1054 * with type.
1055@@ -194,26 +256,27 @@ int __init e820_all_mapped(unsigned long
1056 * Find a free area with specified alignment in a specific range.
1057 */
1058 unsigned long __init find_e820_area(unsigned long start, unsigned long end,
1059- unsigned size, unsigned long align)
1060+ unsigned long size, unsigned long align)
1061 {
1062 int i;
1063- unsigned long mask = ~(align - 1);
1064
1065 for (i = 0; i < e820.nr_map; i++) {
1066 struct e820entry *ei = &e820.map[i];
1067- unsigned long addr = ei->addr, last;
1068+ unsigned long addr, last;
1069+ unsigned long ei_last;
1070
1071 if (ei->type != E820_RAM)
1072 continue;
1073+ addr = round_up(ei->addr, align);
1074+ ei_last = ei->addr + ei->size;
1075 if (addr < start)
1076- addr = start;
1077- if (addr > ei->addr + ei->size)
1078+ addr = round_up(start, align);
1079+ if (addr >= ei_last)
1080 continue;
1081- while (bad_addr(&addr, size) && addr+size <= ei->addr+ei->size)
1082+ while (bad_addr(&addr, size, align) && addr+size <= ei_last)
1083 ;
1084- addr = (addr + align - 1) & mask;
1085 last = addr + size;
1086- if (last > ei->addr + ei->size)
1087+ if (last > ei_last)
1088 continue;
1089 if (last > end)
1090 continue;
1091@@ -223,6 +286,40 @@ unsigned long __init find_e820_area(unsi
1092 }
1093
1094 /*
1095+ * Find next free range after *start
1096+ */
1097+unsigned long __init find_e820_area_size(unsigned long start,
1098+ unsigned long *sizep,
1099+ unsigned long align)
1100+{
1101+ int i;
1102+
1103+ for (i = 0; i < e820.nr_map; i++) {
1104+ struct e820entry *ei = &e820.map[i];
1105+ unsigned long addr, last;
1106+ unsigned long ei_last;
1107+
1108+ if (ei->type != E820_RAM)
1109+ continue;
1110+ addr = round_up(ei->addr, align);
1111+ ei_last = ei->addr + ei->size;
1112+ if (addr < start)
1113+ addr = round_up(start, align);
1114+ if (addr >= ei_last)
1115+ continue;
1116+ *sizep = ei_last - addr;
1117+ while (bad_addr_size(&addr, sizep, align) &&
1118+ addr + *sizep <= ei_last)
1119+ ;
1120+ last = addr + *sizep;
1121+ if (last > ei_last)
1122+ continue;
1123+ return addr;
1124+ }
1125+ return -1UL;
1126+
1127+}
1128+/*
1129 * Find the highest page frame number we have available
1130 */
1131 unsigned long __init e820_end_of_ram(void)
1132@@ -231,31 +328,29 @@ unsigned long __init e820_end_of_ram(voi
1133
1134 end_pfn = find_max_pfn_with_active_regions();
1135
1136- if (end_pfn > end_pfn_map)
1137- end_pfn_map = end_pfn;
1138- if (end_pfn_map > MAXMEM>>PAGE_SHIFT)
1139- end_pfn_map = MAXMEM>>PAGE_SHIFT;
1140+ if (end_pfn > max_pfn_mapped)
1141+ max_pfn_mapped = end_pfn;
1142+ if (max_pfn_mapped > MAXMEM>>PAGE_SHIFT)
1143+ max_pfn_mapped = MAXMEM>>PAGE_SHIFT;
1144 if (end_pfn > end_user_pfn)
1145 end_pfn = end_user_pfn;
1146- if (end_pfn > end_pfn_map)
1147- end_pfn = end_pfn_map;
1148+ if (end_pfn > max_pfn_mapped)
1149+ end_pfn = max_pfn_mapped;
1150
1151- printk(KERN_INFO "end_pfn_map = %lu\n", end_pfn_map);
1152+ printk(KERN_INFO "max_pfn_mapped = %lu\n", max_pfn_mapped);
1153 return end_pfn;
1154 }
1155
1156 /*
1157 * Mark e820 reserved areas as busy for the resource manager.
1158 */
1159-void __init e820_reserve_resources(struct e820entry *e820, int nr_map,
1160- struct resource *code_resource,
1161- struct resource *data_resource,
1162- struct resource *bss_resource)
1163+void __init e820_reserve_resources(struct e820entry *e820, int nr_map)
1164 {
1165 int i;
1166+ struct resource *res;
1167+
1168+ res = alloc_bootmem_low(sizeof(struct resource) * nr_map);
1169 for (i = 0; i < nr_map; i++) {
1170- struct resource *res;
1171- res = alloc_bootmem_low(sizeof(struct resource));
1172 switch (e820[i].type) {
1173 case E820_RAM: res->name = "System RAM"; break;
1174 case E820_ACPI: res->name = "ACPI Tables"; break;
1175@@ -265,26 +360,8 @@ void __init e820_reserve_resources(struc
1176 res->start = e820[i].addr;
1177 res->end = res->start + e820[i].size - 1;
1178 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
1179- request_resource(&iomem_resource, res);
1180- if (e820[i].type == E820_RAM) {
1181- /*
1182- * We don't know which RAM region contains kernel data,
1183- * so we try it repeatedly and let the resource manager
1184- * test it.
1185- */
1186-#ifndef CONFIG_XEN
1187- request_resource(res, code_resource);
1188- request_resource(res, data_resource);
1189- request_resource(res, bss_resource);
1190-#endif
1191-#ifdef CONFIG_KEXEC
1192- if (crashk_res.start != crashk_res.end)
1193- request_resource(res, &crashk_res);
1194-#ifdef CONFIG_XEN
1195- xen_machine_kexec_register_resources(res);
1196-#endif
1197-#endif
1198- }
1199+ insert_resource(&iomem_resource, res);
1200+ res++;
1201 }
1202 }
1203
1204@@ -338,9 +415,9 @@ static int __init e820_find_active_regio
1205 if (*ei_startpfn >= *ei_endpfn)
1206 return 0;
1207
1208- /* Check if end_pfn_map should be updated */
1209- if (ei->type != E820_RAM && *ei_endpfn > end_pfn_map)
1210- end_pfn_map = *ei_endpfn;
1211+ /* Check if max_pfn_mapped should be updated */
1212+ if (ei->type != E820_RAM && *ei_endpfn > max_pfn_mapped)
1213+ max_pfn_mapped = *ei_endpfn;
1214
1215 /* Skip if map is outside the node */
1216 if (ei->type != E820_RAM || *ei_endpfn <= start_pfn ||
1217@@ -667,10 +744,10 @@ static int __init copy_e820_map(struct e
1218 #endif
1219
1220 do {
1221- unsigned long start = biosmap->addr;
1222- unsigned long size = biosmap->size;
1223- unsigned long end = start + size;
1224- unsigned long type = biosmap->type;
1225+ u64 start = biosmap->addr;
1226+ u64 size = biosmap->size;
1227+ u64 end = start + size;
1228+ u32 type = biosmap->type;
1229
1230 /* Overflow in 64 bits? Ignore the memory map. */
1231 if (start > end)
1232@@ -801,7 +878,7 @@ static int __init parse_memmap_opt(char
1233 saved_max_pfn = e820_end_of_ram();
1234 remove_all_active_ranges();
1235 #endif
1236- end_pfn_map = 0;
1237+ max_pfn_mapped = 0;
1238 e820.nr_map = 0;
1239 userdef = 1;
1240 return 0;
1241--- sle11-2009-05-14.orig/arch/x86/kernel/early_printk-xen.c 2009-03-16 16:33:40.000000000 +0100
1242+++ sle11-2009-05-14/arch/x86/kernel/early_printk-xen.c 2009-03-16 16:38:05.000000000 +0100
1243@@ -13,7 +13,7 @@
1244
1245 #ifndef CONFIG_XEN
1246 static int max_ypos = 25, max_xpos = 80;
1247-static int current_ypos = 25, current_xpos = 0;
1248+static int current_ypos = 25, current_xpos;
1249
1250 static void early_vga_write(struct console *con, const char *str, unsigned n)
1251 {
1252@@ -108,12 +108,12 @@ static __init void early_serial_init(cha
1253
1254 if (*s) {
1255 unsigned port;
1256- if (!strncmp(s,"0x",2)) {
1257+ if (!strncmp(s, "0x", 2)) {
1258 early_serial_base = simple_strtoul(s, &e, 16);
1259 } else {
1260 static int bases[] = { 0x3f8, 0x2f8 };
1261
1262- if (!strncmp(s,"ttyS",4))
1263+ if (!strncmp(s, "ttyS", 4))
1264 s += 4;
1265 port = simple_strtoul(s, &e, 10);
1266 if (port > 1 || s == e)
1267@@ -223,7 +223,7 @@ static struct console simnow_console = {
1268
1269 /* Direct interface for emergencies */
1270 static struct console *early_console = &early_vga_console;
1271-static int early_console_initialized = 0;
1272+static int early_console_initialized;
1273
1274 void early_printk(const char *fmt, ...)
1275 {
1276@@ -231,9 +231,9 @@ void early_printk(const char *fmt, ...)
1277 int n;
1278 va_list ap;
1279
1280- va_start(ap,fmt);
1281- n = vscnprintf(buf,512,fmt,ap);
1282- early_console->write(early_console,buf,n);
1283+ va_start(ap, fmt);
1284+ n = vscnprintf(buf, 512, fmt, ap);
1285+ early_console->write(early_console, buf, n);
1286 va_end(ap);
1287 }
1288
1289@@ -259,16 +259,16 @@ static int __init setup_early_printk(cha
1290 early_console = &early_serial_console;
1291 } else if (!strncmp(buf, "vga", 3)) {
1292 #ifndef CONFIG_XEN
1293- && boot_params.screen_info.orig_video_isVGA == 1) {
1294+ && boot_params.screen_info.orig_video_isVGA == 1) {
1295 max_xpos = boot_params.screen_info.orig_video_cols;
1296 max_ypos = boot_params.screen_info.orig_video_lines;
1297 current_ypos = boot_params.screen_info.orig_y;
1298 #endif
1299 early_console = &early_vga_console;
1300- } else if (!strncmp(buf, "simnow", 6)) {
1301- simnow_init(buf + 6);
1302- early_console = &simnow_console;
1303- keep_early = 1;
1304+ } else if (!strncmp(buf, "simnow", 6)) {
1305+ simnow_init(buf + 6);
1306+ early_console = &simnow_console;
1307+ keep_early = 1;
1308 #ifdef CONFIG_XEN
1309 } else if (!strncmp(buf, "xen", 3)) {
1310 early_console = &xenboot_console;
1311--- sle11-2009-05-14.orig/arch/x86/kernel/entry_32-xen.S 2009-05-14 11:18:32.000000000 +0200
1312+++ sle11-2009-05-14/arch/x86/kernel/entry_32-xen.S 2009-03-16 16:38:05.000000000 +0100
1313@@ -1,5 +1,4 @@
1314 /*
1315- * linux/arch/i386/entry.S
1316 *
1317 * Copyright (C) 1991, 1992 Linus Torvalds
1318 */
1319@@ -51,6 +50,7 @@
1320 #include <asm/desc.h>
1321 #include <asm/percpu.h>
1322 #include <asm/dwarf2.h>
1323+#include <asm/processor-flags.h>
1324 #include "irq_vectors.h"
1325 #include <xen/interface/xen.h>
1326
1327@@ -69,12 +69,6 @@
1328
1329 #define nr_syscalls ((syscall_table_size)/4)
1330
1331-CF_MASK = 0x00000001
1332-TF_MASK = 0x00000100
1333-IF_MASK = 0x00000200
1334-DF_MASK = 0x00000400
1335-NT_MASK = 0x00004000
1336-VM_MASK = 0x00020000
1337 /* Pseudo-eflags. */
1338 NMI_MASK = 0x80000000
1339
1340@@ -87,7 +81,7 @@ NMI_MASK = 0x80000000
1341
1342 .macro TRACE_IRQS_IRET
1343 #ifdef CONFIG_TRACE_IRQFLAGS
1344- testl $IF_MASK,PT_EFLAGS(%esp) # interrupts off?
1345+ testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off?
1346 jz 1f
1347 TRACE_IRQS_ON
1348 1:
1349@@ -249,7 +243,7 @@ ret_from_intr:
1350 check_userspace:
1351 movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS
1352 movb PT_CS(%esp), %al
1353- andl $(VM_MASK | SEGMENT_RPL_MASK), %eax
1354+ andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax
1355 cmpl $USER_RPL, %eax
1356 jb resume_kernel # not returning to v8086 or userspace
1357
1358@@ -258,6 +252,7 @@ ENTRY(resume_userspace)
1359 DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
1360 # setting need_resched or sigpending
1361 # between sampling and the iret
1362+ TRACE_IRQS_OFF
1363 movl TI_flags(%ebp), %ecx
1364 andl $_TIF_WORK_MASK, %ecx # is there any work to be done on
1365 # int/exception return?
1366@@ -274,7 +269,7 @@ need_resched:
1367 movl TI_flags(%ebp), %ecx # need_resched set ?
1368 testb $_TIF_NEED_RESCHED, %cl
1369 jz restore_all
1370- testl $IF_MASK,PT_EFLAGS(%esp) # interrupts off (exception path) ?
1371+ testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off (exception path) ?
1372 jz restore_all
1373 call preempt_schedule_irq
1374 jmp need_resched
1375@@ -299,10 +294,10 @@ ENTRY(ia32_sysenter_target)
1376 movl SYSENTER_stack_sp0(%esp),%esp
1377 sysenter_past_esp:
1378 /*
1379- * No need to follow this irqs on/off section: the syscall
1380- * disabled irqs and here we enable it straight after entry:
1381+ * Interrupts are disabled here, but we can't trace it until
1382+ * enough kernel state to call TRACE_IRQS_OFF can be called - but
1383+ * we immediately enable interrupts at that point anyway.
1384 */
1385- ENABLE_INTERRUPTS(CLBR_NONE)
1386 pushl $(__USER_DS)
1387 CFI_ADJUST_CFA_OFFSET 4
1388 /*CFI_REL_OFFSET ss, 0*/
1389@@ -310,6 +305,7 @@ sysenter_past_esp:
1390 CFI_ADJUST_CFA_OFFSET 4
1391 CFI_REL_OFFSET esp, 0
1392 pushfl
1393+ orl $X86_EFLAGS_IF, (%esp)
1394 CFI_ADJUST_CFA_OFFSET 4
1395 pushl $(__USER_CS)
1396 CFI_ADJUST_CFA_OFFSET 4
1397@@ -323,6 +319,11 @@ sysenter_past_esp:
1398 CFI_ADJUST_CFA_OFFSET 4
1399 CFI_REL_OFFSET eip, 0
1400
1401+ pushl %eax
1402+ CFI_ADJUST_CFA_OFFSET 4
1403+ SAVE_ALL
1404+ ENABLE_INTERRUPTS(CLBR_NONE)
1405+
1406 /*
1407 * Load the potential sixth argument from user stack.
1408 * Careful about security.
1409@@ -330,14 +331,12 @@ sysenter_past_esp:
1410 cmpl $__PAGE_OFFSET-3,%ebp
1411 jae syscall_fault
1412 1: movl (%ebp),%ebp
1413+ movl %ebp,PT_EBP(%esp)
1414 .section __ex_table,"a"
1415 .align 4
1416 .long 1b,syscall_fault
1417 .previous
1418
1419- pushl %eax
1420- CFI_ADJUST_CFA_OFFSET 4
1421- SAVE_ALL
1422 GET_THREAD_INFO(%ebp)
1423 test_tif %ebp
1424 jnz syscall_trace_entry
1425@@ -414,7 +413,7 @@ syscall_exit:
1426 # setting need_resched or sigpending
1427 # between sampling and the iret
1428 TRACE_IRQS_OFF
1429- testl $TF_MASK,PT_EFLAGS(%esp) # If tracing set singlestep flag on exit
1430+ testl $X86_EFLAGS_TF,PT_EFLAGS(%esp) # If tracing set singlestep flag on exit
1431 jz no_singlestep
1432 orl $_TIF_SINGLESTEP,TI_flags(%ebp)
1433 no_singlestep:
1434@@ -430,7 +429,7 @@ restore_all:
1435 # See comments in process.c:copy_thread() for details.
1436 movb PT_OLDSS(%esp), %ah
1437 movb PT_CS(%esp), %al
1438- andl $(VM_MASK | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
1439+ andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
1440 cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
1441 CFI_REMEMBER_STATE
1442 je ldt_ss # returning to user-space with LDT SS
1443@@ -438,7 +437,7 @@ restore_nocheck:
1444 #else
1445 restore_nocheck:
1446 movl PT_EFLAGS(%esp), %eax
1447- testl $(VM_MASK|NMI_MASK), %eax
1448+ testl $(X86_EFLAGS_VM|NMI_MASK), %eax
1449 CFI_REMEMBER_STATE
1450 jnz hypervisor_iret
1451 shr $9, %eax # EAX[0] == IRET_EFLAGS.IF
1452@@ -456,7 +455,7 @@ restore_nocheck_notrace:
1453 irq_return:
1454 INTERRUPT_RETURN
1455 .section .fixup,"ax"
1456-iret_exc:
1457+ENTRY(iret_exc)
1458 pushl $0 # no error code
1459 pushl $do_iret_error
1460 jmp error_code
1461@@ -560,7 +559,7 @@ work_resched:
1462 work_notifysig: # deal with pending signals and
1463 # notify-resume requests
1464 #ifdef CONFIG_VM86
1465- testl $VM_MASK, PT_EFLAGS(%esp)
1466+ testl $X86_EFLAGS_VM, PT_EFLAGS(%esp)
1467 movl %esp, %eax
1468 jne work_notifysig_v86 # returning to kernel-space or
1469 # vm86-space
1470@@ -617,9 +616,6 @@ END(syscall_exit_work)
1471
1472 RING0_INT_FRAME # can't unwind into user space anyway
1473 syscall_fault:
1474- pushl %eax # save orig_eax
1475- CFI_ADJUST_CFA_OFFSET 4
1476- SAVE_ALL
1477 GET_THREAD_INFO(%ebp)
1478 movl $-EFAULT,PT_EAX(%esp)
1479 jmp resume_userspace
1480--- sle11-2009-05-14.orig/arch/x86/kernel/entry_64-xen.S 2009-03-16 16:33:40.000000000 +0100
1481+++ sle11-2009-05-14/arch/x86/kernel/entry_64-xen.S 2009-03-16 16:38:05.000000000 +0100
1482@@ -338,19 +338,17 @@ badsys:
1483 /* Do syscall tracing */
1484 tracesys:
1485 SAVE_REST
1486- movq $-ENOSYS,RAX(%rsp)
1487+ movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
1488 FIXUP_TOP_OF_STACK %rdi
1489 movq %rsp,%rdi
1490 call syscall_trace_enter
1491 LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */
1492 RESTORE_REST
1493 cmpq $__NR_syscall_max,%rax
1494- movq $-ENOSYS,%rcx
1495- cmova %rcx,%rax
1496- ja 1f
1497+ ja int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */
1498 movq %r10,%rcx /* fixup for C */
1499 call *sys_call_table(,%rax,8)
1500-1: movq %rax,RAX-ARGOFFSET(%rsp)
1501+ movq %rax,RAX-ARGOFFSET(%rsp)
1502 /* Use IRET because user could have changed frame */
1503
1504 /*
1505--- sle11-2009-05-14.orig/arch/x86/kernel/genapic_64-xen.c 2009-03-16 16:33:40.000000000 +0100
1506+++ sle11-2009-05-14/arch/x86/kernel/genapic_64-xen.c 2009-03-16 16:38:05.000000000 +0100
1507@@ -15,6 +15,7 @@
1508 #include <linux/kernel.h>
1509 #include <linux/ctype.h>
1510 #include <linux/init.h>
1511+#include <linux/hardirq.h>
1512
1513 #include <asm/smp.h>
1514 #include <asm/ipi.h>
1515@@ -24,17 +25,12 @@
1516 #include <acpi/acpi_bus.h>
1517 #endif
1518
1519-/* which logical CPU number maps to which CPU (physical APIC ID) */
1520 #ifndef CONFIG_XEN
1521-u16 x86_cpu_to_apicid_init[NR_CPUS] __initdata
1522- = { [0 ... NR_CPUS-1] = BAD_APICID };
1523-void *x86_cpu_to_apicid_early_ptr;
1524-#endif
1525-DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID;
1526-EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid);
1527+DEFINE_PER_CPU(int, x2apic_extra_bits);
1528
1529-#ifndef CONFIG_XEN
1530 struct genapic __read_mostly *genapic = &apic_flat;
1531+
1532+static enum uv_system_type uv_system_type;
1533 #else
1534 extern struct genapic apic_xen;
1535 struct genapic __read_mostly *genapic = &apic_xen;
1536@@ -47,6 +43,9 @@ struct genapic __read_mostly *genapic =
1537 void __init setup_apic_routing(void)
1538 {
1539 #ifndef CONFIG_XEN
1540+ if (uv_system_type == UV_NON_UNIQUE_APIC)
1541+ genapic = &apic_x2apic_uv_x;
1542+ else
1543 #ifdef CONFIG_ACPI
1544 /*
1545 * Quirk: some x86_64 machines can only use physical APIC mode
1546@@ -59,7 +58,7 @@ void __init setup_apic_routing(void)
1547 else
1548 #endif
1549
1550- if (cpus_weight(cpu_possible_map) <= 8)
1551+ if (num_possible_cpus() <= 8)
1552 genapic = &apic_flat;
1553 else
1554 genapic = &apic_physflat;
1555@@ -85,3 +84,41 @@ void send_IPI_self(int vector)
1556 xen_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
1557 #endif
1558 }
1559+
1560+int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
1561+{
1562+#ifndef CONFIG_XEN
1563+ if (!strcmp(oem_id, "SGI")) {
1564+ if (!strcmp(oem_table_id, "UVL"))
1565+ uv_system_type = UV_LEGACY_APIC;
1566+ else if (!strcmp(oem_table_id, "UVX"))
1567+ uv_system_type = UV_X2APIC;
1568+ else if (!strcmp(oem_table_id, "UVH"))
1569+ uv_system_type = UV_NON_UNIQUE_APIC;
1570+ }
1571+#endif
1572+ return 0;
1573+}
1574+
1575+#ifndef CONFIG_XEN
1576+unsigned int read_apic_id(void)
1577+{
1578+ unsigned int id;
1579+
1580+ WARN_ON(preemptible() && num_online_cpus() > 1);
1581+ id = apic_read(APIC_ID);
1582+ if (uv_system_type >= UV_X2APIC)
1583+ id |= __get_cpu_var(x2apic_extra_bits);
1584+ return id;
1585+}
1586+
1587+enum uv_system_type get_uv_system_type(void)
1588+{
1589+ return uv_system_type;
1590+}
1591+
1592+int is_uv_system(void)
1593+{
1594+ return uv_system_type != UV_NONE;
1595+}
1596+#endif
1597--- sle11-2009-05-14.orig/arch/x86/kernel/genapic_xen_64.c 2008-12-15 11:27:22.000000000 +0100
1598+++ sle11-2009-05-14/arch/x86/kernel/genapic_xen_64.c 2009-03-16 16:38:05.000000000 +0100
1599@@ -72,9 +72,7 @@ static cpumask_t xen_target_cpus(void)
1600
1601 static cpumask_t xen_vector_allocation_domain(int cpu)
1602 {
1603- cpumask_t domain = CPU_MASK_NONE;
1604- cpu_set(cpu, domain);
1605- return domain;
1606+ return cpumask_of_cpu(cpu);
1607 }
1608
1609 /*
1610--- sle11-2009-05-14.orig/arch/x86/kernel/head64-xen.c 2009-03-16 16:33:40.000000000 +0100
1611+++ sle11-2009-05-14/arch/x86/kernel/head64-xen.c 2009-03-16 16:38:05.000000000 +0100
1612@@ -17,6 +17,7 @@
1613 #include <linux/string.h>
1614 #include <linux/percpu.h>
1615 #include <linux/start_kernel.h>
1616+#include <linux/io.h>
1617 #include <linux/module.h>
1618
1619 #include <asm/processor.h>
1620@@ -29,6 +30,7 @@
1621 #include <asm/sections.h>
1622 #include <asm/kdebug.h>
1623 #include <asm/e820.h>
1624+#include <asm/bios_ebda.h>
1625
1626 unsigned long start_pfn;
1627
1628@@ -75,34 +77,75 @@ EXPORT_SYMBOL(machine_to_phys_mapping);
1629 unsigned int machine_to_phys_order;
1630 EXPORT_SYMBOL(machine_to_phys_order);
1631
1632-#define EBDA_ADDR_POINTER 0x40E
1633+#define BIOS_LOWMEM_KILOBYTES 0x413
1634
1635-static __init void reserve_ebda(void)
1636+/*
1637+ * The BIOS places the EBDA/XBDA at the top of conventional
1638+ * memory, and usually decreases the reported amount of
1639+ * conventional memory (int 0x12) too. This also contains a
1640+ * workaround for Dell systems that neglect to reserve EBDA.
1641+ * The same workaround also avoids a problem with the AMD768MPX
1642+ * chipset: reserve a page before VGA to prevent PCI prefetch
1643+ * into it (errata #56). Usually the page is reserved anyways,
1644+ * unless you have no PS/2 mouse plugged in.
1645+ */
1646+static void __init reserve_ebda_region(void)
1647 {
1648 #ifndef CONFIG_XEN
1649- unsigned ebda_addr, ebda_size;
1650+ unsigned int lowmem, ebda_addr;
1651
1652- /*
1653- * there is a real-mode segmented pointer pointing to the
1654- * 4K EBDA area at 0x40E
1655- */
1656- ebda_addr = *(unsigned short *)__va(EBDA_ADDR_POINTER);
1657- ebda_addr <<= 4;
1658-
1659- if (!ebda_addr)
1660+ /* To determine the position of the EBDA and the */
1661+ /* end of conventional memory, we need to look at */
1662+ /* the BIOS data area. In a paravirtual environment */
1663+ /* that area is absent. We'll just have to assume */
1664+ /* that the paravirt case can handle memory setup */
1665+ /* correctly, without our help. */
1666+ if (paravirt_enabled())
1667 return;
1668
1669- ebda_size = *(unsigned short *)__va(ebda_addr);
1670+ /* end of low (conventional) memory */
1671+ lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
1672+ lowmem <<= 10;
1673+
1674+ /* start of EBDA area */
1675+ ebda_addr = get_bios_ebda();
1676+
1677+ /* Fixup: bios puts an EBDA in the top 64K segment */
1678+ /* of conventional memory, but does not adjust lowmem. */
1679+ if ((lowmem - ebda_addr) <= 0x10000)
1680+ lowmem = ebda_addr;
1681+
1682+ /* Fixup: bios does not report an EBDA at all. */
1683+ /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
1684+ if ((ebda_addr == 0) && (lowmem >= 0x9f000))
1685+ lowmem = 0x9f000;
1686+
1687+ /* Paranoia: should never happen, but... */
1688+ if ((lowmem == 0) || (lowmem >= 0x100000))
1689+ lowmem = 0x9f000;
1690
1691- /* Round EBDA up to pages */
1692- if (ebda_size == 0)
1693- ebda_size = 1;
1694- ebda_size <<= 10;
1695- ebda_size = round_up(ebda_size + (ebda_addr & ~PAGE_MASK), PAGE_SIZE);
1696- if (ebda_size > 64*1024)
1697- ebda_size = 64*1024;
1698+ /* reserve all memory between lowmem and the 1MB mark */
1699+ reserve_early(lowmem, 0x100000, "BIOS reserved");
1700+#endif
1701+}
1702
1703- reserve_early(ebda_addr, ebda_addr + ebda_size, "EBDA");
1704+static void __init reserve_setup_data(void)
1705+{
1706+#ifndef CONFIG_XEN
1707+ struct setup_data *data;
1708+ unsigned long pa_data;
1709+ char buf[32];
1710+
1711+ if (boot_params.hdr.version < 0x0209)
1712+ return;
1713+ pa_data = boot_params.hdr.setup_data;
1714+ while (pa_data) {
1715+ data = early_ioremap(pa_data, sizeof(*data));
1716+ sprintf(buf, "setup data %x", data->type);
1717+ reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf);
1718+ pa_data = data->next;
1719+ early_iounmap(data, sizeof(*data));
1720+ }
1721 #endif
1722 }
1723
1724@@ -112,6 +155,19 @@ void __init x86_64_start_kernel(char * r
1725 unsigned long machine_to_phys_nr_ents;
1726 int i;
1727
1728+ /*
1729+ * Build-time sanity checks on the kernel image and module
1730+ * area mappings. (these are purely build-time and produce no code)
1731+ */
1732+ BUILD_BUG_ON(MODULES_VADDR < KERNEL_IMAGE_START);
1733+ BUILD_BUG_ON(MODULES_VADDR-KERNEL_IMAGE_START < KERNEL_IMAGE_SIZE);
1734+ BUILD_BUG_ON(MODULES_LEN + KERNEL_IMAGE_SIZE > 2*PUD_SIZE);
1735+ BUILD_BUG_ON((KERNEL_IMAGE_START & ~PMD_MASK) != 0);
1736+ BUILD_BUG_ON((MODULES_VADDR & ~PMD_MASK) != 0);
1737+ BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
1738+ BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
1739+ (__START_KERNEL & PGDIR_MASK)));
1740+
1741 xen_setup_features();
1742
1743 xen_start_info = (struct start_info *)real_mode_data;
1744@@ -140,7 +196,7 @@ void __init x86_64_start_kernel(char * r
1745 /* Cleanup the over mapped high alias */
1746 cleanup_highmap();
1747
1748- for (i = 0; i < IDT_ENTRIES; i++) {
1749+ for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) {
1750 #ifdef CONFIG_EARLY_PRINTK
1751 set_intr_gate(i, &early_idt_handlers[i]);
1752 #else
1753@@ -163,7 +219,8 @@ void __init x86_64_start_kernel(char * r
1754 reserve_early(round_up(__pa_symbol(&_end), PAGE_SIZE),
1755 start_pfn << PAGE_SHIFT, "Xen provided");
1756
1757- reserve_ebda();
1758+ reserve_ebda_region();
1759+ reserve_setup_data();
1760
1761 /*
1762 * At this point everything still needed from the boot loader
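The reworked reserve_ebda_region() above derives the range to reserve from two BIOS data area fields instead of the old EBDA size byte: the conventional-memory size in kilobytes at offset 0x413 and the EBDA segment pointer (read via get_bios_ebda()), with everything from the lower of the two up to 1 MB handed to reserve_early(). The following is a minimal user-space sketch of that arithmetic only; the bda[] buffer, read16() and the final printf are stand-ins for the real BIOS data area and reserve_early(), not kernel interfaces.

/* Minimal user-space model of the reserve_ebda_region() arithmetic above.
 * The 4 KiB buffer stands in for the real-mode BIOS data area; the printf
 * stands in for reserve_early().  Offsets 0x413 (conventional memory in KiB)
 * and 0x40E (EBDA segment pointer) match the values used by the patch. */
#include <stdio.h>
#include <string.h>

static unsigned char bda[0x1000];          /* fake low memory, only the BDA part */

static unsigned read16(unsigned off)
{
    return bda[off] | (bda[off + 1] << 8); /* little-endian 16-bit load */
}

int main(void)
{
    unsigned lowmem, ebda_addr;

    memset(bda, 0, sizeof(bda));
    bda[0x413] = 0x7f; bda[0x414] = 0x02;  /* 0x27f KiB = 639 KiB conventional memory */
    bda[0x40e] = 0xc0; bda[0x40f] = 0x9f;  /* EBDA segment 0x9fc0 -> linear 0x9fc00 */

    lowmem    = read16(0x413) << 10;       /* KiB -> bytes */
    ebda_addr = read16(0x40e) << 4;        /* real-mode segment -> linear address */

    /* BIOS placed an EBDA in the top 64 KiB but did not lower the int 0x12 value */
    if (lowmem - ebda_addr <= 0x10000)
        lowmem = ebda_addr;

    /* no EBDA reported at all: keep 4 KiB anyway (old Dell workaround) */
    if (ebda_addr == 0 && lowmem >= 0x9f000)
        lowmem = 0x9f000;

    /* paranoia: clamp nonsense values */
    if (lowmem == 0 || lowmem >= 0x100000)
        lowmem = 0x9f000;

    printf("would reserve_early(0x%x, 0x100000, \"BIOS reserved\")\n", lowmem);
    return 0;
}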
1763--- sle11-2009-05-14.orig/arch/x86/kernel/head_32-xen.S 2009-03-16 16:33:40.000000000 +0100
1764+++ sle11-2009-05-14/arch/x86/kernel/head_32-xen.S 2009-03-16 16:38:05.000000000 +0100
1765@@ -69,7 +69,7 @@ ENTRY(startup_32)
1766 cld # gcc2 wants the direction flag cleared at all times
1767
1768 pushl $0 # fake return address for unwinder
1769- jmp start_kernel
1770+ jmp i386_start_kernel
1771
1772 #define HYPERCALL_PAGE_OFFSET 0x1000
1773 .org HYPERCALL_PAGE_OFFSET
1774--- sle11-2009-05-14.orig/arch/x86/kernel/init_task-xen.c 2009-03-16 16:33:40.000000000 +0100
1775+++ sle11-2009-05-14/arch/x86/kernel/init_task-xen.c 2009-03-16 16:38:05.000000000 +0100
1776@@ -11,7 +11,6 @@
1777 #include <asm/desc.h>
1778
1779 static struct fs_struct init_fs = INIT_FS;
1780-static struct files_struct init_files = INIT_FILES;
1781 static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
1782 static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
1783 #ifdef CONFIG_X86_XEN
1784--- sle11-2009-05-14.orig/arch/x86/kernel/io_apic_32-xen.c 2009-03-16 16:33:40.000000000 +0100
1785+++ sle11-2009-05-14/arch/x86/kernel/io_apic_32-xen.c 2009-03-16 16:38:05.000000000 +0100
1786@@ -88,6 +88,16 @@ int sis_apic_bug = -1;
1787 */
1788 int nr_ioapic_registers[MAX_IO_APICS];
1789
1790+/* I/O APIC entries */
1791+struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
1792+int nr_ioapics;
1793+
1794+/* MP IRQ source entries */
1795+struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
1796+
1797+/* # of MP IRQ source entries */
1798+int mp_irq_entries;
1799+
1800 static int disable_timer_pin_1 __initdata;
1801
1802 /*
1803@@ -863,10 +873,7 @@ static int __init find_isa_irq_pin(int i
1804 for (i = 0; i < mp_irq_entries; i++) {
1805 int lbus = mp_irqs[i].mpc_srcbus;
1806
1807- if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
1808- mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
1809- mp_bus_id_to_type[lbus] == MP_BUS_MCA
1810- ) &&
1811+ if (test_bit(lbus, mp_bus_not_pci) &&
1812 (mp_irqs[i].mpc_irqtype == type) &&
1813 (mp_irqs[i].mpc_srcbusirq == irq))
1814
1815@@ -882,10 +889,7 @@ static int __init find_isa_irq_apic(int
1816 for (i = 0; i < mp_irq_entries; i++) {
1817 int lbus = mp_irqs[i].mpc_srcbus;
1818
1819- if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
1820- mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
1821- mp_bus_id_to_type[lbus] == MP_BUS_MCA
1822- ) &&
1823+ if (test_bit(lbus, mp_bus_not_pci) &&
1824 (mp_irqs[i].mpc_irqtype == type) &&
1825 (mp_irqs[i].mpc_srcbusirq == irq))
1826 break;
1827@@ -926,7 +930,7 @@ int IO_APIC_get_PCI_irq_vector(int bus,
1828 mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
1829 break;
1830
1831- if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) &&
1832+ if (!test_bit(lbus, mp_bus_not_pci) &&
1833 !mp_irqs[i].mpc_irqtype &&
1834 (bus == lbus) &&
1835 (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
1836@@ -977,6 +981,7 @@ void __init setup_ioapic_dest(void)
1837 #endif /* !CONFIG_XEN */
1838 #endif
1839
1840+#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
1841 /*
1842 * EISA Edge/Level control register, ELCR
1843 */
1844@@ -990,6 +995,13 @@ static int EISA_ELCR(unsigned int irq)
1845 "Broken MPtable reports ISA irq %d\n", irq);
1846 return 0;
1847 }
1848+#endif
1849+
1850+/* ISA interrupts are always polarity zero edge triggered,
1851+ * when listed as conforming in the MP table. */
1852+
1853+#define default_ISA_trigger(idx) (0)
1854+#define default_ISA_polarity(idx) (0)
1855
1856 /* EISA interrupts are always polarity zero and can be edge or level
1857 * trigger depending on the ELCR value. If an interrupt is listed as
1858@@ -997,13 +1009,7 @@ static int EISA_ELCR(unsigned int irq)
1859 * be read in from the ELCR */
1860
1861 #define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq))
1862-#define default_EISA_polarity(idx) (0)
1863-
1864-/* ISA interrupts are always polarity zero edge triggered,
1865- * when listed as conforming in the MP table. */
1866-
1867-#define default_ISA_trigger(idx) (0)
1868-#define default_ISA_polarity(idx) (0)
1869+#define default_EISA_polarity(idx) default_ISA_polarity(idx)
1870
1871 /* PCI interrupts are always polarity one level triggered,
1872 * when listed as conforming in the MP table. */
1873@@ -1015,7 +1021,7 @@ static int EISA_ELCR(unsigned int irq)
1874 * when listed as conforming in the MP table. */
1875
1876 #define default_MCA_trigger(idx) (1)
1877-#define default_MCA_polarity(idx) (0)
1878+#define default_MCA_polarity(idx) default_ISA_polarity(idx)
1879
1880 static int MPBIOS_polarity(int idx)
1881 {
1882@@ -1029,35 +1035,9 @@ static int MPBIOS_polarity(int idx)
1883 {
1884 case 0: /* conforms, ie. bus-type dependent polarity */
1885 {
1886- switch (mp_bus_id_to_type[bus])
1887- {
1888- case MP_BUS_ISA: /* ISA pin */
1889- {
1890- polarity = default_ISA_polarity(idx);
1891- break;
1892- }
1893- case MP_BUS_EISA: /* EISA pin */
1894- {
1895- polarity = default_EISA_polarity(idx);
1896- break;
1897- }
1898- case MP_BUS_PCI: /* PCI pin */
1899- {
1900- polarity = default_PCI_polarity(idx);
1901- break;
1902- }
1903- case MP_BUS_MCA: /* MCA pin */
1904- {
1905- polarity = default_MCA_polarity(idx);
1906- break;
1907- }
1908- default:
1909- {
1910- printk(KERN_WARNING "broken BIOS!!\n");
1911- polarity = 1;
1912- break;
1913- }
1914- }
1915+ polarity = test_bit(bus, mp_bus_not_pci)?
1916+ default_ISA_polarity(idx):
1917+ default_PCI_polarity(idx);
1918 break;
1919 }
1920 case 1: /* high active */
1921@@ -1098,11 +1078,15 @@ static int MPBIOS_trigger(int idx)
1922 {
1923 case 0: /* conforms, ie. bus-type dependent */
1924 {
1925+ trigger = test_bit(bus, mp_bus_not_pci)?
1926+ default_ISA_trigger(idx):
1927+ default_PCI_trigger(idx);
1928+#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
1929 switch (mp_bus_id_to_type[bus])
1930 {
1931 case MP_BUS_ISA: /* ISA pin */
1932 {
1933- trigger = default_ISA_trigger(idx);
1934+ /* set before the switch */
1935 break;
1936 }
1937 case MP_BUS_EISA: /* EISA pin */
1938@@ -1112,7 +1096,7 @@ static int MPBIOS_trigger(int idx)
1939 }
1940 case MP_BUS_PCI: /* PCI pin */
1941 {
1942- trigger = default_PCI_trigger(idx);
1943+ /* set before the switch */
1944 break;
1945 }
1946 case MP_BUS_MCA: /* MCA pin */
1947@@ -1127,6 +1111,7 @@ static int MPBIOS_trigger(int idx)
1948 break;
1949 }
1950 }
1951+#endif
1952 break;
1953 }
1954 case 1: /* edge */
1955@@ -1176,39 +1161,22 @@ static int pin_2_irq(int idx, int apic,
1956 if (mp_irqs[idx].mpc_dstirq != pin)
1957 printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
1958
1959- switch (mp_bus_id_to_type[bus])
1960- {
1961- case MP_BUS_ISA: /* ISA pin */
1962- case MP_BUS_EISA:
1963- case MP_BUS_MCA:
1964- {
1965- irq = mp_irqs[idx].mpc_srcbusirq;
1966- break;
1967- }
1968- case MP_BUS_PCI: /* PCI pin */
1969- {
1970- /*
1971- * PCI IRQs are mapped in order
1972- */
1973- i = irq = 0;
1974- while (i < apic)
1975- irq += nr_ioapic_registers[i++];
1976- irq += pin;
1977-
1978- /*
1979- * For MPS mode, so far only needed by ES7000 platform
1980- */
1981- if (ioapic_renumber_irq)
1982- irq = ioapic_renumber_irq(apic, irq);
1983+ if (test_bit(bus, mp_bus_not_pci))
1984+ irq = mp_irqs[idx].mpc_srcbusirq;
1985+ else {
1986+ /*
1987+ * PCI IRQs are mapped in order
1988+ */
1989+ i = irq = 0;
1990+ while (i < apic)
1991+ irq += nr_ioapic_registers[i++];
1992+ irq += pin;
1993
1994- break;
1995- }
1996- default:
1997- {
1998- printk(KERN_ERR "unknown bus type %d.\n",bus);
1999- irq = 0;
2000- break;
2001- }
2002+ /*
2003+ * For MPS mode, so far only needed by ES7000 platform
2004+ */
2005+ if (ioapic_renumber_irq)
2006+ irq = ioapic_renumber_irq(apic, irq);
2007 }
2008
2009 /*
2010@@ -1314,7 +1282,6 @@ static void __init setup_IO_APIC_irqs(vo
2011 {
2012 struct IO_APIC_route_entry entry;
2013 int apic, pin, idx, irq, first_notcon = 1, vector;
2014- unsigned long flags;
2015
2016 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
2017
2018@@ -1380,9 +1347,7 @@ static void __init setup_IO_APIC_irqs(vo
2019 if (!apic && (irq < 16))
2020 disable_8259A_irq(irq);
2021 }
2022- spin_lock_irqsave(&ioapic_lock, flags);
2023- __ioapic_write_entry(apic, pin, entry);
2024- spin_unlock_irqrestore(&ioapic_lock, flags);
2025+ ioapic_write_entry(apic, pin, entry);
2026 }
2027 }
2028
2029@@ -1577,8 +1542,8 @@ void /*__init*/ print_local_APIC(void *
2030
2031 printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
2032 smp_processor_id(), hard_smp_processor_id());
2033- v = apic_read(APIC_ID);
2034- printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(v));
2035+ printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v,
2036+ GET_APIC_ID(read_apic_id()));
2037 v = apic_read(APIC_LVR);
2038 printk(KERN_INFO "... APIC VERSION: %08x\n", v);
2039 ver = GET_APIC_VERSION(v);
2040@@ -1791,7 +1756,7 @@ void disable_IO_APIC(void)
2041 entry.delivery_mode = dest_ExtINT; /* ExtInt */
2042 entry.vector = 0;
2043 entry.dest.physical.physical_dest =
2044- GET_APIC_ID(apic_read(APIC_ID));
2045+ GET_APIC_ID(read_apic_id());
2046
2047 /*
2048 * Add it to the IO-APIC irq-routing table:
2049@@ -2090,8 +2055,7 @@ static inline void init_IO_APIC_traps(vo
2050 * 0x80, because int 0x80 is hm, kind of importantish. ;)
2051 */
2052 for (irq = 0; irq < NR_IRQS ; irq++) {
2053- int tmp = irq;
2054- if (IO_APIC_IRQ(tmp) && !irq_vector[tmp]) {
2055+ if (IO_APIC_IRQ(irq) && !irq_vector[irq]) {
2056 /*
2057 * Hmm.. We don't have an entry for this,
2058 * so default to an old-fashioned 8259
2059@@ -2166,7 +2130,7 @@ static void __init setup_nmi(void)
2060 * cycles as some i82489DX-based boards have glue logic that keeps the
2061 * 8259A interrupt line asserted until INTA. --macro
2062 */
2063-static inline void unlock_ExtINT_logic(void)
2064+static inline void __init unlock_ExtINT_logic(void)
2065 {
2066 int apic, pin, i;
2067 struct IO_APIC_route_entry entry0, entry1;
2068@@ -2218,8 +2182,6 @@ static inline void unlock_ExtINT_logic(v
2069 ioapic_write_entry(apic, pin, entry0);
2070 }
2071
2072-int timer_uses_ioapic_pin_0;
2073-
2074 /*
2075 * This code may look a bit paranoid, but it's supposed to cooperate with
2076 * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ
2077@@ -2259,9 +2221,6 @@ static inline void __init check_timer(vo
2078 pin2 = ioapic_i8259.pin;
2079 apic2 = ioapic_i8259.apic;
2080
2081- if (pin1 == 0)
2082- timer_uses_ioapic_pin_0 = 1;
2083-
2084 printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
2085 vector, apic1, pin1, apic2, pin2);
2086
2087@@ -2555,6 +2514,7 @@ void destroy_irq(unsigned int irq)
2088 dynamic_irq_cleanup(irq);
2089
2090 spin_lock_irqsave(&vector_lock, flags);
2091+ clear_bit(irq_vector[irq], used_vectors);
2092 irq_vector[irq] = 0;
2093 spin_unlock_irqrestore(&vector_lock, flags);
2094 }
2095@@ -2871,7 +2831,6 @@ int __init io_apic_get_redir_entries (in
2096 int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low)
2097 {
2098 struct IO_APIC_route_entry entry;
2099- unsigned long flags;
2100
2101 if (!IO_APIC_IRQ(irq)) {
2102 printk(KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
2103@@ -2912,9 +2871,7 @@ int io_apic_set_pci_routing (int ioapic,
2104 if (!ioapic && (irq < 16))
2105 disable_8259A_irq(irq);
2106
2107- spin_lock_irqsave(&ioapic_lock, flags);
2108- __ioapic_write_entry(ioapic, pin, entry);
2109- spin_unlock_irqrestore(&ioapic_lock, flags);
2110+ ioapic_write_entry(ioapic, pin, entry);
2111
2112 return 0;
2113 }
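The io_apic_32-xen.c hunks above replace the repeated switches over mp_bus_id_to_type with a single mp_bus_not_pci bitmap: ISA/EISA/MCA buses set their bit, PCI buses leave it clear, and the "conforming" polarity and trigger cases collapse to one test_bit() ternary. Below is a self-contained sketch of that pattern, assuming small local set_bit()/test_bit() helpers in place of the kernel's bitmap API.

/* Sketch of the mp_bus_not_pci bitmap pattern used above: instead of a
 * switch over a per-bus type table, one bit answers the only question the
 * IRQ code needs ("is this bus PCI or not").  set_bit()/test_bit() here are
 * minimal local stand-ins for the kernel helpers. */
#include <stdio.h>

#define MAX_MP_BUSSES 32
#define BITS_PER_LONG (8 * (int)sizeof(unsigned long))

static unsigned long mp_bus_not_pci[(MAX_MP_BUSSES + BITS_PER_LONG - 1) / BITS_PER_LONG];

static void set_bit(int nr, unsigned long *map)
{
    map[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG);
}

static int test_bit(int nr, const unsigned long *map)
{
    return (map[nr / BITS_PER_LONG] >> (nr % BITS_PER_LONG)) & 1;
}

/* "conforming" polarity: bus-type dependent, but only ISA-vs-PCI matters */
static int conforming_polarity(int bus)
{
    return test_bit(bus, mp_bus_not_pci) ? 0  /* ISA-like: active high */
                                         : 1; /* PCI: active low */
}

int main(void)
{
    set_bit(0, mp_bus_not_pci);            /* bus 0: ISA */
    /* bus 1 left clear: PCI */

    printf("bus 0 polarity %d, bus 1 polarity %d\n",
           conforming_polarity(0), conforming_polarity(1));
    return 0;
}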
2114--- sle11-2009-05-14.orig/arch/x86/kernel/io_apic_64-xen.c 2009-03-16 16:33:40.000000000 +0100
2115+++ sle11-2009-05-14/arch/x86/kernel/io_apic_64-xen.c 2009-03-16 16:38:05.000000000 +0100
2116@@ -43,13 +43,15 @@
2117 #include <asm/smp.h>
2118 #include <asm/desc.h>
2119 #include <asm/proto.h>
2120-#include <asm/mach_apic.h>
2121 #include <asm/acpi.h>
2122 #include <asm/dma.h>
2123 #include <asm/nmi.h>
2124 #include <asm/msidef.h>
2125 #include <asm/hypertransport.h>
2126
2127+#include <mach_ipi.h>
2128+#include <mach_apic.h>
2129+
2130 struct irq_cfg {
2131 #ifndef CONFIG_XEN
2132 cpumask_t domain;
2133@@ -101,6 +103,16 @@ DEFINE_SPINLOCK(vector_lock);
2134 */
2135 int nr_ioapic_registers[MAX_IO_APICS];
2136
2137+/* I/O APIC entries */
2138+struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
2139+int nr_ioapics;
2140+
2141+/* MP IRQ source entries */
2142+struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
2143+
2144+/* # of MP IRQ source entries */
2145+int mp_irq_entries;
2146+
2147 /*
2148 * Rough estimation of how many shared IRQs there are, can
2149 * be changed anytime.
2150@@ -181,11 +193,10 @@ static inline void io_apic_modify(unsign
2151 writel(value, &io_apic->data);
2152 }
2153
2154-static int io_apic_level_ack_pending(unsigned int irq)
2155+static bool io_apic_level_ack_pending(unsigned int irq)
2156 {
2157 struct irq_pin_list *entry;
2158 unsigned long flags;
2159- int pending = 0;
2160
2161 spin_lock_irqsave(&ioapic_lock, flags);
2162 entry = irq_2_pin + irq;
2163@@ -198,13 +209,17 @@ static int io_apic_level_ack_pending(uns
2164 break;
2165 reg = io_apic_read(entry->apic, 0x10 + pin*2);
2166 /* Is the remote IRR bit set? */
2167- pending |= (reg >> 14) & 1;
2168+ if ((reg >> 14) & 1) {
2169+ spin_unlock_irqrestore(&ioapic_lock, flags);
2170+ return true;
2171+ }
2172 if (!entry->next)
2173 break;
2174 entry = irq_2_pin + entry->next;
2175 }
2176 spin_unlock_irqrestore(&ioapic_lock, flags);
2177- return pending;
2178+
2179+ return false;
2180 }
2181 #endif
2182
2183@@ -762,7 +777,7 @@ static void __clear_irq_vector(int irq)
2184 per_cpu(vector_irq, cpu)[vector] = -1;
2185
2186 cfg->vector = 0;
2187- cfg->domain = CPU_MASK_NONE;
2188+ cpus_clear(cfg->domain);
2189 }
2190
2191 void __setup_vector_irq(int cpu)
2192@@ -899,9 +914,8 @@ static void __init setup_IO_APIC_irqs(vo
2193 static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
2194 {
2195 struct IO_APIC_route_entry entry;
2196- unsigned long flags;
2197
2198- memset(&entry,0,sizeof(entry));
2199+ memset(&entry, 0, sizeof(entry));
2200
2201 disable_8259A_irq(0);
2202
2203@@ -929,10 +943,7 @@ static void __init setup_ExtINT_IRQ0_pin
2204 /*
2205 * Add it to the IO-APIC irq-routing table:
2206 */
2207- spin_lock_irqsave(&ioapic_lock, flags);
2208- io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
2209- io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
2210- spin_unlock_irqrestore(&ioapic_lock, flags);
2211+ ioapic_write_entry(apic, pin, entry);
2212
2213 enable_8259A_irq(0);
2214 }
2215@@ -1061,8 +1072,7 @@ void __apicdebuginit print_local_APIC(vo
2216
2217 printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
2218 smp_processor_id(), hard_smp_processor_id());
2219- v = apic_read(APIC_ID);
2220- printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(v));
2221+ printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(read_apic_id()));
2222 v = apic_read(APIC_LVR);
2223 printk(KERN_INFO "... APIC VERSION: %08x\n", v);
2224 ver = GET_APIC_VERSION(v);
2225@@ -1260,7 +1270,7 @@ void disable_IO_APIC(void)
2226 entry.dest_mode = 0; /* Physical */
2227 entry.delivery_mode = dest_ExtINT; /* ExtInt */
2228 entry.vector = 0;
2229- entry.dest = GET_APIC_ID(apic_read(APIC_ID));
2230+ entry.dest = GET_APIC_ID(read_apic_id());
2231
2232 /*
2233 * Add it to the IO-APIC irq-routing table:
2234@@ -1353,9 +1363,7 @@ static int ioapic_retrigger_irq(unsigned
2235 unsigned long flags;
2236
2237 spin_lock_irqsave(&vector_lock, flags);
2238- cpus_clear(mask);
2239- cpu_set(first_cpu(cfg->domain), mask);
2240-
2241+ mask = cpumask_of_cpu(first_cpu(cfg->domain));
2242 send_IPI_mask(mask, cfg->vector);
2243 spin_unlock_irqrestore(&vector_lock, flags);
2244
2245@@ -1519,8 +1527,7 @@ static inline void init_IO_APIC_traps(vo
2246 * 0x80, because int 0x80 is hm, kind of importantish. ;)
2247 */
2248 for (irq = 0; irq < NR_IRQS ; irq++) {
2249- int tmp = irq;
2250- if (IO_APIC_IRQ(tmp) && !irq_cfg[tmp].vector) {
2251+ if (IO_APIC_IRQ(irq) && !irq_cfg[irq].vector) {
2252 /*
2253 * Hmm.. We don't have an entry for this,
2254 * so default to an old-fashioned 8259
2255@@ -1597,22 +1604,19 @@ static void __init setup_nmi(void)
2256 * cycles as some i82489DX-based boards have glue logic that keeps the
2257 * 8259A interrupt line asserted until INTA. --macro
2258 */
2259-static inline void unlock_ExtINT_logic(void)
2260+static inline void __init unlock_ExtINT_logic(void)
2261 {
2262 int apic, pin, i;
2263 struct IO_APIC_route_entry entry0, entry1;
2264 unsigned char save_control, save_freq_select;
2265- unsigned long flags;
2266
2267 pin = find_isa_irq_pin(8, mp_INT);
2268 apic = find_isa_irq_apic(8, mp_INT);
2269 if (pin == -1)
2270 return;
2271
2272- spin_lock_irqsave(&ioapic_lock, flags);
2273- *(((int *)&entry0) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
2274- *(((int *)&entry0) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
2275- spin_unlock_irqrestore(&ioapic_lock, flags);
2276+ entry0 = ioapic_read_entry(apic, pin);
2277+
2278 clear_IO_APIC_pin(apic, pin);
2279
2280 memset(&entry1, 0, sizeof(entry1));
2281@@ -1625,10 +1629,7 @@ static inline void unlock_ExtINT_logic(v
2282 entry1.trigger = 0;
2283 entry1.vector = 0;
2284
2285- spin_lock_irqsave(&ioapic_lock, flags);
2286- io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry1) + 1));
2287- io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry1) + 0));
2288- spin_unlock_irqrestore(&ioapic_lock, flags);
2289+ ioapic_write_entry(apic, pin, entry1);
2290
2291 save_control = CMOS_READ(RTC_CONTROL);
2292 save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
2293@@ -1647,10 +1648,7 @@ static inline void unlock_ExtINT_logic(v
2294 CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
2295 clear_IO_APIC_pin(apic, pin);
2296
2297- spin_lock_irqsave(&ioapic_lock, flags);
2298- io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry0) + 1));
2299- io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry0) + 0));
2300- spin_unlock_irqrestore(&ioapic_lock, flags);
2301+ ioapic_write_entry(apic, pin, entry0);
2302 }
2303
2304 /*
2305@@ -2327,7 +2325,6 @@ static struct resource * __init ioapic_s
2306 res = (void *)mem;
2307
2308 if (mem != NULL) {
2309- memset(mem, 0, n);
2310 mem += sizeof(struct resource) * nr_ioapics;
2311
2312 for (i = 0; i < nr_ioapics; i++) {
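Several hunks above fold the open-coded pairs of io_apic_write() calls, previously done by hand under ioapic_lock at each call site, into the ioapic_write_entry()/ioapic_read_entry() helpers. The sketch below is a user-space model of that consolidation; the simplified route_entry layout, the fake register array and the pthread mutex standing in for ioapic_lock are illustrative assumptions, not the real I/O APIC programming interface.

/* User-space model of the ioapic_write_entry() consolidation above: a
 * routing entry is still programmed as two 32-bit register writes, but the
 * locking and the word order live in one helper instead of being repeated
 * at every call site.  The "register file" and the pthread mutex stand in
 * for the real I/O APIC window and ioapic_lock. */
#include <stdio.h>
#include <stdint.h>
#include <pthread.h>

struct route_entry {                 /* much-simplified IO_APIC_route_entry */
    uint32_t low;                    /* vector, delivery mode, mask bit, ... */
    uint32_t high;                   /* destination */
};

static uint32_t fake_regs[24 * 2];   /* two 32-bit words per pin */
static pthread_mutex_t ioapic_lock = PTHREAD_MUTEX_INITIALIZER;

static void io_apic_write(unsigned reg, uint32_t value)
{
    fake_regs[reg - 0x10] = value;   /* redirection table starts at register 0x10 */
}

static void ioapic_write_entry(unsigned pin, struct route_entry e)
{
    pthread_mutex_lock(&ioapic_lock);
    io_apic_write(0x11 + 2 * pin, e.high);  /* high word first, as in the patch */
    io_apic_write(0x10 + 2 * pin, e.low);   /* low word (with the mask bit) last */
    pthread_mutex_unlock(&ioapic_lock);
}

int main(void)
{
    struct route_entry e = { .low = 0x30 /* vector */, .high = 0x01000000 };

    ioapic_write_entry(2, e);
    printf("pin 2 regs: low=%#x high=%#x\n",
           (unsigned)fake_regs[4], (unsigned)fake_regs[5]);
    return 0;
}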
2313--- /dev/null 1970-01-01 00:00:00.000000000 +0000
2314+++ sle11-2009-05-14/arch/x86/kernel/ipi-xen.c 2009-03-16 16:38:05.000000000 +0100
2315@@ -0,0 +1,232 @@
2316+#include <linux/cpumask.h>
2317+#include <linux/interrupt.h>
2318+#include <linux/init.h>
2319+
2320+#include <linux/mm.h>
2321+#include <linux/delay.h>
2322+#include <linux/spinlock.h>
2323+#include <linux/kernel_stat.h>
2324+#include <linux/mc146818rtc.h>
2325+#include <linux/cache.h>
2326+#include <linux/interrupt.h>
2327+#include <linux/cpu.h>
2328+#include <linux/module.h>
2329+
2330+#include <asm/smp.h>
2331+#include <asm/mtrr.h>
2332+#include <asm/tlbflush.h>
2333+#include <asm/mmu_context.h>
2334+#include <asm/apic.h>
2335+#include <asm/proto.h>
2336+
2337+#ifdef CONFIG_X86_32
2338+#ifndef CONFIG_XEN
2339+#include <mach_apic.h>
2340+/*
2341+ * the following functions deal with sending IPIs between CPUs.
2342+ *
2343+ * We use 'broadcast', CPU->CPU IPIs and self-IPIs too.
2344+ */
2345+
2346+static inline int __prepare_ICR(unsigned int shortcut, int vector)
2347+{
2348+ unsigned int icr = shortcut | APIC_DEST_LOGICAL;
2349+
2350+ switch (vector) {
2351+ default:
2352+ icr |= APIC_DM_FIXED | vector;
2353+ break;
2354+ case NMI_VECTOR:
2355+ icr |= APIC_DM_NMI;
2356+ break;
2357+ }
2358+ return icr;
2359+}
2360+
2361+static inline int __prepare_ICR2(unsigned int mask)
2362+{
2363+ return SET_APIC_DEST_FIELD(mask);
2364+}
2365+#else
2366+#include <xen/evtchn.h>
2367+
2368+DECLARE_PER_CPU(int, ipi_to_irq[NR_IPIS]);
2369+
2370+static inline void __send_IPI_one(unsigned int cpu, int vector)
2371+{
2372+ int irq = per_cpu(ipi_to_irq, cpu)[vector];
2373+ BUG_ON(irq < 0);
2374+ notify_remote_via_irq(irq);
2375+}
2376+#endif
2377+
2378+void __send_IPI_shortcut(unsigned int shortcut, int vector)
2379+{
2380+#ifndef CONFIG_XEN
2381+ /*
2382+ * Subtle. In the case of the 'never do double writes' workaround
2383+ * we have to lock out interrupts to be safe. As we don't care
2384+ * of the value read we use an atomic rmw access to avoid costly
2385+ * cli/sti. Otherwise we use an even cheaper single atomic write
2386+ * to the APIC.
2387+ */
2388+ unsigned int cfg;
2389+
2390+ /*
2391+ * Wait for idle.
2392+ */
2393+ apic_wait_icr_idle();
2394+
2395+ /*
2396+ * No need to touch the target chip field
2397+ */
2398+ cfg = __prepare_ICR(shortcut, vector);
2399+
2400+ /*
2401+ * Send the IPI. The write to APIC_ICR fires this off.
2402+ */
2403+ apic_write_around(APIC_ICR, cfg);
2404+#else
2405+ int cpu;
2406+
2407+ switch (shortcut) {
2408+ case APIC_DEST_SELF:
2409+ __send_IPI_one(smp_processor_id(), vector);
2410+ break;
2411+ case APIC_DEST_ALLBUT:
2412+ for_each_online_cpu(cpu)
2413+ if (cpu != smp_processor_id())
2414+ __send_IPI_one(cpu, vector);
2415+ break;
2416+ default:
2417+ printk("XXXXXX __send_IPI_shortcut %08x vector %d\n", shortcut,
2418+ vector);
2419+ break;
2420+ }
2421+#endif
2422+}
2423+
2424+void send_IPI_self(int vector)
2425+{
2426+ __send_IPI_shortcut(APIC_DEST_SELF, vector);
2427+}
2428+
2429+#ifndef CONFIG_XEN
2430+/*
2431+ * This is used to send an IPI with no shorthand notation (the destination is
2432+ * specified in bits 56 to 63 of the ICR).
2433+ */
2434+static inline void __send_IPI_dest_field(unsigned long mask, int vector)
2435+{
2436+ unsigned long cfg;
2437+
2438+ /*
2439+ * Wait for idle.
2440+ */
2441+ if (unlikely(vector == NMI_VECTOR))
2442+ safe_apic_wait_icr_idle();
2443+ else
2444+ apic_wait_icr_idle();
2445+
2446+ /*
2447+ * prepare target chip field
2448+ */
2449+ cfg = __prepare_ICR2(mask);
2450+ apic_write_around(APIC_ICR2, cfg);
2451+
2452+ /*
2453+ * program the ICR
2454+ */
2455+ cfg = __prepare_ICR(0, vector);
2456+
2457+ /*
2458+ * Send the IPI. The write to APIC_ICR fires this off.
2459+ */
2460+ apic_write_around(APIC_ICR, cfg);
2461+}
2462+#endif
2463+
2464+/*
2465+ * This is only used on smaller machines.
2466+ */
2467+void send_IPI_mask_bitmask(cpumask_t cpumask, int vector)
2468+{
2469+#ifndef CONFIG_XEN
2470+ unsigned long mask = cpus_addr(cpumask)[0];
2471+#else
2472+ cpumask_t mask;
2473+ unsigned int cpu;
2474+#endif
2475+ unsigned long flags;
2476+
2477+ local_irq_save(flags);
2478+#ifndef CONFIG_XEN
2479+ WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]);
2480+ __send_IPI_dest_field(mask, vector);
2481+#else
2482+ cpus_andnot(mask, cpumask, cpu_online_map);
2483+ WARN_ON(!cpus_empty(mask));
2484+ for_each_online_cpu(cpu)
2485+ if (cpu_isset(cpu, cpumask))
2486+ __send_IPI_one(cpu, vector);
2487+#endif
2488+ local_irq_restore(flags);
2489+}
2490+
2491+void send_IPI_mask_sequence(cpumask_t mask, int vector)
2492+{
2493+#ifndef CONFIG_XEN
2494+ unsigned long flags;
2495+ unsigned int query_cpu;
2496+
2497+ /*
2498+ * Hack. The clustered APIC addressing mode doesn't allow us to send
2499+ * to an arbitrary mask, so I do a unicasts to each CPU instead. This
2500+ * should be modified to do 1 message per cluster ID - mbligh
2501+ */
2502+
2503+ local_irq_save(flags);
2504+ for_each_possible_cpu(query_cpu) {
2505+ if (cpu_isset(query_cpu, mask)) {
2506+ __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu),
2507+ vector);
2508+ }
2509+ }
2510+ local_irq_restore(flags);
2511+#else
2512+ send_IPI_mask_bitmask(mask, vector);
2513+#endif
2514+}
2515+
2516+/* must come after the send_IPI functions above for inlining */
2517+#include <mach_ipi.h>
2518+
2519+#ifndef CONFIG_XEN
2520+static int convert_apicid_to_cpu(int apic_id)
2521+{
2522+ int i;
2523+
2524+ for_each_possible_cpu(i) {
2525+ if (per_cpu(x86_cpu_to_apicid, i) == apic_id)
2526+ return i;
2527+ }
2528+ return -1;
2529+}
2530+
2531+int safe_smp_processor_id(void)
2532+{
2533+ int apicid, cpuid;
2534+
2535+ if (!boot_cpu_has(X86_FEATURE_APIC))
2536+ return 0;
2537+
2538+ apicid = hard_smp_processor_id();
2539+ if (apicid == BAD_APICID)
2540+ return 0;
2541+
2542+ cpuid = convert_apicid_to_cpu(apicid);
2543+
2544+ return cpuid >= 0 ? cpuid : 0;
2545+}
2546+#endif
2547+#endif
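Under CONFIG_XEN, the new ipi-xen.c above never touches the local APIC ICR: __send_IPI_one() looks up the event-channel IRQ bound to (cpu, vector) in the per-CPU ipi_to_irq table and calls notify_remote_via_irq(), and the shortcut/mask senders just iterate the online CPUs. The following is a user-space model of that dispatch only; NR_CPUS, the flat ipi_to_irq table, this_cpu and the printf standing in for the event-channel kick are simplifications for illustration.

/* User-space model of the Xen IPI dispatch above: no ICR writes, just
 * "which event-channel IRQ is bound to (cpu, vector)?" followed by a
 * notification.  The table layout and notify_remote_via_irq() body are
 * stand-ins for the per-CPU kernel structures. */
#include <stdio.h>
#include <assert.h>

#define NR_CPUS 4
#define NR_IPIS 8
#define APIC_DEST_SELF   1
#define APIC_DEST_ALLBUT 2

static int ipi_to_irq[NR_CPUS][NR_IPIS]; /* filled in when event channels are bound */
static int this_cpu = 0;                 /* smp_processor_id() stand-in */

static void notify_remote_via_irq(int irq)
{
    printf("kick event-channel irq %d\n", irq);
}

static void send_IPI_one(unsigned cpu, int vector)
{
    int irq = ipi_to_irq[cpu][vector];
    assert(irq >= 0);                    /* BUG_ON(irq < 0) in the patch */
    notify_remote_via_irq(irq);
}

static void send_IPI_shortcut(unsigned shortcut, int vector)
{
    unsigned cpu;

    switch (shortcut) {
    case APIC_DEST_SELF:
        send_IPI_one(this_cpu, vector);
        break;
    case APIC_DEST_ALLBUT:
        for (cpu = 0; cpu < NR_CPUS; cpu++)   /* for_each_online_cpu() */
            if (cpu != (unsigned)this_cpu)
                send_IPI_one(cpu, vector);
        break;
    }
}

int main(void)
{
    int cpu, v;

    for (cpu = 0; cpu < NR_CPUS; cpu++)       /* pretend binding already happened */
        for (v = 0; v < NR_IPIS; v++)
            ipi_to_irq[cpu][v] = 100 + cpu * NR_IPIS + v;

    send_IPI_shortcut(APIC_DEST_ALLBUT, 2);   /* e.g. a reschedule IPI */
    return 0;
}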
2548--- sle11-2009-05-14.orig/arch/x86/kernel/irq_32-xen.c 2009-03-16 16:33:40.000000000 +0100
2549+++ sle11-2009-05-14/arch/x86/kernel/irq_32-xen.c 2009-03-16 16:38:05.000000000 +0100
2550@@ -79,7 +79,7 @@ unsigned int do_IRQ(struct pt_regs *regs
2551
2552 if (unlikely((unsigned)irq >= NR_IRQS)) {
2553 printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
2554- __FUNCTION__, irq);
2555+ __func__, irq);
2556 BUG();
2557 }
2558
2559@@ -134,7 +134,7 @@ unsigned int do_IRQ(struct pt_regs *regs
2560 : "=a" (arg1), "=d" (arg2), "=b" (bx)
2561 : "0" (irq), "1" (desc), "2" (isp),
2562 "D" (desc->handle_irq)
2563- : "memory", "cc"
2564+ : "memory", "cc", "ecx"
2565 );
2566 } else
2567 #endif
2568@@ -190,8 +190,6 @@ void irq_ctx_exit(int cpu)
2569 hardirq_ctx[cpu] = NULL;
2570 }
2571
2572-extern asmlinkage void __do_softirq(void);
2573-
2574 asmlinkage void do_softirq(void)
2575 {
2576 unsigned long flags;
2577--- sle11-2009-05-14.orig/arch/x86/kernel/machine_kexec_64.c 2009-03-16 16:33:40.000000000 +0100
2578+++ sle11-2009-05-14/arch/x86/kernel/machine_kexec_64.c 2009-03-16 16:38:05.000000000 +0100
2579@@ -120,8 +120,6 @@ int __init machine_kexec_setup_resources
2580 return 0;
2581 }
2582
2583-void machine_kexec_register_resources(struct resource *res) { ; }
2584-
2585 #else /* CONFIG_XEN */
2586
2587 #define x__pmd(x) __pmd(x)
2588--- sle11-2009-05-14.orig/arch/x86/kernel/microcode-xen.c 2009-03-16 16:33:40.000000000 +0100
2589+++ sle11-2009-05-14/arch/x86/kernel/microcode-xen.c 2009-03-16 16:38:05.000000000 +0100
2590@@ -162,7 +162,7 @@ static int request_microcode(void)
2591 c->x86, c->x86_model, c->x86_mask);
2592 error = request_firmware(&firmware, name, &microcode_pdev->dev);
2593 if (error) {
2594- pr_debug("ucode data file %s load failed\n", name);
2595+ pr_debug("microcode: ucode data file %s load failed\n", name);
2596 return error;
2597 }
2598
2599--- sle11-2009-05-14.orig/arch/x86/kernel/mmconf-fam10h_64.c 2009-05-14 10:56:29.000000000 +0200
2600+++ sle11-2009-05-14/arch/x86/kernel/mmconf-fam10h_64.c 2009-03-16 16:38:05.000000000 +0100
2601@@ -219,6 +219,16 @@ void __cpuinit fam10h_check_enable_mmcfg
2602 val |= fam10h_pci_mmconf_base | (8 << FAM10H_MMIO_CONF_BUSRANGE_SHIFT) |
2603 FAM10H_MMIO_CONF_ENABLE;
2604 wrmsrl(address, val);
2605+
2606+#ifdef CONFIG_XEN
2607+ {
2608+ u64 val2;
2609+
2610+ rdmsrl(address, val2);
2611+ if (val2 != val)
2612+ pci_probe &= ~PCI_CHECK_ENABLE_AMD_MMCONF;
2613+ }
2614+#endif
2615 }
2616
2617 static int __devinit set_check_enable_amd_mmconf(const struct dmi_system_id *d)
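The Xen-only block added to fam10h_check_enable_mmcfg() above re-reads the MSR after the wrmsrl() and clears PCI_CHECK_ENABLE_AMD_MMCONF from pci_probe when the value did not stick, since a hypervisor may filter the write. Below is a user-space model of that verify-after-write pattern; the fake_wrmsr()/fake_rdmsr() helpers, the hypervisor_filters_writes switch, the flag value and the MSR contents are all illustrative assumptions rather than real interfaces.

/* Model of the verify-after-write pattern added for Xen above: a privileged
 * write that the hypervisor may silently ignore is read back, and the
 * feature depending on it is disabled when the value did not stick. */
#include <stdio.h>
#include <stdint.h>

#define PCI_CHECK_ENABLE_AMD_MMCONF 0x1   /* flag value illustrative only */

static uint64_t fake_msr;
static int hypervisor_filters_writes = 1; /* flip to 0 to model bare metal */
static unsigned pci_probe = PCI_CHECK_ENABLE_AMD_MMCONF;

static void fake_wrmsr(uint64_t val)
{
    if (!hypervisor_filters_writes)
        fake_msr = val;                   /* otherwise the write is dropped */
}

static uint64_t fake_rdmsr(void)
{
    return fake_msr;
}

int main(void)
{
    uint64_t val = 0xe0000021ULL;         /* some base | enable bits; layout not important here */

    fake_wrmsr(val);
    if (fake_rdmsr() != val) {
        pci_probe &= ~PCI_CHECK_ENABLE_AMD_MMCONF;
        printf("MSR write did not stick, MMCONF check disabled (pci_probe=%#x)\n",
               pci_probe);
    } else {
        printf("MMCONF MSR programmed: %#llx\n", (unsigned long long)fake_rdmsr());
    }
    return 0;
}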
2618--- /dev/null 1970-01-01 00:00:00.000000000 +0000
2619+++ sle11-2009-05-14/arch/x86/kernel/mpparse-xen.c 2009-03-16 16:38:05.000000000 +0100
2620@@ -0,0 +1,1101 @@
2621+/*
2622+ * Intel Multiprocessor Specification 1.1 and 1.4
2623+ * compliant MP-table parsing routines.
2624+ *
2625+ * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
2626+ * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
2627+ * (c) 2008 Alexey Starikovskiy <astarikovskiy@suse.de>
2628+ */
2629+
2630+#include <linux/mm.h>
2631+#include <linux/init.h>
2632+#include <linux/delay.h>
2633+#include <linux/bootmem.h>
2634+#include <linux/kernel_stat.h>
2635+#include <linux/mc146818rtc.h>
2636+#include <linux/bitops.h>
2637+#include <linux/acpi.h>
2638+#include <linux/module.h>
2639+
2640+#include <asm/smp.h>
2641+#include <asm/mtrr.h>
2642+#include <asm/mpspec.h>
2643+#include <asm/pgalloc.h>
2644+#include <asm/io_apic.h>
2645+#include <asm/proto.h>
2646+#include <asm/acpi.h>
2647+#include <asm/bios_ebda.h>
2648+
2649+#include <mach_apic.h>
2650+#ifdef CONFIG_X86_32
2651+#include <mach_apicdef.h>
2652+#include <mach_mpparse.h>
2653+#endif
2654+
2655+/* Have we found an MP table */
2656+int smp_found_config;
2657+
2658+/*
2659+ * Various Linux-internal data structures created from the
2660+ * MP-table.
2661+ */
2662+#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
2663+int mp_bus_id_to_type[MAX_MP_BUSSES];
2664+#endif
2665+
2666+DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
2667+int mp_bus_id_to_pci_bus[MAX_MP_BUSSES] = {[0 ... MAX_MP_BUSSES - 1] = -1 };
2668+
2669+static int mp_current_pci_id;
2670+
2671+int pic_mode;
2672+
2673+/*
2674+ * Intel MP BIOS table parsing routines:
2675+ */
2676+
2677+/*
2678+ * Checksum an MP configuration block.
2679+ */
2680+
2681+static int __init mpf_checksum(unsigned char *mp, int len)
2682+{
2683+ int sum = 0;
2684+
2685+ while (len--)
2686+ sum += *mp++;
2687+
2688+ return sum & 0xFF;
2689+}
2690+
2691+#ifdef CONFIG_X86_NUMAQ
2692+/*
2693+ * Have to match translation table entries to main table entries by counter
2694+ * hence the mpc_record variable .... can't see a less disgusting way of
2695+ * doing this ....
2696+ */
2697+
2698+static int mpc_record;
2699+static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY]
2700+ __cpuinitdata;
2701+#endif
2702+
2703+static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
2704+{
2705+#ifndef CONFIG_XEN
2706+ int apicid;
2707+ char *bootup_cpu = "";
2708+
2709+ if (!(m->mpc_cpuflag & CPU_ENABLED)) {
2710+ disabled_cpus++;
2711+ return;
2712+ }
2713+#ifdef CONFIG_X86_NUMAQ
2714+ apicid = mpc_apic_id(m, translation_table[mpc_record]);
2715+#else
2716+ apicid = m->mpc_apicid;
2717+#endif
2718+ if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
2719+ bootup_cpu = " (Bootup-CPU)";
2720+ boot_cpu_physical_apicid = m->mpc_apicid;
2721+ }
2722+
2723+ printk(KERN_INFO "Processor #%d%s\n", m->mpc_apicid, bootup_cpu);
2724+ generic_processor_info(apicid, m->mpc_apicver);
2725+#else /* CONFIG_XEN */
2726+ num_processors++;
2727+#endif
2728+}
2729+
2730+static void __init MP_bus_info(struct mpc_config_bus *m)
2731+{
2732+ char str[7];
2733+
2734+ memcpy(str, m->mpc_bustype, 6);
2735+ str[6] = 0;
2736+
2737+#ifdef CONFIG_X86_NUMAQ
2738+ mpc_oem_bus_info(m, str, translation_table[mpc_record]);
2739+#else
2740+ Dprintk("Bus #%d is %s\n", m->mpc_busid, str);
2741+#endif
2742+
2743+#if MAX_MP_BUSSES < 256
2744+ if (m->mpc_busid >= MAX_MP_BUSSES) {
2745+ printk(KERN_WARNING "MP table busid value (%d) for bustype %s "
2746+ " is too large, max. supported is %d\n",
2747+ m->mpc_busid, str, MAX_MP_BUSSES - 1);
2748+ return;
2749+ }
2750+#endif
2751+
2752+ if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA) - 1) == 0) {
2753+ set_bit(m->mpc_busid, mp_bus_not_pci);
2754+#if defined(CONFIG_EISA) || defined (CONFIG_MCA)
2755+ mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
2756+#endif
2757+ } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) {
2758+#ifdef CONFIG_X86_NUMAQ
2759+ mpc_oem_pci_bus(m, translation_table[mpc_record]);
2760+#endif
2761+ clear_bit(m->mpc_busid, mp_bus_not_pci);
2762+ mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
2763+ mp_current_pci_id++;
2764+#if defined(CONFIG_EISA) || defined (CONFIG_MCA)
2765+ mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
2766+ } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) {
2767+ mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA;
2768+ } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA) - 1) == 0) {
2769+ mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA;
2770+#endif
2771+ } else
2772+ printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
2773+}
2774+
2775+#ifdef CONFIG_X86_IO_APIC
2776+
2777+static int bad_ioapic(unsigned long address)
2778+{
2779+ if (nr_ioapics >= MAX_IO_APICS) {
2780+ printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
2781+ "(found %d)\n", MAX_IO_APICS, nr_ioapics);
2782+ panic("Recompile kernel with bigger MAX_IO_APICS!\n");
2783+ }
2784+ if (!address) {
2785+ printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
2786+ " found in table, skipping!\n");
2787+ return 1;
2788+ }
2789+ return 0;
2790+}
2791+
2792+static void __init MP_ioapic_info(struct mpc_config_ioapic *m)
2793+{
2794+ if (!(m->mpc_flags & MPC_APIC_USABLE))
2795+ return;
2796+
2797+ printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n",
2798+ m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
2799+
2800+ if (bad_ioapic(m->mpc_apicaddr))
2801+ return;
2802+
2803+ mp_ioapics[nr_ioapics] = *m;
2804+ nr_ioapics++;
2805+}
2806+
2807+static void __init MP_intsrc_info(struct mpc_config_intsrc *m)
2808+{
2809+ mp_irqs[mp_irq_entries] = *m;
2810+ Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
2811+ " IRQ %02x, APIC ID %x, APIC INT %02x\n",
2812+ m->mpc_irqtype, m->mpc_irqflag & 3,
2813+ (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
2814+ m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
2815+ if (++mp_irq_entries == MAX_IRQ_SOURCES)
2816+ panic("Max # of irq sources exceeded!!\n");
2817+}
2818+
2819+#endif
2820+
2821+static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m)
2822+{
2823+ Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
2824+ " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
2825+ m->mpc_irqtype, m->mpc_irqflag & 3,
2826+ (m->mpc_irqflag >> 2) & 3, m->mpc_srcbusid,
2827+ m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
2828+}
2829+
2830+#ifdef CONFIG_X86_NUMAQ
2831+static void __init MP_translation_info(struct mpc_config_translation *m)
2832+{
2833+ printk(KERN_INFO
2834+ "Translation: record %d, type %d, quad %d, global %d, local %d\n",
2835+ mpc_record, m->trans_type, m->trans_quad, m->trans_global,
2836+ m->trans_local);
2837+
2838+ if (mpc_record >= MAX_MPC_ENTRY)
2839+ printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
2840+ else
2841+ translation_table[mpc_record] = m; /* stash this for later */
2842+ if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
2843+ node_set_online(m->trans_quad);
2844+}
2845+
2846+/*
2847+ * Read/parse the MPC oem tables
2848+ */
2849+
2850+static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable,
2851+ unsigned short oemsize)
2852+{
2853+ int count = sizeof(*oemtable); /* the header size */
2854+ unsigned char *oemptr = ((unsigned char *)oemtable) + count;
2855+
2856+ mpc_record = 0;
2857+ printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n",
2858+ oemtable);
2859+ if (memcmp(oemtable->oem_signature, MPC_OEM_SIGNATURE, 4)) {
2860+ printk(KERN_WARNING
2861+ "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
2862+ oemtable->oem_signature[0], oemtable->oem_signature[1],
2863+ oemtable->oem_signature[2], oemtable->oem_signature[3]);
2864+ return;
2865+ }
2866+ if (mpf_checksum((unsigned char *)oemtable, oemtable->oem_length)) {
2867+ printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
2868+ return;
2869+ }
2870+ while (count < oemtable->oem_length) {
2871+ switch (*oemptr) {
2872+ case MP_TRANSLATION:
2873+ {
2874+ struct mpc_config_translation *m =
2875+ (struct mpc_config_translation *)oemptr;
2876+ MP_translation_info(m);
2877+ oemptr += sizeof(*m);
2878+ count += sizeof(*m);
2879+ ++mpc_record;
2880+ break;
2881+ }
2882+ default:
2883+ {
2884+ printk(KERN_WARNING
2885+ "Unrecognised OEM table entry type! - %d\n",
2886+ (int)*oemptr);
2887+ return;
2888+ }
2889+ }
2890+ }
2891+}
2892+
2893+static inline void mps_oem_check(struct mp_config_table *mpc, char *oem,
2894+ char *productid)
2895+{
2896+ if (strncmp(oem, "IBM NUMA", 8))
2897+ printk("Warning! May not be a NUMA-Q system!\n");
2898+ if (mpc->mpc_oemptr)
2899+ smp_read_mpc_oem((struct mp_config_oemtable *)mpc->mpc_oemptr,
2900+ mpc->mpc_oemsize);
2901+}
2902+#endif /* CONFIG_X86_NUMAQ */
2903+
2904+/*
2905+ * Read/parse the MPC
2906+ */
2907+
2908+static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
2909+{
2910+ char str[16];
2911+ char oem[10];
2912+ int count = sizeof(*mpc);
2913+ unsigned char *mpt = ((unsigned char *)mpc) + count;
2914+
2915+ if (memcmp(mpc->mpc_signature, MPC_SIGNATURE, 4)) {
2916+ printk(KERN_ERR "MPTABLE: bad signature [%c%c%c%c]!\n",
2917+ mpc->mpc_signature[0], mpc->mpc_signature[1],
2918+ mpc->mpc_signature[2], mpc->mpc_signature[3]);
2919+ return 0;
2920+ }
2921+ if (mpf_checksum((unsigned char *)mpc, mpc->mpc_length)) {
2922+ printk(KERN_ERR "MPTABLE: checksum error!\n");
2923+ return 0;
2924+ }
2925+ if (mpc->mpc_spec != 0x01 && mpc->mpc_spec != 0x04) {
2926+ printk(KERN_ERR "MPTABLE: bad table version (%d)!!\n",
2927+ mpc->mpc_spec);
2928+ return 0;
2929+ }
2930+ if (!mpc->mpc_lapic) {
2931+ printk(KERN_ERR "MPTABLE: null local APIC address!\n");
2932+ return 0;
2933+ }
2934+ memcpy(oem, mpc->mpc_oem, 8);
2935+ oem[8] = 0;
2936+ printk(KERN_INFO "MPTABLE: OEM ID: %s ", oem);
2937+
2938+ memcpy(str, mpc->mpc_productid, 12);
2939+ str[12] = 0;
2940+ printk("Product ID: %s ", str);
2941+
2942+#ifdef CONFIG_X86_32
2943+ mps_oem_check(mpc, oem, str);
2944+#endif
2945+ printk(KERN_INFO "MPTABLE: Product ID: %s ", str);
2946+
2947+ printk(KERN_INFO "MPTABLE: APIC at: 0x%X\n", mpc->mpc_lapic);
2948+
2949+ /* save the local APIC address, it might be non-default */
2950+ if (!acpi_lapic)
2951+ mp_lapic_addr = mpc->mpc_lapic;
2952+
2953+ if (early)
2954+ return 1;
2955+
2956+ /*
2957+ * Now process the configuration blocks.
2958+ */
2959+#ifdef CONFIG_X86_NUMAQ
2960+ mpc_record = 0;
2961+#endif
2962+ while (count < mpc->mpc_length) {
2963+ switch (*mpt) {
2964+ case MP_PROCESSOR:
2965+ {
2966+ struct mpc_config_processor *m =
2967+ (struct mpc_config_processor *)mpt;
2968+ /* ACPI may have already provided this data */
2969+ if (!acpi_lapic)
2970+ MP_processor_info(m);
2971+ mpt += sizeof(*m);
2972+ count += sizeof(*m);
2973+ break;
2974+ }
2975+ case MP_BUS:
2976+ {
2977+ struct mpc_config_bus *m =
2978+ (struct mpc_config_bus *)mpt;
2979+ MP_bus_info(m);
2980+ mpt += sizeof(*m);
2981+ count += sizeof(*m);
2982+ break;
2983+ }
2984+ case MP_IOAPIC:
2985+ {
2986+#ifdef CONFIG_X86_IO_APIC
2987+ struct mpc_config_ioapic *m =
2988+ (struct mpc_config_ioapic *)mpt;
2989+ MP_ioapic_info(m);
2990+#endif
2991+ mpt += sizeof(struct mpc_config_ioapic);
2992+ count += sizeof(struct mpc_config_ioapic);
2993+ break;
2994+ }
2995+ case MP_INTSRC:
2996+ {
2997+#ifdef CONFIG_X86_IO_APIC
2998+ struct mpc_config_intsrc *m =
2999+ (struct mpc_config_intsrc *)mpt;
3000+
3001+ MP_intsrc_info(m);
3002+#endif
3003+ mpt += sizeof(struct mpc_config_intsrc);
3004+ count += sizeof(struct mpc_config_intsrc);
3005+ break;
3006+ }
3007+ case MP_LINTSRC:
3008+ {
3009+ struct mpc_config_lintsrc *m =
3010+ (struct mpc_config_lintsrc *)mpt;
3011+ MP_lintsrc_info(m);
3012+ mpt += sizeof(*m);
3013+ count += sizeof(*m);
3014+ break;
3015+ }
3016+ default:
3017+ /* wrong mptable */
3018+ printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n");
3019+ printk(KERN_ERR "type %x\n", *mpt);
3020+ print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16,
3021+ 1, mpc, mpc->mpc_length, 1);
3022+ count = mpc->mpc_length;
3023+ break;
3024+ }
3025+#ifdef CONFIG_X86_NUMAQ
3026+ ++mpc_record;
3027+#endif
3028+ }
3029+ setup_apic_routing();
3030+ if (!num_processors)
3031+ printk(KERN_ERR "MPTABLE: no processors registered!\n");
3032+ return num_processors;
3033+}
3034+
3035+#ifdef CONFIG_X86_IO_APIC
3036+
3037+static int __init ELCR_trigger(unsigned int irq)
3038+{
3039+ unsigned int port;
3040+
3041+ port = 0x4d0 + (irq >> 3);
3042+ return (inb(port) >> (irq & 7)) & 1;
3043+}
3044+
3045+static void __init construct_default_ioirq_mptable(int mpc_default_type)
3046+{
3047+ struct mpc_config_intsrc intsrc;
3048+ int i;
3049+ int ELCR_fallback = 0;
3050+
3051+ intsrc.mpc_type = MP_INTSRC;
3052+ intsrc.mpc_irqflag = 0; /* conforming */
3053+ intsrc.mpc_srcbus = 0;
3054+ intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
3055+
3056+ intsrc.mpc_irqtype = mp_INT;
3057+
3058+ /*
3059+ * If true, we have an ISA/PCI system with no IRQ entries
3060+ * in the MP table. To prevent the PCI interrupts from being set up
3061+ * incorrectly, we try to use the ELCR. The sanity check to see if
3062+ * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can
3063+ * never be level sensitive, so we simply see if the ELCR agrees.
3064+ * If it does, we assume it's valid.
3065+ */
3066+ if (mpc_default_type == 5) {
3067+ printk(KERN_INFO "ISA/PCI bus type with no IRQ information... "
3068+ "falling back to ELCR\n");
3069+
3070+ if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) ||
3071+ ELCR_trigger(13))
3072+ printk(KERN_ERR "ELCR contains invalid data... "
3073+ "not using ELCR\n");
3074+ else {
3075+ printk(KERN_INFO
3076+ "Using ELCR to identify PCI interrupts\n");
3077+ ELCR_fallback = 1;
3078+ }
3079+ }
3080+
3081+ for (i = 0; i < 16; i++) {
3082+ switch (mpc_default_type) {
3083+ case 2:
3084+ if (i == 0 || i == 13)
3085+ continue; /* IRQ0 & IRQ13 not connected */
3086+ /* fall through */
3087+ default:
3088+ if (i == 2)
3089+ continue; /* IRQ2 is never connected */
3090+ }
3091+
3092+ if (ELCR_fallback) {
3093+ /*
3094+ * If the ELCR indicates a level-sensitive interrupt, we
3095+ * copy that information over to the MP table in the
3096+ * irqflag field (level sensitive, active high polarity).
3097+ */
3098+ if (ELCR_trigger(i))
3099+ intsrc.mpc_irqflag = 13;
3100+ else
3101+ intsrc.mpc_irqflag = 0;
3102+ }
3103+
3104+ intsrc.mpc_srcbusirq = i;
3105+ intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */
3106+ MP_intsrc_info(&intsrc);
3107+ }
3108+
3109+ intsrc.mpc_irqtype = mp_ExtINT;
3110+ intsrc.mpc_srcbusirq = 0;
3111+ intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */
3112+ MP_intsrc_info(&intsrc);
3113+}
3114+
3115+#endif
3116+
3117+static inline void __init construct_default_ISA_mptable(int mpc_default_type)
3118+{
3119+ struct mpc_config_processor processor;
3120+ struct mpc_config_bus bus;
3121+#ifdef CONFIG_X86_IO_APIC
3122+ struct mpc_config_ioapic ioapic;
3123+#endif
3124+ struct mpc_config_lintsrc lintsrc;
3125+ int linttypes[2] = { mp_ExtINT, mp_NMI };
3126+ int i;
3127+
3128+ /*
3129+ * local APIC has default address
3130+ */
3131+ mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
3132+
3133+ /*
3134+ * 2 CPUs, numbered 0 & 1.
3135+ */
3136+ processor.mpc_type = MP_PROCESSOR;
3137+ /* Either an integrated APIC or a discrete 82489DX. */
3138+ processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
3139+ processor.mpc_cpuflag = CPU_ENABLED;
3140+ processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
3141+ (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
3142+ processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
3143+ processor.mpc_reserved[0] = 0;
3144+ processor.mpc_reserved[1] = 0;
3145+ for (i = 0; i < 2; i++) {
3146+ processor.mpc_apicid = i;
3147+ MP_processor_info(&processor);
3148+ }
3149+
3150+ bus.mpc_type = MP_BUS;
3151+ bus.mpc_busid = 0;
3152+ switch (mpc_default_type) {
3153+ default:
3154+ printk(KERN_ERR "???\nUnknown standard configuration %d\n",
3155+ mpc_default_type);
3156+ /* fall through */
3157+ case 1:
3158+ case 5:
3159+ memcpy(bus.mpc_bustype, "ISA ", 6);
3160+ break;
3161+ case 2:
3162+ case 6:
3163+ case 3:
3164+ memcpy(bus.mpc_bustype, "EISA ", 6);
3165+ break;
3166+ case 4:
3167+ case 7:
3168+ memcpy(bus.mpc_bustype, "MCA ", 6);
3169+ }
3170+ MP_bus_info(&bus);
3171+ if (mpc_default_type > 4) {
3172+ bus.mpc_busid = 1;
3173+ memcpy(bus.mpc_bustype, "PCI ", 6);
3174+ MP_bus_info(&bus);
3175+ }
3176+
3177+#ifdef CONFIG_X86_IO_APIC
3178+ ioapic.mpc_type = MP_IOAPIC;
3179+ ioapic.mpc_apicid = 2;
3180+ ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
3181+ ioapic.mpc_flags = MPC_APIC_USABLE;
3182+ ioapic.mpc_apicaddr = 0xFEC00000;
3183+ MP_ioapic_info(&ioapic);
3184+
3185+ /*
3186+ * We set up most of the low 16 IO-APIC pins according to MPS rules.
3187+ */
3188+ construct_default_ioirq_mptable(mpc_default_type);
3189+#endif
3190+ lintsrc.mpc_type = MP_LINTSRC;
3191+ lintsrc.mpc_irqflag = 0; /* conforming */
3192+ lintsrc.mpc_srcbusid = 0;
3193+ lintsrc.mpc_srcbusirq = 0;
3194+ lintsrc.mpc_destapic = MP_APIC_ALL;
3195+ for (i = 0; i < 2; i++) {
3196+ lintsrc.mpc_irqtype = linttypes[i];
3197+ lintsrc.mpc_destapiclint = i;
3198+ MP_lintsrc_info(&lintsrc);
3199+ }
3200+}
3201+
3202+static struct intel_mp_floating *mpf_found;
3203+
3204+/*
3205+ * Scan the memory blocks for an SMP configuration block.
3206+ */
3207+static void __init __get_smp_config(unsigned early)
3208+{
3209+ struct intel_mp_floating *mpf = mpf_found;
3210+
3211+ if (acpi_lapic && early)
3212+ return;
3213+ /*
3214+ * ACPI supports both logical (e.g. Hyper-Threading) and physical
3215+ * processors, where MPS only supports physical.
3216+ */
3217+ if (acpi_lapic && acpi_ioapic) {
3218+ printk(KERN_INFO "Using ACPI (MADT) for SMP configuration "
3219+ "information\n");
3220+ return;
3221+ } else if (acpi_lapic)
3222+ printk(KERN_INFO "Using ACPI for processor (LAPIC) "
3223+ "configuration information\n");
3224+
3225+ printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n",
3226+ mpf->mpf_specification);
3227+#ifdef CONFIG_X86_32
3228+ if (mpf->mpf_feature2 & (1 << 7)) {
3229+ printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
3230+ pic_mode = 1;
3231+ } else {
3232+ printk(KERN_INFO " Virtual Wire compatibility mode.\n");
3233+ pic_mode = 0;
3234+ }
3235+#endif
3236+ /*
3237+ * Now see if we need to read further.
3238+ */
3239+ if (mpf->mpf_feature1 != 0) {
3240+ if (early) {
3241+ /*
3242+ * local APIC has default address
3243+ */
3244+ mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
3245+ return;
3246+ }
3247+
3248+ printk(KERN_INFO "Default MP configuration #%d\n",
3249+ mpf->mpf_feature1);
3250+ construct_default_ISA_mptable(mpf->mpf_feature1);
3251+
3252+ } else if (mpf->mpf_physptr) {
3253+
3254+ /*
3255+ * Read the physical hardware table. Anything here will
3256+ * override the defaults.
3257+ */
3258+ if (!smp_read_mpc(isa_bus_to_virt(mpf->mpf_physptr), early)) {
3259+ smp_found_config = 0;
3260+ printk(KERN_ERR
3261+ "BIOS bug, MP table errors detected!...\n");
3262+ printk(KERN_ERR "... disabling SMP support. "
3263+ "(tell your hw vendor)\n");
3264+ return;
3265+ }
3266+
3267+ if (early)
3268+ return;
3269+#ifdef CONFIG_X86_IO_APIC
3270+ /*
3271+ * If there are no explicit MP IRQ entries, then we are
3272+ * broken. We set up most of the low 16 IO-APIC pins to
3273+ * ISA defaults and hope it will work.
3274+ */
3275+ if (!mp_irq_entries) {
3276+ struct mpc_config_bus bus;
3277+
3278+ printk(KERN_ERR "BIOS bug, no explicit IRQ entries, "
3279+ "using default mptable. "
3280+ "(tell your hw vendor)\n");
3281+
3282+ bus.mpc_type = MP_BUS;
3283+ bus.mpc_busid = 0;
3284+ memcpy(bus.mpc_bustype, "ISA ", 6);
3285+ MP_bus_info(&bus);
3286+
3287+ construct_default_ioirq_mptable(0);
3288+ }
3289+#endif
3290+ } else
3291+ BUG();
3292+
3293+ if (!early)
3294+ printk(KERN_INFO "Processors: %d\n", num_processors);
3295+ /*
3296+ * Only use the first configuration found.
3297+ */
3298+}
3299+
3300+void __init early_get_smp_config(void)
3301+{
3302+ __get_smp_config(1);
3303+}
3304+
3305+void __init get_smp_config(void)
3306+{
3307+ __get_smp_config(0);
3308+}
3309+
3310+static int __init smp_scan_config(unsigned long base, unsigned long length,
3311+ unsigned reserve)
3312+{
3313+ unsigned int *bp = isa_bus_to_virt(base);
3314+ struct intel_mp_floating *mpf;
3315+
3316+ Dprintk("Scan SMP from %p for %ld bytes.\n", bp, length);
3317+ BUILD_BUG_ON(sizeof(*mpf) != 16);
3318+
3319+ while (length > 0) {
3320+ mpf = (struct intel_mp_floating *)bp;
3321+ if ((*bp == SMP_MAGIC_IDENT) &&
3322+ (mpf->mpf_length == 1) &&
3323+ !mpf_checksum((unsigned char *)bp, 16) &&
3324+ ((mpf->mpf_specification == 1)
3325+ || (mpf->mpf_specification == 4))) {
3326+
3327+ smp_found_config = 1;
3328+ mpf_found = mpf;
3329+#ifdef CONFIG_X86_32
3330+#ifndef CONFIG_XEN
3331+ printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
3332+ mpf, virt_to_phys(mpf));
3333+ reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE,
3334+ BOOTMEM_DEFAULT);
3335+ if (mpf->mpf_physptr) {
3336+ /*
3337+ * We cannot access to MPC table to compute
3338+ * table size yet, as only few megabytes from
3339+ * the bottom is mapped now.
3340+ * PC-9800's MPC table places on the very last
3341+ * of physical memory; so that simply reserving
3342+ * PAGE_SIZE from mpg->mpf_physptr yields BUG()
3343+ * in reserve_bootmem.
3344+ */
3345+ unsigned long size = PAGE_SIZE;
3346+ unsigned long end = max_low_pfn * PAGE_SIZE;
3347+ if (mpf->mpf_physptr + size > end)
3348+ size = end - mpf->mpf_physptr;
3349+ reserve_bootmem(mpf->mpf_physptr, size,
3350+ BOOTMEM_DEFAULT);
3351+ }
3352+#else
3353+ printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
3354+ mpf, ((void *)bp - isa_bus_to_virt(base)) + base);
3355+#endif
3356+#elif !defined(CONFIG_XEN)
3357+ if (!reserve)
3358+ return 1;
3359+
3360+ reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE);
3361+ if (mpf->mpf_physptr)
3362+ reserve_bootmem_generic(mpf->mpf_physptr,
3363+ PAGE_SIZE);
3364+#endif
3365+ return 1;
3366+ }
3367+ bp += 4;
3368+ length -= 16;
3369+ }
3370+ return 0;
3371+}
3372+
3373+static void __init __find_smp_config(unsigned reserve)
3374+{
3375+#ifndef CONFIG_XEN
3376+ unsigned int address;
3377+#endif
3378+
3379+ /*
3380+ * FIXME: Linux assumes you have 640K of base ram..
3381+ * this continues the error...
3382+ *
3383+ * 1) Scan the bottom 1K for a signature
3384+ * 2) Scan the top 1K of base RAM
3385+ * 3) Scan the 64K of bios
3386+ */
3387+ if (smp_scan_config(0x0, 0x400, reserve) ||
3388+ smp_scan_config(639 * 0x400, 0x400, reserve) ||
3389+ smp_scan_config(0xF0000, 0x10000, reserve))
3390+ return;
3391+ /*
3392+ * If it is an SMP machine we should know now, unless the
3393+ * configuration is in an EISA/MCA bus machine with an
3394+ * extended bios data area.
3395+ *
3396+ * there is a real-mode segmented pointer pointing to the
3397+ * 4K EBDA area at 0x40E, calculate and scan it here.
3398+ *
3399+ * NOTE! There are Linux loaders that will corrupt the EBDA
3400+ * area, and as such this kind of SMP config may be less
3401+ * trustworthy, simply because the SMP table may have been
3402+ * stomped on during early boot. These loaders are buggy and
3403+ * should be fixed.
3404+ *
3405+ * MP1.4 SPEC states to only scan first 1K of 4K EBDA.
3406+ */
3407+
3408+#ifndef CONFIG_XEN
3409+ address = get_bios_ebda();
3410+ if (address)
3411+ smp_scan_config(address, 0x400, reserve);
3412+#endif
3413+}
3414+
3415+void __init early_find_smp_config(void)
3416+{
3417+ __find_smp_config(0);
3418+}
3419+
3420+void __init find_smp_config(void)
3421+{
3422+ __find_smp_config(1);
3423+}
3424+
3425+/* --------------------------------------------------------------------------
3426+ ACPI-based MP Configuration
3427+ -------------------------------------------------------------------------- */
3428+
3429+/*
3430+ * Keep this outside and initialized to 0, for !CONFIG_ACPI builds:
3431+ */
3432+int es7000_plat;
3433+
3434+#ifdef CONFIG_ACPI
3435+
3436+#ifdef CONFIG_X86_IO_APIC
3437+
3438+#define MP_ISA_BUS 0
3439+
3440+extern struct mp_ioapic_routing mp_ioapic_routing[MAX_IO_APICS];
3441+
3442+static int mp_find_ioapic(int gsi)
3443+{
3444+ int i = 0;
3445+
3446+ /* Find the IOAPIC that manages this GSI. */
3447+ for (i = 0; i < nr_ioapics; i++) {
3448+ if ((gsi >= mp_ioapic_routing[i].gsi_base)
3449+ && (gsi <= mp_ioapic_routing[i].gsi_end))
3450+ return i;
3451+ }
3452+
3453+ printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
3454+ return -1;
3455+}
3456+
3457+static u8 __init uniq_ioapic_id(u8 id)
3458+{
3459+#ifdef CONFIG_X86_32
3460+ if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
3461+ !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
3462+ return io_apic_get_unique_id(nr_ioapics, id);
3463+ else
3464+ return id;
3465+#else
3466+ int i;
3467+ DECLARE_BITMAP(used, 256);
3468+ bitmap_zero(used, 256);
3469+ for (i = 0; i < nr_ioapics; i++) {
3470+ struct mpc_config_ioapic *ia = &mp_ioapics[i];
3471+ __set_bit(ia->mpc_apicid, used);
3472+ }
3473+ if (!test_bit(id, used))
3474+ return id;
3475+ return find_first_zero_bit(used, 256);
3476+#endif
3477+}
3478+
3479+void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
3480+{
3481+ int idx = 0;
3482+
3483+ if (bad_ioapic(address))
3484+ return;
3485+
3486+ idx = nr_ioapics;
3487+
3488+ mp_ioapics[idx].mpc_type = MP_IOAPIC;
3489+ mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
3490+ mp_ioapics[idx].mpc_apicaddr = address;
3491+
3492+#ifndef CONFIG_XEN
3493+ set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
3494+#endif
3495+ mp_ioapics[idx].mpc_apicid = uniq_ioapic_id(id);
3496+#ifdef CONFIG_X86_32
3497+ mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
3498+#else
3499+ mp_ioapics[idx].mpc_apicver = 0;
3500+#endif
3501+ /*
3502+ * Build basic GSI lookup table to facilitate gsi->io_apic lookups
3503+ * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
3504+ */
3505+ mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
3506+ mp_ioapic_routing[idx].gsi_base = gsi_base;
3507+ mp_ioapic_routing[idx].gsi_end = gsi_base +
3508+ io_apic_get_redir_entries(idx);
3509+
3510+ printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
3511+ "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
3512+ mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
3513+ mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end);
3514+
3515+ nr_ioapics++;
3516+}
3517+
3518+void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
3519+{
3520+ struct mpc_config_intsrc intsrc;
3521+ int ioapic = -1;
3522+ int pin = -1;
3523+
3524+ /*
3525+ * Convert 'gsi' to 'ioapic.pin'.
3526+ */
3527+ ioapic = mp_find_ioapic(gsi);
3528+ if (ioapic < 0)
3529+ return;
3530+ pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
3531+
3532+ /*
3533+ * TBD: This check is for faulty timer entries, where the override
3534+ * erroneously sets the trigger to level, resulting in a HUGE
3535+ * increase of timer interrupts!
3536+ */
3537+ if ((bus_irq == 0) && (trigger == 3))
3538+ trigger = 1;
3539+
3540+ intsrc.mpc_type = MP_INTSRC;
3541+ intsrc.mpc_irqtype = mp_INT;
3542+ intsrc.mpc_irqflag = (trigger << 2) | polarity;
3543+ intsrc.mpc_srcbus = MP_ISA_BUS;
3544+ intsrc.mpc_srcbusirq = bus_irq; /* IRQ */
3545+ intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */
3546+ intsrc.mpc_dstirq = pin; /* INTIN# */
3547+
3548+ MP_intsrc_info(&intsrc);
3549+}
3550+
3551+void __init mp_config_acpi_legacy_irqs(void)
3552+{
3553+ struct mpc_config_intsrc intsrc;
3554+ int i = 0;
3555+ int ioapic = -1;
3556+
3557+#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
3558+ /*
3559+ * Fabricate the legacy ISA bus (bus #31).
3560+ */
3561+ mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
3562+#endif
3563+ set_bit(MP_ISA_BUS, mp_bus_not_pci);
3564+ Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
3565+
3566+ /*
3567+ * Older generations of ES7000 have no legacy identity mappings
3568+ */
3569+ if (es7000_plat == 1)
3570+ return;
3571+
3572+ /*
3573+ * Locate the IOAPIC that manages the ISA IRQs (0-15).
3574+ */
3575+ ioapic = mp_find_ioapic(0);
3576+ if (ioapic < 0)
3577+ return;
3578+
3579+ intsrc.mpc_type = MP_INTSRC;
3580+ intsrc.mpc_irqflag = 0; /* Conforming */
3581+ intsrc.mpc_srcbus = MP_ISA_BUS;
3582+#ifdef CONFIG_X86_IO_APIC
3583+ intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
3584+#endif
3585+ /*
3586+ * Use the default configuration for the IRQs 0-15. Unless
3587+ * overridden by (MADT) interrupt source override entries.
3588+ */
3589+ for (i = 0; i < 16; i++) {
3590+ int idx;
3591+
3592+ for (idx = 0; idx < mp_irq_entries; idx++) {
3593+ struct mpc_config_intsrc *irq = mp_irqs + idx;
3594+
3595+ /* Do we already have a mapping for this ISA IRQ? */
3596+ if (irq->mpc_srcbus == MP_ISA_BUS
3597+ && irq->mpc_srcbusirq == i)
3598+ break;
3599+
3600+ /* Do we already have a mapping for this IOAPIC pin */
3601+ if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
3602+ (irq->mpc_dstirq == i))
3603+ break;
3604+ }
3605+
3606+ if (idx != mp_irq_entries) {
3607+ printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
3608+ continue; /* IRQ already used */
3609+ }
3610+
3611+ intsrc.mpc_irqtype = mp_INT;
3612+ intsrc.mpc_srcbusirq = i; /* Identity mapped */
3613+ intsrc.mpc_dstirq = i;
3614+
3615+ MP_intsrc_info(&intsrc);
3616+ }
3617+}
3618+
3619+int mp_register_gsi(u32 gsi, int triggering, int polarity)
3620+{
3621+ int ioapic;
3622+ int ioapic_pin;
3623+#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
3624+#define MAX_GSI_NUM 4096
3625+#define IRQ_COMPRESSION_START 64
3626+
3627+ static int pci_irq = IRQ_COMPRESSION_START;
3628+ /*
3629+ * Mapping between Global System Interrupts, which
3630+ * represent all possible interrupts, and IRQs
3631+ * assigned to actual devices.
3632+ */
3633+ static int gsi_to_irq[MAX_GSI_NUM];
3634+#else
3635+
3636+ if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
3637+ return gsi;
3638+#endif
3639+
3640+ /* Don't set up the ACPI SCI because it's already set up */
3641+ if (acpi_gbl_FADT.sci_interrupt == gsi)
3642+ return gsi;
3643+
3644+ ioapic = mp_find_ioapic(gsi);
3645+ if (ioapic < 0) {
3646+ printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
3647+ return gsi;
3648+ }
3649+
3650+ ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
3651+
3652+#ifndef CONFIG_X86_32
3653+ if (ioapic_renumber_irq)
3654+ gsi = ioapic_renumber_irq(ioapic, gsi);
3655+#endif
3656+
3657+ /*
3658+ * Avoid pin reprogramming. PRTs typically include entries
3659+ * with redundant pin->gsi mappings (but unique PCI devices);
3660+ * we only program the IOAPIC on the first.
3661+ */
3662+ if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
3663+ printk(KERN_ERR "Invalid reference to IOAPIC pin "
3664+ "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
3665+ ioapic_pin);
3666+ return gsi;
3667+ }
3668+ if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) {
3669+ Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
3670+ mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
3671+#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
3672+ return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
3673+#else
3674+ return gsi;
3675+#endif
3676+ }
3677+
3678+ set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed);
3679+#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
3680+ /*
3681+ * For GSI >= 64, use IRQ compression
3682+ */
3683+ if ((gsi >= IRQ_COMPRESSION_START)
3684+ && (triggering == ACPI_LEVEL_SENSITIVE)) {
3685+ /*
3686+ * For PCI devices assign IRQs in order, avoiding gaps
3687+ * due to unused I/O APIC pins.
3688+ */
3689+ int irq = gsi;
3690+ if (gsi < MAX_GSI_NUM) {
3691+ /*
3692+ * Retain the VIA chipset work-around (gsi > 15), but
3693+ * avoid a problem where the 8254 timer (IRQ0) is setup
3694+ * via an override (so it's not on pin 0 of the ioapic),
3695+ * and at the same time, the pin 0 interrupt is a PCI
3696+ * type. The gsi > 15 test could cause these two pins
3697+ * to be shared as IRQ0, and they are not shareable.
3698+ * So test for this condition, and if necessary, avoid
3699+ * the pin collision.
3700+ */
3701+ gsi = pci_irq++;
3702+ /*
3703+ * Don't assign IRQ used by ACPI SCI
3704+ */
3705+ if (gsi == acpi_gbl_FADT.sci_interrupt)
3706+ gsi = pci_irq++;
3707+ gsi_to_irq[irq] = gsi;
3708+ } else {
3709+ printk(KERN_ERR "GSI %u is too high\n", gsi);
3710+ return gsi;
3711+ }
3712+ }
3713+#endif
3714+ io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
3715+ triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
3716+ polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
3717+ return gsi;
3718+}
3719+
3720+#endif /* CONFIG_X86_IO_APIC */
3721+#endif /* CONFIG_ACPI */
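
Illustrative aside, not part of the patch: the unified mp_register_gsi()/mp_register_ioapic() code above resolves a GSI to an (ioapic, pin) pair through the gsi_base/gsi_end ranges kept in mp_ioapic_routing[], and uses a per-pin bitmap so redundant PRT entries do not reprogram a pin twice. The standalone C sketch below reproduces only that lookup-and-claim logic; every demo_* name is invented for the example, and the single 24-entry IOAPIC is an assumption.

#include <stdio.h>

#define DEMO_MAX_IOAPICS 4

/* Minimal stand-in for the mp_ioapic_routing[] table used by the patch. */
struct demo_routing {
	int apic_id;
	int gsi_base;
	int gsi_end;
	unsigned int pin_programmed[4];		/* up to 128 pins, one bit each */
};

static struct demo_routing demo_routing[DEMO_MAX_IOAPICS];
static int demo_nr_ioapics;

/* Linear scan over the registered IOAPICs, mirroring mp_find_ioapic(). */
static int demo_find_ioapic(int gsi)
{
	int i;

	for (i = 0; i < demo_nr_ioapics; i++)
		if (gsi >= demo_routing[i].gsi_base &&
		    gsi <= demo_routing[i].gsi_end)
			return i;
	return -1;
}

/* Returns 1 the first time a pin is claimed, 0 for redundant PRT entries. */
static int demo_claim_pin(int ioapic, int pin)
{
	unsigned int *word = &demo_routing[ioapic].pin_programmed[pin / 32];
	unsigned int bit = 1u << (pin % 32);

	if (*word & bit)
		return 0;	/* pin already programmed, do nothing */
	*word |= bit;
	return 1;
}

int main(void)
{
	int gsi = 19, ioapic, pin, first, again;

	/* Assume a single IOAPIC with 24 redirection entries starting at GSI 0. */
	demo_routing[0].apic_id = 2;
	demo_routing[0].gsi_base = 0;
	demo_routing[0].gsi_end = 23;
	demo_nr_ioapics = 1;

	ioapic = demo_find_ioapic(gsi);
	pin = gsi - demo_routing[ioapic].gsi_base;
	first = demo_claim_pin(ioapic, pin);
	again = demo_claim_pin(ioapic, pin);
	printf("GSI %d -> IOAPIC %d pin %d, first claim %d, repeat claim %d\n",
	       gsi, ioapic, pin, first, again);
	return 0;
}
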
3722--- sle11-2009-05-14.orig/arch/x86/kernel/mpparse_32-xen.c 2009-03-16 16:33:40.000000000 +0100
3723+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
3724@@ -1,1161 +0,0 @@
3725-/*
3726- * Intel Multiprocessor Specification 1.1 and 1.4
3727- * compliant MP-table parsing routines.
3728- *
3729- * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
3730- * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
3731- *
3732- * Fixes
3733- * Erich Boleyn : MP v1.4 and additional changes.
3734- * Alan Cox : Added EBDA scanning
3735- * Ingo Molnar : various cleanups and rewrites
3736- * Maciej W. Rozycki: Bits for default MP configurations
3737- * Paul Diefenbaugh: Added full ACPI support
3738- */
3739-
3740-#include <linux/mm.h>
3741-#include <linux/init.h>
3742-#include <linux/acpi.h>
3743-#include <linux/delay.h>
3744-#include <linux/bootmem.h>
3745-#include <linux/kernel_stat.h>
3746-#include <linux/mc146818rtc.h>
3747-#include <linux/bitops.h>
3748-
3749-#include <asm/smp.h>
3750-#include <asm/acpi.h>
3751-#include <asm/mtrr.h>
3752-#include <asm/mpspec.h>
3753-#include <asm/io_apic.h>
3754-
3755-#include <mach_apic.h>
3756-#include <mach_apicdef.h>
3757-#include <mach_mpparse.h>
3758-#include <bios_ebda.h>
3759-
3760-/* Have we found an MP table */
3761-int smp_found_config;
3762-unsigned int __cpuinitdata maxcpus = NR_CPUS;
3763-
3764-/*
3765- * Various Linux-internal data structures created from the
3766- * MP-table.
3767- */
3768-int apic_version [MAX_APICS];
3769-int mp_bus_id_to_type [MAX_MP_BUSSES];
3770-int mp_bus_id_to_node [MAX_MP_BUSSES];
3771-int mp_bus_id_to_local [MAX_MP_BUSSES];
3772-int quad_local_to_mp_bus_id [NR_CPUS/4][4];
3773-int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
3774-static int mp_current_pci_id;
3775-
3776-/* I/O APIC entries */
3777-struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
3778-
3779-/* # of MP IRQ source entries */
3780-struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
3781-
3782-/* MP IRQ source entries */
3783-int mp_irq_entries;
3784-
3785-int nr_ioapics;
3786-
3787-int pic_mode;
3788-unsigned long mp_lapic_addr;
3789-
3790-unsigned int def_to_bigsmp = 0;
3791-
3792-/* Processor that is doing the boot up */
3793-unsigned int boot_cpu_physical_apicid = -1U;
3794-/* Internal processor count */
3795-unsigned int num_processors;
3796-
3797-/* Bitmask of physically existing CPUs */
3798-physid_mask_t phys_cpu_present_map;
3799-
3800-u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
3801-
3802-/*
3803- * Intel MP BIOS table parsing routines:
3804- */
3805-
3806-
3807-/*
3808- * Checksum an MP configuration block.
3809- */
3810-
3811-static int __init mpf_checksum(unsigned char *mp, int len)
3812-{
3813- int sum = 0;
3814-
3815- while (len--)
3816- sum += *mp++;
3817-
3818- return sum & 0xFF;
3819-}
3820-
3821-/*
3822- * Have to match translation table entries to main table entries by counter
3823- * hence the mpc_record variable .... can't see a less disgusting way of
3824- * doing this ....
3825- */
3826-
3827-static int mpc_record;
3828-static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY] __cpuinitdata;
3829-
3830-#ifndef CONFIG_XEN
3831-static void __cpuinit MP_processor_info (struct mpc_config_processor *m)
3832-{
3833- int ver, apicid;
3834- physid_mask_t phys_cpu;
3835-
3836- if (!(m->mpc_cpuflag & CPU_ENABLED))
3837- return;
3838-
3839- apicid = mpc_apic_id(m, translation_table[mpc_record]);
3840-
3841- if (m->mpc_featureflag&(1<<0))
3842- Dprintk(" Floating point unit present.\n");
3843- if (m->mpc_featureflag&(1<<7))
3844- Dprintk(" Machine Exception supported.\n");
3845- if (m->mpc_featureflag&(1<<8))
3846- Dprintk(" 64 bit compare & exchange supported.\n");
3847- if (m->mpc_featureflag&(1<<9))
3848- Dprintk(" Internal APIC present.\n");
3849- if (m->mpc_featureflag&(1<<11))
3850- Dprintk(" SEP present.\n");
3851- if (m->mpc_featureflag&(1<<12))
3852- Dprintk(" MTRR present.\n");
3853- if (m->mpc_featureflag&(1<<13))
3854- Dprintk(" PGE present.\n");
3855- if (m->mpc_featureflag&(1<<14))
3856- Dprintk(" MCA present.\n");
3857- if (m->mpc_featureflag&(1<<15))
3858- Dprintk(" CMOV present.\n");
3859- if (m->mpc_featureflag&(1<<16))
3860- Dprintk(" PAT present.\n");
3861- if (m->mpc_featureflag&(1<<17))
3862- Dprintk(" PSE present.\n");
3863- if (m->mpc_featureflag&(1<<18))
3864- Dprintk(" PSN present.\n");
3865- if (m->mpc_featureflag&(1<<19))
3866- Dprintk(" Cache Line Flush Instruction present.\n");
3867- /* 20 Reserved */
3868- if (m->mpc_featureflag&(1<<21))
3869- Dprintk(" Debug Trace and EMON Store present.\n");
3870- if (m->mpc_featureflag&(1<<22))
3871- Dprintk(" ACPI Thermal Throttle Registers present.\n");
3872- if (m->mpc_featureflag&(1<<23))
3873- Dprintk(" MMX present.\n");
3874- if (m->mpc_featureflag&(1<<24))
3875- Dprintk(" FXSR present.\n");
3876- if (m->mpc_featureflag&(1<<25))
3877- Dprintk(" XMM present.\n");
3878- if (m->mpc_featureflag&(1<<26))
3879- Dprintk(" Willamette New Instructions present.\n");
3880- if (m->mpc_featureflag&(1<<27))
3881- Dprintk(" Self Snoop present.\n");
3882- if (m->mpc_featureflag&(1<<28))
3883- Dprintk(" HT present.\n");
3884- if (m->mpc_featureflag&(1<<29))
3885- Dprintk(" Thermal Monitor present.\n");
3886- /* 30, 31 Reserved */
3887-
3888-
3889- if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
3890- Dprintk(" Bootup CPU\n");
3891- boot_cpu_physical_apicid = m->mpc_apicid;
3892- }
3893-
3894- ver = m->mpc_apicver;
3895-
3896- /*
3897- * Validate version
3898- */
3899- if (ver == 0x0) {
3900- printk(KERN_WARNING "BIOS bug, APIC version is 0 for CPU#%d! "
3901- "fixing up to 0x10. (tell your hw vendor)\n",
3902- m->mpc_apicid);
3903- ver = 0x10;
3904- }
3905- apic_version[m->mpc_apicid] = ver;
3906-
3907- phys_cpu = apicid_to_cpu_present(apicid);
3908- physids_or(phys_cpu_present_map, phys_cpu_present_map, phys_cpu);
3909-
3910- if (num_processors >= NR_CPUS) {
3911- printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
3912- " Processor ignored.\n", NR_CPUS);
3913- return;
3914- }
3915-
3916- if (num_processors >= maxcpus) {
3917- printk(KERN_WARNING "WARNING: maxcpus limit of %i reached."
3918- " Processor ignored.\n", maxcpus);
3919- return;
3920- }
3921-
3922- cpu_set(num_processors, cpu_possible_map);
3923- num_processors++;
3924-
3925- /*
3926- * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y
3927- * but we need to work other dependencies like SMP_SUSPEND etc
3928- * before this can be done without some confusion.
3929- * if (CPU_HOTPLUG_ENABLED || num_processors > 8)
3930- * - Ashok Raj <ashok.raj@intel.com>
3931- */
3932- if (num_processors > 8) {
3933- switch (boot_cpu_data.x86_vendor) {
3934- case X86_VENDOR_INTEL:
3935- if (!APIC_XAPIC(ver)) {
3936- def_to_bigsmp = 0;
3937- break;
3938- }
3939- /* If P4 and above fall through */
3940- case X86_VENDOR_AMD:
3941- def_to_bigsmp = 1;
3942- }
3943- }
3944- bios_cpu_apicid[num_processors - 1] = m->mpc_apicid;
3945-}
3946-#else
3947-static void __cpuinit MP_processor_info (struct mpc_config_processor *m)
3948-{
3949- num_processors++;
3950-}
3951-#endif /* CONFIG_XEN */
3952-
3953-static void __init MP_bus_info (struct mpc_config_bus *m)
3954-{
3955- char str[7];
3956-
3957- memcpy(str, m->mpc_bustype, 6);
3958- str[6] = 0;
3959-
3960- mpc_oem_bus_info(m, str, translation_table[mpc_record]);
3961-
3962-#if MAX_MP_BUSSES < 256
3963- if (m->mpc_busid >= MAX_MP_BUSSES) {
3964- printk(KERN_WARNING "MP table busid value (%d) for bustype %s "
3965- " is too large, max. supported is %d\n",
3966- m->mpc_busid, str, MAX_MP_BUSSES - 1);
3967- return;
3968- }
3969-#endif
3970-
3971- if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA)-1) == 0) {
3972- mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
3973- } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA)-1) == 0) {
3974- mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA;
3975- } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI)-1) == 0) {
3976- mpc_oem_pci_bus(m, translation_table[mpc_record]);
3977- mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
3978- mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
3979- mp_current_pci_id++;
3980- } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA)-1) == 0) {
3981- mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA;
3982- } else {
3983- printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
3984- }
3985-}
3986-
3987-static void __init MP_ioapic_info (struct mpc_config_ioapic *m)
3988-{
3989- if (!(m->mpc_flags & MPC_APIC_USABLE))
3990- return;
3991-
3992- printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n",
3993- m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
3994- if (nr_ioapics >= MAX_IO_APICS) {
3995- printk(KERN_CRIT "Max # of I/O APICs (%d) exceeded (found %d).\n",
3996- MAX_IO_APICS, nr_ioapics);
3997- panic("Recompile kernel with bigger MAX_IO_APICS!.\n");
3998- }
3999- if (!m->mpc_apicaddr) {
4000- printk(KERN_ERR "WARNING: bogus zero I/O APIC address"
4001- " found in MP table, skipping!\n");
4002- return;
4003- }
4004- mp_ioapics[nr_ioapics] = *m;
4005- nr_ioapics++;
4006-}
4007-
4008-static void __init MP_intsrc_info (struct mpc_config_intsrc *m)
4009-{
4010- mp_irqs [mp_irq_entries] = *m;
4011- Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
4012- " IRQ %02x, APIC ID %x, APIC INT %02x\n",
4013- m->mpc_irqtype, m->mpc_irqflag & 3,
4014- (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
4015- m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
4016- if (++mp_irq_entries == MAX_IRQ_SOURCES)
4017- panic("Max # of irq sources exceeded!!\n");
4018-}
4019-
4020-static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m)
4021-{
4022- Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
4023- " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
4024- m->mpc_irqtype, m->mpc_irqflag & 3,
4025- (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid,
4026- m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
4027-}
4028-
4029-#ifdef CONFIG_X86_NUMAQ
4030-static void __init MP_translation_info (struct mpc_config_translation *m)
4031-{
4032- printk(KERN_INFO "Translation: record %d, type %d, quad %d, global %d, local %d\n", mpc_record, m->trans_type, m->trans_quad, m->trans_global, m->trans_local);
4033-
4034- if (mpc_record >= MAX_MPC_ENTRY)
4035- printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
4036- else
4037- translation_table[mpc_record] = m; /* stash this for later */
4038- if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
4039- node_set_online(m->trans_quad);
4040-}
4041-
4042-/*
4043- * Read/parse the MPC oem tables
4044- */
4045-
4046-static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable, \
4047- unsigned short oemsize)
4048-{
4049- int count = sizeof (*oemtable); /* the header size */
4050- unsigned char *oemptr = ((unsigned char *)oemtable)+count;
4051-
4052- mpc_record = 0;
4053- printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n", oemtable);
4054- if (memcmp(oemtable->oem_signature,MPC_OEM_SIGNATURE,4))
4055- {
4056- printk(KERN_WARNING "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
4057- oemtable->oem_signature[0],
4058- oemtable->oem_signature[1],
4059- oemtable->oem_signature[2],
4060- oemtable->oem_signature[3]);
4061- return;
4062- }
4063- if (mpf_checksum((unsigned char *)oemtable,oemtable->oem_length))
4064- {
4065- printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
4066- return;
4067- }
4068- while (count < oemtable->oem_length) {
4069- switch (*oemptr) {
4070- case MP_TRANSLATION:
4071- {
4072- struct mpc_config_translation *m=
4073- (struct mpc_config_translation *)oemptr;
4074- MP_translation_info(m);
4075- oemptr += sizeof(*m);
4076- count += sizeof(*m);
4077- ++mpc_record;
4078- break;
4079- }
4080- default:
4081- {
4082- printk(KERN_WARNING "Unrecognised OEM table entry type! - %d\n", (int) *oemptr);
4083- return;
4084- }
4085- }
4086- }
4087-}
4088-
4089-static inline void mps_oem_check(struct mp_config_table *mpc, char *oem,
4090- char *productid)
4091-{
4092- if (strncmp(oem, "IBM NUMA", 8))
4093- printk("Warning! May not be a NUMA-Q system!\n");
4094- if (mpc->mpc_oemptr)
4095- smp_read_mpc_oem((struct mp_config_oemtable *) mpc->mpc_oemptr,
4096- mpc->mpc_oemsize);
4097-}
4098-#endif /* CONFIG_X86_NUMAQ */
4099-
4100-/*
4101- * Read/parse the MPC
4102- */
4103-
4104-static int __init smp_read_mpc(struct mp_config_table *mpc)
4105-{
4106- char str[16];
4107- char oem[10];
4108- int count=sizeof(*mpc);
4109- unsigned char *mpt=((unsigned char *)mpc)+count;
4110-
4111- if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) {
4112- printk(KERN_ERR "SMP mptable: bad signature [0x%x]!\n",
4113- *(u32 *)mpc->mpc_signature);
4114- return 0;
4115- }
4116- if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) {
4117- printk(KERN_ERR "SMP mptable: checksum error!\n");
4118- return 0;
4119- }
4120- if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) {
4121- printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n",
4122- mpc->mpc_spec);
4123- return 0;
4124- }
4125- if (!mpc->mpc_lapic) {
4126- printk(KERN_ERR "SMP mptable: null local APIC address!\n");
4127- return 0;
4128- }
4129- memcpy(oem,mpc->mpc_oem,8);
4130- oem[8]=0;
4131- printk(KERN_INFO "OEM ID: %s ",oem);
4132-
4133- memcpy(str,mpc->mpc_productid,12);
4134- str[12]=0;
4135- printk("Product ID: %s ",str);
4136-
4137- mps_oem_check(mpc, oem, str);
4138-
4139- printk("APIC at: 0x%X\n", mpc->mpc_lapic);
4140-
4141- /*
4142- * Save the local APIC address (it might be non-default) -- but only
4143- * if we're not using ACPI.
4144- */
4145- if (!acpi_lapic)
4146- mp_lapic_addr = mpc->mpc_lapic;
4147-
4148- /*
4149- * Now process the configuration blocks.
4150- */
4151- mpc_record = 0;
4152- while (count < mpc->mpc_length) {
4153- switch(*mpt) {
4154- case MP_PROCESSOR:
4155- {
4156- struct mpc_config_processor *m=
4157- (struct mpc_config_processor *)mpt;
4158- /* ACPI may have already provided this data */
4159- if (!acpi_lapic)
4160- MP_processor_info(m);
4161- mpt += sizeof(*m);
4162- count += sizeof(*m);
4163- break;
4164- }
4165- case MP_BUS:
4166- {
4167- struct mpc_config_bus *m=
4168- (struct mpc_config_bus *)mpt;
4169- MP_bus_info(m);
4170- mpt += sizeof(*m);
4171- count += sizeof(*m);
4172- break;
4173- }
4174- case MP_IOAPIC:
4175- {
4176- struct mpc_config_ioapic *m=
4177- (struct mpc_config_ioapic *)mpt;
4178- MP_ioapic_info(m);
4179- mpt+=sizeof(*m);
4180- count+=sizeof(*m);
4181- break;
4182- }
4183- case MP_INTSRC:
4184- {
4185- struct mpc_config_intsrc *m=
4186- (struct mpc_config_intsrc *)mpt;
4187-
4188- MP_intsrc_info(m);
4189- mpt+=sizeof(*m);
4190- count+=sizeof(*m);
4191- break;
4192- }
4193- case MP_LINTSRC:
4194- {
4195- struct mpc_config_lintsrc *m=
4196- (struct mpc_config_lintsrc *)mpt;
4197- MP_lintsrc_info(m);
4198- mpt+=sizeof(*m);
4199- count+=sizeof(*m);
4200- break;
4201- }
4202- default:
4203- {
4204- count = mpc->mpc_length;
4205- break;
4206- }
4207- }
4208- ++mpc_record;
4209- }
4210- setup_apic_routing();
4211- if (!num_processors)
4212- printk(KERN_ERR "SMP mptable: no processors registered!\n");
4213- return num_processors;
4214-}
4215-
4216-static int __init ELCR_trigger(unsigned int irq)
4217-{
4218- unsigned int port;
4219-
4220- port = 0x4d0 + (irq >> 3);
4221- return (inb(port) >> (irq & 7)) & 1;
4222-}
4223-
4224-static void __init construct_default_ioirq_mptable(int mpc_default_type)
4225-{
4226- struct mpc_config_intsrc intsrc;
4227- int i;
4228- int ELCR_fallback = 0;
4229-
4230- intsrc.mpc_type = MP_INTSRC;
4231- intsrc.mpc_irqflag = 0; /* conforming */
4232- intsrc.mpc_srcbus = 0;
4233- intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
4234-
4235- intsrc.mpc_irqtype = mp_INT;
4236-
4237- /*
4238- * If true, we have an ISA/PCI system with no IRQ entries
4239- * in the MP table. To prevent the PCI interrupts from being set up
4240- * incorrectly, we try to use the ELCR. The sanity check to see if
4241- * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can
4242- * never be level sensitive, so we simply see if the ELCR agrees.
4243- * If it does, we assume it's valid.
4244- */
4245- if (mpc_default_type == 5) {
4246- printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n");
4247-
4248- if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13))
4249- printk(KERN_WARNING "ELCR contains invalid data... not using ELCR\n");
4250- else {
4251- printk(KERN_INFO "Using ELCR to identify PCI interrupts\n");
4252- ELCR_fallback = 1;
4253- }
4254- }
4255-
4256- for (i = 0; i < 16; i++) {
4257- switch (mpc_default_type) {
4258- case 2:
4259- if (i == 0 || i == 13)
4260- continue; /* IRQ0 & IRQ13 not connected */
4261- /* fall through */
4262- default:
4263- if (i == 2)
4264- continue; /* IRQ2 is never connected */
4265- }
4266-
4267- if (ELCR_fallback) {
4268- /*
4269- * If the ELCR indicates a level-sensitive interrupt, we
4270- * copy that information over to the MP table in the
4271- * irqflag field (level sensitive, active high polarity).
4272- */
4273- if (ELCR_trigger(i))
4274- intsrc.mpc_irqflag = 13;
4275- else
4276- intsrc.mpc_irqflag = 0;
4277- }
4278-
4279- intsrc.mpc_srcbusirq = i;
4280- intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */
4281- MP_intsrc_info(&intsrc);
4282- }
4283-
4284- intsrc.mpc_irqtype = mp_ExtINT;
4285- intsrc.mpc_srcbusirq = 0;
4286- intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */
4287- MP_intsrc_info(&intsrc);
4288-}
4289-
4290-static inline void __init construct_default_ISA_mptable(int mpc_default_type)
4291-{
4292- struct mpc_config_processor processor;
4293- struct mpc_config_bus bus;
4294- struct mpc_config_ioapic ioapic;
4295- struct mpc_config_lintsrc lintsrc;
4296- int linttypes[2] = { mp_ExtINT, mp_NMI };
4297- int i;
4298-
4299- /*
4300- * local APIC has default address
4301- */
4302- mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
4303-
4304- /*
4305- * 2 CPUs, numbered 0 & 1.
4306- */
4307- processor.mpc_type = MP_PROCESSOR;
4308- /* Either an integrated APIC or a discrete 82489DX. */
4309- processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
4310- processor.mpc_cpuflag = CPU_ENABLED;
4311- processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
4312- (boot_cpu_data.x86_model << 4) |
4313- boot_cpu_data.x86_mask;
4314- processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
4315- processor.mpc_reserved[0] = 0;
4316- processor.mpc_reserved[1] = 0;
4317- for (i = 0; i < 2; i++) {
4318- processor.mpc_apicid = i;
4319- MP_processor_info(&processor);
4320- }
4321-
4322- bus.mpc_type = MP_BUS;
4323- bus.mpc_busid = 0;
4324- switch (mpc_default_type) {
4325- default:
4326- printk("???\n");
4327- printk(KERN_ERR "Unknown standard configuration %d\n",
4328- mpc_default_type);
4329- /* fall through */
4330- case 1:
4331- case 5:
4332- memcpy(bus.mpc_bustype, "ISA ", 6);
4333- break;
4334- case 2:
4335- case 6:
4336- case 3:
4337- memcpy(bus.mpc_bustype, "EISA ", 6);
4338- break;
4339- case 4:
4340- case 7:
4341- memcpy(bus.mpc_bustype, "MCA ", 6);
4342- }
4343- MP_bus_info(&bus);
4344- if (mpc_default_type > 4) {
4345- bus.mpc_busid = 1;
4346- memcpy(bus.mpc_bustype, "PCI ", 6);
4347- MP_bus_info(&bus);
4348- }
4349-
4350- ioapic.mpc_type = MP_IOAPIC;
4351- ioapic.mpc_apicid = 2;
4352- ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
4353- ioapic.mpc_flags = MPC_APIC_USABLE;
4354- ioapic.mpc_apicaddr = 0xFEC00000;
4355- MP_ioapic_info(&ioapic);
4356-
4357- /*
4358- * We set up most of the low 16 IO-APIC pins according to MPS rules.
4359- */
4360- construct_default_ioirq_mptable(mpc_default_type);
4361-
4362- lintsrc.mpc_type = MP_LINTSRC;
4363- lintsrc.mpc_irqflag = 0; /* conforming */
4364- lintsrc.mpc_srcbusid = 0;
4365- lintsrc.mpc_srcbusirq = 0;
4366- lintsrc.mpc_destapic = MP_APIC_ALL;
4367- for (i = 0; i < 2; i++) {
4368- lintsrc.mpc_irqtype = linttypes[i];
4369- lintsrc.mpc_destapiclint = i;
4370- MP_lintsrc_info(&lintsrc);
4371- }
4372-}
4373-
4374-static struct intel_mp_floating *mpf_found;
4375-
4376-/*
4377- * Scan the memory blocks for an SMP configuration block.
4378- */
4379-void __init get_smp_config (void)
4380-{
4381- struct intel_mp_floating *mpf = mpf_found;
4382-
4383- /*
4384- * ACPI supports both logical (e.g. Hyper-Threading) and physical
4385- * processors, where MPS only supports physical.
4386- */
4387- if (acpi_lapic && acpi_ioapic) {
4388- printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n");
4389- return;
4390- }
4391- else if (acpi_lapic)
4392- printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n");
4393-
4394- printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification);
4395- if (mpf->mpf_feature2 & (1<<7)) {
4396- printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
4397- pic_mode = 1;
4398- } else {
4399- printk(KERN_INFO " Virtual Wire compatibility mode.\n");
4400- pic_mode = 0;
4401- }
4402-
4403- /*
4404- * Now see if we need to read further.
4405- */
4406- if (mpf->mpf_feature1 != 0) {
4407-
4408- printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1);
4409- construct_default_ISA_mptable(mpf->mpf_feature1);
4410-
4411- } else if (mpf->mpf_physptr) {
4412-
4413- /*
4414- * Read the physical hardware table. Anything here will
4415- * override the defaults.
4416- */
4417- if (!smp_read_mpc(isa_bus_to_virt(mpf->mpf_physptr))) {
4418- smp_found_config = 0;
4419- printk(KERN_ERR "BIOS bug, MP table errors detected!...\n");
4420- printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n");
4421- return;
4422- }
4423- /*
4424- * If there are no explicit MP IRQ entries, then we are
4425- * broken. We set up most of the low 16 IO-APIC pins to
4426- * ISA defaults and hope it will work.
4427- */
4428- if (!mp_irq_entries) {
4429- struct mpc_config_bus bus;
4430-
4431- printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n");
4432-
4433- bus.mpc_type = MP_BUS;
4434- bus.mpc_busid = 0;
4435- memcpy(bus.mpc_bustype, "ISA ", 6);
4436- MP_bus_info(&bus);
4437-
4438- construct_default_ioirq_mptable(0);
4439- }
4440-
4441- } else
4442- BUG();
4443-
4444- printk(KERN_INFO "Processors: %d\n", num_processors);
4445- /*
4446- * Only use the first configuration found.
4447- */
4448-}
4449-
4450-static int __init smp_scan_config (unsigned long base, unsigned long length)
4451-{
4452- unsigned long *bp = isa_bus_to_virt(base);
4453- struct intel_mp_floating *mpf;
4454-
4455- printk(KERN_INFO "Scan SMP from %p for %ld bytes.\n", bp,length);
4456- if (sizeof(*mpf) != 16)
4457- printk("Error: MPF size\n");
4458-
4459- while (length > 0) {
4460- mpf = (struct intel_mp_floating *)bp;
4461- if ((*bp == SMP_MAGIC_IDENT) &&
4462- (mpf->mpf_length == 1) &&
4463- !mpf_checksum((unsigned char *)bp, 16) &&
4464- ((mpf->mpf_specification == 1)
4465- || (mpf->mpf_specification == 4)) ) {
4466-
4467- smp_found_config = 1;
4468-#ifndef CONFIG_XEN
4469- printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
4470- mpf, virt_to_phys(mpf));
4471- reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE,
4472- BOOTMEM_DEFAULT);
4473- if (mpf->mpf_physptr) {
4474- /*
4475-			 * We cannot access the MPC table to compute its
4476-			 * size yet, as only a few megabytes from the
4477-			 * bottom are mapped at this point.
4478-			 * The PC-9800's MPC table sits at the very end
4479-			 * of physical memory, so simply reserving
4480-			 * PAGE_SIZE from mpf->mpf_physptr would trigger
4481-			 * BUG() in reserve_bootmem.
4482-			 */
4483- unsigned long size = PAGE_SIZE;
4484- unsigned long end = max_low_pfn * PAGE_SIZE;
4485- if (mpf->mpf_physptr + size > end)
4486- size = end - mpf->mpf_physptr;
4487- reserve_bootmem(mpf->mpf_physptr, size,
4488- BOOTMEM_DEFAULT);
4489- }
4490-#else
4491- printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
4492- mpf, ((void *)bp - isa_bus_to_virt(base)) + base);
4493-#endif
4494-
4495- mpf_found = mpf;
4496- return 1;
4497- }
4498- bp += 4;
4499- length -= 16;
4500- }
4501- return 0;
4502-}
4503-
4504-void __init find_smp_config (void)
4505-{
4506-#ifndef CONFIG_XEN
4507- unsigned int address;
4508-#endif
4509-
4510- /*
4511- * FIXME: Linux assumes you have 640K of base ram..
4512- * this continues the error...
4513- *
4514- * 1) Scan the bottom 1K for a signature
4515- * 2) Scan the top 1K of base RAM
4516- * 3) Scan the 64K of bios
4517- */
4518- if (smp_scan_config(0x0,0x400) ||
4519- smp_scan_config(639*0x400,0x400) ||
4520- smp_scan_config(0xF0000,0x10000))
4521- return;
4522- /*
4523- * If it is an SMP machine we should know now, unless the
4524- * configuration is in an EISA/MCA bus machine with an
4525- * extended bios data area.
4526- *
4527- * there is a real-mode segmented pointer pointing to the
4528- * 4K EBDA area at 0x40E, calculate and scan it here.
4529- *
4530- * NOTE! There are Linux loaders that will corrupt the EBDA
4531- * area, and as such this kind of SMP config may be less
4532- * trustworthy, simply because the SMP table may have been
4533- * stomped on during early boot. These loaders are buggy and
4534- * should be fixed.
4535- *
4536- * MP1.4 SPEC states to only scan first 1K of 4K EBDA.
4537- */
4538-
4539-#ifndef CONFIG_XEN
4540- address = get_bios_ebda();
4541- if (address)
4542- smp_scan_config(address, 0x400);
4543-#endif
4544-}
4545-
4546-int es7000_plat;
4547-
4548-/* --------------------------------------------------------------------------
4549- ACPI-based MP Configuration
4550- -------------------------------------------------------------------------- */
4551-
4552-#ifdef CONFIG_ACPI
4553-
4554-void __init mp_register_lapic_address(u64 address)
4555-{
4556-#ifndef CONFIG_XEN
4557- mp_lapic_addr = (unsigned long) address;
4558-
4559- set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
4560-
4561- if (boot_cpu_physical_apicid == -1U)
4562- boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
4563-
4564- Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid);
4565-#endif
4566-}
4567-
4568-void __cpuinit mp_register_lapic (u8 id, u8 enabled)
4569-{
4570- struct mpc_config_processor processor;
4571- int boot_cpu = 0;
4572-
4573- if (MAX_APICS - id <= 0) {
4574- printk(KERN_WARNING "Processor #%d invalid (max %d)\n",
4575- id, MAX_APICS);
4576- return;
4577- }
4578-
4579- if (id == boot_cpu_physical_apicid)
4580- boot_cpu = 1;
4581-
4582-#ifndef CONFIG_XEN
4583- processor.mpc_type = MP_PROCESSOR;
4584- processor.mpc_apicid = id;
4585- processor.mpc_apicver = GET_APIC_VERSION(apic_read(APIC_LVR));
4586- processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0);
4587- processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0);
4588- processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
4589- (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
4590- processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
4591- processor.mpc_reserved[0] = 0;
4592- processor.mpc_reserved[1] = 0;
4593-#endif
4594-
4595- MP_processor_info(&processor);
4596-}
4597-
4598-#ifdef CONFIG_X86_IO_APIC
4599-
4600-#define MP_ISA_BUS 0
4601-#define MP_MAX_IOAPIC_PIN 127
4602-
4603-static struct mp_ioapic_routing {
4604- int apic_id;
4605- int gsi_base;
4606- int gsi_end;
4607- u32 pin_programmed[4];
4608-} mp_ioapic_routing[MAX_IO_APICS];
4609-
4610-static int mp_find_ioapic (int gsi)
4611-{
4612- int i = 0;
4613-
4614- /* Find the IOAPIC that manages this GSI. */
4615- for (i = 0; i < nr_ioapics; i++) {
4616- if ((gsi >= mp_ioapic_routing[i].gsi_base)
4617- && (gsi <= mp_ioapic_routing[i].gsi_end))
4618- return i;
4619- }
4620-
4621- printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
4622-
4623- return -1;
4624-}
4625-
4626-void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base)
4627-{
4628- int idx = 0;
4629- int tmpid;
4630-
4631- if (nr_ioapics >= MAX_IO_APICS) {
4632- printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
4633- "(found %d)\n", MAX_IO_APICS, nr_ioapics);
4634- panic("Recompile kernel with bigger MAX_IO_APICS!\n");
4635- }
4636- if (!address) {
4637- printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
4638- " found in MADT table, skipping!\n");
4639- return;
4640- }
4641-
4642- idx = nr_ioapics++;
4643-
4644- mp_ioapics[idx].mpc_type = MP_IOAPIC;
4645- mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
4646- mp_ioapics[idx].mpc_apicaddr = address;
4647-
4648-#ifndef CONFIG_XEN
4649- set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
4650-#endif
4651- if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
4652- && !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
4653- tmpid = io_apic_get_unique_id(idx, id);
4654- else
4655- tmpid = id;
4656- if (tmpid == -1) {
4657- nr_ioapics--;
4658- return;
4659- }
4660- mp_ioapics[idx].mpc_apicid = tmpid;
4661- mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
4662-
4663- /*
4664- * Build basic GSI lookup table to facilitate gsi->io_apic lookups
4665- * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
4666- */
4667- mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
4668- mp_ioapic_routing[idx].gsi_base = gsi_base;
4669- mp_ioapic_routing[idx].gsi_end = gsi_base +
4670- io_apic_get_redir_entries(idx);
4671-
4672- printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
4673- "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
4674- mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
4675- mp_ioapic_routing[idx].gsi_base,
4676- mp_ioapic_routing[idx].gsi_end);
4677-}
4678-
4679-void __init
4680-mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
4681-{
4682- struct mpc_config_intsrc intsrc;
4683- int ioapic = -1;
4684- int pin = -1;
4685-
4686- /*
4687- * Convert 'gsi' to 'ioapic.pin'.
4688- */
4689- ioapic = mp_find_ioapic(gsi);
4690- if (ioapic < 0)
4691- return;
4692- pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
4693-
4694- /*
4695- * TBD: This check is for faulty timer entries, where the override
4696- * erroneously sets the trigger to level, resulting in a HUGE
4697- * increase of timer interrupts!
4698- */
4699- if ((bus_irq == 0) && (trigger == 3))
4700- trigger = 1;
4701-
4702- intsrc.mpc_type = MP_INTSRC;
4703- intsrc.mpc_irqtype = mp_INT;
4704- intsrc.mpc_irqflag = (trigger << 2) | polarity;
4705- intsrc.mpc_srcbus = MP_ISA_BUS;
4706- intsrc.mpc_srcbusirq = bus_irq; /* IRQ */
4707- intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */
4708- intsrc.mpc_dstirq = pin; /* INTIN# */
4709-
4710- Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n",
4711- intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
4712- (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
4713- intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq);
4714-
4715- mp_irqs[mp_irq_entries] = intsrc;
4716- if (++mp_irq_entries == MAX_IRQ_SOURCES)
4717- panic("Max # of irq sources exceeded!\n");
4718-}
4719-
4720-void __init mp_config_acpi_legacy_irqs (void)
4721-{
4722- struct mpc_config_intsrc intsrc;
4723- int i = 0;
4724- int ioapic = -1;
4725-
4726- /*
4727- * Fabricate the legacy ISA bus (bus #31).
4728- */
4729- mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
4730- Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
4731-
4732- /*
4733- * Older generations of ES7000 have no legacy identity mappings
4734- */
4735- if (es7000_plat == 1)
4736- return;
4737-
4738- /*
4739- * Locate the IOAPIC that manages the ISA IRQs (0-15).
4740- */
4741- ioapic = mp_find_ioapic(0);
4742- if (ioapic < 0)
4743- return;
4744-
4745- intsrc.mpc_type = MP_INTSRC;
4746- intsrc.mpc_irqflag = 0; /* Conforming */
4747- intsrc.mpc_srcbus = MP_ISA_BUS;
4748- intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
4749-
4750- /*
4751-	 * Use the default configuration for IRQs 0-15, unless
4752-	 * overridden by (MADT) interrupt source override entries.
4753- */
4754- for (i = 0; i < 16; i++) {
4755- int idx;
4756-
4757- for (idx = 0; idx < mp_irq_entries; idx++) {
4758- struct mpc_config_intsrc *irq = mp_irqs + idx;
4759-
4760- /* Do we already have a mapping for this ISA IRQ? */
4761- if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i)
4762- break;
4763-
4764- /* Do we already have a mapping for this IOAPIC pin */
4765- if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
4766- (irq->mpc_dstirq == i))
4767- break;
4768- }
4769-
4770- if (idx != mp_irq_entries) {
4771- printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
4772- continue; /* IRQ already used */
4773- }
4774-
4775- intsrc.mpc_irqtype = mp_INT;
4776- intsrc.mpc_srcbusirq = i; /* Identity mapped */
4777- intsrc.mpc_dstirq = i;
4778-
4779- Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, "
4780- "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
4781- (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
4782- intsrc.mpc_srcbusirq, intsrc.mpc_dstapic,
4783- intsrc.mpc_dstirq);
4784-
4785- mp_irqs[mp_irq_entries] = intsrc;
4786- if (++mp_irq_entries == MAX_IRQ_SOURCES)
4787- panic("Max # of irq sources exceeded!\n");
4788- }
4789-}
4790-
4791-#define MAX_GSI_NUM 4096
4792-#define IRQ_COMPRESSION_START 64
4793-
4794-int mp_register_gsi(u32 gsi, int triggering, int polarity)
4795-{
4796- int ioapic = -1;
4797- int ioapic_pin = 0;
4798- int idx, bit = 0;
4799- static int pci_irq = IRQ_COMPRESSION_START;
4800- /*
4801- * Mapping between Global System Interrupts, which
4802- * represent all possible interrupts, and IRQs
4803- * assigned to actual devices.
4804- */
4805- static int gsi_to_irq[MAX_GSI_NUM];
4806-
4807- /* Don't set up the ACPI SCI because it's already set up */
4808- if (acpi_gbl_FADT.sci_interrupt == gsi)
4809- return gsi;
4810-
4811- ioapic = mp_find_ioapic(gsi);
4812- if (ioapic < 0) {
4813- printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
4814- return gsi;
4815- }
4816-
4817- ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
4818-
4819- if (ioapic_renumber_irq)
4820- gsi = ioapic_renumber_irq(ioapic, gsi);
4821-
4822- /*
4823- * Avoid pin reprogramming. PRTs typically include entries
4824- * with redundant pin->gsi mappings (but unique PCI devices);
4825- * we only program the IOAPIC on the first.
4826- */
4827- bit = ioapic_pin % 32;
4828- idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32);
4829- if (idx > 3) {
4830- printk(KERN_ERR "Invalid reference to IOAPIC pin "
4831- "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
4832- ioapic_pin);
4833- return gsi;
4834- }
4835- if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
4836- Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
4837- mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
4838- return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
4839- }
4840-
4841- mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
4842-
4843- /*
4844- * For GSI >= 64, use IRQ compression
4845- */
4846- if ((gsi >= IRQ_COMPRESSION_START)
4847- && (triggering == ACPI_LEVEL_SENSITIVE)) {
4848- /*
4849- * For PCI devices assign IRQs in order, avoiding gaps
4850- * due to unused I/O APIC pins.
4851- */
4852- int irq = gsi;
4853- if (gsi < MAX_GSI_NUM) {
4854- /*
4855- * Retain the VIA chipset work-around (gsi > 15), but
4856- * avoid a problem where the 8254 timer (IRQ0) is setup
4857- * via an override (so it's not on pin 0 of the ioapic),
4858- * and at the same time, the pin 0 interrupt is a PCI
4859- * type. The gsi > 15 test could cause these two pins
4860- * to be shared as IRQ0, and they are not shareable.
4861- * So test for this condition, and if necessary, avoid
4862- * the pin collision.
4863- */
4864- if (gsi > 15 || (gsi == 0 && !timer_uses_ioapic_pin_0))
4865- gsi = pci_irq++;
4866- /*
4867- * Don't assign IRQ used by ACPI SCI
4868- */
4869- if (gsi == acpi_gbl_FADT.sci_interrupt)
4870- gsi = pci_irq++;
4871- gsi_to_irq[irq] = gsi;
4872- } else {
4873- printk(KERN_ERR "GSI %u is too high\n", gsi);
4874- return gsi;
4875- }
4876- }
4877-
4878- io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
4879- triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
4880- polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
4881- return gsi;
4882-}
4883-
4884-#endif /* CONFIG_X86_IO_APIC */
4885-#endif /* CONFIG_ACPI */
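
Illustrative aside, not part of the patch: both the removed mpparse_32-xen.c above and the X86_32/!XEN branch of the new mp_register_gsi() compress level-triggered GSIs at or above 64 into consecutive IRQ numbers, remember the result in gsi_to_irq[], and avoid handing out the ACPI SCI's slot. The standalone C sketch below reproduces only that remapping; the demo_* names and sample GSI values are invented, and the SCI value of 65 is an assumption chosen so the skip is visible.

#include <stdio.h>

#define DEMO_MAX_GSI		4096
#define DEMO_COMPRESSION_START	64
#define DEMO_SCI		65	/* assumed SCI slot, only to show the skip */

static int demo_gsi_to_irq[DEMO_MAX_GSI];
static int demo_next_irq = DEMO_COMPRESSION_START;

/* Map one level-triggered GSI to a compressed IRQ, caching the result. */
static int demo_register_gsi(int gsi)
{
	int orig = gsi;

	if (gsi < DEMO_COMPRESSION_START || gsi >= DEMO_MAX_GSI)
		return gsi;			/* identity mapped or out of range */
	if (demo_gsi_to_irq[orig])
		return demo_gsi_to_irq[orig];	/* pin seen before, reuse mapping */

	gsi = demo_next_irq++;
	if (gsi == DEMO_SCI)			/* never hand out the SCI's IRQ */
		gsi = demo_next_irq++;
	demo_gsi_to_irq[orig] = gsi;
	return gsi;
}

int main(void)
{
	int sample[] = { 70, 300, 71, 70 };	/* sparse GSIs, one repeat */
	int i;

	for (i = 0; i < (int)(sizeof(sample) / sizeof(sample[0])); i++)
		printf("GSI %3d -> IRQ %d\n", sample[i],
		       demo_register_gsi(sample[i]));
	return 0;
}
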
4886--- sle11-2009-05-14.orig/arch/x86/kernel/mpparse_64-xen.c 2009-03-16 16:33:40.000000000 +0100
4887+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
4888@@ -1,879 +0,0 @@
4889-/*
4890- * Intel Multiprocessor Specification 1.1 and 1.4
4891- * compliant MP-table parsing routines.
4892- *
4893- * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
4894- * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
4895- *
4896- * Fixes
4897- * Erich Boleyn : MP v1.4 and additional changes.
4898- * Alan Cox : Added EBDA scanning
4899- * Ingo Molnar : various cleanups and rewrites
4900- * Maciej W. Rozycki: Bits for default MP configurations
4901- * Paul Diefenbaugh: Added full ACPI support
4902- */
4903-
4904-#include <linux/mm.h>
4905-#include <linux/init.h>
4906-#include <linux/delay.h>
4907-#include <linux/bootmem.h>
4908-#include <linux/kernel_stat.h>
4909-#include <linux/mc146818rtc.h>
4910-#include <linux/acpi.h>
4911-#include <linux/module.h>
4912-
4913-#include <asm/smp.h>
4914-#include <asm/mtrr.h>
4915-#include <asm/mpspec.h>
4916-#include <asm/pgalloc.h>
4917-#include <asm/io_apic.h>
4918-#include <asm/proto.h>
4919-#include <asm/acpi.h>
4920-
4921-/* Have we found an MP table */
4922-int smp_found_config;
4923-
4924-/*
4925- * Various Linux-internal data structures created from the
4926- * MP-table.
4927- */
4928-DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
4929-int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
4930-
4931-static int mp_current_pci_id = 0;
4932-/* I/O APIC entries */
4933-struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
4934-
4935-/* # of MP IRQ source entries */
4936-struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
4937-
4938-/* MP IRQ source entries */
4939-int mp_irq_entries;
4940-
4941-int nr_ioapics;
4942-unsigned long mp_lapic_addr = 0;
4943-
4944-
4945-
4946-/* Processor that is doing the boot up */
4947-unsigned int boot_cpu_id = -1U;
4948-EXPORT_SYMBOL(boot_cpu_id);
4949-
4950-/* Internal processor count */
4951-unsigned int num_processors;
4952-
4953-unsigned disabled_cpus __cpuinitdata;
4954-
4955-/* Bitmask of physically existing CPUs */
4956-physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE;
4957-
4958-#ifndef CONFIG_XEN
4959-u16 x86_bios_cpu_apicid_init[NR_CPUS] __initdata
4960- = { [0 ... NR_CPUS-1] = BAD_APICID };
4961-void *x86_bios_cpu_apicid_early_ptr;
4962-#endif
4963-DEFINE_PER_CPU(u16, x86_bios_cpu_apicid) = BAD_APICID;
4964-EXPORT_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
4965-
4966-
4967-/*
4968- * Intel MP BIOS table parsing routines:
4969- */
4970-
4971-/*
4972- * Checksum an MP configuration block.
4973- */
4974-
4975-static int __init mpf_checksum(unsigned char *mp, int len)
4976-{
4977- int sum = 0;
4978-
4979- while (len--)
4980- sum += *mp++;
4981-
4982- return sum & 0xFF;
4983-}
4984-
4985-#ifndef CONFIG_XEN
4986-static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
4987-{
4988- int cpu;
4989- cpumask_t tmp_map;
4990- char *bootup_cpu = "";
4991-
4992- if (!(m->mpc_cpuflag & CPU_ENABLED)) {
4993- disabled_cpus++;
4994- return;
4995- }
4996- if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
4997- bootup_cpu = " (Bootup-CPU)";
4998- boot_cpu_id = m->mpc_apicid;
4999- }
5000-
5001- printk(KERN_INFO "Processor #%d%s\n", m->mpc_apicid, bootup_cpu);
5002-
5003- if (num_processors >= NR_CPUS) {
5004- printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
5005- " Processor ignored.\n", NR_CPUS);
5006- return;
5007- }
5008-
5009- num_processors++;
5010- cpus_complement(tmp_map, cpu_present_map);
5011- cpu = first_cpu(tmp_map);
5012-
5013- physid_set(m->mpc_apicid, phys_cpu_present_map);
5014- if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
5015- /*
5016- * x86_bios_cpu_apicid is required to have processors listed
5017- * in same order as logical cpu numbers. Hence the first
5018- * entry is BSP, and so on.
5019- */
5020- cpu = 0;
5021- }
5022- /* are we being called early in kernel startup? */
5023- if (x86_cpu_to_apicid_early_ptr) {
5024- u16 *cpu_to_apicid = x86_cpu_to_apicid_early_ptr;
5025- u16 *bios_cpu_apicid = x86_bios_cpu_apicid_early_ptr;
5026-
5027- cpu_to_apicid[cpu] = m->mpc_apicid;
5028- bios_cpu_apicid[cpu] = m->mpc_apicid;
5029- } else {
5030- per_cpu(x86_cpu_to_apicid, cpu) = m->mpc_apicid;
5031- per_cpu(x86_bios_cpu_apicid, cpu) = m->mpc_apicid;
5032- }
5033-
5034- cpu_set(cpu, cpu_possible_map);
5035- cpu_set(cpu, cpu_present_map);
5036-}
5037-#else
5038-static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
5039-{
5040- num_processors++;
5041-}
5042-#endif /* CONFIG_XEN */
5043-
5044-static void __init MP_bus_info (struct mpc_config_bus *m)
5045-{
5046- char str[7];
5047-
5048- memcpy(str, m->mpc_bustype, 6);
5049- str[6] = 0;
5050- Dprintk("Bus #%d is %s\n", m->mpc_busid, str);
5051-
5052- if (strncmp(str, "ISA", 3) == 0) {
5053- set_bit(m->mpc_busid, mp_bus_not_pci);
5054- } else if (strncmp(str, "PCI", 3) == 0) {
5055- clear_bit(m->mpc_busid, mp_bus_not_pci);
5056- mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
5057- mp_current_pci_id++;
5058- } else {
5059- printk(KERN_ERR "Unknown bustype %s\n", str);
5060- }
5061-}
5062-
5063-static int bad_ioapic(unsigned long address)
5064-{
5065- if (nr_ioapics >= MAX_IO_APICS) {
5066- printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
5067- "(found %d)\n", MAX_IO_APICS, nr_ioapics);
5068- panic("Recompile kernel with bigger MAX_IO_APICS!\n");
5069- }
5070- if (!address) {
5071- printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
5072- " found in table, skipping!\n");
5073- return 1;
5074- }
5075- return 0;
5076-}
5077-
5078-static void __init MP_ioapic_info (struct mpc_config_ioapic *m)
5079-{
5080- if (!(m->mpc_flags & MPC_APIC_USABLE))
5081- return;
5082-
5083- printk("I/O APIC #%d at 0x%X.\n",
5084- m->mpc_apicid, m->mpc_apicaddr);
5085-
5086- if (bad_ioapic(m->mpc_apicaddr))
5087- return;
5088-
5089- mp_ioapics[nr_ioapics] = *m;
5090- nr_ioapics++;
5091-}
5092-
5093-static void __init MP_intsrc_info (struct mpc_config_intsrc *m)
5094-{
5095- mp_irqs [mp_irq_entries] = *m;
5096- Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
5097- " IRQ %02x, APIC ID %x, APIC INT %02x\n",
5098- m->mpc_irqtype, m->mpc_irqflag & 3,
5099- (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
5100- m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
5101- if (++mp_irq_entries >= MAX_IRQ_SOURCES)
5102- panic("Max # of irq sources exceeded!!\n");
5103-}
5104-
5105-static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m)
5106-{
5107- Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
5108- " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
5109- m->mpc_irqtype, m->mpc_irqflag & 3,
5110- (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid,
5111- m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
5112-}
5113-
5114-/*
5115- * Read/parse the MPC
5116- */
5117-
5118-static int __init smp_read_mpc(struct mp_config_table *mpc)
5119-{
5120- char str[16];
5121- int count=sizeof(*mpc);
5122- unsigned char *mpt=((unsigned char *)mpc)+count;
5123-
5124- if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) {
5125- printk("MPTABLE: bad signature [%c%c%c%c]!\n",
5126- mpc->mpc_signature[0],
5127- mpc->mpc_signature[1],
5128- mpc->mpc_signature[2],
5129- mpc->mpc_signature[3]);
5130- return 0;
5131- }
5132- if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) {
5133- printk("MPTABLE: checksum error!\n");
5134- return 0;
5135- }
5136- if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) {
5137- printk(KERN_ERR "MPTABLE: bad table version (%d)!!\n",
5138- mpc->mpc_spec);
5139- return 0;
5140- }
5141- if (!mpc->mpc_lapic) {
5142- printk(KERN_ERR "MPTABLE: null local APIC address!\n");
5143- return 0;
5144- }
5145- memcpy(str,mpc->mpc_oem,8);
5146- str[8] = 0;
5147- printk(KERN_INFO "MPTABLE: OEM ID: %s ",str);
5148-
5149- memcpy(str,mpc->mpc_productid,12);
5150- str[12] = 0;
5151- printk("MPTABLE: Product ID: %s ",str);
5152-
5153- printk("MPTABLE: APIC at: 0x%X\n",mpc->mpc_lapic);
5154-
5155- /* save the local APIC address, it might be non-default */
5156- if (!acpi_lapic)
5157- mp_lapic_addr = mpc->mpc_lapic;
5158-
5159- /*
5160- * Now process the configuration blocks.
5161- */
5162- while (count < mpc->mpc_length) {
5163- switch(*mpt) {
5164- case MP_PROCESSOR:
5165- {
5166- struct mpc_config_processor *m=
5167- (struct mpc_config_processor *)mpt;
5168- if (!acpi_lapic)
5169- MP_processor_info(m);
5170- mpt += sizeof(*m);
5171- count += sizeof(*m);
5172- break;
5173- }
5174- case MP_BUS:
5175- {
5176- struct mpc_config_bus *m=
5177- (struct mpc_config_bus *)mpt;
5178- MP_bus_info(m);
5179- mpt += sizeof(*m);
5180- count += sizeof(*m);
5181- break;
5182- }
5183- case MP_IOAPIC:
5184- {
5185- struct mpc_config_ioapic *m=
5186- (struct mpc_config_ioapic *)mpt;
5187- MP_ioapic_info(m);
5188- mpt += sizeof(*m);
5189- count += sizeof(*m);
5190- break;
5191- }
5192- case MP_INTSRC:
5193- {
5194- struct mpc_config_intsrc *m=
5195- (struct mpc_config_intsrc *)mpt;
5196-
5197- MP_intsrc_info(m);
5198- mpt += sizeof(*m);
5199- count += sizeof(*m);
5200- break;
5201- }
5202- case MP_LINTSRC:
5203- {
5204- struct mpc_config_lintsrc *m=
5205- (struct mpc_config_lintsrc *)mpt;
5206- MP_lintsrc_info(m);
5207- mpt += sizeof(*m);
5208- count += sizeof(*m);
5209- break;
5210- }
5211- }
5212- }
5213- setup_apic_routing();
5214- if (!num_processors)
5215- printk(KERN_ERR "MPTABLE: no processors registered!\n");
5216- return num_processors;
5217-}
5218-
5219-static int __init ELCR_trigger(unsigned int irq)
5220-{
5221- unsigned int port;
5222-
5223- port = 0x4d0 + (irq >> 3);
5224- return (inb(port) >> (irq & 7)) & 1;
5225-}
5226-
5227-static void __init construct_default_ioirq_mptable(int mpc_default_type)
5228-{
5229- struct mpc_config_intsrc intsrc;
5230- int i;
5231- int ELCR_fallback = 0;
5232-
5233- intsrc.mpc_type = MP_INTSRC;
5234- intsrc.mpc_irqflag = 0; /* conforming */
5235- intsrc.mpc_srcbus = 0;
5236- intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
5237-
5238- intsrc.mpc_irqtype = mp_INT;
5239-
5240- /*
5241- * If true, we have an ISA/PCI system with no IRQ entries
5242- * in the MP table. To prevent the PCI interrupts from being set up
5243- * incorrectly, we try to use the ELCR. The sanity check to see if
5244- * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can
5245- * never be level sensitive, so we simply see if the ELCR agrees.
5246- * If it does, we assume it's valid.
5247- */
5248- if (mpc_default_type == 5) {
5249- printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n");
5250-
5251- if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13))
5252- printk(KERN_ERR "ELCR contains invalid data... not using ELCR\n");
5253- else {
5254- printk(KERN_INFO "Using ELCR to identify PCI interrupts\n");
5255- ELCR_fallback = 1;
5256- }
5257- }
5258-
5259- for (i = 0; i < 16; i++) {
5260- switch (mpc_default_type) {
5261- case 2:
5262- if (i == 0 || i == 13)
5263- continue; /* IRQ0 & IRQ13 not connected */
5264- /* fall through */
5265- default:
5266- if (i == 2)
5267- continue; /* IRQ2 is never connected */
5268- }
5269-
5270- if (ELCR_fallback) {
5271- /*
5272- * If the ELCR indicates a level-sensitive interrupt, we
5273- * copy that information over to the MP table in the
5274- * irqflag field (level sensitive, active high polarity).
5275- */
5276- if (ELCR_trigger(i))
5277- intsrc.mpc_irqflag = 13;
5278- else
5279- intsrc.mpc_irqflag = 0;
5280- }
5281-
5282- intsrc.mpc_srcbusirq = i;
5283- intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */
5284- MP_intsrc_info(&intsrc);
5285- }
5286-
5287- intsrc.mpc_irqtype = mp_ExtINT;
5288- intsrc.mpc_srcbusirq = 0;
5289- intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */
5290- MP_intsrc_info(&intsrc);
5291-}
5292-
5293-static inline void __init construct_default_ISA_mptable(int mpc_default_type)
5294-{
5295- struct mpc_config_processor processor;
5296- struct mpc_config_bus bus;
5297- struct mpc_config_ioapic ioapic;
5298- struct mpc_config_lintsrc lintsrc;
5299- int linttypes[2] = { mp_ExtINT, mp_NMI };
5300- int i;
5301-
5302- /*
5303- * local APIC has default address
5304- */
5305- mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
5306-
5307- /*
5308- * 2 CPUs, numbered 0 & 1.
5309- */
5310- processor.mpc_type = MP_PROCESSOR;
5311- processor.mpc_apicver = 0;
5312- processor.mpc_cpuflag = CPU_ENABLED;
5313- processor.mpc_cpufeature = 0;
5314- processor.mpc_featureflag = 0;
5315- processor.mpc_reserved[0] = 0;
5316- processor.mpc_reserved[1] = 0;
5317- for (i = 0; i < 2; i++) {
5318- processor.mpc_apicid = i;
5319- MP_processor_info(&processor);
5320- }
5321-
5322- bus.mpc_type = MP_BUS;
5323- bus.mpc_busid = 0;
5324- switch (mpc_default_type) {
5325- default:
5326- printk(KERN_ERR "???\nUnknown standard configuration %d\n",
5327- mpc_default_type);
5328- /* fall through */
5329- case 1:
5330- case 5:
5331- memcpy(bus.mpc_bustype, "ISA ", 6);
5332- break;
5333- }
5334- MP_bus_info(&bus);
5335- if (mpc_default_type > 4) {
5336- bus.mpc_busid = 1;
5337- memcpy(bus.mpc_bustype, "PCI ", 6);
5338- MP_bus_info(&bus);
5339- }
5340-
5341- ioapic.mpc_type = MP_IOAPIC;
5342- ioapic.mpc_apicid = 2;
5343- ioapic.mpc_apicver = 0;
5344- ioapic.mpc_flags = MPC_APIC_USABLE;
5345- ioapic.mpc_apicaddr = 0xFEC00000;
5346- MP_ioapic_info(&ioapic);
5347-
5348- /*
5349- * We set up most of the low 16 IO-APIC pins according to MPS rules.
5350- */
5351- construct_default_ioirq_mptable(mpc_default_type);
5352-
5353- lintsrc.mpc_type = MP_LINTSRC;
5354- lintsrc.mpc_irqflag = 0; /* conforming */
5355- lintsrc.mpc_srcbusid = 0;
5356- lintsrc.mpc_srcbusirq = 0;
5357- lintsrc.mpc_destapic = MP_APIC_ALL;
5358- for (i = 0; i < 2; i++) {
5359- lintsrc.mpc_irqtype = linttypes[i];
5360- lintsrc.mpc_destapiclint = i;
5361- MP_lintsrc_info(&lintsrc);
5362- }
5363-}
5364-
5365-static struct intel_mp_floating *mpf_found;
5366-
5367-/*
5368- * Scan the memory blocks for an SMP configuration block.
5369- */
5370-void __init get_smp_config (void)
5371-{
5372- struct intel_mp_floating *mpf = mpf_found;
5373-
5374- /*
5375- * ACPI supports both logical (e.g. Hyper-Threading) and physical
5376- * processors, where MPS only supports physical.
5377- */
5378- if (acpi_lapic && acpi_ioapic) {
5379- printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n");
5380- return;
5381- }
5382- else if (acpi_lapic)
5383- printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n");
5384-
5385- printk("Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification);
5386-
5387- /*
5388- * Now see if we need to read further.
5389- */
5390- if (mpf->mpf_feature1 != 0) {
5391-
5392- printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1);
5393- construct_default_ISA_mptable(mpf->mpf_feature1);
5394-
5395- } else if (mpf->mpf_physptr) {
5396-
5397- /*
5398- * Read the physical hardware table. Anything here will
5399- * override the defaults.
5400- */
5401- if (!smp_read_mpc(isa_bus_to_virt(mpf->mpf_physptr))) {
5402- smp_found_config = 0;
5403- printk(KERN_ERR "BIOS bug, MP table errors detected!...\n");
5404- printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n");
5405- return;
5406- }
5407- /*
5408- * If there are no explicit MP IRQ entries, then we are
5409- * broken. We set up most of the low 16 IO-APIC pins to
5410- * ISA defaults and hope it will work.
5411- */
5412- if (!mp_irq_entries) {
5413- struct mpc_config_bus bus;
5414-
5415- printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n");
5416-
5417- bus.mpc_type = MP_BUS;
5418- bus.mpc_busid = 0;
5419- memcpy(bus.mpc_bustype, "ISA ", 6);
5420- MP_bus_info(&bus);
5421-
5422- construct_default_ioirq_mptable(0);
5423- }
5424-
5425- } else
5426- BUG();
5427-
5428- printk(KERN_INFO "Processors: %d\n", num_processors);
5429- /*
5430- * Only use the first configuration found.
5431- */
5432-}
5433-
5434-static int __init smp_scan_config (unsigned long base, unsigned long length)
5435-{
5436- extern void __bad_mpf_size(void);
5437- unsigned int *bp = isa_bus_to_virt(base);
5438- struct intel_mp_floating *mpf;
5439-
5440- Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length);
5441- if (sizeof(*mpf) != 16)
5442- __bad_mpf_size();
5443-
5444- while (length > 0) {
5445- mpf = (struct intel_mp_floating *)bp;
5446- if ((*bp == SMP_MAGIC_IDENT) &&
5447- (mpf->mpf_length == 1) &&
5448- !mpf_checksum((unsigned char *)bp, 16) &&
5449- ((mpf->mpf_specification == 1)
5450- || (mpf->mpf_specification == 4)) ) {
5451-
5452- smp_found_config = 1;
5453- mpf_found = mpf;
5454- return 1;
5455- }
5456- bp += 4;
5457- length -= 16;
5458- }
5459- return 0;
5460-}
5461-
5462-void __init find_smp_config(void)
5463-{
5464- unsigned int address;
5465-
5466- /*
5467- * FIXME: Linux assumes you have 640K of base ram..
5468- * this continues the error...
5469- *
5470- * 1) Scan the bottom 1K for a signature
5471- * 2) Scan the top 1K of base RAM
5472- * 3) Scan the 64K of bios
5473- */
5474- if (smp_scan_config(0x0,0x400) ||
5475- smp_scan_config(639*0x400,0x400) ||
5476- smp_scan_config(0xF0000,0x10000))
5477- return;
5478- /*
5479- * If it is an SMP machine we should know now.
5480- *
5481- * there is a real-mode segmented pointer pointing to the
5482- * 4K EBDA area at 0x40E, calculate and scan it here.
5483- *
5484- * NOTE! There are Linux loaders that will corrupt the EBDA
5485- * area, and as such this kind of SMP config may be less
5486- * trustworthy, simply because the SMP table may have been
5487- * stomped on during early boot. These loaders are buggy and
5488- * should be fixed.
5489- */
5490-
5491- address = *(unsigned short *)phys_to_virt(0x40E);
5492- address <<= 4;
5493- if (smp_scan_config(address, 0x1000))
5494- return;
5495-
5496- /* If we have come this far, we did not find an MP table */
5497- printk(KERN_INFO "No mptable found.\n");
5498-}
5499-
5500-/* --------------------------------------------------------------------------
5501- ACPI-based MP Configuration
5502- -------------------------------------------------------------------------- */
5503-
5504-#ifdef CONFIG_ACPI
5505-
5506-void __init mp_register_lapic_address(u64 address)
5507-{
5508-#ifndef CONFIG_XEN
5509- mp_lapic_addr = (unsigned long) address;
5510- set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
5511- if (boot_cpu_id == -1U)
5512- boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID));
5513-#endif
5514-}
5515-
5516-void __cpuinit mp_register_lapic (u8 id, u8 enabled)
5517-{
5518- struct mpc_config_processor processor;
5519- int boot_cpu = 0;
5520-
5521- if (id == boot_cpu_id)
5522- boot_cpu = 1;
5523-
5524-#ifndef CONFIG_XEN
5525- processor.mpc_type = MP_PROCESSOR;
5526- processor.mpc_apicid = id;
5527- processor.mpc_apicver = 0;
5528- processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0);
5529- processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0);
5530- processor.mpc_cpufeature = 0;
5531- processor.mpc_featureflag = 0;
5532- processor.mpc_reserved[0] = 0;
5533- processor.mpc_reserved[1] = 0;
5534-#endif
5535-
5536- MP_processor_info(&processor);
5537-}
5538-
5539-#define MP_ISA_BUS 0
5540-#define MP_MAX_IOAPIC_PIN 127
5541-
5542-static struct mp_ioapic_routing {
5543- int apic_id;
5544- int gsi_start;
5545- int gsi_end;
5546- u32 pin_programmed[4];
5547-} mp_ioapic_routing[MAX_IO_APICS];
5548-
5549-static int mp_find_ioapic(int gsi)
5550-{
5551- int i = 0;
5552-
5553- /* Find the IOAPIC that manages this GSI. */
5554- for (i = 0; i < nr_ioapics; i++) {
5555- if ((gsi >= mp_ioapic_routing[i].gsi_start)
5556- && (gsi <= mp_ioapic_routing[i].gsi_end))
5557- return i;
5558- }
5559-
5560- printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
5561- return -1;
5562-}
5563-
5564-static u8 uniq_ioapic_id(u8 id)
5565-{
5566- int i;
5567- DECLARE_BITMAP(used, 256);
5568- bitmap_zero(used, 256);
5569- for (i = 0; i < nr_ioapics; i++) {
5570- struct mpc_config_ioapic *ia = &mp_ioapics[i];
5571- __set_bit(ia->mpc_apicid, used);
5572- }
5573- if (!test_bit(id, used))
5574- return id;
5575- return find_first_zero_bit(used, 256);
5576-}
5577-
5578-void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base)
5579-{
5580- int idx = 0;
5581-
5582- if (bad_ioapic(address))
5583- return;
5584-
5585- idx = nr_ioapics;
5586-
5587- mp_ioapics[idx].mpc_type = MP_IOAPIC;
5588- mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
5589- mp_ioapics[idx].mpc_apicaddr = address;
5590-
5591-#ifndef CONFIG_XEN
5592- set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
5593-#endif
5594- mp_ioapics[idx].mpc_apicid = uniq_ioapic_id(id);
5595- mp_ioapics[idx].mpc_apicver = 0;
5596-
5597- /*
5598- * Build basic IRQ lookup table to facilitate gsi->io_apic lookups
5599- * and to prevent reprogramming of IOAPIC pins (PCI IRQs).
5600- */
5601- mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
5602- mp_ioapic_routing[idx].gsi_start = gsi_base;
5603- mp_ioapic_routing[idx].gsi_end = gsi_base +
5604- io_apic_get_redir_entries(idx);
5605-
5606- printk(KERN_INFO "IOAPIC[%d]: apic_id %d, address 0x%x, "
5607- "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
5608- mp_ioapics[idx].mpc_apicaddr,
5609- mp_ioapic_routing[idx].gsi_start,
5610- mp_ioapic_routing[idx].gsi_end);
5611-
5612- nr_ioapics++;
5613-}
5614-
5615-void __init
5616-mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
5617-{
5618- struct mpc_config_intsrc intsrc;
5619- int ioapic = -1;
5620- int pin = -1;
5621-
5622- /*
5623- * Convert 'gsi' to 'ioapic.pin'.
5624- */
5625- ioapic = mp_find_ioapic(gsi);
5626- if (ioapic < 0)
5627- return;
5628- pin = gsi - mp_ioapic_routing[ioapic].gsi_start;
5629-
5630- /*
5631- * TBD: This check is for faulty timer entries, where the override
5632- * erroneously sets the trigger to level, resulting in a HUGE
5633- * increase of timer interrupts!
5634- */
5635- if ((bus_irq == 0) && (trigger == 3))
5636- trigger = 1;
5637-
5638- intsrc.mpc_type = MP_INTSRC;
5639- intsrc.mpc_irqtype = mp_INT;
5640- intsrc.mpc_irqflag = (trigger << 2) | polarity;
5641- intsrc.mpc_srcbus = MP_ISA_BUS;
5642- intsrc.mpc_srcbusirq = bus_irq; /* IRQ */
5643- intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */
5644- intsrc.mpc_dstirq = pin; /* INTIN# */
5645-
5646- Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n",
5647- intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
5648- (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
5649- intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq);
5650-
5651- mp_irqs[mp_irq_entries] = intsrc;
5652- if (++mp_irq_entries == MAX_IRQ_SOURCES)
5653- panic("Max # of irq sources exceeded!\n");
5654-}
5655-
5656-void __init mp_config_acpi_legacy_irqs(void)
5657-{
5658- struct mpc_config_intsrc intsrc;
5659- int i = 0;
5660- int ioapic = -1;
5661-
5662- /*
5663- * Fabricate the legacy ISA bus (bus #31).
5664- */
5665- set_bit(MP_ISA_BUS, mp_bus_not_pci);
5666-
5667- /*
5668- * Locate the IOAPIC that manages the ISA IRQs (0-15).
5669- */
5670- ioapic = mp_find_ioapic(0);
5671- if (ioapic < 0)
5672- return;
5673-
5674- intsrc.mpc_type = MP_INTSRC;
5675- intsrc.mpc_irqflag = 0; /* Conforming */
5676- intsrc.mpc_srcbus = MP_ISA_BUS;
5677- intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
5678-
5679- /*
5680- * Use the default configuration for the IRQs 0-15. Unless
5681- * overridden by (MADT) interrupt source override entries.
5682- */
5683- for (i = 0; i < 16; i++) {
5684- int idx;
5685-
5686- for (idx = 0; idx < mp_irq_entries; idx++) {
5687- struct mpc_config_intsrc *irq = mp_irqs + idx;
5688-
5689- /* Do we already have a mapping for this ISA IRQ? */
5690- if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i)
5691- break;
5692-
5693- /* Do we already have a mapping for this IOAPIC pin */
5694- if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
5695- (irq->mpc_dstirq == i))
5696- break;
5697- }
5698-
5699- if (idx != mp_irq_entries) {
5700- printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
5701- continue; /* IRQ already used */
5702- }
5703-
5704- intsrc.mpc_irqtype = mp_INT;
5705- intsrc.mpc_srcbusirq = i; /* Identity mapped */
5706- intsrc.mpc_dstirq = i;
5707-
5708- Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, "
5709- "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
5710- (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
5711- intsrc.mpc_srcbusirq, intsrc.mpc_dstapic,
5712- intsrc.mpc_dstirq);
5713-
5714- mp_irqs[mp_irq_entries] = intsrc;
5715- if (++mp_irq_entries == MAX_IRQ_SOURCES)
5716- panic("Max # of irq sources exceeded!\n");
5717- }
5718-}
5719-
5720-int mp_register_gsi(u32 gsi, int triggering, int polarity)
5721-{
5722- int ioapic = -1;
5723- int ioapic_pin = 0;
5724- int idx, bit = 0;
5725-
5726- if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
5727- return gsi;
5728-
5729- /* Don't set up the ACPI SCI because it's already set up */
5730- if (acpi_gbl_FADT.sci_interrupt == gsi)
5731- return gsi;
5732-
5733- ioapic = mp_find_ioapic(gsi);
5734- if (ioapic < 0) {
5735- printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
5736- return gsi;
5737- }
5738-
5739- ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_start;
5740-
5741- /*
5742- * Avoid pin reprogramming. PRTs typically include entries
5743- * with redundant pin->gsi mappings (but unique PCI devices);
5744- * we only program the IOAPIC on the first.
5745- */
5746- bit = ioapic_pin % 32;
5747- idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32);
5748- if (idx > 3) {
5749- printk(KERN_ERR "Invalid reference to IOAPIC pin "
5750- "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
5751- ioapic_pin);
5752- return gsi;
5753- }
5754- if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
5755- Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
5756- mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
5757- return gsi;
5758- }
5759-
5760- mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
5761-
5762- io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
5763- triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
5764- polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
5765- return gsi;
5766-}
5767-#endif /*CONFIG_ACPI*/
5768--- sle11-2009-05-14.orig/arch/x86/kernel/pci-dma-xen.c 2009-03-16 16:33:40.000000000 +0100
5769+++ sle11-2009-05-14/arch/x86/kernel/pci-dma-xen.c 2009-03-16 16:38:05.000000000 +0100
5770@@ -1,283 +1,251 @@
5771-/*
5772- * Dynamic DMA mapping support.
5773- *
5774- * On i386 there is no hardware dynamic DMA address translation,
5775- * so consistent alloc/free are merely page allocation/freeing.
5776- * The rest of the dynamic DMA mapping interface is implemented
5777- * in asm/pci.h.
5778- */
5779-
5780-#include <linux/types.h>
5781-#include <linux/mm.h>
5782-#include <linux/string.h>
5783+#include <linux/dma-mapping.h>
5784+#include <linux/dmar.h>
5785+#include <linux/bootmem.h>
5786 #include <linux/pci.h>
5787-#include <linux/module.h>
5788-#include <linux/version.h>
5789-#include <asm/io.h>
5790-#include <xen/balloon.h>
5791-#include <xen/gnttab.h>
5792-#include <asm/swiotlb.h>
5793-#include <asm/tlbflush.h>
5794-#include <asm/swiotlb_32.h>
5795-#include <asm/gnttab_dma.h>
5796-#include <asm/bug.h>
5797
5798-#ifdef __x86_64__
5799-#include <asm/iommu.h>
5800+#include <asm/proto.h>
5801+#include <asm/dma.h>
5802+#include <asm/gart.h>
5803+#include <asm/calgary.h>
5804+
5805+int forbid_dac __read_mostly;
5806+EXPORT_SYMBOL(forbid_dac);
5807+
5808+const struct dma_mapping_ops *dma_ops;
5809+EXPORT_SYMBOL(dma_ops);
5810+
5811+static int iommu_sac_force __read_mostly;
5812+
5813+#ifdef CONFIG_IOMMU_DEBUG
5814+int panic_on_overflow __read_mostly = 1;
5815+int force_iommu __read_mostly = 1;
5816+#else
5817+int panic_on_overflow __read_mostly = 0;
5818+int force_iommu __read_mostly = 0;
5819+#endif
5820
5821 int iommu_merge __read_mostly = 0;
5822-EXPORT_SYMBOL(iommu_merge);
5823
5824-dma_addr_t bad_dma_address __read_mostly;
5825-EXPORT_SYMBOL(bad_dma_address);
5826+int no_iommu __read_mostly;
5827+/* Set this to 1 if there is a HW IOMMU in the system */
5828+int iommu_detected __read_mostly = 0;
5829
5830 /* This tells the BIO block layer to assume merging. Default to off
5831 because we cannot guarantee merging later. */
5832 int iommu_bio_merge __read_mostly = 0;
5833 EXPORT_SYMBOL(iommu_bio_merge);
5834
5835-int force_iommu __read_mostly= 0;
5836+dma_addr_t bad_dma_address __read_mostly = 0;
5837+EXPORT_SYMBOL(bad_dma_address);
5838
5839-__init int iommu_setup(char *p)
5840-{
5841- return 1;
5842-}
5843+/* Dummy device used for NULL arguments (normally ISA). Better would
5844+ be probably a smaller DMA mask, but this is bug-to-bug compatible
5845+ to older i386. */
5846+struct device fallback_dev = {
5847+ .bus_id = "fallback device",
5848+ .coherent_dma_mask = DMA_32BIT_MASK,
5849+ .dma_mask = &fallback_dev.coherent_dma_mask,
5850+};
5851
5852-void __init pci_iommu_alloc(void)
5853+int dma_set_mask(struct device *dev, u64 mask)
5854 {
5855-#ifdef CONFIG_SWIOTLB
5856- pci_swiotlb_init();
5857-#endif
5858-}
5859+ if (!dev->dma_mask || !dma_supported(dev, mask))
5860+ return -EIO;
5861+
5862+ *dev->dma_mask = mask;
5863
5864-static int __init pci_iommu_init(void)
5865-{
5866- no_iommu_init();
5867 return 0;
5868 }
5869+EXPORT_SYMBOL(dma_set_mask);
5870
5871-/* Must execute after PCI subsystem */
5872-fs_initcall(pci_iommu_init);
5873-#endif
5874-
5875-struct dma_coherent_mem {
5876- void *virt_base;
5877- u32 device_base;
5878- int size;
5879- int flags;
5880- unsigned long *bitmap;
5881-};
5882-
5883-#define IOMMU_BUG_ON(test) \
5884-do { \
5885- if (unlikely(test)) { \
5886- printk(KERN_ALERT "Fatal DMA error! " \
5887- "Please use 'swiotlb=force'\n"); \
5888- BUG(); \
5889- } \
5890-} while (0)
5891+#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
5892+static __initdata void *dma32_bootmem_ptr;
5893+static unsigned long dma32_bootmem_size __initdata = (128ULL<<20);
5894
5895-static int check_pages_physically_contiguous(unsigned long pfn,
5896- unsigned int offset,
5897- size_t length)
5898+static int __init parse_dma32_size_opt(char *p)
5899 {
5900- unsigned long next_mfn;
5901- int i;
5902- int nr_pages;
5903-
5904- next_mfn = pfn_to_mfn(pfn);
5905- nr_pages = (offset + length + PAGE_SIZE-1) >> PAGE_SHIFT;
5906-
5907- for (i = 1; i < nr_pages; i++) {
5908- if (pfn_to_mfn(++pfn) != ++next_mfn)
5909- return 0;
5910- }
5911- return 1;
5912+ if (!p)
5913+ return -EINVAL;
5914+ dma32_bootmem_size = memparse(p, &p);
5915+ return 0;
5916 }
5917+early_param("dma32_size", parse_dma32_size_opt);
5918
5919-int range_straddles_page_boundary(paddr_t p, size_t size)
5920+void __init dma32_reserve_bootmem(void)
5921 {
5922- unsigned long pfn = p >> PAGE_SHIFT;
5923- unsigned int offset = p & ~PAGE_MASK;
5924+ unsigned long size, align;
5925+ if (end_pfn <= MAX_DMA32_PFN)
5926+ return;
5927
5928- return ((offset + size > PAGE_SIZE) &&
5929- !check_pages_physically_contiguous(pfn, offset, size));
5930+ align = 64ULL<<20;
5931+ size = round_up(dma32_bootmem_size, align);
5932+ dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align,
5933+ __pa(MAX_DMA_ADDRESS));
5934+ if (dma32_bootmem_ptr)
5935+ dma32_bootmem_size = size;
5936+ else
5937+ dma32_bootmem_size = 0;
5938 }
5939-
5940-int
5941-dma_map_sg(struct device *hwdev, struct scatterlist *sgl, int nents,
5942- enum dma_data_direction direction)
5943+static void __init dma32_free_bootmem(void)
5944 {
5945- int i, rc;
5946+ int node;
5947+
5948+ if (end_pfn <= MAX_DMA32_PFN)
5949+ return;
5950
5951- BUG_ON(!valid_dma_direction(direction));
5952- WARN_ON(nents == 0 || sgl->length == 0);
5953+ if (!dma32_bootmem_ptr)
5954+ return;
5955
5956- if (swiotlb) {
5957- rc = swiotlb_map_sg(hwdev, sgl, nents, direction);
5958- } else {
5959- struct scatterlist *sg;
5960-
5961- for_each_sg(sgl, sg, nents, i) {
5962- BUG_ON(!sg_page(sg));
5963- sg->dma_address =
5964- gnttab_dma_map_page(sg_page(sg)) + sg->offset;
5965- sg->dma_length = sg->length;
5966- IOMMU_BUG_ON(address_needs_mapping(
5967- hwdev, sg->dma_address));
5968- IOMMU_BUG_ON(range_straddles_page_boundary(
5969- page_to_pseudophys(sg_page(sg)) + sg->offset,
5970- sg->length));
5971- }
5972- rc = nents;
5973- }
5974+ for_each_online_node(node)
5975+ free_bootmem_node(NODE_DATA(node), __pa(dma32_bootmem_ptr),
5976+ dma32_bootmem_size);
5977
5978- flush_write_buffers();
5979- return rc;
5980+ dma32_bootmem_ptr = NULL;
5981+ dma32_bootmem_size = 0;
5982 }
5983-EXPORT_SYMBOL(dma_map_sg);
5984+#else
5985+#define dma32_free_bootmem() ((void)0)
5986+#endif
5987
5988-void
5989-dma_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nents,
5990- enum dma_data_direction direction)
5991-{
5992- int i;
5993+static const struct dma_mapping_ops swiotlb_dma_ops = {
5994+ .mapping_error = swiotlb_dma_mapping_error,
5995+ .map_single = swiotlb_map_single_phys,
5996+ .unmap_single = swiotlb_unmap_single,
5997+ .sync_single_for_cpu = swiotlb_sync_single_for_cpu,
5998+ .sync_single_for_device = swiotlb_sync_single_for_device,
5999+ .sync_single_range_for_cpu = swiotlb_sync_single_range_for_cpu,
6000+ .sync_single_range_for_device = swiotlb_sync_single_range_for_device,
6001+ .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu,
6002+ .sync_sg_for_device = swiotlb_sync_sg_for_device,
6003+ .map_sg = swiotlb_map_sg,
6004+ .unmap_sg = swiotlb_unmap_sg,
6005+ .dma_supported = swiotlb_dma_supported
6006+};
6007
6008- BUG_ON(!valid_dma_direction(direction));
6009- if (swiotlb)
6010- swiotlb_unmap_sg(hwdev, sgl, nents, direction);
6011- else {
6012- struct scatterlist *sg;
6013+void __init pci_iommu_alloc(void)
6014+{
6015+ /* free the range so iommu could get some range less than 4G */
6016+ dma32_free_bootmem();
6017+ /*
6018+ * The order of these functions is important for
6019+ * fall-back/fail-over reasons
6020+ */
6021+#ifdef CONFIG_GART_IOMMU
6022+ gart_iommu_hole_init();
6023+#endif
6024
6025- for_each_sg(sgl, sg, nents, i)
6026- gnttab_dma_unmap_page(sg->dma_address);
6027- }
6028-}
6029-EXPORT_SYMBOL(dma_unmap_sg);
6030+#ifdef CONFIG_CALGARY_IOMMU
6031+ detect_calgary();
6032+#endif
6033
6034-#ifdef CONFIG_HIGHMEM
6035-dma_addr_t
6036-dma_map_page(struct device *dev, struct page *page, unsigned long offset,
6037- size_t size, enum dma_data_direction direction)
6038-{
6039- dma_addr_t dma_addr;
6040+ detect_intel_iommu();
6041
6042- BUG_ON(!valid_dma_direction(direction));
6043+#ifdef CONFIG_SWIOTLB
6044+ swiotlb_init();
6045 if (swiotlb) {
6046- dma_addr = swiotlb_map_page(
6047- dev, page, offset, size, direction);
6048- } else {
6049- dma_addr = gnttab_dma_map_page(page) + offset;
6050- IOMMU_BUG_ON(address_needs_mapping(dev, dma_addr));
6051+ printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n");
6052+ dma_ops = &swiotlb_dma_ops;
6053 }
6054-
6055- return dma_addr;
6056+#endif
6057 }
6058-EXPORT_SYMBOL(dma_map_page);
6059
6060-void
6061-dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size,
6062- enum dma_data_direction direction)
6063+/*
6064+ * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter
6065+ * documentation.
6066+ */
6067+static __init int iommu_setup(char *p)
6068 {
6069- BUG_ON(!valid_dma_direction(direction));
6070- if (swiotlb)
6071- swiotlb_unmap_page(dev, dma_address, size, direction);
6072- else
6073- gnttab_dma_unmap_page(dma_address);
6074-}
6075-EXPORT_SYMBOL(dma_unmap_page);
6076-#endif /* CONFIG_HIGHMEM */
6077+ iommu_merge = 1;
6078
6079-int
6080-dma_mapping_error(dma_addr_t dma_addr)
6081-{
6082- if (swiotlb)
6083- return swiotlb_dma_mapping_error(dma_addr);
6084- return 0;
6085-}
6086-EXPORT_SYMBOL(dma_mapping_error);
6087+ if (!p)
6088+ return -EINVAL;
6089
6090-int
6091-dma_supported(struct device *dev, u64 mask)
6092-{
6093- if (swiotlb)
6094- return swiotlb_dma_supported(dev, mask);
6095- /*
6096- * By default we'll BUG when an infeasible DMA is requested, and
6097- * request swiotlb=force (see IOMMU_BUG_ON).
6098- */
6099- return 1;
6100-}
6101-EXPORT_SYMBOL(dma_supported);
6102+ while (*p) {
6103+ if (!strncmp(p, "off", 3))
6104+ no_iommu = 1;
6105+ /* gart_parse_options has more force support */
6106+ if (!strncmp(p, "force", 5))
6107+ force_iommu = 1;
6108+ if (!strncmp(p, "noforce", 7)) {
6109+ iommu_merge = 0;
6110+ force_iommu = 0;
6111+ }
6112
6113-void *dma_alloc_coherent(struct device *dev, size_t size,
6114- dma_addr_t *dma_handle, gfp_t gfp)
6115-{
6116- void *ret;
6117- struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
6118- unsigned int order = get_order(size);
6119- unsigned long vstart;
6120- u64 mask;
6121+ if (!strncmp(p, "biomerge", 8)) {
6122+ iommu_bio_merge = 4096;
6123+ iommu_merge = 1;
6124+ force_iommu = 1;
6125+ }
6126+ if (!strncmp(p, "panic", 5))
6127+ panic_on_overflow = 1;
6128+ if (!strncmp(p, "nopanic", 7))
6129+ panic_on_overflow = 0;
6130+ if (!strncmp(p, "merge", 5)) {
6131+ iommu_merge = 1;
6132+ force_iommu = 1;
6133+ }
6134+ if (!strncmp(p, "nomerge", 7))
6135+ iommu_merge = 0;
6136+ if (!strncmp(p, "forcesac", 8))
6137+ iommu_sac_force = 1;
6138+ if (!strncmp(p, "allowdac", 8))
6139+ forbid_dac = 0;
6140+ if (!strncmp(p, "nodac", 5))
6141+ forbid_dac = -1;
6142+ if (!strncmp(p, "usedac", 6)) {
6143+ forbid_dac = -1;
6144+ return 1;
6145+ }
6146+#ifdef CONFIG_SWIOTLB
6147+ if (!strncmp(p, "soft", 4))
6148+ swiotlb = 1;
6149+#endif
6150
6151- /* ignore region specifiers */
6152- gfp &= ~(__GFP_DMA | __GFP_HIGHMEM);
6153+#ifdef CONFIG_GART_IOMMU
6154+ gart_parse_options(p);
6155+#endif
6156
6157- if (mem) {
6158- int page = bitmap_find_free_region(mem->bitmap, mem->size,
6159- order);
6160- if (page >= 0) {
6161- *dma_handle = mem->device_base + (page << PAGE_SHIFT);
6162- ret = mem->virt_base + (page << PAGE_SHIFT);
6163- memset(ret, 0, size);
6164- return ret;
6165- }
6166- if (mem->flags & DMA_MEMORY_EXCLUSIVE)
6167- return NULL;
6168+#ifdef CONFIG_CALGARY_IOMMU
6169+ if (!strncmp(p, "calgary", 7))
6170+ use_calgary = 1;
6171+#endif /* CONFIG_CALGARY_IOMMU */
6172+
6173+ p += strcspn(p, ",");
6174+ if (*p == ',')
6175+ ++p;
6176 }
6177+ return 0;
6178+}
6179+early_param("iommu", iommu_setup);
6180
6181- if (dev == NULL || (dev->coherent_dma_mask < 0xffffffff))
6182- gfp |= GFP_DMA;
6183-
6184- vstart = __get_free_pages(gfp, order);
6185- ret = (void *)vstart;
6186+static int check_pages_physically_contiguous(unsigned long pfn,
6187+ unsigned int offset,
6188+ size_t length)
6189+{
6190+ unsigned long next_mfn;
6191+ int i;
6192+ int nr_pages;
6193
6194- if (dev != NULL && dev->coherent_dma_mask)
6195- mask = dev->coherent_dma_mask;
6196- else
6197- mask = 0xffffffff;
6198+ next_mfn = pfn_to_mfn(pfn);
6199+ nr_pages = (offset + length + PAGE_SIZE-1) >> PAGE_SHIFT;
6200
6201- if (ret != NULL) {
6202- if (xen_create_contiguous_region(vstart, order,
6203- fls64(mask)) != 0) {
6204- free_pages(vstart, order);
6205- return NULL;
6206- }
6207- memset(ret, 0, size);
6208- *dma_handle = virt_to_bus(ret);
6209+ for (i = 1; i < nr_pages; i++) {
6210+ if (pfn_to_mfn(++pfn) != ++next_mfn)
6211+ return 0;
6212 }
6213- return ret;
6214+ return 1;
6215 }
6216-EXPORT_SYMBOL(dma_alloc_coherent);
6217
6218-void dma_free_coherent(struct device *dev, size_t size,
6219- void *vaddr, dma_addr_t dma_handle)
6220+int range_straddles_page_boundary(paddr_t p, size_t size)
6221 {
6222- struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
6223- int order = get_order(size);
6224-
6225- WARN_ON(irqs_disabled()); /* for portability */
6226- if (mem && vaddr >= mem->virt_base && vaddr < (mem->virt_base + (mem->size << PAGE_SHIFT))) {
6227- int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
6228+ unsigned long pfn = p >> PAGE_SHIFT;
6229+ unsigned int offset = p & ~PAGE_MASK;
6230
6231- bitmap_release_region(mem->bitmap, page, order);
6232- } else {
6233- xen_destroy_contiguous_region((unsigned long)vaddr, order);
6234- free_pages((unsigned long)vaddr, order);
6235- }
6236+ return ((offset + size > PAGE_SIZE) &&
6237+ !check_pages_physically_contiguous(pfn, offset, size));
6238 }
6239-EXPORT_SYMBOL(dma_free_coherent);
6240
6241-#ifdef ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY
6242+#ifdef CONFIG_X86_32
6243 int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
6244 dma_addr_t device_addr, size_t size, int flags)
6245 {
6246@@ -327,8 +295,8 @@ EXPORT_SYMBOL(dma_declare_coherent_memor
6247 void dma_release_declared_memory(struct device *dev)
6248 {
6249 struct dma_coherent_mem *mem = dev->dma_mem;
6250-
6251- if(!mem)
6252+
6253+ if (!mem)
6254 return;
6255 dev->dma_mem = NULL;
6256 iounmap(mem->virt_base);
6257@@ -341,8 +309,10 @@ void *dma_mark_declared_memory_occupied(
6258 dma_addr_t device_addr, size_t size)
6259 {
6260 struct dma_coherent_mem *mem = dev->dma_mem;
6261- int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1) >> PAGE_SHIFT;
6262 int pos, err;
6263+ int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1);
6264+
6265+ pages >>= PAGE_SHIFT;
6266
6267 if (!mem)
6268 return ERR_PTR(-EINVAL);
6269@@ -354,103 +324,270 @@ void *dma_mark_declared_memory_occupied(
6270 return mem->virt_base + (pos << PAGE_SHIFT);
6271 }
6272 EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
6273-#endif /* ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY */
6274-
6275-#if defined(CONFIG_PCI) && !defined(CONFIG_XEN)
6276-/* Many VIA bridges seem to corrupt data for DAC. Disable it here */
6277
6278-int forbid_dac;
6279-EXPORT_SYMBOL(forbid_dac);
6280-
6281-static __devinit void via_no_dac(struct pci_dev *dev)
6282+static int dma_alloc_from_coherent_mem(struct device *dev, ssize_t size,
6283+ dma_addr_t *dma_handle, void **ret)
6284 {
6285- if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) {
6286- printk(KERN_INFO "PCI: VIA PCI bridge detected. Disabling DAC.\n");
6287- forbid_dac = 1;
6288+ struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
6289+ int order = get_order(size);
6290+
6291+ if (mem) {
6292+ int page = bitmap_find_free_region(mem->bitmap, mem->size,
6293+ order);
6294+ if (page >= 0) {
6295+ *dma_handle = mem->device_base + (page << PAGE_SHIFT);
6296+ *ret = mem->virt_base + (page << PAGE_SHIFT);
6297+ memset(*ret, 0, size);
6298+ }
6299+ if (mem->flags & DMA_MEMORY_EXCLUSIVE)
6300+ *ret = NULL;
6301 }
6302+ return (mem != NULL);
6303 }
6304-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac);
6305
6306-static int check_iommu(char *s)
6307+static int dma_release_coherent(struct device *dev, int order, void *vaddr)
6308 {
6309- if (!strcmp(s, "usedac")) {
6310- forbid_dac = -1;
6311+ struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
6312+
6313+ if (mem && vaddr >= mem->virt_base && vaddr <
6314+ (mem->virt_base + (mem->size << PAGE_SHIFT))) {
6315+ int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
6316+
6317+ bitmap_release_region(mem->bitmap, page, order);
6318 return 1;
6319 }
6320 return 0;
6321 }
6322-__setup("iommu=", check_iommu);
6323+#else
6324+#define dma_alloc_from_coherent_mem(dev, size, handle, ret) (0)
6325+#define dma_release_coherent(dev, order, vaddr) (0)
6326+#endif /* CONFIG_X86_32 */
6327+
6328+int dma_supported(struct device *dev, u64 mask)
6329+{
6330+#ifdef CONFIG_PCI
6331+ if (mask > 0xffffffff && forbid_dac > 0) {
6332+ printk(KERN_INFO "PCI: Disallowing DAC for device %s\n",
6333+ dev->bus_id);
6334+ return 0;
6335+ }
6336 #endif
6337
6338-dma_addr_t
6339-dma_map_single(struct device *dev, void *ptr, size_t size,
6340- enum dma_data_direction direction)
6341+ if (dma_ops->dma_supported)
6342+ return dma_ops->dma_supported(dev, mask);
6343+
6344+ /* Copied from i386. Doesn't make much sense, because it will
6345+ only work for pci_alloc_coherent.
6346+ The caller just has to use GFP_DMA in this case. */
6347+ if (mask < DMA_24BIT_MASK)
6348+ return 0;
6349+
6350+ /* Tell the device to use SAC when IOMMU force is on. This
6351+ allows the driver to use cheaper accesses in some cases.
6352+
6353+ Problem with this is that if we overflow the IOMMU area and
6354+ return DAC as fallback address the device may not handle it
6355+ correctly.
6356+
6357+ As a special case some controllers have a 39bit address
6358+ mode that is as efficient as 32bit (aic79xx). Don't force
6359+ SAC for these. Assume all masks <= 40 bits are of this
6360+ type. Normally this doesn't make any difference, but gives
6361+ more gentle handling of IOMMU overflow. */
6362+ if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) {
6363+ printk(KERN_INFO "%s: Force SAC with mask %Lx\n",
6364+ dev->bus_id, mask);
6365+ return 0;
6366+ }
6367+
6368+ return 1;
6369+}
6370+EXPORT_SYMBOL(dma_supported);
6371+
6372+/* Allocate DMA memory on node near device */
6373+static struct page *
6374+dma_alloc_pages(struct device *dev, gfp_t gfp, unsigned order)
6375 {
6376- dma_addr_t dma;
6377+ int node;
6378
6379- BUG_ON(!valid_dma_direction(direction));
6380- WARN_ON(size == 0);
6381+ node = dev_to_node(dev);
6382
6383- if (swiotlb) {
6384- dma = swiotlb_map_single(dev, ptr, size, direction);
6385- } else {
6386- dma = gnttab_dma_map_page(virt_to_page(ptr)) +
6387- offset_in_page(ptr);
6388- IOMMU_BUG_ON(range_straddles_page_boundary(__pa(ptr), size));
6389- IOMMU_BUG_ON(address_needs_mapping(dev, dma));
6390- }
6391-
6392- flush_write_buffers();
6393- return dma;
6394-}
6395-EXPORT_SYMBOL(dma_map_single);
6396-
6397-void
6398-dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
6399- enum dma_data_direction direction)
6400-{
6401- BUG_ON(!valid_dma_direction(direction));
6402- if (swiotlb)
6403- swiotlb_unmap_single(dev, dma_addr, size, direction);
6404- else
6405- gnttab_dma_unmap_page(dma_addr);
6406+ return alloc_pages_node(node, gfp, order);
6407+}
6408+
6409+/*
6410+ * Allocate memory for a coherent mapping.
6411+ */
6412+void *
6413+dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
6414+ gfp_t gfp)
6415+{
6416+ void *memory = NULL;
6417+ struct page *page;
6418+ unsigned long dma_mask = 0;
6419+ int noretry = 0;
6420+ unsigned int order = get_order(size);
6421+
6422+ /* ignore region specifiers */
6423+ gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
6424+
6425+ if (dma_alloc_from_coherent_mem(dev, size, dma_handle, &memory))
6426+ return memory;
6427+
6428+ if (!dev) {
6429+ dev = &fallback_dev;
6430+ gfp |= GFP_DMA;
6431+ }
6432+ dma_mask = dev->coherent_dma_mask;
6433+ if (dma_mask == 0)
6434+ dma_mask = (gfp & GFP_DMA) ? DMA_24BIT_MASK : DMA_32BIT_MASK;
6435+
6436+ /* Device not DMA able */
6437+ if (dev->dma_mask == NULL)
6438+ return NULL;
6439+
6440+ /* Don't invoke OOM killer or retry in lower 16MB DMA zone */
6441+ if (gfp & __GFP_DMA)
6442+ noretry = 1;
6443+
6444+#ifdef CONFIG_XEN
6445+ gfp &= ~(__GFP_DMA | __GFP_DMA32);
6446+#else
6447+#ifdef CONFIG_X86_64
6448+ /* Why <=? Even when the mask is smaller than 4GB it is often
6449+ larger than 16MB and in this case we have a chance of
6450+ finding fitting memory in the next higher zone first. If
6451+ not retry with true GFP_DMA. -AK */
6452+ if (dma_mask <= DMA_32BIT_MASK && !(gfp & GFP_DMA)) {
6453+ gfp |= GFP_DMA32;
6454+#endif
6455+
6456+ again:
6457+#endif
6458+ page = dma_alloc_pages(dev,
6459+ noretry ? gfp | __GFP_NORETRY : gfp, order);
6460+ if (page == NULL)
6461+ return NULL;
6462+
6463+#ifndef CONFIG_XEN
6464+ {
6465+ int high, mmu;
6466+ dma_addr_t bus = page_to_phys(page);
6467+ memory = page_address(page);
6468+ high = (bus + size) >= dma_mask;
6469+ mmu = high;
6470+ if (force_iommu && !(gfp & GFP_DMA))
6471+ mmu = 1;
6472+ else if (high) {
6473+ free_pages((unsigned long)memory, order);
6474+
6475+ /* Don't use the 16MB ZONE_DMA unless absolutely
6476+ needed. It's better to use remapping first. */
6477+ if (dma_mask < DMA_32BIT_MASK && !(gfp & GFP_DMA)) {
6478+ gfp = (gfp & ~GFP_DMA32) | GFP_DMA;
6479+ goto again;
6480+ }
6481+
6482+ /* Let low level make its own zone decisions */
6483+ gfp &= ~(GFP_DMA32|GFP_DMA);
6484+
6485+ if (dma_ops->alloc_coherent)
6486+ return dma_ops->alloc_coherent(dev, size,
6487+ dma_handle, gfp);
6488+ return NULL;
6489+ }
6490+
6491+ memset(memory, 0, size);
6492+ if (!mmu) {
6493+ *dma_handle = bus;
6494+ return memory;
6495+ }
6496+ }
6497+
6498+ if (dma_ops->alloc_coherent) {
6499+ free_pages((unsigned long)memory, order);
6500+ gfp &= ~(GFP_DMA|GFP_DMA32);
6501+ return dma_ops->alloc_coherent(dev, size, dma_handle, gfp);
6502+ }
6503+
6504+ if (dma_ops->map_simple) {
6505+ *dma_handle = dma_ops->map_simple(dev, virt_to_bus(memory),
6506+ size,
6507+ PCI_DMA_BIDIRECTIONAL);
6508+ if (*dma_handle != bad_dma_address)
6509+ return memory;
6510+ }
6511+#else
6512+ memory = page_address(page);
6513+ if (xen_create_contiguous_region((unsigned long)memory, order,
6514+ fls64(dma_mask)) == 0) {
6515+ memset(memory, 0, size);
6516+ *dma_handle = virt_to_bus(memory);
6517+ return memory;
6518+ }
6519+#endif
6520+
6521+ if (panic_on_overflow)
6522+ panic("dma_alloc_coherent: IOMMU overflow by %lu bytes\n",
6523+ (unsigned long)size);
6524+ free_pages((unsigned long)memory, order);
6525+ return NULL;
6526 }
6527-EXPORT_SYMBOL(dma_unmap_single);
6528+EXPORT_SYMBOL(dma_alloc_coherent);
6529
6530-void
6531-dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size,
6532- enum dma_data_direction direction)
6533+/*
6534+ * Unmap coherent memory.
6535+ * The caller must ensure that the device has finished accessing the mapping.
6536+ */
6537+void dma_free_coherent(struct device *dev, size_t size,
6538+ void *vaddr, dma_addr_t bus)
6539 {
6540- if (swiotlb)
6541- swiotlb_sync_single_for_cpu(dev, dma_handle, size, direction);
6542+ int order = get_order(size);
6543+ WARN_ON(irqs_disabled()); /* for portability */
6544+ if (dma_release_coherent(dev, order, vaddr))
6545+ return;
6546+#ifndef CONFIG_XEN
6547+ if (dma_ops->unmap_single)
6548+ dma_ops->unmap_single(dev, bus, size, 0);
6549+#endif
6550+ xen_destroy_contiguous_region((unsigned long)vaddr, order);
6551+ free_pages((unsigned long)vaddr, order);
6552 }
6553-EXPORT_SYMBOL(dma_sync_single_for_cpu);
6554+EXPORT_SYMBOL(dma_free_coherent);
6555
6556-void
6557-dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size,
6558- enum dma_data_direction direction)
6559+static int __init pci_iommu_init(void)
6560 {
6561- if (swiotlb)
6562- swiotlb_sync_single_for_device(dev, dma_handle, size, direction);
6563+#ifdef CONFIG_CALGARY_IOMMU
6564+ calgary_iommu_init();
6565+#endif
6566+
6567+ intel_iommu_init();
6568+
6569+#ifdef CONFIG_GART_IOMMU
6570+ gart_iommu_init();
6571+#endif
6572+
6573+ no_iommu_init();
6574+ return 0;
6575 }
6576-EXPORT_SYMBOL(dma_sync_single_for_device);
6577
6578-void
6579-dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems,
6580- enum dma_data_direction direction)
6581+void pci_iommu_shutdown(void)
6582 {
6583- if (swiotlb)
6584- swiotlb_sync_sg_for_cpu(dev,sg,nelems,direction);
6585- flush_write_buffers();
6586+ gart_iommu_shutdown();
6587 }
6588-EXPORT_SYMBOL(dma_sync_sg_for_cpu);
6589+/* Must execute after PCI subsystem */
6590+fs_initcall(pci_iommu_init);
6591+
6592+#ifdef CONFIG_PCI
6593+/* Many VIA bridges seem to corrupt data for DAC. Disable it here */
6594
6595-void
6596-dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems,
6597- enum dma_data_direction direction)
6598+static __devinit void via_no_dac(struct pci_dev *dev)
6599 {
6600- if (swiotlb)
6601- swiotlb_sync_sg_for_device(dev,sg,nelems,direction);
6602- flush_write_buffers();
6603+ if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) {
6604+ printk(KERN_INFO "PCI: VIA PCI bridge detected."
6605+ "Disabling DAC.\n");
6606+ forbid_dac = 1;
6607+ }
6608 }
6609-EXPORT_SYMBOL(dma_sync_sg_for_device);
6610+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac);
6611+#endif
6612--- /dev/null 1970-01-01 00:00:00.000000000 +0000
6613+++ sle11-2009-05-14/arch/x86/kernel/pci-nommu-xen.c 2009-03-16 16:38:05.000000000 +0100
6614@@ -0,0 +1,103 @@
6615+#include <linux/dma-mapping.h>
6616+#include <linux/dmar.h>
6617+#include <linux/bootmem.h>
6618+#include <linux/pci.h>
6619+
6620+#include <xen/gnttab.h>
6621+
6622+#include <asm/proto.h>
6623+#include <asm/dma.h>
6624+#include <asm/swiotlb.h>
6625+#include <asm/tlbflush.h>
6626+#include <asm/gnttab_dma.h>
6627+#include <asm/bug.h>
6628+
6629+#define IOMMU_BUG_ON(test) \
6630+do { \
6631+ if (unlikely(test)) { \
6632+ printk(KERN_ALERT "Fatal DMA error! " \
6633+ "Please use 'swiotlb=force'\n"); \
6634+ BUG(); \
6635+ } \
6636+} while (0)
6637+
6638+static int
6639+gnttab_map_sg(struct device *hwdev, struct scatterlist *sgl, int nents,
6640+ int direction)
6641+{
6642+ unsigned int i;
6643+ struct scatterlist *sg;
6644+
6645+ WARN_ON(nents == 0 || sgl->length == 0);
6646+
6647+ for_each_sg(sgl, sg, nents, i) {
6648+ BUG_ON(!sg_page(sg));
6649+ sg->dma_address =
6650+ gnttab_dma_map_page(sg_page(sg)) + sg->offset;
6651+ sg->dma_length = sg->length;
6652+ IOMMU_BUG_ON(address_needs_mapping(
6653+ hwdev, sg->dma_address));
6654+ IOMMU_BUG_ON(range_straddles_page_boundary(
6655+ page_to_pseudophys(sg_page(sg)) + sg->offset,
6656+ sg->length));
6657+ }
6658+
6659+ return nents;
6660+}
6661+
6662+static void
6663+gnttab_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nents,
6664+ int direction)
6665+{
6666+ unsigned int i;
6667+ struct scatterlist *sg;
6668+
6669+ for_each_sg(sgl, sg, nents, i)
6670+ gnttab_dma_unmap_page(sg->dma_address);
6671+}
6672+
6673+static dma_addr_t
6674+gnttab_map_single(struct device *dev, phys_addr_t paddr, size_t size,
6675+ int direction)
6676+{
6677+ dma_addr_t dma;
6678+
6679+ WARN_ON(size == 0);
6680+
6681+ dma = gnttab_dma_map_page(pfn_to_page(paddr >> PAGE_SHIFT)) +
6682+ offset_in_page(paddr);
6683+ IOMMU_BUG_ON(range_straddles_page_boundary(paddr, size));
6684+ IOMMU_BUG_ON(address_needs_mapping(dev, dma));
6685+
6686+ return dma;
6687+}
6688+
6689+static void
6690+gnttab_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
6691+ int direction)
6692+{
6693+ gnttab_dma_unmap_page(dma_addr);
6694+}
6695+
6696+static int nommu_mapping_error(dma_addr_t dma_addr)
6697+{
6698+ return (dma_addr == bad_dma_address);
6699+}
6700+
6701+static const struct dma_mapping_ops nommu_dma_ops = {
6702+ .map_single = gnttab_map_single,
6703+ .unmap_single = gnttab_unmap_single,
6704+ .map_sg = gnttab_map_sg,
6705+ .unmap_sg = gnttab_unmap_sg,
6706+ .dma_supported = swiotlb_dma_supported,
6707+ .mapping_error = nommu_mapping_error
6708+};
6709+
6710+void __init no_iommu_init(void)
6711+{
6712+ if (dma_ops)
6713+ return;
6714+
6715+ force_iommu = 0; /* no HW IOMMU */
6716+ dma_ops = &nommu_dma_ops;
6717+}
6718--- /dev/null 1970-01-01 00:00:00.000000000 +0000
6719+++ sle11-2009-05-14/arch/x86/kernel/process-xen.c 2009-03-16 16:38:05.000000000 +0100
6720@@ -0,0 +1,188 @@
6721+#include <linux/errno.h>
6722+#include <linux/kernel.h>
6723+#include <linux/mm.h>
6724+#include <linux/smp.h>
6725+#include <linux/slab.h>
6726+#include <linux/sched.h>
6727+#include <linux/module.h>
6728+#include <linux/pm.h>
6729+
6730+struct kmem_cache *task_xstate_cachep;
6731+
6732+int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
6733+{
6734+ *dst = *src;
6735+ if (src->thread.xstate) {
6736+ dst->thread.xstate = kmem_cache_alloc(task_xstate_cachep,
6737+ GFP_KERNEL);
6738+ if (!dst->thread.xstate)
6739+ return -ENOMEM;
6740+ WARN_ON((unsigned long)dst->thread.xstate & 15);
6741+ memcpy(dst->thread.xstate, src->thread.xstate, xstate_size);
6742+ }
6743+ return 0;
6744+}
6745+
6746+void free_thread_xstate(struct task_struct *tsk)
6747+{
6748+ if (tsk->thread.xstate) {
6749+ kmem_cache_free(task_xstate_cachep, tsk->thread.xstate);
6750+ tsk->thread.xstate = NULL;
6751+ }
6752+}
6753+
6754+void free_thread_info(struct thread_info *ti)
6755+{
6756+ free_thread_xstate(ti->task);
6757+ free_pages((unsigned long)ti, get_order(THREAD_SIZE));
6758+}
6759+
6760+void arch_task_cache_init(void)
6761+{
6762+ task_xstate_cachep =
6763+ kmem_cache_create("task_xstate", xstate_size,
6764+ __alignof__(union thread_xstate),
6765+ SLAB_PANIC, NULL);
6766+}
6767+
6768+static void do_nothing(void *unused)
6769+{
6770+}
6771+
6772+/*
6773+ * cpu_idle_wait - Used to ensure that all the CPUs discard old value of
6774+ * pm_idle and update to new pm_idle value. Required while changing pm_idle
6775+ * handler on SMP systems.
6776+ *
6777+ * Caller must have changed pm_idle to the new value before the call. Old
6778+ * pm_idle value will not be used by any CPU after the return of this function.
6779+ */
6780+void cpu_idle_wait(void)
6781+{
6782+ smp_mb();
6783+ /* kick all the CPUs so that they exit out of pm_idle */
6784+ smp_call_function(do_nothing, NULL, 0, 1);
6785+}
6786+EXPORT_SYMBOL_GPL(cpu_idle_wait);
6787+
6788+#ifndef CONFIG_XEN
6789+/*
6790+ * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
6791+ * which can obviate IPI to trigger checking of need_resched.
6792+ * We execute MONITOR against need_resched and enter optimized wait state
6793+ * through MWAIT. Whenever someone changes need_resched, we would be woken
6794+ * up from MWAIT (without an IPI).
6795+ *
6796+ * New with Core Duo processors, MWAIT can take some hints based on CPU
6797+ * capability.
6798+ */
6799+void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
6800+{
6801+ if (!need_resched()) {
6802+ __monitor((void *)&current_thread_info()->flags, 0, 0);
6803+ smp_mb();
6804+ if (!need_resched())
6805+ __mwait(ax, cx);
6806+ }
6807+}
6808+
6809+/* Default MONITOR/MWAIT with no hints, used for default C1 state */
6810+static void mwait_idle(void)
6811+{
6812+ if (!need_resched()) {
6813+ __monitor((void *)&current_thread_info()->flags, 0, 0);
6814+ smp_mb();
6815+ if (!need_resched())
6816+ __sti_mwait(0, 0);
6817+ else
6818+ local_irq_enable();
6819+ } else
6820+ local_irq_enable();
6821+}
6822+#endif
6823+
6824+/*
6825+ * On SMP it's slightly faster (but much more power-consuming!)
6826+ * to poll the ->work.need_resched flag instead of waiting for the
6827+ * cross-CPU IPI to arrive. Use this option with caution.
6828+ */
6829+static void poll_idle(void)
6830+{
6831+ local_irq_enable();
6832+ cpu_relax();
6833+}
6834+
6835+#ifndef CONFIG_XEN
6836+/*
6837+ * mwait selection logic:
6838+ *
6839+ * It depends on the CPU. For AMD CPUs that support MWAIT this is
6840+ * wrong. Family 0x10 and 0x11 CPUs will enter C1 on HLT. Powersavings
6841+ * then depend on a clock divisor and current Pstate of the core. If
6842+ * all cores of a processor are in halt state (C1) the processor can
6843+ * enter the C1E (C1 enhanced) state. If mwait is used this will never
6844+ * happen.
6845+ *
6846+ * idle=mwait overrides this decision and forces the usage of mwait.
6847+ */
6848+static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
6849+{
6850+ if (force_mwait)
6851+ return 1;
6852+
6853+ if (c->x86_vendor == X86_VENDOR_AMD) {
6854+ switch(c->x86) {
6855+ case 0x10:
6856+ case 0x11:
6857+ return 0;
6858+ }
6859+ }
6860+ return 1;
6861+}
6862+#endif
6863+
6864+void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
6865+{
6866+#ifndef CONFIG_XEN
6867+ static int selected;
6868+
6869+ if (selected)
6870+ return;
6871+#ifdef CONFIG_X86_SMP
6872+ if (pm_idle == poll_idle && smp_num_siblings > 1) {
6873+ printk(KERN_WARNING "WARNING: polling idle and HT enabled,"
6874+ " performance may degrade.\n");
6875+ }
6876+#endif
6877+ if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) {
6878+ /*
6879+ * Skip, if setup has overridden idle.
6880+ * One CPU supports mwait => All CPUs supports mwait
6881+ */
6882+ if (!pm_idle) {
6883+ printk(KERN_INFO "using mwait in idle threads.\n");
6884+ pm_idle = mwait_idle;
6885+ }
6886+ }
6887+ selected = 1;
6888+#endif
6889+}
6890+
6891+static int __init idle_setup(char *str)
6892+{
6893+ if (!strcmp(str, "poll")) {
6894+ printk("using polling idle threads.\n");
6895+ pm_idle = poll_idle;
6896+ }
6897+#ifndef CONFIG_XEN
6898+ else if (!strcmp(str, "mwait"))
6899+ force_mwait = 1;
6900+#endif
6901+ else
6902+ return -1;
6903+
6904+ boot_option_idle_override = 1;
6905+ return 0;
6906+}
6907+early_param("idle", idle_setup);
6908+
6909--- sle11-2009-05-14.orig/arch/x86/kernel/process_32-xen.c 2009-03-16 16:33:40.000000000 +0100
6910+++ sle11-2009-05-14/arch/x86/kernel/process_32-xen.c 2009-03-16 16:38:05.000000000 +0100
6911@@ -36,6 +36,7 @@
6912 #include <linux/personality.h>
6913 #include <linux/tick.h>
6914 #include <linux/percpu.h>
6915+#include <linux/prctl.h>
6916
6917 #include <asm/uaccess.h>
6918 #include <asm/pgtable.h>
6919@@ -45,7 +46,6 @@
6920 #include <asm/processor.h>
6921 #include <asm/i387.h>
6922 #include <asm/desc.h>
6923-#include <asm/vm86.h>
6924 #ifdef CONFIG_MATH_EMULATION
6925 #include <asm/math_emu.h>
6926 #endif
6927@@ -102,16 +102,6 @@ void enable_hlt(void)
6928
6929 EXPORT_SYMBOL(enable_hlt);
6930
6931-/*
6932- * On SMP it's slightly faster (but much more power-consuming!)
6933- * to poll the ->work.need_resched flag instead of waiting for the
6934- * cross-CPU IPI to arrive. Use this option with caution.
6935- */
6936-static void poll_idle(void)
6937-{
6938- cpu_relax();
6939-}
6940-
6941 static void xen_idle(void)
6942 {
6943 current_thread_info()->status &= ~TS_POLLING;
6944@@ -121,20 +111,10 @@ static void xen_idle(void)
6945 */
6946 smp_mb();
6947
6948- local_irq_disable();
6949- if (!need_resched()) {
6950- ktime_t t0, t1;
6951- u64 t0n, t1n;
6952-
6953- t0 = ktime_get();
6954- t0n = ktime_to_ns(t0);
6955+ if (!need_resched())
6956 safe_halt(); /* enables interrupts racelessly */
6957- local_irq_disable();
6958- t1 = ktime_get();
6959- t1n = ktime_to_ns(t1);
6960- sched_clock_idle_wakeup_event(t1n - t0n);
6961- }
6962- local_irq_enable();
6963+ else
6964+ local_irq_enable();
6965 current_thread_info()->status |= TS_POLLING;
6966 }
6967 #ifdef CONFIG_APM_MODULE
6968@@ -142,7 +122,6 @@ EXPORT_SYMBOL(default_idle);
6969 #endif
6970
6971 #ifdef CONFIG_HOTPLUG_CPU
6972-extern cpumask_t cpu_initialized;
6973 static inline void play_dead(void)
6974 {
6975 idle_task_exit();
6976@@ -187,6 +166,7 @@ void cpu_idle(void)
6977 if (cpu_is_offline(cpu))
6978 play_dead();
6979
6980+ local_irq_disable();
6981 __get_cpu_var(irq_stat).idle_timestamp = jiffies;
6982 idle();
6983 }
6984@@ -197,44 +177,6 @@ void cpu_idle(void)
6985 }
6986 }
6987
6988-static void do_nothing(void *unused)
6989-{
6990-}
6991-
6992-/*
6993- * cpu_idle_wait - Used to ensure that all the CPUs discard old value of
6994- * pm_idle and update to new pm_idle value. Required while changing pm_idle
6995- * handler on SMP systems.
6996- *
6997- * Caller must have changed pm_idle to the new value before the call. Old
6998- * pm_idle value will not be used by any CPU after the return of this function.
6999- */
7000-void cpu_idle_wait(void)
7001-{
7002- smp_mb();
7003- /* kick all the CPUs so that they exit out of pm_idle */
7004- smp_call_function(do_nothing, NULL, 0, 1);
7005-}
7006-EXPORT_SYMBOL_GPL(cpu_idle_wait);
7007-
7008-void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
7009-{
7010-}
7011-
7012-static int __init idle_setup(char *str)
7013-{
7014- if (!strcmp(str, "poll")) {
7015- printk("using polling idle threads.\n");
7016- pm_idle = poll_idle;
7017- }
7018- else
7019- return -1;
7020-
7021- boot_option_idle_override = 1;
7022- return 0;
7023-}
7024-early_param("idle", idle_setup);
7025-
7026 void __show_registers(struct pt_regs *regs, int all)
7027 {
7028 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
7029@@ -260,7 +202,7 @@ void __show_registers(struct pt_regs *re
7030 init_utsname()->version);
7031
7032 printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n",
7033- 0xffff & regs->cs, regs->ip, regs->flags,
7034+ (u16)regs->cs, regs->ip, regs->flags,
7035 smp_processor_id());
7036 print_symbol("EIP is at %s\n", regs->ip);
7037
7038@@ -269,8 +211,7 @@ void __show_registers(struct pt_regs *re
7039 printk("ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n",
7040 regs->si, regs->di, regs->bp, sp);
7041 printk(" DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n",
7042- regs->ds & 0xffff, regs->es & 0xffff,
7043- regs->fs & 0xffff, gs, ss);
7044+ (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss);
7045
7046 if (!all)
7047 return;
7048@@ -367,6 +308,7 @@ void flush_thread(void)
7049 /*
7050 * Forget coprocessor state..
7051 */
7052+ tsk->fpu_counter = 0;
7053 clear_fpu(tsk);
7054 clear_used_math();
7055 }
7056@@ -437,11 +379,30 @@ int copy_thread(int nr, unsigned long cl
7057 return err;
7058 }
7059
7060-#ifdef CONFIG_SECCOMP
7061+void
7062+start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
7063+{
7064+ __asm__("movl %0, %%gs" :: "r"(0));
7065+ regs->fs = 0;
7066+ set_fs(USER_DS);
7067+ regs->ds = __USER_DS;
7068+ regs->es = __USER_DS;
7069+ regs->ss = __USER_DS;
7070+ regs->cs = __USER_CS;
7071+ regs->ip = new_ip;
7072+ regs->sp = new_sp;
7073+ /*
7074+ * Free the old FP and other extended state
7075+ */
7076+ free_thread_xstate(current);
7077+}
7078+EXPORT_SYMBOL_GPL(start_thread);
7079+
7080 static void hard_disable_TSC(void)
7081 {
7082 write_cr4(read_cr4() | X86_CR4_TSD);
7083 }
7084+
7085 void disable_TSC(void)
7086 {
7087 preempt_disable();
7088@@ -453,11 +414,47 @@ void disable_TSC(void)
7089 hard_disable_TSC();
7090 preempt_enable();
7091 }
7092+
7093 static void hard_enable_TSC(void)
7094 {
7095 write_cr4(read_cr4() & ~X86_CR4_TSD);
7096 }
7097-#endif /* CONFIG_SECCOMP */
7098+
7099+static void enable_TSC(void)
7100+{
7101+ preempt_disable();
7102+ if (test_and_clear_thread_flag(TIF_NOTSC))
7103+ /*
7104+ * Must flip the CPU state synchronously with
7105+ * TIF_NOTSC in the current running context.
7106+ */
7107+ hard_enable_TSC();
7108+ preempt_enable();
7109+}
7110+
7111+int get_tsc_mode(unsigned long adr)
7112+{
7113+ unsigned int val;
7114+
7115+ if (test_thread_flag(TIF_NOTSC))
7116+ val = PR_TSC_SIGSEGV;
7117+ else
7118+ val = PR_TSC_ENABLE;
7119+
7120+ return put_user(val, (unsigned int __user *)adr);
7121+}
7122+
7123+int set_tsc_mode(unsigned int val)
7124+{
7125+ if (val == PR_TSC_SIGSEGV)
7126+ disable_TSC();
7127+ else if (val == PR_TSC_ENABLE)
7128+ enable_TSC();
7129+ else
7130+ return -EINVAL;
7131+
7132+ return 0;
7133+}
7134
7135 static noinline void
7136 __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p)
7137@@ -473,12 +470,12 @@ __switch_to_xtra(struct task_struct *pre
7138 /* we clear debugctl to make sure DS
7139 * is not in use when we change it */
7140 debugctl = 0;
7141- wrmsrl(MSR_IA32_DEBUGCTLMSR, 0);
7142+ update_debugctlmsr(0);
7143 wrmsr(MSR_IA32_DS_AREA, next->ds_area_msr, 0);
7144 }
7145
7146 if (next->debugctlmsr != debugctl)
7147- wrmsr(MSR_IA32_DEBUGCTLMSR, next->debugctlmsr, 0);
7148+ update_debugctlmsr(next->debugctlmsr);
7149
7150 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
7151 set_debugreg(next->debugreg0, 0);
7152@@ -490,7 +487,6 @@ __switch_to_xtra(struct task_struct *pre
7153 set_debugreg(next->debugreg7, 7);
7154 }
7155
7156-#ifdef CONFIG_SECCOMP
7157 if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
7158 test_tsk_thread_flag(next_p, TIF_NOTSC)) {
7159 /* prev and next are different */
7160@@ -499,7 +495,6 @@ __switch_to_xtra(struct task_struct *pre
7161 else
7162 hard_enable_TSC();
7163 }
7164-#endif
7165
7166 #ifdef X86_BTS
7167 if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
7168@@ -637,7 +632,7 @@ struct task_struct * __switch_to(struct
7169
7170 /* we're going to use this soon, after a few expensive things */
7171 if (next_p->fpu_counter > 5)
7172- prefetch(&next->i387.fxsave);
7173+ prefetch(next->xstate);
7174
7175 /*
7176 * Now maybe handle debug registers
7177@@ -658,8 +653,11 @@ struct task_struct * __switch_to(struct
7178 /* If the task has used fpu the last 5 timeslices, just do a full
7179 * restore of the math state immediately to avoid the trap; the
7180 * chances of needing FPU soon are obviously high now
7181+ *
7182+ * tsk_used_math() checks prevent calling math_state_restore(),
7183+ * which can sleep in the case of !tsk_used_math()
7184 */
7185- if (next_p->fpu_counter > 5)
7186+ if (tsk_used_math(next_p) && next_p->fpu_counter > 5)
7187 math_state_restore();
7188
7189 /*
7190--- sle11-2009-05-14.orig/arch/x86/kernel/process_64-xen.c 2009-03-16 16:33:40.000000000 +0100
7191+++ sle11-2009-05-14/arch/x86/kernel/process_64-xen.c 2009-03-16 16:38:05.000000000 +0100
7192@@ -39,6 +39,7 @@
7193 #include <linux/kprobes.h>
7194 #include <linux/kdebug.h>
7195 #include <linux/tick.h>
7196+#include <linux/prctl.h>
7197
7198 #include <asm/uaccess.h>
7199 #include <asm/pgtable.h>
7200@@ -102,17 +103,6 @@ void exit_idle(void)
7201 __exit_idle();
7202 }
7203
7204-/*
7205- * On SMP it's slightly faster (but much more power-consuming!)
7206- * to poll the ->need_resched flag instead of waiting for the
7207- * cross-CPU IPI to arrive. Use this option with caution.
7208- */
7209-static void poll_idle(void)
7210-{
7211- local_irq_enable();
7212- cpu_relax();
7213-}
7214-
7215 static void xen_idle(void)
7216 {
7217 current_thread_info()->status &= ~TS_POLLING;
7218@@ -121,20 +111,10 @@ static void xen_idle(void)
7219 * test NEED_RESCHED:
7220 */
7221 smp_mb();
7222- local_irq_disable();
7223- if (!need_resched()) {
7224- ktime_t t0, t1;
7225- u64 t0n, t1n;
7226-
7227- t0 = ktime_get();
7228- t0n = ktime_to_ns(t0);
7229+ if (!need_resched())
7230 safe_halt(); /* enables interrupts racelessly */
7231- local_irq_disable();
7232- t1 = ktime_get();
7233- t1n = ktime_to_ns(t1);
7234- sched_clock_idle_wakeup_event(t1n - t0n);
7235- }
7236- local_irq_enable();
7237+ else
7238+ local_irq_enable();
7239 current_thread_info()->status |= TS_POLLING;
7240 }
7241
7242@@ -195,45 +175,6 @@ void cpu_idle(void)
7243 }
7244 }
7245
7246-static void do_nothing(void *unused)
7247-{
7248-}
7249-
7250-/*
7251- * cpu_idle_wait - Used to ensure that all the CPUs discard old value of
7252- * pm_idle and update to new pm_idle value. Required while changing pm_idle
7253- * handler on SMP systems.
7254- *
7255- * Caller must have changed pm_idle to the new value before the call. Old
7256- * pm_idle value will not be used by any CPU after the return of this function.
7257- */
7258-void cpu_idle_wait(void)
7259-{
7260- smp_mb();
7261- /* kick all the CPUs so that they exit out of pm_idle */
7262- smp_call_function(do_nothing, NULL, 0, 1);
7263-}
7264-EXPORT_SYMBOL_GPL(cpu_idle_wait);
7265-
7266-void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
7267-{
7268-}
7269-
7270-static int __init idle_setup(char *str)
7271-{
7272- if (!strcmp(str, "poll")) {
7273- printk("using polling idle threads.\n");
7274- pm_idle = poll_idle;
7275- } else if (!strcmp(str, "mwait"))
7276- force_mwait = 1;
7277- else
7278- return -1;
7279-
7280- boot_option_idle_override = 1;
7281- return 0;
7282-}
7283-early_param("idle", idle_setup);
7284-
7285 /* Prints also some state that isn't saved in the pt_regs */
7286 void __show_regs(struct pt_regs * regs)
7287 {
7288@@ -360,6 +301,7 @@ void flush_thread(void)
7289 /*
7290 * Forget coprocessor state..
7291 */
7292+ tsk->fpu_counter = 0;
7293 clear_fpu(tsk);
7294 clear_used_math();
7295 }
7296@@ -472,6 +414,83 @@ out:
7297 return err;
7298 }
7299
7300+void
7301+start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
7302+{
7303+ asm volatile("movl %0, %%fs; movl %0, %%es; movl %0, %%ds" :: "r"(0));
7304+ load_gs_index(0);
7305+ regs->ip = new_ip;
7306+ regs->sp = new_sp;
7307+ write_pda(oldrsp, new_sp);
7308+ regs->cs = __USER_CS;
7309+ regs->ss = __USER_DS;
7310+ regs->flags = 0x200;
7311+ set_fs(USER_DS);
7312+ /*
7313+ * Free the old FP and other extended state
7314+ */
7315+ free_thread_xstate(current);
7316+}
7317+EXPORT_SYMBOL_GPL(start_thread);
7318+
7319+static void hard_disable_TSC(void)
7320+{
7321+ write_cr4(read_cr4() | X86_CR4_TSD);
7322+}
7323+
7324+void disable_TSC(void)
7325+{
7326+ preempt_disable();
7327+ if (!test_and_set_thread_flag(TIF_NOTSC))
7328+ /*
7329+ * Must flip the CPU state synchronously with
7330+ * TIF_NOTSC in the current running context.
7331+ */
7332+ hard_disable_TSC();
7333+ preempt_enable();
7334+}
7335+
7336+static void hard_enable_TSC(void)
7337+{
7338+ write_cr4(read_cr4() & ~X86_CR4_TSD);
7339+}
7340+
7341+static void enable_TSC(void)
7342+{
7343+ preempt_disable();
7344+ if (test_and_clear_thread_flag(TIF_NOTSC))
7345+ /*
7346+ * Must flip the CPU state synchronously with
7347+ * TIF_NOTSC in the current running context.
7348+ */
7349+ hard_enable_TSC();
7350+ preempt_enable();
7351+}
7352+
7353+int get_tsc_mode(unsigned long adr)
7354+{
7355+ unsigned int val;
7356+
7357+ if (test_thread_flag(TIF_NOTSC))
7358+ val = PR_TSC_SIGSEGV;
7359+ else
7360+ val = PR_TSC_ENABLE;
7361+
7362+ return put_user(val, (unsigned int __user *)adr);
7363+}
7364+
7365+int set_tsc_mode(unsigned int val)
7366+{
7367+ if (val == PR_TSC_SIGSEGV)
7368+ disable_TSC();
7369+ else if (val == PR_TSC_ENABLE)
7370+ enable_TSC();
7371+ else
7372+ return -EINVAL;
7373+
7374+ return 0;
7375+}
7376+
7377 /*
7378 * This special macro can be used to load a debugging register
7379 */
7380@@ -491,12 +510,12 @@ static inline void __switch_to_xtra(stru
7381 /* we clear debugctl to make sure DS
7382 * is not in use when we change it */
7383 debugctl = 0;
7384- wrmsrl(MSR_IA32_DEBUGCTLMSR, 0);
7385+ update_debugctlmsr(0);
7386 wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr);
7387 }
7388
7389 if (next->debugctlmsr != debugctl)
7390- wrmsrl(MSR_IA32_DEBUGCTLMSR, next->debugctlmsr);
7391+ update_debugctlmsr(next->debugctlmsr);
7392
7393 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
7394 loaddebug(next, 0);
7395@@ -508,6 +527,15 @@ static inline void __switch_to_xtra(stru
7396 loaddebug(next, 7);
7397 }
7398
7399+ if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
7400+ test_tsk_thread_flag(next_p, TIF_NOTSC)) {
7401+ /* prev and next are different */
7402+ if (test_tsk_thread_flag(next_p, TIF_NOTSC))
7403+ hard_disable_TSC();
7404+ else
7405+ hard_enable_TSC();
7406+ }
7407+
7408 #ifdef X86_BTS
7409 if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
7410 ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
7411@@ -547,7 +575,7 @@ __switch_to(struct task_struct *prev_p,
7412
7413 /* we're going to use this soon, after a few expensive things */
7414 if (next_p->fpu_counter>5)
7415- prefetch(&next->i387.fxsave);
7416+ prefetch(next->xstate);
7417
7418 /*
7419 * This is basically '__unlazy_fpu', except that we queue a
7420@@ -680,8 +708,11 @@ __switch_to(struct task_struct *prev_p,
7421 /* If the task has used fpu the last 5 timeslices, just do a full
7422 * restore of the math state immediately to avoid the trap; the
7423 * chances of needing FPU soon are obviously high now
7424+ *
7425+ * tsk_used_math() checks prevent calling math_state_restore(),
7426+ * which can sleep in the case of !tsk_used_math()
7427 */
7428- if (next_p->fpu_counter>5)
7429+ if (tsk_used_math(next_p) && next_p->fpu_counter > 5)
7430 math_state_restore();
7431 return prev_p;
7432 }
7433--- /dev/null 1970-01-01 00:00:00.000000000 +0000
7434+++ sle11-2009-05-14/arch/x86/kernel/setup-xen.c 2009-03-16 16:38:05.000000000 +0100
7435@@ -0,0 +1,141 @@
7436+#include <linux/kernel.h>
7437+#include <linux/module.h>
7438+#include <linux/init.h>
7439+#include <linux/bootmem.h>
7440+#include <linux/percpu.h>
7441+#include <asm/smp.h>
7442+#include <asm/percpu.h>
7443+#include <asm/sections.h>
7444+#include <asm/processor.h>
7445+#include <asm/setup.h>
7446+#include <asm/topology.h>
7447+#include <asm/mpspec.h>
7448+#include <asm/apicdef.h>
7449+
7450+#ifdef CONFIG_X86_LOCAL_APIC
7451+unsigned int num_processors;
7452+unsigned disabled_cpus __cpuinitdata;
7453+/* Processor that is doing the boot up */
7454+unsigned int boot_cpu_physical_apicid = -1U;
7455+EXPORT_SYMBOL(boot_cpu_physical_apicid);
7456+
7457+DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID;
7458+EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid);
7459+
7460+/* Bitmask of physically existing CPUs */
7461+physid_mask_t phys_cpu_present_map;
7462+#endif
7463+
7464+#if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_X86_SMP)
7465+/*
7466+ * Copy data used in early init routines from the initial arrays to the
7467+ * per cpu data areas. These arrays then become expendable and the
7468+ * *_early_ptr's are zeroed indicating that the static arrays are gone.
7469+ */
7470+static void __init setup_per_cpu_maps(void)
7471+{
7472+#ifndef CONFIG_XEN
7473+ int cpu;
7474+
7475+ for_each_possible_cpu(cpu) {
7476+ per_cpu(x86_cpu_to_apicid, cpu) = x86_cpu_to_apicid_init[cpu];
7477+ per_cpu(x86_bios_cpu_apicid, cpu) =
7478+ x86_bios_cpu_apicid_init[cpu];
7479+#ifdef CONFIG_NUMA
7480+ per_cpu(x86_cpu_to_node_map, cpu) =
7481+ x86_cpu_to_node_map_init[cpu];
7482+#endif
7483+ }
7484+
7485+ /* indicate the early static arrays will soon be gone */
7486+ x86_cpu_to_apicid_early_ptr = NULL;
7487+ x86_bios_cpu_apicid_early_ptr = NULL;
7488+#ifdef CONFIG_NUMA
7489+ x86_cpu_to_node_map_early_ptr = NULL;
7490+#endif
7491+#endif
7492+}
7493+
7494+#ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP
7495+cpumask_t *cpumask_of_cpu_map __read_mostly;
7496+EXPORT_SYMBOL(cpumask_of_cpu_map);
7497+
7498+/* requires nr_cpu_ids to be initialized */
7499+static void __init setup_cpumask_of_cpu(void)
7500+{
7501+ int i;
7502+
7503+ /* alloc_bootmem zeroes memory */
7504+ cpumask_of_cpu_map = alloc_bootmem_low(sizeof(cpumask_t) * nr_cpu_ids);
7505+ for (i = 0; i < nr_cpu_ids; i++)
7506+ cpu_set(i, cpumask_of_cpu_map[i]);
7507+}
7508+#else
7509+static inline void setup_cpumask_of_cpu(void) { }
7510+#endif
7511+
7512+#ifdef CONFIG_X86_32
7513+/*
7514+ * Great future not-so-futuristic plan: make i386 and x86_64 do it
7515+ * the same way
7516+ */
7517+unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
7518+EXPORT_SYMBOL(__per_cpu_offset);
7519+#endif
7520+
7521+/*
7522+ * Great future plan:
7523+ * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
7524+ * Always point %gs to its beginning
7525+ */
7526+void __init setup_per_cpu_areas(void)
7527+{
7528+ int i, highest_cpu = 0;
7529+ unsigned long size;
7530+
7531+#ifdef CONFIG_HOTPLUG_CPU
7532+ prefill_possible_map();
7533+#endif
7534+
7535+ /* Copy section for each CPU (we discard the original) */
7536+ size = PERCPU_ENOUGH_ROOM;
7537+ printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n",
7538+ size);
7539+
7540+ for_each_possible_cpu(i) {
7541+ char *ptr;
7542+#ifndef CONFIG_NEED_MULTIPLE_NODES
7543+ ptr = alloc_bootmem_pages(size);
7544+#else
7545+ int node = early_cpu_to_node(i);
7546+ if (!node_online(node) || !NODE_DATA(node)) {
7547+ ptr = alloc_bootmem_pages(size);
7548+ printk(KERN_INFO
7549+ "cpu %d has no node or node-local memory\n", i);
7550+ }
7551+ else
7552+ ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
7553+#endif
7554+ if (!ptr)
7555+ panic("Cannot allocate cpu data for CPU %d\n", i);
7556+#ifdef CONFIG_X86_64
7557+ cpu_pda(i)->data_offset = ptr - __per_cpu_start;
7558+#else
7559+ __per_cpu_offset[i] = ptr - __per_cpu_start;
7560+#endif
7561+ memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
7562+
7563+ highest_cpu = i;
7564+ }
7565+
7566+ nr_cpu_ids = highest_cpu + 1;
7567+ printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d\n", NR_CPUS, nr_cpu_ids);
7568+
7569+ /* Setup percpu data maps */
7570+ setup_per_cpu_maps();
7571+
7572+ /* Setup cpumask_of_cpu map */
7573+ setup_cpumask_of_cpu();
7574+}
7575+
7576+#endif
7577--- sle11-2009-05-14.orig/arch/x86/kernel/setup64-xen.c 2009-03-16 16:33:40.000000000 +0100
7578+++ sle11-2009-05-14/arch/x86/kernel/setup64-xen.c 2009-03-16 16:38:05.000000000 +0100
7579@@ -15,6 +15,7 @@
7580 #include <linux/bootmem.h>
7581 #include <linux/bitops.h>
7582 #include <linux/module.h>
7583+#include <linux/kgdb.h>
7584 #include <asm/pda.h>
7585 #include <asm/pgtable.h>
7586 #include <asm/processor.h>
7587@@ -27,6 +28,7 @@
7588 #include <asm/proto.h>
7589 #include <asm/sections.h>
7590 #include <asm/setup.h>
7591+#include <asm/genapic.h>
7592 #ifdef CONFIG_XEN
7593 #include <asm/hypervisor.h>
7594 #endif
7595@@ -81,8 +83,8 @@ int force_personality32 = 0;
7596 Control non executable heap for 32bit processes.
7597 To control the stack too use noexec=off
7598
7599-on PROT_READ does not imply PROT_EXEC for 32bit processes
7600-off PROT_READ implies PROT_EXEC (default)
7601+on PROT_READ does not imply PROT_EXEC for 32bit processes (default)
7602+off PROT_READ implies PROT_EXEC
7603 */
7604 static int __init nonx32_setup(char *str)
7605 {
7606@@ -94,85 +96,6 @@ static int __init nonx32_setup(char *str
7607 }
7608 __setup("noexec32=", nonx32_setup);
7609
7610-/*
7611- * Copy data used in early init routines from the initial arrays to the
7612- * per cpu data areas. These arrays then become expendable and the
7613- * *_early_ptr's are zeroed indicating that the static arrays are gone.
7614- */
7615-static void __init setup_per_cpu_maps(void)
7616-{
7617-#ifndef CONFIG_XEN
7618- int cpu;
7619-
7620- for_each_possible_cpu(cpu) {
7621-#ifdef CONFIG_SMP
7622- if (per_cpu_offset(cpu)) {
7623-#endif
7624- per_cpu(x86_cpu_to_apicid, cpu) =
7625- x86_cpu_to_apicid_init[cpu];
7626- per_cpu(x86_bios_cpu_apicid, cpu) =
7627- x86_bios_cpu_apicid_init[cpu];
7628-#ifdef CONFIG_NUMA
7629- per_cpu(x86_cpu_to_node_map, cpu) =
7630- x86_cpu_to_node_map_init[cpu];
7631-#endif
7632-#ifdef CONFIG_SMP
7633- }
7634- else
7635- printk(KERN_NOTICE "per_cpu_offset zero for cpu %d\n",
7636- cpu);
7637-#endif
7638- }
7639-
7640- /* indicate the early static arrays will soon be gone */
7641- x86_cpu_to_apicid_early_ptr = NULL;
7642- x86_bios_cpu_apicid_early_ptr = NULL;
7643-#ifdef CONFIG_NUMA
7644- x86_cpu_to_node_map_early_ptr = NULL;
7645-#endif
7646-#endif
7647-}
7648-
7649-/*
7650- * Great future plan:
7651- * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
7652- * Always point %gs to its beginning
7653- */
7654-void __init setup_per_cpu_areas(void)
7655-{
7656- int i;
7657- unsigned long size;
7658-
7659-#ifdef CONFIG_HOTPLUG_CPU
7660- prefill_possible_map();
7661-#endif
7662-
7663- /* Copy section for each CPU (we discard the original) */
7664- size = PERCPU_ENOUGH_ROOM;
7665-
7666- printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n", size);
7667- for_each_cpu_mask (i, cpu_possible_map) {
7668- char *ptr;
7669-#ifndef CONFIG_NEED_MULTIPLE_NODES
7670- ptr = alloc_bootmem_pages(size);
7671-#else
7672- int node = early_cpu_to_node(i);
7673-
7674- if (!node_online(node) || !NODE_DATA(node))
7675- ptr = alloc_bootmem_pages(size);
7676- else
7677- ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
7678-#endif
7679- if (!ptr)
7680- panic("Cannot allocate cpu data for CPU %d\n", i);
7681- cpu_pda(i)->data_offset = ptr - __per_cpu_start;
7682- memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
7683- }
7684-
7685- /* setup percpu data maps early */
7686- setup_per_cpu_maps();
7687-}
7688-
7689 #ifdef CONFIG_XEN
7690 static void __init_refok switch_pt(int cpu)
7691 {
7692@@ -410,6 +333,17 @@ void __cpuinit cpu_init (void)
7693 #endif
7694 load_LDT(&init_mm.context);
7695
7696+#ifdef CONFIG_KGDB
7697+ /*
7698+ * If the kgdb is connected no debug regs should be altered. This
7699+ * is only applicable when KGDB and a KGDB I/O module are built
7700+ * into the kernel and you are using early debugging with
7701+ * kgdbwait. KGDB will control the kernel HW breakpoint registers.
7702+ */
7703+ if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
7704+ arch_kgdb_ops.correct_hw_break();
7705+ else {
7706+#endif
7707 /*
7708 * Clear all 6 debug registers:
7709 */
7710@@ -420,10 +354,17 @@ void __cpuinit cpu_init (void)
7711 set_debugreg(0UL, 3);
7712 set_debugreg(0UL, 6);
7713 set_debugreg(0UL, 7);
7714+#ifdef CONFIG_KGDB
7715+ /* If the kgdb is connected no debug regs should be altered. */
7716+ }
7717+#endif
7718
7719 fpu_init();
7720
7721 asm ("pushfq; popq %0" : "=rm" (kernel_eflags));
7722 if (raw_irqs_disabled())
7723 kernel_eflags &= ~X86_EFLAGS_IF;
7724+
7725+ if (is_uv_system())
7726+ uv_cpu_init();
7727 }
7728--- sle11-2009-05-14.orig/arch/x86/kernel/setup_32-xen.c 2009-03-16 16:33:40.000000000 +0100
7729+++ sle11-2009-05-14/arch/x86/kernel/setup_32-xen.c 2009-03-16 16:38:05.000000000 +0100
7730@@ -39,6 +39,7 @@
7731 #include <linux/efi.h>
7732 #include <linux/init.h>
7733 #include <linux/edd.h>
7734+#include <linux/iscsi_ibft.h>
7735 #include <linux/nodemask.h>
7736 #include <linux/kernel.h>
7737 #include <linux/percpu.h>
7738@@ -49,6 +50,7 @@
7739 #include <linux/pfn.h>
7740 #include <linux/pci.h>
7741 #include <linux/init_ohci1394_dma.h>
7742+#include <linux/kvm_para.h>
7743
7744 #include <video/edid.h>
7745
7746@@ -70,8 +72,9 @@
7747 #include <xen/firmware.h>
7748 #include <xen/xencons.h>
7749 #include <setup_arch.h>
7750-#include <bios_ebda.h>
7751+#include <asm/bios_ebda.h>
7752 #include <asm/cacheflush.h>
7753+#include <asm/processor.h>
7754
7755 #ifdef CONFIG_XEN
7756 #include <xen/interface/kexec.h>
7757@@ -136,7 +139,12 @@ static struct resource standard_io_resou
7758 }, {
7759 .name = "keyboard",
7760 .start = 0x0060,
7761- .end = 0x006f,
7762+ .end = 0x0060,
7763+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
7764+}, {
7765+ .name = "keyboard",
7766+ .start = 0x0064,
7767+ .end = 0x0064,
7768 .flags = IORESOURCE_BUSY | IORESOURCE_IO
7769 }, {
7770 .name = "dma page reg",
7771@@ -166,6 +174,8 @@ struct cpuinfo_x86 new_cpu_data __cpuini
7772 struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
7773 EXPORT_SYMBOL(boot_cpu_data);
7774
7775+unsigned int def_to_bigsmp;
7776+
7777 #ifndef CONFIG_X86_PAE
7778 unsigned long mmu_cr4_features;
7779 #else
7780@@ -204,7 +214,7 @@ EXPORT_SYMBOL(ist_info);
7781 extern void early_cpu_init(void);
7782 extern int root_mountflags;
7783
7784-unsigned long saved_videomode;
7785+unsigned long saved_video_mode;
7786
7787 #define RAMDISK_IMAGE_START_MASK 0x07FF
7788 #define RAMDISK_PROMPT_FLAG 0x8000
7789@@ -259,7 +269,7 @@ static inline void copy_edd(void)
7790 }
7791 #endif
7792
7793-int __initdata user_defined_memmap = 0;
7794+int __initdata user_defined_memmap;
7795
7796 /*
7797 * "mem=nopentium" disables the 4MB page tables.
7798@@ -420,20 +430,59 @@ unsigned long __init find_max_low_pfn(vo
7799 }
7800
7801 #ifndef CONFIG_XEN
7802+#define BIOS_LOWMEM_KILOBYTES 0x413
7803+
7804 /*
7805- * workaround for Dell systems that neglect to reserve EBDA
7806+ * The BIOS places the EBDA/XBDA at the top of conventional
7807+ * memory, and usually decreases the reported amount of
7808+ * conventional memory (int 0x12) too. This also contains a
7809+ * workaround for Dell systems that neglect to reserve EBDA.
7810+ * The same workaround also avoids a problem with the AMD768MPX
7811+ * chipset: reserve a page before VGA to prevent PCI prefetch
7812+ * into it (errata #56). Usually the page is reserved anyways,
7813+ * unless you have no PS/2 mouse plugged in.
7814 */
7815 static void __init reserve_ebda_region(void)
7816 {
7817- unsigned int addr;
7818- addr = get_bios_ebda();
7819- if (addr)
7820- reserve_bootmem(addr, PAGE_SIZE, BOOTMEM_DEFAULT);
7821+ unsigned int lowmem, ebda_addr;
7822+
7823+ /* To determine the position of the EBDA and the */
7824+ /* end of conventional memory, we need to look at */
7825+ /* the BIOS data area. In a paravirtual environment */
7826+ /* that area is absent. We'll just have to assume */
7827+ /* that the paravirt case can handle memory setup */
7828+ /* correctly, without our help. */
7829+ if (paravirt_enabled())
7830+ return;
7831+
7832+ /* end of low (conventional) memory */
7833+ lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
7834+ lowmem <<= 10;
7835+
7836+ /* start of EBDA area */
7837+ ebda_addr = get_bios_ebda();
7838+
7839+ /* Fixup: bios puts an EBDA in the top 64K segment */
7840+ /* of conventional memory, but does not adjust lowmem. */
7841+ if ((lowmem - ebda_addr) <= 0x10000)
7842+ lowmem = ebda_addr;
7843+
7844+ /* Fixup: bios does not report an EBDA at all. */
7845+ /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
7846+ if ((ebda_addr == 0) && (lowmem >= 0x9f000))
7847+ lowmem = 0x9f000;
7848+
7849+ /* Paranoia: should never happen, but... */
7850+ if ((lowmem == 0) || (lowmem >= 0x100000))
7851+ lowmem = 0x9f000;
7852+
7853+ /* reserve all memory between lowmem and the 1MB mark */
7854+ reserve_bootmem(lowmem, 0x100000 - lowmem, BOOTMEM_DEFAULT);
7855 }
7856 #endif
7857
7858 #ifndef CONFIG_NEED_MULTIPLE_NODES
7859-void __init setup_bootmem_allocator(void);
7860+static void __init setup_bootmem_allocator(void);
7861 static unsigned long __init setup_memory(void)
7862 {
7863 /*
7864@@ -469,7 +518,7 @@ static unsigned long __init setup_memory
7865 return max_low_pfn;
7866 }
7867
7868-void __init zone_sizes_init(void)
7869+static void __init zone_sizes_init(void)
7870 {
7871 unsigned long max_zone_pfns[MAX_NR_ZONES];
7872 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
7873@@ -521,10 +570,16 @@ static void __init reserve_crashkernel(v
7874 (unsigned long)(crash_size >> 20),
7875 (unsigned long)(crash_base >> 20),
7876 (unsigned long)(total_mem >> 20));
7877+
7878+ if (reserve_bootmem(crash_base, crash_size,
7879+ BOOTMEM_EXCLUSIVE) < 0) {
7880+ printk(KERN_INFO "crashkernel reservation "
7881+ "failed - memory is in use\n");
7882+ return;
7883+ }
7884+
7885 crashk_res.start = crash_base;
7886 crashk_res.end = crash_base + crash_size - 1;
7887- reserve_bootmem(crash_base, crash_size,
7888- BOOTMEM_DEFAULT);
7889 } else
7890 printk(KERN_INFO "crashkernel reservation failed - "
7891 "you have to specify a base address\n");
7892@@ -658,16 +713,9 @@ void __init setup_bootmem_allocator(void
7893 */
7894 reserve_bootmem(0, PAGE_SIZE, BOOTMEM_DEFAULT);
7895
7896- /* reserve EBDA region, it's a 4K region */
7897+ /* reserve EBDA region */
7898 reserve_ebda_region();
7899
7900- /* could be an AMD 768MPX chipset. Reserve a page before VGA to prevent
7901- PCI prefetch into it (errata #56). Usually the page is reserved anyways,
7902- unless you have no PS/2 mouse plugged in. */
7903- if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
7904- boot_cpu_data.x86 == 6)
7905- reserve_bootmem(0xa0000 - 4096, 4096, BOOTMEM_DEFAULT);
7906-
7907 #ifdef CONFIG_SMP
7908 /*
7909 * But first pinch a few for the stack/trampoline stuff
7910@@ -689,6 +737,8 @@ void __init setup_bootmem_allocator(void
7911 #endif
7912 numa_kva_reserve();
7913 reserve_crashkernel();
7914+
7915+ reserve_ibft_region();
7916 }
7917
7918 /*
7919@@ -724,6 +774,18 @@ char * __init __attribute__((weak)) memo
7920 return machine_specific_memory_setup();
7921 }
7922
7923+#ifdef CONFIG_NUMA
7924+/*
7925+ * In the golden day, when everything among i386 and x86_64 will be
7926+ * integrated, this will not live here
7927+ */
7928+void *x86_cpu_to_node_map_early_ptr;
7929+int x86_cpu_to_node_map_init[NR_CPUS] = {
7930+ [0 ... NR_CPUS-1] = NUMA_NO_NODE
7931+};
7932+DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE;
7933+#endif
7934+
7935 /*
7936 * Determine if we were loaded by an EFI loader. If so, then we have also been
7937 * passed the efi memmap, systab, etc., so we should use these data structures
7938@@ -773,7 +835,7 @@ void __init setup_arch(char **cmdline_p)
7939 copy_edid();
7940 apm_info.bios = boot_params.apm_bios_info;
7941 ist_info = boot_params.ist_info;
7942- saved_videomode = boot_params.hdr.vid_mode;
7943+ saved_video_mode = boot_params.hdr.vid_mode;
7944 if( boot_params.sys_desc_table.length != 0 ) {
7945 set_mca_bus(boot_params.sys_desc_table.table[3] & 0x2);
7946 machine_id = boot_params.sys_desc_table.table[0];
7947@@ -840,15 +902,19 @@ void __init setup_arch(char **cmdline_p)
7948 efi_init();
7949
7950 /* update e820 for memory not covered by WB MTRRs */
7951- find_max_pfn();
7952+ propagate_e820_map();
7953 mtrr_bp_init();
7954 #ifndef CONFIG_XEN
7955 if (mtrr_trim_uncached_memory(max_pfn))
7956- find_max_pfn();
7957+ propagate_e820_map();
7958 #endif
7959
7960 max_low_pfn = setup_memory();
7961
7962+#ifdef CONFIG_KVM_CLOCK
7963+ kvmclock_init();
7964+#endif
7965+
7966 #ifdef CONFIG_VMI
7967 /*
7968 * Must be after max_low_pfn is determined, and before kernel
7969@@ -856,6 +922,7 @@ void __init setup_arch(char **cmdline_p)
7970 */
7971 vmi_init();
7972 #endif
7973+ kvm_guest_init();
7974
7975 /*
7976 * NOTE: before this point _nobody_ is allowed to allocate
7977@@ -977,6 +1044,18 @@ void __init setup_arch(char **cmdline_p)
7978
7979 io_delay_init();
7980
7981+#if defined(CONFIG_X86_SMP) && !defined(CONFIG_XEN)
7982+ /*
7983+ * setup to use the early static init tables during kernel startup
7984+ * X86_SMP will exclude sub-arches that don't deal well with it.
7985+ */
7986+ x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init;
7987+ x86_bios_cpu_apicid_early_ptr = (void *)x86_bios_cpu_apicid_init;
7988+#ifdef CONFIG_NUMA
7989+ x86_cpu_to_node_map_early_ptr = (void *)x86_cpu_to_node_map_init;
7990+#endif
7991+#endif
7992+
7993 #ifdef CONFIG_X86_GENERICARCH
7994 generic_apic_probe();
7995 #endif
7996--- sle11-2009-05-14.orig/arch/x86/kernel/setup_64-xen.c 2009-03-16 16:33:40.000000000 +0100
7997+++ sle11-2009-05-14/arch/x86/kernel/setup_64-xen.c 2009-03-16 16:38:05.000000000 +0100
7998@@ -29,18 +29,22 @@
7999 #include <linux/crash_dump.h>
8000 #include <linux/root_dev.h>
8001 #include <linux/pci.h>
8002+#include <asm/pci-direct.h>
8003 #include <linux/efi.h>
8004 #include <linux/acpi.h>
8005 #include <linux/kallsyms.h>
8006 #include <linux/edd.h>
8007+#include <linux/iscsi_ibft.h>
8008 #include <linux/mmzone.h>
8009 #include <linux/kexec.h>
8010 #include <linux/cpufreq.h>
8011 #include <linux/dmi.h>
8012 #include <linux/dma-mapping.h>
8013 #include <linux/ctype.h>
8014+#include <linux/sort.h>
8015 #include <linux/uaccess.h>
8016 #include <linux/init_ohci1394_dma.h>
8017+#include <linux/kvm_para.h>
8018
8019 #include <asm/mtrr.h>
8020 #include <asm/uaccess.h>
8021@@ -58,7 +62,6 @@
8022 #include <asm/mmu_context.h>
8023 #include <asm/proto.h>
8024 #include <asm/setup.h>
8025-#include <asm/mach_apic.h>
8026 #include <asm/numa.h>
8027 #include <asm/sections.h>
8028 #include <asm/dmi.h>
8029@@ -66,6 +69,9 @@
8030 #include <asm/mce.h>
8031 #include <asm/ds.h>
8032 #include <asm/topology.h>
8033+#include <asm/pat.h>
8034+
8035+#include <mach_apic.h>
8036 #ifdef CONFIG_XEN
8037 #include <linux/percpu.h>
8038 #include <xen/interface/physdev.h>
8039@@ -149,7 +155,7 @@ extern int root_mountflags;
8040
8041 char __initdata command_line[COMMAND_LINE_SIZE];
8042
8043-struct resource standard_io_resources[] = {
8044+static struct resource standard_io_resources[] = {
8045 { .name = "dma1", .start = 0x00, .end = 0x1f,
8046 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
8047 { .name = "pic1", .start = 0x20, .end = 0x21,
8048@@ -158,7 +164,9 @@ struct resource standard_io_resources[]
8049 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
8050 { .name = "timer1", .start = 0x50, .end = 0x53,
8051 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
8052- { .name = "keyboard", .start = 0x60, .end = 0x6f,
8053+ { .name = "keyboard", .start = 0x60, .end = 0x60,
8054+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
8055+ { .name = "keyboard", .start = 0x64, .end = 0x64,
8056 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
8057 { .name = "dma page reg", .start = 0x80, .end = 0x8f,
8058 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
8059@@ -224,8 +232,10 @@ contig_initmem_init(unsigned long start_
8060 e820_register_active_regions(0, start_pfn, end_pfn);
8061 #ifdef CONFIG_XEN
8062 free_bootmem_with_active_regions(0, xen_start_info->nr_pages);
8063+ early_res_to_bootmem(0, xen_start_info->nr_pages<<PAGE_SHIFT);
8064 #else
8065 free_bootmem_with_active_regions(0, end_pfn);
8066+ early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
8067 #endif
8068 reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
8069 }
8070@@ -290,6 +300,7 @@ static void __init reserve_crashkernel(v
8071 (unsigned long)(total_mem >> 20));
8072 crashk_res.start = crash_base;
8073 crashk_res.end = crash_base + crash_size - 1;
8074+ insert_resource(&iomem_resource, &crashk_res);
8075 }
8076 }
8077 #else
8078@@ -306,6 +317,40 @@ void __attribute__((weak)) __init memory
8079 machine_specific_memory_setup();
8080 }
8081
8082+static void __init parse_setup_data(void)
8083+{
8084+ struct setup_data *data;
8085+ unsigned long pa_data;
8086+
8087+ if (boot_params.hdr.version < 0x0209)
8088+ return;
8089+ pa_data = boot_params.hdr.setup_data;
8090+ while (pa_data) {
8091+ data = early_ioremap(pa_data, PAGE_SIZE);
8092+ switch (data->type) {
8093+ default:
8094+ break;
8095+ }
8096+#ifndef CONFIG_DEBUG_BOOT_PARAMS
8097+ free_early(pa_data, pa_data+sizeof(*data)+data->len);
8098+#endif
8099+ pa_data = data->next;
8100+ early_iounmap(data, PAGE_SIZE);
8101+ }
8102+}
8103+
8104+#ifdef CONFIG_PCI_MMCONFIG
8105+extern void __cpuinit fam10h_check_enable_mmcfg(void);
8106+extern void __init check_enable_amd_mmconf_dmi(void);
8107+#else
8108+void __cpuinit fam10h_check_enable_mmcfg(void)
8109+{
8110+}
8111+void __init check_enable_amd_mmconf_dmi(void)
8112+{
8113+}
8114+#endif
8115+
8116 /*
8117 * setup_arch - architecture-specific boot-time initializations
8118 *
8119@@ -389,6 +434,8 @@ void __init setup_arch(char **cmdline_p)
8120 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
8121 *cmdline_p = command_line;
8122
8123+ parse_setup_data();
8124+
8125 parse_early_param();
8126
8127 #ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
8128@@ -398,6 +445,13 @@ void __init setup_arch(char **cmdline_p)
8129
8130 finish_e820_parsing();
8131
8132+#ifndef CONFIG_XEN
8133+ /* after parse_early_param, so could debug it */
8134+ insert_resource(&iomem_resource, &code_resource);
8135+ insert_resource(&iomem_resource, &data_resource);
8136+ insert_resource(&iomem_resource, &bss_resource);
8137+#endif
8138+
8139 early_gart_iommu_check();
8140
8141 e820_register_active_regions(0, 0, -1UL);
8142@@ -420,15 +474,23 @@ void __init setup_arch(char **cmdline_p)
8143
8144 check_efer();
8145
8146- init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT));
8147+ max_pfn_mapped = init_memory_mapping(0, (max_pfn_mapped << PAGE_SHIFT));
8148 if (efi_enabled)
8149 efi_init();
8150
8151+#ifndef CONFIG_XEN
8152+ vsmp_init();
8153+#endif
8154+
8155 if (is_initial_xendomain())
8156 dmi_scan_machine();
8157
8158 io_delay_init();
8159
8160+#ifdef CONFIG_KVM_CLOCK
8161+ kvmclock_init();
8162+#endif
8163+
8164 #if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
8165 /* setup to use the early static init tables during kernel startup */
8166 x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init;
8167@@ -459,9 +521,9 @@ void __init setup_arch(char **cmdline_p)
8168 contig_initmem_init(0, end_pfn);
8169 #endif
8170
8171- early_res_to_bootmem();
8172-
8173 #ifndef CONFIG_XEN
8174+ dma32_reserve_bootmem();
8175+
8176 #ifdef CONFIG_ACPI_SLEEP
8177 /*
8178 * Reserve low memory region for sleep support.
8179@@ -487,16 +549,17 @@ void __init setup_arch(char **cmdline_p)
8180 unsigned long end_of_mem = end_pfn << PAGE_SHIFT;
8181
8182 if (ramdisk_end <= end_of_mem) {
8183-#ifndef CONFIG_XEN
8184- reserve_bootmem_generic(ramdisk_image, ramdisk_size);
8185-#endif
8186+ /*
8187+ * don't need to reserve again, already reserved early
8188+ * in x86_64_start_kernel, and early_res_to_bootmem
8189+ * convert that to reserved in bootmem
8190+ */
8191 initrd_start = ramdisk_image + PAGE_OFFSET;
8192 initrd_end = initrd_start+ramdisk_size;
8193 #ifdef CONFIG_XEN
8194 initrd_below_start_ok = 1;
8195 #endif
8196 } else {
8197- /* Assumes everything on node 0 */
8198 free_bootmem(ramdisk_image, ramdisk_size);
8199 printk(KERN_ERR "initrd extends beyond end of memory "
8200 "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
8201@@ -506,6 +569,9 @@ void __init setup_arch(char **cmdline_p)
8202 }
8203 #endif
8204 reserve_crashkernel();
8205+
8206+ reserve_ibft_region();
8207+
8208 paging_init();
8209 map_vsyscall();
8210 #ifdef CONFIG_X86_LOCAL_APIC
8211@@ -633,16 +699,16 @@ void __init setup_arch(char **cmdline_p)
8212 prefill_possible_map();
8213 #endif
8214
8215+ kvm_guest_init();
8216+
8217 /*
8218 * We trust e820 completely. No explicit ROM probing in memory.
8219 */
8220 #ifdef CONFIG_XEN
8221 if (is_initial_xendomain())
8222- e820_reserve_resources(machine_e820.map, machine_e820.nr_map,
8223- &code_resource, &data_resource, &bss_resource);
8224+ e820_reserve_resources(machine_e820.map, machine_e820.nr_map);
8225 #else
8226- e820_reserve_resources(e820.map, e820.nr_map,
8227- &code_resource, &data_resource, &bss_resource);
8228+ e820_reserve_resources(e820.map, e820.nr_map);
8229 e820_mark_nosave_regions();
8230 #endif
8231
8232@@ -690,6 +756,9 @@ void __init setup_arch(char **cmdline_p)
8233 #endif
8234
8235 #endif /* !CONFIG_XEN */
8236+
8237+ /* do this before identify_cpu for boot cpu */
8238+ check_enable_amd_mmconf_dmi();
8239 }
8240
8241 #ifdef CONFIG_XEN
8242@@ -786,9 +855,9 @@ static void __cpuinit amd_detect_cmp(str
8243 bits = c->x86_coreid_bits;
8244
8245 /* Low order bits define the core id (index of core in socket) */
8246- c->cpu_core_id = c->phys_proc_id & ((1 << bits)-1);
8247- /* Convert the APIC ID into the socket ID */
8248- c->phys_proc_id = phys_pkg_id(bits);
8249+ c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
8250+ /* Convert the initial APIC ID into the socket ID */
8251+ c->phys_proc_id = c->initial_apicid >> bits;
8252
8253 #ifdef CONFIG_NUMA
8254 node = c->phys_proc_id;
8255@@ -805,7 +874,7 @@ static void __cpuinit amd_detect_cmp(str
8256 If that doesn't result in a usable node fall back to the
8257 path for the previous case. */
8258
8259- int ht_nodeid = apicid - (cpu_data(0).phys_proc_id << bits);
8260+ int ht_nodeid = c->initial_apicid;
8261
8262 if (ht_nodeid >= 0 &&
8263 apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
8264@@ -913,7 +982,7 @@ static void __cpuinit init_amd(struct cp
8265
8266 /* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
8267 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
8268- clear_bit(0*32+31, (unsigned long *)&c->x86_capability);
8269+ clear_cpu_cap(c, 0*32+31);
8270
8271 /* On C+ stepping K8 rep microcode works well for copy/memset */
8272 level = cpuid_eax(1);
8273@@ -955,9 +1024,25 @@ static void __cpuinit init_amd(struct cp
8274 /* MFENCE stops RDTSC speculation */
8275 set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
8276
8277+ if (c->x86 == 0x10)
8278+ fam10h_check_enable_mmcfg();
8279+
8280 #ifndef CONFIG_XEN
8281 if (amd_apic_timer_broken())
8282 disable_apic_timer = 1;
8283+
8284+ if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
8285+ unsigned long long tseg;
8286+
8287+ /*
8288+ * Split up direct mapping around the TSEG SMM area.
8289+ * Don't do it for gbpages because there seems very little
8290+ * benefit in doing so.
8291+ */
8292+ if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg) &&
8293+ (tseg >> PMD_SHIFT) < (max_pfn_mapped >> (PMD_SHIFT-PAGE_SHIFT)))
8294+ set_memory_4k((unsigned long)__va(tseg), 1);
8295+ }
8296 #endif
8297 }
8298
8299@@ -1051,7 +1136,7 @@ static void __cpuinit early_init_intel(s
8300 {
8301 if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
8302 (c->x86 == 0x6 && c->x86_model >= 0x0e))
8303- set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
8304+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
8305 }
8306
8307 static void __cpuinit init_intel(struct cpuinfo_x86 *c)
8308@@ -1094,9 +1179,6 @@ static void __cpuinit init_intel(struct
8309
8310 if (c->x86 == 15)
8311 c->x86_cache_alignment = c->x86_clflush_size * 2;
8312- if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
8313- (c->x86 == 0x6 && c->x86_model >= 0x0e))
8314- set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
8315 if (c->x86 == 6)
8316 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
8317 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
8318@@ -1105,6 +1187,32 @@ static void __cpuinit init_intel(struct
8319 srat_detect_node();
8320 }
8321
8322+static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
8323+{
8324+ if (c->x86 == 0x6 && c->x86_model >= 0xf)
8325+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
8326+}
8327+
8328+static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
8329+{
8330+ /* Cache sizes */
8331+ unsigned n;
8332+
8333+ n = c->extended_cpuid_level;
8334+ if (n >= 0x80000008) {
8335+ unsigned eax = cpuid_eax(0x80000008);
8336+ c->x86_virt_bits = (eax >> 8) & 0xff;
8337+ c->x86_phys_bits = eax & 0xff;
8338+ }
8339+
8340+ if (c->x86 == 0x6 && c->x86_model >= 0xf) {
8341+ c->x86_cache_alignment = c->x86_clflush_size * 2;
8342+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
8343+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
8344+ }
8345+ set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
8346+}
8347+
8348 static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
8349 {
8350 char *v = c->x86_vendor_id;
8351@@ -1113,6 +1221,8 @@ static void __cpuinit get_cpu_vendor(str
8352 c->x86_vendor = X86_VENDOR_AMD;
8353 else if (!strcmp(v, "GenuineIntel"))
8354 c->x86_vendor = X86_VENDOR_INTEL;
8355+ else if (!strcmp(v, "CentaurHauls"))
8356+ c->x86_vendor = X86_VENDOR_CENTAUR;
8357 else
8358 c->x86_vendor = X86_VENDOR_UNKNOWN;
8359 }
8360@@ -1160,15 +1270,16 @@ static void __cpuinit early_identify_cpu
8361 c->x86 += (tfms >> 20) & 0xff;
8362 if (c->x86 >= 0x6)
8363 c->x86_model += ((tfms >> 16) & 0xF) << 4;
8364- if (c->x86_capability[0] & (1<<19))
8365+ if (test_cpu_cap(c, X86_FEATURE_CLFLSH))
8366 c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
8367 } else {
8368 /* Have CPUID level 0 only - unheard of */
8369 c->x86 = 4;
8370 }
8371
8372+ c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xff;
8373 #ifdef CONFIG_SMP
8374- c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff;
8375+ c->phys_proc_id = c->initial_apicid;
8376 #endif
8377 /* AMD-defined flags: level 0x80000001 */
8378 xlvl = cpuid_eax(0x80000000);
8379@@ -1201,8 +1312,12 @@ static void __cpuinit early_identify_cpu
8380 case X86_VENDOR_INTEL:
8381 early_init_intel(c);
8382 break;
8383+ case X86_VENDOR_CENTAUR:
8384+ early_init_centaur(c);
8385+ break;
8386 }
8387
8388+ validate_pat_support(c);
8389 }
8390
8391 /*
8392@@ -1237,6 +1352,10 @@ void __cpuinit identify_cpu(struct cpuin
8393 init_intel(c);
8394 break;
8395
8396+ case X86_VENDOR_CENTAUR:
8397+ init_centaur(c);
8398+ break;
8399+
8400 case X86_VENDOR_UNKNOWN:
8401 default:
8402 display_cacheinfo(c);
8403@@ -1266,14 +1385,24 @@ void __cpuinit identify_cpu(struct cpuin
8404 #endif
8405 select_idle_routine(c);
8406
8407- if (c != &boot_cpu_data)
8408- mtrr_ap_init();
8409 #ifdef CONFIG_NUMA
8410 numa_add_cpu(smp_processor_id());
8411 #endif
8412
8413 }
8414
8415+void __cpuinit identify_boot_cpu(void)
8416+{
8417+ identify_cpu(&boot_cpu_data);
8418+}
8419+
8420+void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
8421+{
8422+ BUG_ON(c == &boot_cpu_data);
8423+ identify_cpu(c);
8424+ mtrr_ap_init();
8425+}
8426+
8427 static __init int setup_noclflush(char *arg)
8428 {
8429 setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
8430@@ -1302,123 +1431,3 @@ static __init int setup_disablecpuid(cha
8431 return 1;
8432 }
8433 __setup("clearcpuid=", setup_disablecpuid);
8434-
8435-/*
8436- * Get CPU information for use by the procfs.
8437- */
8438-
8439-static int show_cpuinfo(struct seq_file *m, void *v)
8440-{
8441- struct cpuinfo_x86 *c = v;
8442- int cpu = 0, i;
8443-
8444-#ifdef CONFIG_SMP
8445- cpu = c->cpu_index;
8446-#endif
8447-
8448- seq_printf(m, "processor\t: %u\n"
8449- "vendor_id\t: %s\n"
8450- "cpu family\t: %d\n"
8451- "model\t\t: %d\n"
8452- "model name\t: %s\n",
8453- (unsigned)cpu,
8454- c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown",
8455- c->x86,
8456- (int)c->x86_model,
8457- c->x86_model_id[0] ? c->x86_model_id : "unknown");
8458-
8459- if (c->x86_mask || c->cpuid_level >= 0)
8460- seq_printf(m, "stepping\t: %d\n", c->x86_mask);
8461- else
8462- seq_printf(m, "stepping\t: unknown\n");
8463-
8464- if (cpu_has(c, X86_FEATURE_TSC)) {
8465- unsigned int freq = cpufreq_quick_get((unsigned)cpu);
8466-
8467- if (!freq)
8468- freq = cpu_khz;
8469- seq_printf(m, "cpu MHz\t\t: %u.%03u\n",
8470- freq / 1000, (freq % 1000));
8471- }
8472-
8473- /* Cache size */
8474- if (c->x86_cache_size >= 0)
8475- seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size);
8476-
8477-#ifdef CONFIG_SMP
8478- if (smp_num_siblings * c->x86_max_cores > 1) {
8479- seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
8480- seq_printf(m, "siblings\t: %d\n",
8481- cpus_weight(per_cpu(cpu_core_map, cpu)));
8482- seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
8483- seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
8484- }
8485-#endif
8486-
8487- seq_printf(m,
8488- "fpu\t\t: yes\n"
8489- "fpu_exception\t: yes\n"
8490- "cpuid level\t: %d\n"
8491- "wp\t\t: yes\n"
8492- "flags\t\t:",
8493- c->cpuid_level);
8494-
8495- for (i = 0; i < 32*NCAPINTS; i++)
8496- if (cpu_has(c, i) && x86_cap_flags[i] != NULL)
8497- seq_printf(m, " %s", x86_cap_flags[i]);
8498-
8499- seq_printf(m, "\nbogomips\t: %lu.%02lu\n",
8500- c->loops_per_jiffy/(500000/HZ),
8501- (c->loops_per_jiffy/(5000/HZ)) % 100);
8502-
8503- if (c->x86_tlbsize > 0)
8504- seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize);
8505- seq_printf(m, "clflush size\t: %d\n", c->x86_clflush_size);
8506- seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment);
8507-
8508- seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n",
8509- c->x86_phys_bits, c->x86_virt_bits);
8510-
8511- seq_printf(m, "power management:");
8512- for (i = 0; i < 32; i++) {
8513- if (c->x86_power & (1 << i)) {
8514- if (i < ARRAY_SIZE(x86_power_flags) &&
8515- x86_power_flags[i])
8516- seq_printf(m, "%s%s",
8517- x86_power_flags[i][0]?" ":"",
8518- x86_power_flags[i]);
8519- else
8520- seq_printf(m, " [%d]", i);
8521- }
8522- }
8523-
8524- seq_printf(m, "\n\n");
8525-
8526- return 0;
8527-}
8528-
8529-static void *c_start(struct seq_file *m, loff_t *pos)
8530-{
8531- if (*pos == 0) /* just in case, cpu 0 is not the first */
8532- *pos = first_cpu(cpu_online_map);
8533- if ((*pos) < NR_CPUS && cpu_online(*pos))
8534- return &cpu_data(*pos);
8535- return NULL;
8536-}
8537-
8538-static void *c_next(struct seq_file *m, void *v, loff_t *pos)
8539-{
8540- *pos = next_cpu(*pos, cpu_online_map);
8541- return c_start(m, pos);
8542-}
8543-
8544-static void c_stop(struct seq_file *m, void *v)
8545-{
8546-}
8547-
8548-const struct seq_operations cpuinfo_op = {
8549- .start = c_start,
8550- .next = c_next,
8551- .stop = c_stop,
8552- .show = show_cpuinfo,
8553-};
8554--- /dev/null 1970-01-01 00:00:00.000000000 +0000
8555+++ sle11-2009-05-14/arch/x86/kernel/smp-xen.c 2009-03-16 16:38:05.000000000 +0100
8556@@ -0,0 +1,329 @@
8557+/*
8558+ * Intel SMP support routines.
8559+ *
8560+ * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
8561+ * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
8562+ * (c) 2002,2003 Andi Kleen, SuSE Labs.
8563+ *
8564+ * i386 and x86_64 integration by Glauber Costa <gcosta@redhat.com>
8565+ *
8566+ * This code is released under the GNU General Public License version 2 or
8567+ * later.
8568+ */
8569+
8570+#include <linux/init.h>
8571+
8572+#include <linux/mm.h>
8573+#include <linux/delay.h>
8574+#include <linux/spinlock.h>
8575+#include <linux/kernel_stat.h>
8576+#include <linux/mc146818rtc.h>
8577+#include <linux/cache.h>
8578+#include <linux/interrupt.h>
8579+#include <linux/cpu.h>
8580+
8581+#include <asm/mtrr.h>
8582+#include <asm/tlbflush.h>
8583+#include <asm/mmu_context.h>
8584+#include <asm/proto.h>
8585+#include <mach_ipi.h>
8586+#include <xen/evtchn.h>
8587+/*
8588+ * Some notes on x86 processor bugs affecting SMP operation:
8589+ *
8590+ * Pentium, Pentium Pro, II, III (and all CPUs) have bugs.
8591+ * The Linux implications for SMP are handled as follows:
8592+ *
8593+ * Pentium III / [Xeon]
8594+ * None of the E1AP-E3AP errata are visible to the user.
8595+ *
8596+ * E1AP. see PII A1AP
8597+ * E2AP. see PII A2AP
8598+ * E3AP. see PII A3AP
8599+ *
8600+ * Pentium II / [Xeon]
8601+ * None of the A1AP-A3AP errata are visible to the user.
8602+ *
8603+ * A1AP. see PPro 1AP
8604+ * A2AP. see PPro 2AP
8605+ * A3AP. see PPro 7AP
8606+ *
8607+ * Pentium Pro
8608+ * None of 1AP-9AP errata are visible to the normal user,
8609+ * except occasional delivery of 'spurious interrupt' as trap #15.
8610+ * This is very rare and a non-problem.
8611+ *
8612+ * 1AP. Linux maps APIC as non-cacheable
8613+ * 2AP. worked around in hardware
8614+ * 3AP. fixed in C0 and above steppings microcode update.
8615+ * Linux does not use excessive STARTUP_IPIs.
8616+ * 4AP. worked around in hardware
8617+ * 5AP. symmetric IO mode (normal Linux operation) not affected.
8618+ * 'noapic' mode has vector 0xf filled out properly.
8619+ * 6AP. 'noapic' mode might be affected - fixed in later steppings
8620+ * 7AP. We do not assume writes to the LVT deasserting IRQs
8621+ * 8AP. We do not enable low power mode (deep sleep) during MP bootup
8622+ * 9AP. We do not use mixed mode
8623+ *
8624+ * Pentium
8625+ * There is a marginal case where REP MOVS on 100MHz SMP
8626+ * machines with B stepping processors can fail. XXX should provide
8627+ * an L1cache=Writethrough or L1cache=off option.
8628+ *
8629+ * B stepping CPUs may hang. There are hardware work arounds
8630+ * for this. We warn about it in case your board doesn't have the work
8631+ * arounds. Basically that's so I can tell anyone with a B stepping
8632+ * CPU and SMP problems "tough".
8633+ *
8634+ * Specific items [From Pentium Processor Specification Update]
8635+ *
8636+ * 1AP. Linux doesn't use remote read
8637+ * 2AP. Linux doesn't trust APIC errors
8638+ * 3AP. We work around this
8639+ * 4AP. Linux never generated 3 interrupts of the same priority
8640+ * to cause a lost local interrupt.
8641+ * 5AP. Remote read is never used
8642+ * 6AP. not affected - worked around in hardware
8643+ * 7AP. not affected - worked around in hardware
8644+ * 8AP. worked around in hardware - we get explicit CS errors if not
8645+ * 9AP. only 'noapic' mode affected. Might generate spurious
8646+ * interrupts, we log only the first one and count the
8647+ * rest silently.
8648+ * 10AP. not affected - worked around in hardware
8649+ * 11AP. Linux reads the APIC between writes to avoid this, as per
8650+ * the documentation. Make sure you preserve this as it affects
8651+ * the C stepping chips too.
8652+ * 12AP. not affected - worked around in hardware
8653+ * 13AP. not affected - worked around in hardware
8654+ * 14AP. we always deassert INIT during bootup
8655+ * 15AP. not affected - worked around in hardware
8656+ * 16AP. not affected - worked around in hardware
8657+ * 17AP. not affected - worked around in hardware
8658+ * 18AP. not affected - worked around in hardware
8659+ * 19AP. not affected - worked around in BIOS
8660+ *
8661+ * If this sounds worrying believe me these bugs are either ___RARE___,
8662+ * or are signal timing bugs worked around in hardware and there's
8663+ * about nothing of note with C stepping upwards.
8664+ */
8665+
8666+/*
8667+ * this function sends a 'reschedule' IPI to another CPU.
8668+ * it goes straight through and wastes no time serializing
8669+ * anything. Worst case is that we lose a reschedule ...
8670+ */
8671+void xen_smp_send_reschedule(int cpu)
8672+{
8673+ if (unlikely(cpu_is_offline(cpu))) {
8674+ WARN_ON(1);
8675+ return;
8676+ }
8677+ send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
8678+}
8679+
8680+/*
8681+ * Structure and data for smp_call_function(). This is designed to minimise
8682+ * static memory requirements. It also looks cleaner.
8683+ */
8684+static DEFINE_SPINLOCK(call_lock);
8685+
8686+struct call_data_struct {
8687+ void (*func) (void *info);
8688+ void *info;
8689+ atomic_t started;
8690+ atomic_t finished;
8691+ int wait;
8692+};
8693+
8694+void lock_ipi_call_lock(void)
8695+{
8696+ spin_lock_irq(&call_lock);
8697+}
8698+
8699+void unlock_ipi_call_lock(void)
8700+{
8701+ spin_unlock_irq(&call_lock);
8702+}
8703+
8704+static struct call_data_struct *call_data;
8705+
8706+static void __smp_call_function(void (*func) (void *info), void *info,
8707+ int nonatomic, int wait)
8708+{
8709+ struct call_data_struct data;
8710+ int cpus = num_online_cpus() - 1;
8711+
8712+ if (!cpus)
8713+ return;
8714+
8715+ data.func = func;
8716+ data.info = info;
8717+ atomic_set(&data.started, 0);
8718+ data.wait = wait;
8719+ if (wait)
8720+ atomic_set(&data.finished, 0);
8721+
8722+ call_data = &data;
8723+ mb();
8724+
8725+ /* Send a message to all other CPUs and wait for them to respond */
8726+ send_IPI_allbutself(CALL_FUNCTION_VECTOR);
8727+
8728+ /* Wait for response */
8729+ while (atomic_read(&data.started) != cpus)
8730+ cpu_relax();
8731+
8732+ if (wait)
8733+ while (atomic_read(&data.finished) != cpus)
8734+ cpu_relax();
8735+}
8736+
8737+
8738+/**
8739+ * smp_call_function_mask(): Run a function on a set of other CPUs.
8740+ * @mask: The set of cpus to run on. Must not include the current cpu.
8741+ * @func: The function to run. This must be fast and non-blocking.
8742+ * @info: An arbitrary pointer to pass to the function.
8743+ * @wait: If true, wait (atomically) until function has completed on other CPUs.
8744+ *
8745+ * Returns 0 on success, else a negative status code.
8746+ *
8747+ * If @wait is true, then returns once @func has returned; otherwise
8748+ * it returns just before the target cpu calls @func.
8749+ *
8750+ * You must not call this function with disabled interrupts or from a
8751+ * hardware interrupt handler or from a bottom half handler.
8752+ */
8753+int
8754+xen_smp_call_function_mask(cpumask_t mask,
8755+ void (*func)(void *), void *info,
8756+ int wait)
8757+{
8758+ struct call_data_struct data;
8759+ cpumask_t allbutself;
8760+ int cpus;
8761+
8762+ /* Can deadlock when called with interrupts disabled */
8763+ WARN_ON(irqs_disabled());
8764+
8765+ /* Holding any lock stops cpus from going down. */
8766+ spin_lock(&call_lock);
8767+
8768+ allbutself = cpu_online_map;
8769+ cpu_clear(smp_processor_id(), allbutself);
8770+
8771+ cpus_and(mask, mask, allbutself);
8772+ cpus = cpus_weight(mask);
8773+
8774+ if (!cpus) {
8775+ spin_unlock(&call_lock);
8776+ return 0;
8777+ }
8778+
8779+ data.func = func;
8780+ data.info = info;
8781+ atomic_set(&data.started, 0);
8782+ data.wait = wait;
8783+ if (wait)
8784+ atomic_set(&data.finished, 0);
8785+
8786+ call_data = &data;
8787+ wmb();
8788+
8789+ /* Send a message to other CPUs */
8790+ if (cpus_equal(mask, allbutself) &&
8791+ cpus_equal(cpu_online_map, cpu_callout_map))
8792+ send_IPI_allbutself(CALL_FUNCTION_VECTOR);
8793+ else
8794+ send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
8795+
8796+ /* Wait for response */
8797+ while (atomic_read(&data.started) != cpus)
8798+ cpu_relax();
8799+
8800+ if (wait)
8801+ while (atomic_read(&data.finished) != cpus)
8802+ cpu_relax();
8803+ spin_unlock(&call_lock);
8804+
8805+ return 0;
8806+}
8807+
8808+static void stop_this_cpu(void *dummy)
8809+{
8810+ local_irq_disable();
8811+ /*
8812+ * Remove this CPU:
8813+ */
8814+ cpu_clear(smp_processor_id(), cpu_online_map);
8815+ disable_all_local_evtchn();
8816+ if (hlt_works(smp_processor_id()))
8817+ for (;;) halt();
8818+ for (;;);
8819+}
8820+
8821+/*
8822+ * this function calls the 'stop' function on all other CPUs in the system.
8823+ */
8824+
8825+void xen_smp_send_stop(void)
8826+{
8827+ int nolock;
8828+ unsigned long flags;
8829+
8830+ /* Don't deadlock on the call lock in panic */
8831+ nolock = !spin_trylock(&call_lock);
8832+ local_irq_save(flags);
8833+ __smp_call_function(stop_this_cpu, NULL, 0, 0);
8834+ if (!nolock)
8835+ spin_unlock(&call_lock);
8836+ disable_all_local_evtchn();
8837+ local_irq_restore(flags);
8838+}
8839+
8840+/*
8841+ * Reschedule call back. Nothing to do,
8842+ * all the work is done automatically when
8843+ * we return from the interrupt.
8844+ */
8845+irqreturn_t smp_reschedule_interrupt(int irq, void *dev_id)
8846+{
8847+#ifdef CONFIG_X86_32
8848+ __get_cpu_var(irq_stat).irq_resched_count++;
8849+#else
8850+ add_pda(irq_resched_count, 1);
8851+#endif
8852+ return IRQ_HANDLED;
8853+}
8854+
8855+irqreturn_t smp_call_function_interrupt(int irq, void *dev_id)
8856+{
8857+ void (*func) (void *info) = call_data->func;
8858+ void *info = call_data->info;
8859+ int wait = call_data->wait;
8860+
8861+ /*
8862+ * Notify initiating CPU that I've grabbed the data and am
8863+ * about to execute the function
8864+ */
8865+ mb();
8866+ atomic_inc(&call_data->started);
8867+ /*
8868+ * At this point the info structure may be out of scope unless wait==1
8869+ */
8870+ irq_enter();
8871+ (*func)(info);
8872+#ifdef CONFIG_X86_32
8873+ __get_cpu_var(irq_stat).irq_call_count++;
8874+#else
8875+ add_pda(irq_call_count, 1);
8876+#endif
8877+ irq_exit();
8878+
8879+ if (wait) {
8880+ mb();
8881+ atomic_inc(&call_data->finished);
8882+ }
8883+
8884+ return IRQ_HANDLED;
8885+}
8886--- sle11-2009-05-14.orig/arch/x86/kernel/smp_32-xen.c 2009-03-16 16:33:40.000000000 +0100
8887+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
8888@@ -1,647 +0,0 @@
8889-/*
8890- * Intel SMP support routines.
8891- *
8892- * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
8893- * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
8894- *
8895- * This code is released under the GNU General Public License version 2 or
8896- * later.
8897- */
8898-
8899-#include <linux/init.h>
8900-
8901-#include <linux/mm.h>
8902-#include <linux/delay.h>
8903-#include <linux/spinlock.h>
8904-#include <linux/kernel_stat.h>
8905-#include <linux/mc146818rtc.h>
8906-#include <linux/cache.h>
8907-#include <linux/interrupt.h>
8908-#include <linux/cpu.h>
8909-#include <linux/module.h>
8910-
8911-#include <asm/mtrr.h>
8912-#include <asm/tlbflush.h>
8913-#include <asm/mmu_context.h>
8914-#if 0
8915-#include <mach_apic.h>
8916-#endif
8917-#include <xen/evtchn.h>
8918-
8919-/*
8920- * Some notes on x86 processor bugs affecting SMP operation:
8921- *
8922- * Pentium, Pentium Pro, II, III (and all CPUs) have bugs.
8923- * The Linux implications for SMP are handled as follows:
8924- *
8925- * Pentium III / [Xeon]
8926- * None of the E1AP-E3AP errata are visible to the user.
8927- *
8928- * E1AP. see PII A1AP
8929- * E2AP. see PII A2AP
8930- * E3AP. see PII A3AP
8931- *
8932- * Pentium II / [Xeon]
8933- * None of the A1AP-A3AP errata are visible to the user.
8934- *
8935- * A1AP. see PPro 1AP
8936- * A2AP. see PPro 2AP
8937- * A3AP. see PPro 7AP
8938- *
8939- * Pentium Pro
8940- * None of 1AP-9AP errata are visible to the normal user,
8941- * except occasional delivery of 'spurious interrupt' as trap #15.
8942- * This is very rare and a non-problem.
8943- *
8944- * 1AP. Linux maps APIC as non-cacheable
8945- * 2AP. worked around in hardware
8946- * 3AP. fixed in C0 and above steppings microcode update.
8947- * Linux does not use excessive STARTUP_IPIs.
8948- * 4AP. worked around in hardware
8949- * 5AP. symmetric IO mode (normal Linux operation) not affected.
8950- * 'noapic' mode has vector 0xf filled out properly.
8951- * 6AP. 'noapic' mode might be affected - fixed in later steppings
8952- * 7AP. We do not assume writes to the LVT deassering IRQs
8953- * 8AP. We do not enable low power mode (deep sleep) during MP bootup
8954- * 9AP. We do not use mixed mode
8955- *
8956- * Pentium
8957- * There is a marginal case where REP MOVS on 100MHz SMP
8958- * machines with B stepping processors can fail. XXX should provide
8959- * an L1cache=Writethrough or L1cache=off option.
8960- *
8961- * B stepping CPUs may hang. There are hardware work arounds
8962- * for this. We warn about it in case your board doesn't have the work
8963- * arounds. Basically that's so I can tell anyone with a B stepping
8964- * CPU and SMP problems "tough".
8965- *
8966- * Specific items [From Pentium Processor Specification Update]
8967- *
8968- * 1AP. Linux doesn't use remote read
8969- * 2AP. Linux doesn't trust APIC errors
8970- * 3AP. We work around this
8971- * 4AP. Linux never generated 3 interrupts of the same priority
8972- * to cause a lost local interrupt.
8973- * 5AP. Remote read is never used
8974- * 6AP. not affected - worked around in hardware
8975- * 7AP. not affected - worked around in hardware
8976- * 8AP. worked around in hardware - we get explicit CS errors if not
8977- * 9AP. only 'noapic' mode affected. Might generate spurious
8978- * interrupts, we log only the first one and count the
8979- * rest silently.
8980- * 10AP. not affected - worked around in hardware
8981- * 11AP. Linux reads the APIC between writes to avoid this, as per
8982- * the documentation. Make sure you preserve this as it affects
8983- * the C stepping chips too.
8984- * 12AP. not affected - worked around in hardware
8985- * 13AP. not affected - worked around in hardware
8986- * 14AP. we always deassert INIT during bootup
8987- * 15AP. not affected - worked around in hardware
8988- * 16AP. not affected - worked around in hardware
8989- * 17AP. not affected - worked around in hardware
8990- * 18AP. not affected - worked around in hardware
8991- * 19AP. not affected - worked around in BIOS
8992- *
8993- * If this sounds worrying believe me these bugs are either ___RARE___,
8994- * or are signal timing bugs worked around in hardware and there's
8995- * about nothing of note with C stepping upwards.
8996- */
8997-
8998-DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate) ____cacheline_aligned = { &init_mm, 0, };
8999-
9000-/*
9001- * the following functions deal with sending IPIs between CPUs.
9002- *
9003- * We use 'broadcast', CPU->CPU IPIs and self-IPIs too.
9004- */
9005-
9006-static inline int __prepare_ICR (unsigned int shortcut, int vector)
9007-{
9008- unsigned int icr = shortcut | APIC_DEST_LOGICAL;
9009-
9010- switch (vector) {
9011- default:
9012- icr |= APIC_DM_FIXED | vector;
9013- break;
9014- case NMI_VECTOR:
9015- icr |= APIC_DM_NMI;
9016- break;
9017- }
9018- return icr;
9019-}
9020-
9021-static inline int __prepare_ICR2 (unsigned int mask)
9022-{
9023- return SET_APIC_DEST_FIELD(mask);
9024-}
9025-
9026-DECLARE_PER_CPU(int, ipi_to_irq[NR_IPIS]);
9027-
9028-static inline void __send_IPI_one(unsigned int cpu, int vector)
9029-{
9030- int irq = per_cpu(ipi_to_irq, cpu)[vector];
9031- BUG_ON(irq < 0);
9032- notify_remote_via_irq(irq);
9033-}
9034-
9035-void __send_IPI_shortcut(unsigned int shortcut, int vector)
9036-{
9037- int cpu;
9038-
9039- switch (shortcut) {
9040- case APIC_DEST_SELF:
9041- __send_IPI_one(smp_processor_id(), vector);
9042- break;
9043- case APIC_DEST_ALLBUT:
9044- for (cpu = 0; cpu < NR_CPUS; ++cpu) {
9045- if (cpu == smp_processor_id())
9046- continue;
9047- if (cpu_isset(cpu, cpu_online_map)) {
9048- __send_IPI_one(cpu, vector);
9049- }
9050- }
9051- break;
9052- default:
9053- printk("XXXXXX __send_IPI_shortcut %08x vector %d\n", shortcut,
9054- vector);
9055- break;
9056- }
9057-}
9058-
9059-void send_IPI_self(int vector)
9060-{
9061- __send_IPI_shortcut(APIC_DEST_SELF, vector);
9062-}
9063-
9064-/*
9065- * This is only used on smaller machines.
9066- */
9067-void send_IPI_mask_bitmask(cpumask_t mask, int vector)
9068-{
9069- unsigned long flags;
9070- unsigned int cpu;
9071-
9072- local_irq_save(flags);
9073- WARN_ON(cpus_addr(mask)[0] & ~cpus_addr(cpu_online_map)[0]);
9074-
9075- for (cpu = 0; cpu < NR_CPUS; ++cpu) {
9076- if (cpu_isset(cpu, mask)) {
9077- __send_IPI_one(cpu, vector);
9078- }
9079- }
9080-
9081- local_irq_restore(flags);
9082-}
9083-
9084-void send_IPI_mask_sequence(cpumask_t mask, int vector)
9085-{
9086-
9087- send_IPI_mask_bitmask(mask, vector);
9088-}
9089-
9090-#include <mach_ipi.h> /* must come after the send_IPI functions above for inlining */
9091-
9092-#if 0 /* XEN */
9093-/*
9094- * Smarter SMP flushing macros.
9095- * c/o Linus Torvalds.
9096- *
9097- * These mean you can really definitely utterly forget about
9098- * writing to user space from interrupts. (Its not allowed anyway).
9099- *
9100- * Optimizations Manfred Spraul <manfred@colorfullife.com>
9101- */
9102-
9103-static cpumask_t flush_cpumask;
9104-static struct mm_struct * flush_mm;
9105-static unsigned long flush_va;
9106-static DEFINE_SPINLOCK(tlbstate_lock);
9107-
9108-/*
9109- * We cannot call mmdrop() because we are in interrupt context,
9110- * instead update mm->cpu_vm_mask.
9111- *
9112- * We need to reload %cr3 since the page tables may be going
9113- * away from under us..
9114- */
9115-void leave_mm(int cpu)
9116-{
9117- if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
9118- BUG();
9119- cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask);
9120- load_cr3(swapper_pg_dir);
9121-}
9122-EXPORT_SYMBOL_GPL(leave_mm);
9123-
9124-/*
9125- *
9126- * The flush IPI assumes that a thread switch happens in this order:
9127- * [cpu0: the cpu that switches]
9128- * 1) switch_mm() either 1a) or 1b)
9129- * 1a) thread switch to a different mm
9130- * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
9131- * Stop ipi delivery for the old mm. This is not synchronized with
9132- * the other cpus, but smp_invalidate_interrupt ignore flush ipis
9133- * for the wrong mm, and in the worst case we perform a superfluous
9134- * tlb flush.
9135- * 1a2) set cpu_tlbstate to TLBSTATE_OK
9136- * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
9137- * was in lazy tlb mode.
9138- * 1a3) update cpu_tlbstate[].active_mm
9139- * Now cpu0 accepts tlb flushes for the new mm.
9140- * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
9141- * Now the other cpus will send tlb flush ipis.
9142- * 1a4) change cr3.
9143- * 1b) thread switch without mm change
9144- * cpu_tlbstate[].active_mm is correct, cpu0 already handles
9145- * flush ipis.
9146- * 1b1) set cpu_tlbstate to TLBSTATE_OK
9147- * 1b2) test_and_set the cpu bit in cpu_vm_mask.
9148- * Atomically set the bit [other cpus will start sending flush ipis],
9149- * and test the bit.
9150- * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
9151- * 2) switch %%esp, ie current
9152- *
9153- * The interrupt must handle 2 special cases:
9154- * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
9155- * - the cpu performs speculative tlb reads, i.e. even if the cpu only
9156- * runs in kernel space, the cpu could load tlb entries for user space
9157- * pages.
9158- *
9159- * The good news is that cpu_tlbstate is local to each cpu, no
9160- * write/read ordering problems.
9161- */
9162-
9163-/*
9164- * TLB flush IPI:
9165- *
9166- * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
9167- * 2) Leave the mm if we are in the lazy tlb mode.
9168- */
9169-
9170-irqreturn_t smp_invalidate_interrupt(int irq, void *dev_id)
9171-{
9172- unsigned long cpu;
9173-
9174- cpu = get_cpu();
9175-
9176- if (!cpu_isset(cpu, flush_cpumask))
9177- goto out;
9178- /*
9179- * This was a BUG() but until someone can quote me the
9180- * line from the intel manual that guarantees an IPI to
9181- * multiple CPUs is retried _only_ on the erroring CPUs
9182- * its staying as a return
9183- *
9184- * BUG();
9185- */
9186-
9187- if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) {
9188- if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) {
9189- if (flush_va == TLB_FLUSH_ALL)
9190- local_flush_tlb();
9191- else
9192- __flush_tlb_one(flush_va);
9193- } else
9194- leave_mm(cpu);
9195- }
9196- smp_mb__before_clear_bit();
9197- cpu_clear(cpu, flush_cpumask);
9198- smp_mb__after_clear_bit();
9199-out:
9200- put_cpu_no_resched();
9201- __get_cpu_var(irq_stat).irq_tlb_count++;
9202-
9203- return IRQ_HANDLED;
9204-}
9205-
9206-void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
9207- unsigned long va)
9208-{
9209- cpumask_t cpumask = *cpumaskp;
9210-
9211- /*
9212- * A couple of (to be removed) sanity checks:
9213- *
9214- * - current CPU must not be in mask
9215- * - mask must exist :)
9216- */
9217- BUG_ON(cpus_empty(cpumask));
9218- BUG_ON(cpu_isset(smp_processor_id(), cpumask));
9219- BUG_ON(!mm);
9220-
9221-#ifdef CONFIG_HOTPLUG_CPU
9222- /* If a CPU which we ran on has gone down, OK. */
9223- cpus_and(cpumask, cpumask, cpu_online_map);
9224- if (unlikely(cpus_empty(cpumask)))
9225- return;
9226-#endif
9227-
9228- /*
9229- * i'm not happy about this global shared spinlock in the
9230- * MM hot path, but we'll see how contended it is.
9231- * AK: x86-64 has a faster method that could be ported.
9232- */
9233- spin_lock(&tlbstate_lock);
9234-
9235- flush_mm = mm;
9236- flush_va = va;
9237- cpus_or(flush_cpumask, cpumask, flush_cpumask);
9238- /*
9239- * We have to send the IPI only to
9240- * CPUs affected.
9241- */
9242- send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR);
9243-
9244- while (!cpus_empty(flush_cpumask))
9245- /* nothing. lockup detection does not belong here */
9246- cpu_relax();
9247-
9248- flush_mm = NULL;
9249- flush_va = 0;
9250- spin_unlock(&tlbstate_lock);
9251-}
9252-
9253-void flush_tlb_current_task(void)
9254-{
9255- struct mm_struct *mm = current->mm;
9256- cpumask_t cpu_mask;
9257-
9258- preempt_disable();
9259- cpu_mask = mm->cpu_vm_mask;
9260- cpu_clear(smp_processor_id(), cpu_mask);
9261-
9262- local_flush_tlb();
9263- if (!cpus_empty(cpu_mask))
9264- flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
9265- preempt_enable();
9266-}
9267-
9268-void flush_tlb_mm (struct mm_struct * mm)
9269-{
9270- cpumask_t cpu_mask;
9271-
9272- preempt_disable();
9273- cpu_mask = mm->cpu_vm_mask;
9274- cpu_clear(smp_processor_id(), cpu_mask);
9275-
9276- if (current->active_mm == mm) {
9277- if (current->mm)
9278- local_flush_tlb();
9279- else
9280- leave_mm(smp_processor_id());
9281- }
9282- if (!cpus_empty(cpu_mask))
9283- flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
9284-
9285- preempt_enable();
9286-}
9287-
9288-void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
9289-{
9290- struct mm_struct *mm = vma->vm_mm;
9291- cpumask_t cpu_mask;
9292-
9293- preempt_disable();
9294- cpu_mask = mm->cpu_vm_mask;
9295- cpu_clear(smp_processor_id(), cpu_mask);
9296-
9297- if (current->active_mm == mm) {
9298- if(current->mm)
9299- __flush_tlb_one(va);
9300- else
9301- leave_mm(smp_processor_id());
9302- }
9303-
9304- if (!cpus_empty(cpu_mask))
9305- flush_tlb_others(cpu_mask, mm, va);
9306-
9307- preempt_enable();
9308-}
9309-EXPORT_SYMBOL(flush_tlb_page);
9310-
9311-static void do_flush_tlb_all(void* info)
9312-{
9313- unsigned long cpu = smp_processor_id();
9314-
9315- __flush_tlb_all();
9316- if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_LAZY)
9317- leave_mm(cpu);
9318-}
9319-
9320-void flush_tlb_all(void)
9321-{
9322- on_each_cpu(do_flush_tlb_all, NULL, 1, 1);
9323-}
9324-
9325-#endif /* XEN */
9326-
9327-/*
9328- * this function sends a 'reschedule' IPI to another CPU.
9329- * it goes straight through and wastes no time serializing
9330- * anything. Worst case is that we lose a reschedule ...
9331- */
9332-void xen_smp_send_reschedule(int cpu)
9333-{
9334- WARN_ON(cpu_is_offline(cpu));
9335- send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
9336-}
9337-
9338-/*
9339- * Structure and data for smp_call_function(). This is designed to minimise
9340- * static memory requirements. It also looks cleaner.
9341- */
9342-static DEFINE_SPINLOCK(call_lock);
9343-
9344-struct call_data_struct {
9345- void (*func) (void *info);
9346- void *info;
9347- atomic_t started;
9348- atomic_t finished;
9349- int wait;
9350-};
9351-
9352-void lock_ipi_call_lock(void)
9353-{
9354- spin_lock_irq(&call_lock);
9355-}
9356-
9357-void unlock_ipi_call_lock(void)
9358-{
9359- spin_unlock_irq(&call_lock);
9360-}
9361-
9362-static struct call_data_struct *call_data;
9363-
9364-static void __smp_call_function(void (*func) (void *info), void *info,
9365- int nonatomic, int wait)
9366-{
9367- struct call_data_struct data;
9368- int cpus = num_online_cpus() - 1;
9369-
9370- if (!cpus)
9371- return;
9372-
9373- data.func = func;
9374- data.info = info;
9375- atomic_set(&data.started, 0);
9376- data.wait = wait;
9377- if (wait)
9378- atomic_set(&data.finished, 0);
9379-
9380- call_data = &data;
9381- mb();
9382-
9383- /* Send a message to all other CPUs and wait for them to respond */
9384- send_IPI_allbutself(CALL_FUNCTION_VECTOR);
9385-
9386- /* Wait for response */
9387- while (atomic_read(&data.started) != cpus)
9388- cpu_relax();
9389-
9390- if (wait)
9391- while (atomic_read(&data.finished) != cpus)
9392- cpu_relax();
9393-}
9394-
9395-
9396-/**
9397- * smp_call_function_mask(): Run a function on a set of other CPUs.
9398- * @mask: The set of cpus to run on. Must not include the current cpu.
9399- * @func: The function to run. This must be fast and non-blocking.
9400- * @info: An arbitrary pointer to pass to the function.
9401- * @wait: If true, wait (atomically) until function has completed on other CPUs.
9402- *
9403- * Returns 0 on success, else a negative status code.
9404- *
9405- * If @wait is true, then returns once @func has returned; otherwise
9406- * it returns just before the target cpu calls @func.
9407- *
9408- * You must not call this function with disabled interrupts or from a
9409- * hardware interrupt handler or from a bottom half handler.
9410- */
9411-int
9412-xen_smp_call_function_mask(cpumask_t mask,
9413- void (*func)(void *), void *info,
9414- int wait)
9415-{
9416- struct call_data_struct data;
9417- cpumask_t allbutself;
9418- int cpus;
9419-
9420- /* Can deadlock when called with interrupts disabled */
9421- WARN_ON(irqs_disabled());
9422-
9423- /* Holding any lock stops cpus from going down. */
9424- spin_lock(&call_lock);
9425-
9426- allbutself = cpu_online_map;
9427- cpu_clear(smp_processor_id(), allbutself);
9428-
9429- cpus_and(mask, mask, allbutself);
9430- cpus = cpus_weight(mask);
9431-
9432- if (!cpus) {
9433- spin_unlock(&call_lock);
9434- return 0;
9435- }
9436-
9437- data.func = func;
9438- data.info = info;
9439- atomic_set(&data.started, 0);
9440- data.wait = wait;
9441- if (wait)
9442- atomic_set(&data.finished, 0);
9443-
9444- call_data = &data;
9445- mb();
9446-
9447- /* Send a message to other CPUs */
9448- if (cpus_equal(mask, allbutself))
9449- send_IPI_allbutself(CALL_FUNCTION_VECTOR);
9450- else
9451- send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
9452-
9453- /* Wait for response */
9454- while (atomic_read(&data.started) != cpus)
9455- cpu_relax();
9456-
9457- if (wait)
9458- while (atomic_read(&data.finished) != cpus)
9459- cpu_relax();
9460- spin_unlock(&call_lock);
9461-
9462- return 0;
9463-}
9464-
9465-static void stop_this_cpu (void * dummy)
9466-{
9467- local_irq_disable();
9468- /*
9469- * Remove this CPU:
9470- */
9471- cpu_clear(smp_processor_id(), cpu_online_map);
9472- disable_all_local_evtchn();
9473- if (cpu_data(smp_processor_id()).hlt_works_ok)
9474- for(;;) halt();
9475- for (;;);
9476-}
9477-
9478-/*
9479- * this function calls the 'stop' function on all other CPUs in the system.
9480- */
9481-
9482-void xen_smp_send_stop(void)
9483-{
9484- /* Don't deadlock on the call lock in panic */
9485- int nolock = !spin_trylock(&call_lock);
9486- unsigned long flags;
9487-
9488- local_irq_save(flags);
9489- __smp_call_function(stop_this_cpu, NULL, 0, 0);
9490- if (!nolock)
9491- spin_unlock(&call_lock);
9492- disable_all_local_evtchn();
9493- local_irq_restore(flags);
9494-}
9495-
9496-/*
9497- * Reschedule call back. Nothing to do,
9498- * all the work is done automatically when
9499- * we return from the interrupt.
9500- */
9501-irqreturn_t smp_reschedule_interrupt(int irq, void *dev_id)
9502-{
9503- __get_cpu_var(irq_stat).irq_resched_count++;
9504-
9505- return IRQ_HANDLED;
9506-}
9507-
9508-#include <linux/kallsyms.h>
9509-irqreturn_t smp_call_function_interrupt(int irq, void *dev_id)
9510-{
9511- void (*func) (void *info) = call_data->func;
9512- void *info = call_data->info;
9513- int wait = call_data->wait;
9514-
9515- /*
9516- * Notify initiating CPU that I've grabbed the data and am
9517- * about to execute the function
9518- */
9519- mb();
9520- atomic_inc(&call_data->started);
9521- /*
9522- * At this point the info structure may be out of scope unless wait==1
9523- */
9524- irq_enter();
9525- (*func)(info);
9526- __get_cpu_var(irq_stat).irq_call_count++;
9527- irq_exit();
9528-
9529- if (wait) {
9530- mb();
9531- atomic_inc(&call_data->finished);
9532- }
9533-
9534- return IRQ_HANDLED;
9535-}
9536--- sle11-2009-05-14.orig/arch/x86/kernel/smp_64-xen.c 2009-03-16 16:33:40.000000000 +0100
9537+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
9538@@ -1,554 +0,0 @@
9539-/*
9540- * Intel SMP support routines.
9541- *
9542- * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
9543- * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
9544- * (c) 2002,2003 Andi Kleen, SuSE Labs.
9545- *
9546- * This code is released under the GNU General Public License version 2 or
9547- * later.
9548- */
9549-
9550-#include <linux/init.h>
9551-
9552-#include <linux/mm.h>
9553-#include <linux/delay.h>
9554-#include <linux/spinlock.h>
9555-#include <linux/smp.h>
9556-#include <linux/kernel_stat.h>
9557-#include <linux/mc146818rtc.h>
9558-#include <linux/interrupt.h>
9559-
9560-#include <asm/mtrr.h>
9561-#include <asm/pgalloc.h>
9562-#include <asm/tlbflush.h>
9563-#include <asm/mach_apic.h>
9564-#include <asm/mmu_context.h>
9565-#include <asm/proto.h>
9566-#include <asm/apicdef.h>
9567-#include <asm/idle.h>
9568-#ifdef CONFIG_XEN
9569-#include <xen/evtchn.h>
9570-#endif
9571-
9572-#ifndef CONFIG_XEN
9573-/*
9574- * Smarter SMP flushing macros.
9575- * c/o Linus Torvalds.
9576- *
9577- * These mean you can really definitely utterly forget about
9578- * writing to user space from interrupts. (Its not allowed anyway).
9579- *
9580- * Optimizations Manfred Spraul <manfred@colorfullife.com>
9581- *
9582- * More scalable flush, from Andi Kleen
9583- *
9584- * To avoid global state use 8 different call vectors.
9585- * Each CPU uses a specific vector to trigger flushes on other
9586- * CPUs. Depending on the received vector the target CPUs look into
9587- * the right per cpu variable for the flush data.
9588- *
9589- * With more than 8 CPUs they are hashed to the 8 available
9590- * vectors. The limited global vector space forces us to this right now.
9591- * In future when interrupts are split into per CPU domains this could be
9592- * fixed, at the cost of triggering multiple IPIs in some cases.
9593- */
9594-
9595-union smp_flush_state {
9596- struct {
9597- cpumask_t flush_cpumask;
9598- struct mm_struct *flush_mm;
9599- unsigned long flush_va;
9600- spinlock_t tlbstate_lock;
9601- };
9602- char pad[SMP_CACHE_BYTES];
9603-} ____cacheline_aligned;
9604-
9605-/* State is put into the per CPU data section, but padded
9606- to a full cache line because other CPUs can access it and we don't
9607- want false sharing in the per cpu data segment. */
9608-static DEFINE_PER_CPU(union smp_flush_state, flush_state);
9609-
9610-/*
9611- * We cannot call mmdrop() because we are in interrupt context,
9612- * instead update mm->cpu_vm_mask.
9613- */
9614-void leave_mm(int cpu)
9615-{
9616- if (read_pda(mmu_state) == TLBSTATE_OK)
9617- BUG();
9618- cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask);
9619- load_cr3(swapper_pg_dir);
9620-}
9621-EXPORT_SYMBOL_GPL(leave_mm);
9622-
9623-/*
9624- *
9625- * The flush IPI assumes that a thread switch happens in this order:
9626- * [cpu0: the cpu that switches]
9627- * 1) switch_mm() either 1a) or 1b)
9628- * 1a) thread switch to a different mm
9629- * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
9630- * Stop ipi delivery for the old mm. This is not synchronized with
9631- * the other cpus, but smp_invalidate_interrupt ignore flush ipis
9632- * for the wrong mm, and in the worst case we perform a superfluous
9633- * tlb flush.
9634- * 1a2) set cpu mmu_state to TLBSTATE_OK
9635- * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
9636- * was in lazy tlb mode.
9637- * 1a3) update cpu active_mm
9638- * Now cpu0 accepts tlb flushes for the new mm.
9639- * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
9640- * Now the other cpus will send tlb flush ipis.
9641- * 1a4) change cr3.
9642- * 1b) thread switch without mm change
9643- * cpu active_mm is correct, cpu0 already handles
9644- * flush ipis.
9645- * 1b1) set cpu mmu_state to TLBSTATE_OK
9646- * 1b2) test_and_set the cpu bit in cpu_vm_mask.
9647- * Atomically set the bit [other cpus will start sending flush ipis],
9648- * and test the bit.
9649- * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
9650- * 2) switch %%esp, ie current
9651- *
9652- * The interrupt must handle 2 special cases:
9653- * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
9654- * - the cpu performs speculative tlb reads, i.e. even if the cpu only
9655- * runs in kernel space, the cpu could load tlb entries for user space
9656- * pages.
9657- *
9658- * The good news is that cpu mmu_state is local to each cpu, no
9659- * write/read ordering problems.
9660- */
9661-
9662-/*
9663- * TLB flush IPI:
9664- *
9665- * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
9666- * 2) Leave the mm if we are in the lazy tlb mode.
9667- *
9668- * Interrupts are disabled.
9669- */
9670-
9671-asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs)
9672-{
9673- int cpu;
9674- int sender;
9675- union smp_flush_state *f;
9676-
9677- cpu = smp_processor_id();
9678- /*
9679- * orig_rax contains the negated interrupt vector.
9680- * Use that to determine where the sender put the data.
9681- */
9682- sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START;
9683- f = &per_cpu(flush_state, sender);
9684-
9685- if (!cpu_isset(cpu, f->flush_cpumask))
9686- goto out;
9687- /*
9688- * This was a BUG() but until someone can quote me the
9689- * line from the intel manual that guarantees an IPI to
9690- * multiple CPUs is retried _only_ on the erroring CPUs
9691- * its staying as a return
9692- *
9693- * BUG();
9694- */
9695-
9696- if (f->flush_mm == read_pda(active_mm)) {
9697- if (read_pda(mmu_state) == TLBSTATE_OK) {
9698- if (f->flush_va == TLB_FLUSH_ALL)
9699- local_flush_tlb();
9700- else
9701- __flush_tlb_one(f->flush_va);
9702- } else
9703- leave_mm(cpu);
9704- }
9705-out:
9706- ack_APIC_irq();
9707- cpu_clear(cpu, f->flush_cpumask);
9708- add_pda(irq_tlb_count, 1);
9709-}
9710-
9711-void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
9712- unsigned long va)
9713-{
9714- int sender;
9715- union smp_flush_state *f;
9716- cpumask_t cpumask = *cpumaskp;
9717-
9718- /* Caller has disabled preemption */
9719- sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
9720- f = &per_cpu(flush_state, sender);
9721-
9722- /*
9723- * Could avoid this lock when
9724- * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
9725- * probably not worth checking this for a cache-hot lock.
9726- */
9727- spin_lock(&f->tlbstate_lock);
9728-
9729- f->flush_mm = mm;
9730- f->flush_va = va;
9731- cpus_or(f->flush_cpumask, cpumask, f->flush_cpumask);
9732-
9733- /*
9734- * We have to send the IPI only to
9735- * CPUs affected.
9736- */
9737- send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR_START + sender);
9738-
9739- while (!cpus_empty(f->flush_cpumask))
9740- cpu_relax();
9741-
9742- f->flush_mm = NULL;
9743- f->flush_va = 0;
9744- spin_unlock(&f->tlbstate_lock);
9745-}
9746-
9747-int __cpuinit init_smp_flush(void)
9748-{
9749- int i;
9750-
9751- for_each_cpu_mask(i, cpu_possible_map) {
9752- spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock);
9753- }
9754- return 0;
9755-}
9756-core_initcall(init_smp_flush);
9757-
9758-void flush_tlb_current_task(void)
9759-{
9760- struct mm_struct *mm = current->mm;
9761- cpumask_t cpu_mask;
9762-
9763- preempt_disable();
9764- cpu_mask = mm->cpu_vm_mask;
9765- cpu_clear(smp_processor_id(), cpu_mask);
9766-
9767- local_flush_tlb();
9768- if (!cpus_empty(cpu_mask))
9769- flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
9770- preempt_enable();
9771-}
9772-
9773-void flush_tlb_mm (struct mm_struct * mm)
9774-{
9775- cpumask_t cpu_mask;
9776-
9777- preempt_disable();
9778- cpu_mask = mm->cpu_vm_mask;
9779- cpu_clear(smp_processor_id(), cpu_mask);
9780-
9781- if (current->active_mm == mm) {
9782- if (current->mm)
9783- local_flush_tlb();
9784- else
9785- leave_mm(smp_processor_id());
9786- }
9787- if (!cpus_empty(cpu_mask))
9788- flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
9789-
9790- preempt_enable();
9791-}
9792-
9793-void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
9794-{
9795- struct mm_struct *mm = vma->vm_mm;
9796- cpumask_t cpu_mask;
9797-
9798- preempt_disable();
9799- cpu_mask = mm->cpu_vm_mask;
9800- cpu_clear(smp_processor_id(), cpu_mask);
9801-
9802- if (current->active_mm == mm) {
9803- if(current->mm)
9804- __flush_tlb_one(va);
9805- else
9806- leave_mm(smp_processor_id());
9807- }
9808-
9809- if (!cpus_empty(cpu_mask))
9810- flush_tlb_others(cpu_mask, mm, va);
9811-
9812- preempt_enable();
9813-}
9814-
9815-static void do_flush_tlb_all(void* info)
9816-{
9817- unsigned long cpu = smp_processor_id();
9818-
9819- __flush_tlb_all();
9820- if (read_pda(mmu_state) == TLBSTATE_LAZY)
9821- leave_mm(cpu);
9822-}
9823-
9824-void flush_tlb_all(void)
9825-{
9826- on_each_cpu(do_flush_tlb_all, NULL, 1, 1);
9827-}
9828-#endif /* Xen */
9829-
9830-/*
9831- * this function sends a 'reschedule' IPI to another CPU.
9832- * it goes straight through and wastes no time serializing
9833- * anything. Worst case is that we lose a reschedule ...
9834- */
9835-
9836-void smp_send_reschedule(int cpu)
9837-{
9838- send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
9839-}
9840-
9841-/*
9842- * Structure and data for smp_call_function(). This is designed to minimise
9843- * static memory requirements. It also looks cleaner.
9844- */
9845-static DEFINE_SPINLOCK(call_lock);
9846-
9847-struct call_data_struct {
9848- void (*func) (void *info);
9849- void *info;
9850- atomic_t started;
9851- atomic_t finished;
9852- int wait;
9853-};
9854-
9855-static struct call_data_struct * call_data;
9856-
9857-void lock_ipi_call_lock(void)
9858-{
9859- spin_lock_irq(&call_lock);
9860-}
9861-
9862-void unlock_ipi_call_lock(void)
9863-{
9864- spin_unlock_irq(&call_lock);
9865-}
9866-
9867-/*
9868- * this function sends a 'generic call function' IPI to all other CPU
9869- * of the system defined in the mask.
9870- */
9871-static int __smp_call_function_mask(cpumask_t mask,
9872- void (*func)(void *), void *info,
9873- int wait)
9874-{
9875- struct call_data_struct data;
9876- cpumask_t allbutself;
9877- int cpus;
9878-
9879- allbutself = cpu_online_map;
9880- cpu_clear(smp_processor_id(), allbutself);
9881-
9882- cpus_and(mask, mask, allbutself);
9883- cpus = cpus_weight(mask);
9884-
9885- if (!cpus)
9886- return 0;
9887-
9888- data.func = func;
9889- data.info = info;
9890- atomic_set(&data.started, 0);
9891- data.wait = wait;
9892- if (wait)
9893- atomic_set(&data.finished, 0);
9894-
9895- call_data = &data;
9896- wmb();
9897-
9898- /* Send a message to other CPUs */
9899- if (cpus_equal(mask, allbutself))
9900- send_IPI_allbutself(CALL_FUNCTION_VECTOR);
9901- else
9902- send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
9903-
9904- /* Wait for response */
9905- while (atomic_read(&data.started) != cpus)
9906- cpu_relax();
9907-
9908- if (!wait)
9909- return 0;
9910-
9911- while (atomic_read(&data.finished) != cpus)
9912- cpu_relax();
9913-
9914- return 0;
9915-}
9916-/**
9917- * smp_call_function_mask(): Run a function on a set of other CPUs.
9918- * @mask: The set of cpus to run on. Must not include the current cpu.
9919- * @func: The function to run. This must be fast and non-blocking.
9920- * @info: An arbitrary pointer to pass to the function.
9921- * @wait: If true, wait (atomically) until function has completed on other CPUs.
9922- *
9923- * Returns 0 on success, else a negative status code.
9924- *
9925- * If @wait is true, then returns once @func has returned; otherwise
9926- * it returns just before the target cpu calls @func.
9927- *
9928- * You must not call this function with disabled interrupts or from a
9929- * hardware interrupt handler or from a bottom half handler.
9930- */
9931-int smp_call_function_mask(cpumask_t mask,
9932- void (*func)(void *), void *info,
9933- int wait)
9934-{
9935- int ret;
9936-
9937- /* Can deadlock when called with interrupts disabled */
9938- WARN_ON(irqs_disabled());
9939-
9940- spin_lock(&call_lock);
9941- ret = __smp_call_function_mask(mask, func, info, wait);
9942- spin_unlock(&call_lock);
9943- return ret;
9944-}
9945-EXPORT_SYMBOL(smp_call_function_mask);
9946-
9947-/*
9948- * smp_call_function_single - Run a function on a specific CPU
9949- * @func: The function to run. This must be fast and non-blocking.
9950- * @info: An arbitrary pointer to pass to the function.
9951- * @nonatomic: Currently unused.
9952- * @wait: If true, wait until function has completed on other CPUs.
9953- *
9954- * Retrurns 0 on success, else a negative status code.
9955- *
9956- * Does not return until the remote CPU is nearly ready to execute <func>
9957- * or is or has executed.
9958- */
9959-
9960-int smp_call_function_single (int cpu, void (*func) (void *info), void *info,
9961- int nonatomic, int wait)
9962-{
9963- /* prevent preemption and reschedule on another processor */
9964- int ret, me = get_cpu();
9965-
9966- /* Can deadlock when called with interrupts disabled */
9967- WARN_ON(irqs_disabled());
9968-
9969- if (cpu == me) {
9970- local_irq_disable();
9971- func(info);
9972- local_irq_enable();
9973- put_cpu();
9974- return 0;
9975- }
9976-
9977- ret = smp_call_function_mask(cpumask_of_cpu(cpu), func, info, wait);
9978-
9979- put_cpu();
9980- return ret;
9981-}
9982-EXPORT_SYMBOL(smp_call_function_single);
9983-
9984-/*
9985- * smp_call_function - run a function on all other CPUs.
9986- * @func: The function to run. This must be fast and non-blocking.
9987- * @info: An arbitrary pointer to pass to the function.
9988- * @nonatomic: currently unused.
9989- * @wait: If true, wait (atomically) until function has completed on other
9990- * CPUs.
9991- *
9992- * Returns 0 on success, else a negative status code. Does not return until
9993- * remote CPUs are nearly ready to execute func or are or have executed.
9994- *
9995- * You must not call this function with disabled interrupts or from a
9996- * hardware interrupt handler or from a bottom half handler.
9997- * Actually there are a few legal cases, like panic.
9998- */
9999-int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
10000- int wait)
10001-{
10002- return smp_call_function_mask(cpu_online_map, func, info, wait);
10003-}
10004-EXPORT_SYMBOL(smp_call_function);
10005-
10006-static void stop_this_cpu(void *dummy)
10007-{
10008- local_irq_disable();
10009- /*
10010- * Remove this CPU:
10011- */
10012- cpu_clear(smp_processor_id(), cpu_online_map);
10013- disable_all_local_evtchn();
10014- for (;;)
10015- halt();
10016-}
10017-
10018-void smp_send_stop(void)
10019-{
10020- int nolock;
10021- unsigned long flags;
10022-
10023-#ifndef CONFIG_XEN
10024- if (reboot_force)
10025- return;
10026-#endif
10027-
10028- /* Don't deadlock on the call lock in panic */
10029- nolock = !spin_trylock(&call_lock);
10030- local_irq_save(flags);
10031- __smp_call_function_mask(cpu_online_map, stop_this_cpu, NULL, 0);
10032- if (!nolock)
10033- spin_unlock(&call_lock);
10034- disable_all_local_evtchn();
10035- local_irq_restore(flags);
10036-}
10037-
10038-/*
10039- * Reschedule call back. Nothing to do,
10040- * all the work is done automatically when
10041- * we return from the interrupt.
10042- */
10043-#ifndef CONFIG_XEN
10044-asmlinkage void smp_reschedule_interrupt(void)
10045-#else
10046-asmlinkage irqreturn_t smp_reschedule_interrupt(int irq, void *ctx)
10047-#endif
10048-{
10049-#ifndef CONFIG_XEN
10050- ack_APIC_irq();
10051-#endif
10052- add_pda(irq_resched_count, 1);
10053-#ifdef CONFIG_XEN
10054- return IRQ_HANDLED;
10055-#endif
10056-}
10057-
10058-#ifndef CONFIG_XEN
10059-asmlinkage void smp_call_function_interrupt(void)
10060-#else
10061-asmlinkage irqreturn_t smp_call_function_interrupt(int irq, void *ctx)
10062-#endif
10063-{
10064- void (*func) (void *info) = call_data->func;
10065- void *info = call_data->info;
10066- int wait = call_data->wait;
10067-
10068-#ifndef CONFIG_XEN
10069- ack_APIC_irq();
10070-#endif
10071- /*
10072- * Notify initiating CPU that I've grabbed the data and am
10073- * about to execute the function
10074- */
10075- mb();
10076- atomic_inc(&call_data->started);
10077- /*
10078- * At this point the info structure may be out of scope unless wait==1
10079- */
10080- exit_idle();
10081- irq_enter();
10082- (*func)(info);
10083- add_pda(irq_call_count, 1);
10084- irq_exit();
10085- if (wait) {
10086- mb();
10087- atomic_inc(&call_data->finished);
10088- }
10089-#ifdef CONFIG_XEN
10090- return IRQ_HANDLED;
10091-#endif
10092-}
10093--- sle11-2009-05-14.orig/arch/x86/kernel/time_32-xen.c 2009-03-24 10:12:48.000000000 +0100
10094+++ sle11-2009-05-14/arch/x86/kernel/time_32-xen.c 2009-03-24 10:13:09.000000000 +0100
10095@@ -699,8 +699,6 @@ int xen_update_persistent_clock(void)
10096 return 0;
10097 }
10098
10099-extern void (*late_time_init)(void);
10100-
10101 /* Dynamically-mapped IRQ. */
10102 DEFINE_PER_CPU(int, timer_irq);
10103
10104--- sle11-2009-05-14.orig/arch/x86/kernel/traps_32-xen.c 2009-03-16 16:33:40.000000000 +0100
10105+++ sle11-2009-05-14/arch/x86/kernel/traps_32-xen.c 2009-03-16 16:38:05.000000000 +0100
10106@@ -9,26 +9,28 @@
10107 * 'Traps.c' handles hardware traps and faults after we have saved some
10108 * state in 'asm.s'.
10109 */
10110-#include <linux/sched.h>
10111+#include <linux/interrupt.h>
10112+#include <linux/kallsyms.h>
10113+#include <linux/spinlock.h>
10114+#include <linux/highmem.h>
10115+#include <linux/kprobes.h>
10116+#include <linux/uaccess.h>
10117+#include <linux/utsname.h>
10118+#include <linux/kdebug.h>
10119 #include <linux/kernel.h>
10120+#include <linux/module.h>
10121+#include <linux/ptrace.h>
10122 #include <linux/string.h>
10123+#include <linux/unwind.h>
10124+#include <linux/delay.h>
10125 #include <linux/errno.h>
10126+#include <linux/kexec.h>
10127+#include <linux/sched.h>
10128 #include <linux/timer.h>
10129-#include <linux/mm.h>
10130 #include <linux/init.h>
10131-#include <linux/delay.h>
10132-#include <linux/spinlock.h>
10133-#include <linux/interrupt.h>
10134-#include <linux/highmem.h>
10135-#include <linux/kallsyms.h>
10136-#include <linux/ptrace.h>
10137-#include <linux/utsname.h>
10138-#include <linux/kprobes.h>
10139-#include <linux/kexec.h>
10140-#include <linux/unwind.h>
10141-#include <linux/uaccess.h>
10142-#include <linux/nmi.h>
10143 #include <linux/bug.h>
10144+#include <linux/nmi.h>
10145+#include <linux/mm.h>
10146
10147 #ifdef CONFIG_EISA
10148 #include <linux/ioport.h>
10149@@ -43,21 +45,18 @@
10150 #include <linux/edac.h>
10151 #endif
10152
10153+#include <asm/arch_hooks.h>
10154+#include <asm/stacktrace.h>
10155 #include <asm/processor.h>
10156-#include <asm/system.h>
10157-#include <asm/io.h>
10158-#include <asm/atomic.h>
10159 #include <asm/debugreg.h>
10160+#include <asm/atomic.h>
10161+#include <asm/system.h>
10162+#include <asm/unwind.h>
10163 #include <asm/desc.h>
10164 #include <asm/i387.h>
10165 #include <asm/nmi.h>
10166-#include <asm/unwind.h>
10167 #include <asm/smp.h>
10168-#include <asm/arch_hooks.h>
10169-#include <linux/kdebug.h>
10170-#include <asm/stacktrace.h>
10171-
10172-#include <linux/module.h>
10173+#include <asm/io.h>
10174
10175 #include "mach_traps.h"
10176
10177@@ -71,7 +70,7 @@ EXPORT_SYMBOL_GPL(used_vectors);
10178 asmlinkage int system_call(void);
10179
10180 /* Do we ignore FPU interrupts ? */
10181-char ignore_fpu_irq = 0;
10182+char ignore_fpu_irq;
10183
10184 #ifndef CONFIG_X86_NO_IDT
10185 /*
10186@@ -113,12 +112,13 @@ static unsigned int code_bytes = 64;
10187 void printk_address(unsigned long address, int reliable)
10188 {
10189 #ifdef CONFIG_KALLSYMS
10190- unsigned long offset = 0, symsize;
10191+ char namebuf[KSYM_NAME_LEN];
10192+ unsigned long offset = 0;
10193+ unsigned long symsize;
10194 const char *symname;
10195- char *modname;
10196- char *delim = ":";
10197- char namebuf[128];
10198 char reliab[4] = "";
10199+ char *delim = ":";
10200+ char *modname;
10201
10202 symname = kallsyms_lookup(address, &symsize, &offset,
10203 &modname, namebuf);
10204@@ -146,13 +146,14 @@ static inline int valid_stack_ptr(struct
10205
10206 /* The form of the top of the frame on the stack */
10207 struct stack_frame {
10208- struct stack_frame *next_frame;
10209- unsigned long return_address;
10210+ struct stack_frame *next_frame;
10211+ unsigned long return_address;
10212 };
10213
10214-static inline unsigned long print_context_stack(struct thread_info *tinfo,
10215- unsigned long *stack, unsigned long bp,
10216- const struct stacktrace_ops *ops, void *data)
10217+static inline unsigned long
10218+print_context_stack(struct thread_info *tinfo,
10219+ unsigned long *stack, unsigned long bp,
10220+ const struct stacktrace_ops *ops, void *data)
10221 {
10222 struct stack_frame *frame = (struct stack_frame *)bp;
10223
10224@@ -174,7 +175,7 @@ static inline unsigned long print_contex
10225 return bp;
10226 }
10227
10228-#define MSG(msg) ops->warning(data, msg)
10229+#define MSG(msg) ops->warning(data, msg)
10230
10231 void dump_trace(struct task_struct *task, struct pt_regs *regs,
10232 unsigned long *stack, unsigned long bp,
10233@@ -185,6 +186,7 @@ void dump_trace(struct task_struct *task
10234
10235 if (!stack) {
10236 unsigned long dummy;
10237+
10238 stack = &dummy;
10239 if (task != current)
10240 stack = (unsigned long *)task->thread.sp;
10241@@ -194,7 +196,7 @@ void dump_trace(struct task_struct *task
10242 if (!bp) {
10243 if (task == current) {
10244 /* Grab bp right from our regs */
10245- asm ("movl %%ebp, %0" : "=r" (bp) : );
10246+ asm("movl %%ebp, %0" : "=r" (bp) :);
10247 } else {
10248 /* bp is the last reg pushed by switch_to */
10249 bp = *(unsigned long *) task->thread.sp;
10250@@ -204,15 +206,18 @@ void dump_trace(struct task_struct *task
10251
10252 while (1) {
10253 struct thread_info *context;
10254+
10255 context = (struct thread_info *)
10256 ((unsigned long)stack & (~(THREAD_SIZE - 1)));
10257 bp = print_context_stack(context, stack, bp, ops, data);
10258- /* Should be after the line below, but somewhere
10259- in early boot context comes out corrupted and we
10260- can't reference it -AK */
10261+ /*
10262+ * Should be after the line below, but somewhere
10263+ * in early boot context comes out corrupted and we
10264+ * can't reference it:
10265+ */
10266 if (ops->stack(data, "IRQ") < 0)
10267 break;
10268- stack = (unsigned long*)context->previous_esp;
10269+ stack = (unsigned long *)context->previous_esp;
10270 if (!stack)
10271 break;
10272 touch_nmi_watchdog();
10273@@ -251,15 +256,15 @@ static void print_trace_address(void *da
10274 }
10275
10276 static const struct stacktrace_ops print_trace_ops = {
10277- .warning = print_trace_warning,
10278- .warning_symbol = print_trace_warning_symbol,
10279- .stack = print_trace_stack,
10280- .address = print_trace_address,
10281+ .warning = print_trace_warning,
10282+ .warning_symbol = print_trace_warning_symbol,
10283+ .stack = print_trace_stack,
10284+ .address = print_trace_address,
10285 };
10286
10287 static void
10288 show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
10289- unsigned long *stack, unsigned long bp, char *log_lvl)
10290+ unsigned long *stack, unsigned long bp, char *log_lvl)
10291 {
10292 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
10293 printk("%s =======================\n", log_lvl);
10294@@ -271,21 +276,22 @@ void show_trace(struct task_struct *task
10295 show_trace_log_lvl(task, regs, stack, bp, "");
10296 }
10297
10298-static void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
10299- unsigned long *sp, unsigned long bp, char *log_lvl)
10300+static void
10301+show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
10302+ unsigned long *sp, unsigned long bp, char *log_lvl)
10303 {
10304 unsigned long *stack;
10305 int i;
10306
10307 if (sp == NULL) {
10308 if (task)
10309- sp = (unsigned long*)task->thread.sp;
10310+ sp = (unsigned long *)task->thread.sp;
10311 else
10312 sp = (unsigned long *)&sp;
10313 }
10314
10315 stack = sp;
10316- for(i = 0; i < kstack_depth_to_print; i++) {
10317+ for (i = 0; i < kstack_depth_to_print; i++) {
10318 if (kstack_end(stack))
10319 break;
10320 if (i && ((i % 8) == 0))
10321@@ -293,6 +299,7 @@ static void show_stack_log_lvl(struct ta
10322 printk("%08lx ", *stack++);
10323 }
10324 printk("\n%sCall Trace:\n", log_lvl);
10325+
10326 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
10327 }
10328
10329@@ -307,8 +314,8 @@ void show_stack(struct task_struct *task
10330 */
10331 void dump_stack(void)
10332 {
10333- unsigned long stack;
10334 unsigned long bp = 0;
10335+ unsigned long stack;
10336
10337 #ifdef CONFIG_FRAME_POINTER
10338 if (!bp)
10339@@ -320,6 +327,7 @@ void dump_stack(void)
10340 init_utsname()->release,
10341 (int)strcspn(init_utsname()->version, " "),
10342 init_utsname()->version);
10343+
10344 show_trace(current, NULL, &stack, bp);
10345 }
10346
10347@@ -331,6 +339,7 @@ void show_registers(struct pt_regs *regs
10348
10349 print_modules();
10350 __show_registers(regs, 0);
10351+
10352 printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)",
10353 TASK_COMM_LEN, current->comm, task_pid_nr(current),
10354 current_thread_info(), current, task_thread_info(current));
10355@@ -339,10 +348,10 @@ void show_registers(struct pt_regs *regs
10356 * time of the fault..
10357 */
10358 if (!user_mode_vm(regs)) {
10359- u8 *ip;
10360 unsigned int code_prologue = code_bytes * 43 / 64;
10361 unsigned int code_len = code_bytes;
10362 unsigned char c;
10363+ u8 *ip;
10364
10365 printk("\n" KERN_EMERG "Stack: ");
10366 show_stack_log_lvl(NULL, regs, &regs->sp, 0, KERN_EMERG);
10367@@ -369,7 +378,7 @@ void show_registers(struct pt_regs *regs
10368 }
10369 }
10370 printk("\n");
10371-}
10372+}
10373
10374 int is_valid_bugaddr(unsigned long ip)
10375 {
10376@@ -385,10 +394,10 @@ int is_valid_bugaddr(unsigned long ip)
10377
10378 static int die_counter;
10379
10380-int __kprobes __die(const char * str, struct pt_regs * regs, long err)
10381+int __kprobes __die(const char *str, struct pt_regs *regs, long err)
10382 {
10383- unsigned long sp;
10384 unsigned short ss;
10385+ unsigned long sp;
10386
10387 printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
10388 #ifdef CONFIG_PREEMPT
10389@@ -403,8 +412,8 @@ int __kprobes __die(const char * str, st
10390 printk("\n");
10391
10392 if (notify_die(DIE_OOPS, str, regs, err,
10393- current->thread.trap_no, SIGSEGV) !=
10394- NOTIFY_STOP) {
10395+ current->thread.trap_no, SIGSEGV) != NOTIFY_STOP) {
10396+
10397 show_registers(regs);
10398 /* Executive summary in case the oops scrolled away */
10399 sp = (unsigned long) (&regs->sp);
10400@@ -416,17 +425,18 @@ int __kprobes __die(const char * str, st
10401 printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
10402 print_symbol("%s", regs->ip);
10403 printk(" SS:ESP %04x:%08lx\n", ss, sp);
10404+
10405 return 0;
10406- } else {
10407- return 1;
10408 }
10409+
10410+ return 1;
10411 }
10412
10413 /*
10414- * This is gone through when something in the kernel has done something bad and
10415- * is about to be terminated.
10416+ * This is gone through when something in the kernel has done something bad
10417+ * and is about to be terminated:
10418 */
10419-void die(const char * str, struct pt_regs * regs, long err)
10420+void die(const char *str, struct pt_regs *regs, long err)
10421 {
10422 static struct {
10423 raw_spinlock_t lock;
10424@@ -448,8 +458,9 @@ void die(const char * str, struct pt_reg
10425 die.lock_owner = smp_processor_id();
10426 die.lock_owner_depth = 0;
10427 bust_spinlocks(1);
10428- } else
10429+ } else {
10430 raw_local_irq_save(flags);
10431+ }
10432
10433 if (++die.lock_owner_depth < 3) {
10434 report_bug(regs->ip, regs);
10435@@ -482,19 +493,20 @@ void die(const char * str, struct pt_reg
10436 do_exit(SIGSEGV);
10437 }
10438
10439-static inline void die_if_kernel(const char * str, struct pt_regs * regs, long err)
10440+static inline void
10441+die_if_kernel(const char *str, struct pt_regs *regs, long err)
10442 {
10443 if (!user_mode_vm(regs))
10444 die(str, regs, err);
10445 }
10446
10447-static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86,
10448- struct pt_regs * regs, long error_code,
10449- siginfo_t *info)
10450+static void __kprobes
10451+do_trap(int trapnr, int signr, char *str, int vm86, struct pt_regs *regs,
10452+ long error_code, siginfo_t *info)
10453 {
10454 struct task_struct *tsk = current;
10455
10456- if (regs->flags & VM_MASK) {
10457+ if (regs->flags & X86_VM_MASK) {
10458 if (vm86)
10459 goto vm86_trap;
10460 goto trap_signal;
10461@@ -503,109 +515,112 @@ static void __kprobes do_trap(int trapnr
10462 if (!user_mode(regs))
10463 goto kernel_trap;
10464
10465- trap_signal: {
10466- /*
10467- * We want error_code and trap_no set for userspace faults and
10468- * kernelspace faults which result in die(), but not
10469- * kernelspace faults which are fixed up. die() gives the
10470- * process no chance to handle the signal and notice the
10471- * kernel fault information, so that won't result in polluting
10472- * the information about previously queued, but not yet
10473- * delivered, faults. See also do_general_protection below.
10474- */
10475- tsk->thread.error_code = error_code;
10476- tsk->thread.trap_no = trapnr;
10477+trap_signal:
10478+ /*
10479+ * We want error_code and trap_no set for userspace faults and
10480+ * kernelspace faults which result in die(), but not
10481+ * kernelspace faults which are fixed up. die() gives the
10482+ * process no chance to handle the signal and notice the
10483+ * kernel fault information, so that won't result in polluting
10484+ * the information about previously queued, but not yet
10485+ * delivered, faults. See also do_general_protection below.
10486+ */
10487+ tsk->thread.error_code = error_code;
10488+ tsk->thread.trap_no = trapnr;
10489
10490- if (info)
10491- force_sig_info(signr, info, tsk);
10492- else
10493- force_sig(signr, tsk);
10494- return;
10495- }
10496+ if (info)
10497+ force_sig_info(signr, info, tsk);
10498+ else
10499+ force_sig(signr, tsk);
10500+ return;
10501
10502- kernel_trap: {
10503- if (!fixup_exception(regs)) {
10504- tsk->thread.error_code = error_code;
10505- tsk->thread.trap_no = trapnr;
10506- die(str, regs, error_code);
10507- }
10508- return;
10509+kernel_trap:
10510+ if (!fixup_exception(regs)) {
10511+ tsk->thread.error_code = error_code;
10512+ tsk->thread.trap_no = trapnr;
10513+ die(str, regs, error_code);
10514 }
10515+ return;
10516
10517- vm86_trap: {
10518- int ret = handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, trapnr);
10519- if (ret) goto trap_signal;
10520- return;
10521- }
10522+vm86_trap:
10523+ if (handle_vm86_trap((struct kernel_vm86_regs *) regs,
10524+ error_code, trapnr))
10525+ goto trap_signal;
10526+ return;
10527 }
10528
10529-#define DO_ERROR(trapnr, signr, str, name) \
10530-void do_##name(struct pt_regs * regs, long error_code) \
10531-{ \
10532- if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
10533- == NOTIFY_STOP) \
10534- return; \
10535- do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \
10536-}
10537-
10538-#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr, irq) \
10539-void do_##name(struct pt_regs * regs, long error_code) \
10540-{ \
10541- siginfo_t info; \
10542- if (irq) \
10543- local_irq_enable(); \
10544- info.si_signo = signr; \
10545- info.si_errno = 0; \
10546- info.si_code = sicode; \
10547- info.si_addr = (void __user *)siaddr; \
10548- if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
10549- == NOTIFY_STOP) \
10550- return; \
10551- do_trap(trapnr, signr, str, 0, regs, error_code, &info); \
10552-}
10553-
10554-#define DO_VM86_ERROR(trapnr, signr, str, name) \
10555-void do_##name(struct pt_regs * regs, long error_code) \
10556-{ \
10557- if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
10558- == NOTIFY_STOP) \
10559- return; \
10560- do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \
10561-}
10562-
10563-#define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
10564-void do_##name(struct pt_regs * regs, long error_code) \
10565-{ \
10566- siginfo_t info; \
10567- info.si_signo = signr; \
10568- info.si_errno = 0; \
10569- info.si_code = sicode; \
10570- info.si_addr = (void __user *)siaddr; \
10571- trace_hardirqs_fixup(); \
10572- if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
10573- == NOTIFY_STOP) \
10574- return; \
10575- do_trap(trapnr, signr, str, 1, regs, error_code, &info); \
10576+#define DO_ERROR(trapnr, signr, str, name) \
10577+void do_##name(struct pt_regs *regs, long error_code) \
10578+{ \
10579+ trace_hardirqs_fixup(); \
10580+ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
10581+ == NOTIFY_STOP) \
10582+ return; \
10583+ do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \
10584+}
10585+
10586+#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr, irq) \
10587+void do_##name(struct pt_regs *regs, long error_code) \
10588+{ \
10589+ siginfo_t info; \
10590+ if (irq) \
10591+ local_irq_enable(); \
10592+ info.si_signo = signr; \
10593+ info.si_errno = 0; \
10594+ info.si_code = sicode; \
10595+ info.si_addr = (void __user *)siaddr; \
10596+ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
10597+ == NOTIFY_STOP) \
10598+ return; \
10599+ do_trap(trapnr, signr, str, 0, regs, error_code, &info); \
10600+}
10601+
10602+#define DO_VM86_ERROR(trapnr, signr, str, name) \
10603+void do_##name(struct pt_regs *regs, long error_code) \
10604+{ \
10605+ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
10606+ == NOTIFY_STOP) \
10607+ return; \
10608+ do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \
10609+}
10610+
10611+#define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
10612+void do_##name(struct pt_regs *regs, long error_code) \
10613+{ \
10614+ siginfo_t info; \
10615+ info.si_signo = signr; \
10616+ info.si_errno = 0; \
10617+ info.si_code = sicode; \
10618+ info.si_addr = (void __user *)siaddr; \
10619+ trace_hardirqs_fixup(); \
10620+ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
10621+ == NOTIFY_STOP) \
10622+ return; \
10623+ do_trap(trapnr, signr, str, 1, regs, error_code, &info); \
10624 }
10625
10626-DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
10627+DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
10628 #ifndef CONFIG_KPROBES
10629-DO_VM86_ERROR( 3, SIGTRAP, "int3", int3)
10630+DO_VM86_ERROR(3, SIGTRAP, "int3", int3)
10631 #endif
10632-DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow)
10633-DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds)
10634-DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0)
10635-DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
10636+DO_VM86_ERROR(4, SIGSEGV, "overflow", overflow)
10637+DO_VM86_ERROR(5, SIGSEGV, "bounds", bounds)
10638+DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0)
10639+DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
10640 DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
10641 DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
10642 DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
10643 DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0)
10644-DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0, 1)
10645+DO_ERROR_INFO(32, SIGILL, "iret exception", iret_error, ILL_BADSTK, 0, 1)
10646
10647 void __kprobes do_general_protection(struct pt_regs * regs,
10648 long error_code)
10649 {
10650- if (regs->flags & VM_MASK)
10651+ struct thread_struct *thread;
10652+
10653+ thread = &current->thread;
10654+
10655+ if (regs->flags & X86_VM_MASK)
10656 goto gp_in_vm86;
10657
10658 if (!user_mode(regs))
10659@@ -613,6 +628,7 @@ void __kprobes do_general_protection(str
10660
10661 current->thread.error_code = error_code;
10662 current->thread.trap_no = 13;
10663+
10664 if (show_unhandled_signals && unhandled_signal(current, SIGSEGV) &&
10665 printk_ratelimit()) {
10666 printk(KERN_INFO
10667@@ -642,22 +658,25 @@ gp_in_kernel:
10668 }
10669 }
10670
10671-static __kprobes void
10672-mem_parity_error(unsigned char reason, struct pt_regs * regs)
10673+static notrace __kprobes void
10674+mem_parity_error(unsigned char reason, struct pt_regs *regs)
10675 {
10676- printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on "
10677- "CPU %d.\n", reason, smp_processor_id());
10678- printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
10679+ printk(KERN_EMERG
10680+ "Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
10681+ reason, smp_processor_id());
10682+
10683+ printk(KERN_EMERG
10684+ "You have some hardware problem, likely on the PCI bus.\n");
10685
10686 #if defined(CONFIG_EDAC)
10687- if(edac_handler_set()) {
10688+ if (edac_handler_set()) {
10689 edac_atomic_assert_error();
10690 return;
10691 }
10692 #endif
10693
10694 if (panic_on_unrecovered_nmi)
10695- panic("NMI: Not continuing");
10696+ panic("NMI: Not continuing");
10697
10698 printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
10699
10700@@ -665,8 +684,8 @@ mem_parity_error(unsigned char reason, s
10701 clear_mem_error(reason);
10702 }
10703
10704-static __kprobes void
10705-io_check_error(unsigned char reason, struct pt_regs * regs)
10706+static notrace __kprobes void
10707+io_check_error(unsigned char reason, struct pt_regs *regs)
10708 {
10709 printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n");
10710 show_registers(regs);
10711@@ -675,38 +694,43 @@ io_check_error(unsigned char reason, str
10712 clear_io_check_error(reason);
10713 }
10714
10715-static __kprobes void
10716-unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
10717+static notrace __kprobes void
10718+unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
10719 {
10720+ if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
10721+ return;
10722 #ifdef CONFIG_MCA
10723- /* Might actually be able to figure out what the guilty party
10724- * is. */
10725- if( MCA_bus ) {
10726+ /*
10727+ * Might actually be able to figure out what the guilty party
10728+ * is:
10729+ */
10730+ if (MCA_bus) {
10731 mca_handle_nmi();
10732 return;
10733 }
10734 #endif
10735- printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on "
10736- "CPU %d.\n", reason, smp_processor_id());
10737+ printk(KERN_EMERG
10738+ "Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
10739+ reason, smp_processor_id());
10740+
10741 printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n");
10742 if (panic_on_unrecovered_nmi)
10743- panic("NMI: Not continuing");
10744+ panic("NMI: Not continuing");
10745
10746 printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
10747 }
10748
10749 static DEFINE_SPINLOCK(nmi_print_lock);
10750
10751-void __kprobes die_nmi(struct pt_regs *regs, const char *msg)
10752+void notrace __kprobes die_nmi(struct pt_regs *regs, const char *msg)
10753 {
10754- if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 2, SIGINT) ==
10755- NOTIFY_STOP)
10756+ if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 2, SIGINT) == NOTIFY_STOP)
10757 return;
10758
10759 spin_lock(&nmi_print_lock);
10760 /*
10761 * We are in trouble anyway, lets at least try
10762- * to get a message out.
10763+ * to get a message out:
10764 */
10765 bust_spinlocks(1);
10766 printk(KERN_EMERG "%s", msg);
10767@@ -717,9 +741,10 @@ void __kprobes die_nmi(struct pt_regs *r
10768 spin_unlock(&nmi_print_lock);
10769 bust_spinlocks(0);
10770
10771- /* If we are in kernel we are probably nested up pretty bad
10772- * and might aswell get out now while we still can.
10773- */
10774+ /*
10775+ * If we are in kernel we are probably nested up pretty bad
10776+ * and might aswell get out now while we still can:
10777+ */
10778 if (!user_mode_vm(regs)) {
10779 current->thread.trap_no = 2;
10780 crash_kexec(regs);
10781@@ -728,14 +753,14 @@ void __kprobes die_nmi(struct pt_regs *r
10782 do_exit(SIGSEGV);
10783 }
10784
10785-static __kprobes void default_do_nmi(struct pt_regs * regs)
10786+static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
10787 {
10788 unsigned char reason = 0;
10789
10790- /* Only the BSP gets external NMIs from the system. */
10791+ /* Only the BSP gets external NMIs from the system: */
10792 if (!smp_processor_id())
10793 reason = get_nmi_reason();
10794-
10795+
10796 if (!(reason & 0xc0)) {
10797 if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
10798 == NOTIFY_STOP)
10799@@ -748,8 +773,10 @@ static __kprobes void default_do_nmi(str
10800 if (nmi_watchdog_tick(regs, reason))
10801 return;
10802 if (!do_nmi_callback(regs, smp_processor_id()))
10803-#endif
10804 unknown_nmi_error(reason, regs);
10805+#else
10806+ unknown_nmi_error(reason, regs);
10807+#endif
10808
10809 return;
10810 }
10811@@ -761,14 +788,14 @@ static __kprobes void default_do_nmi(str
10812 io_check_error(reason, regs);
10813 /*
10814 * Reassert NMI in case it became active meanwhile
10815- * as it's edge-triggered.
10816+ * as it's edge-triggered:
10817 */
10818 reassert_nmi();
10819 }
10820
10821 static int ignore_nmis;
10822
10823-__kprobes void do_nmi(struct pt_regs * regs, long error_code)
10824+notrace __kprobes void do_nmi(struct pt_regs *regs, long error_code)
10825 {
10826 int cpu;
10827
10828@@ -804,9 +831,12 @@ void __kprobes do_int3(struct pt_regs *r
10829 if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
10830 == NOTIFY_STOP)
10831 return;
10832- /* This is an interrupt gate, because kprobes wants interrupts
10833- disabled. Normal trap handlers don't. */
10834+ /*
10835+ * This is an interrupt gate, because kprobes wants interrupts
10836+ * disabled. Normal trap handlers don't.
10837+ */
10838 restore_interrupts(regs);
10839+
10840 do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL);
10841 }
10842 #endif
10843@@ -821,7 +851,7 @@ void __kprobes do_int3(struct pt_regs *r
10844 * from user space. Such code must not hold kernel locks (since it
10845 * can equally take a page fault), therefore it is safe to call
10846 * force_sig_info even though that claims and releases locks.
10847- *
10848+ *
10849 * Code in ./signal.c ensures that the debug control register
10850 * is restored before we deliver any signal, and therefore that
10851 * user code runs with the correct debug control register even though
10852@@ -833,10 +863,10 @@ void __kprobes do_int3(struct pt_regs *r
10853 * find every occurrence of the TF bit that could be saved away even
10854 * by user code)
10855 */
10856-void __kprobes do_debug(struct pt_regs * regs, long error_code)
10857+void __kprobes do_debug(struct pt_regs *regs, long error_code)
10858 {
10859- unsigned int condition;
10860 struct task_struct *tsk = current;
10861+ unsigned int condition;
10862
10863 trace_hardirqs_fixup();
10864
10865@@ -861,7 +891,7 @@ void __kprobes do_debug(struct pt_regs *
10866 goto clear_dr7;
10867 }
10868
10869- if (regs->flags & VM_MASK)
10870+ if (regs->flags & X86_VM_MASK)
10871 goto debug_vm86;
10872
10873 /* Save debug status register where ptrace can see it */
10874@@ -884,7 +914,8 @@ void __kprobes do_debug(struct pt_regs *
10875 /* Ok, finally something we can handle */
10876 send_sigtrap(tsk, regs, error_code);
10877
10878- /* Disable additional traps. They'll be re-enabled when
10879+ /*
10880+ * Disable additional traps. They'll be re-enabled when
10881 * the signal is delivered.
10882 */
10883 clear_dr7:
10884@@ -897,7 +928,7 @@ debug_vm86:
10885
10886 clear_TF_reenable:
10887 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
10888- regs->flags &= ~TF_MASK;
10889+ regs->flags &= ~X86_EFLAGS_TF;
10890 return;
10891 }
10892
10893@@ -908,9 +939,10 @@ clear_TF_reenable:
10894 */
10895 void math_error(void __user *ip)
10896 {
10897- struct task_struct * task;
10898+ struct task_struct *task;
10899+ unsigned short cwd;
10900+ unsigned short swd;
10901 siginfo_t info;
10902- unsigned short cwd, swd;
10903
10904 /*
10905 * Save the info for the exception handler and clear the error.
10906@@ -936,36 +968,36 @@ void math_error(void __user *ip)
10907 cwd = get_fpu_cwd(task);
10908 swd = get_fpu_swd(task);
10909 switch (swd & ~cwd & 0x3f) {
10910- case 0x000: /* No unmasked exception */
10911- return;
10912- default: /* Multiple exceptions */
10913- break;
10914- case 0x001: /* Invalid Op */
10915- /*
10916- * swd & 0x240 == 0x040: Stack Underflow
10917- * swd & 0x240 == 0x240: Stack Overflow
10918- * User must clear the SF bit (0x40) if set
10919- */
10920- info.si_code = FPE_FLTINV;
10921- break;
10922- case 0x002: /* Denormalize */
10923- case 0x010: /* Underflow */
10924- info.si_code = FPE_FLTUND;
10925- break;
10926- case 0x004: /* Zero Divide */
10927- info.si_code = FPE_FLTDIV;
10928- break;
10929- case 0x008: /* Overflow */
10930- info.si_code = FPE_FLTOVF;
10931- break;
10932- case 0x020: /* Precision */
10933- info.si_code = FPE_FLTRES;
10934- break;
10935+ case 0x000: /* No unmasked exception */
10936+ return;
10937+ default: /* Multiple exceptions */
10938+ break;
10939+ case 0x001: /* Invalid Op */
10940+ /*
10941+ * swd & 0x240 == 0x040: Stack Underflow
10942+ * swd & 0x240 == 0x240: Stack Overflow
10943+ * User must clear the SF bit (0x40) if set
10944+ */
10945+ info.si_code = FPE_FLTINV;
10946+ break;
10947+ case 0x002: /* Denormalize */
10948+ case 0x010: /* Underflow */
10949+ info.si_code = FPE_FLTUND;
10950+ break;
10951+ case 0x004: /* Zero Divide */
10952+ info.si_code = FPE_FLTDIV;
10953+ break;
10954+ case 0x008: /* Overflow */
10955+ info.si_code = FPE_FLTOVF;
10956+ break;
10957+ case 0x020: /* Precision */
10958+ info.si_code = FPE_FLTRES;
10959+ break;
10960 }
10961 force_sig_info(SIGFPE, &info, task);
10962 }
10963
10964-void do_coprocessor_error(struct pt_regs * regs, long error_code)
10965+void do_coprocessor_error(struct pt_regs *regs, long error_code)
10966 {
10967 ignore_fpu_irq = 1;
10968 math_error((void __user *)regs->ip);
10969@@ -973,9 +1005,9 @@ void do_coprocessor_error(struct pt_regs
10970
10971 static void simd_math_error(void __user *ip)
10972 {
10973- struct task_struct * task;
10974- siginfo_t info;
10975+ struct task_struct *task;
10976 unsigned short mxcsr;
10977+ siginfo_t info;
10978
10979 /*
10980 * Save the info for the exception handler and clear the error.
10981@@ -996,84 +1028,82 @@ static void simd_math_error(void __user
10982 */
10983 mxcsr = get_fpu_mxcsr(task);
10984 switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
10985- case 0x000:
10986- default:
10987- break;
10988- case 0x001: /* Invalid Op */
10989- info.si_code = FPE_FLTINV;
10990- break;
10991- case 0x002: /* Denormalize */
10992- case 0x010: /* Underflow */
10993- info.si_code = FPE_FLTUND;
10994- break;
10995- case 0x004: /* Zero Divide */
10996- info.si_code = FPE_FLTDIV;
10997- break;
10998- case 0x008: /* Overflow */
10999- info.si_code = FPE_FLTOVF;
11000- break;
11001- case 0x020: /* Precision */
11002- info.si_code = FPE_FLTRES;
11003- break;
11004+ case 0x000:
11005+ default:
11006+ break;
11007+ case 0x001: /* Invalid Op */
11008+ info.si_code = FPE_FLTINV;
11009+ break;
11010+ case 0x002: /* Denormalize */
11011+ case 0x010: /* Underflow */
11012+ info.si_code = FPE_FLTUND;
11013+ break;
11014+ case 0x004: /* Zero Divide */
11015+ info.si_code = FPE_FLTDIV;
11016+ break;
11017+ case 0x008: /* Overflow */
11018+ info.si_code = FPE_FLTOVF;
11019+ break;
11020+ case 0x020: /* Precision */
11021+ info.si_code = FPE_FLTRES;
11022+ break;
11023 }
11024 force_sig_info(SIGFPE, &info, task);
11025 }
11026
11027-void do_simd_coprocessor_error(struct pt_regs * regs,
11028- long error_code)
11029+void do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
11030 {
11031 if (cpu_has_xmm) {
11032 /* Handle SIMD FPU exceptions on PIII+ processors. */
11033 ignore_fpu_irq = 1;
11034 simd_math_error((void __user *)regs->ip);
11035- } else {
11036- /*
11037- * Handle strange cache flush from user space exception
11038- * in all other cases. This is undocumented behaviour.
11039- */
11040- if (regs->flags & VM_MASK) {
11041- handle_vm86_fault((struct kernel_vm86_regs *)regs,
11042- error_code);
11043- return;
11044- }
11045- current->thread.trap_no = 19;
11046- current->thread.error_code = error_code;
11047- die_if_kernel("cache flush denied", regs, error_code);
11048- force_sig(SIGSEGV, current);
11049+ return;
11050+ }
11051+ /*
11052+ * Handle strange cache flush from user space exception
11053+ * in all other cases. This is undocumented behaviour.
11054+ */
11055+ if (regs->flags & X86_VM_MASK) {
11056+ handle_vm86_fault((struct kernel_vm86_regs *)regs, error_code);
11057+ return;
11058 }
11059+ current->thread.trap_no = 19;
11060+ current->thread.error_code = error_code;
11061+ die_if_kernel("cache flush denied", regs, error_code);
11062+ force_sig(SIGSEGV, current);
11063 }
11064
11065 #ifndef CONFIG_XEN
11066-void do_spurious_interrupt_bug(struct pt_regs * regs,
11067- long error_code)
11068+void do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
11069 {
11070 #if 0
11071 /* No need to warn about this any longer. */
11072- printk("Ignoring P6 Local APIC Spurious Interrupt Bug...\n");
11073+ printk(KERN_INFO "Ignoring P6 Local APIC Spurious Interrupt Bug...\n");
11074 #endif
11075 }
11076
11077-unsigned long patch_espfix_desc(unsigned long uesp,
11078- unsigned long kesp)
11079+unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp)
11080 {
11081 struct desc_struct *gdt = __get_cpu_var(gdt_page).gdt;
11082 unsigned long base = (kesp - uesp) & -THREAD_SIZE;
11083 unsigned long new_kesp = kesp - base;
11084 unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT;
11085 __u64 desc = *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS];
11086+
11087 /* Set up base for espfix segment */
11088- desc &= 0x00f0ff0000000000ULL;
11089- desc |= ((((__u64)base) << 16) & 0x000000ffffff0000ULL) |
11090+ desc &= 0x00f0ff0000000000ULL;
11091+ desc |= ((((__u64)base) << 16) & 0x000000ffffff0000ULL) |
11092 ((((__u64)base) << 32) & 0xff00000000000000ULL) |
11093 ((((__u64)lim_pages) << 32) & 0x000f000000000000ULL) |
11094 (lim_pages & 0xffff);
11095 *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS] = desc;
11096+
11097 return new_kesp;
11098 }
11099 #endif
11100
11101 /*
11102- * 'math_state_restore()' saves the current math information in the
11103+ * 'math_state_restore()' saves the current math information in the
11104 * old math state array, and gets the new ones from the current task
11105 *
11106 * Careful.. There are problems with IBM-designed IRQ13 behaviour.
11107@@ -1087,9 +1117,22 @@ asmlinkage void math_state_restore(void)
11108 struct thread_info *thread = current_thread_info();
11109 struct task_struct *tsk = thread->task;
11110
11111+ if (!tsk_used_math(tsk)) {
11112+ local_irq_enable();
11113+ /*
11114+ * does a slab alloc which can sleep
11115+ */
11116+ if (init_fpu(tsk)) {
11117+ /*
11118+ * ran out of memory!
11119+ */
11120+ do_group_exit(SIGKILL);
11121+ return;
11122+ }
11123+ local_irq_disable();
11124+ }
11125+
11126 /* NB. 'clts' is done for us by Xen during virtual trap. */
11127- if (!tsk_used_math(tsk))
11128- init_fpu(tsk);
11129 restore_fpu(tsk);
11130 thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */
11131 tsk->fpu_counter++;
11132@@ -1100,15 +1143,15 @@ EXPORT_SYMBOL_GPL(math_state_restore);
11133
11134 asmlinkage void math_emulate(long arg)
11135 {
11136- printk(KERN_EMERG "math-emulation not enabled and no coprocessor found.\n");
11137- printk(KERN_EMERG "killing %s.\n",current->comm);
11138- force_sig(SIGFPE,current);
11139+ printk(KERN_EMERG
11140+ "math-emulation not enabled and no coprocessor found.\n");
11141+ printk(KERN_EMERG "killing %s.\n", current->comm);
11142+ force_sig(SIGFPE, current);
11143 schedule();
11144 }
11145
11146 #endif /* CONFIG_MATH_EMULATION */
11147
11148-
11149 /*
11150 * NB. All these are "trap gates" (i.e. events_mask isn't set) except
11151 * for those that specify <dpl>|4 in the second field.
11152@@ -1146,25 +1189,21 @@ void __init trap_init(void)
11153 if (ret)
11154 printk("HYPERVISOR_set_trap_table failed: error %d\n", ret);
11155
11156- /*
11157- * Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned.
11158- * Generate a build-time error if the alignment is wrong.
11159- */
11160- BUILD_BUG_ON(offsetof(struct task_struct, thread.i387.fxsave) & 15);
11161 if (cpu_has_fxsr) {
11162 printk(KERN_INFO "Enabling fast FPU save and restore... ");
11163 set_in_cr4(X86_CR4_OSFXSR);
11164 printk("done.\n");
11165 }
11166 if (cpu_has_xmm) {
11167- printk(KERN_INFO "Enabling unmasked SIMD FPU exception "
11168- "support... ");
11169+ printk(KERN_INFO
11170+ "Enabling unmasked SIMD FPU exception support... ");
11171 set_in_cr4(X86_CR4_OSXMMEXCPT);
11172 printk("done.\n");
11173 }
11174
11175+ init_thread_xstate();
11176 /*
11177- * Should be a barrier for any external CPU state.
11178+ * Should be a barrier for any external CPU state:
11179 */
11180 cpu_init();
11181 }
11182@@ -1183,6 +1222,7 @@ void __cpuinit smp_trap_init(trap_info_t
11183 static int __init kstack_setup(char *s)
11184 {
11185 kstack_depth_to_print = simple_strtoul(s, NULL, 0);
11186+
11187 return 1;
11188 }
11189 __setup("kstack=", kstack_setup);
11190--- sle11-2009-05-14.orig/arch/x86/kernel/traps_64-xen.c 2009-03-16 16:33:40.000000000 +0100
11191+++ sle11-2009-05-14/arch/x86/kernel/traps_64-xen.c 2009-03-16 16:38:05.000000000 +0100
11192@@ -33,6 +33,8 @@
11193 #include <linux/kdebug.h>
11194 #include <linux/utsname.h>
11195
11196+#include <mach_traps.h>
11197+
11198 #if defined(CONFIG_EDAC)
11199 #include <linux/edac.h>
11200 #endif
11201@@ -601,10 +603,16 @@ void die(const char * str, struct pt_reg
11202 }
11203
11204 #if defined(CONFIG_X86_LOCAL_APIC) || defined(CONFIG_SYSCTL)
11205-void __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic)
11206+notrace __kprobes void
11207+die_nmi(char *str, struct pt_regs *regs, int do_panic)
11208 {
11209- unsigned long flags = oops_begin();
11210+ unsigned long flags;
11211+
11212+ if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) ==
11213+ NOTIFY_STOP)
11214+ return;
11215
11216+ flags = oops_begin();
11217 /*
11218 * We are in trouble anyway, lets at least try
11219 * to get a message out.
11220@@ -769,7 +777,7 @@ asmlinkage void __kprobes do_general_pro
11221 die("general protection fault", regs, error_code);
11222 }
11223
11224-static __kprobes void
11225+static notrace __kprobes void
11226 mem_parity_error(unsigned char reason, struct pt_regs * regs)
11227 {
11228 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
11229@@ -792,7 +800,7 @@ mem_parity_error(unsigned char reason, s
11230 clear_mem_error(reason);
11231 }
11232
11233-static __kprobes void
11234+static notrace __kprobes void
11235 io_check_error(unsigned char reason, struct pt_regs * regs)
11236 {
11237 printk("NMI: IOCK error (debug interrupt?)\n");
11238@@ -802,9 +810,11 @@ io_check_error(unsigned char reason, str
11239 clear_io_check_error(reason);
11240 }
11241
11242-static __kprobes void
11243+static notrace __kprobes void
11244 unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
11245 {
11246+ if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
11247+ return;
11248 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
11249 reason);
11250 printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n");
11251@@ -817,7 +827,7 @@ unknown_nmi_error(unsigned char reason,
11252
11253 /* Runs on IST stack. This code must keep interrupts off all the time.
11254 Nested NMIs are prevented by the CPU. */
11255-asmlinkage __kprobes void default_do_nmi(struct pt_regs *regs)
11256+asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs)
11257 {
11258 unsigned char reason = 0;
11259 int cpu;
11260@@ -1117,11 +1127,25 @@ asmlinkage void __attribute__((weak)) mc
11261 asmlinkage void math_state_restore(void)
11262 {
11263 struct task_struct *me = current;
11264+
11265+ if (!used_math()) {
11266+ local_irq_enable();
11267+ /*
11268+ * does a slab alloc which can sleep
11269+ */
11270+ if (init_fpu(me)) {
11271+ /*
11272+ * ran out of memory!
11273+ */
11274+ do_group_exit(SIGKILL);
11275+ return;
11276+ }
11277+ local_irq_disable();
11278+ }
11279+
11280 /* clts(); */ /* 'clts' is done for us by Xen during virtual trap. */
11281
11282- if (!used_math())
11283- init_fpu(me);
11284- restore_fpu_checking(&me->thread.i387.fxsave);
11285+ restore_fpu_checking(&me->thread.xstate->fxsave);
11286 task_thread_info(me)->status |= TS_USEDFPU;
11287 me->fpu_counter++;
11288 }
11289@@ -1168,6 +1192,10 @@ void __init trap_init(void)
11290 printk("HYPERVISOR_set_trap_table failed: error %d\n", ret);
11291
11292 /*
11293+ * initialize the per thread extended state:
11294+ */
11295+ init_thread_xstate();
11296+ /*
11297 * Should be a barrier for any external CPU state.
11298 */
11299 cpu_init();
11300--- sle11-2009-05-14.orig/arch/x86/kernel/vsyscall_64-xen.c 2009-03-16 16:33:40.000000000 +0100
11301+++ sle11-2009-05-14/arch/x86/kernel/vsyscall_64-xen.c 2009-03-16 16:38:05.000000000 +0100
11302@@ -216,7 +216,7 @@ vgetcpu(unsigned *cpu, unsigned *node, s
11303 return 0;
11304 }
11305
11306-long __vsyscall(3) venosys_1(void)
11307+static long __vsyscall(3) venosys_1(void)
11308 {
11309 return -ENOSYS;
11310 }
11311--- sle11-2009-05-14.orig/arch/x86/mm/fault-xen.c 2009-03-16 16:33:40.000000000 +0100
11312+++ sle11-2009-05-14/arch/x86/mm/fault-xen.c 2009-03-16 16:38:05.000000000 +0100
11313@@ -510,6 +510,11 @@ static int vmalloc_fault(unsigned long a
11314 unsigned long pgd_paddr;
11315 pmd_t *pmd_k;
11316 pte_t *pte_k;
11317+
11318+ /* Make sure we are in vmalloc area */
11319+ if (!(address >= VMALLOC_START && address < VMALLOC_END))
11320+ return -1;
11321+
11322 /*
11323 * Synchronize this task's top level page-table
11324 * with the 'reference' page table.
11325@@ -670,7 +675,7 @@ void __kprobes do_page_fault(struct pt_r
11326 #ifdef CONFIG_X86_32
11327 /* It's safe to allow irq's after cr2 has been saved and the vmalloc
11328 fault has been handled. */
11329- if (regs->flags & (X86_EFLAGS_IF|VM_MASK))
11330+ if (regs->flags & (X86_EFLAGS_IF | X86_VM_MASK))
11331 local_irq_enable();
11332
11333 /*
11334@@ -1017,9 +1022,5 @@ void vmalloc_sync_all(void)
11335 if (address == start)
11336 start = address + PGDIR_SIZE;
11337 }
11338- /* Check that there is no need to do the same for the modules area. */
11339- BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
11340- BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
11341- (__START_KERNEL & PGDIR_MASK)));
11342 #endif
11343 }
11344--- sle11-2009-05-14.orig/arch/x86/mm/highmem_32-xen.c 2009-03-16 16:33:40.000000000 +0100
11345+++ sle11-2009-05-14/arch/x86/mm/highmem_32-xen.c 2009-03-16 16:38:05.000000000 +0100
11346@@ -200,6 +200,8 @@ EXPORT_SYMBOL(kmap);
11347 EXPORT_SYMBOL(kunmap);
11348 EXPORT_SYMBOL(kmap_atomic);
11349 EXPORT_SYMBOL(kunmap_atomic);
11350+#ifdef CONFIG_HIGHPTE
11351 EXPORT_SYMBOL(kmap_atomic_to_page);
11352+#endif
11353 EXPORT_SYMBOL(clear_highpage);
11354 EXPORT_SYMBOL(copy_highpage);
11355--- sle11-2009-05-14.orig/arch/x86/mm/init_32-xen.c 2009-03-16 16:33:40.000000000 +0100
11356+++ sle11-2009-05-14/arch/x86/mm/init_32-xen.c 2009-03-16 16:38:05.000000000 +0100
11357@@ -1,5 +1,4 @@
11358 /*
11359- * linux/arch/i386/mm/init.c
11360 *
11361 * Copyright (C) 1995 Linus Torvalds
11362 *
11363@@ -22,6 +21,7 @@
11364 #include <linux/init.h>
11365 #include <linux/highmem.h>
11366 #include <linux/pagemap.h>
11367+#include <linux/pci.h>
11368 #include <linux/pfn.h>
11369 #include <linux/poison.h>
11370 #include <linux/bootmem.h>
11371@@ -54,6 +54,8 @@
11372
11373 unsigned int __VMALLOC_RESERVE = 128 << 20;
11374
11375+unsigned long max_pfn_mapped;
11376+
11377 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
11378 unsigned long highstart_pfn, highend_pfn;
11379
11380@@ -73,7 +75,7 @@ static pmd_t * __init one_md_table_init(
11381 if (!(__pgd_val(*pgd) & _PAGE_PRESENT)) {
11382 pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
11383
11384- paravirt_alloc_pd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
11385+ paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
11386 make_lowmem_page_readonly(pmd_table, XENFEAT_writable_page_tables);
11387 set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
11388 pud = pud_offset(pgd, 0);
11389@@ -107,7 +109,7 @@ static pte_t * __init one_page_table_ini
11390 (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE);
11391 }
11392
11393- paravirt_alloc_pt(&init_mm, __pa(page_table) >> PAGE_SHIFT);
11394+ paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT);
11395 make_lowmem_page_readonly(page_table,
11396 XENFEAT_writable_page_tables);
11397 set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
11398@@ -209,8 +211,13 @@ static void __init kernel_physical_mappi
11399 /*
11400 * Map with big pages if possible, otherwise
11401 * create normal page tables:
11402+ *
11403+ * Don't use a large page for the first 2/4MB of memory
11404+ * because there are often fixed size MTRRs in there
11405+ * and overlapping MTRRs into large pages can cause
11406+ * slowdowns.
11407 */
11408- if (cpu_has_pse) {
11409+ if (cpu_has_pse && !(pgd_idx == 0 && pmd_idx == 0)) {
11410 unsigned int addr2;
11411 pgprot_t prot = PAGE_KERNEL_LARGE;
11412
11413@@ -224,6 +231,7 @@ static void __init kernel_physical_mappi
11414 set_pmd(pmd, pfn_pmd(pfn, prot));
11415
11416 pfn += PTRS_PER_PTE;
11417+ max_pfn_mapped = pfn;
11418 continue;
11419 }
11420 pte = one_page_table_init(pmd);
11421@@ -241,6 +249,7 @@ static void __init kernel_physical_mappi
11422
11423 set_pte(pte, pfn_pte(pfn, prot));
11424 }
11425+ max_pfn_mapped = pfn;
11426 pte_ofs = 0;
11427 }
11428 pmd_idx = 0;
11429@@ -262,6 +271,25 @@ static inline int page_kills_ppro(unsign
11430
11431 #endif
11432
11433+/*
11434+ * devmem_is_allowed() checks to see if /dev/mem access to a certain address
11435+ * is valid. The argument is a physical page number.
11436+ *
11437+ *
11438+ * On x86, access has to be given to the first megabyte of ram because that area
11439+ * contains bios code and data regions used by X and dosemu and similar apps.
11440+ * Access has to be given to non-kernel-ram areas as well, these contain the PCI
11441+ * mmio resources as well as potential bios/acpi data regions.
11442+ */
11443+int devmem_is_allowed(unsigned long pagenr)
11444+{
11445+ if (pagenr <= 256)
11446+ return 1;
11447+ if (mfn_to_local_pfn(pagenr) >= max_pfn)
11448+ return 1;
11449+ return 0;
11450+}
11451+
11452 #ifdef CONFIG_HIGHMEM
11453 pte_t *kmap_pte;
11454 pgprot_t kmap_prot;
11455@@ -303,48 +331,18 @@ static void __init permanent_kmaps_init(
11456 pkmap_page_table = pte;
11457 }
11458
11459-static void __meminit free_new_highpage(struct page *page, int pfn)
11460-{
11461- init_page_count(page);
11462- if (pfn < xen_start_info->nr_pages)
11463- __free_page(page);
11464- totalhigh_pages++;
11465-}
11466-
11467 void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro)
11468 {
11469 if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) {
11470 ClearPageReserved(page);
11471- free_new_highpage(page, pfn);
11472+ init_page_count(page);
11473+ if (pfn < xen_start_info->nr_pages)
11474+ __free_page(page);
11475+ totalhigh_pages++;
11476 } else
11477 SetPageReserved(page);
11478 }
11479
11480-static int __meminit
11481-add_one_highpage_hotplug(struct page *page, unsigned long pfn)
11482-{
11483- free_new_highpage(page, pfn);
11484- totalram_pages++;
11485-#ifdef CONFIG_FLATMEM
11486- max_mapnr = max(pfn, max_mapnr);
11487-#endif
11488- num_physpages++;
11489-
11490- return 0;
11491-}
11492-
11493-/*
11494- * Not currently handling the NUMA case.
11495- * Assuming single node and all memory that
11496- * has been added dynamically that would be
11497- * onlined here is in HIGHMEM.
11498- */
11499-void __meminit online_page(struct page *page)
11500-{
11501- ClearPageReserved(page);
11502- add_one_highpage_hotplug(page, page_to_pfn(page));
11503-}
11504-
11505 #ifndef CONFIG_NUMA
11506 static void __init set_highmem_pages_init(int bad_ppro)
11507 {
11508@@ -459,15 +457,13 @@ void zap_low_mappings(void)
11509 {
11510 int i;
11511
11512- save_pg_dir();
11513-
11514 /*
11515 * Zap initial low-memory mappings.
11516 *
11517 * Note that "pgd_clear()" doesn't do it for
11518 * us, because pgd_clear() is a no-op on i386.
11519 */
11520- for (i = 0; i < USER_PTRS_PER_PGD; i++) {
11521+ for (i = 0; i < KERNEL_PGD_BOUNDARY; i++) {
11522 #if defined(CONFIG_X86_PAE) && !defined(CONFIG_XEN)
11523 set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page)));
11524 #else
11525@@ -572,9 +568,9 @@ void __init paging_init(void)
11526
11527 /*
11528 * Test if the WP bit works in supervisor mode. It isn't supported on 386's
11529- * and also on some strange 486's (NexGen etc.). All 586+'s are OK. This
11530- * used to involve black magic jumps to work around some nasty CPU bugs,
11531- * but fortunately the switch to using exceptions got rid of all that.
11532+ * and also on some strange 486's. All 586+'s are OK. This used to involve
11533+ * black magic jumps to work around some nasty CPU bugs, but fortunately the
11534+ * switch to using exceptions got rid of all that.
11535 */
11536 static void __init test_wp_bit(void)
11537 {
11538@@ -605,9 +601,7 @@ void __init mem_init(void)
11539 int tmp, bad_ppro;
11540 unsigned long pfn;
11541
11542-#if defined(CONFIG_SWIOTLB)
11543- swiotlb_init();
11544-#endif
11545+ pci_iommu_alloc();
11546
11547 #ifdef CONFIG_FLATMEM
11548 BUG_ON(!mem_map);
11549@@ -710,16 +704,8 @@ void __init mem_init(void)
11550 test_wp_bit();
11551
11552 cpa_init();
11553-
11554- /*
11555- * Subtle. SMP is doing it's boot stuff late (because it has to
11556- * fork idle threads) - but it also needs low mappings for the
11557- * protected-mode entry to work. We zap these entries only after
11558- * the WP-bit has been tested.
11559- */
11560-#ifndef CONFIG_SMP
11561+ save_pg_dir();
11562 zap_low_mappings();
11563-#endif
11564
11565 SetPagePinned(virt_to_page(init_mm.pgd));
11566 }
11567@@ -769,25 +755,17 @@ void mark_rodata_ro(void)
11568 unsigned long start = PFN_ALIGN(_text);
11569 unsigned long size = PFN_ALIGN(_etext) - start;
11570
11571-#ifndef CONFIG_KPROBES
11572-#ifdef CONFIG_HOTPLUG_CPU
11573- /* It must still be possible to apply SMP alternatives. */
11574- if (num_possible_cpus() <= 1)
11575-#endif
11576- {
11577- set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
11578- printk(KERN_INFO "Write protecting the kernel text: %luk\n",
11579- size >> 10);
11580+ set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
11581+ printk(KERN_INFO "Write protecting the kernel text: %luk\n",
11582+ size >> 10);
11583
11584 #ifdef CONFIG_CPA_DEBUG
11585- printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n",
11586- start, start+size);
11587- set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT);
11588+ printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n",
11589+ start, start+size);
11590+ set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT);
11591
11592- printk(KERN_INFO "Testing CPA: write protecting again\n");
11593- set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
11594-#endif
11595- }
11596+ printk(KERN_INFO "Testing CPA: write protecting again\n");
11597+ set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
11598 #endif
11599 start += size;
11600 size = (unsigned long)__end_rodata - start;
11601--- sle11-2009-05-14.orig/arch/x86/mm/init_64-xen.c 2009-03-16 16:33:40.000000000 +0100
11602+++ sle11-2009-05-14/arch/x86/mm/init_64-xen.c 2009-03-16 16:38:05.000000000 +0100
11603@@ -52,9 +52,6 @@
11604
11605 #include <xen/features.h>
11606
11607-const struct dma_mapping_ops *dma_ops;
11608-EXPORT_SYMBOL(dma_ops);
11609-
11610 #if CONFIG_XEN_COMPAT <= 0x030002
11611 unsigned int __kernel_page_user;
11612 EXPORT_SYMBOL(__kernel_page_user);
11613@@ -68,6 +65,28 @@ extern unsigned long start_pfn;
11614 extern pmd_t level2_fixmap_pgt[PTRS_PER_PMD];
11615 extern pte_t level1_fixmap_pgt[PTRS_PER_PTE];
11616
11617+#ifndef CONFIG_XEN
11618+int direct_gbpages __meminitdata
11619+#ifdef CONFIG_DIRECT_GBPAGES
11620+ = 1
11621+#endif
11622+;
11623+
11624+static int __init parse_direct_gbpages_off(char *arg)
11625+{
11626+ direct_gbpages = 0;
11627+ return 0;
11628+}
11629+early_param("nogbpages", parse_direct_gbpages_off);
11630+
11631+static int __init parse_direct_gbpages_on(char *arg)
11632+{
11633+ direct_gbpages = 1;
11634+ return 0;
11635+}
11636+early_param("gbpages", parse_direct_gbpages_on);
11637+#endif
11638+
11639 /*
11640 * Use this until direct mapping is established, i.e. before __va() is
11641 * available in init_memory_mapping().
11642@@ -135,9 +154,6 @@ void show_mem(void)
11643
11644 printk(KERN_INFO "Mem-info:\n");
11645 show_free_areas();
11646- printk(KERN_INFO "Free swap: %6ldkB\n",
11647- nr_swap_pages << (PAGE_SHIFT-10));
11648-
11649 for_each_online_pgdat(pgdat) {
11650 for (i = 0; i < pgdat->node_spanned_pages; ++i) {
11651 /*
11652@@ -328,7 +344,7 @@ void __init cleanup_highmap(void)
11653 pmd_t *last_pmd = pmd + PTRS_PER_PMD;
11654
11655 for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) {
11656- if (!pmd_present(*pmd))
11657+ if (pmd_none(*pmd))
11658 continue;
11659 if (vaddr < (unsigned long) _text || vaddr > end)
11660 set_pmd(pmd, __pmd(0));
11661@@ -337,8 +353,7 @@ void __init cleanup_highmap(void)
11662 #endif
11663
11664 /* NOTE: this is meant to be run only at boot */
11665-void __init
11666-__set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
11667+void __init __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
11668 {
11669 unsigned long address = __fix_to_virt(idx);
11670
11671@@ -463,7 +478,7 @@ __meminit void early_iounmap(void *addr,
11672 }
11673 #endif
11674
11675-static void __meminit
11676+static unsigned long __meminit
11677 phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
11678 {
11679 int i = pmd_index(address);
11680@@ -503,21 +518,26 @@ phys_pmd_init(pmd_t *pmd_page, unsigned
11681 set_pmd(pmd, __pmd(pte_phys | _KERNPG_TABLE));
11682 }
11683 }
11684+ return address;
11685 }
11686
11687-static void __meminit
11688+static unsigned long __meminit
11689 phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
11690 {
11691 pmd_t *pmd = pmd_offset(pud, 0);
11692+ unsigned long last_map_addr;
11693+
11694 spin_lock(&init_mm.page_table_lock);
11695- phys_pmd_init(pmd, address, end);
11696+ last_map_addr = phys_pmd_init(pmd, address, end);
11697 spin_unlock(&init_mm.page_table_lock);
11698 __flush_tlb_all();
11699+ return last_map_addr;
11700 }
11701
11702-static void __meminit
11703+static unsigned long __meminit
11704 phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
11705 {
11706+ unsigned long last_map_addr = end;
11707 int i = pud_index(addr);
11708
11709 for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) {
11710@@ -529,7 +549,15 @@ phys_pud_init(pud_t *pud_page, unsigned
11711 break;
11712
11713 if (__pud_val(*pud)) {
11714- phys_pmd_update(pud, addr, end);
11715+ if (!pud_large(*pud))
11716+ last_map_addr = phys_pmd_update(pud, addr, end);
11717+ continue;
11718+ }
11719+
11720+ if (direct_gbpages) {
11721+ set_pte((pte_t *)pud,
11722+ pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
11723+ last_map_addr = (addr & PUD_MASK) + PUD_SIZE;
11724 continue;
11725 }
11726
11727@@ -537,12 +565,14 @@ phys_pud_init(pud_t *pud_page, unsigned
11728
11729 spin_lock(&init_mm.page_table_lock);
11730 *pud = __pud(pmd_phys | _KERNPG_TABLE);
11731- phys_pmd_init(pmd, addr, end);
11732+ last_map_addr = phys_pmd_init(pmd, addr, end);
11733 spin_unlock(&init_mm.page_table_lock);
11734
11735 early_make_page_readonly(pmd, XENFEAT_writable_page_tables);
11736 }
11737 __flush_tlb_all();
11738+
11739+ return last_map_addr >> PAGE_SHIFT;
11740 }
11741
11742 void __init xen_init_pt(void)
11743@@ -754,16 +784,138 @@ static void __init xen_finish_init_mappi
11744 table_end = start_pfn;
11745 }
11746
11747+static void __init init_gbpages(void)
11748+{
11749+#ifndef CONFIG_XEN
11750+ if (direct_gbpages && cpu_has_gbpages)
11751+ printk(KERN_INFO "Using GB pages for direct mapping\n");
11752+ else
11753+ direct_gbpages = 0;
11754+#endif
11755+}
11756+
11757+#ifdef CONFIG_MEMTEST_BOOTPARAM
11758+
11759+static void __init memtest(unsigned long start_phys, unsigned long size,
11760+ unsigned pattern)
11761+{
11762+ unsigned long i;
11763+ unsigned long *start;
11764+ unsigned long start_bad;
11765+ unsigned long last_bad;
11766+ unsigned long val;
11767+ unsigned long start_phys_aligned;
11768+ unsigned long count;
11769+ unsigned long incr;
11770+
11771+ switch (pattern) {
11772+ case 0:
11773+ val = 0UL;
11774+ break;
11775+ case 1:
11776+ val = -1UL;
11777+ break;
11778+ case 2:
11779+ val = 0x5555555555555555UL;
11780+ break;
11781+ case 3:
11782+ val = 0xaaaaaaaaaaaaaaaaUL;
11783+ break;
11784+ default:
11785+ return;
11786+ }
11787+
11788+ incr = sizeof(unsigned long);
11789+ start_phys_aligned = ALIGN(start_phys, incr);
11790+ count = (size - (start_phys_aligned - start_phys))/incr;
11791+ start = __va(start_phys_aligned);
11792+ start_bad = 0;
11793+ last_bad = 0;
11794+
11795+ for (i = 0; i < count; i++)
11796+ start[i] = val;
11797+ for (i = 0; i < count; i++, start++, start_phys_aligned += incr) {
11798+ if (*start != val) {
11799+ if (start_phys_aligned == last_bad + incr) {
11800+ last_bad += incr;
11801+ } else {
11802+ if (start_bad) {
11803+ printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved",
11804+ val, start_bad, last_bad + incr);
11805+ reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
11806+ }
11807+ start_bad = last_bad = start_phys_aligned;
11808+ }
11809+ }
11810+ }
11811+ if (start_bad) {
11812+ printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved",
11813+ val, start_bad, last_bad + incr);
11814+ reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
11815+ }
11816+
11817+}
11818+
11819+static int memtest_pattern __initdata = CONFIG_MEMTEST_BOOTPARAM_VALUE;
11820+
11821+static int __init parse_memtest(char *arg)
11822+{
11823+ if (arg)
11824+ memtest_pattern = simple_strtoul(arg, NULL, 0);
11825+ return 0;
11826+}
11827+
11828+early_param("memtest", parse_memtest);
11829+
11830+static void __init early_memtest(unsigned long start, unsigned long end)
11831+{
11832+ u64 t_start, t_size;
11833+ unsigned pattern;
11834+
11835+ if (!memtest_pattern)
11836+ return;
11837+
11838+ printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern);
11839+ for (pattern = 0; pattern < memtest_pattern; pattern++) {
11840+ t_start = start;
11841+ t_size = 0;
11842+ while (t_start < end) {
11843+ t_start = find_e820_area_size(t_start, &t_size, 1);
11844+
11845+ /* done ? */
11846+ if (t_start >= end)
11847+ break;
11848+ if (t_start + t_size > end)
11849+ t_size = end - t_start;
11850+
11851+ printk(KERN_CONT "\n %016llx - %016llx pattern %d",
11852+ (unsigned long long)t_start,
11853+ (unsigned long long)t_start + t_size, pattern);
11854+
11855+ memtest(t_start, t_size, pattern);
11856+
11857+ t_start += t_size;
11858+ }
11859+ }
11860+ printk(KERN_CONT "\n");
11861+}
11862+#else
11863+static void __init early_memtest(unsigned long start, unsigned long end)
11864+{
11865+}
11866+#endif
11867+
11868 /*
11869 * Setup the direct mapping of the physical memory at PAGE_OFFSET.
11870 * This runs before bootmem is initialized and gets pages directly from
11871 * the physical memory. To access them they are temporarily mapped.
11872 */
11873-void __init_refok init_memory_mapping(unsigned long start, unsigned long end)
11874+unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned long end)
11875 {
11876- unsigned long next;
11877+ unsigned long next, last_map_addr = end;
11878+ unsigned long start_phys = start, end_phys = end;
11879
11880- pr_debug("init_memory_mapping\n");
11881+ printk(KERN_INFO "init_memory_mapping\n");
11882
11883 /*
11884 * Find space for the kernel direct mapping tables.
11885@@ -772,8 +924,10 @@ void __init_refok init_memory_mapping(un
11886 * memory mapped. Unfortunately this is done currently before the
11887 * nodes are discovered.
11888 */
11889- if (!after_bootmem)
11890+ if (!after_bootmem) {
11891+ init_gbpages();
11892 find_early_table_space(end);
11893+ }
11894
11895 start = (unsigned long)__va(start);
11896 end = (unsigned long)__va(end);
11897@@ -790,7 +944,7 @@ void __init_refok init_memory_mapping(un
11898 next = start + PGDIR_SIZE;
11899 if (next > end)
11900 next = end;
11901- phys_pud_init(pud, __pa(start), __pa(next));
11902+ last_map_addr = phys_pud_init(pud, __pa(start), __pa(next));
11903 if (!after_bootmem) {
11904 early_make_page_readonly(pud, XENFEAT_writable_page_tables);
11905 set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
11906@@ -807,6 +961,11 @@ void __init_refok init_memory_mapping(un
11907 if (!after_bootmem)
11908 reserve_early(table_start << PAGE_SHIFT,
11909 table_end << PAGE_SHIFT, "PGTABLE");
11910+
11911+ if (!after_bootmem)
11912+ early_memtest(start_phys, end_phys);
11913+
11914+ return last_map_addr;
11915 }
11916
11917 #ifndef CONFIG_NUMA
11918@@ -830,15 +989,6 @@ void __init paging_init(void)
11919 /*
11920 * Memory hotplug specific functions
11921 */
11922-void online_page(struct page *page)
11923-{
11924- ClearPageReserved(page);
11925- init_page_count(page);
11926- __free_page(page);
11927- totalram_pages++;
11928- num_physpages++;
11929-}
11930-
11931 #ifdef CONFIG_MEMORY_HOTPLUG
11932 /*
11933 * Memory is added always to NORMAL zone. This means you will never get
11934@@ -848,11 +998,13 @@ int arch_add_memory(int nid, u64 start,
11935 {
11936 struct pglist_data *pgdat = NODE_DATA(nid);
11937 struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
11938- unsigned long start_pfn = start >> PAGE_SHIFT;
11939+ unsigned long last_mapped_pfn, start_pfn = start >> PAGE_SHIFT;
11940 unsigned long nr_pages = size >> PAGE_SHIFT;
11941 int ret;
11942
11943- init_memory_mapping(start, start + size-1);
11944+ last_mapped_pfn = init_memory_mapping(start, start + size-1);
11945+ if (last_mapped_pfn > max_pfn_mapped)
11946+ max_pfn_mapped = last_mapped_pfn;
11947
11948 ret = __add_pages(zone, start_pfn, nr_pages);
11949 WARN_ON(1);
11950@@ -871,6 +1023,26 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to
11951
11952 #endif /* CONFIG_MEMORY_HOTPLUG */
11953
11954+/*
11955+ * devmem_is_allowed() checks to see if /dev/mem access to a certain address
11956+ * is valid. The argument is a physical page number.
11957+ *
11958+ *
11959+ * On x86, access has to be given to the first megabyte of ram because that area
11960+ * contains bios code and data regions used by X and dosemu and similar apps.
11961+ * Access has to be given to non-kernel-ram areas as well, these contain the PCI
11962+ * mmio resources as well as potential bios/acpi data regions.
11963+ */
11964+int devmem_is_allowed(unsigned long pagenr)
11965+{
11966+ if (pagenr <= 256)
11967+ return 1;
11968+ if (mfn_to_local_pfn(pagenr) >= max_pfn)
11969+ return 1;
11970+ return 0;
11971+}
11972+
11973+
11974 static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel,
11975 kcore_modules, kcore_vsyscall;
11976
11977@@ -979,24 +1151,7 @@ EXPORT_SYMBOL_GPL(rodata_test_data);
11978
11979 void mark_rodata_ro(void)
11980 {
11981- unsigned long start = (unsigned long)_stext, end;
11982-
11983-#ifdef CONFIG_HOTPLUG_CPU
11984- /* It must still be possible to apply SMP alternatives. */
11985- if (num_possible_cpus() > 1)
11986- start = (unsigned long)_etext;
11987-#endif
11988-
11989-#ifdef CONFIG_KPROBES
11990- start = (unsigned long)__start_rodata;
11991-#endif
11992-
11993- end = (unsigned long)__end_rodata;
11994- start = (start + PAGE_SIZE - 1) & PAGE_MASK;
11995- end &= PAGE_MASK;
11996- if (end <= start)
11997- return;
11998-
11999+ unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata);
12000
12001 printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
12002 (end - start) >> 10);
12003@@ -1019,6 +1174,7 @@ void mark_rodata_ro(void)
12004 set_memory_ro(start, (end-start) >> PAGE_SHIFT);
12005 #endif
12006 }
12007+
12008 #endif
12009
12010 #ifdef CONFIG_BLK_DEV_INITRD
12011@@ -1031,7 +1187,7 @@ void free_initrd_mem(unsigned long start
12012 void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
12013 {
12014 #ifdef CONFIG_NUMA
12015- int nid = phys_to_nid(phys);
12016+ int nid, next_nid;
12017 #endif
12018 unsigned long pfn = phys >> PAGE_SHIFT;
12019
12020@@ -1040,7 +1196,7 @@ void __init reserve_bootmem_generic(unsi
12021 * This can happen with kdump kernels when accessing
12022 * firmware tables:
12023 */
12024- if (pfn < end_pfn_map)
12025+ if (pfn < max_pfn_mapped)
12026 return;
12027
12028 printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
12029@@ -1050,10 +1206,16 @@ void __init reserve_bootmem_generic(unsi
12030
12031 /* Should check here against the e820 map to avoid double free */
12032 #ifdef CONFIG_NUMA
12033- reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT);
12034+ nid = phys_to_nid(phys);
12035+ next_nid = phys_to_nid(phys + len - 1);
12036+ if (nid == next_nid)
12037+ reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT);
12038+ else
12039+ reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
12040 #else
12041 reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
12042 #endif
12043+
12044 #ifndef CONFIG_XEN
12045 if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
12046 dma_reserve += len / PAGE_SIZE;
12047@@ -1149,6 +1311,10 @@ const char *arch_vma_name(struct vm_area
12048 /*
12049 * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
12050 */
12051+static long __meminitdata addr_start, addr_end;
12052+static void __meminitdata *p_start, *p_end;
12053+static int __meminitdata node_start;
12054+
12055 int __meminit
12056 vmemmap_populate(struct page *start_page, unsigned long size, int node)
12057 {
12058@@ -1183,12 +1349,32 @@ vmemmap_populate(struct page *start_page
12059 PAGE_KERNEL_LARGE);
12060 set_pmd(pmd, __pmd_ma(__pte_val(entry)));
12061
12062- printk(KERN_DEBUG " [%lx-%lx] PMD ->%p on node %d\n",
12063- addr, addr + PMD_SIZE - 1, p, node);
12064+ /* check to see if we have contiguous blocks */
12065+ if (p_end != p || node_start != node) {
12066+ if (p_start)
12067+ printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
12068+ addr_start, addr_end-1, p_start, p_end-1, node_start);
12069+ addr_start = addr;
12070+ node_start = node;
12071+ p_start = p;
12072+ }
12073+ addr_end = addr + PMD_SIZE;
12074+ p_end = p + PMD_SIZE;
12075 } else {
12076 vmemmap_verify((pte_t *)pmd, node, addr, next);
12077 }
12078 }
12079 return 0;
12080 }
12081+
12082+void __meminit vmemmap_populate_print_last(void)
12083+{
12084+ if (p_start) {
12085+ printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
12086+ addr_start, addr_end-1, p_start, p_end-1, node_start);
12087+ p_start = NULL;
12088+ p_end = NULL;
12089+ node_start = 0;
12090+ }
12091+}
12092 #endif
12093--- sle11-2009-05-14.orig/arch/x86/mm/ioremap-xen.c 2009-03-16 16:33:40.000000000 +0100
12094+++ sle11-2009-05-14/arch/x86/mm/ioremap-xen.c 2009-03-16 16:38:05.000000000 +0100
12095@@ -20,14 +20,11 @@
12096 #include <asm/pgtable.h>
12097 #include <asm/tlbflush.h>
12098 #include <asm/pgalloc.h>
12099+#include <asm/pat.h>
12100
12101-enum ioremap_mode {
12102- IOR_MODE_UNCACHED,
12103- IOR_MODE_CACHED,
12104-};
12105-
12106-#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
12107+#ifdef CONFIG_X86_64
12108
12109+#ifndef CONFIG_XEN
12110 unsigned long __phys_addr(unsigned long x)
12111 {
12112 if (x >= __START_KERNEL_map)
12113@@ -35,6 +32,19 @@ unsigned long __phys_addr(unsigned long
12114 return x - PAGE_OFFSET;
12115 }
12116 EXPORT_SYMBOL(__phys_addr);
12117+#endif
12118+
12119+static inline int phys_addr_valid(unsigned long addr)
12120+{
12121+ return addr < (1UL << boot_cpu_data.x86_phys_bits);
12122+}
12123+
12124+#else
12125+
12126+static inline int phys_addr_valid(unsigned long addr)
12127+{
12128+ return 1;
12129+}
12130
12131 #endif
12132
12133@@ -92,7 +102,8 @@ static int __direct_remap_pfn_range(stru
12134 * Fill in the machine address: PTE ptr is done later by
12135 * apply_to_page_range().
12136 */
12137- v->val = __pte_val(pfn_pte_ma(mfn, prot)) | _PAGE_IO;
12138+ pgprot_val(prot) |= _PAGE_IO;
12139+ v->val = __pte_val(pte_mkspecial(pfn_pte_ma(mfn, prot)));
12140
12141 mfn++;
12142 address += PAGE_SIZE;
12143@@ -189,10 +200,9 @@ int touch_pte_range(struct mm_struct *mm
12144
12145 EXPORT_SYMBOL(touch_pte_range);
12146
12147-#ifdef CONFIG_X86_32
12148 int page_is_ram(unsigned long pagenr)
12149 {
12150- unsigned long addr, end;
12151+ resource_size_t addr, end;
12152 int i;
12153
12154 #ifndef CONFIG_XEN
12155@@ -228,31 +238,51 @@ int page_is_ram(unsigned long pagenr)
12156 }
12157 return 0;
12158 }
12159-#endif
12160
12161 /*
12162 * Fix up the linear direct mapping of the kernel to avoid cache attribute
12163 * conflicts.
12164 */
12165 static int ioremap_change_attr(unsigned long vaddr, unsigned long size,
12166- enum ioremap_mode mode)
12167+ unsigned long prot_val)
12168 {
12169 unsigned long nrpages = size >> PAGE_SHIFT;
12170 int err;
12171
12172- switch (mode) {
12173- case IOR_MODE_UNCACHED:
12174+ switch (prot_val) {
12175+ case _PAGE_CACHE_UC:
12176 default:
12177- err = set_memory_uc(vaddr, nrpages);
12178+ err = _set_memory_uc(vaddr, nrpages);
12179+ break;
12180+ case _PAGE_CACHE_WC:
12181+ err = _set_memory_wc(vaddr, nrpages);
12182 break;
12183- case IOR_MODE_CACHED:
12184- err = set_memory_wb(vaddr, nrpages);
12185+ case _PAGE_CACHE_WB:
12186+ err = _set_memory_wb(vaddr, nrpages);
12187 break;
12188 }
12189
12190 return err;
12191 }
12192
12193+int ioremap_check_change_attr(unsigned long mfn, unsigned long size,
12194+ unsigned long prot_val)
12195+{
12196+ unsigned long sz;
12197+ int rc;
12198+
12199+ for (sz = rc = 0; sz < size && !rc; ++mfn, sz += PAGE_SIZE) {
12200+ unsigned long pfn = mfn_to_local_pfn(mfn);
12201+
12202+ if (pfn >= max_pfn_mapped)
12203+ continue;
12204+ rc = ioremap_change_attr((unsigned long)__va(pfn << PAGE_SHIFT),
12205+ PAGE_SIZE, prot_val);
12206+ }
12207+
12208+ return rc;
12209+}
12210+
12211 /*
12212 * Remap an arbitrary physical address space into the kernel virtual
12213 * address space. Needed when the kernel wants to access high addresses
12214@@ -262,12 +292,15 @@ static int ioremap_change_attr(unsigned
12215 * have to convert them into an offset in a page-aligned mapping, but the
12216 * caller shouldn't need to know that small detail.
12217 */
12218-static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size,
12219- enum ioremap_mode mode)
12220+static void __iomem *__ioremap_caller(resource_size_t phys_addr,
12221+ unsigned long size, unsigned long prot_val, void *caller)
12222 {
12223- unsigned long mfn, offset, last_addr, vaddr;
12224+ unsigned long mfn, offset, vaddr;
12225+ resource_size_t last_addr;
12226 struct vm_struct *area;
12227+ unsigned long new_prot_val;
12228 pgprot_t prot;
12229+ int retval;
12230 domid_t domid = DOMID_IO;
12231
12232 /* Don't allow wraparound or zero size */
12233@@ -275,6 +308,13 @@ static void __iomem *__ioremap(resource_
12234 if (!size || last_addr < phys_addr)
12235 return NULL;
12236
12237+ if (!phys_addr_valid(phys_addr)) {
12238+ printk(KERN_WARNING "ioremap: invalid physical address %llx\n",
12239+ (unsigned long long)phys_addr);
12240+ WARN_ON_ONCE(1);
12241+ return NULL;
12242+ }
12243+
12244 /*
12245 * Don't remap the low PCI/ISA area, it's always mapped..
12246 */
12247@@ -287,55 +327,86 @@ static void __iomem *__ioremap(resource_
12248 for (mfn = PFN_DOWN(phys_addr); mfn < PFN_UP(last_addr); mfn++) {
12249 unsigned long pfn = mfn_to_local_pfn(mfn);
12250
12251- if (pfn >= max_pfn)
12252- continue;
12253+ if (pfn_valid(pfn)) {
12254+ if (!PageReserved(pfn_to_page(pfn)))
12255+ return NULL;
12256+ domid = DOMID_SELF;
12257+ }
12258+ }
12259+ WARN_ON_ONCE(domid == DOMID_SELF);
12260
12261- domid = DOMID_SELF;
12262+ /*
12263+ * Mappings have to be page-aligned
12264+ */
12265+ offset = phys_addr & ~PAGE_MASK;
12266+ phys_addr &= PAGE_MASK;
12267+ size = PAGE_ALIGN(last_addr+1) - phys_addr;
12268
12269- if (pfn >= max_pfn_mapped) /* bogus */
12270- continue;
12271+ retval = reserve_memtype(phys_addr, phys_addr + size,
12272+ prot_val, &new_prot_val);
12273+ if (retval) {
12274+ pr_debug("Warning: reserve_memtype returned %d\n", retval);
12275+ return NULL;
12276+ }
12277
12278- if (pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn)))
12279+ if (prot_val != new_prot_val) {
12280+ /*
12281+ * Do not fallback to certain memory types with certain
12282+ * requested type:
12283+ * - request is uc-, return cannot be write-back
12284+ * - request is uc-, return cannot be write-combine
12285+ * - request is write-combine, return cannot be write-back
12286+ */
12287+ if ((prot_val == _PAGE_CACHE_UC_MINUS &&
12288+ (new_prot_val == _PAGE_CACHE_WB ||
12289+ new_prot_val == _PAGE_CACHE_WC)) ||
12290+ (prot_val == _PAGE_CACHE_WC &&
12291+ new_prot_val == _PAGE_CACHE_WB)) {
12292+ pr_debug(
12293+ "ioremap error for 0x%llx-0x%llx, requested 0x%lx, got 0x%lx\n",
12294+ (unsigned long long)phys_addr,
12295+ (unsigned long long)(phys_addr + size),
12296+ prot_val, new_prot_val);
12297+ free_memtype(phys_addr, phys_addr + size);
12298 return NULL;
12299+ }
12300+ prot_val = new_prot_val;
12301 }
12302
12303- switch (mode) {
12304- case IOR_MODE_UNCACHED:
12305+ switch (prot_val) {
12306+ case _PAGE_CACHE_UC:
12307 default:
12308- /*
12309- * FIXME: we will use UC MINUS for now, as video fb drivers
12310- * depend on it. Upcoming ioremap_wc() will fix this behavior.
12311- */
12312+ prot = PAGE_KERNEL_NOCACHE;
12313+ break;
12314+ case _PAGE_CACHE_UC_MINUS:
12315 prot = PAGE_KERNEL_UC_MINUS;
12316 break;
12317- case IOR_MODE_CACHED:
12318+ case _PAGE_CACHE_WC:
12319+ prot = PAGE_KERNEL_WC;
12320+ break;
12321+ case _PAGE_CACHE_WB:
12322 prot = PAGE_KERNEL;
12323 break;
12324 }
12325
12326 /*
12327- * Mappings have to be page-aligned
12328- */
12329- offset = phys_addr & ~PAGE_MASK;
12330- phys_addr &= PAGE_MASK;
12331- size = PAGE_ALIGN(last_addr+1) - phys_addr;
12332-
12333- /*
12334 * Ok, go for it..
12335 */
12336- area = get_vm_area(size, VM_IOREMAP | (mode << 20));
12337+ area = get_vm_area_caller(size, VM_IOREMAP, caller);
12338 if (!area)
12339 return NULL;
12340 area->phys_addr = phys_addr;
12341 vaddr = (unsigned long) area->addr;
12342 if (__direct_remap_pfn_range(&init_mm, vaddr, PFN_DOWN(phys_addr),
12343 size, prot, domid)) {
12344+ free_memtype(phys_addr, phys_addr + size);
12345 free_vm_area(area);
12346 return NULL;
12347 }
12348
12349- if (ioremap_change_attr(vaddr, size, mode) < 0) {
12350- iounmap((void __iomem *) vaddr);
12351+ if (ioremap_change_attr(vaddr, size, prot_val) < 0) {
12352+ free_memtype(phys_addr, phys_addr + size);
12353+ vunmap(area->addr);
12354 return NULL;
12355 }
12356
12357@@ -365,16 +436,72 @@ static void __iomem *__ioremap(resource_
12358 */
12359 void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size)
12360 {
12361- return __ioremap(phys_addr, size, IOR_MODE_UNCACHED);
12362+ /*
12363+ * Ideally, this should be:
12364+ * pat_wc_enabled ? _PAGE_CACHE_UC : _PAGE_CACHE_UC_MINUS;
12365+ *
12366+ * Till we fix all X drivers to use ioremap_wc(), we will use
12367+ * UC MINUS.
12368+ */
12369+ unsigned long val = _PAGE_CACHE_UC_MINUS;
12370+
12371+ return __ioremap_caller(phys_addr, size, val,
12372+ __builtin_return_address(0));
12373 }
12374 EXPORT_SYMBOL(ioremap_nocache);
12375
12376+/**
12377+ * ioremap_wc - map memory into CPU space write combined
12378+ * @offset: bus address of the memory
12379+ * @size: size of the resource to map
12380+ *
12381+ * This version of ioremap ensures that the memory is marked write combining.
12382+ * Write combining allows faster writes to some hardware devices.
12383+ *
12384+ * Must be freed with iounmap.
12385+ */
12386+void __iomem *ioremap_wc(unsigned long phys_addr, unsigned long size)
12387+{
12388+ if (pat_wc_enabled)
12389+ return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC,
12390+ __builtin_return_address(0));
12391+ else
12392+ return ioremap_nocache(phys_addr, size);
12393+}
12394+EXPORT_SYMBOL(ioremap_wc);
12395+
12396 void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
12397 {
12398- return __ioremap(phys_addr, size, IOR_MODE_CACHED);
12399+ return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WB,
12400+ __builtin_return_address(0));
12401 }
12402 EXPORT_SYMBOL(ioremap_cache);
12403
12404+#ifndef CONFIG_XEN
12405+static void __iomem *ioremap_default(resource_size_t phys_addr,
12406+ unsigned long size)
12407+{
12408+ unsigned long flags;
12409+ void *ret;
12410+ int err;
12411+
12412+ /*
12413+ * - WB for WB-able memory and no other conflicting mappings
12414+ * - UC_MINUS for non-WB-able memory with no other conflicting mappings
12415+ * - Inherit from confliting mappings otherwise
12416+ */
12417+ err = reserve_memtype(phys_addr, phys_addr + size, -1, &flags);
12418+ if (err < 0)
12419+ return NULL;
12420+
12421+ ret = (void *) __ioremap_caller(phys_addr, size, flags,
12422+ __builtin_return_address(0));
12423+
12424+ free_memtype(phys_addr, phys_addr + size);
12425+ return (void __iomem *)ret;
12426+}
12427+#endif
12428+
12429 /**
12430 * iounmap - Free a IO remapping
12431 * @addr: virtual address from ioremap_*
12432@@ -417,15 +544,7 @@ void iounmap(volatile void __iomem *addr
12433 return;
12434 }
12435
12436- if ((p->flags >> 20) != IOR_MODE_CACHED) {
12437- unsigned long n = get_vm_area_size(p) >> PAGE_SHIFT;
12438- unsigned long mfn = p->phys_addr;
12439- unsigned long va = (unsigned long)addr;
12440-
12441- for (; n > 0; n--, mfn++, va += PAGE_SIZE)
12442- if (mfn_to_local_pfn(mfn) < max_pfn)
12443- set_memory_wb(va, 1);
12444- }
12445+ free_memtype(p->phys_addr, p->phys_addr + get_vm_area_size(p));
12446
12447 /* Finally remove it */
12448 o = remove_vm_area((void *)addr);
12449@@ -434,6 +553,37 @@ void iounmap(volatile void __iomem *addr
12450 }
12451 EXPORT_SYMBOL(iounmap);
12452
12453+#ifndef CONFIG_XEN
12454+/*
12455+ * Convert a physical pointer to a virtual kernel pointer for /dev/mem
12456+ * access
12457+ */
12458+void *xlate_dev_mem_ptr(unsigned long phys)
12459+{
12460+ void *addr;
12461+ unsigned long start = phys & PAGE_MASK;
12462+
12463+ /* If page is RAM, we can use __va. Otherwise ioremap and unmap. */
12464+ if (page_is_ram(start >> PAGE_SHIFT))
12465+ return __va(phys);
12466+
12467+ addr = (void *)ioremap_default(start, PAGE_SIZE);
12468+ if (addr)
12469+ addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK));
12470+
12471+ return addr;
12472+}
12473+
12474+void unxlate_dev_mem_ptr(unsigned long phys, void *addr)
12475+{
12476+ if (page_is_ram(phys >> PAGE_SHIFT))
12477+ return;
12478+
12479+ iounmap((void __iomem *)((unsigned long)addr & PAGE_MASK));
12480+ return;
12481+}
12482+#endif
12483+
12484 int __initdata early_ioremap_debug;
12485
12486 static int __init early_ioremap_debug_setup(char *str)
12487@@ -445,8 +595,8 @@ static int __init early_ioremap_debug_se
12488 early_param("early_ioremap_debug", early_ioremap_debug_setup);
12489
12490 static __initdata int after_paging_init;
12491-static __initdata pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)]
12492- __attribute__((aligned(PAGE_SIZE)));
12493+static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)]
12494+ __section(.bss.page_aligned);
12495
12496 #ifdef CONFIG_X86_32
12497 static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
12498@@ -461,8 +611,8 @@ static inline pmd_t * __init early_iorem
12499 }
12500 #else
12501 #define early_ioremap_pmd early_get_pmd
12502+#undef make_lowmem_page_readonly
12503 #define make_lowmem_page_readonly early_make_page_readonly
12504-#define make_lowmem_page_writable make_page_writable
12505 #endif
12506
12507 static inline pte_t * __init early_ioremap_pte(unsigned long addr)
12508@@ -512,7 +662,7 @@ void __init early_ioremap_clear(void)
12509 pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
12510 pmd_clear(pmd);
12511 make_lowmem_page_writable(bm_pte, XENFEAT_writable_page_tables);
12512- /* paravirt_release_pt(__pa(bm_pte) >> PAGE_SHIFT); */
12513+ /* paravirt_release_pte(__pa(bm_pte) >> PAGE_SHIFT); */
12514 __flush_tlb_all();
12515 }
12516
12517@@ -654,10 +804,11 @@ void __init early_iounmap(void *addr, un
12518 unsigned long offset;
12519 unsigned int nrpages;
12520 enum fixed_addresses idx;
12521- unsigned int nesting;
12522+ int nesting;
12523
12524 nesting = --early_ioremap_nested;
12525- WARN_ON(nesting < 0);
12526+ if (WARN_ON(nesting < 0))
12527+ return;
12528
12529 if (early_ioremap_debug) {
12530 printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr,
12531--- sle11-2009-05-14.orig/arch/x86/mm/pageattr-xen.c 2009-03-16 16:37:14.000000000 +0100
12532+++ sle11-2009-05-14/arch/x86/mm/pageattr-xen.c 2009-03-16 16:38:05.000000000 +0100
12533@@ -9,6 +9,8 @@
12534 #include <linux/slab.h>
12535 #include <linux/mm.h>
12536 #include <linux/interrupt.h>
12537+#include <linux/seq_file.h>
12538+#include <linux/debugfs.h>
12539
12540 #include <asm/e820.h>
12541 #include <asm/processor.h>
12542@@ -17,370 +19,7 @@
12543 #include <asm/uaccess.h>
12544 #include <asm/pgalloc.h>
12545 #include <asm/proto.h>
12546-#include <asm/mmu_context.h>
12547-
12548-#ifndef CONFIG_X86_64
12549-#define TASK_SIZE64 TASK_SIZE
12550-#endif
12551-
12552-static void _pin_lock(struct mm_struct *mm, int lock) {
12553- if (lock)
12554- spin_lock(&mm->page_table_lock);
12555-#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
12556- /* While mm->page_table_lock protects us against insertions and
12557- * removals of higher level page table pages, it doesn't protect
12558- * against updates of pte-s. Such updates, however, require the
12559- * pte pages to be in consistent state (unpinned+writable or
12560- * pinned+readonly). The pinning and attribute changes, however
12561- * cannot be done atomically, which is why such updates must be
12562- * prevented from happening concurrently.
12563- * Note that no pte lock can ever elsewhere be acquired nesting
12564- * with an already acquired one in the same mm, or with the mm's
12565- * page_table_lock already acquired, as that would break in the
12566- * non-split case (where all these are actually resolving to the
12567- * one page_table_lock). Thus acquiring all of them here is not
12568- * going to result in dead locks, and the order of acquires
12569- * doesn't matter.
12570- */
12571- {
12572- pgd_t *pgd = mm->pgd;
12573- unsigned g;
12574-
12575- for (g = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
12576- pud_t *pud;
12577- unsigned u;
12578-
12579- if (pgd_none(*pgd))
12580- continue;
12581- pud = pud_offset(pgd, 0);
12582- for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
12583- pmd_t *pmd;
12584- unsigned m;
12585-
12586- if (pud_none(*pud))
12587- continue;
12588- pmd = pmd_offset(pud, 0);
12589- for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
12590- spinlock_t *ptl;
12591-
12592- if (pmd_none(*pmd))
12593- continue;
12594- ptl = pte_lockptr(0, pmd);
12595- if (lock)
12596- spin_lock(ptl);
12597- else
12598- spin_unlock(ptl);
12599- }
12600- }
12601- }
12602- }
12603-#endif
12604- if (!lock)
12605- spin_unlock(&mm->page_table_lock);
12606-}
12607-#define pin_lock(mm) _pin_lock(mm, 1)
12608-#define pin_unlock(mm) _pin_lock(mm, 0)
12609-
12610-#define PIN_BATCH sizeof(void *)
12611-static DEFINE_PER_CPU(multicall_entry_t[PIN_BATCH], pb_mcl);
12612-
12613-static inline unsigned int pgd_walk_set_prot(struct page *page, pgprot_t flags,
12614- unsigned int cpu, unsigned int seq)
12615-{
12616- unsigned long pfn = page_to_pfn(page);
12617-
12618- if (PageHighMem(page)) {
12619- if (pgprot_val(flags) & _PAGE_RW)
12620- ClearPagePinned(page);
12621- else
12622- SetPagePinned(page);
12623- } else {
12624- MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
12625- (unsigned long)__va(pfn << PAGE_SHIFT),
12626- pfn_pte(pfn, flags), 0);
12627- if (unlikely(++seq == PIN_BATCH)) {
12628- if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
12629- PIN_BATCH, NULL)))
12630- BUG();
12631- seq = 0;
12632- }
12633- }
12634-
12635- return seq;
12636-}
12637-
12638-static void pgd_walk(pgd_t *pgd_base, pgprot_t flags)
12639-{
12640- pgd_t *pgd = pgd_base;
12641- pud_t *pud;
12642- pmd_t *pmd;
12643- int g,u,m;
12644- unsigned int cpu, seq;
12645- multicall_entry_t *mcl;
12646-
12647- if (xen_feature(XENFEAT_auto_translated_physmap))
12648- return;
12649-
12650- cpu = get_cpu();
12651-
12652- /*
12653- * Cannot iterate up to USER_PTRS_PER_PGD on x86-64 as these pagetables
12654- * may not be the 'current' task's pagetables (e.g., current may be
12655- * 32-bit, but the pagetables may be for a 64-bit task).
12656- * Subtracting 1 from TASK_SIZE64 means the loop limit is correct
12657- * regardless of whether TASK_SIZE64 is a multiple of PGDIR_SIZE.
12658- */
12659- for (g = 0, seq = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
12660- if (pgd_none(*pgd))
12661- continue;
12662- pud = pud_offset(pgd, 0);
12663- if (PTRS_PER_PUD > 1) /* not folded */
12664- seq = pgd_walk_set_prot(virt_to_page(pud),flags,cpu,seq);
12665- for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
12666- if (pud_none(*pud))
12667- continue;
12668- pmd = pmd_offset(pud, 0);
12669- if (PTRS_PER_PMD > 1) /* not folded */
12670- seq = pgd_walk_set_prot(virt_to_page(pmd),flags,cpu,seq);
12671- for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
12672- if (pmd_none(*pmd))
12673- continue;
12674- seq = pgd_walk_set_prot(pmd_page(*pmd),flags,cpu,seq);
12675- }
12676- }
12677- }
12678-
12679- mcl = per_cpu(pb_mcl, cpu);
12680-#ifdef CONFIG_X86_64
12681- if (unlikely(seq > PIN_BATCH - 2)) {
12682- if (unlikely(HYPERVISOR_multicall_check(mcl, seq, NULL)))
12683- BUG();
12684- seq = 0;
12685- }
12686- MULTI_update_va_mapping(mcl + seq,
12687- (unsigned long)__user_pgd(pgd_base),
12688- pfn_pte(virt_to_phys(__user_pgd(pgd_base))>>PAGE_SHIFT, flags),
12689- 0);
12690- MULTI_update_va_mapping(mcl + seq + 1,
12691- (unsigned long)pgd_base,
12692- pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
12693- UVMF_TLB_FLUSH);
12694- if (unlikely(HYPERVISOR_multicall_check(mcl, seq + 2, NULL)))
12695- BUG();
12696-#else
12697- if (likely(seq != 0)) {
12698- MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
12699- (unsigned long)pgd_base,
12700- pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
12701- UVMF_TLB_FLUSH);
12702- if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
12703- seq + 1, NULL)))
12704- BUG();
12705- } else if(HYPERVISOR_update_va_mapping((unsigned long)pgd_base,
12706- pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
12707- UVMF_TLB_FLUSH))
12708- BUG();
12709-#endif
12710-
12711- put_cpu();
12712-}
12713-
12714-static void __pgd_pin(pgd_t *pgd)
12715-{
12716- pgd_walk(pgd, PAGE_KERNEL_RO);
12717- kmap_flush_unused();
12718- xen_pgd_pin(__pa(pgd)); /* kernel */
12719-#ifdef CONFIG_X86_64
12720- xen_pgd_pin(__pa(__user_pgd(pgd))); /* user */
12721-#endif
12722- SetPagePinned(virt_to_page(pgd));
12723-}
12724-
12725-static void __pgd_unpin(pgd_t *pgd)
12726-{
12727- xen_pgd_unpin(__pa(pgd));
12728-#ifdef CONFIG_X86_64
12729- xen_pgd_unpin(__pa(__user_pgd(pgd)));
12730-#endif
12731- pgd_walk(pgd, PAGE_KERNEL);
12732- ClearPagePinned(virt_to_page(pgd));
12733-}
12734-
12735-void pgd_test_and_unpin(pgd_t *pgd)
12736-{
12737- if (PagePinned(virt_to_page(pgd)))
12738- __pgd_unpin(pgd);
12739-}
12740-
12741-void mm_pin(struct mm_struct *mm)
12742-{
12743- if (xen_feature(XENFEAT_writable_page_tables))
12744- return;
12745-
12746- pin_lock(mm);
12747- __pgd_pin(mm->pgd);
12748- pin_unlock(mm);
12749-}
12750-
12751-void mm_unpin(struct mm_struct *mm)
12752-{
12753- if (xen_feature(XENFEAT_writable_page_tables))
12754- return;
12755-
12756- pin_lock(mm);
12757- __pgd_unpin(mm->pgd);
12758- pin_unlock(mm);
12759-}
12760-
12761-void mm_pin_all(void)
12762-{
12763- struct page *page;
12764- unsigned long flags;
12765-
12766- if (xen_feature(XENFEAT_writable_page_tables))
12767- return;
12768-
12769- /*
12770- * Allow uninterrupted access to the pgd_list. Also protects
12771- * __pgd_pin() by disabling preemption.
12772- * All other CPUs must be at a safe point (e.g., in stop_machine
12773- * or offlined entirely).
12774- */
12775- spin_lock_irqsave(&pgd_lock, flags);
12776- list_for_each_entry(page, &pgd_list, lru) {
12777- if (!PagePinned(page))
12778- __pgd_pin((pgd_t *)page_address(page));
12779- }
12780- spin_unlock_irqrestore(&pgd_lock, flags);
12781-}
12782-
12783-void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
12784-{
12785- if (!PagePinned(virt_to_page(mm->pgd)))
12786- mm_pin(mm);
12787-}
12788-
12789-void arch_exit_mmap(struct mm_struct *mm)
12790-{
12791- struct task_struct *tsk = current;
12792-
12793- task_lock(tsk);
12794-
12795- /*
12796- * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
12797- * *much* faster this way, as no tlb flushes means bigger wrpt batches.
12798- */
12799- if (tsk->active_mm == mm) {
12800- tsk->active_mm = &init_mm;
12801- atomic_inc(&init_mm.mm_count);
12802-
12803- switch_mm(mm, &init_mm, tsk);
12804-
12805- atomic_dec(&mm->mm_count);
12806- BUG_ON(atomic_read(&mm->mm_count) == 0);
12807- }
12808-
12809- task_unlock(tsk);
12810-
12811- if (PagePinned(virt_to_page(mm->pgd))
12812- && atomic_read(&mm->mm_count) == 1
12813- && !mm->context.has_foreign_mappings)
12814- mm_unpin(mm);
12815-}
12816-
12817-static void _pte_free(struct page *page, unsigned int order)
12818-{
12819- BUG_ON(order);
12820- __pte_free(page);
12821-}
12822-
12823-pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
12824-{
12825- struct page *pte;
12826-
12827-#ifdef CONFIG_HIGHPTE
12828- pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
12829-#else
12830- pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
12831-#endif
12832- if (pte) {
12833- pgtable_page_ctor(pte);
12834- SetPageForeign(pte, _pte_free);
12835- init_page_count(pte);
12836- }
12837- return pte;
12838-}
12839-
12840-void __pte_free(pgtable_t pte)
12841-{
12842- if (!PageHighMem(pte)) {
12843- unsigned long va = (unsigned long)page_address(pte);
12844- unsigned int level;
12845- pte_t *ptep = lookup_address(va, &level);
12846-
12847- BUG_ON(!ptep || level != PG_LEVEL_4K || !pte_present(*ptep));
12848- if (!pte_write(*ptep)
12849- && HYPERVISOR_update_va_mapping(va,
12850- mk_pte(pte, PAGE_KERNEL),
12851- 0))
12852- BUG();
12853- } else
12854-#ifdef CONFIG_HIGHPTE
12855- ClearPagePinned(pte);
12856-#else
12857- BUG();
12858-#endif
12859-
12860- ClearPageForeign(pte);
12861- init_page_count(pte);
12862- pgtable_page_dtor(pte);
12863- __free_page(pte);
12864-}
12865-
12866-#if PAGETABLE_LEVELS >= 3
12867-static void _pmd_free(struct page *page, unsigned int order)
12868-{
12869- BUG_ON(order);
12870- __pmd_free(page);
12871-}
12872-
12873-pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address)
12874-{
12875- struct page *pmd;
12876-
12877- pmd = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
12878- if (!pmd)
12879- return NULL;
12880- SetPageForeign(pmd, _pmd_free);
12881- init_page_count(pmd);
12882- return page_address(pmd);
12883-}
12884-
12885-void __pmd_free(pgtable_t pmd)
12886-{
12887- unsigned long va = (unsigned long)page_address(pmd);
12888- unsigned int level;
12889- pte_t *ptep = lookup_address(va, &level);
12890-
12891- BUG_ON(!ptep || level != PG_LEVEL_4K || !pte_present(*ptep));
12892- if (!pte_write(*ptep)
12893- && HYPERVISOR_update_va_mapping(va, mk_pte(pmd, PAGE_KERNEL), 0))
12894- BUG();
12895-
12896- ClearPageForeign(pmd);
12897- init_page_count(pmd);
12898- __free_page(pmd);
12899-}
12900-#endif
12901-
12902-/* blktap and gntdev need this, as otherwise they would implicitly (and
12903- * needlessly, as they never use it) reference init_mm. */
12904-pte_t xen_ptep_get_and_clear_full(struct vm_area_struct *vma,
12905- unsigned long addr, pte_t *ptep, int full)
12906-{
12907- return ptep_get_and_clear_full(vma->vm_mm, addr, ptep, full);
12908-}
12909-EXPORT_SYMBOL_GPL(xen_ptep_get_and_clear_full);
12910+#include <asm/pat.h>
12911
12912 /*
12913 * The current flushing context - we pass it instead of 5 arguments:
12914@@ -392,6 +31,7 @@ struct cpa_data {
12915 int numpages;
12916 int flushtlb;
12917 unsigned long pfn;
12918+ unsigned force_split : 1;
12919 };
12920
12921 #ifdef CONFIG_X86_64
12922@@ -637,6 +277,9 @@ try_preserve_large_page(pte_t *kpte, uns
12923 int i, do_split = 1;
12924 unsigned int level;
12925
12926+ if (cpa->force_split)
12927+ return 1;
12928+
12929 spin_lock_irqsave(&pgd_lock, flags);
12930 /*
12931 * Check for races, another CPU might have split this page
12932@@ -856,9 +499,7 @@ static int split_large_page(pte_t *kpte,
12933 goto out_unlock;
12934
12935 pbase = (pte_t *)page_address(base);
12936-#ifdef CONFIG_X86_32
12937- paravirt_alloc_pt(&init_mm, page_to_pfn(base));
12938-#endif
12939+ paravirt_alloc_pte(&init_mm, page_to_pfn(base));
12940 ref_prot = pte_pgprot(pte_clrhuge(*kpte));
12941
12942 #ifdef CONFIG_X86_64
12943@@ -919,7 +560,7 @@ static int __change_page_attr(struct cpa
12944 repeat:
12945 kpte = lookup_address(address, &level);
12946 if (!kpte)
12947- return primary ? -EINVAL : 0;
12948+ return 0;
12949
12950 old_pte = *kpte;
12951 if (!__pte_val(old_pte)) {
12952@@ -1078,7 +719,8 @@ static inline int cache_attr(pgprot_t at
12953 }
12954
12955 static int change_page_attr_set_clr(unsigned long addr, int numpages,
12956- pgprot_t mask_set, pgprot_t mask_clr)
12957+ pgprot_t mask_set, pgprot_t mask_clr,
12958+ int force_split)
12959 {
12960 struct cpa_data cpa;
12961 int ret, cache, checkalias;
12962@@ -1089,7 +731,7 @@ static int change_page_attr_set_clr(unsi
12963 */
12964 mask_set = canon_pgprot(mask_set);
12965 mask_clr = canon_pgprot(mask_clr);
12966- if (!pgprot_val(mask_set) && !pgprot_val(mask_clr))
12967+ if (!pgprot_val(mask_set) && !pgprot_val(mask_clr) && !force_split)
12968 return 0;
12969
12970 /* Ensure we are PAGE_SIZE aligned */
12971@@ -1106,6 +748,7 @@ static int change_page_attr_set_clr(unsi
12972 cpa.mask_set = mask_set;
12973 cpa.mask_clr = mask_clr;
12974 cpa.flushtlb = 0;
12975+ cpa.force_split = force_split;
12976
12977 /* No alias checking for _NX bit modifications */
12978 checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
12979@@ -1144,26 +787,67 @@ out:
12980 static inline int change_page_attr_set(unsigned long addr, int numpages,
12981 pgprot_t mask)
12982 {
12983- return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0));
12984+ return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0);
12985 }
12986
12987 static inline int change_page_attr_clear(unsigned long addr, int numpages,
12988 pgprot_t mask)
12989 {
12990- return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask);
12991+ return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0);
12992 }
12993
12994-int set_memory_uc(unsigned long addr, int numpages)
12995+int _set_memory_uc(unsigned long addr, int numpages)
12996 {
12997+ /*
12998+ * for now UC MINUS. see comments in ioremap_nocache()
12999+ */
13000 return change_page_attr_set(addr, numpages,
13001- __pgprot(_PAGE_PCD));
13002+ __pgprot(_PAGE_CACHE_UC_MINUS));
13003+}
13004+
13005+int set_memory_uc(unsigned long addr, int numpages)
13006+{
13007+ /*
13008+ * for now UC MINUS. see comments in ioremap_nocache()
13009+ */
13010+ if (reserve_memtype(addr, addr + numpages * PAGE_SIZE,
13011+ _PAGE_CACHE_UC_MINUS, NULL))
13012+ return -EINVAL;
13013+
13014+ return _set_memory_uc(addr, numpages);
13015 }
13016 EXPORT_SYMBOL(set_memory_uc);
13017
13018-int set_memory_wb(unsigned long addr, int numpages)
13019+int _set_memory_wc(unsigned long addr, int numpages)
13020+{
13021+ return change_page_attr_set(addr, numpages,
13022+ __pgprot(_PAGE_CACHE_WC));
13023+}
13024+
13025+int set_memory_wc(unsigned long addr, int numpages)
13026+{
13027+ if (!pat_wc_enabled)
13028+ return set_memory_uc(addr, numpages);
13029+
13030+ if (reserve_memtype(addr, addr + numpages * PAGE_SIZE,
13031+ _PAGE_CACHE_WC, NULL))
13032+ return -EINVAL;
13033+
13034+ return _set_memory_wc(addr, numpages);
13035+}
13036+EXPORT_SYMBOL(set_memory_wc);
13037+
13038+int _set_memory_wb(unsigned long addr, int numpages)
13039 {
13040 return change_page_attr_clear(addr, numpages,
13041- __pgprot(_PAGE_PCD | _PAGE_PWT));
13042+ __pgprot(_PAGE_CACHE_MASK));
13043+}
13044+
13045+int set_memory_wb(unsigned long addr, int numpages)
13046+{
13047+ free_memtype(addr, addr + numpages * PAGE_SIZE);
13048+
13049+ return _set_memory_wb(addr, numpages);
13050 }
13051 EXPORT_SYMBOL(set_memory_wb);
13052
13053@@ -1194,6 +878,12 @@ int set_memory_np(unsigned long addr, in
13054 return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_PRESENT));
13055 }
13056
13057+int set_memory_4k(unsigned long addr, int numpages)
13058+{
13059+ return change_page_attr_set_clr(addr, numpages, __pgprot(0),
13060+ __pgprot(0), 1);
13061+}
13062+
13063 int set_pages_uc(struct page *page, int numpages)
13064 {
13065 unsigned long addr = (unsigned long)page_address(page);
13066@@ -1303,6 +993,45 @@ void kernel_map_pages(struct page *page,
13067 cpa_fill_pool(NULL);
13068 }
13069
13070+#ifdef CONFIG_DEBUG_FS
13071+static int dpa_show(struct seq_file *m, void *v)
13072+{
13073+ seq_puts(m, "DEBUG_PAGEALLOC\n");
13074+ seq_printf(m, "pool_size : %lu\n", pool_size);
13075+ seq_printf(m, "pool_pages : %lu\n", pool_pages);
13076+ seq_printf(m, "pool_low : %lu\n", pool_low);
13077+ seq_printf(m, "pool_used : %lu\n", pool_used);
13078+ seq_printf(m, "pool_failed : %lu\n", pool_failed);
13079+
13080+ return 0;
13081+}
13082+
13083+static int dpa_open(struct inode *inode, struct file *filp)
13084+{
13085+ return single_open(filp, dpa_show, NULL);
13086+}
13087+
13088+static const struct file_operations dpa_fops = {
13089+ .open = dpa_open,
13090+ .read = seq_read,
13091+ .llseek = seq_lseek,
13092+ .release = single_release,
13093+};
13094+
13095+static int __init debug_pagealloc_proc_init(void)
13096+{
13097+ struct dentry *de;
13098+
13099+ de = debugfs_create_file("debug_pagealloc", 0600, NULL, NULL,
13100+ &dpa_fops);
13101+ if (!de)
13102+ return -ENOMEM;
13103+
13104+ return 0;
13105+}
13106+__initcall(debug_pagealloc_proc_init);
13107+#endif
13108+
13109 #ifdef CONFIG_HIBERNATION
13110
13111 bool kernel_page_present(struct page *page)
13112--- /dev/null 1970-01-01 00:00:00.000000000 +0000
13113+++ sle11-2009-05-14/arch/x86/mm/pat-xen.c 2009-03-16 16:38:05.000000000 +0100
13114@@ -0,0 +1,602 @@
13115+/*
13116+ * Handle caching attributes in page tables (PAT)
13117+ *
13118+ * Authors: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
13119+ * Suresh B Siddha <suresh.b.siddha@intel.com>
13120+ *
13121+ * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen.
13122+ */
13123+
13124+#include <linux/mm.h>
13125+#include <linux/kernel.h>
13126+#include <linux/gfp.h>
13127+#include <linux/fs.h>
13128+#include <linux/bootmem.h>
13129+
13130+#include <asm/msr.h>
13131+#include <asm/tlbflush.h>
13132+#include <asm/processor.h>
13133+#include <asm/page.h>
13134+#include <asm/pgtable.h>
13135+#include <asm/pat.h>
13136+#include <asm/e820.h>
13137+#include <asm/cacheflush.h>
13138+#include <asm/fcntl.h>
13139+#include <asm/mtrr.h>
13140+#include <asm/io.h>
13141+
13142+#ifdef CONFIG_X86_PAT
13143+int __read_mostly pat_wc_enabled = 1;
13144+
13145+void __cpuinit pat_disable(char *reason)
13146+{
13147+ pat_wc_enabled = 0;
13148+ printk(KERN_INFO "%s\n", reason);
13149+}
13150+
13151+static int __init nopat(char *str)
13152+{
13153+ pat_disable("PAT support disabled.");
13154+ return 0;
13155+}
13156+early_param("nopat", nopat);
13157+#endif
13158+
13159+static u64 __read_mostly boot_pat_state;
13160+
13161+enum {
13162+ PAT_UC = 0, /* uncached */
13163+ PAT_WC = 1, /* Write combining */
13164+ PAT_WT = 4, /* Write Through */
13165+ PAT_WP = 5, /* Write Protected */
13166+ PAT_WB = 6, /* Write Back (default) */
13167+	PAT_UC_MINUS = 7,	/* UC, but can be overridden by MTRR */
13168+};
13169+
13170+#define PAT(x,y) ((u64)PAT_ ## y << ((x)*8))
13171+
13172+void pat_init(void)
13173+{
13174+ u64 pat;
13175+
13176+ if (!pat_wc_enabled)
13177+ return;
13178+
13179+ /* Paranoia check. */
13180+ if (!cpu_has_pat) {
13181+ printk(KERN_ERR "PAT enabled, but CPU feature cleared\n");
13182+ /*
13183+ * Panic if this happens on the secondary CPU, and we
13184+ * switched to PAT on the boot CPU. We have no way to
13185+ * undo PAT.
13186+ */
13187+ BUG_ON(boot_pat_state);
13188+ }
13189+
13190+#ifndef CONFIG_XEN
13191+ /* Set PWT to Write-Combining. All other bits stay the same */
13192+ /*
13193+ * PTE encoding used in Linux:
13194+ * PAT
13195+ * |PCD
13196+ * ||PWT
13197+ * |||
13198+ * 000 WB _PAGE_CACHE_WB
13199+ * 001 WC _PAGE_CACHE_WC
13200+ * 010 UC- _PAGE_CACHE_UC_MINUS
13201+ * 011 UC _PAGE_CACHE_UC
13202+ * PAT bit unused
13203+ */
13204+ pat = PAT(0,WB) | PAT(1,WC) | PAT(2,UC_MINUS) | PAT(3,UC) |
13205+ PAT(4,WB) | PAT(5,WC) | PAT(6,UC_MINUS) | PAT(7,UC);
13206+
13207+ /* Boot CPU check */
13208+ if (!boot_pat_state)
13209+ rdmsrl(MSR_IA32_CR_PAT, boot_pat_state);
13210+
13211+ wrmsrl(MSR_IA32_CR_PAT, pat);
13212+#else
13213+ /*
13214+ * PAT settings are part of the hypervisor interface, and their
13215+ * assignment cannot be changed.
13216+ */
13217+ rdmsrl(MSR_IA32_CR_PAT, pat);
13218+ if (!boot_pat_state)
13219+ boot_pat_state = pat;
13220+#endif
13221+ printk(KERN_INFO "x86 PAT enabled: cpu %d, old 0x%Lx, new 0x%Lx\n",
13222+ smp_processor_id(), boot_pat_state, pat);
13223+}
13224+
13225+#undef PAT
13226+
13227+static char *cattr_name(unsigned long flags)
13228+{
13229+ switch (flags & _PAGE_CACHE_MASK) {
13230+ case _PAGE_CACHE_UC: return "uncached";
13231+ case _PAGE_CACHE_UC_MINUS: return "uncached-minus";
13232+ case _PAGE_CACHE_WB: return "write-back";
13233+ case _PAGE_CACHE_WC: return "write-combining";
13234+ case _PAGE_CACHE_WP: return "write-protected";
13235+ case _PAGE_CACHE_WT: return "write-through";
13236+ default: return "broken";
13237+ }
13238+}
13239+
13240+/*
13241+ * The global memtype list keeps track of memory type for specific
13242+ * physical memory areas. Conflicting memory types in different
13243+ * mappings can cause CPU cache corruption. To avoid this we keep track.
13244+ *
13245+ * The list is sorted based on starting address and can contain multiple
13246+ * entries for each address (this allows reference counting for overlapping
13247+ * areas). All the aliases have the same cache attributes of course.
13248+ * Zero attributes are represented as holes.
13249+ *
13250+ * Currently the data structure is a list because the number of mappings
13251+ * is expected to be relatively small. If this should be a problem
13252+ * it could be changed to a rbtree or similar.
13253+ *
13254+ * memtype_lock protects the whole list.
13255+ */
13256+
13257+struct memtype {
13258+ u64 start;
13259+ u64 end;
13260+ unsigned long type;
13261+ struct list_head nd;
13262+};
13263+
13264+static LIST_HEAD(memtype_list);
13265+static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */
13266+
13267+/*
13268+ * Does intersection of PAT memory type and MTRR memory type and returns
13269+ * the resulting memory type as PAT understands it.
13270+ * (Type in pat and mtrr will not have same value)
13271+ * The intersection is based on "Effective Memory Type" tables in IA-32
13272+ * SDM vol 3a
13273+ */
13274+static int pat_x_mtrr_type(u64 start, u64 end, unsigned long prot,
13275+ unsigned long *ret_prot)
13276+{
13277+ unsigned long pat_type;
13278+ u8 mtrr_type;
13279+
13280+ pat_type = prot & _PAGE_CACHE_MASK;
13281+ prot &= (~_PAGE_CACHE_MASK);
13282+
13283+ /*
13284+ * We return the PAT request directly for types where PAT takes
13285+ * precedence with respect to MTRR and for UC_MINUS.
13286+ * Consistency checks with other PAT requests are done later
13287+ * while going through memtype list.
13288+ */
13289+ if (pat_type == _PAGE_CACHE_WC) {
13290+ *ret_prot = prot | _PAGE_CACHE_WC;
13291+ return 0;
13292+ } else if (pat_type == _PAGE_CACHE_UC_MINUS) {
13293+ *ret_prot = prot | _PAGE_CACHE_UC_MINUS;
13294+ return 0;
13295+ } else if (pat_type == _PAGE_CACHE_UC) {
13296+ *ret_prot = prot | _PAGE_CACHE_UC;
13297+ return 0;
13298+ }
13299+
13300+ /*
13301+ * Look for MTRR hint to get the effective type in case where PAT
13302+ * request is for WB.
13303+ */
13304+ mtrr_type = mtrr_type_lookup(start, end);
13305+
13306+ if (mtrr_type == MTRR_TYPE_UNCACHABLE) {
13307+ *ret_prot = prot | _PAGE_CACHE_UC;
13308+ } else if (mtrr_type == MTRR_TYPE_WRCOMB) {
13309+ *ret_prot = prot | _PAGE_CACHE_WC;
13310+ } else {
13311+ *ret_prot = prot | _PAGE_CACHE_WB;
13312+ }
13313+
13314+ return 0;
13315+}
13316+
13317+/*
13318+ * req_type typically has one of the following:
13319+ * - _PAGE_CACHE_WB
13320+ * - _PAGE_CACHE_WC
13321+ * - _PAGE_CACHE_UC_MINUS
13322+ * - _PAGE_CACHE_UC
13323+ *
13324+ * req_type will have a special case value '-1', when the requester wants to inherit
13325+ * the memory type from mtrr (if WB), existing PAT, defaulting to UC_MINUS.
13326+ *
13327+ * If ret_type is NULL, function will return an error if it cannot reserve the
13328+ * region with req_type. If ret_type is non-null, function will return
13329+ * available type in ret_type in case of no error. In case of any error
13330+ * it will return a negative return value.
13331+ */
13332+int reserve_memtype(u64 start, u64 end, unsigned long req_type,
13333+ unsigned long *ret_type)
13334+{
13335+ struct memtype *new_entry = NULL;
13336+ struct memtype *parse;
13337+ unsigned long actual_type;
13338+ int err = 0;
13339+
13340+ /* Only track when pat_wc_enabled */
13341+ if (!pat_wc_enabled) {
13342+ /* This is identical to page table setting without PAT */
13343+ if (ret_type) {
13344+ if (req_type == -1) {
13345+ *ret_type = _PAGE_CACHE_WB;
13346+ } else {
13347+ *ret_type = req_type;
13348+ }
13349+ }
13350+ return 0;
13351+ }
13352+
13353+ /* Low ISA region is always mapped WB in page table. No need to track */
13354+ if (start >= ISA_START_ADDRESS && (end - 1) <= ISA_END_ADDRESS) {
13355+ if (ret_type)
13356+ *ret_type = _PAGE_CACHE_WB;
13357+
13358+ return 0;
13359+ }
13360+
13361+ if (req_type == -1) {
13362+ /*
13363+ * Call mtrr_lookup to get the type hint. This is an
13364+ * optimization for /dev/mem mmap'ers into WB memory (BIOS
13365+ * tools and ACPI tools). Use WB request for WB memory and use
13366+ * UC_MINUS otherwise.
13367+ */
13368+ u8 mtrr_type = mtrr_type_lookup(start, end);
13369+
13370+ if (mtrr_type == MTRR_TYPE_WRBACK) {
13371+ req_type = _PAGE_CACHE_WB;
13372+ actual_type = _PAGE_CACHE_WB;
13373+ } else {
13374+ req_type = _PAGE_CACHE_UC_MINUS;
13375+ actual_type = _PAGE_CACHE_UC_MINUS;
13376+ }
13377+ } else {
13378+ req_type &= _PAGE_CACHE_MASK;
13379+ err = pat_x_mtrr_type(start, end, req_type, &actual_type);
13380+ }
13381+
13382+ if (err) {
13383+ if (ret_type)
13384+ *ret_type = actual_type;
13385+
13386+ return -EINVAL;
13387+ }
13388+
13389+ new_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL);
13390+ if (!new_entry)
13391+ return -ENOMEM;
13392+
13393+ new_entry->start = start;
13394+ new_entry->end = end;
13395+ new_entry->type = actual_type;
13396+
13397+ if (ret_type)
13398+ *ret_type = actual_type;
13399+
13400+ spin_lock(&memtype_lock);
13401+
13402+ /* Search for existing mapping that overlaps the current range */
13403+ list_for_each_entry(parse, &memtype_list, nd) {
13404+ struct memtype *saved_ptr;
13405+
13406+ if (parse->start >= end) {
13407+ pr_debug("New Entry\n");
13408+ list_add(&new_entry->nd, parse->nd.prev);
13409+ new_entry = NULL;
13410+ break;
13411+ }
13412+
13413+ if (start <= parse->start && end >= parse->start) {
13414+ if (actual_type != parse->type && ret_type) {
13415+ actual_type = parse->type;
13416+ *ret_type = actual_type;
13417+ new_entry->type = actual_type;
13418+ }
13419+
13420+ if (actual_type != parse->type) {
13421+ printk(
13422+ KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
13423+ current->comm, current->pid,
13424+ start, end,
13425+ cattr_name(actual_type),
13426+ cattr_name(parse->type));
13427+ err = -EBUSY;
13428+ break;
13429+ }
13430+
13431+ saved_ptr = parse;
13432+ /*
13433+ * Check to see whether the request overlaps more
13434+ * than one entry in the list
13435+ */
13436+ list_for_each_entry_continue(parse, &memtype_list, nd) {
13437+ if (end <= parse->start) {
13438+ break;
13439+ }
13440+
13441+ if (actual_type != parse->type) {
13442+ printk(
13443+ KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
13444+ current->comm, current->pid,
13445+ start, end,
13446+ cattr_name(actual_type),
13447+ cattr_name(parse->type));
13448+ err = -EBUSY;
13449+ break;
13450+ }
13451+ }
13452+
13453+ if (err) {
13454+ break;
13455+ }
13456+
13457+ pr_debug("Overlap at 0x%Lx-0x%Lx\n",
13458+ saved_ptr->start, saved_ptr->end);
13459+ /* No conflict. Go ahead and add this new entry */
13460+ list_add(&new_entry->nd, saved_ptr->nd.prev);
13461+ new_entry = NULL;
13462+ break;
13463+ }
13464+
13465+ if (start < parse->end) {
13466+ if (actual_type != parse->type && ret_type) {
13467+ actual_type = parse->type;
13468+ *ret_type = actual_type;
13469+ new_entry->type = actual_type;
13470+ }
13471+
13472+ if (actual_type != parse->type) {
13473+ printk(
13474+ KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
13475+ current->comm, current->pid,
13476+ start, end,
13477+ cattr_name(actual_type),
13478+ cattr_name(parse->type));
13479+ err = -EBUSY;
13480+ break;
13481+ }
13482+
13483+ saved_ptr = parse;
13484+ /*
13485+ * Check to see whether the request overlaps more
13486+ * than one entry in the list
13487+ */
13488+ list_for_each_entry_continue(parse, &memtype_list, nd) {
13489+ if (end <= parse->start) {
13490+ break;
13491+ }
13492+
13493+ if (actual_type != parse->type) {
13494+ printk(
13495+ KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
13496+ current->comm, current->pid,
13497+ start, end,
13498+ cattr_name(actual_type),
13499+ cattr_name(parse->type));
13500+ err = -EBUSY;
13501+ break;
13502+ }
13503+ }
13504+
13505+ if (err) {
13506+ break;
13507+ }
13508+
13509+			pr_debug("Overlap at 0x%Lx-0x%Lx\n",
13510+ saved_ptr->start, saved_ptr->end);
13511+ /* No conflict. Go ahead and add this new entry */
13512+ list_add(&new_entry->nd, &saved_ptr->nd);
13513+ new_entry = NULL;
13514+ break;
13515+ }
13516+ }
13517+
13518+ if (err) {
13519+ printk(KERN_INFO
13520+ "reserve_memtype failed 0x%Lx-0x%Lx, track %s, req %s\n",
13521+ start, end, cattr_name(new_entry->type),
13522+ cattr_name(req_type));
13523+ kfree(new_entry);
13524+ spin_unlock(&memtype_lock);
13525+ return err;
13526+ }
13527+
13528+ if (new_entry) {
13529+ /* No conflict. Not yet added to the list. Add to the tail */
13530+ list_add_tail(&new_entry->nd, &memtype_list);
13531+ pr_debug("New Entry\n");
13532+ }
13533+
13534+ if (ret_type) {
13535+ pr_debug(
13536+ "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n",
13537+ start, end, cattr_name(actual_type),
13538+ cattr_name(req_type), cattr_name(*ret_type));
13539+ } else {
13540+ pr_debug(
13541+ "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s\n",
13542+ start, end, cattr_name(actual_type),
13543+ cattr_name(req_type));
13544+ }
13545+
13546+ spin_unlock(&memtype_lock);
13547+ return err;
13548+}
13549+
13550+int free_memtype(u64 start, u64 end)
13551+{
13552+ struct memtype *ml;
13553+ int err = -EINVAL;
13554+
13555+ /* Only track when pat_wc_enabled */
13556+ if (!pat_wc_enabled) {
13557+ return 0;
13558+ }
13559+
13560+ /* Low ISA region is always mapped WB. No need to track */
13561+ if (start >= ISA_START_ADDRESS && end <= ISA_END_ADDRESS) {
13562+ return 0;
13563+ }
13564+
13565+ spin_lock(&memtype_lock);
13566+ list_for_each_entry(ml, &memtype_list, nd) {
13567+ if (ml->start == start && ml->end == end) {
13568+ list_del(&ml->nd);
13569+ kfree(ml);
13570+ err = 0;
13571+ break;
13572+ }
13573+ }
13574+ spin_unlock(&memtype_lock);
13575+
13576+ if (err) {
13577+ printk(KERN_INFO "%s:%d freeing invalid memtype %Lx-%Lx\n",
13578+ current->comm, current->pid, start, end);
13579+ }
13580+
13581+ pr_debug("free_memtype request 0x%Lx-0x%Lx\n", start, end);
13582+ return err;
13583+}
13584+
13585+
13586+/*
13587+ * /dev/mem mmap interface. The memtype used for mapping varies:
13588+ * - Use UC for mappings with O_SYNC flag
13589+ * - Without O_SYNC flag, if there is any conflict in reserve_memtype,
13590+ * inherit the memtype from existing mapping.
13591+ * - Else use UC_MINUS memtype (for backward compatibility with existing
13592+ *   X drivers).
13593+ */
13594+pgprot_t phys_mem_access_prot(struct file *file, unsigned long mfn,
13595+ unsigned long size, pgprot_t vma_prot)
13596+{
13597+ return vma_prot;
13598+}
13599+
13600+#ifdef CONFIG_NONPROMISC_DEVMEM
13601+/* This check is done in drivers/char/mem.c in case of NONPROMISC_DEVMEM */
13602+static inline int range_is_allowed(unsigned long mfn, unsigned long size)
13603+{
13604+ return 1;
13605+}
13606+#else
13607+static inline int range_is_allowed(unsigned long mfn, unsigned long size)
13608+{
13609+ u64 from = ((u64)mfn) << PAGE_SHIFT;
13610+ u64 to = from + size;
13611+ u64 cursor = from;
13612+
13613+ while (cursor < to) {
13614+ if (!devmem_is_allowed(mfn)) {
13615+ printk(KERN_INFO
13616+ "Program %s tried to access /dev/mem between %Lx->%Lx.\n",
13617+ current->comm, from, to);
13618+ return 0;
13619+ }
13620+ cursor += PAGE_SIZE;
13621+ mfn++;
13622+ }
13623+ return 1;
13624+}
13625+#endif /* CONFIG_NONPROMISC_DEVMEM */
13626+
13627+int phys_mem_access_prot_allowed(struct file *file, unsigned long mfn,
13628+ unsigned long size, pgprot_t *vma_prot)
13629+{
13630+ u64 addr = (u64)mfn << PAGE_SHIFT;
13631+ unsigned long flags = _PAGE_CACHE_UC_MINUS;
13632+ int retval;
13633+
13634+ if (!range_is_allowed(mfn, size))
13635+ return 0;
13636+
13637+ if (file->f_flags & O_SYNC) {
13638+ flags = _PAGE_CACHE_UC;
13639+ }
13640+
13641+#ifndef CONFIG_X86_32
13642+#ifndef CONFIG_XEN /* Xen sets correct MTRR type on non-RAM for us. */
13643+ /*
13644+ * On the PPro and successors, the MTRRs are used to set
13645+ * memory types for physical addresses outside main memory,
13646+ * so blindly setting UC or PWT on those pages is wrong.
13647+ * For Pentiums and earlier, the surround logic should disable
13648+ * caching for the high addresses through the KEN pin, but
13649+ * we maintain the tradition of paranoia in this code.
13650+ */
13651+ if (!pat_wc_enabled &&
13652+ ! ( test_bit(X86_FEATURE_MTRR, boot_cpu_data.x86_capability) ||
13653+ test_bit(X86_FEATURE_K6_MTRR, boot_cpu_data.x86_capability) ||
13654+ test_bit(X86_FEATURE_CYRIX_ARR, boot_cpu_data.x86_capability) ||
13655+ test_bit(X86_FEATURE_CENTAUR_MCR, boot_cpu_data.x86_capability)) &&
13656+ (pfn << PAGE_SHIFT) >= __pa(high_memory)) {
13657+ flags = _PAGE_CACHE_UC;
13658+ }
13659+#endif
13660+#endif
13661+
13662+ /*
13663+ * With O_SYNC, we can only take UC mapping. Fail if we cannot.
13664+ * Without O_SYNC, we want to get
13665+ * - WB for WB-able memory and no other conflicting mappings
13666+ * - UC_MINUS for non-WB-able memory with no other conflicting mappings
13667+ * - Inherit from conflicting mappings otherwise
13668+ */
13669+ if (flags != _PAGE_CACHE_UC_MINUS) {
13670+ retval = reserve_memtype(addr, addr + size, flags, NULL);
13671+ } else {
13672+ retval = reserve_memtype(addr, addr + size, -1, &flags);
13673+ }
13674+
13675+ if (retval < 0)
13676+ return 0;
13677+
13678+ if (ioremap_check_change_attr(mfn, size, flags) < 0) {
13679+ free_memtype(addr, addr + size);
13680+ printk(KERN_INFO
13681+ "%s:%d /dev/mem ioremap_change_attr failed %s for %Lx-%Lx\n",
13682+ current->comm, current->pid,
13683+ cattr_name(flags),
13684+ addr, addr + size);
13685+ return 0;
13686+ }
13687+
13688+ *vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) |
13689+ flags);
13690+ return 1;
13691+}
13692+
13693+void map_devmem(unsigned long mfn, unsigned long size, pgprot_t vma_prot)
13694+{
13695+ u64 addr = (u64)mfn << PAGE_SHIFT;
13696+ unsigned long flags;
13697+ unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK);
13698+
13699+ reserve_memtype(addr, addr + size, want_flags, &flags);
13700+ if (flags != want_flags) {
13701+ printk(KERN_INFO
13702+ "%s:%d /dev/mem expected mapping type %s for %Lx-%Lx, got %s\n",
13703+ current->comm, current->pid,
13704+ cattr_name(want_flags),
13705+ addr, (unsigned long long)(addr + size),
13706+ cattr_name(flags));
13707+ }
13708+}
13709+
13710+void unmap_devmem(unsigned long mfn, unsigned long size, pgprot_t vma_prot)
13711+{
13712+ u64 addr = (u64)mfn << PAGE_SHIFT;
13713+
13714+ free_memtype(addr, addr + size);
13715+}
13716+
13717--- /dev/null 1970-01-01 00:00:00.000000000 +0000
13718+++ sle11-2009-05-14/arch/x86/mm/pgtable-xen.c 2009-03-16 16:38:05.000000000 +0100
13719@@ -0,0 +1,709 @@
13720+#include <linux/mm.h>
13721+#include <linux/module.h>
13722+#include <xen/features.h>
13723+#include <asm/pgalloc.h>
13724+#include <asm/pgtable.h>
13725+#include <asm/tlb.h>
13726+#include <asm/hypervisor.h>
13727+#include <asm/mmu_context.h>
13728+
13729+pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
13730+{
13731+ pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
13732+ if (pte)
13733+ make_lowmem_page_readonly(pte, XENFEAT_writable_page_tables);
13734+ return pte;
13735+}
13736+
13737+static void _pte_free(struct page *page, unsigned int order)
13738+{
13739+ BUG_ON(order);
13740+ __pte_free(page);
13741+}
13742+
13743+pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
13744+{
13745+ struct page *pte;
13746+
13747+#ifdef CONFIG_HIGHPTE
13748+ pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
13749+#else
13750+ pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
13751+#endif
13752+ if (pte) {
13753+ pgtable_page_ctor(pte);
13754+ SetPageForeign(pte, _pte_free);
13755+ init_page_count(pte);
13756+ }
13757+ return pte;
13758+}
13759+
13760+void __pte_free(pgtable_t pte)
13761+{
13762+ if (!PageHighMem(pte)) {
13763+ unsigned long va = (unsigned long)page_address(pte);
13764+ unsigned int level;
13765+ pte_t *ptep = lookup_address(va, &level);
13766+
13767+ BUG_ON(!ptep || level != PG_LEVEL_4K || !pte_present(*ptep));
13768+ if (!pte_write(*ptep)
13769+ && HYPERVISOR_update_va_mapping(va,
13770+ mk_pte(pte, PAGE_KERNEL),
13771+ 0))
13772+ BUG();
13773+ } else
13774+#ifdef CONFIG_HIGHPTE
13775+ ClearPagePinned(pte);
13776+#else
13777+ BUG();
13778+#endif
13779+
13780+ ClearPageForeign(pte);
13781+ init_page_count(pte);
13782+ pgtable_page_dtor(pte);
13783+ __free_page(pte);
13784+}
13785+
13786+void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
13787+{
13788+ pgtable_page_dtor(pte);
13789+ paravirt_release_pte(page_to_pfn(pte));
13790+ tlb_remove_page(tlb, pte);
13791+}
13792+
13793+#if PAGETABLE_LEVELS > 2
13794+static void _pmd_free(struct page *page, unsigned int order)
13795+{
13796+ BUG_ON(order);
13797+ __pmd_free(page);
13798+}
13799+
13800+pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address)
13801+{
13802+ struct page *pmd;
13803+
13804+ pmd = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
13805+ if (!pmd)
13806+ return NULL;
13807+ SetPageForeign(pmd, _pmd_free);
13808+ init_page_count(pmd);
13809+ return page_address(pmd);
13810+}
13811+
13812+void __pmd_free(pgtable_t pmd)
13813+{
13814+ unsigned long va = (unsigned long)page_address(pmd);
13815+ unsigned int level;
13816+ pte_t *ptep = lookup_address(va, &level);
13817+
13818+ BUG_ON(!ptep || level != PG_LEVEL_4K || !pte_present(*ptep));
13819+ if (!pte_write(*ptep)
13820+ && HYPERVISOR_update_va_mapping(va, mk_pte(pmd, PAGE_KERNEL), 0))
13821+ BUG();
13822+
13823+ ClearPageForeign(pmd);
13824+ init_page_count(pmd);
13825+ __free_page(pmd);
13826+}
13827+
13828+void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
13829+{
13830+ paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);
13831+ tlb_remove_page(tlb, virt_to_page(pmd));
13832+}
13833+
13834+#if PAGETABLE_LEVELS > 3
13835+void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
13836+{
13837+ paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
13838+ tlb_remove_page(tlb, virt_to_page(pud));
13839+}
13840+#endif /* PAGETABLE_LEVELS > 3 */
13841+#endif /* PAGETABLE_LEVELS > 2 */
13842+
13843+#ifndef CONFIG_X86_64
13844+#define TASK_SIZE64 TASK_SIZE
13845+#endif
13846+
13847+static void _pin_lock(struct mm_struct *mm, int lock) {
13848+ if (lock)
13849+ spin_lock(&mm->page_table_lock);
13850+#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
13851+ /* While mm->page_table_lock protects us against insertions and
13852+ * removals of higher level page table pages, it doesn't protect
13853+ * against updates of pte-s. Such updates, however, require the
13854+ * pte pages to be in consistent state (unpinned+writable or
13855+ * pinned+readonly). The pinning and attribute changes, however
13856+ * cannot be done atomically, which is why such updates must be
13857+ * prevented from happening concurrently.
13858+ * Note that no pte lock can ever elsewhere be acquired nesting
13859+ * with an already acquired one in the same mm, or with the mm's
13860+ * page_table_lock already acquired, as that would break in the
13861+ * non-split case (where all these are actually resolving to the
13862+ * one page_table_lock). Thus acquiring all of them here is not
13863+	 * going to result in deadlocks, and the order of acquires
13864+ * doesn't matter.
13865+ */
13866+ {
13867+ pgd_t *pgd = mm->pgd;
13868+ unsigned g;
13869+
13870+ for (g = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
13871+ pud_t *pud;
13872+ unsigned u;
13873+
13874+ if (pgd_none(*pgd))
13875+ continue;
13876+ pud = pud_offset(pgd, 0);
13877+ for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
13878+ pmd_t *pmd;
13879+ unsigned m;
13880+
13881+ if (pud_none(*pud))
13882+ continue;
13883+ pmd = pmd_offset(pud, 0);
13884+ for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
13885+ spinlock_t *ptl;
13886+
13887+ if (pmd_none(*pmd))
13888+ continue;
13889+ ptl = pte_lockptr(0, pmd);
13890+ if (lock)
13891+ spin_lock(ptl);
13892+ else
13893+ spin_unlock(ptl);
13894+ }
13895+ }
13896+ }
13897+ }
13898+#endif
13899+ if (!lock)
13900+ spin_unlock(&mm->page_table_lock);
13901+}
13902+#define pin_lock(mm) _pin_lock(mm, 1)
13903+#define pin_unlock(mm) _pin_lock(mm, 0)
13904+
13905+#define PIN_BATCH sizeof(void *)
13906+static DEFINE_PER_CPU(multicall_entry_t[PIN_BATCH], pb_mcl);
13907+
13908+static inline unsigned int pgd_walk_set_prot(struct page *page, pgprot_t flags,
13909+ unsigned int cpu, unsigned int seq)
13910+{
13911+ unsigned long pfn = page_to_pfn(page);
13912+
13913+ if (PageHighMem(page)) {
13914+ if (pgprot_val(flags) & _PAGE_RW)
13915+ ClearPagePinned(page);
13916+ else
13917+ SetPagePinned(page);
13918+ } else {
13919+ MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
13920+ (unsigned long)__va(pfn << PAGE_SHIFT),
13921+ pfn_pte(pfn, flags), 0);
13922+ if (unlikely(++seq == PIN_BATCH)) {
13923+ if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
13924+ PIN_BATCH, NULL)))
13925+ BUG();
13926+ seq = 0;
13927+ }
13928+ }
13929+
13930+ return seq;
13931+}
13932+
13933+static void pgd_walk(pgd_t *pgd_base, pgprot_t flags)
13934+{
13935+ pgd_t *pgd = pgd_base;
13936+ pud_t *pud;
13937+ pmd_t *pmd;
13938+ int g,u,m;
13939+ unsigned int cpu, seq;
13940+ multicall_entry_t *mcl;
13941+
13942+ if (xen_feature(XENFEAT_auto_translated_physmap))
13943+ return;
13944+
13945+ cpu = get_cpu();
13946+
13947+ /*
13948+ * Cannot iterate up to USER_PTRS_PER_PGD on x86-64 as these pagetables
13949+ * may not be the 'current' task's pagetables (e.g., current may be
13950+ * 32-bit, but the pagetables may be for a 64-bit task).
13951+ * Subtracting 1 from TASK_SIZE64 means the loop limit is correct
13952+ * regardless of whether TASK_SIZE64 is a multiple of PGDIR_SIZE.
13953+ */
13954+ for (g = 0, seq = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
13955+ if (pgd_none(*pgd))
13956+ continue;
13957+ pud = pud_offset(pgd, 0);
13958+ if (PTRS_PER_PUD > 1) /* not folded */
13959+ seq = pgd_walk_set_prot(virt_to_page(pud),flags,cpu,seq);
13960+ for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
13961+ if (pud_none(*pud))
13962+ continue;
13963+ pmd = pmd_offset(pud, 0);
13964+ if (PTRS_PER_PMD > 1) /* not folded */
13965+ seq = pgd_walk_set_prot(virt_to_page(pmd),flags,cpu,seq);
13966+ for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
13967+ if (pmd_none(*pmd))
13968+ continue;
13969+ seq = pgd_walk_set_prot(pmd_page(*pmd),flags,cpu,seq);
13970+ }
13971+ }
13972+ }
13973+
13974+ mcl = per_cpu(pb_mcl, cpu);
13975+#ifdef CONFIG_X86_64
13976+ if (unlikely(seq > PIN_BATCH - 2)) {
13977+ if (unlikely(HYPERVISOR_multicall_check(mcl, seq, NULL)))
13978+ BUG();
13979+ seq = 0;
13980+ }
13981+ MULTI_update_va_mapping(mcl + seq,
13982+ (unsigned long)__user_pgd(pgd_base),
13983+ pfn_pte(virt_to_phys(__user_pgd(pgd_base))>>PAGE_SHIFT, flags),
13984+ 0);
13985+ MULTI_update_va_mapping(mcl + seq + 1,
13986+ (unsigned long)pgd_base,
13987+ pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
13988+ UVMF_TLB_FLUSH);
13989+ if (unlikely(HYPERVISOR_multicall_check(mcl, seq + 2, NULL)))
13990+ BUG();
13991+#else
13992+ if (likely(seq != 0)) {
13993+ MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
13994+ (unsigned long)pgd_base,
13995+ pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
13996+ UVMF_TLB_FLUSH);
13997+ if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
13998+ seq + 1, NULL)))
13999+ BUG();
14000+ } else if(HYPERVISOR_update_va_mapping((unsigned long)pgd_base,
14001+ pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
14002+ UVMF_TLB_FLUSH))
14003+ BUG();
14004+#endif
14005+
14006+ put_cpu();
14007+}
14008+
14009+static void __pgd_pin(pgd_t *pgd)
14010+{
14011+ pgd_walk(pgd, PAGE_KERNEL_RO);
14012+ kmap_flush_unused();
14013+ xen_pgd_pin(__pa(pgd)); /* kernel */
14014+#ifdef CONFIG_X86_64
14015+ xen_pgd_pin(__pa(__user_pgd(pgd))); /* user */
14016+#endif
14017+ SetPagePinned(virt_to_page(pgd));
14018+}
14019+
14020+static void __pgd_unpin(pgd_t *pgd)
14021+{
14022+ xen_pgd_unpin(__pa(pgd));
14023+#ifdef CONFIG_X86_64
14024+ xen_pgd_unpin(__pa(__user_pgd(pgd)));
14025+#endif
14026+ pgd_walk(pgd, PAGE_KERNEL);
14027+ ClearPagePinned(virt_to_page(pgd));
14028+}
14029+
14030+static void pgd_test_and_unpin(pgd_t *pgd)
14031+{
14032+ if (PagePinned(virt_to_page(pgd)))
14033+ __pgd_unpin(pgd);
14034+}
14035+
14036+void mm_pin(struct mm_struct *mm)
14037+{
14038+ if (xen_feature(XENFEAT_writable_page_tables))
14039+ return;
14040+
14041+ pin_lock(mm);
14042+ __pgd_pin(mm->pgd);
14043+ pin_unlock(mm);
14044+}
14045+
14046+void mm_unpin(struct mm_struct *mm)
14047+{
14048+ if (xen_feature(XENFEAT_writable_page_tables))
14049+ return;
14050+
14051+ pin_lock(mm);
14052+ __pgd_unpin(mm->pgd);
14053+ pin_unlock(mm);
14054+}
14055+
14056+void mm_pin_all(void)
14057+{
14058+ struct page *page;
14059+ unsigned long flags;
14060+
14061+ if (xen_feature(XENFEAT_writable_page_tables))
14062+ return;
14063+
14064+ /*
14065+ * Allow uninterrupted access to the pgd_list. Also protects
14066+ * __pgd_pin() by disabling preemption.
14067+ * All other CPUs must be at a safe point (e.g., in stop_machine
14068+ * or offlined entirely).
14069+ */
14070+ spin_lock_irqsave(&pgd_lock, flags);
14071+ list_for_each_entry(page, &pgd_list, lru) {
14072+ if (!PagePinned(page))
14073+ __pgd_pin((pgd_t *)page_address(page));
14074+ }
14075+ spin_unlock_irqrestore(&pgd_lock, flags);
14076+}
14077+
14078+void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
14079+{
14080+ if (!PagePinned(virt_to_page(mm->pgd)))
14081+ mm_pin(mm);
14082+}
14083+
14084+void arch_exit_mmap(struct mm_struct *mm)
14085+{
14086+ struct task_struct *tsk = current;
14087+
14088+ task_lock(tsk);
14089+
14090+ /*
14091+ * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
14092+ * *much* faster this way, as no tlb flushes means bigger wrpt batches.
14093+ */
14094+ if (tsk->active_mm == mm) {
14095+ tsk->active_mm = &init_mm;
14096+ atomic_inc(&init_mm.mm_count);
14097+
14098+ switch_mm(mm, &init_mm, tsk);
14099+
14100+ atomic_dec(&mm->mm_count);
14101+ BUG_ON(atomic_read(&mm->mm_count) == 0);
14102+ }
14103+
14104+ task_unlock(tsk);
14105+
14106+ if (PagePinned(virt_to_page(mm->pgd))
14107+ && atomic_read(&mm->mm_count) == 1
14108+ && !mm->context.has_foreign_mappings)
14109+ mm_unpin(mm);
14110+}
14111+
14112+static inline void pgd_list_add(pgd_t *pgd)
14113+{
14114+ struct page *page = virt_to_page(pgd);
14115+
14116+ list_add(&page->lru, &pgd_list);
14117+}
14118+
14119+static inline void pgd_list_del(pgd_t *pgd)
14120+{
14121+ struct page *page = virt_to_page(pgd);
14122+
14123+ list_del(&page->lru);
14124+}
14125+
14126+#define UNSHARED_PTRS_PER_PGD \
14127+ (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
14128+
14129+static void pgd_ctor(void *p)
14130+{
14131+ pgd_t *pgd = p;
14132+ unsigned long flags;
14133+
14134+ pgd_test_and_unpin(pgd);
14135+
14136+ /* Clear usermode parts of PGD */
14137+ memset(pgd, 0, KERNEL_PGD_BOUNDARY*sizeof(pgd_t));
14138+
14139+ spin_lock_irqsave(&pgd_lock, flags);
14140+
14141+ /* If the pgd points to a shared pagetable level (either the
14142+ ptes in non-PAE, or shared PMD in PAE), then just copy the
14143+ references from swapper_pg_dir. */
14144+ if (PAGETABLE_LEVELS == 2 ||
14145+ (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD) ||
14146+ PAGETABLE_LEVELS == 4) {
14147+ clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
14148+ swapper_pg_dir + KERNEL_PGD_BOUNDARY,
14149+ KERNEL_PGD_PTRS);
14150+ paravirt_alloc_pmd_clone(__pa(pgd) >> PAGE_SHIFT,
14151+ __pa(swapper_pg_dir) >> PAGE_SHIFT,
14152+ KERNEL_PGD_BOUNDARY,
14153+ KERNEL_PGD_PTRS);
14154+ }
14155+
14156+#ifdef CONFIG_X86_64
14157+ /* set level3_user_pgt for vsyscall area */
14158+ __user_pgd(pgd)[pgd_index(VSYSCALL_START)] =
14159+ __pgd(__pa_symbol(level3_user_pgt) | _PAGE_TABLE);
14160+#endif
14161+
14162+#ifndef CONFIG_X86_PAE
14163+ /* list required to sync kernel mapping updates */
14164+ if (!SHARED_KERNEL_PMD)
14165+ pgd_list_add(pgd);
14166+#endif
14167+
14168+ spin_unlock_irqrestore(&pgd_lock, flags);
14169+}
14170+
14171+static void pgd_dtor(void *pgd)
14172+{
14173+ unsigned long flags; /* can be called from interrupt context */
14174+
14175+ if (!SHARED_KERNEL_PMD) {
14176+ spin_lock_irqsave(&pgd_lock, flags);
14177+ pgd_list_del(pgd);
14178+ spin_unlock_irqrestore(&pgd_lock, flags);
14179+ }
14180+
14181+ pgd_test_and_unpin(pgd);
14182+}
14183+
14184+/*
14185+ * List of all pgd's needed for non-PAE so it can invalidate entries
14186+ * in both cached and uncached pgd's; not needed for PAE since the
14187+ * kernel pmd is shared. If PAE were not to share the pmd a similar
14188+ * tactic would be needed. This is essentially codepath-based locking
14189+ * against pageattr.c; it is the unique case in which a valid change
14190+ * of kernel pagetables can't be lazily synchronized by vmalloc faults.
14191+ * vmalloc faults work because attached pagetables are never freed.
14192+ * -- wli
14193+ */
14194+
14195+#ifdef CONFIG_X86_PAE
14196+/*
14197+ * Mop up any pmd pages which may still be attached to the pgd.
14198+ * Normally they will be freed by munmap/exit_mmap, but any pmd we
14199+ * preallocate which never got a corresponding vma will need to be
14200+ * freed manually.
14201+ */
14202+static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
14203+{
14204+ int i;
14205+
14206+ for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) {
14207+ pgd_t pgd = pgdp[i];
14208+
14209+ if (__pgd_val(pgd) != 0) {
14210+ pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
14211+
14212+ pgdp[i] = xen_make_pgd(0);
14213+
14214+ paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
14215+ pmd_free(mm, pmd);
14216+ }
14217+ }
14218+
14219+ if (!xen_feature(XENFEAT_pae_pgdir_above_4gb))
14220+ xen_destroy_contiguous_region((unsigned long)pgdp, 0);
14221+}
14222+
14223+/*
14224+ * In PAE mode, we need to do a cr3 reload (=tlb flush) when
14225+ * updating the top-level pagetable entries to guarantee the
14226+ * processor notices the update. Since this is expensive, and
14227+ * all 4 top-level entries are used almost immediately in a
14228+ * new process's life, we just pre-populate them here.
14229+ *
14230+ * Also, if we're in a paravirt environment where the kernel pmd is
14231+ * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
14232+ * and initialize the kernel pmds here.
14233+ */
14234+static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
14235+{
14236+ pud_t *pud;
14237+ pmd_t *pmds[UNSHARED_PTRS_PER_PGD];
14238+ unsigned long addr, flags;
14239+ int i;
14240+
14241+ /*
14242+ * We can race save/restore (if we sleep during a GFP_KERNEL memory
14243+ * allocation). We therefore store virtual addresses of pmds as they
14244+ * do not change across save/restore, and poke the machine addresses
14245+ * into the pgdir under the pgd_lock.
14246+ */
14247+ for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD; i++, addr += PUD_SIZE) {
14248+ pmds[i] = pmd_alloc_one(mm, addr);
14249+ if (!pmds[i])
14250+ goto out_oom;
14251+ }
14252+
14253+ spin_lock_irqsave(&pgd_lock, flags);
14254+
14255+ /* Protect against save/restore: move below 4GB under pgd_lock. */
14256+ if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)
14257+ && xen_create_contiguous_region((unsigned long)pgd, 0, 32)) {
14258+ spin_unlock_irqrestore(&pgd_lock, flags);
14259+out_oom:
14260+ while (i--)
14261+ pmd_free(mm, pmds[i]);
14262+ return 0;
14263+ }
14264+
14265+ /* Copy kernel pmd contents and write-protect the new pmds. */
14266+ pud = pud_offset(pgd, 0);
14267+ for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD;
14268+ i++, pud++, addr += PUD_SIZE) {
14269+ if (i >= KERNEL_PGD_BOUNDARY) {
14270+ memcpy(pmds[i],
14271+ (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
14272+ sizeof(pmd_t) * PTRS_PER_PMD);
14273+ make_lowmem_page_readonly(
14274+ pmds[i], XENFEAT_writable_page_tables);
14275+ }
14276+
14277+ /* It is safe to poke machine addresses of pmds under the pgd_lock. */
14278+ pud_populate(mm, pud, pmds[i]);
14279+ }
14280+
14281+ /* List required to sync kernel mapping updates and
14282+ * to pin/unpin on save/restore. */
14283+ pgd_list_add(pgd);
14284+
14285+ spin_unlock_irqrestore(&pgd_lock, flags);
14286+
14287+ return 1;
14288+}
14289+
14290+void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
14291+{
14292+ struct page *page = virt_to_page(pmd);
14293+ unsigned long pfn = page_to_pfn(page);
14294+
14295+ paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
14296+
14297+ /* Note: almost everything apart from _PAGE_PRESENT is
14298+ reserved at the pmd (PDPT) level. */
14299+ if (PagePinned(virt_to_page(mm->pgd))) {
14300+ BUG_ON(PageHighMem(page));
14301+ BUG_ON(HYPERVISOR_update_va_mapping(
14302+ (unsigned long)__va(pfn << PAGE_SHIFT),
14303+ pfn_pte(pfn, PAGE_KERNEL_RO), 0));
14304+ set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
14305+ } else
14306+ *pudp = __pud(__pa(pmd) | _PAGE_PRESENT);
14307+
14308+ /*
14309+ * According to Intel App note "TLBs, Paging-Structure Caches,
14310+ * and Their Invalidation", April 2007, document 317080-001,
14311+ * section 8.1: in PAE mode we explicitly have to flush the
14312+ * TLB via cr3 if the top-level pgd is changed...
14313+ */
14314+ if (mm == current->active_mm)
14315+ xen_tlb_flush();
14316+}
14317+#else /* !CONFIG_X86_PAE */
14318+/* No need to prepopulate any pagetable entries in non-PAE modes. */
14319+static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
14320+{
14321+ return 1;
14322+}
14323+
14324+static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgd)
14325+{
14326+}
14327+#endif /* CONFIG_X86_PAE */
14328+
14329+#ifdef CONFIG_X86_64
14330+/* We allocate two contiguous pages for kernel and user. */
14331+#define PGD_ORDER 1
14332+#else
14333+#define PGD_ORDER 0
14334+#endif
14335+
14336+pgd_t *pgd_alloc(struct mm_struct *mm)
14337+{
14338+ pgd_t *pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, PGD_ORDER);
14339+
14340+ /* so that alloc_pd can use it */
14341+ mm->pgd = pgd;
14342+ if (pgd)
14343+ pgd_ctor(pgd);
14344+
14345+ if (pgd && !pgd_prepopulate_pmd(mm, pgd)) {
14346+ free_pages((unsigned long)pgd, PGD_ORDER);
14347+ pgd = NULL;
14348+ }
14349+
14350+ return pgd;
14351+}
14352+
14353+void pgd_free(struct mm_struct *mm, pgd_t *pgd)
14354+{
14355+ /*
14356+ * After this the pgd should not be pinned for the duration of this
14357+ * function's execution. We should never sleep and thus never race:
14358+ * 1. User pmds will not become write-protected under our feet due
14359+ * to a concurrent mm_pin_all().
14360+ * 2. The machine addresses in PGD entries will not become invalid
14361+ * due to a concurrent save/restore.
14362+ */
14363+ pgd_dtor(pgd);
14364+
14365+ pgd_mop_up_pmds(mm, pgd);
14366+ free_pages((unsigned long)pgd, PGD_ORDER);
14367+}
14368+
14369+/* blktap and gntdev need this, as otherwise they would implicitly (and
14370+ * needlessly, as they never use it) reference init_mm. */
14371+pte_t xen_ptep_get_and_clear_full(struct vm_area_struct *vma,
14372+ unsigned long addr, pte_t *ptep, int full)
14373+{
14374+ return ptep_get_and_clear_full(vma->vm_mm, addr, ptep, full);
14375+}
14376+EXPORT_SYMBOL_GPL(xen_ptep_get_and_clear_full);
14377+
14378+int ptep_set_access_flags(struct vm_area_struct *vma,
14379+ unsigned long address, pte_t *ptep,
14380+ pte_t entry, int dirty)
14381+{
14382+ int changed = !pte_same(*ptep, entry);
14383+
14384+ if (changed && dirty) {
14385+ if (likely(vma->vm_mm == current->mm)) {
14386+ if (HYPERVISOR_update_va_mapping(address,
14387+ entry,
14388+ (unsigned long)vma->vm_mm->cpu_vm_mask.bits|
14389+ UVMF_INVLPG|UVMF_MULTI))
14390+ BUG();
14391+ } else {
14392+ xen_l1_entry_update(ptep, entry);
14393+ flush_tlb_page(vma, address);
14394+ }
14395+ }
14396+
14397+ return changed;
14398+}
14399+
14400+int ptep_test_and_clear_young(struct vm_area_struct *vma,
14401+ unsigned long addr, pte_t *ptep)
14402+{
14403+ int ret = 0;
14404+
14405+ if (pte_young(*ptep))
14406+ ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
14407+ &ptep->pte);
14408+
14409+ if (ret)
14410+ pte_update(vma->vm_mm, addr, ptep);
14411+
14412+ return ret;
14413+}
14414+
14415+int ptep_clear_flush_young(struct vm_area_struct *vma,
14416+ unsigned long address, pte_t *ptep)
14417+{
14418+ pte_t pte = *ptep;
14419+ int young = pte_young(pte);
14420+
14421+ pte = pte_mkold(pte);
14422+ if (PagePinned(virt_to_page(vma->vm_mm->pgd)))
14423+ ptep_set_access_flags(vma, address, ptep, pte, young);
14424+ else if (young)
14425+ ptep->pte_low = pte.pte_low;
14426+
14427+ return young;
14428+}
14429--- sle11-2009-05-14.orig/arch/x86/mm/pgtable_32-xen.c 2009-03-16 16:33:40.000000000 +0100
14430+++ sle11-2009-05-14/arch/x86/mm/pgtable_32-xen.c 2009-03-16 16:38:05.000000000 +0100
14431@@ -1,7 +1,3 @@
14432-/*
14433- * linux/arch/i386/mm/pgtable.c
14434- */
14435-
14436 #include <linux/sched.h>
14437 #include <linux/kernel.h>
14438 #include <linux/errno.h>
14439@@ -41,7 +37,6 @@ void show_mem(void)
14440
14441 printk(KERN_INFO "Mem-info:\n");
14442 show_free_areas();
14443- printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
14444 for_each_online_pgdat(pgdat) {
14445 pgdat_resize_lock(pgdat, &flags);
14446 for (i = 0; i < pgdat->node_spanned_pages; ++i) {
14447@@ -157,243 +152,6 @@ void __init reserve_top_address(unsigned
14448 __VMALLOC_RESERVE += reserve;
14449 }
14450
14451-pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
14452-{
14453- pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
14454- if (pte)
14455- make_lowmem_page_readonly(pte, XENFEAT_writable_page_tables);
14456- return pte;
14457-}
14458-
14459-/*
14460- * List of all pgd's needed for non-PAE so it can invalidate entries
14461- * in both cached and uncached pgd's; not needed for PAE since the
14462- * kernel pmd is shared. If PAE were not to share the pmd a similar
14463- * tactic would be needed. This is essentially codepath-based locking
14464- * against pageattr.c; it is the unique case in which a valid change
14465- * of kernel pagetables can't be lazily synchronized by vmalloc faults.
14466- * vmalloc faults work because attached pagetables are never freed.
14467- * -- wli
14468- */
14469-static inline void pgd_list_add(pgd_t *pgd)
14470-{
14471- struct page *page = virt_to_page(pgd);
14472-
14473- list_add(&page->lru, &pgd_list);
14474-}
14475-
14476-static inline void pgd_list_del(pgd_t *pgd)
14477-{
14478- struct page *page = virt_to_page(pgd);
14479-
14480- list_del(&page->lru);
14481-}
14482-
14483-#define UNSHARED_PTRS_PER_PGD \
14484- (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD)
14485-
14486-static void pgd_ctor(void *p)
14487-{
14488- pgd_t *pgd = p;
14489- unsigned long flags;
14490-
14491- pgd_test_and_unpin(pgd);
14492-
14493- /* Clear usermode parts of PGD */
14494- memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
14495-
14496- spin_lock_irqsave(&pgd_lock, flags);
14497-
14498- /* If the pgd points to a shared pagetable level (either the
14499- ptes in non-PAE, or shared PMD in PAE), then just copy the
14500- references from swapper_pg_dir. */
14501- if (PAGETABLE_LEVELS == 2 ||
14502- (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD)) {
14503- clone_pgd_range(pgd + USER_PTRS_PER_PGD,
14504- swapper_pg_dir + USER_PTRS_PER_PGD,
14505- KERNEL_PGD_PTRS);
14506- paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
14507- __pa(swapper_pg_dir) >> PAGE_SHIFT,
14508- USER_PTRS_PER_PGD,
14509- KERNEL_PGD_PTRS);
14510- }
14511-
14512- /* list required to sync kernel mapping updates */
14513- if (PAGETABLE_LEVELS == 2)
14514- pgd_list_add(pgd);
14515-
14516- spin_unlock_irqrestore(&pgd_lock, flags);
14517-}
14518-
14519-static void pgd_dtor(void *pgd)
14520-{
14521- unsigned long flags; /* can be called from interrupt context */
14522-
14523- if (!SHARED_KERNEL_PMD) {
14524- spin_lock_irqsave(&pgd_lock, flags);
14525- pgd_list_del(pgd);
14526- spin_unlock_irqrestore(&pgd_lock, flags);
14527- }
14528-
14529- pgd_test_and_unpin(pgd);
14530-}
14531-
14532-#ifdef CONFIG_X86_PAE
14533-/*
14534- * Mop up any pmd pages which may still be attached to the pgd.
14535- * Normally they will be freed by munmap/exit_mmap, but any pmd we
14536- * preallocate which never got a corresponding vma will need to be
14537- * freed manually.
14538- */
14539-static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
14540-{
14541- int i;
14542-
14543- for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) {
14544- pgd_t pgd = pgdp[i];
14545-
14546- if (__pgd_val(pgd) != 0) {
14547- pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
14548-
14549- pgdp[i] = xen_make_pgd(0);
14550-
14551- paravirt_release_pd(pgd_val(pgd) >> PAGE_SHIFT);
14552- pmd_free(mm, pmd);
14553- }
14554- }
14555-}
14556-
14557-/*
14558- * In PAE mode, we need to do a cr3 reload (=tlb flush) when
14559- * updating the top-level pagetable entries to guarantee the
14560- * processor notices the update. Since this is expensive, and
14561- * all 4 top-level entries are used almost immediately in a
14562- * new process's life, we just pre-populate them here.
14563- *
14564- * Also, if we're in a paravirt environment where the kernel pmd is
14565- * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
14566- * and initialize the kernel pmds here.
14567- */
14568-static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
14569-{
14570- pud_t *pud;
14571- pmd_t *pmds[UNSHARED_PTRS_PER_PGD];
14572- unsigned long addr, flags;
14573- int i;
14574-
14575- /*
14576- * We can race save/restore (if we sleep during a GFP_KERNEL memory
14577- * allocation). We therefore store virtual addresses of pmds as they
14578- * do not change across save/restore, and poke the machine addresses
14579- * into the pgdir under the pgd_lock.
14580- */
14581- for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD; i++, addr += PUD_SIZE) {
14582- pmds[i] = pmd_alloc_one(mm, addr);
14583- if (!pmds[i])
14584- goto out_oom;
14585- }
14586-
14587- spin_lock_irqsave(&pgd_lock, flags);
14588-
14589- /* Protect against save/restore: move below 4GB under pgd_lock. */
14590- if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)
14591- && xen_create_contiguous_region((unsigned long)pgd, 0, 32)) {
14592- spin_unlock_irqrestore(&pgd_lock, flags);
14593-out_oom:
14594- while (i--)
14595- pmd_free(mm, pmds[i]);
14596- return 0;
14597- }
14598-
14599- /* Copy kernel pmd contents and write-protect the new pmds. */
14600- pud = pud_offset(pgd, 0);
14601- for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD;
14602- i++, pud++, addr += PUD_SIZE) {
14603- if (i >= USER_PTRS_PER_PGD) {
14604- memcpy(pmds[i],
14605- (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
14606- sizeof(pmd_t) * PTRS_PER_PMD);
14607- make_lowmem_page_readonly(
14608- pmds[i], XENFEAT_writable_page_tables);
14609- }
14610-
14611- /* It is safe to poke machine addresses of pmds under the pgd_lock. */
14612- pud_populate(mm, pud, pmds[i]);
14613- }
14614-
14615- /* List required to sync kernel mapping updates and
14616- * to pin/unpin on save/restore. */
14617- pgd_list_add(pgd);
14618-
14619- spin_unlock_irqrestore(&pgd_lock, flags);
14620-
14621- return 1;
14622-}
14623-#else /* !CONFIG_X86_PAE */
14624-/* No need to prepopulate any pagetable entries in non-PAE modes. */
14625-static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
14626-{
14627- return 1;
14628-}
14629-
14630-static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
14631-{
14632-}
14633-#endif /* CONFIG_X86_PAE */
14634-
14635-pgd_t *pgd_alloc(struct mm_struct *mm)
14636-{
14637- pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
14638-
14639- /* so that alloc_pd can use it */
14640- mm->pgd = pgd;
14641- if (pgd)
14642- pgd_ctor(pgd);
14643-
14644- if (pgd && !pgd_prepopulate_pmd(mm, pgd)) {
14645- free_page((unsigned long)pgd);
14646- pgd = NULL;
14647- }
14648-
14649- return pgd;
14650-}
14651-
14652-void pgd_free(struct mm_struct *mm, pgd_t *pgd)
14653-{
14654- /*
14655- * After this the pgd should not be pinned for the duration of this
14656- * function's execution. We should never sleep and thus never race:
14657- * 1. User pmds will not become write-protected under our feet due
14658- * to a concurrent mm_pin_all().
14659- * 2. The machine addresses in PGD entries will not become invalid
14660- * due to a concurrent save/restore.
14661- */
14662- pgd_dtor(pgd);
14663-
14664- if (PTRS_PER_PMD > 1 && !xen_feature(XENFEAT_pae_pgdir_above_4gb))
14665- xen_destroy_contiguous_region((unsigned long)pgd, 0);
14666-
14667- pgd_mop_up_pmds(mm, pgd);
14668- free_page((unsigned long)pgd);
14669-}
14670-
14671-void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
14672-{
14673- pgtable_page_dtor(pte);
14674- paravirt_release_pt(page_to_pfn(pte));
14675- tlb_remove_page(tlb, pte);
14676-}
14677-
14678-#ifdef CONFIG_X86_PAE
14679-
14680-void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
14681-{
14682- paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
14683- tlb_remove_page(tlb, virt_to_page(pmd));
14684-}
14685-
14686-#endif
14687-
14688 void make_lowmem_page_readonly(void *va, unsigned int feature)
14689 {
14690 pte_t *pte;
14691--- sle11-2009-05-14.orig/arch/x86/pci/i386.c 2009-05-14 10:56:29.000000000 +0200
14692+++ sle11-2009-05-14/arch/x86/pci/i386.c 2009-05-14 11:20:29.000000000 +0200
14693@@ -331,10 +331,14 @@ int pci_mmap_page_range(struct pci_dev *
14694 flags);
14695 }
14696
14697+#ifndef CONFIG_XEN
14698 if (((vma->vm_pgoff < max_low_pfn_mapped) ||
14699 (vma->vm_pgoff >= (1UL<<(32 - PAGE_SHIFT)) &&
14700 vma->vm_pgoff < max_pfn_mapped)) &&
14701 ioremap_change_attr((unsigned long)__va(addr), len, flags)) {
14702+#else
14703+ if (ioremap_check_change_attr(vma->vm_pgoff, len, flags)) {
14704+#endif
14705 free_memtype(addr, addr + len);
14706 return -EINVAL;
14707 }
14708--- sle11-2009-05-14.orig/arch/x86/pci/irq-xen.c 2009-03-16 16:33:40.000000000 +0100
14709+++ sle11-2009-05-14/arch/x86/pci/irq-xen.c 2009-03-16 16:38:05.000000000 +0100
14710@@ -140,9 +140,11 @@ static void __init pirq_peer_trick(void)
14711 busmap[e->bus] = 1;
14712 }
14713 for(i = 1; i < 256; i++) {
14714+ int node;
14715 if (!busmap[i] || pci_find_bus(0, i))
14716 continue;
14717- if (pci_scan_bus_with_sysdata(i))
14718+ node = get_mp_bus_to_node(i);
14719+ if (pci_scan_bus_on_node(i, &pci_root_ops, node))
14720 printk(KERN_INFO "PCI: Discovered primary peer "
14721 "bus %02x [IRQ]\n", i);
14722 }
14723@@ -204,7 +206,7 @@ static int pirq_ali_get(struct pci_dev *
14724 {
14725 static const unsigned char irqmap[16] = { 0, 9, 3, 10, 4, 5, 7, 6, 1, 11, 0, 12, 0, 14, 0, 15 };
14726
14727- WARN_ON_ONCE(pirq >= 16);
14728+ WARN_ON_ONCE(pirq > 16);
14729 return irqmap[read_config_nybble(router, 0x48, pirq-1)];
14730 }
14731
14732@@ -213,7 +215,7 @@ static int pirq_ali_set(struct pci_dev *
14733 static const unsigned char irqmap[16] = { 0, 8, 0, 2, 4, 5, 7, 6, 0, 1, 3, 9, 11, 0, 13, 15 };
14734 unsigned int val = irqmap[irq];
14735
14736- WARN_ON_ONCE(pirq >= 16);
14737+ WARN_ON_ONCE(pirq > 16);
14738 if (val) {
14739 write_config_nybble(router, 0x48, pirq-1, val);
14740 return 1;
14741@@ -264,7 +266,7 @@ static int pirq_via586_get(struct pci_de
14742 {
14743 static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 };
14744
14745- WARN_ON_ONCE(pirq >= 5);
14746+ WARN_ON_ONCE(pirq > 5);
14747 return read_config_nybble(router, 0x55, pirqmap[pirq-1]);
14748 }
14749
14750@@ -272,7 +274,7 @@ static int pirq_via586_set(struct pci_de
14751 {
14752 static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 };
14753
14754- WARN_ON_ONCE(pirq >= 5);
14755+ WARN_ON_ONCE(pirq > 5);
14756 write_config_nybble(router, 0x55, pirqmap[pirq-1], irq);
14757 return 1;
14758 }
14759@@ -286,7 +288,7 @@ static int pirq_ite_get(struct pci_dev *
14760 {
14761 static const unsigned char pirqmap[4] = { 1, 0, 2, 3 };
14762
14763- WARN_ON_ONCE(pirq >= 4);
14764+ WARN_ON_ONCE(pirq > 4);
14765 return read_config_nybble(router,0x43, pirqmap[pirq-1]);
14766 }
14767
14768@@ -294,7 +296,7 @@ static int pirq_ite_set(struct pci_dev *
14769 {
14770 static const unsigned char pirqmap[4] = { 1, 0, 2, 3 };
14771
14772- WARN_ON_ONCE(pirq >= 4);
14773+ WARN_ON_ONCE(pirq > 4);
14774 write_config_nybble(router, 0x43, pirqmap[pirq-1], irq);
14775 return 1;
14776 }
14777@@ -623,6 +625,13 @@ static __init int via_router_probe(struc
14778 */
14779 device = PCI_DEVICE_ID_VIA_8235;
14780 break;
14781+ case PCI_DEVICE_ID_VIA_8237:
14782+ /**
14783+ * Asus a7v600 bios wrongly reports 8237
14784+ * as 586-compatible
14785+ */
14786+ device = PCI_DEVICE_ID_VIA_8237;
14787+ break;
14788 }
14789 }
14790
14791--- sle11-2009-05-14.orig/arch/x86/vdso/vdso32-setup-xen.c 2009-03-16 16:33:40.000000000 +0100
14792+++ sle11-2009-05-14/arch/x86/vdso/vdso32-setup-xen.c 2009-03-16 16:38:05.000000000 +0100
14793@@ -164,7 +164,7 @@ static __init void relocate_vdso(Elf32_E
14794 Elf32_Shdr *shdr;
14795 int i;
14796
14797- BUG_ON(memcmp(ehdr->e_ident, ELFMAG, 4) != 0 ||
14798+ BUG_ON(memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0 ||
14799 !elf_check_arch_ia32(ehdr) ||
14800 ehdr->e_type != ET_DYN);
14801
14802@@ -233,8 +233,12 @@ void syscall32_cpu_init(void)
14803 BUG();
14804 #endif
14805
14806- if (use_sysenter < 0)
14807- use_sysenter = (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL);
14808+ if (use_sysenter < 0) {
14809+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
14810+ use_sysenter = 1;
14811+ if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR)
14812+ use_sysenter = 1;
14813+ }
14814 }
14815
14816 #define compat_uses_vma 1
14817@@ -337,8 +341,6 @@ int __init sysenter_setup(void)
14818
14819 #ifdef CONFIG_X86_32
14820 gate_vma_init();
14821-
14822- printk("Compat vDSO mapped to %08lx.\n", __fix_to_virt(FIX_VDSO));
14823 #endif
14824
14825 #if defined(CONFIG_X86_64) && CONFIG_XEN_COMPAT < 0x030200
14826@@ -383,6 +385,9 @@ int arch_setup_additional_pages(struct l
14827 int ret = 0;
14828 bool compat;
14829
14830+ if (vdso_enabled == VDSO_DISABLED)
14831+ return 0;
14832+
14833 down_write(&mm->mmap_sem);
14834
14835 /* Test compat mode once here, in case someone
14836--- sle11-2009-05-14.orig/drivers/acpi/processor_core.c 2009-02-16 15:58:14.000000000 +0100
14837+++ sle11-2009-05-14/drivers/acpi/processor_core.c 2009-03-16 16:38:05.000000000 +0100
14838@@ -657,7 +657,7 @@ static int acpi_processor_get_info(struc
14839 * of /proc/cpuinfo
14840 */
14841 status = acpi_evaluate_object(pr->handle, "_SUN", NULL, &buffer);
14842- if (ACPI_SUCCESS(status))
14843+ if (ACPI_SUCCESS(status) && pr->id != -1)
14844 arch_fix_phys_package_id(pr->id, object.integer.value);
14845
14846 return 0;
14847--- sle11-2009-05-14.orig/drivers/input/xen-kbdfront.c 2009-05-14 10:56:29.000000000 +0200
14848+++ sle11-2009-05-14/drivers/input/xen-kbdfront.c 2009-03-16 16:38:05.000000000 +0100
14849@@ -325,7 +325,6 @@ static struct xenbus_device_id xenkbd_id
14850
14851 static struct xenbus_driver xenkbd = {
14852 .name = "vkbd",
14853- .owner = THIS_MODULE,
14854 .ids = xenkbd_ids,
14855 .probe = xenkbd_probe,
14856 .remove = xenkbd_remove,
14857--- sle11-2009-05-14.orig/drivers/oprofile/cpu_buffer.c 2009-03-12 16:15:32.000000000 +0100
14858+++ sle11-2009-05-14/drivers/oprofile/cpu_buffer.c 2009-03-16 16:38:05.000000000 +0100
14859@@ -341,7 +341,7 @@ void oprofile_add_mode(int cpu_mode)
14860
14861 int oprofile_add_domain_switch(int32_t domain_id)
14862 {
14863- struct oprofile_cpu_buffer * cpu_buf = &cpu_buffer[smp_processor_id()];
14864+ struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(cpu_buffer);
14865
14866 /* should have space for switching into and out of domain
14867 (2 slots each) plus one sample and one cpu mode switch */
14868--- sle11-2009-05-14.orig/drivers/pci/msi-xen.c 2009-03-16 16:33:40.000000000 +0100
14869+++ sle11-2009-05-14/drivers/pci/msi-xen.c 2009-03-16 16:38:05.000000000 +0100
14870@@ -583,7 +583,7 @@ int pci_enable_msi(struct pci_dev* dev)
14871 EXPORT_SYMBOL(pci_enable_msi);
14872
14873 extern void pci_frontend_disable_msi(struct pci_dev* dev);
14874-void pci_disable_msi(struct pci_dev* dev)
14875+void pci_msi_shutdown(struct pci_dev* dev)
14876 {
14877 int pirq;
14878
14879@@ -612,6 +612,10 @@ void pci_disable_msi(struct pci_dev* dev
14880 pci_intx_for_msi(dev, 1);
14881 dev->msi_enabled = 0;
14882 }
14883+void pci_disable_msi(struct pci_dev* dev)
14884+{
14885+ pci_msi_shutdown(dev);
14886+}
14887 EXPORT_SYMBOL(pci_disable_msi);
14888
14889 /**
14890@@ -714,7 +718,7 @@ int pci_enable_msix(struct pci_dev* dev,
14891 EXPORT_SYMBOL(pci_enable_msix);
14892
14893 extern void pci_frontend_disable_msix(struct pci_dev* dev);
14894-void pci_disable_msix(struct pci_dev* dev)
14895+void pci_msix_shutdown(struct pci_dev* dev)
14896 {
14897 if (!pci_msi_enable)
14898 return;
14899@@ -751,6 +755,10 @@ void pci_disable_msix(struct pci_dev* de
14900 pci_intx_for_msi(dev, 1);
14901 dev->msix_enabled = 0;
14902 }
14903+void pci_disable_msix(struct pci_dev* dev)
14904+{
14905+ pci_msix_shutdown(dev);
14906+}
14907 EXPORT_SYMBOL(pci_disable_msix);
14908
14909 /**
14910--- sle11-2009-05-14.orig/drivers/video/Kconfig 2009-02-16 15:58:02.000000000 +0100
14911+++ sle11-2009-05-14/drivers/video/Kconfig 2009-03-16 16:38:05.000000000 +0100
14912@@ -2029,7 +2029,7 @@ config FB_VIRTUAL
14913
14914 config XEN_FBDEV_FRONTEND
14915 tristate "Xen virtual frame buffer support"
14916- depends on FB && XEN
14917+ depends on FB && PARAVIRT_XEN
14918 select FB_SYS_FILLRECT
14919 select FB_SYS_COPYAREA
14920 select FB_SYS_IMAGEBLIT
14921--- sle11-2009-05-14.orig/drivers/video/xen-fbfront.c 2009-05-14 10:56:29.000000000 +0200
14922+++ sle11-2009-05-14/drivers/video/xen-fbfront.c 2009-03-16 16:38:05.000000000 +0100
14923@@ -670,7 +670,6 @@ static struct xenbus_device_id xenfb_ids
14924
14925 static struct xenbus_driver xenfb = {
14926 .name = "vfb",
14927- .owner = THIS_MODULE,
14928 .ids = xenfb_ids,
14929 .probe = xenfb_probe,
14930 .remove = xenfb_remove,
14931--- sle11-2009-05-14.orig/drivers/xen/Kconfig 2009-03-04 11:28:34.000000000 +0100
14932+++ sle11-2009-05-14/drivers/xen/Kconfig 2009-03-16 16:38:05.000000000 +0100
14933@@ -2,8 +2,6 @@
14934 # This Kconfig describe xen options
14935 #
14936
14937-mainmenu "Xen Configuration"
14938-
14939 config XEN
14940 bool
14941
14942--- sle11-2009-05-14.orig/drivers/xen/Makefile 2009-02-16 16:17:21.000000000 +0100
14943+++ sle11-2009-05-14/drivers/xen/Makefile 2009-03-16 16:38:05.000000000 +0100
14944@@ -1,5 +1,8 @@
14945-obj-$(CONFIG_PARAVIRT_XEN) += grant-table.o
14946+obj-$(CONFIG_PARAVIRT_XEN) += grant-table.o features.o events.o
14947+xen-xencomm-$(CONFIG_PARAVIRT_XEN) := xencomm.o
14948+xen-balloon-$(CONFIG_PARAVIRT_XEN) := balloon.o
14949
14950+xen-balloon-$(CONFIG_XEN) := balloon/
14951 obj-$(CONFIG_XEN) += core/
14952 obj-$(CONFIG_XEN) += console/
14953 obj-$(CONFIG_XEN) += evtchn/
14954@@ -7,7 +10,8 @@ obj-y += xenbus/
14955 obj-$(CONFIG_XEN) += char/
14956
14957 obj-$(CONFIG_XEN) += util.o
14958-obj-$(CONFIG_XEN_BALLOON) += balloon/
14959+obj-$(CONFIG_XEN_XENCOMM) += $(xen-xencomm-y)
14960+obj-$(CONFIG_XEN_BALLOON) += $(xen-balloon-y)
14961 obj-$(CONFIG_XEN_BLKDEV_BACKEND) += blkback/
14962 obj-$(CONFIG_XEN_BLKDEV_TAP) += blktap/
14963 obj-$(CONFIG_XEN_NETDEV_BACKEND) += netback/
14964--- sle11-2009-05-14.orig/drivers/xen/blkfront/blkfront.c 2009-03-24 10:12:53.000000000 +0100
14965+++ sle11-2009-05-14/drivers/xen/blkfront/blkfront.c 2009-05-19 10:38:53.000000000 +0200
14966@@ -285,7 +285,11 @@ static void backend_changed(struct xenbu
14967 break;
14968
14969 case XenbusStateClosing:
14970- bd = bdget(info->dev);
14971+ if (!info->gd) {
14972+ xenbus_frontend_closed(dev);
14973+ break;
14974+ }
14975+ bd = bdget_disk(info->gd, 0);
14976 if (bd == NULL)
14977 xenbus_dev_fatal(dev, -ENODEV, "bdget failed");
14978
14979--- sle11-2009-05-14.orig/drivers/xen/blkfront/block.h 2009-03-24 10:11:58.000000000 +0100
14980+++ sle11-2009-05-14/drivers/xen/blkfront/block.h 2009-03-16 16:38:05.000000000 +0100
14981@@ -96,7 +96,6 @@ struct blk_shadow {
14982 struct blkfront_info
14983 {
14984 struct xenbus_device *xbdev;
14985- dev_t dev;
14986 struct gendisk *gd;
14987 int vdevice;
14988 blkif_vdev_t handle;
14989--- sle11-2009-05-14.orig/drivers/xen/blkfront/vbd.c 2009-02-16 16:17:21.000000000 +0100
14990+++ sle11-2009-05-14/drivers/xen/blkfront/vbd.c 2009-03-16 16:38:05.000000000 +0100
14991@@ -246,17 +246,32 @@ xlvbd_init_blk_queue(struct gendisk *gd,
14992 return 0;
14993 }
14994
14995-static int
14996-xlvbd_alloc_gendisk(int major, int minor, blkif_sector_t capacity, int vdevice,
14997- u16 vdisk_info, u16 sector_size,
14998- struct blkfront_info *info)
14999+int
15000+xlvbd_add(blkif_sector_t capacity, int vdevice, u16 vdisk_info,
15001+ u16 sector_size, struct blkfront_info *info)
15002 {
15003+ int major, minor;
15004 struct gendisk *gd;
15005 struct xlbd_major_info *mi;
15006 int nr_minors = 1;
15007 int err = -ENODEV;
15008 unsigned int offset;
15009
15010+ if ((vdevice>>EXT_SHIFT) > 1) {
15011+ /* this is above the extended range; something is wrong */
15012+ printk(KERN_WARNING "blkfront: vdevice 0x%x is above the extended range; ignoring\n", vdevice);
15013+ return -ENODEV;
15014+ }
15015+
15016+ if (!VDEV_IS_EXTENDED(vdevice)) {
15017+ major = BLKIF_MAJOR(vdevice);
15018+ minor = BLKIF_MINOR(vdevice);
15019+ }
15020+ else {
15021+ major = 202;
15022+ minor = BLKIF_MINOR_EXT(vdevice);
15023+ }
15024+
15025 BUG_ON(info->gd != NULL);
15026 BUG_ON(info->mi != NULL);
15027 BUG_ON(info->rq != NULL);
15028@@ -337,41 +352,6 @@ xlvbd_alloc_gendisk(int major, int minor
15029 return err;
15030 }
15031
15032-int
15033-xlvbd_add(blkif_sector_t capacity, int vdevice, u16 vdisk_info,
15034- u16 sector_size, struct blkfront_info *info)
15035-{
15036- struct block_device *bd;
15037- int err = 0;
15038- int major, minor;
15039-
15040- if ((vdevice>>EXT_SHIFT) > 1) {
15041- /* this is above the extended range; something is wrong */
15042- printk(KERN_WARNING "blkfront: vdevice 0x%x is above the extended range; ignoring\n", vdevice);
15043- return -ENODEV;
15044- }
15045-
15046- if (!VDEV_IS_EXTENDED(vdevice)) {
15047- major = BLKIF_MAJOR(vdevice);
15048- minor = BLKIF_MINOR(vdevice);
15049- }
15050- else {
15051- major = 202;
15052- minor = BLKIF_MINOR_EXT(vdevice);
15053- }
15054-
15055- info->dev = MKDEV(major, minor);
15056- bd = bdget(info->dev);
15057- if (bd == NULL)
15058- return -ENODEV;
15059-
15060- err = xlvbd_alloc_gendisk(major, minor, capacity, vdevice, vdisk_info,
15061- sector_size, info);
15062-
15063- bdput(bd);
15064- return err;
15065-}
15066-
15067 void
15068 xlvbd_del(struct blkfront_info *info)
15069 {
15070--- sle11-2009-05-14.orig/drivers/xen/blktap/blktap.c 2009-04-20 11:38:54.000000000 +0200
15071+++ sle11-2009-05-14/drivers/xen/blktap/blktap.c 2009-04-20 11:40:14.000000000 +0200
15072@@ -111,6 +111,7 @@ typedef struct tap_blkif {
15073 unsigned long mode; /*current switching mode */
15074 int minor; /*Minor number for tapdisk device */
15075 pid_t pid; /*tapdisk process id */
15076+ struct pid_namespace *pid_ns; /*... and its corresponding namespace */
15077 enum { RUNNING, CLEANSHUTDOWN } status; /*Detect a clean userspace
15078 shutdown */
15079 unsigned long *idx_map; /*Record the user ring id to kern
15080@@ -299,16 +300,14 @@ struct tap_vma_priv {
15081 struct page *map[];
15082 };
15083
15084-static struct page *blktap_nopage(struct vm_area_struct *vma,
15085- unsigned long address,
15086- int *type)
15087+static int blktap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
15088 {
15089 /*
15090 * if the page has not been mapped in by the driver then return
15091- * NOPAGE_SIGBUS to the domain.
15092+ * VM_FAULT_SIGBUS to the domain.
15093 */
15094
15095- return NOPAGE_SIGBUS;
15096+ return VM_FAULT_SIGBUS;
15097 }
15098
15099 static pte_t blktap_clear_pte(struct vm_area_struct *vma,
15100@@ -404,7 +403,7 @@ static void blktap_vma_close(struct vm_a
15101 }
15102
15103 struct vm_operations_struct blktap_vm_ops = {
15104- nopage: blktap_nopage,
15105+ fault: blktap_fault,
15106 zap_pte: blktap_clear_pte,
15107 close: blktap_vma_close,
15108 };
15109@@ -498,9 +497,8 @@ found:
15110 tapfds[minor] = info;
15111
15112 if ((class = get_xen_class()) != NULL)
15113- class_device_create(class, NULL,
15114- MKDEV(blktap_major, minor), NULL,
15115- "blktap%d", minor);
15116+ device_create(class, NULL, MKDEV(blktap_major, minor),
15117+ "blktap%d", minor);
15118 }
15119
15120 out:
15121@@ -542,7 +540,7 @@ void signal_tapdisk(int idx)
15122 return;
15123
15124 if (info->pid > 0) {
15125- ptask = find_task_by_pid(info->pid);
15126+ ptask = find_task_by_pid_ns(info->pid, info->pid_ns);
15127 if (ptask)
15128 info->status = CLEANSHUTDOWN;
15129 }
15130@@ -770,8 +768,9 @@ static int blktap_ioctl(struct inode *in
15131 {
15132 if (info) {
15133 info->pid = (pid_t)arg;
15134- DPRINTK("blktap: pid received %d\n",
15135- info->pid);
15136+ info->pid_ns = current->nsproxy->pid_ns;
15137+ DPRINTK("blktap: pid received %p:%d\n",
15138+ info->pid_ns, info->pid);
15139 }
15140 return 0;
15141 }
15142@@ -1684,9 +1683,7 @@ static int __init blkif_init(void)
15143 * We only create the device when a request of a new device is
15144 * made.
15145 */
15146- class_device_create(class, NULL,
15147- MKDEV(blktap_major, 0), NULL,
15148- "blktap0");
15149+ device_create(class, NULL, MKDEV(blktap_major, 0), "blktap0");
15150 } else {
15151 /* this is bad, but not fatal */
15152 WPRINTK("blktap: sysfs xen_class not created\n");
15153--- sle11-2009-05-14.orig/drivers/xen/char/mem.c 2008-12-15 11:27:22.000000000 +0100
15154+++ sle11-2009-05-14/drivers/xen/char/mem.c 2009-03-16 16:38:05.000000000 +0100
15155@@ -33,6 +33,27 @@ static inline int uncached_access(struct
15156 return 0;
15157 }
15158
15159+static inline int range_is_allowed(unsigned long pfn, unsigned long size)
15160+{
15161+#ifdef CONFIG_NONPROMISC_DEVMEM
15162+ u64 from = ((u64)pfn) << PAGE_SHIFT;
15163+ u64 to = from + size;
15164+ u64 cursor = from;
15165+
15166+ while (cursor < to) {
15167+ if (!devmem_is_allowed(pfn)) {
15168+ printk(KERN_INFO
15169+ "Program %s tried to access /dev/mem between %Lx->%Lx.\n",
15170+ current->comm, from, to);
15171+ return 0;
15172+ }
15173+ cursor += PAGE_SIZE;
15174+ pfn++;
15175+ }
15176+#endif
15177+ return 1;
15178+}
15179+
15180 /*
15181 * This funcion reads the *physical* memory. The f_pos points directly to the
15182 * memory location.
15183@@ -55,6 +76,9 @@ static ssize_t read_mem(struct file * fi
15184
15185 sz = min_t(unsigned long, sz, count);
15186
15187+ if (!range_is_allowed(p >> PAGE_SHIFT, count))
15188+ return -EPERM;
15189+
15190 v = ioremap(p, sz);
15191 if (IS_ERR(v) || v == NULL) {
15192 /*
15193@@ -103,6 +127,9 @@ static ssize_t write_mem(struct file * f
15194
15195 sz = min_t(unsigned long, sz, count);
15196
15197+ if (!range_is_allowed(p >> PAGE_SHIFT, sz))
15198+ return -EPERM;
15199+
15200 v = ioremap(p, sz);
15201 if (v == NULL)
15202 break;
15203@@ -131,6 +158,23 @@ static ssize_t write_mem(struct file * f
15204 }
15205
15206 #ifndef ARCH_HAS_DEV_MEM_MMAP_MEM
15207+static void mmap_mem_open(struct vm_area_struct *vma)
15208+{
15209+ map_devmem(vma->vm_pgoff, vma->vm_end - vma->vm_start,
15210+ vma->vm_page_prot);
15211+}
15212+
15213+static void mmap_mem_close(struct vm_area_struct *vma)
15214+{
15215+ unmap_devmem(vma->vm_pgoff, vma->vm_end - vma->vm_start,
15216+ vma->vm_page_prot);
15217+}
15218+
15219+static struct vm_operations_struct mmap_mem_ops = {
15220+ .open = mmap_mem_open,
15221+ .close = mmap_mem_close
15222+};
15223+
15224 static int xen_mmap_mem(struct file * file, struct vm_area_struct * vma)
15225 {
15226 size_t size = vma->vm_end - vma->vm_start;
15227@@ -138,6 +182,15 @@ static int xen_mmap_mem(struct file * fi
15228 if (uncached_access(file))
15229 vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
15230
15231+ if (!range_is_allowed(vma->vm_pgoff, size))
15232+ return -EPERM;
15233+
15234+ if (!phys_mem_access_prot_allowed(file, vma->vm_pgoff, size,
15235+ &vma->vm_page_prot))
15236+ return -EINVAL;
15237+
15238+ vma->vm_ops = &mmap_mem_ops;
15239+
15240 /* We want to return the real error code, not EAGAIN. */
15241 return direct_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
15242 size, vma->vm_page_prot, DOMID_IO);
15243--- sle11-2009-05-14.orig/drivers/xen/console/console.c 2008-12-15 11:26:44.000000000 +0100
15244+++ sle11-2009-05-14/drivers/xen/console/console.c 2009-03-16 16:38:05.000000000 +0100
15245@@ -552,16 +552,18 @@ static int xencons_write(
15246 return i;
15247 }
15248
15249-static void xencons_put_char(struct tty_struct *tty, u_char ch)
15250+static int xencons_put_char(struct tty_struct *tty, u_char ch)
15251 {
15252 unsigned long flags;
15253+ int ret;
15254
15255 if (DUMMY_TTY(tty))
15256- return;
15257+ return 0;
15258
15259 spin_lock_irqsave(&xencons_lock, flags);
15260- (void)__xencons_put_char(ch);
15261+ ret = __xencons_put_char(ch);
15262 spin_unlock_irqrestore(&xencons_lock, flags);
15263+ return ret;
15264 }
15265
15266 static void xencons_flush_chars(struct tty_struct *tty)
15267@@ -583,7 +585,7 @@ static void xencons_wait_until_sent(stru
15268 if (DUMMY_TTY(tty))
15269 return;
15270
15271- while (DRV(tty->driver)->chars_in_buffer(tty)) {
15272+ while (tty_chars_in_buffer(tty)) {
15273 set_current_state(TASK_INTERRUPTIBLE);
15274 schedule_timeout(1);
15275 if (signal_pending(current))
15276@@ -632,8 +634,7 @@ static void xencons_close(struct tty_str
15277
15278 tty->closing = 1;
15279 tty_wait_until_sent(tty, 0);
15280- if (DRV(tty->driver)->flush_buffer != NULL)
15281- DRV(tty->driver)->flush_buffer(tty);
15282+ tty_driver_flush_buffer(tty);
15283 if (tty->ldisc.flush_buffer != NULL)
15284 tty->ldisc.flush_buffer(tty);
15285 tty->closing = 0;
15286--- sle11-2009-05-14.orig/drivers/xen/core/machine_kexec.c 2009-02-17 11:46:41.000000000 +0100
15287+++ sle11-2009-05-14/drivers/xen/core/machine_kexec.c 2009-03-16 16:38:05.000000000 +0100
15288@@ -5,6 +5,7 @@
15289
15290 #include <linux/kexec.h>
15291 #include <xen/interface/kexec.h>
15292+#include <linux/reboot.h>
15293 #include <linux/mm.h>
15294 #include <linux/bootmem.h>
15295
15296@@ -90,6 +91,9 @@ void __init xen_machine_kexec_setup_reso
15297 xen_hypervisor_res.start = range.start;
15298 xen_hypervisor_res.end = range.start + range.size - 1;
15299 xen_hypervisor_res.flags = IORESOURCE_BUSY | IORESOURCE_MEM;
15300+#ifdef CONFIG_X86_64
15301+ insert_resource(&iomem_resource, &xen_hypervisor_res);
15302+#endif
15303
15304 /* fill in crashk_res if range is reserved by hypervisor */
15305
15306@@ -102,6 +106,9 @@ void __init xen_machine_kexec_setup_reso
15307 if (range.size) {
15308 crashk_res.start = range.start;
15309 crashk_res.end = range.start + range.size - 1;
15310+#ifdef CONFIG_X86_64
15311+ insert_resource(&iomem_resource, &crashk_res);
15312+#endif
15313 }
15314
15315 /* get physical address of vmcoreinfo */
15316@@ -153,11 +160,13 @@ void __init xen_machine_kexec_setup_reso
15317 return;
15318 }
15319
15320+#ifndef CONFIG_X86_64
15321 void __init xen_machine_kexec_register_resources(struct resource *res)
15322 {
15323 request_resource(res, &xen_hypervisor_res);
15324 machine_kexec_register_resources(res);
15325 }
15326+#endif
15327
15328 static void setup_load_arg(xen_kexec_image_t *xki, struct kimage *image)
15329 {
15330@@ -228,6 +237,11 @@ void machine_shutdown(void)
15331 /* do nothing */
15332 }
15333
15334+void machine_crash_shutdown(struct pt_regs *regs)
15335+{
15336+ /* The kernel is broken so disable interrupts */
15337+ local_irq_disable();
15338+}
15339
15340 /*
15341 * Local variables:
15342--- sle11-2009-05-14.orig/drivers/xen/core/smpboot.c 2009-03-16 16:33:40.000000000 +0100
15343+++ sle11-2009-05-14/drivers/xen/core/smpboot.c 2009-03-16 16:38:05.000000000 +0100
15344@@ -53,17 +53,16 @@ static DEFINE_PER_CPU(int, callfunc_irq)
15345 static char resched_name[NR_CPUS][15];
15346 static char callfunc_name[NR_CPUS][15];
15347
15348-u8 cpu_2_logical_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
15349+#ifdef CONFIG_X86_LOCAL_APIC
15350+#define set_cpu_to_apicid(cpu, apicid) (per_cpu(x86_cpu_to_apicid, cpu) = (apicid))
15351+#else
15352+#define set_cpu_to_apicid(cpu, apicid)
15353+#endif
15354
15355 DEFINE_PER_CPU(cpumask_t, cpu_sibling_map);
15356 DEFINE_PER_CPU(cpumask_t, cpu_core_map);
15357 EXPORT_PER_CPU_SYMBOL(cpu_core_map);
15358
15359-#if defined(__i386__)
15360-DEFINE_PER_CPU(u8, x86_cpu_to_apicid) = BAD_APICID;
15361-EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid);
15362-#endif
15363-
15364 void __init prefill_possible_map(void)
15365 {
15366 int i, rc;
15367@@ -154,7 +153,7 @@ static int __cpuinit xen_smp_intr_init(u
15368 }
15369
15370 #ifdef CONFIG_HOTPLUG_CPU
15371-static void xen_smp_intr_exit(unsigned int cpu)
15372+static void __cpuexit xen_smp_intr_exit(unsigned int cpu)
15373 {
15374 if (cpu != 0)
15375 local_teardown_timer(cpu);
15376@@ -263,8 +262,7 @@ void __init smp_prepare_cpus(unsigned in
15377 boot_cpu_data.apicid = apicid;
15378 cpu_data(0) = boot_cpu_data;
15379
15380- cpu_2_logical_apicid[0] = apicid;
15381- per_cpu(x86_cpu_to_apicid, 0) = apicid;
15382+ set_cpu_to_apicid(0, apicid);
15383
15384 current_thread_info()->cpu = 0;
15385
15386@@ -319,8 +317,7 @@ void __init smp_prepare_cpus(unsigned in
15387 cpu_data(cpu).cpu_index = cpu;
15388 cpu_data(cpu).apicid = apicid;
15389
15390- cpu_2_logical_apicid[cpu] = apicid;
15391- per_cpu(x86_cpu_to_apicid, cpu) = apicid;
15392+ set_cpu_to_apicid(cpu, apicid);
15393
15394 #ifdef __x86_64__
15395 cpu_pda(cpu)->pcurrent = idle;
15396@@ -375,7 +372,7 @@ static int __init initialize_cpu_present
15397 }
15398 core_initcall(initialize_cpu_present_map);
15399
15400-int __cpu_disable(void)
15401+int __cpuexit __cpu_disable(void)
15402 {
15403 cpumask_t map = cpu_online_map;
15404 unsigned int cpu = smp_processor_id();
15405@@ -392,7 +389,7 @@ int __cpu_disable(void)
15406 return 0;
15407 }
15408
15409-void __cpu_die(unsigned int cpu)
15410+void __cpuexit __cpu_die(unsigned int cpu)
15411 {
15412 while (HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL)) {
15413 current->state = TASK_UNINTERRUPTIBLE;
15414--- sle11-2009-05-14.orig/drivers/xen/core/xen_proc.c 2009-05-14 10:56:29.000000000 +0200
15415+++ sle11-2009-05-14/drivers/xen/core/xen_proc.c 2009-03-16 16:38:05.000000000 +0100
15416@@ -8,7 +8,7 @@ static struct proc_dir_entry *xen_base;
15417 struct proc_dir_entry *create_xen_proc_entry(const char *name, mode_t mode)
15418 {
15419 if ( xen_base == NULL )
15420- if ( (xen_base = proc_mkdir("xen", &proc_root)) == NULL )
15421+ if ( (xen_base = proc_mkdir("xen", NULL)) == NULL )
15422 panic("Couldn't create /proc/xen");
15423 return create_proc_entry(name, mode, xen_base);
15424 }
15425--- sle11-2009-05-14.orig/drivers/xen/fbfront/xenfb.c 2009-03-04 11:25:55.000000000 +0100
15426+++ sle11-2009-05-14/drivers/xen/fbfront/xenfb.c 2009-03-16 16:38:05.000000000 +0100
15427@@ -93,7 +93,7 @@ struct xenfb_info
15428 * only mappings. The former creates unfaulted pages. Preserves
15429 * invariant. The latter removes pages. Preserves invariant.
15430 *
15431- * 3. Holding both locks: xenfb_vm_nopage(). Extends the dirty
15432+ * 3. Holding both locks: xenfb_vm_fault(). Extends the dirty
15433 * rectangle and updates mappings consistently. Preserves
15434 * invariant.
15435 *
15436@@ -112,13 +112,13 @@ struct xenfb_info
15437 *
15438 * But FIXME: the invariant is too weak. It misses that the fault
15439 * record in mappings must be consistent with the mapping of pages in
15440- * the associated address space! do_no_page() updates the PTE after
15441- * xenfb_vm_nopage() returns, i.e. outside the critical region. This
15442+ * the associated address space! __do_fault() updates the PTE after
15443+ * xenfb_vm_fault() returns, i.e. outside the critical region. This
15444 * allows the following race:
15445 *
15446 * X writes to some address in the Xen frame buffer
15447- * Fault - call do_no_page()
15448- * call xenfb_vm_nopage()
15449+ * Fault - call __do_fault()
15450+ * call xenfb_vm_fault()
15451 * grab mm_lock
15452 * map->faults++;
15453 * release mm_lock
15454@@ -387,18 +387,17 @@ static void xenfb_vm_close(struct vm_are
15455 mutex_unlock(&info->mm_lock);
15456 }
15457
15458-static struct page *xenfb_vm_nopage(struct vm_area_struct *vma,
15459- unsigned long vaddr, int *type)
15460+static int xenfb_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
15461 {
15462 struct xenfb_mapping *map = vma->vm_private_data;
15463 struct xenfb_info *info = map->info;
15464- int pgnr = (vaddr - vma->vm_start) >> PAGE_SHIFT;
15465+ int pgnr = ((long)vmf->virtual_address - vma->vm_start) >> PAGE_SHIFT;
15466 unsigned long flags;
15467 struct page *page;
15468 int y1, y2;
15469
15470 if (pgnr >= info->nr_pages)
15471- return NOPAGE_SIGBUS;
15472+ return VM_FAULT_SIGBUS;
15473
15474 mutex_lock(&info->mm_lock);
15475 spin_lock_irqsave(&info->dirty_lock, flags);
15476@@ -414,16 +413,15 @@ static struct page *xenfb_vm_nopage(stru
15477 spin_unlock_irqrestore(&info->dirty_lock, flags);
15478 mutex_unlock(&info->mm_lock);
15479
15480- if (type)
15481- *type = VM_FAULT_MINOR;
15482+ vmf->page = page;
15483
15484- return page;
15485+ return VM_FAULT_MINOR;
15486 }
15487
15488 static struct vm_operations_struct xenfb_vm_ops = {
15489 .open = xenfb_vm_open,
15490 .close = xenfb_vm_close,
15491- .nopage = xenfb_vm_nopage,
15492+ .fault = xenfb_vm_fault,
15493 };
15494
15495 static int xenfb_mmap(struct fb_info *fb_info, struct vm_area_struct *vma)
15496--- sle11-2009-05-14.orig/drivers/xen/gntdev/gntdev.c 2009-03-16 16:33:40.000000000 +0100
15497+++ sle11-2009-05-14/drivers/xen/gntdev/gntdev.c 2009-03-16 16:38:05.000000000 +0100
15498@@ -392,7 +392,7 @@ nomem_out:
15499 static int __init gntdev_init(void)
15500 {
15501 struct class *class;
15502- struct class_device *device;
15503+ struct device *device;
15504
15505 if (!is_running_on_xen()) {
15506 printk(KERN_ERR "You must be running Xen to use gntdev\n");
15507@@ -417,8 +417,8 @@ static int __init gntdev_init(void)
15508 return 0;
15509 }
15510
15511- device = class_device_create(class, NULL, MKDEV(gntdev_major, 0),
15512- NULL, GNTDEV_NAME);
15513+ device = device_create(class, NULL, MKDEV(gntdev_major, 0),
15514+ GNTDEV_NAME);
15515 if (IS_ERR(device)) {
15516 printk(KERN_ERR "Error creating gntdev device in xen_class\n");
15517 printk(KERN_ERR "gntdev created with major number = %d\n",
15518@@ -435,7 +435,7 @@ static void __exit gntdev_exit(void)
15519 {
15520 struct class *class;
15521 if ((class = get_xen_class()) != NULL)
15522- class_device_destroy(class, MKDEV(gntdev_major, 0));
15523+ device_destroy(class, MKDEV(gntdev_major, 0));
15524 unregister_chrdev(gntdev_major, GNTDEV_NAME);
15525 }
15526
15527--- sle11-2009-05-14.orig/drivers/xen/netfront/netfront.c 2009-03-30 16:39:44.000000000 +0200
15528+++ sle11-2009-05-14/drivers/xen/netfront/netfront.c 2009-03-30 16:40:17.000000000 +0200
15529@@ -1464,8 +1464,7 @@ err:
15530 }
15531 }
15532
15533- while ((skb = __skb_dequeue(&errq)))
15534- kfree_skb(skb);
15535+ __skb_queue_purge(&errq);
15536
15537 while ((skb = __skb_dequeue(&rxq)) != NULL) {
15538 struct page *page = NETFRONT_SKB_CB(skb)->page;
15539@@ -1630,8 +1629,7 @@ static void netif_release_rx_bufs_flip(s
15540 }
15541 }
15542
15543- while ((skb = __skb_dequeue(&free_list)) != NULL)
15544- dev_kfree_skb(skb);
15545+ __skb_queue_purge(&free_list);
15546
15547 spin_unlock_bh(&np->rx_lock);
15548 }
15549--- sle11-2009-05-14.orig/drivers/xen/privcmd/privcmd.c 2009-03-04 11:28:34.000000000 +0100
15550+++ sle11-2009-05-14/drivers/xen/privcmd/privcmd.c 2009-03-16 16:38:05.000000000 +0100
15551@@ -261,15 +261,13 @@ static long privcmd_ioctl(struct file *f
15552 }
15553
15554 #ifndef HAVE_ARCH_PRIVCMD_MMAP
15555-static struct page *privcmd_nopage(struct vm_area_struct *vma,
15556- unsigned long address,
15557- int *type)
15558+static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
15559 {
15560- return NOPAGE_SIGBUS;
15561+ return VM_FAULT_SIGBUS;
15562 }
15563
15564 static struct vm_operations_struct privcmd_vm_ops = {
15565- .nopage = privcmd_nopage
15566+ .fault = privcmd_fault
15567 };
15568
15569 static int privcmd_mmap(struct file * file, struct vm_area_struct * vma)
15570--- sle11-2009-05-14.orig/drivers/xen/xenbus/xenbus_client.c 2009-03-24 10:12:22.000000000 +0100
15571+++ sle11-2009-05-14/drivers/xen/xenbus/xenbus_client.c 2009-03-24 10:13:17.000000000 +0100
15572@@ -442,7 +442,7 @@ int xenbus_map_ring_valloc(struct xenbus
15573
15574 *vaddr = NULL;
15575
15576- area = alloc_vm_area(PAGE_SIZE);
15577+ area = xen_alloc_vm_area(PAGE_SIZE);
15578 if (!area)
15579 return -ENOMEM;
15580
15581@@ -452,7 +452,7 @@ int xenbus_map_ring_valloc(struct xenbus
15582 BUG();
15583
15584 if (op.status != GNTST_okay) {
15585- free_vm_area(area);
15586+ xen_free_vm_area(area);
15587 xenbus_dev_fatal(dev, op.status,
15588 "mapping in shared page %d from domain %d",
15589 gnt_ref, dev->otherend_id);
15590@@ -551,7 +551,7 @@ int xenbus_unmap_ring_vfree(struct xenbu
15591 BUG();
15592
15593 if (op.status == GNTST_okay)
15594- free_vm_area(area);
15595+ xen_free_vm_area(area);
15596 else
15597 xenbus_dev_error(dev, op.status,
15598 "unmapping page at handle %d error %d",
15599--- sle11-2009-05-14.orig/drivers/xen/xenbus/xenbus_probe.c 2009-02-16 16:18:36.000000000 +0100
15600+++ sle11-2009-05-14/drivers/xen/xenbus/xenbus_probe.c 2009-03-16 16:38:05.000000000 +0100
15601@@ -173,7 +173,7 @@ static int read_backend_details(struct x
15602 return read_otherend_details(xendev, "backend-id", "backend");
15603 }
15604
15605-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16) && (defined(CONFIG_XEN) || defined(MODULE))
15606+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16)
15607 static int xenbus_uevent_frontend(struct device *dev, struct kobj_uevent_env *env)
15608 {
15609 struct xenbus_device *xdev;
15610@@ -185,8 +185,10 @@ static int xenbus_uevent_frontend(struct
15611 return -ENODEV;
15612
15613 /* stuff we want to pass to /sbin/hotplug */
15614+#if defined(CONFIG_XEN) || defined(MODULE)
15615 add_uevent_var(env, "XENBUS_TYPE=%s", xdev->devicetype);
15616 add_uevent_var(env, "XENBUS_PATH=%s", xdev->nodename);
15617+#endif
15618 add_uevent_var(env, "MODALIAS=xen:%s", xdev->devicetype);
15619
15620 return 0;
15621@@ -207,10 +209,8 @@ static struct xen_bus_type xenbus_fronte
15622 .probe = xenbus_dev_probe,
15623 .remove = xenbus_dev_remove,
15624 .shutdown = xenbus_dev_shutdown,
15625-#if defined(CONFIG_XEN) || defined(MODULE)
15626 .uevent = xenbus_uevent_frontend,
15627 #endif
15628-#endif
15629 },
15630 #if defined(CONFIG_XEN) || defined(MODULE)
15631 .dev = {
15632@@ -519,6 +519,15 @@ static ssize_t xendev_show_devtype(struc
15633 }
15634 DEVICE_ATTR(devtype, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_devtype, NULL);
15635
15636+static ssize_t xendev_show_modalias(struct device *dev,
15637+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,13)
15638+ struct device_attribute *attr,
15639+#endif
15640+ char *buf)
15641+{
15642+ return sprintf(buf, "xen:%s\n", to_xenbus_device(dev)->devicetype);
15643+}
15644+DEVICE_ATTR(modalias, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_modalias, NULL);
15645
15646 int xenbus_probe_node(struct xen_bus_type *bus,
15647 const char *type,
15648@@ -579,10 +588,16 @@ int xenbus_probe_node(struct xen_bus_typ
15649
15650 err = device_create_file(&xendev->dev, &dev_attr_devtype);
15651 if (err)
15652- goto fail_remove_file;
15653+ goto fail_remove_nodename;
15654+
15655+ err = device_create_file(&xendev->dev, &dev_attr_modalias);
15656+ if (err)
15657+ goto fail_remove_devtype;
15658
15659 return 0;
15660-fail_remove_file:
15661+fail_remove_devtype:
15662+ device_remove_file(&xendev->dev, &dev_attr_devtype);
15663+fail_remove_nodename:
15664 device_remove_file(&xendev->dev, &dev_attr_nodename);
15665 fail_unregister:
15666 device_unregister(&xendev->dev);
15667--- sle11-2009-05-14.orig/fs/aio.c 2009-03-24 10:11:37.000000000 +0100
15668+++ sle11-2009-05-14/fs/aio.c 2009-03-24 10:13:25.000000000 +0100
15669@@ -1271,6 +1271,7 @@ static void io_destroy(struct kioctx *io
15670 #ifdef CONFIG_EPOLL
15671 /* forget the poll file, but it's up to the user to close it */
15672 if (ioctx->file) {
15673+ fput(ioctx->file);
15674 ioctx->file->private_data = 0;
15675 ioctx->file = 0;
15676 }
15677@@ -1295,6 +1296,7 @@ static int aio_queue_fd_close(struct ino
15678 spin_lock_irq(&ioctx->ctx_lock);
15679 ioctx->file = 0;
15680 spin_unlock_irq(&ioctx->ctx_lock);
15681+ fput(file);
15682 }
15683 return 0;
15684 }
15685@@ -1330,16 +1332,17 @@ static const struct file_operations aioq
15686
15687 static int make_aio_fd(struct kioctx *ioctx)
15688 {
15689- int error, fd;
15690- struct inode *inode;
15691+ int fd;
15692 struct file *file;
15693
15694- error = anon_inode_getfd(&fd, &inode, &file, "[aioq]",
15695- &aioq_fops, ioctx);
15696- if (error)
15697- return error;
15698+ fd = anon_inode_getfd("[aioq]", &aioq_fops, ioctx);
15699+ if (fd < 0)
15700+ return fd;
15701
15702 /* associate the file with the IO context */
15703+ file = fget(fd);
15704+ if (!file)
15705+ return -EBADF;
15706 file->private_data = ioctx;
15707 ioctx->file = file;
15708 init_waitqueue_head(&ioctx->poll_wait);
15709--- sle11-2009-05-14.orig/include/asm-x86/dma-mapping.h 2009-05-14 10:56:29.000000000 +0200
15710+++ sle11-2009-05-14/include/asm-x86/dma-mapping.h 2009-03-16 16:38:05.000000000 +0100
15711@@ -223,8 +223,13 @@ static inline dma_addr_t dma_map_page(st
15712 struct dma_mapping_ops *ops = get_dma_ops(dev);
15713
15714 BUG_ON(!valid_dma_direction(direction));
15715+#ifndef CONFIG_XEN
15716 return ops->map_single(dev, page_to_phys(page) + offset,
15717 size, direction);
15718+#else
15719+ return ops->map_single(dev, page_to_pseudophys(page) + offset,
15720+ size, direction);
15721+#endif
15722 }
15723
15724 static inline void dma_unmap_page(struct device *dev, dma_addr_t addr,
15725--- sle11-2009-05-14.orig/include/asm-x86/genapic_64.h 2009-05-14 10:56:29.000000000 +0200
15726+++ sle11-2009-05-14/include/asm-x86/genapic_64.h 2009-03-16 16:38:05.000000000 +0100
15727@@ -46,6 +46,7 @@ extern struct genapic apic_x2apic_phys;
15728 extern int acpi_madt_oem_check(char *, char *);
15729
15730 extern void apic_send_IPI_self(int vector);
15731+#ifndef CONFIG_XEN
15732 enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC};
15733 extern enum uv_system_type get_uv_system_type(void);
15734 extern int is_uv_system(void);
15735@@ -55,6 +56,10 @@ DECLARE_PER_CPU(int, x2apic_extra_bits);
15736 extern void uv_cpu_init(void);
15737 extern void uv_system_init(void);
15738 extern int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip);
15739+#else
15740+#define is_uv_system() 0
15741+#define uv_cpu_init() ((void)0)
15742+#endif
15743
15744 extern void setup_apic_routing(void);
15745
15746--- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/desc.h 2009-03-16 16:33:40.000000000 +0100
15747+++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/desc.h 2009-03-16 16:38:05.000000000 +0100
15748@@ -64,8 +64,8 @@ static inline struct desc_struct *get_cp
15749 }
15750
15751 static inline void pack_gate(gate_desc *gate, unsigned char type,
15752- unsigned long base, unsigned dpl, unsigned flags, unsigned short seg)
15753-
15754+ unsigned long base, unsigned dpl, unsigned flags,
15755+ unsigned short seg)
15756 {
15757 gate->a = (seg << 16) | (base & 0xffff);
15758 gate->b = (base & 0xffff0000) |
15759@@ -84,22 +84,23 @@ static inline int desc_empty(const void
15760 #define load_TR_desc() native_load_tr_desc()
15761 #define load_gdt(dtr) native_load_gdt(dtr)
15762 #define load_idt(dtr) native_load_idt(dtr)
15763-#define load_tr(tr) __asm__ __volatile("ltr %0"::"m" (tr))
15764-#define load_ldt(ldt) __asm__ __volatile("lldt %0"::"m" (ldt))
15765+#define load_tr(tr) asm volatile("ltr %0"::"m" (tr))
15766+#define load_ldt(ldt) asm volatile("lldt %0"::"m" (ldt))
15767
15768 #define store_gdt(dtr) native_store_gdt(dtr)
15769 #define store_idt(dtr) native_store_idt(dtr)
15770 #define store_tr(tr) (tr = native_store_tr())
15771-#define store_ldt(ldt) __asm__ ("sldt %0":"=m" (ldt))
15772+#define store_ldt(ldt) asm("sldt %0":"=m" (ldt))
15773
15774 #define load_TLS(t, cpu) native_load_tls(t, cpu)
15775 #define set_ldt native_set_ldt
15776
15777-#define write_ldt_entry(dt, entry, desc) \
15778- native_write_ldt_entry(dt, entry, desc)
15779-#define write_gdt_entry(dt, entry, desc, type) \
15780- native_write_gdt_entry(dt, entry, desc, type)
15781-#define write_idt_entry(dt, entry, g) native_write_idt_entry(dt, entry, g)
15782+#define write_ldt_entry(dt, entry, desc) \
15783+ native_write_ldt_entry(dt, entry, desc)
15784+#define write_gdt_entry(dt, entry, desc, type) \
15785+ native_write_gdt_entry(dt, entry, desc, type)
15786+#define write_idt_entry(dt, entry, g) \
15787+ native_write_idt_entry(dt, entry, g)
15788
15789 static inline void native_write_idt_entry(gate_desc *idt, int entry,
15790 const gate_desc *gate)
15791@@ -138,8 +139,8 @@ static inline void pack_descriptor(struc
15792 {
15793 desc->a = ((base & 0xffff) << 16) | (limit & 0xffff);
15794 desc->b = (base & 0xff000000) | ((base & 0xff0000) >> 16) |
15795- (limit & 0x000f0000) | ((type & 0xff) << 8) |
15796- ((flags & 0xf) << 20);
15797+ (limit & 0x000f0000) | ((type & 0xff) << 8) |
15798+ ((flags & 0xf) << 20);
15799 desc->p = 1;
15800 }
15801
15802@@ -160,7 +161,6 @@ static inline void set_tssldt_descriptor
15803 desc->base2 = (PTR_MIDDLE(addr) >> 8) & 0xFF;
15804 desc->base3 = PTR_HIGH(addr);
15805 #else
15806-
15807 pack_descriptor((struct desc_struct *)d, addr, size, 0x80 | type, 0);
15808 #endif
15809 }
15810@@ -178,7 +178,8 @@ static inline void __set_tss_desc(unsign
15811 * last valid byte
15812 */
15813 set_tssldt_descriptor(&tss, (unsigned long)addr, DESC_TSS,
15814- IO_BITMAP_OFFSET + IO_BITMAP_BYTES + sizeof(unsigned long) - 1);
15815+ IO_BITMAP_OFFSET + IO_BITMAP_BYTES +
15816+ sizeof(unsigned long) - 1);
15817 write_gdt_entry(d, entry, &tss, DESC_TSS);
15818 }
15819
15820@@ -187,16 +188,16 @@ static inline void __set_tss_desc(unsign
15821 static inline void native_set_ldt(const void *addr, unsigned int entries)
15822 {
15823 if (likely(entries == 0))
15824- __asm__ __volatile__("lldt %w0"::"q" (0));
15825+ asm volatile("lldt %w0"::"q" (0));
15826 else {
15827 unsigned cpu = smp_processor_id();
15828 ldt_desc ldt;
15829
15830- set_tssldt_descriptor(&ldt, (unsigned long)addr,
15831- DESC_LDT, entries * sizeof(ldt) - 1);
15832+ set_tssldt_descriptor(&ldt, (unsigned long)addr, DESC_LDT,
15833+ entries * LDT_ENTRY_SIZE - 1);
15834 write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT,
15835 &ldt, DESC_LDT);
15836- __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
15837+ asm volatile("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
15838 }
15839 }
15840
15841@@ -261,15 +262,15 @@ static inline void xen_load_tls(struct t
15842 }
15843 #endif
15844
15845-#define _LDT_empty(info) (\
15846- (info)->base_addr == 0 && \
15847- (info)->limit == 0 && \
15848- (info)->contents == 0 && \
15849- (info)->read_exec_only == 1 && \
15850- (info)->seg_32bit == 0 && \
15851- (info)->limit_in_pages == 0 && \
15852- (info)->seg_not_present == 1 && \
15853- (info)->useable == 0)
15854+#define _LDT_empty(info) \
15855+ ((info)->base_addr == 0 && \
15856+ (info)->limit == 0 && \
15857+ (info)->contents == 0 && \
15858+ (info)->read_exec_only == 1 && \
15859+ (info)->seg_32bit == 0 && \
15860+ (info)->limit_in_pages == 0 && \
15861+ (info)->seg_not_present == 1 && \
15862+ (info)->useable == 0)
15863
15864 #ifdef CONFIG_X86_64
15865 #define LDT_empty(info) (_LDT_empty(info) && ((info)->lm == 0))
15866@@ -309,7 +310,7 @@ static inline unsigned long get_desc_lim
15867
15868 #ifndef CONFIG_X86_NO_IDT
15869 static inline void _set_gate(int gate, unsigned type, void *addr,
15870- unsigned dpl, unsigned ist, unsigned seg)
15871+ unsigned dpl, unsigned ist, unsigned seg)
15872 {
15873 gate_desc s;
15874 pack_gate(&s, type, (unsigned long)addr, dpl, ist, seg);
15875@@ -393,10 +394,10 @@ static inline void set_system_gate_ist(i
15876 * Will read the base address of GDT_ENTRY_ESPFIX_SS and put it into %eax.
15877 */
15878 #define GET_DESC_BASE(idx, gdt, base, lo_w, lo_b, hi_b) \
15879- movb idx*8+4(gdt), lo_b; \
15880- movb idx*8+7(gdt), hi_b; \
15881- shll $16, base; \
15882- movw idx*8+2(gdt), lo_w;
15883+ movb idx * 8 + 4(gdt), lo_b; \
15884+ movb idx * 8 + 7(gdt), hi_b; \
15885+ shll $16, base; \
15886+ movw idx * 8 + 2(gdt), lo_w;
15887
15888
15889 #endif /* __ASSEMBLY__ */
15890--- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/dma-mapping.h 2009-02-16 16:18:36.000000000 +0100
15891+++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/dma-mapping.h 2009-03-16 16:38:05.000000000 +0100
15892@@ -1,5 +1,17 @@
15893-#ifdef CONFIG_X86_32
15894-# include "dma-mapping_32.h"
15895-#else
15896-# include "dma-mapping_64.h"
15897-#endif
15898+#ifndef _ASM_DMA_MAPPING_H_
15899+
15900+#include "../../dma-mapping.h"
15901+
15902+static inline int
15903+address_needs_mapping(struct device *hwdev, dma_addr_t addr)
15904+{
15905+ dma_addr_t mask = 0xffffffff;
15906+ /* If the device has a mask, use it, otherwise default to 32 bits */
15907+ if (hwdev && hwdev->dma_mask)
15908+ mask = *hwdev->dma_mask;
15909+ return (addr & ~mask) != 0;
15910+}
15911+
15912+extern int range_straddles_page_boundary(paddr_t p, size_t size);
15913+
15914+#endif /* _ASM_DMA_MAPPING_H_ */
15915--- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/dma-mapping_32.h 2009-03-16 16:33:40.000000000 +0100
15916+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
15917@@ -1,141 +0,0 @@
15918-#ifndef _ASM_I386_DMA_MAPPING_H
15919-#define _ASM_I386_DMA_MAPPING_H
15920-
15921-/*
15922- * IOMMU interface. See Documentation/DMA-mapping.txt and DMA-API.txt for
15923- * documentation.
15924- */
15925-
15926-#include <linux/mm.h>
15927-#include <linux/scatterlist.h>
15928-#include <asm/cache.h>
15929-#include <asm/io.h>
15930-#include <asm/swiotlb.h>
15931-
15932-static inline int
15933-address_needs_mapping(struct device *hwdev, dma_addr_t addr)
15934-{
15935- dma_addr_t mask = 0xffffffff;
15936- /* If the device has a mask, use it, otherwise default to 32 bits */
15937- if (hwdev && hwdev->dma_mask)
15938- mask = *hwdev->dma_mask;
15939- return (addr & ~mask) != 0;
15940-}
15941-
15942-extern int range_straddles_page_boundary(paddr_t p, size_t size);
15943-
15944-#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
15945-#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
15946-
15947-void *dma_alloc_coherent(struct device *dev, size_t size,
15948- dma_addr_t *dma_handle, gfp_t flag);
15949-
15950-void dma_free_coherent(struct device *dev, size_t size,
15951- void *vaddr, dma_addr_t dma_handle);
15952-
15953-extern dma_addr_t
15954-dma_map_single(struct device *dev, void *ptr, size_t size,
15955- enum dma_data_direction direction);
15956-
15957-extern void
15958-dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
15959- enum dma_data_direction direction);
15960-
15961-extern int dma_map_sg(struct device *hwdev, struct scatterlist *sg,
15962- int nents, enum dma_data_direction direction);
15963-extern void dma_unmap_sg(struct device *hwdev, struct scatterlist *sg,
15964- int nents, enum dma_data_direction direction);
15965-
15966-#ifdef CONFIG_HIGHMEM
15967-extern dma_addr_t
15968-dma_map_page(struct device *dev, struct page *page, unsigned long offset,
15969- size_t size, enum dma_data_direction direction);
15970-
15971-extern void
15972-dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size,
15973- enum dma_data_direction direction);
15974-#else
15975-#define dma_map_page(dev, page, offset, size, dir) \
15976- dma_map_single(dev, page_address(page) + (offset), (size), (dir))
15977-#define dma_unmap_page dma_unmap_single
15978-#endif
15979-
15980-extern void
15981-dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size,
15982- enum dma_data_direction direction);
15983-
15984-extern void
15985-dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size,
15986- enum dma_data_direction direction);
15987-
15988-static inline void
15989-dma_sync_single_range_for_cpu(struct device *dev, dma_addr_t dma_handle,
15990- unsigned long offset, size_t size,
15991- enum dma_data_direction direction)
15992-{
15993- dma_sync_single_for_cpu(dev, dma_handle+offset, size, direction);
15994-}
15995-
15996-static inline void
15997-dma_sync_single_range_for_device(struct device *dev, dma_addr_t dma_handle,
15998- unsigned long offset, size_t size,
15999- enum dma_data_direction direction)
16000-{
16001- dma_sync_single_for_device(dev, dma_handle+offset, size, direction);
16002-}
16003-
16004-extern void
16005-dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems,
16006- enum dma_data_direction direction);
16007-
16008-extern void
16009-dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems,
16010- enum dma_data_direction direction);
16011-
16012-extern int
16013-dma_mapping_error(dma_addr_t dma_addr);
16014-
16015-extern int
16016-dma_supported(struct device *dev, u64 mask);
16017-
16018-static inline int
16019-dma_set_mask(struct device *dev, u64 mask)
16020-{
16021- if(!dev->dma_mask || !dma_supported(dev, mask))
16022- return -EIO;
16023-
16024- *dev->dma_mask = mask;
16025-
16026- return 0;
16027-}
16028-
16029-static inline int
16030-dma_get_cache_alignment(void)
16031-{
16032- /* no easy way to get cache size on all x86, so return the
16033- * maximum possible, to be safe */
16034- return (1 << INTERNODE_CACHE_SHIFT);
16035-}
16036-
16037-#define dma_is_consistent(d, h) (1)
16038-
16039-static inline void
16040-dma_cache_sync(struct device *dev, void *vaddr, size_t size,
16041- enum dma_data_direction direction)
16042-{
16043- flush_write_buffers();
16044-}
16045-
16046-#define ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY
16047-extern int
16048-dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
16049- dma_addr_t device_addr, size_t size, int flags);
16050-
16051-extern void
16052-dma_release_declared_memory(struct device *dev);
16053-
16054-extern void *
16055-dma_mark_declared_memory_occupied(struct device *dev,
16056- dma_addr_t device_addr, size_t size);
16057-
16058-#endif
16059--- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/dma-mapping_64.h 2009-02-16 16:18:36.000000000 +0100
16060+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
16061@@ -1,205 +0,0 @@
16062-#ifndef _X8664_DMA_MAPPING_H
16063-#define _X8664_DMA_MAPPING_H 1
16064-
16065-/*
16066- * IOMMU interface. See Documentation/DMA-mapping.txt and DMA-API.txt for
16067- * documentation.
16068- */
16069-
16070-#include <linux/scatterlist.h>
16071-#include <asm/io.h>
16072-
16073-struct dma_mapping_ops {
16074- int (*mapping_error)(dma_addr_t dma_addr);
16075- void* (*alloc_coherent)(struct device *dev, size_t size,
16076- dma_addr_t *dma_handle, gfp_t gfp);
16077- void (*free_coherent)(struct device *dev, size_t size,
16078- void *vaddr, dma_addr_t dma_handle);
16079- dma_addr_t (*map_single)(struct device *hwdev, void *ptr,
16080- size_t size, int direction);
16081- /* like map_single, but doesn't check the device mask */
16082- dma_addr_t (*map_simple)(struct device *hwdev, char *ptr,
16083- size_t size, int direction);
16084- void (*unmap_single)(struct device *dev, dma_addr_t addr,
16085- size_t size, int direction);
16086- void (*sync_single_for_cpu)(struct device *hwdev,
16087- dma_addr_t dma_handle, size_t size,
16088- int direction);
16089- void (*sync_single_for_device)(struct device *hwdev,
16090- dma_addr_t dma_handle, size_t size,
16091- int direction);
16092- void (*sync_single_range_for_cpu)(struct device *hwdev,
16093- dma_addr_t dma_handle, unsigned long offset,
16094- size_t size, int direction);
16095- void (*sync_single_range_for_device)(struct device *hwdev,
16096- dma_addr_t dma_handle, unsigned long offset,
16097- size_t size, int direction);
16098- void (*sync_sg_for_cpu)(struct device *hwdev,
16099- struct scatterlist *sg, int nelems,
16100- int direction);
16101- void (*sync_sg_for_device)(struct device *hwdev,
16102- struct scatterlist *sg, int nelems,
16103- int direction);
16104- int (*map_sg)(struct device *hwdev, struct scatterlist *sg,
16105- int nents, int direction);
16106- void (*unmap_sg)(struct device *hwdev,
16107- struct scatterlist *sg, int nents,
16108- int direction);
16109- int (*dma_supported)(struct device *hwdev, u64 mask);
16110- int is_phys;
16111-};
16112-
16113-extern dma_addr_t bad_dma_address;
16114-extern const struct dma_mapping_ops* dma_ops;
16115-extern int iommu_merge;
16116-
16117-#if 0
16118-static inline int dma_mapping_error(dma_addr_t dma_addr)
16119-{
16120- if (dma_ops->mapping_error)
16121- return dma_ops->mapping_error(dma_addr);
16122-
16123- return (dma_addr == bad_dma_address);
16124-}
16125-
16126-#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
16127-#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
16128-
16129-#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
16130-#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
16131-
16132-extern void *dma_alloc_coherent(struct device *dev, size_t size,
16133- dma_addr_t *dma_handle, gfp_t gfp);
16134-extern void dma_free_coherent(struct device *dev, size_t size, void *vaddr,
16135- dma_addr_t dma_handle);
16136-
16137-static inline dma_addr_t
16138-dma_map_single(struct device *hwdev, void *ptr, size_t size,
16139- int direction)
16140-{
16141- BUG_ON(!valid_dma_direction(direction));
16142- return dma_ops->map_single(hwdev, ptr, size, direction);
16143-}
16144-
16145-static inline void
16146-dma_unmap_single(struct device *dev, dma_addr_t addr,size_t size,
16147- int direction)
16148-{
16149- BUG_ON(!valid_dma_direction(direction));
16150- dma_ops->unmap_single(dev, addr, size, direction);
16151-}
16152-
16153-#define dma_map_page(dev,page,offset,size,dir) \
16154- dma_map_single((dev), page_address(page)+(offset), (size), (dir))
16155-
16156-#define dma_unmap_page dma_unmap_single
16157-
16158-static inline void
16159-dma_sync_single_for_cpu(struct device *hwdev, dma_addr_t dma_handle,
16160- size_t size, int direction)
16161-{
16162- BUG_ON(!valid_dma_direction(direction));
16163- if (dma_ops->sync_single_for_cpu)
16164- dma_ops->sync_single_for_cpu(hwdev, dma_handle, size,
16165- direction);
16166- flush_write_buffers();
16167-}
16168-
16169-static inline void
16170-dma_sync_single_for_device(struct device *hwdev, dma_addr_t dma_handle,
16171- size_t size, int direction)
16172-{
16173- BUG_ON(!valid_dma_direction(direction));
16174- if (dma_ops->sync_single_for_device)
16175- dma_ops->sync_single_for_device(hwdev, dma_handle, size,
16176- direction);
16177- flush_write_buffers();
16178-}
16179-
16180-static inline void
16181-dma_sync_single_range_for_cpu(struct device *hwdev, dma_addr_t dma_handle,
16182- unsigned long offset, size_t size, int direction)
16183-{
16184- BUG_ON(!valid_dma_direction(direction));
16185- if (dma_ops->sync_single_range_for_cpu) {
16186- dma_ops->sync_single_range_for_cpu(hwdev, dma_handle, offset, size, direction);
16187- }
16188-
16189- flush_write_buffers();
16190-}
16191-
16192-static inline void
16193-dma_sync_single_range_for_device(struct device *hwdev, dma_addr_t dma_handle,
16194- unsigned long offset, size_t size, int direction)
16195-{
16196- BUG_ON(!valid_dma_direction(direction));
16197- if (dma_ops->sync_single_range_for_device)
16198- dma_ops->sync_single_range_for_device(hwdev, dma_handle,
16199- offset, size, direction);
16200-
16201- flush_write_buffers();
16202-}
16203-
16204-static inline void
16205-dma_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg,
16206- int nelems, int direction)
16207-{
16208- BUG_ON(!valid_dma_direction(direction));
16209- if (dma_ops->sync_sg_for_cpu)
16210- dma_ops->sync_sg_for_cpu(hwdev, sg, nelems, direction);
16211- flush_write_buffers();
16212-}
16213-
16214-static inline void
16215-dma_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
16216- int nelems, int direction)
16217-{
16218- BUG_ON(!valid_dma_direction(direction));
16219- if (dma_ops->sync_sg_for_device) {
16220- dma_ops->sync_sg_for_device(hwdev, sg, nelems, direction);
16221- }
16222-
16223- flush_write_buffers();
16224-}
16225-
16226-static inline int
16227-dma_map_sg(struct device *hwdev, struct scatterlist *sg, int nents, int direction)
16228-{
16229- BUG_ON(!valid_dma_direction(direction));
16230- return dma_ops->map_sg(hwdev, sg, nents, direction);
16231-}
16232-
16233-static inline void
16234-dma_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents,
16235- int direction)
16236-{
16237- BUG_ON(!valid_dma_direction(direction));
16238- dma_ops->unmap_sg(hwdev, sg, nents, direction);
16239-}
16240-
16241-extern int dma_supported(struct device *hwdev, u64 mask);
16242-
16243-/* same for gart, swiotlb, and nommu */
16244-static inline int dma_get_cache_alignment(void)
16245-{
16246- return boot_cpu_data.x86_clflush_size;
16247-}
16248-
16249-#define dma_is_consistent(d, h) 1
16250-
16251-extern int dma_set_mask(struct device *dev, u64 mask);
16252-
16253-static inline void
16254-dma_cache_sync(struct device *dev, void *vaddr, size_t size,
16255- enum dma_data_direction dir)
16256-{
16257- flush_write_buffers();
16258-}
16259-
16260-extern struct device fallback_dev;
16261-extern int panic_on_overflow;
16262-#endif
16263-
16264-#endif /* _X8664_DMA_MAPPING_H */
16265-
16266-#include "dma-mapping_32.h"
16267--- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/fixmap.h 2009-02-16 16:18:36.000000000 +0100
16268+++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/fixmap.h 2009-03-16 16:38:05.000000000 +0100
16269@@ -1,5 +1,13 @@
16270+#ifndef _ASM_FIXMAP_H
16271+#define _ASM_FIXMAP_H
16272+
16273 #ifdef CONFIG_X86_32
16274 # include "fixmap_32.h"
16275 #else
16276 # include "fixmap_64.h"
16277 #endif
16278+
16279+#define clear_fixmap(idx) \
16280+ __set_fixmap(idx, 0, __pgprot(0))
16281+
16282+#endif
16283--- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/fixmap_32.h 2009-03-16 16:33:40.000000000 +0100
16284+++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/fixmap_32.h 2009-03-16 16:38:05.000000000 +0100
16285@@ -10,8 +10,8 @@
16286 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
16287 */
16288
16289-#ifndef _ASM_FIXMAP_H
16290-#define _ASM_FIXMAP_H
16291+#ifndef _ASM_FIXMAP_32_H
16292+#define _ASM_FIXMAP_32_H
16293
16294 /* used by vmalloc.c, vsyscall.lds.S.
16295 *
16296@@ -102,8 +102,7 @@ enum fixed_addresses {
16297 */
16298 #define NR_FIX_BTMAPS 64
16299 #define FIX_BTMAPS_NESTING 4
16300- FIX_BTMAP_END =
16301- __end_of_permanent_fixed_addresses + 512 -
16302+ FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 512 -
16303 (__end_of_permanent_fixed_addresses & 511),
16304 FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_NESTING - 1,
16305 FIX_WP_TEST,
16306@@ -114,19 +113,16 @@ enum fixed_addresses {
16307 };
16308
16309 extern void __set_fixmap(enum fixed_addresses idx,
16310- maddr_t phys, pgprot_t flags);
16311+ maddr_t phys, pgprot_t flags);
16312 extern void reserve_top_address(unsigned long reserve);
16313
16314-#define set_fixmap(idx, phys) \
16315- __set_fixmap(idx, phys, PAGE_KERNEL)
16316+#define set_fixmap(idx, phys) \
16317+ __set_fixmap(idx, phys, PAGE_KERNEL)
16318 /*
16319 * Some hardware wants to get fixmapped without caching.
16320 */
16321-#define set_fixmap_nocache(idx, phys) \
16322- __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
16323-
16324-#define clear_fixmap(idx) \
16325- __set_fixmap(idx, 0, __pgprot(0))
16326+#define set_fixmap_nocache(idx, phys) \
16327+ __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
16328
16329 #define FIXADDR_TOP ((unsigned long)__FIXADDR_TOP)
16330
16331@@ -159,7 +155,7 @@ static __always_inline unsigned long fix
16332 if (idx >= __end_of_fixed_addresses)
16333 __this_fixmap_does_not_exist();
16334
16335- return __fix_to_virt(idx);
16336+ return __fix_to_virt(idx);
16337 }
16338
16339 static inline unsigned long virt_to_fix(const unsigned long vaddr)
16340--- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/fixmap_64.h 2009-03-16 16:33:40.000000000 +0100
16341+++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/fixmap_64.h 2009-03-16 16:38:05.000000000 +0100
16342@@ -8,8 +8,8 @@
16343 * Copyright (C) 1998 Ingo Molnar
16344 */
16345
16346-#ifndef _ASM_FIXMAP_H
16347-#define _ASM_FIXMAP_H
16348+#ifndef _ASM_FIXMAP_64_H
16349+#define _ASM_FIXMAP_64_H
16350
16351 #include <linux/kernel.h>
16352 #include <asm/apicdef.h>
16353@@ -35,7 +35,8 @@
16354
16355 enum fixed_addresses {
16356 VSYSCALL_LAST_PAGE,
16357- VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1,
16358+ VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE
16359+ + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1,
16360 VSYSCALL_HPET,
16361 FIX_DBGP_BASE,
16362 FIX_EARLYCON_MEM_BASE,
16363@@ -45,11 +46,12 @@ enum fixed_addresses {
16364 #endif
16365 #ifndef CONFIG_XEN
16366 FIX_IO_APIC_BASE_0,
16367- FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1,
16368+ FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1,
16369 #endif
16370 #ifdef CONFIG_EFI
16371 FIX_EFI_IO_MAP_LAST_PAGE,
16372- FIX_EFI_IO_MAP_FIRST_PAGE = FIX_EFI_IO_MAP_LAST_PAGE+MAX_EFI_IO_PAGES-1,
16373+ FIX_EFI_IO_MAP_FIRST_PAGE = FIX_EFI_IO_MAP_LAST_PAGE
16374+ + MAX_EFI_IO_PAGES - 1,
16375 #endif
16376 #ifdef CONFIG_ACPI
16377 FIX_ACPI_BEGIN,
16378@@ -79,19 +81,16 @@ enum fixed_addresses {
16379 __end_of_fixed_addresses
16380 };
16381
16382-extern void __set_fixmap (enum fixed_addresses idx,
16383- unsigned long phys, pgprot_t flags);
16384+extern void __set_fixmap(enum fixed_addresses idx,
16385+ unsigned long phys, pgprot_t flags);
16386
16387-#define set_fixmap(idx, phys) \
16388- __set_fixmap(idx, phys, PAGE_KERNEL)
16389+#define set_fixmap(idx, phys) \
16390+ __set_fixmap(idx, phys, PAGE_KERNEL)
16391 /*
16392 * Some hardware wants to get fixmapped without caching.
16393 */
16394-#define set_fixmap_nocache(idx, phys) \
16395- __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
16396-
16397-#define clear_fixmap(idx) \
16398- __set_fixmap(idx, 0, __pgprot(0))
16399+#define set_fixmap_nocache(idx, phys) \
16400+ __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
16401
16402 #define FIXADDR_TOP (VSYSCALL_END-PAGE_SIZE)
16403 #define FIXADDR_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
16404--- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/highmem.h 2009-03-16 16:33:40.000000000 +0100
16405+++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/highmem.h 2009-03-16 16:38:05.000000000 +0100
16406@@ -8,7 +8,7 @@
16407 * Gerhard.Wichert@pdb.siemens.de
16408 *
16409 *
16410- * Redesigned the x86 32-bit VM architecture to deal with
16411+ * Redesigned the x86 32-bit VM architecture to deal with
16412 * up to 16 Terabyte physical memory. With current x86 CPUs
16413 * we now support up to 64 Gigabytes physical RAM.
16414 *
16415--- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/io.h 2009-02-16 16:18:36.000000000 +0100
16416+++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/io.h 2009-03-16 16:38:05.000000000 +0100
16417@@ -1,5 +1,22 @@
16418+#ifndef _ASM_X86_IO_H
16419+#define _ASM_X86_IO_H
16420+
16421+#define ARCH_HAS_IOREMAP_WC
16422+
16423 #ifdef CONFIG_X86_32
16424 # include "io_32.h"
16425 #else
16426 # include "io_64.h"
16427 #endif
16428+
16429+extern void *xlate_dev_mem_ptr(unsigned long phys);
16430+extern void unxlate_dev_mem_ptr(unsigned long phys, void *addr);
16431+
16432+extern void map_devmem(unsigned long pfn, unsigned long len, pgprot_t);
16433+extern void unmap_devmem(unsigned long pfn, unsigned long len, pgprot_t);
16434+
16435+extern int ioremap_check_change_attr(unsigned long mfn, unsigned long size,
16436+ unsigned long prot_val);
16437+extern void __iomem *ioremap_wc(unsigned long offset, unsigned long size);
16438+
16439+#endif /* _ASM_X86_IO_H */
16440--- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/irqflags.h 2009-03-16 16:33:40.000000000 +0100
16441+++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/irqflags.h 2009-03-16 16:38:05.000000000 +0100
16442@@ -137,11 +137,11 @@ sysexit_ecrit: /**** END OF SYSEXIT CRIT
16443 #endif /* __ASSEMBLY__ */
16444
16445 #ifndef __ASSEMBLY__
16446-#define raw_local_save_flags(flags) \
16447- do { (flags) = __raw_local_save_flags(); } while (0)
16448+#define raw_local_save_flags(flags) \
16449+ do { (flags) = __raw_local_save_flags(); } while (0)
16450
16451-#define raw_local_irq_save(flags) \
16452- do { (flags) = __raw_local_irq_save(); } while (0)
16453+#define raw_local_irq_save(flags) \
16454+ do { (flags) = __raw_local_irq_save(); } while (0)
16455
16456 static inline int raw_irqs_disabled_flags(unsigned long flags)
16457 {
16458--- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/mmu_context_32.h 2009-03-16 16:33:40.000000000 +0100
16459+++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/mmu_context_32.h 2009-03-16 16:38:05.000000000 +0100
16460@@ -94,7 +94,7 @@ static inline void switch_mm(struct mm_s
16461 BUG_ON(per_cpu(cpu_tlbstate, cpu).active_mm != next);
16462
16463 if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
16464- /* We were in lazy tlb mode and leave_mm disabled
16465+ /* We were in lazy tlb mode and leave_mm disabled
16466 * tlb flush IPI delivery. We must reload %cr3.
16467 */
16468 load_cr3(next->pgd);
16469@@ -107,10 +107,10 @@ static inline void switch_mm(struct mm_s
16470 #define deactivate_mm(tsk, mm) \
16471 asm("movl %0,%%gs": :"r" (0));
16472
16473-#define activate_mm(prev, next) \
16474- do { \
16475- xen_activate_mm(prev, next); \
16476- switch_mm((prev),(next),NULL); \
16477- } while(0)
16478+#define activate_mm(prev, next) \
16479+do { \
16480+ xen_activate_mm(prev, next); \
16481+ switch_mm((prev), (next), NULL); \
16482+} while (0)
16483
16484 #endif
16485--- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/mmu_context_64.h 2009-03-16 16:33:40.000000000 +0100
16486+++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/mmu_context_64.h 2009-03-16 16:38:05.000000000 +0100
16487@@ -21,7 +21,7 @@ void destroy_context(struct mm_struct *m
16488 static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
16489 {
16490 #if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
16491- if (read_pda(mmu_state) == TLBSTATE_OK)
16492+ if (read_pda(mmu_state) == TLBSTATE_OK)
16493 write_pda(mmu_state, TLBSTATE_LAZY);
16494 #endif
16495 }
16496@@ -62,7 +62,7 @@ extern void mm_pin(struct mm_struct *mm)
16497 extern void mm_unpin(struct mm_struct *mm);
16498 void mm_pin_all(void);
16499
16500-static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
16501+static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
16502 struct task_struct *tsk)
16503 {
16504 unsigned cpu = smp_processor_id();
16505@@ -106,7 +106,7 @@ static inline void switch_mm(struct mm_s
16506 if (read_pda(active_mm) != next)
16507 BUG();
16508 if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
16509- /* We were in lazy tlb mode and leave_mm disabled
16510+ /* We were in lazy tlb mode and leave_mm disabled
16511 * tlb flush IPI delivery. We must reload CR3
16512 * to make sure to use no freed page tables.
16513 */
16514@@ -118,10 +118,11 @@ static inline void switch_mm(struct mm_s
16515 #endif
16516 }
16517
16518-#define deactivate_mm(tsk,mm) do { \
16519- load_gs_index(0); \
16520- asm volatile("movl %0,%%fs"::"r"(0)); \
16521-} while(0)
16522+#define deactivate_mm(tsk, mm) \
16523+do { \
16524+ load_gs_index(0); \
16525+ asm volatile("movl %0,%%fs"::"r"(0)); \
16526+} while (0)
16527
16528 static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next)
16529 {
16530--- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/page.h 2009-03-16 16:33:40.000000000 +0100
16531+++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/page.h 2009-03-16 16:38:05.000000000 +0100
16532@@ -20,8 +20,16 @@
16533 #define _PAGE_BIT_IO 9
16534 #define _PAGE_IO (_AC(1, L)<<_PAGE_BIT_IO)
16535
16536-#define PHYSICAL_PAGE_MASK (~(_AT(phys_addr_t, PAGE_SIZE) - 1) & __PHYSICAL_MASK)
16537-#define PTE_MASK _AT(pteval_t, PHYSICAL_PAGE_MASK)
16538+#define __PHYSICAL_MASK ((phys_addr_t)(1ULL << __PHYSICAL_MASK_SHIFT) - 1)
16539+#define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1)
16540+
16541+/* Cast PAGE_MASK to a signed type so that it is sign-extended if
16542+ virtual addresses are 32-bits but physical addresses are larger
16543+ (ie, 32-bit PAE). */
16544+#define PHYSICAL_PAGE_MASK (((signed long)PAGE_MASK) & __PHYSICAL_MASK)
16545+
16546+/* PTE_MASK extracts the PFN from a (pte|pmd|pud|pgd)val_t */
16547+#define PTE_MASK ((pteval_t)PHYSICAL_PAGE_MASK)
16548
16549 #define PMD_PAGE_SIZE (_AC(1, UL) << PMD_SHIFT)
16550 #define PMD_PAGE_MASK (~(PMD_PAGE_SIZE-1))
16551@@ -34,19 +42,14 @@
16552 /* to align the pointer to the (next) page boundary */
16553 #define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK)
16554
16555-#define __PHYSICAL_MASK _AT(phys_addr_t, (_AC(1,ULL) << __PHYSICAL_MASK_SHIFT) - 1)
16556-#define __VIRTUAL_MASK ((_AC(1,UL) << __VIRTUAL_MASK_SHIFT) - 1)
16557-
16558 #ifndef __ASSEMBLY__
16559 #include <linux/types.h>
16560 #endif
16561
16562 #ifdef CONFIG_X86_64
16563 #include <asm/page_64.h>
16564-#define max_pfn_mapped end_pfn_map
16565 #else
16566 #include <asm/page_32.h>
16567-#define max_pfn_mapped max_low_pfn
16568 #endif /* CONFIG_X86_64 */
16569
16570 #define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET)
16571@@ -59,6 +62,9 @@
16572 #ifndef __ASSEMBLY__
16573
16574 extern int page_is_ram(unsigned long pagenr);
16575+extern int devmem_is_allowed(unsigned long pagenr);
16576+
16577+extern unsigned long max_pfn_mapped;
16578
16579 struct page;
16580
16581--- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/page_64.h 2009-03-16 16:33:40.000000000 +0100
16582+++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/page_64.h 2009-03-16 16:38:05.000000000 +0100
16583@@ -5,7 +5,7 @@
16584
16585 #define THREAD_ORDER 1
16586 #define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER)
16587-#define CURRENT_MASK (~(THREAD_SIZE-1))
16588+#define CURRENT_MASK (~(THREAD_SIZE - 1))
16589
16590 #define EXCEPTION_STACK_ORDER 0
16591 #define EXCEPTION_STKSZ (PAGE_SIZE << EXCEPTION_STACK_ORDER)
16592@@ -53,10 +53,10 @@
16593 #define __VIRTUAL_MASK_SHIFT 48
16594
16595 /*
16596- * Kernel image size is limited to 128 MB (see level2_kernel_pgt in
16597+ * Kernel image size is limited to 512 MB (see level2_kernel_pgt in
16598 * arch/x86/kernel/head_64.S), and it is mapped here:
16599 */
16600-#define KERNEL_IMAGE_SIZE (128*1024*1024)
16601+#define KERNEL_IMAGE_SIZE (512 * 1024 * 1024)
16602 #define KERNEL_IMAGE_START _AC(0xffffffff80000000, UL)
16603
16604 #ifndef __ASSEMBLY__
16605@@ -64,7 +64,6 @@ void clear_page(void *page);
16606 void copy_page(void *to, void *from);
16607
16608 extern unsigned long end_pfn;
16609-extern unsigned long end_pfn_map;
16610
16611 static inline unsigned long __phys_addr(unsigned long x)
16612 {
16613@@ -89,6 +88,9 @@ typedef union { pteval_t pte; unsigned i
16614
16615 #define vmemmap ((struct page *)VMEMMAP_START)
16616
16617+extern unsigned long init_memory_mapping(unsigned long start,
16618+ unsigned long end);
16619+
16620 #endif /* !__ASSEMBLY__ */
16621
16622 #ifdef CONFIG_FLATMEM
16623--- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/pci.h 2009-03-16 16:33:40.000000000 +0100
16624+++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/pci.h 2009-03-16 16:38:05.000000000 +0100
16625@@ -8,14 +8,13 @@
16626 #include <asm/scatterlist.h>
16627 #include <asm/io.h>
16628
16629-
16630 #ifdef __KERNEL__
16631
16632 struct pci_sysdata {
16633 int domain; /* PCI domain */
16634 int node; /* NUMA node */
16635 #ifdef CONFIG_X86_64
16636- void* iommu; /* IOMMU private data */
16637+ void *iommu; /* IOMMU private data */
16638 #endif
16639 #ifdef CONFIG_XEN_PCIDEV_FRONTEND
16640 struct pcifront_device *pdev;
16641@@ -23,6 +22,8 @@ struct pci_sysdata {
16642 };
16643
16644 /* scan a bus after allocating a pci_sysdata for it */
16645+extern struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops,
16646+ int node);
16647 extern struct pci_bus *pci_scan_bus_with_sysdata(int busno);
16648
16649 static inline int pci_domain_nr(struct pci_bus *bus)
16650@@ -36,6 +37,7 @@ static inline int pci_proc_domain(struct
16651 return pci_domain_nr(bus);
16652 }
16653
16654+extern void pci_iommu_alloc(void);
16655
16656 /* Can be used to override the logic in pci_scan_bus for skipping
16657 already-configured bus numbers - to be used for buggy BIOSes
16658@@ -57,7 +59,7 @@ extern unsigned long pci_mem_start;
16659 #define PCIBIOS_MIN_CARDBUS_IO 0x4000
16660
16661 void pcibios_config_init(void);
16662-struct pci_bus * pcibios_scan_root(int bus);
16663+struct pci_bus *pcibios_scan_root(int bus);
16664
16665 void pcibios_set_master(struct pci_dev *dev);
16666 void pcibios_penalize_isa_irq(int irq, int active);
16667@@ -67,7 +69,8 @@ int pcibios_set_irq_routing(struct pci_d
16668
16669 #define HAVE_PCI_MMAP
16670 extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
16671- enum pci_mmap_state mmap_state, int write_combine);
16672+ enum pci_mmap_state mmap_state,
16673+ int write_combine);
16674
16675
16676 #ifdef CONFIG_PCI
16677--- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/pgalloc.h 2009-02-16 16:18:36.000000000 +0100
16678+++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/pgalloc.h 2009-03-16 16:38:05.000000000 +0100
16679@@ -1,5 +1,149 @@
16680-#ifdef CONFIG_X86_32
16681-# include "pgalloc_32.h"
16682-#else
16683-# include "pgalloc_64.h"
16684+#ifndef _ASM_X86_PGALLOC_H
16685+#define _ASM_X86_PGALLOC_H
16686+
16687+#include <linux/threads.h>
16688+#include <linux/mm.h> /* for struct page */
16689+#include <linux/pagemap.h>
16690+
16691+#include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */
16692+
16693+static inline void paravirt_alloc_pte(struct mm_struct *mm, unsigned long pfn) {}
16694+static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn) {}
16695+static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long clonepfn,
16696+ unsigned long start, unsigned long count) {}
16697+static inline void paravirt_alloc_pud(struct mm_struct *mm, unsigned long pfn) {}
16698+static inline void paravirt_release_pte(unsigned long pfn) {}
16699+static inline void paravirt_release_pmd(unsigned long pfn) {}
16700+static inline void paravirt_release_pud(unsigned long pfn) {}
16701+
16702+#ifdef CONFIG_X86_64
16703+void early_make_page_readonly(void *va, unsigned int feature);
16704+pmd_t *early_get_pmd(unsigned long va);
16705+#define make_lowmem_page_readonly make_page_readonly
16706+#define make_lowmem_page_writable make_page_writable
16707 #endif
16708+
16709+/*
16710+ * Allocate and free page tables.
16711+ */
16712+extern pgd_t *pgd_alloc(struct mm_struct *);
16713+extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
16714+
16715+extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long);
16716+extern pgtable_t pte_alloc_one(struct mm_struct *, unsigned long);
16717+
16718+/* Should really implement gc for free page table pages. This could be
16719+ done with a reference count in struct page. */
16720+
16721+static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
16722+{
16723+ BUG_ON((unsigned long)pte & (PAGE_SIZE-1));
16724+ make_lowmem_page_writable(pte, XENFEAT_writable_page_tables);
16725+ free_page((unsigned long)pte);
16726+}
16727+
16728+extern void __pte_free(pgtable_t);
16729+static inline void pte_free(struct mm_struct *mm, struct page *pte)
16730+{
16731+ __pte_free(pte);
16732+}
16733+
16734+extern void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte);
16735+
16736+static inline void pmd_populate_kernel(struct mm_struct *mm,
16737+ pmd_t *pmd, pte_t *pte)
16738+{
16739+ paravirt_alloc_pte(mm, __pa(pte) >> PAGE_SHIFT);
16740+ set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE));
16741+}
16742+
16743+static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
16744+ struct page *pte)
16745+{
16746+ unsigned long pfn = page_to_pfn(pte);
16747+
16748+ paravirt_alloc_pte(mm, pfn);
16749+ if (PagePinned(virt_to_page(mm->pgd))) {
16750+ if (!PageHighMem(pte))
16751+ BUG_ON(HYPERVISOR_update_va_mapping(
16752+ (unsigned long)__va(pfn << PAGE_SHIFT),
16753+ pfn_pte(pfn, PAGE_KERNEL_RO), 0));
16754+#ifndef CONFIG_X86_64
16755+ else if (!TestSetPagePinned(pte))
16756+ kmap_flush_unused();
16757+#endif
16758+ set_pmd(pmd, __pmd(((pmdval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE));
16759+ } else
16760+ *pmd = __pmd(((pmdval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE);
16761+}
16762+
16763+#define pmd_pgtable(pmd) pmd_page(pmd)
16764+
16765+#if PAGETABLE_LEVELS > 2
16766+extern pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr);
16767+extern void __pmd_free(pgtable_t);
16768+
16769+static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
16770+{
16771+ BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
16772+ __pmd_free(virt_to_page(pmd));
16773+}
16774+
16775+extern void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd);
16776+
16777+#ifdef CONFIG_X86_PAE
16778+extern void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd);
16779+#else /* !CONFIG_X86_PAE */
16780+static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
16781+{
16782+ paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
16783+ if (unlikely(PagePinned(virt_to_page((mm)->pgd)))) {
16784+ BUG_ON(HYPERVISOR_update_va_mapping(
16785+ (unsigned long)pmd,
16786+ pfn_pte(virt_to_phys(pmd)>>PAGE_SHIFT,
16787+ PAGE_KERNEL_RO), 0));
16788+ set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd)));
16789+ } else
16790+ *pud = __pud(_PAGE_TABLE | __pa(pmd));
16791+}
16792+#endif /* CONFIG_X86_PAE */
16793+
16794+#if PAGETABLE_LEVELS > 3
16795+#define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD)
16796+
16797+/*
16798+ * We need to use the batch mode here, but pgd_pupulate() won't be
16799+ * be called frequently.
16800+ */
16801+static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
16802+{
16803+ paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT);
16804+ if (unlikely(PagePinned(virt_to_page((mm)->pgd)))) {
16805+ BUG_ON(HYPERVISOR_update_va_mapping(
16806+ (unsigned long)pud,
16807+ pfn_pte(virt_to_phys(pud)>>PAGE_SHIFT,
16808+ PAGE_KERNEL_RO), 0));
16809+ set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud)));
16810+ set_pgd(__user_pgd(pgd), __pgd(_PAGE_TABLE | __pa(pud)));
16811+ } else {
16812+ *(pgd) = __pgd(_PAGE_TABLE | __pa(pud));
16813+ *__user_pgd(pgd) = *(pgd);
16814+ }
16815+}
16816+
16817+static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
16818+{
16819+ return (pud_t *)pmd_alloc_one(mm, addr);
16820+}
16821+
16822+static inline void pud_free(struct mm_struct *mm, pud_t *pud)
16823+{
16824+ BUG_ON((unsigned long)pud & (PAGE_SIZE-1));
16825+ __pmd_free(virt_to_page(pud));
16826+}
16827+
16828+extern void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud);
16829+#endif /* PAGETABLE_LEVELS > 3 */
16830+#endif /* PAGETABLE_LEVELS > 2 */
16831+
16832+#endif /* _ASM_X86_PGALLOC_H */
16833--- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/pgalloc_32.h 2009-03-16 16:33:40.000000000 +0100
16834+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
16835@@ -1,111 +0,0 @@
16836-#ifndef _I386_PGALLOC_H
16837-#define _I386_PGALLOC_H
16838-
16839-#include <linux/threads.h>
16840-#include <linux/mm.h> /* for struct page */
16841-#include <linux/pagemap.h>
16842-#include <asm/tlb.h>
16843-#include <asm-generic/tlb.h>
16844-#include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */
16845-
16846-#define paravirt_alloc_pt(mm, pfn) do { } while (0)
16847-#define paravirt_alloc_pd(mm, pfn) do { } while (0)
16848-#define paravirt_alloc_pd_clone(pfn, clonepfn, start, count) do { } while (0)
16849-#define paravirt_release_pt(pfn) do { } while (0)
16850-#define paravirt_release_pd(pfn) do { } while (0)
16851-
16852-static inline void pmd_populate_kernel(struct mm_struct *mm,
16853- pmd_t *pmd, pte_t *pte)
16854-{
16855- paravirt_alloc_pt(mm, __pa(pte) >> PAGE_SHIFT);
16856- set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE));
16857-}
16858-
16859-static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte)
16860-{
16861- unsigned long pfn = page_to_pfn(pte);
16862-
16863- paravirt_alloc_pt(mm, pfn);
16864- if (PagePinned(virt_to_page(mm->pgd))) {
16865- if (!PageHighMem(pte))
16866- BUG_ON(HYPERVISOR_update_va_mapping(
16867- (unsigned long)__va(pfn << PAGE_SHIFT),
16868- pfn_pte(pfn, PAGE_KERNEL_RO), 0));
16869- else if (!test_and_set_bit(PG_pinned, &pte->flags))
16870- kmap_flush_unused();
16871- set_pmd(pmd, __pmd(((pmdval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE));
16872- } else
16873- *pmd = __pmd(((pmdval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE);
16874-}
16875-#define pmd_pgtable(pmd) pmd_page(pmd)
16876-
16877-/*
16878- * Allocate and free page tables.
16879- */
16880-extern void pgd_test_and_unpin(pgd_t *);
16881-extern pgd_t *pgd_alloc(struct mm_struct *);
16882-extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
16883-
16884-extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long);
16885-extern pgtable_t pte_alloc_one(struct mm_struct *, unsigned long);
16886-
16887-static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
16888-{
16889- make_lowmem_page_writable(pte, XENFEAT_writable_page_tables);
16890- free_page((unsigned long)pte);
16891-}
16892-
16893-extern void __pte_free(pgtable_t);
16894-static inline void pte_free(struct mm_struct *mm, pgtable_t pte)
16895-{
16896- __pte_free(pte);
16897-}
16898-
16899-
16900-extern void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte);
16901-
16902-#ifdef CONFIG_X86_PAE
16903-/*
16904- * In the PAE case we free the pmds as part of the pgd.
16905- */
16906-extern pmd_t *pmd_alloc_one(struct mm_struct *, unsigned long);
16907-
16908-extern void __pmd_free(pgtable_t);
16909-static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
16910-{
16911- BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
16912- __pmd_free(virt_to_page(pmd));
16913-}
16914-
16915-extern void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd);
16916-
16917-static inline void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
16918-{
16919- struct page *page = virt_to_page(pmd);
16920- unsigned long pfn = page_to_pfn(page);
16921-
16922- paravirt_alloc_pd(mm, pfn);
16923-
16924- /* Note: almost everything apart from _PAGE_PRESENT is
16925- reserved at the pmd (PDPT) level. */
16926- if (PagePinned(virt_to_page(mm->pgd))) {
16927- BUG_ON(PageHighMem(page));
16928- BUG_ON(HYPERVISOR_update_va_mapping(
16929- (unsigned long)__va(pfn << PAGE_SHIFT),
16930- pfn_pte(pfn, PAGE_KERNEL_RO), 0));
16931- set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
16932- } else
16933- *pudp = __pud(__pa(pmd) | _PAGE_PRESENT);
16934-
16935- /*
16936- * According to Intel App note "TLBs, Paging-Structure Caches,
16937- * and Their Invalidation", April 2007, document 317080-001,
16938- * section 8.1: in PAE mode we explicitly have to flush the
16939- * TLB via cr3 if the top-level pgd is changed...
16940- */
16941- if (mm == current->active_mm)
16942- xen_tlb_flush();
16943-}
16944-#endif /* CONFIG_X86_PAE */
16945-
16946-#endif /* _I386_PGALLOC_H */
16947--- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/pgalloc_64.h 2009-03-16 16:33:40.000000000 +0100
16948+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
16949@@ -1,179 +0,0 @@
16950-#ifndef _X86_64_PGALLOC_H
16951-#define _X86_64_PGALLOC_H
16952-
16953-#include <asm/pda.h>
16954-#include <linux/threads.h>
16955-#include <linux/mm.h>
16956-#include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */
16957-
16958-pmd_t *early_get_pmd(unsigned long va);
16959-void early_make_page_readonly(void *va, unsigned int feature);
16960-
16961-#define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD)
16962-
16963-#define pmd_populate_kernel(mm, pmd, pte) \
16964- set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte)))
16965-
16966-static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
16967-{
16968- if (unlikely(PagePinned(virt_to_page((mm)->pgd)))) {
16969- BUG_ON(HYPERVISOR_update_va_mapping(
16970- (unsigned long)pmd,
16971- pfn_pte(virt_to_phys(pmd)>>PAGE_SHIFT,
16972- PAGE_KERNEL_RO), 0));
16973- set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd)));
16974- } else {
16975- *(pud) = __pud(_PAGE_TABLE | __pa(pmd));
16976- }
16977-}
16978-
16979-/*
16980- * We need to use the batch mode here, but pgd_pupulate() won't be
16981- * be called frequently.
16982- */
16983-static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
16984-{
16985- if (unlikely(PagePinned(virt_to_page((mm)->pgd)))) {
16986- BUG_ON(HYPERVISOR_update_va_mapping(
16987- (unsigned long)pud,
16988- pfn_pte(virt_to_phys(pud)>>PAGE_SHIFT,
16989- PAGE_KERNEL_RO), 0));
16990- set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud)));
16991- set_pgd(__user_pgd(pgd), __pgd(_PAGE_TABLE | __pa(pud)));
16992- } else {
16993- *(pgd) = __pgd(_PAGE_TABLE | __pa(pud));
16994- *(__user_pgd(pgd)) = *(pgd);
16995- }
16996-}
16997-
16998-#define pmd_pgtable(pmd) pmd_page(pmd)
16999-
17000-static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte)
17001-{
17002- if (unlikely(PagePinned(virt_to_page((mm)->pgd)))) {
17003- BUG_ON(HYPERVISOR_update_va_mapping(
17004- (unsigned long)__va(page_to_pfn(pte) << PAGE_SHIFT),
17005- pfn_pte(page_to_pfn(pte), PAGE_KERNEL_RO), 0));
17006- set_pmd(pmd, __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT)));
17007- } else {
17008- *(pmd) = __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT));
17009- }
17010-}
17011-
17012-extern void __pmd_free(pgtable_t);
17013-static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
17014-{
17015- BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
17016- __pmd_free(virt_to_page(pmd));
17017-}
17018-
17019-extern pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr);
17020-
17021-static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
17022-{
17023- return (pud_t *)pmd_alloc_one(mm, addr);
17024-}
17025-
17026-static inline void pud_free(struct mm_struct *mm, pud_t *pud)
17027-{
17028- BUG_ON((unsigned long)pud & (PAGE_SIZE-1));
17029- __pmd_free(virt_to_page(pud));
17030-}
17031-
17032-static inline void pgd_list_add(pgd_t *pgd)
17033-{
17034- struct page *page = virt_to_page(pgd);
17035- unsigned long flags;
17036-
17037- spin_lock_irqsave(&pgd_lock, flags);
17038- list_add(&page->lru, &pgd_list);
17039- spin_unlock_irqrestore(&pgd_lock, flags);
17040-}
17041-
17042-static inline void pgd_list_del(pgd_t *pgd)
17043-{
17044- struct page *page = virt_to_page(pgd);
17045- unsigned long flags;
17046-
17047- spin_lock_irqsave(&pgd_lock, flags);
17048- list_del(&page->lru);
17049- spin_unlock_irqrestore(&pgd_lock, flags);
17050-}
17051-
17052-extern void pgd_test_and_unpin(pgd_t *);
17053-
17054-static inline pgd_t *pgd_alloc(struct mm_struct *mm)
17055-{
17056- /*
17057- * We allocate two contiguous pages for kernel and user.
17058- */
17059- unsigned boundary;
17060- pgd_t *pgd = (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_REPEAT, 1);
17061- if (!pgd)
17062- return NULL;
17063- pgd_list_add(pgd);
17064- pgd_test_and_unpin(pgd);
17065- /*
17066- * Copy kernel pointers in from init.
17067- * Could keep a freelist or slab cache of those because the kernel
17068- * part never changes.
17069- */
17070- boundary = pgd_index(__PAGE_OFFSET);
17071- memset(pgd, 0, boundary * sizeof(pgd_t));
17072- memcpy(pgd + boundary,
17073- init_level4_pgt + boundary,
17074- (PTRS_PER_PGD - boundary) * sizeof(pgd_t));
17075-
17076- memset(__user_pgd(pgd), 0, PAGE_SIZE); /* clean up user pgd */
17077- /*
17078- * Set level3_user_pgt for vsyscall area
17079- */
17080- __user_pgd(pgd)[pgd_index(VSYSCALL_START)] =
17081- __pgd(__pa_symbol(level3_user_pgt) | _PAGE_TABLE);
17082- return pgd;
17083-}
17084-
17085-static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
17086-{
17087- pgd_test_and_unpin(pgd);
17088- pgd_list_del(pgd);
17089- free_pages((unsigned long)pgd, 1);
17090-}
17091-
17092-static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
17093-{
17094- pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
17095- if (pte)
17096- make_page_readonly(pte, XENFEAT_writable_page_tables);
17097-
17098- return pte;
17099-}
17100-
17101-extern pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long addr);
17102-
17103-/* Should really implement gc for free page table pages. This could be
17104- done with a reference count in struct page. */
17105-
17106-static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
17107-{
17108- BUG_ON((unsigned long)pte & (PAGE_SIZE-1));
17109- make_page_writable(pte, XENFEAT_writable_page_tables);
17110- free_page((unsigned long)pte);
17111-}
17112-
17113-extern void __pte_free(pgtable_t);
17114-static inline void pte_free(struct mm_struct *mm, pgtable_t pte)
17115-{
17116- __pte_free(pte);
17117-}
17118-
17119-#define __pte_free_tlb(tlb,pte) \
17120-do { \
17121- pgtable_page_dtor((pte)); \
17122- tlb_remove_page((tlb), (pte)); \
17123-} while (0)
17124-
17125-#define __pmd_free_tlb(tlb,x) tlb_remove_page((tlb),virt_to_page(x))
17126-#define __pud_free_tlb(tlb,x) tlb_remove_page((tlb),virt_to_page(x))
17127-
17128-#endif /* _X86_64_PGALLOC_H */
17129--- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/pgtable.h 2009-03-16 16:33:40.000000000 +0100
17130+++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/pgtable.h 2009-03-16 16:38:05.000000000 +0100
17131@@ -1,17 +1,15 @@
17132 #ifndef _ASM_X86_PGTABLE_H
17133 #define _ASM_X86_PGTABLE_H
17134
17135-#define USER_PTRS_PER_PGD ((TASK_SIZE-1)/PGDIR_SIZE+1)
17136 #define FIRST_USER_ADDRESS 0
17137
17138-#define _PAGE_BIT_PRESENT 0
17139-#define _PAGE_BIT_RW 1
17140-#define _PAGE_BIT_USER 2
17141-#define _PAGE_BIT_PWT 3
17142-#define _PAGE_BIT_PCD 4
17143-#define _PAGE_BIT_ACCESSED 5
17144-#define _PAGE_BIT_DIRTY 6
17145-#define _PAGE_BIT_FILE 6
17146+#define _PAGE_BIT_PRESENT 0 /* is present */
17147+#define _PAGE_BIT_RW 1 /* writeable */
17148+#define _PAGE_BIT_USER 2 /* userspace addressable */
17149+#define _PAGE_BIT_PWT 3 /* page write through */
17150+#define _PAGE_BIT_PCD 4 /* page cache disabled */
17151+#define _PAGE_BIT_ACCESSED 5 /* was accessed (raised by CPU) */
17152+#define _PAGE_BIT_DIRTY 6 /* was written to (raised by CPU) */
17153 #define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */
17154 #define _PAGE_BIT_PAT 7 /* on 4KB pages */
17155 #define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
17156@@ -22,6 +20,14 @@
17157 #define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
17158 #define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
17159
17160+/* If _PAGE_BIT_PRESENT is clear, we use these: */
17161+
17162+/* set: nonlinear file mapping, saved PTE; unset:swap */
17163+#define _PAGE_BIT_FILE _PAGE_BIT_DIRTY
17164+
17165+/* if the user mapped it with PROT_NONE; pte_present gives true */
17166+#define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL
17167+
17168 /*
17169 * Note: we use _AC(1, L) instead of _AC(1, UL) so that we get a
17170 * sign-extended value on 32-bit with all 1's in the upper word,
17171@@ -48,10 +54,8 @@
17172 #define _PAGE_NX 0
17173 #endif
17174
17175-/* If _PAGE_PRESENT is clear, we use these: */
17176-#define _PAGE_FILE _PAGE_DIRTY /* nonlinear file mapping, saved PTE; unset:swap */
17177-#define _PAGE_PROTNONE _PAGE_PSE /* if the user mapped it with PROT_NONE;
17178- pte_present gives true */
17179+#define _PAGE_FILE (_AC(1, L)<<_PAGE_BIT_FILE)
17180+#define _PAGE_PROTNONE (_AC(1, L)<<_PAGE_BIT_PROTNONE)
17181
17182 #ifndef __ASSEMBLY__
17183 #if defined(CONFIG_X86_64) && CONFIG_XEN_COMPAT <= 0x030002
17184@@ -61,20 +65,42 @@ extern unsigned int __kernel_page_user;
17185 #endif
17186 #endif
17187
17188-#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
17189-#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY | __kernel_page_user)
17190+#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
17191+ _PAGE_ACCESSED | _PAGE_DIRTY)
17192+#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \
17193+ _PAGE_DIRTY | __kernel_page_user)
17194+
17195+/* Set of bits not changed in pte_modify */
17196+#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_CACHE_MASK | _PAGE_IO | \
17197+ _PAGE_ACCESSED | _PAGE_DIRTY)
17198
17199-#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_IO)
17200+/*
17201+ * PAT settings are part of the hypervisor interface, which sets the
17202+ * MSR to 0x050100070406 (i.e. WB, WT, UC-, UC, WC, WP [, UC, UC]).
17203+ */
17204+#define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT | _PAGE_PAT)
17205+#define _PAGE_CACHE_WB (0)
17206+#define _PAGE_CACHE_WT (_PAGE_PWT)
17207+#define _PAGE_CACHE_WC (_PAGE_PAT)
17208+#define _PAGE_CACHE_WP (_PAGE_PAT | _PAGE_PWT)
17209+#define _PAGE_CACHE_UC_MINUS (_PAGE_PCD)
17210+#define _PAGE_CACHE_UC (_PAGE_PCD | _PAGE_PWT)
17211
17212 #define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
17213-#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
17214+#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
17215+ _PAGE_ACCESSED | _PAGE_NX)
17216
17217-#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
17218-#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
17219-#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
17220+#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | \
17221+ _PAGE_USER | _PAGE_ACCESSED)
17222+#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
17223+ _PAGE_ACCESSED | _PAGE_NX)
17224+#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
17225+ _PAGE_ACCESSED)
17226 #define PAGE_COPY PAGE_COPY_NOEXEC
17227-#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
17228-#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
17229+#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | \
17230+ _PAGE_ACCESSED | _PAGE_NX)
17231+#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
17232+ _PAGE_ACCESSED)
17233
17234 #ifdef CONFIG_X86_32
17235 #define _PAGE_KERNEL_EXEC \
17236@@ -93,6 +119,7 @@ extern pteval_t __PAGE_KERNEL, __PAGE_KE
17237 #define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW)
17238 #define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW)
17239 #define __PAGE_KERNEL_EXEC_NOCACHE (__PAGE_KERNEL_EXEC | _PAGE_PCD | _PAGE_PWT)
17240+#define __PAGE_KERNEL_WC (__PAGE_KERNEL | _PAGE_CACHE_WC)
17241 #define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD | _PAGE_PWT)
17242 #define __PAGE_KERNEL_UC_MINUS (__PAGE_KERNEL | _PAGE_PCD)
17243 #define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER)
17244@@ -109,6 +136,7 @@ extern pteval_t __PAGE_KERNEL, __PAGE_KE
17245 #define PAGE_KERNEL_RO MAKE_GLOBAL(__PAGE_KERNEL_RO)
17246 #define PAGE_KERNEL_EXEC MAKE_GLOBAL(__PAGE_KERNEL_EXEC)
17247 #define PAGE_KERNEL_RX MAKE_GLOBAL(__PAGE_KERNEL_RX)
17248+#define PAGE_KERNEL_WC MAKE_GLOBAL(__PAGE_KERNEL_WC)
17249 #define PAGE_KERNEL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_NOCACHE)
17250 #define PAGE_KERNEL_UC_MINUS MAKE_GLOBAL(__PAGE_KERNEL_UC_MINUS)
17251 #define PAGE_KERNEL_EXEC_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_EXEC_NOCACHE)
17252@@ -142,7 +170,7 @@ extern pteval_t __PAGE_KERNEL, __PAGE_KE
17253 * ZERO_PAGE is a global shared page that is always zero: used
17254 * for zero-mapped memory areas etc..
17255 */
17256-extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)];
17257+extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)];
17258 #define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
17259
17260 extern spinlock_t pgd_lock;
17261@@ -152,30 +180,111 @@ extern struct list_head pgd_list;
17262 * The following only work if pte_present() is true.
17263 * Undefined behaviour if not..
17264 */
17265-static inline int pte_dirty(pte_t pte) { return __pte_val(pte) & _PAGE_DIRTY; }
17266-static inline int pte_young(pte_t pte) { return __pte_val(pte) & _PAGE_ACCESSED; }
17267-static inline int pte_write(pte_t pte) { return __pte_val(pte) & _PAGE_RW; }
17268-static inline int pte_file(pte_t pte) { return __pte_val(pte) & _PAGE_FILE; }
17269-static inline int pte_huge(pte_t pte) { return __pte_val(pte) & _PAGE_PSE; }
17270-static inline int pte_global(pte_t pte) { return 0; }
17271-static inline int pte_exec(pte_t pte) { return !(__pte_val(pte) & _PAGE_NX); }
17272-
17273-static inline int pmd_large(pmd_t pte) {
17274- return (__pmd_val(pte) & (_PAGE_PSE|_PAGE_PRESENT)) ==
17275- (_PAGE_PSE|_PAGE_PRESENT);
17276-}
17277-
17278-static inline pte_t pte_mkclean(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_DIRTY); }
17279-static inline pte_t pte_mkold(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_ACCESSED); }
17280-static inline pte_t pte_wrprotect(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_RW); }
17281-static inline pte_t pte_mkexec(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_NX); }
17282-static inline pte_t pte_mkdirty(pte_t pte) { return __pte_ma(__pte_val(pte) | _PAGE_DIRTY); }
17283-static inline pte_t pte_mkyoung(pte_t pte) { return __pte_ma(__pte_val(pte) | _PAGE_ACCESSED); }
17284-static inline pte_t pte_mkwrite(pte_t pte) { return __pte_ma(__pte_val(pte) | _PAGE_RW); }
17285-static inline pte_t pte_mkhuge(pte_t pte) { return __pte_ma(__pte_val(pte) | _PAGE_PSE); }
17286-static inline pte_t pte_clrhuge(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_PSE); }
17287-static inline pte_t pte_mkglobal(pte_t pte) { return pte; }
17288-static inline pte_t pte_clrglobal(pte_t pte) { return pte; }
17289+static inline int pte_dirty(pte_t pte)
17290+{
17291+ return __pte_val(pte) & _PAGE_DIRTY;
17292+}
17293+
17294+static inline int pte_young(pte_t pte)
17295+{
17296+ return __pte_val(pte) & _PAGE_ACCESSED;
17297+}
17298+
17299+static inline int pte_write(pte_t pte)
17300+{
17301+ return __pte_val(pte) & _PAGE_RW;
17302+}
17303+
17304+static inline int pte_file(pte_t pte)
17305+{
17306+ return __pte_val(pte) & _PAGE_FILE;
17307+}
17308+
17309+static inline int pte_huge(pte_t pte)
17310+{
17311+ return __pte_val(pte) & _PAGE_PSE;
17312+}
17313+
17314+static inline int pte_global(pte_t pte)
17315+{
17316+ return 0;
17317+}
17318+
17319+static inline int pte_exec(pte_t pte)
17320+{
17321+ return !(__pte_val(pte) & _PAGE_NX);
17322+}
17323+
17324+static inline int pte_special(pte_t pte)
17325+{
17326+ return 0;
17327+}
17328+
17329+static inline int pmd_large(pmd_t pte)
17330+{
17331+ return (__pmd_val(pte) & (_PAGE_PSE | _PAGE_PRESENT)) ==
17332+ (_PAGE_PSE | _PAGE_PRESENT);
17333+}
17334+
17335+static inline pte_t pte_mkclean(pte_t pte)
17336+{
17337+ return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_DIRTY);
17338+}
17339+
17340+static inline pte_t pte_mkold(pte_t pte)
17341+{
17342+ return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_ACCESSED);
17343+}
17344+
17345+static inline pte_t pte_wrprotect(pte_t pte)
17346+{
17347+ return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_RW);
17348+}
17349+
17350+static inline pte_t pte_mkexec(pte_t pte)
17351+{
17352+ return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_NX);
17353+}
17354+
17355+static inline pte_t pte_mkdirty(pte_t pte)
17356+{
17357+ return __pte_ma(__pte_val(pte) | _PAGE_DIRTY);
17358+}
17359+
17360+static inline pte_t pte_mkyoung(pte_t pte)
17361+{
17362+ return __pte_ma(__pte_val(pte) | _PAGE_ACCESSED);
17363+}
17364+
17365+static inline pte_t pte_mkwrite(pte_t pte)
17366+{
17367+ return __pte_ma(__pte_val(pte) | _PAGE_RW);
17368+}
17369+
17370+static inline pte_t pte_mkhuge(pte_t pte)
17371+{
17372+ return __pte_ma(__pte_val(pte) | _PAGE_PSE);
17373+}
17374+
17375+static inline pte_t pte_clrhuge(pte_t pte)
17376+{
17377+ return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_PSE);
17378+}
17379+
17380+static inline pte_t pte_mkglobal(pte_t pte)
17381+{
17382+ return pte;
17383+}
17384+
17385+static inline pte_t pte_clrglobal(pte_t pte)
17386+{
17387+ return pte;
17388+}
17389+
17390+static inline pte_t pte_mkspecial(pte_t pte)
17391+{
17392+ return pte;
17393+}
17394
17395 extern pteval_t __supported_pte_mask;
17396
17397@@ -202,15 +311,33 @@ static inline pte_t pte_modify(pte_t pte
17398 pteval_t val = pte_val(pte);
17399
17400 val &= _PAGE_CHG_MASK;
17401- val |= pgprot_val(newprot) & __supported_pte_mask;
17402+ val |= pgprot_val(newprot) & (~_PAGE_CHG_MASK) & __supported_pte_mask;
17403
17404 return __pte(val);
17405 }
17406
17407-#define pte_pgprot(x) __pgprot(pte_val(x) & (0xfff | _PAGE_NX))
17408+/* mprotect needs to preserve PAT bits when updating vm_page_prot */
17409+#define pgprot_modify pgprot_modify
17410+static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
17411+{
17412+ pgprotval_t preservebits = pgprot_val(oldprot) & _PAGE_CHG_MASK;
17413+ pgprotval_t addbits = pgprot_val(newprot);
17414+ return __pgprot(preservebits | addbits);
17415+}
17416+
17417+#define pte_pgprot(x) __pgprot(__pte_val(x) & ~PTE_MASK)
17418
17419 #define canon_pgprot(p) __pgprot(pgprot_val(p) & __supported_pte_mask)
17420
17421+#ifndef __ASSEMBLY__
17422+#define __HAVE_PHYS_MEM_ACCESS_PROT
17423+struct file;
17424+pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
17425+ unsigned long size, pgprot_t vma_prot);
17426+int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
17427+ unsigned long size, pgprot_t *vma_prot);
17428+#endif
17429+
17430 #define set_pte(ptep, pte) xen_set_pte(ptep, pte)
17431 #define set_pte_at(mm, addr, ptep, pte) xen_set_pte_at(mm, addr, ptep, pte)
17432
17433@@ -246,6 +373,9 @@ static inline pte_t pte_modify(pte_t pte
17434 # include "pgtable_64.h"
17435 #endif
17436
17437+#define KERNEL_PGD_BOUNDARY pgd_index(PAGE_OFFSET)
17438+#define KERNEL_PGD_PTRS (PTRS_PER_PGD - KERNEL_PGD_BOUNDARY)
17439+
17440 #ifndef __ASSEMBLY__
17441
17442 enum {
17443@@ -312,46 +442,17 @@ static inline void xen_pte_clear(struct
17444 * bit at the same time.
17445 */
17446 #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
17447-#define ptep_set_access_flags(vma, address, ptep, entry, dirty) \
17448-({ \
17449- int __changed = !pte_same(*(ptep), entry); \
17450- if (__changed && (dirty)) { \
17451- if ( likely((vma)->vm_mm == current->mm) ) { \
17452- BUG_ON(HYPERVISOR_update_va_mapping(address, \
17453- entry, \
17454- (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
17455- UVMF_INVLPG|UVMF_MULTI)); \
17456- } else { \
17457- xen_l1_entry_update(ptep, entry); \
17458- flush_tlb_page(vma, address); \
17459- } \
17460- } \
17461- __changed; \
17462-})
17463+extern int ptep_set_access_flags(struct vm_area_struct *vma,
17464+ unsigned long address, pte_t *ptep,
17465+ pte_t entry, int dirty);
17466
17467 #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
17468-#define ptep_test_and_clear_young(vma, addr, ptep) ({ \
17469- int __ret = 0; \
17470- if (pte_young(*(ptep))) \
17471- __ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, \
17472- &(ptep)->pte); \
17473- if (__ret) \
17474- pte_update((vma)->vm_mm, addr, ptep); \
17475- __ret; \
17476-})
17477+extern int ptep_test_and_clear_young(struct vm_area_struct *vma,
17478+ unsigned long addr, pte_t *ptep);
17479
17480 #define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
17481-#define ptep_clear_flush_young(vma, address, ptep) \
17482-({ \
17483- pte_t __pte = *(ptep); \
17484- int __young = pte_young(__pte); \
17485- __pte = pte_mkold(__pte); \
17486- if (PagePinned(virt_to_page((vma)->vm_mm->pgd))) \
17487- (void)ptep_set_access_flags(vma, address, ptep, __pte, __young); \
17488- else if (__young) \
17489- (ptep)->pte_low = __pte.pte_low; \
17490- __young; \
17491-})
17492+extern int ptep_clear_flush_young(struct vm_area_struct *vma,
17493+ unsigned long address, pte_t *ptep);
17494
17495 #define __HAVE_ARCH_PTEP_CLEAR_FLUSH
17496 #define ptep_clear_flush(vma, addr, ptep) \
17497@@ -370,7 +471,8 @@ static inline void xen_pte_clear(struct
17498 })
17499
17500 #define __HAVE_ARCH_PTEP_GET_AND_CLEAR
17501-static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
17502+static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
17503+ pte_t *ptep)
17504 {
17505 pte_t pte = *ptep;
17506 if (!pte_none(pte)
17507@@ -398,13 +500,29 @@ static inline pte_t ptep_get_and_clear(s
17508 pte_t xen_ptep_get_and_clear_full(struct vm_area_struct *, unsigned long, pte_t *, int);
17509
17510 #define __HAVE_ARCH_PTEP_SET_WRPROTECT
17511-static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
17512+static inline void ptep_set_wrprotect(struct mm_struct *mm,
17513+ unsigned long addr, pte_t *ptep)
17514 {
17515 pte_t pte = *ptep;
17516 if (pte_write(pte))
17517 set_pte_at(mm, addr, ptep, pte_wrprotect(pte));
17518 }
17519
17520+/*
17521+ * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
17522+ *
17523+ * dst - pointer to pgd range anywhere on a pgd page
17524+ * src - ""
17525+ * count - the number of pgds to copy.
17526+ *
17527+ * dst and src can be on the same page, but the range must not overlap,
17528+ * and must not cross a page boundary.
17529+ */
17530+static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
17531+{
17532+ memcpy(dst, src, count * sizeof(pgd_t));
17533+}
17534+
17535 #define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) \
17536 xen_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable)
17537
17538--- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/pgtable-3level.h 2009-03-16 16:33:40.000000000 +0100
17539+++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/pgtable-3level.h 2009-03-16 16:38:05.000000000 +0100
17540@@ -8,25 +8,28 @@
17541 * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
17542 */
17543
17544-#define pte_ERROR(e) \
17545- printk("%s:%d: bad pte %p(%016Lx pfn %08lx).\n", __FILE__, __LINE__, \
17546- &(e), __pte_val(e), pte_pfn(e))
17547-#define pmd_ERROR(e) \
17548- printk("%s:%d: bad pmd %p(%016Lx pfn %08Lx).\n", __FILE__, __LINE__, \
17549- &(e), __pmd_val(e), (pmd_val(e) & PTE_MASK) >> PAGE_SHIFT)
17550-#define pgd_ERROR(e) \
17551- printk("%s:%d: bad pgd %p(%016Lx pfn %08Lx).\n", __FILE__, __LINE__, \
17552- &(e), __pgd_val(e), (pgd_val(e) & PTE_MASK) >> PAGE_SHIFT)
17553-
17554+#define pte_ERROR(e) \
17555+ printk("%s:%d: bad pte %p(%016Lx pfn %08lx).\n", \
17556+ __FILE__, __LINE__, &(e), __pte_val(e), pte_pfn(e))
17557+#define pmd_ERROR(e) \
17558+ printk("%s:%d: bad pmd %p(%016Lx pfn %08Lx).\n", \
17559+ __FILE__, __LINE__, &(e), __pmd_val(e), \
17560+ (pmd_val(e) & PTE_MASK) >> PAGE_SHIFT)
17561+#define pgd_ERROR(e) \
17562+ printk("%s:%d: bad pgd %p(%016Lx pfn %08Lx).\n", \
17563+ __FILE__, __LINE__, &(e), __pgd_val(e), \
17564+ (pgd_val(e) & PTE_MASK) >> PAGE_SHIFT)
17565
17566 static inline int pud_none(pud_t pud)
17567 {
17568 return __pud_val(pud) == 0;
17569+
17570 }
17571 static inline int pud_bad(pud_t pud)
17572 {
17573 return (__pud_val(pud) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER)) != 0;
17574 }
17575+
17576 static inline int pud_present(pud_t pud)
17577 {
17578 return __pud_val(pud) & _PAGE_PRESENT;
17579@@ -48,12 +51,14 @@ static inline void xen_set_pte(pte_t *pt
17580
17581 static inline void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
17582 {
17583- set_64bit((unsigned long long *)(ptep),__pte_val(pte));
17584+ set_64bit((unsigned long long *)(ptep), __pte_val(pte));
17585 }
17586+
17587 static inline void xen_set_pmd(pmd_t *pmdp, pmd_t pmd)
17588 {
17589 xen_l2_entry_update(pmdp, pmd);
17590 }
17591+
17592 static inline void xen_set_pud(pud_t *pudp, pud_t pud)
17593 {
17594 xen_l3_entry_update(pudp, pud);
17595@@ -92,20 +97,19 @@ static inline void pud_clear(pud_t *pudp
17596 * current pgd to avoid unnecessary TLB flushes.
17597 */
17598 pgd = read_cr3();
17599- if (__pa(pudp) >= pgd && __pa(pudp) < (pgd + sizeof(pgd_t)*PTRS_PER_PGD))
17600+ if (__pa(pudp) >= pgd && __pa(pudp) <
17601+ (pgd + sizeof(pgd_t)*PTRS_PER_PGD))
17602 xen_tlb_flush();
17603 }
17604
17605-#define pud_page(pud) \
17606-((struct page *) __va(pud_val(pud) & PAGE_MASK))
17607+#define pud_page(pud) ((struct page *) __va(pud_val(pud) & PTE_MASK))
17608
17609-#define pud_page_vaddr(pud) \
17610-((unsigned long) __va(pud_val(pud) & PAGE_MASK))
17611+#define pud_page_vaddr(pud) ((unsigned long) __va(pud_val(pud) & PTE_MASK))
17612
17613
17614 /* Find an entry in the second-level page table.. */
17615-#define pmd_offset(pud, address) ((pmd_t *) pud_page(*(pud)) + \
17616- pmd_index(address))
17617+#define pmd_offset(pud, address) ((pmd_t *)pud_page(*(pud)) + \
17618+ pmd_index(address))
17619
17620 #ifdef CONFIG_SMP
17621 static inline pte_t xen_ptep_get_and_clear(pte_t *ptep, pte_t res)
17622@@ -150,7 +154,8 @@ static inline int pte_none(pte_t pte)
17623 * put the 32 bits of offset into the high part.
17624 */
17625 #define pte_to_pgoff(pte) ((pte).pte_high)
17626-#define pgoff_to_pte(off) ((pte_t) { { .pte_low = _PAGE_FILE, .pte_high = (off) } })
17627+#define pgoff_to_pte(off) \
17628+ ((pte_t) { { .pte_low = _PAGE_FILE, .pte_high = (off) } })
17629 #define PTE_FILE_MAX_BITS 32
17630
17631 /* Encode and de-code a swap entry */
17632--- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/pgtable_32.h 2009-03-16 16:33:40.000000000 +0100
17633+++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/pgtable_32.h 2009-03-16 16:38:05.000000000 +0100
17634@@ -38,16 +38,13 @@ void paging_init(void);
17635 #ifdef CONFIG_X86_PAE
17636 # include <asm/pgtable-3level-defs.h>
17637 # define PMD_SIZE (1UL << PMD_SHIFT)
17638-# define PMD_MASK (~(PMD_SIZE-1))
17639+# define PMD_MASK (~(PMD_SIZE - 1))
17640 #else
17641 # include <asm/pgtable-2level-defs.h>
17642 #endif
17643
17644 #define PGDIR_SIZE (1UL << PGDIR_SHIFT)
17645-#define PGDIR_MASK (~(PGDIR_SIZE-1))
17646-
17647-#define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT)
17648-#define KERNEL_PGD_PTRS (PTRS_PER_PGD-USER_PGD_PTRS)
17649+#define PGDIR_MASK (~(PGDIR_SIZE - 1))
17650
17651 /* Just any arbitrary offset to the start of the vmalloc VM area: the
17652 * current 8MB value just means that there will be a 8MB "hole" after the
17653@@ -56,21 +53,22 @@ void paging_init(void);
17654 * The vmalloc() routines leaves a hole of 4kB between each vmalloced
17655 * area for the same reason. ;)
17656 */
17657-#define VMALLOC_OFFSET (8*1024*1024)
17658-#define VMALLOC_START (((unsigned long) high_memory + \
17659- 2*VMALLOC_OFFSET-1) & ~(VMALLOC_OFFSET-1))
17660+#define VMALLOC_OFFSET (8 * 1024 * 1024)
17661+#define VMALLOC_START (((unsigned long)high_memory + 2 * VMALLOC_OFFSET - 1) \
17662+ & ~(VMALLOC_OFFSET - 1))
17663 #ifdef CONFIG_X86_PAE
17664 #define LAST_PKMAP 512
17665 #else
17666 #define LAST_PKMAP 1024
17667 #endif
17668
17669-#define PKMAP_BASE ((FIXADDR_BOOT_START - PAGE_SIZE*(LAST_PKMAP + 1)) & PMD_MASK)
17670+#define PKMAP_BASE ((FIXADDR_BOOT_START - PAGE_SIZE * (LAST_PKMAP + 1)) \
17671+ & PMD_MASK)
17672
17673 #ifdef CONFIG_HIGHMEM
17674-# define VMALLOC_END (PKMAP_BASE-2*PAGE_SIZE)
17675+# define VMALLOC_END (PKMAP_BASE - 2 * PAGE_SIZE)
17676 #else
17677-# define VMALLOC_END (FIXADDR_START-2*PAGE_SIZE)
17678+# define VMALLOC_END (FIXADDR_START - 2 * PAGE_SIZE)
17679 #endif
17680
17681 /*
17682@@ -91,10 +89,10 @@ extern unsigned long pg0[];
17683 /* pmd_present doesn't just test the _PAGE_PRESENT bit since wr.p.t.
17684 can temporarily clear it. */
17685 #define pmd_present(x) (__pmd_val(x))
17686-#define pmd_bad(x) ((__pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT))
17687+#define pmd_bad(x) ((__pmd_val(x) & (~PTE_MASK & ~_PAGE_USER & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT))
17688 #else
17689 #define pmd_present(x) (__pmd_val(x) & _PAGE_PRESENT)
17690-#define pmd_bad(x) ((__pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
17691+#define pmd_bad(x) ((__pmd_val(x) & (~PTE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
17692 #endif
17693
17694
17695@@ -107,32 +105,18 @@ extern unsigned long pg0[];
17696 #endif
17697
17698 /*
17699- * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
17700- *
17701- * dst - pointer to pgd range anwhere on a pgd page
17702- * src - ""
17703- * count - the number of pgds to copy.
17704- *
17705- * dst and src can be on the same page, but the range must not overlap,
17706- * and must not cross a page boundary.
17707+ * Macro to mark a page protection value as "uncacheable".
17708+ * On processors which do not support it, this is a no-op.
17709 */
17710-static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
17711-{
17712- memcpy(dst, src, count * sizeof(pgd_t));
17713-}
17714-
17715-/*
17716- * Macro to mark a page protection value as "uncacheable". On processors which do not support
17717- * it, this is a no-op.
17718- */
17719-#define pgprot_noncached(prot) ((boot_cpu_data.x86 > 3) \
17720- ? (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT)) : (prot))
17721+#define pgprot_noncached(prot) \
17722+ ((boot_cpu_data.x86 > 3) \
17723+ ? (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT)) \
17724+ : (prot))
17725
17726 /*
17727 * Conversion functions: convert a page and protection to a page entry,
17728 * and a page entry and page directory to the page they refer to.
17729 */
17730-
17731 #define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
17732
17733 /*
17734@@ -141,20 +125,20 @@ static inline void clone_pgd_range(pgd_t
17735 * this macro returns the index of the entry in the pgd page which would
17736 * control the given virtual address
17737 */
17738-#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1))
17739-#define pgd_index_k(addr) pgd_index(addr)
17740+#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
17741+#define pgd_index_k(addr) pgd_index((addr))
17742
17743 /*
17744 * pgd_offset() returns a (pgd_t *)
17745 * pgd_index() is used get the offset into the pgd page's array of pgd_t's;
17746 */
17747-#define pgd_offset(mm, address) ((mm)->pgd+pgd_index(address))
17748+#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address)))
17749
17750 /*
17751 * a shortcut which implies the use of the kernel's pgd, instead
17752 * of a process's
17753 */
17754-#define pgd_offset_k(address) pgd_offset(&init_mm, address)
17755+#define pgd_offset_k(address) pgd_offset(&init_mm, (address))
17756
17757 static inline int pud_large(pud_t pud) { return 0; }
17758
17759@@ -164,8 +148,8 @@ static inline int pud_large(pud_t pud) {
17760 * this macro returns the index of the entry in the pmd page which would
17761 * control the given virtual address
17762 */
17763-#define pmd_index(address) \
17764- (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1))
17765+#define pmd_index(address) \
17766+ (((address) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))
17767
17768 /*
17769 * the pte page can be thought of an array like this: pte_t[PTRS_PER_PTE]
17770@@ -173,33 +157,36 @@ static inline int pud_large(pud_t pud) {
17771 * this macro returns the index of the entry in the pte page which would
17772 * control the given virtual address
17773 */
17774-#define pte_index(address) \
17775- (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
17776-#define pte_offset_kernel(dir, address) \
17777- ((pte_t *) pmd_page_vaddr(*(dir)) + pte_index(address))
17778+#define pte_index(address) \
17779+ (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
17780+#define pte_offset_kernel(dir, address) \
17781+ ((pte_t *)pmd_page_vaddr(*(dir)) + pte_index((address)))
17782
17783-#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
17784+#define pmd_page(pmd) (pfn_to_page(pmd_val((pmd)) >> PAGE_SHIFT))
17785
17786-#define pmd_page_vaddr(pmd) \
17787- ((unsigned long) __va(pmd_val(pmd) & PAGE_MASK))
17788+#define pmd_page_vaddr(pmd) \
17789+ ((unsigned long)__va(pmd_val((pmd)) & PTE_MASK))
17790
17791 #if defined(CONFIG_HIGHPTE)
17792-#define pte_offset_map(dir, address) \
17793- ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE0) + pte_index(address))
17794-#define pte_offset_map_nested(dir, address) \
17795- ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE1) + pte_index(address))
17796-#define pte_unmap(pte) kunmap_atomic(pte, KM_PTE0)
17797-#define pte_unmap_nested(pte) kunmap_atomic(pte, KM_PTE1)
17798-#else
17799-#define pte_offset_map(dir, address) \
17800- ((pte_t *)page_address(pmd_page(*(dir))) + pte_index(address))
17801-#define pte_offset_map_nested(dir, address) pte_offset_map(dir, address)
17802+#define pte_offset_map(dir, address) \
17803+ ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE0) + \
17804+ pte_index((address)))
17805+#define pte_offset_map_nested(dir, address) \
17806+ ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE1) + \
17807+ pte_index((address)))
17808+#define pte_unmap(pte) kunmap_atomic((pte), KM_PTE0)
17809+#define pte_unmap_nested(pte) kunmap_atomic((pte), KM_PTE1)
17810+#else
17811+#define pte_offset_map(dir, address) \
17812+ ((pte_t *)page_address(pmd_page(*(dir))) + pte_index((address)))
17813+#define pte_offset_map_nested(dir, address) pte_offset_map((dir), (address))
17814 #define pte_unmap(pte) do { } while (0)
17815 #define pte_unmap_nested(pte) do { } while (0)
17816 #endif
17817
17818 /* Clear a kernel PTE and flush it from the TLB */
17819-#define kpte_clear_flush(ptep, vaddr) do { \
17820+#define kpte_clear_flush(ptep, vaddr) \
17821+do { \
17822 if (HYPERVISOR_update_va_mapping(vaddr, __pte(0), UVMF_INVLPG)) \
17823 BUG(); \
17824 } while (0)
17825@@ -208,7 +195,7 @@ static inline int pud_large(pud_t pud) {
17826 * The i386 doesn't have any external MMU info: the kernel page
17827 * tables contain all the necessary information.
17828 */
17829-#define update_mmu_cache(vma,address,pte) do { } while (0)
17830+#define update_mmu_cache(vma, address, pte) do { } while (0)
17831
17832 void make_lowmem_page_readonly(void *va, unsigned int feature);
17833 void make_lowmem_page_writable(void *va, unsigned int feature);
17834@@ -225,7 +212,7 @@ void make_lowmem_page_writable(void *va,
17835 #define kern_addr_valid(kaddr) (0)
17836 #endif
17837
17838-#define io_remap_pfn_range(vma,from,pfn,size,prot) \
17839-direct_remap_pfn_range(vma,from,pfn,size,prot,DOMID_IO)
17840+#define io_remap_pfn_range(vma, from, pfn, size, prot) \
17841+ direct_remap_pfn_range(vma, from, pfn, size, prot, DOMID_IO)
17842
17843 #endif /* _I386_PGTABLE_H */
17844--- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/pgtable_64.h 2009-03-16 16:33:40.000000000 +0100
17845+++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/pgtable_64.h 2009-03-16 16:38:05.000000000 +0100
17846@@ -31,7 +31,7 @@ extern void paging_init(void);
17847
17848 #endif /* !__ASSEMBLY__ */
17849
17850-#define SHARED_KERNEL_PMD 1
17851+#define SHARED_KERNEL_PMD 0
17852
17853 /*
17854 * PGDIR_SHIFT determines what a top-level page table entry can map
17855@@ -59,18 +59,20 @@ extern void paging_init(void);
17856
17857 #ifndef __ASSEMBLY__
17858
17859-#define pte_ERROR(e) \
17860- printk("%s:%d: bad pte %p(%016lx pfn %010lx).\n", __FILE__, __LINE__, \
17861- &(e), __pte_val(e), pte_pfn(e))
17862-#define pmd_ERROR(e) \
17863- printk("%s:%d: bad pmd %p(%016lx pfn %010lx).\n", __FILE__, __LINE__, \
17864- &(e), __pmd_val(e), pmd_pfn(e))
17865-#define pud_ERROR(e) \
17866- printk("%s:%d: bad pud %p(%016lx pfn %010lx).\n", __FILE__, __LINE__, \
17867- &(e), __pud_val(e), (pud_val(e) & __PHYSICAL_MASK) >> PAGE_SHIFT)
17868-#define pgd_ERROR(e) \
17869- printk("%s:%d: bad pgd %p(%016lx pfn %010lx).\n", __FILE__, __LINE__, \
17870- &(e), __pgd_val(e), (pgd_val(e) & __PHYSICAL_MASK) >> PAGE_SHIFT)
17871+#define pte_ERROR(e) \
17872+ printk("%s:%d: bad pte %p(%016lx pfn %010lx).\n", \
17873+ __FILE__, __LINE__, &(e), __pte_val(e), pte_pfn(e))
17874+#define pmd_ERROR(e) \
17875+ printk("%s:%d: bad pmd %p(%016lx pfn %010lx).\n", \
17876+ __FILE__, __LINE__, &(e), __pmd_val(e), pmd_pfn(e))
17877+#define pud_ERROR(e) \
17878+ printk("%s:%d: bad pud %p(%016lx pfn %010lx).\n", \
17879+ __FILE__, __LINE__, &(e), __pud_val(e), \
17880+ (pud_val(e) & __PHYSICAL_MASK) >> PAGE_SHIFT)
17881+#define pgd_ERROR(e) \
17882+ printk("%s:%d: bad pgd %p(%016lx pfn %010lx).\n", \
17883+ __FILE__, __LINE__, &(e), __pgd_val(e), \
17884+ (pgd_val(e) & __PHYSICAL_MASK) >> PAGE_SHIFT)
17885
17886 #define pgd_none(x) (!__pgd_val(x))
17887 #define pud_none(x) (!__pud_val(x))
17888@@ -125,7 +127,7 @@ static inline void xen_set_pgd(pgd_t *pg
17889 xen_l4_entry_update(pgdp, pgd);
17890 }
17891
17892-static inline void xen_pgd_clear(pgd_t * pgd)
17893+static inline void xen_pgd_clear(pgd_t *pgd)
17894 {
17895 xen_set_pgd(pgd, xen_make_pgd(0));
17896 xen_set_pgd(__user_pgd(pgd), xen_make_pgd(0));
17897@@ -135,43 +137,43 @@ static inline void xen_pgd_clear(pgd_t *
17898
17899 #endif /* !__ASSEMBLY__ */
17900
17901-#define PMD_SIZE (_AC(1,UL) << PMD_SHIFT)
17902-#define PMD_MASK (~(PMD_SIZE-1))
17903-#define PUD_SIZE (_AC(1,UL) << PUD_SHIFT)
17904-#define PUD_MASK (~(PUD_SIZE-1))
17905-#define PGDIR_SIZE (_AC(1,UL) << PGDIR_SHIFT)
17906-#define PGDIR_MASK (~(PGDIR_SIZE-1))
17907+#define PMD_SIZE (_AC(1, UL) << PMD_SHIFT)
17908+#define PMD_MASK (~(PMD_SIZE - 1))
17909+#define PUD_SIZE (_AC(1, UL) << PUD_SHIFT)
17910+#define PUD_MASK (~(PUD_SIZE - 1))
17911+#define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT)
17912+#define PGDIR_MASK (~(PGDIR_SIZE - 1))
17913
17914
17915-#define MAXMEM _AC(0x3fffffffffff, UL)
17916+#define MAXMEM _AC(0x00003fffffffffff, UL)
17917 #define VMALLOC_START _AC(0xffffc20000000000, UL)
17918 #define VMALLOC_END _AC(0xffffe1ffffffffff, UL)
17919 #define VMEMMAP_START _AC(0xffffe20000000000, UL)
17920-#define MODULES_VADDR _AC(0xffffffff88000000, UL)
17921+#define MODULES_VADDR _AC(0xffffffffa0000000, UL)
17922 #define MODULES_END _AC(0xfffffffffff00000, UL)
17923 #define MODULES_LEN (MODULES_END - MODULES_VADDR)
17924
17925 #ifndef __ASSEMBLY__
17926
17927-static inline unsigned long pgd_bad(pgd_t pgd)
17928+static inline int pgd_bad(pgd_t pgd)
17929 {
17930- return __pgd_val(pgd) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER);
17931+ return (__pgd_val(pgd) & ~(PTE_MASK | _PAGE_USER)) != _KERNPG_TABLE;
17932 }
17933
17934-static inline unsigned long pud_bad(pud_t pud)
17935+static inline int pud_bad(pud_t pud)
17936 {
17937- return __pud_val(pud) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER);
17938+ return (__pud_val(pud) & ~(PTE_MASK | _PAGE_USER)) != _KERNPG_TABLE;
17939 }
17940
17941-static inline unsigned long pmd_bad(pmd_t pmd)
17942+static inline int pmd_bad(pmd_t pmd)
17943 {
17944- return __pmd_val(pmd) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER);
17945+ return (__pmd_val(pmd) & ~(PTE_MASK | _PAGE_USER)) != _KERNPG_TABLE;
17946 }
17947
17948 #define pte_none(x) (!(x).pte)
17949 #define pte_present(x) ((x).pte & (_PAGE_PRESENT | _PAGE_PROTNONE))
17950
17951-#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT)) /* FIXME: is this right? */
17952+#define pages_to_mb(x) ((x) >> (20 - PAGE_SHIFT)) /* FIXME: is this right? */
17953
17954 #define __pte_mfn(_pte) (((_pte).pte & PTE_MASK) >> PAGE_SHIFT)
17955 #define pte_mfn(_pte) ((_pte).pte & _PAGE_PRESENT ? \
17956@@ -181,13 +183,13 @@ static inline unsigned long pmd_bad(pmd_
17957 mfn_to_local_pfn(__pte_mfn(_pte)) : \
17958 __pte_mfn(_pte))
17959
17960-#define pte_page(x) pfn_to_page(pte_pfn(x))
17961+#define pte_page(x) pfn_to_page(pte_pfn((x)))
17962
17963 /*
17964 * Macro to mark a page protection value as "uncacheable".
17965 */
17966-#define pgprot_noncached(prot) (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT))
17967-
17968+#define pgprot_noncached(prot) \
17969+ (__pgprot(pgprot_val((prot)) | _PAGE_PCD | _PAGE_PWT))
17970
17971 /*
17972 * Conversion functions: convert a page and protection to a page entry,
17973@@ -197,36 +199,39 @@ static inline unsigned long pmd_bad(pmd_
17974 /*
17975 * Level 4 access.
17976 */
17977-#define pgd_page_vaddr(pgd) ((unsigned long) __va(pgd_val(pgd) & PTE_MASK))
17978-#define pgd_page(pgd) (pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT))
17979-#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1))
17980-#define pgd_offset(mm, addr) ((mm)->pgd + pgd_index(addr))
17981-#define pgd_offset_k(address) (init_level4_pgt + pgd_index(address))
17982+#define pgd_page_vaddr(pgd) \
17983+ ((unsigned long)__va((unsigned long)pgd_val((pgd)) & PTE_MASK))
17984+#define pgd_page(pgd) (pfn_to_page(pgd_val((pgd)) >> PAGE_SHIFT))
17985+#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
17986+#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address)))
17987+#define pgd_offset_k(address) (init_level4_pgt + pgd_index((address)))
17988 #define pgd_present(pgd) (__pgd_val(pgd) & _PAGE_PRESENT)
17989 static inline int pgd_large(pgd_t pgd) { return 0; }
17990 #define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE)
17991
17992 /* PUD - Level3 access */
17993 /* to find an entry in a page-table-directory. */
17994-#define pud_page_vaddr(pud) ((unsigned long) __va(pud_val(pud) & PHYSICAL_PAGE_MASK))
17995-#define pud_page(pud) (pfn_to_page(pud_val(pud) >> PAGE_SHIFT))
17996-#define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
17997-#define pud_offset(pgd, address) ((pud_t *) pgd_page_vaddr(*(pgd)) + pud_index(address))
17998+#define pud_page_vaddr(pud) \
17999+ ((unsigned long)__va(pud_val((pud)) & PHYSICAL_PAGE_MASK))
18000+#define pud_page(pud) (pfn_to_page(pud_val((pud)) >> PAGE_SHIFT))
18001+#define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD - 1))
18002+#define pud_offset(pgd, address) \
18003+ ((pud_t *)pgd_page_vaddr(*(pgd)) + pud_index((address)))
18004 #define pud_present(pud) (__pud_val(pud) & _PAGE_PRESENT)
18005
18006 static inline int pud_large(pud_t pte)
18007 {
18008- return (__pud_val(pte) & (_PAGE_PSE|_PAGE_PRESENT)) ==
18009- (_PAGE_PSE|_PAGE_PRESENT);
18010+ return (__pud_val(pte) & (_PAGE_PSE | _PAGE_PRESENT)) ==
18011+ (_PAGE_PSE | _PAGE_PRESENT);
18012 }
18013
18014 /* PMD - Level 2 access */
18015-#define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val(pmd) & PTE_MASK))
18016-#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
18017+#define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val((pmd)) & PTE_MASK))
18018+#define pmd_page(pmd) (pfn_to_page(pmd_val((pmd)) >> PAGE_SHIFT))
18019
18020-#define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1))
18021-#define pmd_offset(dir, address) ((pmd_t *) pud_page_vaddr(*(dir)) + \
18022- pmd_index(address))
18023+#define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))
18024+#define pmd_offset(dir, address) ((pmd_t *)pud_page_vaddr(*(dir)) + \
18025+ pmd_index(address))
18026 #define pmd_none(x) (!__pmd_val(x))
18027 #if CONFIG_XEN_COMPAT <= 0x030002
18028 /* pmd_present doesn't just test the _PAGE_PRESENT bit since wr.p.t.
18029@@ -235,43 +240,56 @@ static inline int pud_large(pud_t pte)
18030 #else
18031 #define pmd_present(x) (__pmd_val(x) & _PAGE_PRESENT)
18032 #endif
18033-#define pfn_pmd(nr,prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val(prot)))
18034-#define pmd_pfn(x) ((pmd_val(x) & __PHYSICAL_MASK) >> PAGE_SHIFT)
18035+#define pfn_pmd(nr, prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val((prot))))
18036+#define pmd_pfn(x) ((pmd_val((x)) & __PHYSICAL_MASK) >> PAGE_SHIFT)
18037
18038 #define pte_to_pgoff(pte) ((__pte_val(pte) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT)
18039-#define pgoff_to_pte(off) ((pte_t) { .pte = ((off) << PAGE_SHIFT) | _PAGE_FILE })
18040+#define pgoff_to_pte(off) ((pte_t) { .pte = ((off) << PAGE_SHIFT) | \
18041+ _PAGE_FILE })
18042 #define PTE_FILE_MAX_BITS __PHYSICAL_MASK_SHIFT
18043
18044 /* PTE - Level 1 access. */
18045
18046 /* page, protection -> pte */
18047-#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
18048-
18049-#define pte_index(address) \
18050- (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
18051+#define mk_pte(page, pgprot) pfn_pte(page_to_pfn((page)), (pgprot))
18052+
18053+#define pte_index(address) (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
18054 #define pte_offset_kernel(dir, address) ((pte_t *) pmd_page_vaddr(*(dir)) + \
18055- pte_index(address))
18056+ pte_index((address)))
18057
18058 /* x86-64 always has all page tables mapped. */
18059-#define pte_offset_map(dir,address) pte_offset_kernel(dir,address)
18060-#define pte_offset_map_nested(dir,address) pte_offset_kernel(dir,address)
18061+#define pte_offset_map(dir, address) pte_offset_kernel((dir), (address))
18062+#define pte_offset_map_nested(dir, address) pte_offset_kernel((dir), (address))
18063 #define pte_unmap(pte) /* NOP */
18064-#define pte_unmap_nested(pte) /* NOP */
18065+#define pte_unmap_nested(pte) /* NOP */
18066+
18067+#define update_mmu_cache(vma, address, pte) do { } while (0)
18068
18069-#define update_mmu_cache(vma,address,pte) do { } while (0)
18070+#define direct_gbpages 0
18071
18072 /* Encode and de-code a swap entry */
18073-#define __swp_type(x) (((x).val >> 1) & 0x3f)
18074-#define __swp_offset(x) ((x).val >> 8)
18075-#define __swp_entry(type, offset) ((swp_entry_t) { ((type) << 1) | ((offset) << 8) })
18076+#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE
18077+#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1)
18078+#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1)
18079+#else
18080+#define SWP_TYPE_BITS (_PAGE_BIT_PROTNONE - _PAGE_BIT_PRESENT - 1)
18081+#define SWP_OFFSET_SHIFT (_PAGE_BIT_FILE + 1)
18082+#endif
18083+
18084+#define __swp_type(x) (((x).val >> (_PAGE_BIT_PRESENT + 1)) \
18085+ & ((1U << SWP_TYPE_BITS) - 1))
18086+#define __swp_offset(x) ((x).val >> SWP_OFFSET_SHIFT)
18087+#define __swp_entry(type, offset) ((swp_entry_t) { \
18088+ ((type) << (_PAGE_BIT_PRESENT + 1)) \
18089+ | ((offset) << SWP_OFFSET_SHIFT) })
18090 #define __pte_to_swp_entry(pte) ((swp_entry_t) { __pte_val(pte) })
18091 #define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val })
18092
18093-extern int kern_addr_valid(unsigned long addr);
18094+extern int kern_addr_valid(unsigned long addr);
18095 extern void cleanup_highmap(void);
18096
18097-#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
18098- direct_remap_pfn_range(vma,vaddr,pfn,size,prot,DOMID_IO)
18099+#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
18100+ direct_remap_pfn_range(vma, vaddr, pfn, size, prot, DOMID_IO)
18101
18102 #define HAVE_ARCH_UNMAPPED_AREA
18103 #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
18104@@ -284,8 +302,10 @@ extern void cleanup_highmap(void);
18105
18106 /* fs/proc/kcore.c */
18107 #define kc_vaddr_to_offset(v) ((v) & __VIRTUAL_MASK)
18108-#define kc_offset_to_vaddr(o) \
18109- (((o) & (1UL << (__VIRTUAL_MASK_SHIFT-1))) ? ((o) | (~__VIRTUAL_MASK)) : (o))
18110+#define kc_offset_to_vaddr(o) \
18111+ (((o) & (1UL << (__VIRTUAL_MASK_SHIFT - 1))) \
18112+ ? ((o) | ~__VIRTUAL_MASK) \
18113+ : (o))
18114
18115 #define __HAVE_ARCH_PTE_SAME
18116 #endif /* !__ASSEMBLY__ */
18117--- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/processor.h 2009-03-16 16:33:40.000000000 +0100
18118+++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/processor.h 2009-03-16 16:38:05.000000000 +0100
18119@@ -3,10 +3,6 @@
18120
18121 #include <asm/processor-flags.h>
18122
18123-/* migration helpers, for KVM - will be removed in 2.6.25: */
18124-#include <asm/vm86.h>
18125-#define Xgt_desc_struct desc_ptr
18126-
18127 /* Forward declaration, a strange C thing */
18128 struct task_struct;
18129 struct mm_struct;
18130@@ -24,6 +20,7 @@ struct mm_struct;
18131 #include <asm/msr.h>
18132 #include <asm/desc_defs.h>
18133 #include <asm/nops.h>
18134+
18135 #include <linux/personality.h>
18136 #include <linux/cpumask.h>
18137 #include <linux/cache.h>
18138@@ -38,16 +35,18 @@ struct mm_struct;
18139 static inline void *current_text_addr(void)
18140 {
18141 void *pc;
18142- asm volatile("mov $1f,%0\n1:":"=r" (pc));
18143+
18144+ asm volatile("mov $1f, %0; 1:":"=r" (pc));
18145+
18146 return pc;
18147 }
18148
18149 #ifdef CONFIG_X86_VSMP
18150-#define ARCH_MIN_TASKALIGN (1 << INTERNODE_CACHE_SHIFT)
18151-#define ARCH_MIN_MMSTRUCT_ALIGN (1 << INTERNODE_CACHE_SHIFT)
18152+# define ARCH_MIN_TASKALIGN (1 << INTERNODE_CACHE_SHIFT)
18153+# define ARCH_MIN_MMSTRUCT_ALIGN (1 << INTERNODE_CACHE_SHIFT)
18154 #else
18155-#define ARCH_MIN_TASKALIGN 16
18156-#define ARCH_MIN_MMSTRUCT_ALIGN 0
18157+# define ARCH_MIN_TASKALIGN 16
18158+# define ARCH_MIN_MMSTRUCT_ALIGN 0
18159 #endif
18160
18161 /*
18162@@ -57,68 +56,80 @@ static inline void *current_text_addr(vo
18163 */
18164
18165 struct cpuinfo_x86 {
18166- __u8 x86; /* CPU family */
18167- __u8 x86_vendor; /* CPU vendor */
18168- __u8 x86_model;
18169- __u8 x86_mask;
18170+ __u8 x86; /* CPU family */
18171+ __u8 x86_vendor; /* CPU vendor */
18172+ __u8 x86_model;
18173+ __u8 x86_mask;
18174 #ifdef CONFIG_X86_32
18175- char wp_works_ok; /* It doesn't on 386's */
18176- char hlt_works_ok; /* Problems on some 486Dx4's and old 386's */
18177- char hard_math;
18178- char rfu;
18179- char fdiv_bug;
18180- char f00f_bug;
18181- char coma_bug;
18182- char pad0;
18183+ char wp_works_ok; /* It doesn't on 386's */
18184+
18185+ /* Problems on some 486Dx4's and old 386's: */
18186+ char hlt_works_ok;
18187+ char hard_math;
18188+ char rfu;
18189+ char fdiv_bug;
18190+ char f00f_bug;
18191+ char coma_bug;
18192+ char pad0;
18193 #else
18194- /* number of 4K pages in DTLB/ITLB combined(in pages)*/
18195- int x86_tlbsize;
18196- __u8 x86_virt_bits, x86_phys_bits;
18197- /* cpuid returned core id bits */
18198- __u8 x86_coreid_bits;
18199- /* Max extended CPUID function supported */
18200- __u32 extended_cpuid_level;
18201-#endif
18202- int cpuid_level; /* Maximum supported CPUID level, -1=no CPUID */
18203- __u32 x86_capability[NCAPINTS];
18204- char x86_vendor_id[16];
18205- char x86_model_id[64];
18206- int x86_cache_size; /* in KB - valid for CPUS which support this
18207- call */
18208- int x86_cache_alignment; /* In bytes */
18209- int x86_power;
18210- unsigned long loops_per_jiffy;
18211+ /* Number of 4K pages in DTLB/ITLB combined(in pages): */
18212+ int x86_tlbsize;
18213+ __u8 x86_virt_bits;
18214+ __u8 x86_phys_bits;
18215+ /* CPUID returned core id bits: */
18216+ __u8 x86_coreid_bits;
18217+ /* Max extended CPUID function supported: */
18218+ __u32 extended_cpuid_level;
18219+#endif
18220+ /* Maximum supported CPUID level, -1=no CPUID: */
18221+ int cpuid_level;
18222+ __u32 x86_capability[NCAPINTS];
18223+ char x86_vendor_id[16];
18224+ char x86_model_id[64];
18225+ /* in KB - valid for CPUS which support this call: */
18226+ int x86_cache_size;
18227+ int x86_cache_alignment; /* In bytes */
18228+ int x86_power;
18229+ unsigned long loops_per_jiffy;
18230 #ifdef CONFIG_SMP
18231- cpumask_t llc_shared_map; /* cpus sharing the last level cache */
18232+ /* cpus sharing the last level cache: */
18233+ cpumask_t llc_shared_map;
18234 #endif
18235- u16 x86_max_cores; /* cpuid returned max cores value */
18236- u16 apicid;
18237- u16 x86_clflush_size;
18238+ /* cpuid returned max cores value: */
18239+ u16 x86_max_cores;
18240+ u16 apicid;
18241+ u16 initial_apicid;
18242+ u16 x86_clflush_size;
18243 #ifdef CONFIG_SMP
18244- u16 booted_cores; /* number of cores as seen by OS */
18245- u16 phys_proc_id; /* Physical processor id. */
18246- u16 cpu_core_id; /* Core id */
18247- u16 cpu_index; /* index into per_cpu list */
18248+ /* number of cores as seen by the OS: */
18249+ u16 booted_cores;
18250+ /* Physical processor id: */
18251+ u16 phys_proc_id;
18252+ /* Core id: */
18253+ u16 cpu_core_id;
18254+ /* Index into per_cpu list: */
18255+ u16 cpu_index;
18256 #endif
18257 } __attribute__((__aligned__(SMP_CACHE_BYTES)));
18258
18259-#define X86_VENDOR_INTEL 0
18260-#define X86_VENDOR_CYRIX 1
18261-#define X86_VENDOR_AMD 2
18262-#define X86_VENDOR_UMC 3
18263-#define X86_VENDOR_NEXGEN 4
18264-#define X86_VENDOR_CENTAUR 5
18265-#define X86_VENDOR_TRANSMETA 7
18266-#define X86_VENDOR_NSC 8
18267-#define X86_VENDOR_NUM 9
18268-#define X86_VENDOR_UNKNOWN 0xff
18269+#define X86_VENDOR_INTEL 0
18270+#define X86_VENDOR_CYRIX 1
18271+#define X86_VENDOR_AMD 2
18272+#define X86_VENDOR_UMC 3
18273+#define X86_VENDOR_CENTAUR 5
18274+#define X86_VENDOR_TRANSMETA 7
18275+#define X86_VENDOR_NSC 8
18276+#define X86_VENDOR_NUM 9
18277+
18278+#define X86_VENDOR_UNKNOWN 0xff
18279
18280 /*
18281 * capabilities of CPUs
18282 */
18283-extern struct cpuinfo_x86 boot_cpu_data;
18284-extern struct cpuinfo_x86 new_cpu_data;
18285-extern __u32 cleared_cpu_caps[NCAPINTS];
18286+extern struct cpuinfo_x86 boot_cpu_data;
18287+extern struct cpuinfo_x86 new_cpu_data;
18288+
18289+extern __u32 cleared_cpu_caps[NCAPINTS];
18290
18291 #ifdef CONFIG_SMP
18292 DECLARE_PER_CPU(struct cpuinfo_x86, cpu_info);
18293@@ -129,7 +140,18 @@ DECLARE_PER_CPU(struct cpuinfo_x86, cpu_
18294 #define current_cpu_data boot_cpu_data
18295 #endif
18296
18297-void cpu_detect(struct cpuinfo_x86 *c);
18298+static inline int hlt_works(int cpu)
18299+{
18300+#ifdef CONFIG_X86_32
18301+ return cpu_data(cpu).hlt_works_ok;
18302+#else
18303+ return 1;
18304+#endif
18305+}
18306+
18307+#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
18308+
18309+extern void cpu_detect(struct cpuinfo_x86 *c);
18310
18311 extern void identify_cpu(struct cpuinfo_x86 *);
18312 extern void identify_boot_cpu(void);
18313@@ -149,12 +171,12 @@ static inline void xen_cpuid(unsigned in
18314 unsigned int *ecx, unsigned int *edx)
18315 {
18316 /* ecx is often an input as well as an output. */
18317- __asm__(XEN_CPUID
18318- : "=a" (*eax),
18319- "=b" (*ebx),
18320- "=c" (*ecx),
18321- "=d" (*edx)
18322- : "0" (*eax), "2" (*ecx));
18323+ asm(XEN_CPUID
18324+ : "=a" (*eax),
18325+ "=b" (*ebx),
18326+ "=c" (*ecx),
18327+ "=d" (*edx)
18328+ : "0" (*eax), "2" (*ecx));
18329 }
18330
18331 static inline void load_cr3(pgd_t *pgdir)
18332@@ -166,57 +188,70 @@ static inline void load_cr3(pgd_t *pgdir
18333 #ifdef CONFIG_X86_32
18334 /* This is the TSS defined by the hardware. */
18335 struct x86_hw_tss {
18336- unsigned short back_link, __blh;
18337- unsigned long sp0;
18338- unsigned short ss0, __ss0h;
18339- unsigned long sp1;
18340- unsigned short ss1, __ss1h; /* ss1 caches MSR_IA32_SYSENTER_CS */
18341- unsigned long sp2;
18342- unsigned short ss2, __ss2h;
18343- unsigned long __cr3;
18344- unsigned long ip;
18345- unsigned long flags;
18346- unsigned long ax, cx, dx, bx;
18347- unsigned long sp, bp, si, di;
18348- unsigned short es, __esh;
18349- unsigned short cs, __csh;
18350- unsigned short ss, __ssh;
18351- unsigned short ds, __dsh;
18352- unsigned short fs, __fsh;
18353- unsigned short gs, __gsh;
18354- unsigned short ldt, __ldth;
18355- unsigned short trace, io_bitmap_base;
18356+ unsigned short back_link, __blh;
18357+ unsigned long sp0;
18358+ unsigned short ss0, __ss0h;
18359+ unsigned long sp1;
18360+ /* ss1 caches MSR_IA32_SYSENTER_CS: */
18361+ unsigned short ss1, __ss1h;
18362+ unsigned long sp2;
18363+ unsigned short ss2, __ss2h;
18364+ unsigned long __cr3;
18365+ unsigned long ip;
18366+ unsigned long flags;
18367+ unsigned long ax;
18368+ unsigned long cx;
18369+ unsigned long dx;
18370+ unsigned long bx;
18371+ unsigned long sp;
18372+ unsigned long bp;
18373+ unsigned long si;
18374+ unsigned long di;
18375+ unsigned short es, __esh;
18376+ unsigned short cs, __csh;
18377+ unsigned short ss, __ssh;
18378+ unsigned short ds, __dsh;
18379+ unsigned short fs, __fsh;
18380+ unsigned short gs, __gsh;
18381+ unsigned short ldt, __ldth;
18382+ unsigned short trace;
18383+ unsigned short io_bitmap_base;
18384+
18385 } __attribute__((packed));
18386 extern struct tss_struct doublefault_tss;
18387 #else
18388 struct x86_hw_tss {
18389- u32 reserved1;
18390- u64 sp0;
18391- u64 sp1;
18392- u64 sp2;
18393- u64 reserved2;
18394- u64 ist[7];
18395- u32 reserved3;
18396- u32 reserved4;
18397- u16 reserved5;
18398- u16 io_bitmap_base;
18399+ u32 reserved1;
18400+ u64 sp0;
18401+ u64 sp1;
18402+ u64 sp2;
18403+ u64 reserved2;
18404+ u64 ist[7];
18405+ u32 reserved3;
18406+ u32 reserved4;
18407+ u16 reserved5;
18408+ u16 io_bitmap_base;
18409+
18410 } __attribute__((packed)) ____cacheline_aligned;
18411 #endif
18412 #endif /* CONFIG_X86_NO_TSS */
18413
18414 /*
18415- * Size of io_bitmap.
18416+ * IO-bitmap sizes:
18417 */
18418-#define IO_BITMAP_BITS 65536
18419-#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
18420-#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
18421-#define IO_BITMAP_OFFSET offsetof(struct tss_struct, io_bitmap)
18422-#define INVALID_IO_BITMAP_OFFSET 0x8000
18423-#define INVALID_IO_BITMAP_OFFSET_LAZY 0x9000
18424+#define IO_BITMAP_BITS 65536
18425+#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
18426+#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
18427+#define IO_BITMAP_OFFSET offsetof(struct tss_struct, io_bitmap)
18428+#define INVALID_IO_BITMAP_OFFSET 0x8000
18429+#define INVALID_IO_BITMAP_OFFSET_LAZY 0x9000
18430
18431 #ifndef CONFIG_X86_NO_TSS
18432 struct tss_struct {
18433- struct x86_hw_tss x86_tss;
18434+ /*
18435+ * The hardware state:
18436+ */
18437+ struct x86_hw_tss x86_tss;
18438
18439 /*
18440 * The extra 1 is there because the CPU will access an
18441@@ -224,136 +259,162 @@ struct tss_struct {
18442 * bitmap. The extra byte must be all 1 bits, and must
18443 * be within the limit.
18444 */
18445- unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
18446+ unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
18447 /*
18448 * Cache the current maximum and the last task that used the bitmap:
18449 */
18450- unsigned long io_bitmap_max;
18451- struct thread_struct *io_bitmap_owner;
18452+ unsigned long io_bitmap_max;
18453+ struct thread_struct *io_bitmap_owner;
18454+
18455 /*
18456- * pads the TSS to be cacheline-aligned (size is 0x100)
18457+ * Pad the TSS to be cacheline-aligned (size is 0x100):
18458 */
18459- unsigned long __cacheline_filler[35];
18460+ unsigned long __cacheline_filler[35];
18461 /*
18462- * .. and then another 0x100 bytes for emergency kernel stack
18463+ * .. and then another 0x100 bytes for the emergency kernel stack:
18464 */
18465- unsigned long stack[64];
18466+ unsigned long stack[64];
18467+
18468 } __attribute__((packed));
18469
18470 DECLARE_PER_CPU(struct tss_struct, init_tss);
18471
18472-/* Save the original ist values for checking stack pointers during debugging */
18473+/*
18474+ * Save the original ist values for checking stack pointers during debugging
18475+ */
18476 struct orig_ist {
18477- unsigned long ist[7];
18478+ unsigned long ist[7];
18479 };
18480 #endif /* CONFIG_X86_NO_TSS */
18481
18482 #define MXCSR_DEFAULT 0x1f80
18483
18484 struct i387_fsave_struct {
18485- u32 cwd;
18486- u32 swd;
18487- u32 twd;
18488- u32 fip;
18489- u32 fcs;
18490- u32 foo;
18491- u32 fos;
18492- u32 st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */
18493- u32 status; /* software status information */
18494+ u32 cwd; /* FPU Control Word */
18495+ u32 swd; /* FPU Status Word */
18496+ u32 twd; /* FPU Tag Word */
18497+ u32 fip; /* FPU IP Offset */
18498+ u32 fcs; /* FPU IP Selector */
18499+ u32 foo; /* FPU Operand Pointer Offset */
18500+ u32 fos; /* FPU Operand Pointer Selector */
18501+
18502+ /* 8*10 bytes for each FP-reg = 80 bytes: */
18503+ u32 st_space[20];
18504+
18505+ /* Software status information [not touched by FSAVE ]: */
18506+ u32 status;
18507 };
18508
18509 struct i387_fxsave_struct {
18510- u16 cwd;
18511- u16 swd;
18512- u16 twd;
18513- u16 fop;
18514+ u16 cwd; /* Control Word */
18515+ u16 swd; /* Status Word */
18516+ u16 twd; /* Tag Word */
18517+ u16 fop; /* Last Instruction Opcode */
18518 union {
18519 struct {
18520- u64 rip;
18521- u64 rdp;
18522+ u64 rip; /* Instruction Pointer */
18523+ u64 rdp; /* Data Pointer */
18524 };
18525 struct {
18526- u32 fip;
18527- u32 fcs;
18528- u32 foo;
18529- u32 fos;
18530+ u32 fip; /* FPU IP Offset */
18531+ u32 fcs; /* FPU IP Selector */
18532+ u32 foo; /* FPU Operand Offset */
18533+ u32 fos; /* FPU Operand Selector */
18534 };
18535 };
18536- u32 mxcsr;
18537- u32 mxcsr_mask;
18538- u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
18539- u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */
18540- u32 padding[24];
18541+ u32 mxcsr; /* MXCSR Register State */
18542+ u32 mxcsr_mask; /* MXCSR Mask */
18543+
18544+ /* 8*16 bytes for each FP-reg = 128 bytes: */
18545+ u32 st_space[32];
18546+
18547+ /* 16*16 bytes for each XMM-reg = 256 bytes: */
18548+ u32 xmm_space[64];
18549+
18550+ u32 padding[24];
18551+
18552 } __attribute__((aligned(16)));
18553
18554 struct i387_soft_struct {
18555- u32 cwd;
18556- u32 swd;
18557- u32 twd;
18558- u32 fip;
18559- u32 fcs;
18560- u32 foo;
18561- u32 fos;
18562- u32 st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */
18563- u8 ftop, changed, lookahead, no_update, rm, alimit;
18564- struct info *info;
18565- u32 entry_eip;
18566+ u32 cwd;
18567+ u32 swd;
18568+ u32 twd;
18569+ u32 fip;
18570+ u32 fcs;
18571+ u32 foo;
18572+ u32 fos;
18573+ /* 8*10 bytes for each FP-reg = 80 bytes: */
18574+ u32 st_space[20];
18575+ u8 ftop;
18576+ u8 changed;
18577+ u8 lookahead;
18578+ u8 no_update;
18579+ u8 rm;
18580+ u8 alimit;
18581+ struct info *info;
18582+ u32 entry_eip;
18583 };
18584
18585-union i387_union {
18586+union thread_xstate {
18587 struct i387_fsave_struct fsave;
18588 struct i387_fxsave_struct fxsave;
18589- struct i387_soft_struct soft;
18590+ struct i387_soft_struct soft;
18591 };
18592
18593-#ifdef CONFIG_X86_32
18594-DECLARE_PER_CPU(u8, cpu_llc_id);
18595-#elif !defined(CONFIG_X86_NO_TSS)
18596+#if defined(CONFIG_X86_64) && !defined(CONFIG_X86_NO_TSS)
18597 DECLARE_PER_CPU(struct orig_ist, orig_ist);
18598 #endif
18599
18600 extern void print_cpu_info(struct cpuinfo_x86 *);
18601+extern unsigned int xstate_size;
18602+extern void free_thread_xstate(struct task_struct *);
18603+extern struct kmem_cache *task_xstate_cachep;
18604 extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
18605 extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
18606 extern unsigned short num_cache_leaves;
18607
18608 struct thread_struct {
18609-/* cached TLS descriptors. */
18610- struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
18611- unsigned long sp0;
18612- unsigned long sp;
18613+ /* Cached TLS descriptors: */
18614+ struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
18615+ unsigned long sp0;
18616+ unsigned long sp;
18617 #ifdef CONFIG_X86_32
18618- unsigned long sysenter_cs;
18619+ unsigned long sysenter_cs;
18620 #else
18621- unsigned long usersp; /* Copy from PDA */
18622- unsigned short es, ds, fsindex, gsindex;
18623-#endif
18624- unsigned long ip;
18625- unsigned long fs;
18626- unsigned long gs;
18627-/* Hardware debugging registers */
18628- unsigned long debugreg0;
18629- unsigned long debugreg1;
18630- unsigned long debugreg2;
18631- unsigned long debugreg3;
18632- unsigned long debugreg6;
18633- unsigned long debugreg7;
18634-/* fault info */
18635- unsigned long cr2, trap_no, error_code;
18636-/* floating point info */
18637- union i387_union i387 __attribute__((aligned(16)));;
18638+ unsigned long usersp; /* Copy from PDA */
18639+ unsigned short es;
18640+ unsigned short ds;
18641+ unsigned short fsindex;
18642+ unsigned short gsindex;
18643+#endif
18644+ unsigned long ip;
18645+ unsigned long fs;
18646+ unsigned long gs;
18647+ /* Hardware debugging registers: */
18648+ unsigned long debugreg0;
18649+ unsigned long debugreg1;
18650+ unsigned long debugreg2;
18651+ unsigned long debugreg3;
18652+ unsigned long debugreg6;
18653+ unsigned long debugreg7;
18654+ /* Fault info: */
18655+ unsigned long cr2;
18656+ unsigned long trap_no;
18657+ unsigned long error_code;
18658+ /* floating point and extended processor state */
18659+ union thread_xstate *xstate;
18660 #ifdef CONFIG_X86_32
18661-/* virtual 86 mode info */
18662+ /* Virtual 86 mode info */
18663 struct vm86_struct __user *vm86_info;
18664 unsigned long screen_bitmap;
18665 unsigned long v86flags, v86mask, saved_sp0;
18666 unsigned int saved_fs, saved_gs;
18667 #endif
18668-/* IO permissions */
18669- unsigned long *io_bitmap_ptr;
18670- unsigned long iopl;
18671-/* max allowed port in the bitmap, in bytes: */
18672- unsigned io_bitmap_max;
18673+ /* IO permissions: */
18674+ unsigned long *io_bitmap_ptr;
18675+ unsigned long iopl;
18676+ /* Max allowed port in the bitmap, in bytes: */
18677+ unsigned io_bitmap_max;
18678 /* MSR_IA32_DEBUGCTLMSR value to switch in if TIF_DEBUGCTLMSR is set. */
18679 unsigned long debugctlmsr;
18680 /* Debug Store - if not 0 points to a DS Save Area configuration;
18681@@ -384,12 +445,12 @@ static inline void xen_set_iopl_mask(uns
18682 }
18683
18684 #ifndef CONFIG_X86_NO_TSS
18685-static inline void native_load_sp0(struct tss_struct *tss,
18686- struct thread_struct *thread)
18687+static inline void
18688+native_load_sp0(struct tss_struct *tss, struct thread_struct *thread)
18689 {
18690 tss->x86_tss.sp0 = thread->sp0;
18691 #ifdef CONFIG_X86_32
18692- /* Only happens when SEP is enabled, no need to test "SEP"arately */
18693+ /* Only happens when SEP is enabled, no need to test "SEP"arately: */
18694 if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
18695 tss->x86_tss.ss1 = thread->sysenter_cs;
18696 wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
18697@@ -403,8 +464,8 @@ static inline void native_load_sp0(struc
18698 } while (0)
18699 #endif
18700
18701-#define __cpuid xen_cpuid
18702-#define paravirt_enabled() 0
18703+#define __cpuid xen_cpuid
18704+#define paravirt_enabled() 0
18705
18706 /*
18707 * These special macros can be used to get or set a debugging register
18708@@ -424,11 +485,12 @@ static inline void native_load_sp0(struc
18709 * enable), so that any CPU's that boot up
18710 * after us can get the correct flags.
18711 */
18712-extern unsigned long mmu_cr4_features;
18713+extern unsigned long mmu_cr4_features;
18714
18715 static inline void set_in_cr4(unsigned long mask)
18716 {
18717 unsigned cr4;
18718+
18719 mmu_cr4_features |= mask;
18720 cr4 = read_cr4();
18721 cr4 |= mask;
18722@@ -438,6 +500,7 @@ static inline void set_in_cr4(unsigned l
18723 static inline void clear_in_cr4(unsigned long mask)
18724 {
18725 unsigned cr4;
18726+
18727 mmu_cr4_features &= ~mask;
18728 cr4 = read_cr4();
18729 cr4 &= ~mask;
18730@@ -445,42 +508,42 @@ static inline void clear_in_cr4(unsigned
18731 }
18732
18733 struct microcode_header {
18734- unsigned int hdrver;
18735- unsigned int rev;
18736- unsigned int date;
18737- unsigned int sig;
18738- unsigned int cksum;
18739- unsigned int ldrver;
18740- unsigned int pf;
18741- unsigned int datasize;
18742- unsigned int totalsize;
18743- unsigned int reserved[3];
18744+ unsigned int hdrver;
18745+ unsigned int rev;
18746+ unsigned int date;
18747+ unsigned int sig;
18748+ unsigned int cksum;
18749+ unsigned int ldrver;
18750+ unsigned int pf;
18751+ unsigned int datasize;
18752+ unsigned int totalsize;
18753+ unsigned int reserved[3];
18754 };
18755
18756 struct microcode {
18757- struct microcode_header hdr;
18758- unsigned int bits[0];
18759+ struct microcode_header hdr;
18760+ unsigned int bits[0];
18761 };
18762
18763-typedef struct microcode microcode_t;
18764-typedef struct microcode_header microcode_header_t;
18765+typedef struct microcode microcode_t;
18766+typedef struct microcode_header microcode_header_t;
18767
18768 /* microcode format is extended from prescott processors */
18769 struct extended_signature {
18770- unsigned int sig;
18771- unsigned int pf;
18772- unsigned int cksum;
18773+ unsigned int sig;
18774+ unsigned int pf;
18775+ unsigned int cksum;
18776 };
18777
18778 struct extended_sigtable {
18779- unsigned int count;
18780- unsigned int cksum;
18781- unsigned int reserved[3];
18782+ unsigned int count;
18783+ unsigned int cksum;
18784+ unsigned int reserved[3];
18785 struct extended_signature sigs[0];
18786 };
18787
18788 typedef struct {
18789- unsigned long seg;
18790+ unsigned long seg;
18791 } mm_segment_t;
18792
18793
18794@@ -492,7 +555,7 @@ extern int kernel_thread(int (*fn)(void
18795 /* Free all resources held by a thread. */
18796 extern void release_thread(struct task_struct *);
18797
18798-/* Prepare to copy thread state - unlazy all lazy status */
18799+/* Prepare to copy thread state - unlazy all lazy state */
18800 extern void prepare_to_copy(struct task_struct *tsk);
18801
18802 unsigned long get_wchan(struct task_struct *p);
18803@@ -529,118 +592,138 @@ static inline unsigned int cpuid_eax(uns
18804 unsigned int eax, ebx, ecx, edx;
18805
18806 cpuid(op, &eax, &ebx, &ecx, &edx);
18807+
18808 return eax;
18809 }
18810+
18811 static inline unsigned int cpuid_ebx(unsigned int op)
18812 {
18813 unsigned int eax, ebx, ecx, edx;
18814
18815 cpuid(op, &eax, &ebx, &ecx, &edx);
18816+
18817 return ebx;
18818 }
18819+
18820 static inline unsigned int cpuid_ecx(unsigned int op)
18821 {
18822 unsigned int eax, ebx, ecx, edx;
18823
18824 cpuid(op, &eax, &ebx, &ecx, &edx);
18825+
18826 return ecx;
18827 }
18828+
18829 static inline unsigned int cpuid_edx(unsigned int op)
18830 {
18831 unsigned int eax, ebx, ecx, edx;
18832
18833 cpuid(op, &eax, &ebx, &ecx, &edx);
18834+
18835 return edx;
18836 }
18837
18838 /* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
18839 static inline void rep_nop(void)
18840 {
18841- __asm__ __volatile__("rep;nop": : :"memory");
18842+ asm volatile("rep; nop" ::: "memory");
18843 }
18844
18845-/* Stop speculative execution */
18846+static inline void cpu_relax(void)
18847+{
18848+ rep_nop();
18849+}
18850+
18851+/* Stop speculative execution: */
18852 static inline void sync_core(void)
18853 {
18854 int tmp;
18855+
18856 asm volatile("cpuid" : "=a" (tmp) : "0" (1)
18857- : "ebx", "ecx", "edx", "memory");
18858+ : "ebx", "ecx", "edx", "memory");
18859 }
18860
18861-#define cpu_relax() rep_nop()
18862-
18863 static inline void __monitor(const void *eax, unsigned long ecx,
18864- unsigned long edx)
18865+ unsigned long edx)
18866 {
18867- /* "monitor %eax,%ecx,%edx;" */
18868- asm volatile(
18869- ".byte 0x0f,0x01,0xc8;"
18870- : :"a" (eax), "c" (ecx), "d"(edx));
18871+ /* "monitor %eax, %ecx, %edx;" */
18872+ asm volatile(".byte 0x0f, 0x01, 0xc8;"
18873+ :: "a" (eax), "c" (ecx), "d"(edx));
18874 }
18875
18876 static inline void __mwait(unsigned long eax, unsigned long ecx)
18877 {
18878- /* "mwait %eax,%ecx;" */
18879- asm volatile(
18880- ".byte 0x0f,0x01,0xc9;"
18881- : :"a" (eax), "c" (ecx));
18882+ /* "mwait %eax, %ecx;" */
18883+ asm volatile(".byte 0x0f, 0x01, 0xc9;"
18884+ :: "a" (eax), "c" (ecx));
18885 }
18886
18887 static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
18888 {
18889- /* "mwait %eax,%ecx;" */
18890- asm volatile(
18891- "sti; .byte 0x0f,0x01,0xc9;"
18892- : :"a" (eax), "c" (ecx));
18893+ trace_hardirqs_on();
18894+ /* "mwait %eax, %ecx;" */
18895+ asm volatile("sti; .byte 0x0f, 0x01, 0xc9;"
18896+ :: "a" (eax), "c" (ecx));
18897 }
18898
18899 extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
18900
18901-extern int force_mwait;
18902+extern int force_mwait;
18903
18904 extern void select_idle_routine(const struct cpuinfo_x86 *c);
18905
18906-extern unsigned long boot_option_idle_override;
18907+extern unsigned long boot_option_idle_override;
18908
18909 extern void enable_sep_cpu(void);
18910 extern int sysenter_setup(void);
18911
18912 /* Defined in head.S */
18913-extern struct desc_ptr early_gdt_descr;
18914+extern struct desc_ptr early_gdt_descr;
18915
18916 extern void cpu_set_gdt(int);
18917 extern void switch_to_new_gdt(void);
18918 extern void cpu_init(void);
18919 extern void init_gdt(int cpu);
18920
18921-/* from system description table in BIOS. Mostly for MCA use, but
18922- * others may find it useful. */
18923-extern unsigned int machine_id;
18924-extern unsigned int machine_submodel_id;
18925-extern unsigned int BIOS_revision;
18926+static inline void update_debugctlmsr(unsigned long debugctlmsr)
18927+{
18928+#ifndef CONFIG_X86_DEBUGCTLMSR
18929+ if (boot_cpu_data.x86 < 6)
18930+ return;
18931+#endif
18932+ wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr);
18933+}
18934
18935-/* Boot loader type from the setup header */
18936-extern int bootloader_type;
18937+/*
18938+ * from system description table in BIOS. Mostly for MCA use, but
18939+ * others may find it useful:
18940+ */
18941+extern unsigned int machine_id;
18942+extern unsigned int machine_submodel_id;
18943+extern unsigned int BIOS_revision;
18944+
18945+/* Boot loader type from the setup header: */
18946+extern int bootloader_type;
18947
18948-extern char ignore_fpu_irq;
18949-#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
18950+extern char ignore_fpu_irq;
18951
18952 #define HAVE_ARCH_PICK_MMAP_LAYOUT 1
18953 #define ARCH_HAS_PREFETCHW
18954 #define ARCH_HAS_SPINLOCK_PREFETCH
18955
18956 #ifdef CONFIG_X86_32
18957-#define BASE_PREFETCH ASM_NOP4
18958-#define ARCH_HAS_PREFETCH
18959+# define BASE_PREFETCH ASM_NOP4
18960+# define ARCH_HAS_PREFETCH
18961 #else
18962-#define BASE_PREFETCH "prefetcht0 (%1)"
18963+# define BASE_PREFETCH "prefetcht0 (%1)"
18964 #endif
18965
18966-/* Prefetch instructions for Pentium III and AMD Athlon */
18967-/* It's not worth to care about 3dnow! prefetches for the K6
18968- because they are microcoded there and very slow.
18969- However we don't do prefetches for pre XP Athlons currently
18970- That should be fixed. */
18971+/*
18972+ * Prefetch instructions for Pentium III (+) and AMD Athlon (+)
18973+ *
18974+ * It's not worth to care about 3dnow prefetches for the K6
18975+ * because they are microcoded there and very slow.
18976+ */
18977 static inline void prefetch(const void *x)
18978 {
18979 alternative_input(BASE_PREFETCH,
18980@@ -649,8 +732,11 @@ static inline void prefetch(const void *
18981 "r" (x));
18982 }
18983
18984-/* 3dnow! prefetch to get an exclusive cache line. Useful for
18985- spinlocks to avoid one state transition in the cache coherency protocol. */
18986+/*
18987+ * 3dnow prefetch to get an exclusive cache line.
18988+ * Useful for spinlocks to avoid one state transition in the
18989+ * cache coherency protocol:
18990+ */
18991 static inline void prefetchw(const void *x)
18992 {
18993 alternative_input(BASE_PREFETCH,
18994@@ -659,21 +745,25 @@ static inline void prefetchw(const void
18995 "r" (x));
18996 }
18997
18998-#define spin_lock_prefetch(x) prefetchw(x)
18999+static inline void spin_lock_prefetch(const void *x)
19000+{
19001+ prefetchw(x);
19002+}
19003+
19004 #ifdef CONFIG_X86_32
19005 /*
19006 * User space process size: 3GB (default).
19007 */
19008-#define TASK_SIZE (PAGE_OFFSET)
19009-#define STACK_TOP TASK_SIZE
19010-#define STACK_TOP_MAX STACK_TOP
19011-
19012-#define INIT_THREAD { \
19013- .sp0 = sizeof(init_stack) + (long)&init_stack, \
19014- .vm86_info = NULL, \
19015- .sysenter_cs = __KERNEL_CS, \
19016- .io_bitmap_ptr = NULL, \
19017- .fs = __KERNEL_PERCPU, \
19018+#define TASK_SIZE PAGE_OFFSET
19019+#define STACK_TOP TASK_SIZE
19020+#define STACK_TOP_MAX STACK_TOP
19021+
19022+#define INIT_THREAD { \
19023+ .sp0 = sizeof(init_stack) + (long)&init_stack, \
19024+ .vm86_info = NULL, \
19025+ .sysenter_cs = __KERNEL_CS, \
19026+ .io_bitmap_ptr = NULL, \
19027+ .fs = __KERNEL_PERCPU, \
19028 }
19029
19030 /*
19031@@ -682,28 +772,15 @@ static inline void prefetchw(const void
19032 * permission bitmap. The extra byte must be all 1 bits, and must
19033 * be within the limit.
19034 */
19035-#define INIT_TSS { \
19036- .x86_tss = { \
19037+#define INIT_TSS { \
19038+ .x86_tss = { \
19039 .sp0 = sizeof(init_stack) + (long)&init_stack, \
19040- .ss0 = __KERNEL_DS, \
19041- .ss1 = __KERNEL_CS, \
19042- .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \
19043- }, \
19044- .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, \
19045-}
19046-
19047-#define start_thread(regs, new_eip, new_esp) do { \
19048- __asm__("movl %0,%%gs": :"r" (0)); \
19049- regs->fs = 0; \
19050- set_fs(USER_DS); \
19051- regs->ds = __USER_DS; \
19052- regs->es = __USER_DS; \
19053- regs->ss = __USER_DS; \
19054- regs->cs = __USER_CS; \
19055- regs->ip = new_eip; \
19056- regs->sp = new_esp; \
19057-} while (0)
19058-
19059+ .ss0 = __KERNEL_DS, \
19060+ .ss1 = __KERNEL_CS, \
19061+ .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \
19062+ }, \
19063+ .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, \
19064+}
19065
19066 extern unsigned long thread_saved_pc(struct task_struct *tsk);
19067
19068@@ -731,24 +808,24 @@ extern unsigned long thread_saved_pc(str
19069 __regs__ - 1; \
19070 })
19071
19072-#define KSTK_ESP(task) (task_pt_regs(task)->sp)
19073+#define KSTK_ESP(task) (task_pt_regs(task)->sp)
19074
19075 #else
19076 /*
19077 * User space process size. 47bits minus one guard page.
19078 */
19079-#define TASK_SIZE64 (0x800000000000UL - 4096)
19080+#define TASK_SIZE64 ((1UL << 47) - PAGE_SIZE)
19081
19082 /* This decides where the kernel will search for a free chunk of vm
19083 * space during mmap's.
19084 */
19085-#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? \
19086- 0xc0000000 : 0xFFFFe000)
19087+#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? \
19088+ 0xc0000000 : 0xFFFFe000)
19089
19090-#define TASK_SIZE (test_thread_flag(TIF_IA32) ? \
19091- IA32_PAGE_OFFSET : TASK_SIZE64)
19092-#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? \
19093- IA32_PAGE_OFFSET : TASK_SIZE64)
19094+#define TASK_SIZE (test_thread_flag(TIF_IA32) ? \
19095+ IA32_PAGE_OFFSET : TASK_SIZE64)
19096+#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? \
19097+ IA32_PAGE_OFFSET : TASK_SIZE64)
19098
19099 #define STACK_TOP TASK_SIZE
19100 #define STACK_TOP_MAX TASK_SIZE64
19101@@ -761,33 +838,32 @@ extern unsigned long thread_saved_pc(str
19102 .x86_tss.sp0 = (unsigned long)&init_stack + sizeof(init_stack) \
19103 }
19104
19105-#define start_thread(regs, new_rip, new_rsp) do { \
19106- asm volatile("movl %0,%%fs; movl %0,%%es; movl %0,%%ds": :"r" (0)); \
19107- load_gs_index(0); \
19108- (regs)->ip = (new_rip); \
19109- (regs)->sp = (new_rsp); \
19110- write_pda(oldrsp, (new_rsp)); \
19111- (regs)->cs = __USER_CS; \
19112- (regs)->ss = __USER_DS; \
19113- (regs)->flags = 0x200; \
19114- set_fs(USER_DS); \
19115-} while (0)
19116-
19117 /*
19118 * Return saved PC of a blocked thread.
19119 * What is this good for? it will be always the scheduler or ret_from_fork.
19120 */
19121-#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.sp - 8))
19122+#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.sp - 8))
19123
19124-#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1)
19125-#define KSTK_ESP(tsk) -1 /* sorry. doesn't work for syscall. */
19126+#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1)
19127+#define KSTK_ESP(tsk) -1 /* sorry. doesn't work for syscall. */
19128 #endif /* CONFIG_X86_64 */
19129
19130-/* This decides where the kernel will search for a free chunk of vm
19131+extern void start_thread(struct pt_regs *regs, unsigned long new_ip,
19132+ unsigned long new_sp);
19133+
19134+/*
19135+ * This decides where the kernel will search for a free chunk of vm
19136 * space during mmap's.
19137 */
19138 #define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3))
19139
19140-#define KSTK_EIP(task) (task_pt_regs(task)->ip)
19141+#define KSTK_EIP(task) (task_pt_regs(task)->ip)
19142+
19143+/* Get/set a process' ability to use the timestamp counter instruction */
19144+#define GET_TSC_CTL(adr) get_tsc_mode((adr))
19145+#define SET_TSC_CTL(val) set_tsc_mode((val))
19146+
19147+extern int get_tsc_mode(unsigned long adr);
19148+extern int set_tsc_mode(unsigned int val);
19149
19150 #endif
19151--- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/segment.h 2009-03-16 16:33:40.000000000 +0100
19152+++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/segment.h 2009-03-16 16:38:05.000000000 +0100
19153@@ -191,13 +191,14 @@
19154 #define SEGMENT_TI_MASK 0x4
19155
19156 #define IDT_ENTRIES 256
19157+#define NUM_EXCEPTION_VECTORS 32
19158 #define GDT_SIZE (GDT_ENTRIES * 8)
19159 #define GDT_ENTRY_TLS_ENTRIES 3
19160 #define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8)
19161
19162 #ifdef __KERNEL__
19163 #ifndef __ASSEMBLY__
19164-extern const char early_idt_handlers[IDT_ENTRIES][10];
19165+extern const char early_idt_handlers[NUM_EXCEPTION_VECTORS][10];
19166 #endif
19167 #endif
19168
19169--- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/smp.h 2009-02-16 16:18:36.000000000 +0100
19170+++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/smp.h 2009-03-16 16:38:05.000000000 +0100
19171@@ -1,5 +1,227 @@
19172-#ifdef CONFIG_X86_32
19173-# include "smp_32.h"
19174+#ifndef _ASM_X86_SMP_H_
19175+#define _ASM_X86_SMP_H_
19176+#ifndef __ASSEMBLY__
19177+#include <linux/cpumask.h>
19178+#include <linux/init.h>
19179+#include <asm/percpu.h>
19180+
19181+/*
19182+ * We need the APIC definitions automatically as part of 'smp.h'
19183+ */
19184+#ifdef CONFIG_X86_LOCAL_APIC
19185+# include <asm/mpspec.h>
19186+# include <asm/apic.h>
19187+# ifdef CONFIG_X86_IO_APIC
19188+# include <asm/io_apic.h>
19189+# endif
19190+#endif
19191+#include <asm/pda.h>
19192+#include <asm/thread_info.h>
19193+
19194+#define cpu_callout_map cpu_possible_map
19195+extern cpumask_t cpu_initialized;
19196+#define cpu_callin_map cpu_possible_map
19197+
19198+extern void (*mtrr_hook)(void);
19199+extern void zap_low_mappings(void);
19200+
19201+extern int smp_num_siblings;
19202+extern unsigned int num_processors;
19203+extern cpumask_t cpu_initialized;
19204+
19205+#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
19206+extern u16 x86_cpu_to_apicid_init[];
19207+extern u16 x86_bios_cpu_apicid_init[];
19208+extern void *x86_cpu_to_apicid_early_ptr;
19209+extern void *x86_bios_cpu_apicid_early_ptr;
19210 #else
19211-# include "smp_64.h"
19212+#define x86_cpu_to_apicid_early_ptr NULL
19213+#define x86_bios_cpu_apicid_early_ptr NULL
19214+#endif
19215+
19216+DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
19217+DECLARE_PER_CPU(cpumask_t, cpu_core_map);
19218+DECLARE_PER_CPU(u16, cpu_llc_id);
19219+DECLARE_PER_CPU(u16, x86_cpu_to_apicid);
19220+DECLARE_PER_CPU(u16, x86_bios_cpu_apicid);
19221+
19222+#ifdef CONFIG_SMP
19223+
19224+#ifndef CONFIG_XEN
19225+
19226+/* Static state in head.S used to set up a CPU */
19227+extern struct {
19228+ void *sp;
19229+ unsigned short ss;
19230+} stack_start;
19231+
19232+struct smp_ops {
19233+ void (*smp_prepare_boot_cpu)(void);
19234+ void (*smp_prepare_cpus)(unsigned max_cpus);
19235+ int (*cpu_up)(unsigned cpu);
19236+ void (*smp_cpus_done)(unsigned max_cpus);
19237+
19238+ void (*smp_send_stop)(void);
19239+ void (*smp_send_reschedule)(int cpu);
19240+ int (*smp_call_function_mask)(cpumask_t mask,
19241+ void (*func)(void *info), void *info,
19242+ int wait);
19243+};
19244+
19245+/* Globals due to paravirt */
19246+extern void set_cpu_sibling_map(int cpu);
19247+
19248+#ifndef CONFIG_PARAVIRT
19249+#define startup_ipi_hook(phys_apicid, start_eip, start_esp) do { } while (0)
19250+#endif
19251+extern struct smp_ops smp_ops;
19252+
19253+static inline void smp_send_stop(void)
19254+{
19255+ smp_ops.smp_send_stop();
19256+}
19257+
19258+static inline void smp_prepare_boot_cpu(void)
19259+{
19260+ smp_ops.smp_prepare_boot_cpu();
19261+}
19262+
19263+static inline void smp_prepare_cpus(unsigned int max_cpus)
19264+{
19265+ smp_ops.smp_prepare_cpus(max_cpus);
19266+}
19267+
19268+static inline void smp_cpus_done(unsigned int max_cpus)
19269+{
19270+ smp_ops.smp_cpus_done(max_cpus);
19271+}
19272+
19273+static inline int __cpu_up(unsigned int cpu)
19274+{
19275+ return smp_ops.cpu_up(cpu);
19276+}
19277+
19278+static inline void smp_send_reschedule(int cpu)
19279+{
19280+ smp_ops.smp_send_reschedule(cpu);
19281+}
19282+
19283+static inline int smp_call_function_mask(cpumask_t mask,
19284+ void (*func) (void *info), void *info,
19285+ int wait)
19286+{
19287+ return smp_ops.smp_call_function_mask(mask, func, info, wait);
19288+}
19289+
19290+void native_smp_prepare_boot_cpu(void);
19291+void native_smp_prepare_cpus(unsigned int max_cpus);
19292+void native_smp_cpus_done(unsigned int max_cpus);
19293+int native_cpu_up(unsigned int cpunum);
19294+
19295+#else /* CONFIG_XEN */
19296+
19297+void xen_smp_send_stop(void);
19298+void xen_smp_send_reschedule(int cpu);
19299+int xen_smp_call_function_mask(cpumask_t mask,
19300+ void (*func) (void *info), void *info,
19301+ int wait);
19302+
19303+#define smp_send_stop xen_smp_send_stop
19304+#define smp_send_reschedule xen_smp_send_reschedule
19305+#define smp_call_function_mask xen_smp_call_function_mask
19306+
19307+extern void prefill_possible_map(void);
19308+
19309+#endif /* CONFIG_XEN */
19310+
19311+extern int __cpu_disable(void);
19312+extern void __cpu_die(unsigned int cpu);
19313+
19314+extern void prefill_possible_map(void);
19315+
19316+void smp_store_cpu_info(int id);
19317+#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
19318+
19319+/* We don't mark CPUs online until __cpu_up(), so we need another measure */
19320+static inline int num_booting_cpus(void)
19321+{
19322+ return cpus_weight(cpu_callout_map);
19323+}
19324+#endif /* CONFIG_SMP */
19325+
19326+extern unsigned disabled_cpus __cpuinitdata;
19327+
19328+#ifdef CONFIG_X86_32_SMP
19329+/*
19330+ * This function is needed by all SMP systems. It must _always_ be valid
19331+ * from the initial startup. We map APIC_BASE very early in page_setup(),
19332+ * so this is correct in the x86 case.
19333+ */
19334+DECLARE_PER_CPU(int, cpu_number);
19335+#define raw_smp_processor_id() (x86_read_percpu(cpu_number))
19336+#define safe_smp_processor_id() smp_processor_id()
19337+
19338+#elif defined(CONFIG_X86_64_SMP)
19339+#define raw_smp_processor_id() read_pda(cpunumber)
19340+
19341+#define stack_smp_processor_id() \
19342+({ \
19343+ struct thread_info *ti; \
19344+ __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \
19345+ ti->cpu; \
19346+})
19347+#define safe_smp_processor_id() smp_processor_id()
19348+
19349+#else /* !CONFIG_X86_32_SMP && !CONFIG_X86_64_SMP */
19350+#define cpu_physical_id(cpu) boot_cpu_physical_apicid
19351+#define safe_smp_processor_id() 0
19352+#define stack_smp_processor_id() 0
19353+#endif
19354+
19355+#ifdef CONFIG_X86_LOCAL_APIC
19356+
19357+static inline int logical_smp_processor_id(void)
19358+{
19359+ /* we don't want to mark this access volatile - bad code generation */
19360+ return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR));
19361+}
19362+
19363+#ifndef CONFIG_X86_64
19364+static inline unsigned int read_apic_id(void)
19365+{
19366+ return *(u32 *)(APIC_BASE + APIC_ID);
19367+}
19368+#else
19369+extern unsigned int read_apic_id(void);
19370+#endif
19371+
19372+
19373+# ifdef APIC_DEFINITION
19374+extern int hard_smp_processor_id(void);
19375+# else
19376+# include <mach_apicdef.h>
19377+static inline int hard_smp_processor_id(void)
19378+{
19379+ /* we don't want to mark this access volatile - bad code generation */
19380+ return GET_APIC_ID(read_apic_id());
19381+}
19382+# endif /* APIC_DEFINITION */
19383+
19384+#else /* CONFIG_X86_LOCAL_APIC */
19385+
19386+# ifndef CONFIG_SMP
19387+# define hard_smp_processor_id() 0
19388+# endif
19389+
19390+#endif /* CONFIG_X86_LOCAL_APIC */
19391+
19392+#ifdef CONFIG_HOTPLUG_CPU
19393+extern void cpu_exit_clear(void);
19394+extern void cpu_uninit(void);
19395+#endif
19396+
19397+extern void smp_alloc_memory(void);
19398+extern void lock_ipi_call_lock(void);
19399+extern void unlock_ipi_call_lock(void);
19400+#endif /* __ASSEMBLY__ */
19401 #endif
19402--- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/smp_32.h 2009-03-16 16:33:40.000000000 +0100
19403+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
19404@@ -1,178 +0,0 @@
19405-#ifndef __ASM_SMP_H
19406-#define __ASM_SMP_H
19407-
19408-#ifndef __ASSEMBLY__
19409-#include <linux/cpumask.h>
19410-#include <linux/init.h>
19411-
19412-/*
19413- * We need the APIC definitions automatically as part of 'smp.h'
19414- */
19415-#ifdef CONFIG_X86_LOCAL_APIC
19416-# include <asm/mpspec.h>
19417-# include <asm/apic.h>
19418-# ifdef CONFIG_X86_IO_APIC
19419-# include <asm/io_apic.h>
19420-# endif
19421-#endif
19422-
19423-#define cpu_callout_map cpu_possible_map
19424-#define cpu_callin_map cpu_possible_map
19425-
19426-extern int smp_num_siblings;
19427-extern unsigned int num_processors;
19428-
19429-extern void smp_alloc_memory(void);
19430-extern void lock_ipi_call_lock(void);
19431-extern void unlock_ipi_call_lock(void);
19432-
19433-extern void (*mtrr_hook) (void);
19434-extern void zap_low_mappings (void);
19435-
19436-DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
19437-DECLARE_PER_CPU(cpumask_t, cpu_core_map);
19438-DECLARE_PER_CPU(u8, cpu_llc_id);
19439-DECLARE_PER_CPU(u8, x86_cpu_to_apicid);
19440-
19441-#ifdef CONFIG_HOTPLUG_CPU
19442-extern void cpu_exit_clear(void);
19443-extern void cpu_uninit(void);
19444-#endif
19445-
19446-#ifdef CONFIG_SMP
19447-
19448-#ifndef CONFIG_XEN
19449-
19450-/* Globals due to paravirt */
19451-extern void set_cpu_sibling_map(int cpu);
19452-
19453-struct smp_ops
19454-{
19455- void (*smp_prepare_boot_cpu)(void);
19456- void (*smp_prepare_cpus)(unsigned max_cpus);
19457- int (*cpu_up)(unsigned cpu);
19458- void (*smp_cpus_done)(unsigned max_cpus);
19459-
19460- void (*smp_send_stop)(void);
19461- void (*smp_send_reschedule)(int cpu);
19462- int (*smp_call_function_mask)(cpumask_t mask,
19463- void (*func)(void *info), void *info,
19464- int wait);
19465-};
19466-
19467-extern struct smp_ops smp_ops;
19468-
19469-static inline void smp_prepare_boot_cpu(void)
19470-{
19471- smp_ops.smp_prepare_boot_cpu();
19472-}
19473-static inline void smp_prepare_cpus(unsigned int max_cpus)
19474-{
19475- smp_ops.smp_prepare_cpus(max_cpus);
19476-}
19477-static inline int __cpu_up(unsigned int cpu)
19478-{
19479- return smp_ops.cpu_up(cpu);
19480-}
19481-static inline void smp_cpus_done(unsigned int max_cpus)
19482-{
19483- smp_ops.smp_cpus_done(max_cpus);
19484-}
19485-
19486-static inline void smp_send_stop(void)
19487-{
19488- smp_ops.smp_send_stop();
19489-}
19490-static inline void smp_send_reschedule(int cpu)
19491-{
19492- smp_ops.smp_send_reschedule(cpu);
19493-}
19494-static inline int smp_call_function_mask(cpumask_t mask,
19495- void (*func) (void *info), void *info,
19496- int wait)
19497-{
19498- return smp_ops.smp_call_function_mask(mask, func, info, wait);
19499-}
19500-
19501-void native_smp_prepare_boot_cpu(void);
19502-void native_smp_prepare_cpus(unsigned int max_cpus);
19503-int native_cpu_up(unsigned int cpunum);
19504-void native_smp_cpus_done(unsigned int max_cpus);
19505-
19506-#ifndef CONFIG_PARAVIRT
19507-#define startup_ipi_hook(phys_apicid, start_eip, start_esp) do { } while (0)
19508-#endif
19509-
19510-#else /* CONFIG_XEN */
19511-
19512-void xen_smp_send_stop(void);
19513-void xen_smp_send_reschedule(int cpu);
19514-int xen_smp_call_function_mask(cpumask_t mask,
19515- void (*func) (void *info), void *info,
19516- int wait);
19517-
19518-#define smp_send_stop xen_smp_send_stop
19519-#define smp_send_reschedule xen_smp_send_reschedule
19520-#define smp_call_function_mask xen_smp_call_function_mask
19521-
19522-extern void prefill_possible_map(void);
19523-
19524-#endif /* CONFIG_XEN */
19525-
19526-extern int __cpu_disable(void);
19527-extern void __cpu_die(unsigned int cpu);
19528-
19529-/*
19530- * This function is needed by all SMP systems. It must _always_ be valid
19531- * from the initial startup. We map APIC_BASE very early in page_setup(),
19532- * so this is correct in the x86 case.
19533- */
19534-DECLARE_PER_CPU(int, cpu_number);
19535-#define raw_smp_processor_id() (x86_read_percpu(cpu_number))
19536-
19537-#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
19538-
19539-#define safe_smp_processor_id() smp_processor_id()
19540-
19541-/* We don't mark CPUs online until __cpu_up(), so we need another measure */
19542-static inline int num_booting_cpus(void)
19543-{
19544- return cpus_weight(cpu_callout_map);
19545-}
19546-
19547-#else /* CONFIG_SMP */
19548-
19549-#define safe_smp_processor_id() 0
19550-#define cpu_physical_id(cpu) boot_cpu_physical_apicid
19551-
19552-#endif /* !CONFIG_SMP */
19553-
19554-#ifdef CONFIG_X86_LOCAL_APIC
19555-
19556-static __inline int logical_smp_processor_id(void)
19557-{
19558- /* we don't want to mark this access volatile - bad code generation */
19559- return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR));
19560-}
19561-
19562-# ifdef APIC_DEFINITION
19563-extern int hard_smp_processor_id(void);
19564-# else
19565-# include <mach_apicdef.h>
19566-static inline int hard_smp_processor_id(void)
19567-{
19568- /* we don't want to mark this access volatile - bad code generation */
19569- return GET_APIC_ID(*(u32 *)(APIC_BASE + APIC_ID));
19570-}
19571-# endif /* APIC_DEFINITION */
19572-
19573-#else /* CONFIG_X86_LOCAL_APIC */
19574-
19575-# ifndef CONFIG_SMP
19576-# define hard_smp_processor_id() 0
19577-# endif
19578-
19579-#endif /* CONFIG_X86_LOCAL_APIC */
19580-
19581-#endif /* !ASSEMBLY */
19582-#endif
19583--- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/smp_64.h 2009-03-16 16:33:40.000000000 +0100
19584+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
19585@@ -1,103 +0,0 @@
19586-#ifndef __ASM_SMP_H
19587-#define __ASM_SMP_H
19588-
19589-#include <linux/cpumask.h>
19590-#include <linux/init.h>
19591-
19592-#ifdef CONFIG_X86_LOCAL_APIC
19593-/*
19594- * We need the APIC definitions automatically as part of 'smp.h'
19595- */
19596-#include <asm/apic.h>
19597-#ifdef CONFIG_X86_IO_APIC
19598-#include <asm/io_apic.h>
19599-#endif
19600-#include <asm/mpspec.h>
19601-#endif
19602-#include <asm/pda.h>
19603-#include <asm/thread_info.h>
19604-
19605-extern cpumask_t cpu_initialized;
19606-
19607-extern int smp_num_siblings;
19608-extern unsigned int num_processors;
19609-
19610-extern void smp_alloc_memory(void);
19611-extern void lock_ipi_call_lock(void);
19612-extern void unlock_ipi_call_lock(void);
19613-
19614-extern int smp_call_function_mask(cpumask_t mask, void (*func)(void *),
19615- void *info, int wait);
19616-
19617-DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
19618-DECLARE_PER_CPU(cpumask_t, cpu_core_map);
19619-DECLARE_PER_CPU(u16, cpu_llc_id);
19620-DECLARE_PER_CPU(u16, x86_cpu_to_apicid);
19621-DECLARE_PER_CPU(u16, x86_bios_cpu_apicid);
19622-
19623-#ifdef CONFIG_X86_LOCAL_APIC
19624-static inline int cpu_present_to_apicid(int mps_cpu)
19625-{
19626- if (cpu_present(mps_cpu))
19627- return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu);
19628- else
19629- return BAD_APICID;
19630-}
19631-#endif
19632-
19633-#ifdef CONFIG_SMP
19634-
19635-#define SMP_TRAMPOLINE_BASE 0x6000
19636-
19637-extern int __cpu_disable(void);
19638-extern void __cpu_die(unsigned int cpu);
19639-extern void prefill_possible_map(void);
19640-extern unsigned __cpuinitdata disabled_cpus;
19641-
19642-#define raw_smp_processor_id() read_pda(cpunumber)
19643-#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
19644-
19645-#define stack_smp_processor_id() \
19646- ({ \
19647- struct thread_info *ti; \
19648- __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \
19649- ti->cpu; \
19650-})
19651-
19652-/*
19653- * On x86 all CPUs are mapped 1:1 to the APIC space. This simplifies
19654- * scheduling and IPI sending and compresses data structures.
19655- */
19656-static inline int num_booting_cpus(void)
19657-{
19658- return cpus_weight(cpu_possible_map);
19659-}
19660-
19661-extern void smp_send_reschedule(int cpu);
19662-
19663-#else /* CONFIG_SMP */
19664-
19665-extern unsigned int boot_cpu_id;
19666-#define cpu_physical_id(cpu) boot_cpu_id
19667-#define stack_smp_processor_id() 0
19668-
19669-#endif /* !CONFIG_SMP */
19670-
19671-#define safe_smp_processor_id() smp_processor_id()
19672-
19673-#ifdef CONFIG_X86_LOCAL_APIC
19674-static __inline int logical_smp_processor_id(void)
19675-{
19676- /* we don't want to mark this access volatile - bad code generation */
19677- return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR));
19678-}
19679-
19680-static inline int hard_smp_processor_id(void)
19681-{
19682- /* we don't want to mark this access volatile - bad code generation */
19683- return GET_APIC_ID(*(u32 *)(APIC_BASE + APIC_ID));
19684-}
19685-#endif
19686-
19687-#endif
19688-
19689--- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/spinlock.h 2009-03-16 16:33:40.000000000 +0100
19690+++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/spinlock.h 2009-03-16 16:38:05.000000000 +0100
19691@@ -88,7 +88,7 @@ extern void xen_spin_kick(raw_spinlock_t
19692 : "memory", "cc")
19693
19694
19695-static inline int __raw_spin_trylock(raw_spinlock_t *lock)
19696+static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock)
19697 {
19698 int tmp, new;
19699
19700@@ -107,7 +107,7 @@ static inline int __raw_spin_trylock(raw
19701 return tmp;
19702 }
19703
19704-static inline void __raw_spin_unlock(raw_spinlock_t *lock)
19705+static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock)
19706 {
19707 unsigned int token;
19708 unsigned char kick;
19709@@ -155,7 +155,7 @@ static inline void __raw_spin_unlock(raw
19710 : "memory", "cc"); \
19711 } while (0)
19712
19713-static inline int __raw_spin_trylock(raw_spinlock_t *lock)
19714+static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock)
19715 {
19716 int tmp;
19717 int new;
19718@@ -177,7 +177,7 @@ static inline int __raw_spin_trylock(raw
19719 return tmp;
19720 }
19721
19722-static inline void __raw_spin_unlock(raw_spinlock_t *lock)
19723+static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock)
19724 {
19725 unsigned int token, tmp;
19726 bool kick;
19727@@ -197,19 +197,19 @@ static inline void __raw_spin_unlock(raw
19728
19729 static inline int __raw_spin_is_locked(raw_spinlock_t *lock)
19730 {
19731- int tmp = *(volatile signed int *)(&(lock)->slock);
19732+ int tmp = ACCESS_ONCE(lock->slock);
19733
19734 return !!(((tmp >> TICKET_SHIFT) ^ tmp) & ((1 << TICKET_SHIFT) - 1));
19735 }
19736
19737 static inline int __raw_spin_is_contended(raw_spinlock_t *lock)
19738 {
19739- int tmp = *(volatile signed int *)(&(lock)->slock);
19740+ int tmp = ACCESS_ONCE(lock->slock);
19741
19742 return (((tmp >> TICKET_SHIFT) - tmp) & ((1 << TICKET_SHIFT) - 1)) > 1;
19743 }
19744
19745-static inline void __raw_spin_lock(raw_spinlock_t *lock)
19746+static __always_inline void __raw_spin_lock(raw_spinlock_t *lock)
19747 {
19748 unsigned int token, count;
19749 bool free;
19750@@ -223,8 +223,8 @@ static inline void __raw_spin_lock(raw_s
19751 } while (unlikely(!count) && !xen_spin_wait(lock, token));
19752 }
19753
19754-static inline void __raw_spin_lock_flags(raw_spinlock_t *lock,
19755- unsigned long flags)
19756+static __always_inline void __raw_spin_lock_flags(raw_spinlock_t *lock,
19757+ unsigned long flags)
19758 {
19759 unsigned int token, count;
19760 bool free;
19761--- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/swiotlb.h 2009-02-16 16:18:36.000000000 +0100
19762+++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/swiotlb.h 2009-03-16 16:38:05.000000000 +0100
19763@@ -1,5 +1,8 @@
19764-#ifdef CONFIG_X86_32
19765-# include "swiotlb_32.h"
19766-#else
19767-# include "../../swiotlb.h"
19768-#endif
19769+#ifndef _ASM_SWIOTLB_H
19770+
19771+#include "../../swiotlb.h"
19772+
19773+dma_addr_t swiotlb_map_single_phys(struct device *, phys_addr_t, size_t size,
19774+ int dir);
19775+
19776+#endif /* _ASM_SWIOTLB_H */
19777--- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/swiotlb_32.h 2009-05-14 10:56:29.000000000 +0200
19778+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
19779@@ -1,43 +0,0 @@
19780-#ifndef _ASM_SWIOTLB_H
19781-#define _ASM_SWIOTLB_H 1
19782-
19783-/* SWIOTLB interface */
19784-
19785-extern dma_addr_t swiotlb_map_single(struct device *hwdev, void *ptr, size_t size,
19786- int dir);
19787-extern void swiotlb_unmap_single(struct device *hwdev, dma_addr_t dev_addr,
19788- size_t size, int dir);
19789-extern void swiotlb_sync_single_for_cpu(struct device *hwdev,
19790- dma_addr_t dev_addr,
19791- size_t size, int dir);
19792-extern void swiotlb_sync_single_for_device(struct device *hwdev,
19793- dma_addr_t dev_addr,
19794- size_t size, int dir);
19795-extern void swiotlb_sync_sg_for_cpu(struct device *hwdev,
19796- struct scatterlist *sg, int nelems,
19797- int dir);
19798-extern void swiotlb_sync_sg_for_device(struct device *hwdev,
19799- struct scatterlist *sg, int nelems,
19800- int dir);
19801-extern int swiotlb_map_sg(struct device *hwdev, struct scatterlist *sg,
19802- int nents, int direction);
19803-extern void swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sg,
19804- int nents, int direction);
19805-extern int swiotlb_dma_mapping_error(dma_addr_t dma_addr);
19806-#ifdef CONFIG_HIGHMEM
19807-extern dma_addr_t swiotlb_map_page(struct device *hwdev, struct page *page,
19808- unsigned long offset, size_t size,
19809- enum dma_data_direction direction);
19810-extern void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dma_address,
19811- size_t size, enum dma_data_direction direction);
19812-#endif
19813-extern int swiotlb_dma_supported(struct device *hwdev, u64 mask);
19814-extern void swiotlb_init(void);
19815-
19816-#ifdef CONFIG_SWIOTLB
19817-extern int swiotlb;
19818-#else
19819-#define swiotlb 0
19820-#endif
19821-
19822-#endif
19823--- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/system.h 2009-03-16 16:33:40.000000000 +0100
19824+++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/system.h 2009-03-16 16:38:05.000000000 +0100
19825@@ -28,22 +28,44 @@ struct task_struct *__switch_to(struct t
19826 * Saving eflags is important. It switches not only IOPL between tasks,
19827 * it also protects other tasks from NT leaking through sysenter etc.
19828 */
19829-#define switch_to(prev, next, last) do { \
19830- unsigned long esi, edi; \
19831- asm volatile("pushfl\n\t" /* Save flags */ \
19832- "pushl %%ebp\n\t" \
19833- "movl %%esp,%0\n\t" /* save ESP */ \
19834- "movl %5,%%esp\n\t" /* restore ESP */ \
19835- "movl $1f,%1\n\t" /* save EIP */ \
19836- "pushl %6\n\t" /* restore EIP */ \
19837- "jmp __switch_to\n" \
19838+#define switch_to(prev, next, last) \
19839+do { \
19840+ /* \
19841+ * Context-switching clobbers all registers, so we clobber \
19842+ * them explicitly, via unused output variables. \
19843+ * (EAX and EBP is not listed because EBP is saved/restored \
19844+ * explicitly for wchan access and EAX is the return value of \
19845+ * __switch_to()) \
19846+ */ \
19847+ unsigned long ebx, ecx, edx, esi, edi; \
19848+ \
19849+ asm volatile("pushfl\n\t" /* save flags */ \
19850+ "pushl %%ebp\n\t" /* save EBP */ \
19851+ "movl %%esp,%[prev_sp]\n\t" /* save ESP */ \
19852+ "movl %[next_sp],%%esp\n\t" /* restore ESP */ \
19853+ "movl $1f,%[prev_ip]\n\t" /* save EIP */ \
19854+ "pushl %[next_ip]\n\t" /* restore EIP */ \
19855+ "jmp __switch_to\n" /* regparm call */ \
19856 "1:\t" \
19857- "popl %%ebp\n\t" \
19858- "popfl" \
19859- :"=m" (prev->thread.sp), "=m" (prev->thread.ip), \
19860- "=a" (last), "=S" (esi), "=D" (edi) \
19861- :"m" (next->thread.sp), "m" (next->thread.ip), \
19862- "2" (prev), "d" (next)); \
19863+ "popl %%ebp\n\t" /* restore EBP */ \
19864+ "popfl\n" /* restore flags */ \
19865+ \
19866+ /* output parameters */ \
19867+ : [prev_sp] "=m" (prev->thread.sp), \
19868+ [prev_ip] "=m" (prev->thread.ip), \
19869+ "=a" (last), \
19870+ \
19871+ /* clobbered output registers: */ \
19872+ "=b" (ebx), "=c" (ecx), "=d" (edx), \
19873+ "=S" (esi), "=D" (edi) \
19874+ \
19875+ /* input parameters: */ \
19876+ : [next_sp] "m" (next->thread.sp), \
19877+ [next_ip] "m" (next->thread.ip), \
19878+ \
19879+ /* regparm parameters for __switch_to(): */ \
19880+ [prev] "a" (prev), \
19881+ [next] "d" (next)); \
19882 } while (0)
19883
19884 /*
19885@@ -123,30 +145,29 @@ extern void load_gs_index(unsigned);
19886 */
19887 #define loadsegment(seg, value) \
19888 asm volatile("\n" \
19889- "1:\t" \
19890- "movl %k0,%%" #seg "\n" \
19891- "2:\n" \
19892- ".section .fixup,\"ax\"\n" \
19893- "3:\t" \
19894- "movl %k1, %%" #seg "\n\t" \
19895- "jmp 2b\n" \
19896- ".previous\n" \
19897- _ASM_EXTABLE(1b,3b) \
19898- : :"r" (value), "r" (0))
19899+ "1:\t" \
19900+ "movl %k0,%%" #seg "\n" \
19901+ "2:\n" \
19902+ ".section .fixup,\"ax\"\n" \
19903+ "3:\t" \
19904+ "movl %k1, %%" #seg "\n\t" \
19905+ "jmp 2b\n" \
19906+ ".previous\n" \
19907+ _ASM_EXTABLE(1b,3b) \
19908+ : :"r" (value), "r" (0))
19909
19910
19911 /*
19912 * Save a segment register away
19913 */
19914-#define savesegment(seg, value) \
19915+#define savesegment(seg, value) \
19916 asm volatile("mov %%" #seg ",%0":"=rm" (value))
19917
19918 static inline unsigned long get_limit(unsigned long segment)
19919 {
19920 unsigned long __limit;
19921- __asm__("lsll %1,%0"
19922- :"=r" (__limit):"r" (segment));
19923- return __limit+1;
19924+ asm("lsll %1,%0" : "=r" (__limit) : "r" (segment));
19925+ return __limit + 1;
19926 }
19927
19928 static inline void xen_clts(void)
19929@@ -171,13 +192,13 @@ static unsigned long __force_order;
19930 static inline unsigned long xen_read_cr0(void)
19931 {
19932 unsigned long val;
19933- asm volatile("mov %%cr0,%0\n\t" :"=r" (val), "=m" (__force_order));
19934+ asm volatile("mov %%cr0,%0\n\t" : "=r" (val), "=m" (__force_order));
19935 return val;
19936 }
19937
19938 static inline void xen_write_cr0(unsigned long val)
19939 {
19940- asm volatile("mov %0,%%cr0": :"r" (val), "m" (__force_order));
19941+ asm volatile("mov %0,%%cr0": : "r" (val), "m" (__force_order));
19942 }
19943
19944 #define xen_read_cr2() (current_vcpu_info()->arch.cr2)
19945@@ -186,7 +207,7 @@ static inline void xen_write_cr0(unsigne
19946 static inline unsigned long xen_read_cr3(void)
19947 {
19948 unsigned long val;
19949- asm volatile("mov %%cr3,%0\n\t" :"=r" (val), "=m" (__force_order));
19950+ asm volatile("mov %%cr3,%0\n\t" : "=r" (val), "=m" (__force_order));
19951 #ifdef CONFIG_X86_32
19952 return mfn_to_pfn(xen_cr3_to_pfn(val)) << PAGE_SHIFT;
19953 #else
19954@@ -201,13 +222,13 @@ static inline void xen_write_cr3(unsigne
19955 #else
19956 val = phys_to_machine(val);
19957 #endif
19958- asm volatile("mov %0,%%cr3": :"r" (val), "m" (__force_order));
19959+ asm volatile("mov %0,%%cr3": : "r" (val), "m" (__force_order));
19960 }
19961
19962 static inline unsigned long xen_read_cr4(void)
19963 {
19964 unsigned long val;
19965- asm volatile("mov %%cr4,%0\n\t" :"=r" (val), "=m" (__force_order));
19966+ asm volatile("mov %%cr4,%0\n\t" : "=r" (val), "=m" (__force_order));
19967 return val;
19968 }
19969
19970@@ -215,7 +236,7 @@ static inline unsigned long xen_read_cr4
19971
19972 static inline void xen_write_cr4(unsigned long val)
19973 {
19974- asm volatile("mov %0,%%cr4": :"r" (val), "m" (__force_order));
19975+ asm volatile("mov %0,%%cr4": : "r" (val), "m" (__force_order));
19976 }
19977
19978 #ifdef CONFIG_X86_64
19979@@ -234,6 +255,7 @@ static inline void xen_wbinvd(void)
19980 {
19981 asm volatile("wbinvd": : :"memory");
19982 }
19983+
19984 #define read_cr0() (xen_read_cr0())
19985 #define write_cr0(x) (xen_write_cr0(x))
19986 #define read_cr2() (xen_read_cr2())
19987@@ -260,7 +282,7 @@ static inline void clflush(volatile void
19988 asm volatile("clflush %0" : "+m" (*(volatile char __force *)__p));
19989 }
19990
19991-#define nop() __asm__ __volatile__ ("nop")
19992+#define nop() asm volatile ("nop")
19993
19994 void disable_hlt(void);
19995 void enable_hlt(void);
19996@@ -280,16 +302,7 @@ void default_idle(void);
19997 */
19998 #ifdef CONFIG_X86_32
19999 /*
20000- * For now, "wmb()" doesn't actually do anything, as all
20001- * Intel CPU's follow what Intel calls a *Processor Order*,
20002- * in which all writes are seen in the program order even
20003- * outside the CPU.
20004- *
20005- * I expect future Intel CPU's to have a weaker ordering,
20006- * but I'd also expect them to finally get their act together
20007- * and add some real memory barriers if so.
20008- *
20009- * Some non intel clones support out of order store. wmb() ceases to be a
20010+ * Some non-Intel clones support out of order store. wmb() ceases to be a
20011 * nop for these.
20012 */
20013 #define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2)
20014@@ -368,7 +381,7 @@ void default_idle(void);
20015 # define smp_wmb() barrier()
20016 #endif
20017 #define smp_read_barrier_depends() read_barrier_depends()
20018-#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
20019+#define set_mb(var, value) do { (void)xchg(&var, value); } while (0)
20020 #else
20021 #define smp_mb() barrier()
20022 #define smp_rmb() barrier()
20023--- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/tlbflush.h 2009-03-16 16:33:40.000000000 +0100
20024+++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/tlbflush.h 2009-03-16 16:38:05.000000000 +0100
20025@@ -86,8 +86,7 @@ static inline void flush_tlb_range(struc
20026 #define TLBSTATE_LAZY 2
20027
20028 #ifdef CONFIG_X86_32
20029-struct tlb_state
20030-{
20031+struct tlb_state {
20032 struct mm_struct *active_mm;
20033 int state;
20034 char __cacheline_padding[L1_CACHE_BYTES-8];
20035--- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/vga.h 2009-05-14 10:56:29.000000000 +0200
20036+++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/vga.h 2009-03-16 16:38:05.000000000 +0100
20037@@ -12,9 +12,9 @@
20038 * access the videoram directly without any black magic.
20039 */
20040
20041-#define VGA_MAP_MEM(x,s) (unsigned long)isa_bus_to_virt(x)
20042+#define VGA_MAP_MEM(x, s) (unsigned long)isa_bus_to_virt(x)
20043
20044 #define vga_readb(x) (*(x))
20045-#define vga_writeb(x,y) (*(y) = (x))
20046+#define vga_writeb(x, y) (*(y) = (x))
20047
20048 #endif
20049--- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/xor_64.h 2009-05-14 10:56:29.000000000 +0200
20050+++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/xor_64.h 2009-03-16 16:38:05.000000000 +0100
20051@@ -1,20 +1,23 @@
20052 /*
20053- * x86-64 changes / gcc fixes from Andi Kleen.
20054+ * x86-64 changes / gcc fixes from Andi Kleen.
20055 * Copyright 2002 Andi Kleen, SuSE Labs.
20056 *
20057 * This hasn't been optimized for the hammer yet, but there are likely
20058 * no advantages to be gotten from x86-64 here anyways.
20059 */
20060
20061-typedef struct { unsigned long a,b; } __attribute__((aligned(16))) xmm_store_t;
20062+typedef struct {
20063+ unsigned long a, b;
20064+} __attribute__((aligned(16))) xmm_store_t;
20065
20066-/* Doesn't use gcc to save the XMM registers, because there is no easy way to
20067+/* Doesn't use gcc to save the XMM registers, because there is no easy way to
20068 tell it to do a clts before the register saving. */
20069-#define XMMS_SAVE do { \
20070+#define XMMS_SAVE \
20071+do { \
20072 preempt_disable(); \
20073 if (!(current_thread_info()->status & TS_USEDFPU)) \
20074 clts(); \
20075- __asm__ __volatile__ ( \
20076+ asm volatile( \
20077 "movups %%xmm0,(%1) ;\n\t" \
20078 "movups %%xmm1,0x10(%1) ;\n\t" \
20079 "movups %%xmm2,0x20(%1) ;\n\t" \
20080@@ -22,10 +25,11 @@ typedef struct { unsigned long a,b; } __
20081 : "=&r" (cr0) \
20082 : "r" (xmm_save) \
20083 : "memory"); \
20084-} while(0)
20085+} while (0)
20086
20087-#define XMMS_RESTORE do { \
20088- asm volatile ( \
20089+#define XMMS_RESTORE \
20090+do { \
20091+ asm volatile( \
20092 "sfence ;\n\t" \
20093 "movups (%1),%%xmm0 ;\n\t" \
20094 "movups 0x10(%1),%%xmm1 ;\n\t" \
20095@@ -37,72 +41,72 @@ typedef struct { unsigned long a,b; } __
20096 if (!(current_thread_info()->status & TS_USEDFPU)) \
20097 stts(); \
20098 preempt_enable(); \
20099-} while(0)
20100+} while (0)
20101
20102 #define OFFS(x) "16*("#x")"
20103 #define PF_OFFS(x) "256+16*("#x")"
20104 #define PF0(x) " prefetchnta "PF_OFFS(x)"(%[p1]) ;\n"
20105-#define LD(x,y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n"
20106-#define ST(x,y) " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n"
20107+#define LD(x, y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n"
20108+#define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n"
20109 #define PF1(x) " prefetchnta "PF_OFFS(x)"(%[p2]) ;\n"
20110 #define PF2(x) " prefetchnta "PF_OFFS(x)"(%[p3]) ;\n"
20111 #define PF3(x) " prefetchnta "PF_OFFS(x)"(%[p4]) ;\n"
20112 #define PF4(x) " prefetchnta "PF_OFFS(x)"(%[p5]) ;\n"
20113 #define PF5(x) " prefetchnta "PF_OFFS(x)"(%[p6]) ;\n"
20114-#define XO1(x,y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n"
20115-#define XO2(x,y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n"
20116-#define XO3(x,y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n"
20117-#define XO4(x,y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n"
20118-#define XO5(x,y) " xorps "OFFS(x)"(%[p6]), %%xmm"#y" ;\n"
20119+#define XO1(x, y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n"
20120+#define XO2(x, y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n"
20121+#define XO3(x, y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n"
20122+#define XO4(x, y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n"
20123+#define XO5(x, y) " xorps "OFFS(x)"(%[p6]), %%xmm"#y" ;\n"
20124
20125
20126 static void
20127 xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
20128 {
20129- unsigned int lines = bytes >> 8;
20130+ unsigned int lines = bytes >> 8;
20131 unsigned long cr0;
20132 xmm_store_t xmm_save[4];
20133
20134 XMMS_SAVE;
20135
20136- asm volatile (
20137+ asm volatile(
20138 #undef BLOCK
20139 #define BLOCK(i) \
20140- LD(i,0) \
20141- LD(i+1,1) \
20142+ LD(i, 0) \
20143+ LD(i + 1, 1) \
20144 PF1(i) \
20145- PF1(i+2) \
20146- LD(i+2,2) \
20147- LD(i+3,3) \
20148- PF0(i+4) \
20149- PF0(i+6) \
20150- XO1(i,0) \
20151- XO1(i+1,1) \
20152- XO1(i+2,2) \
20153- XO1(i+3,3) \
20154- ST(i,0) \
20155- ST(i+1,1) \
20156- ST(i+2,2) \
20157- ST(i+3,3) \
20158+ PF1(i + 2) \
20159+ LD(i + 2, 2) \
20160+ LD(i + 3, 3) \
20161+ PF0(i + 4) \
20162+ PF0(i + 6) \
20163+ XO1(i, 0) \
20164+ XO1(i + 1, 1) \
20165+ XO1(i + 2, 2) \
20166+ XO1(i + 3, 3) \
20167+ ST(i, 0) \
20168+ ST(i + 1, 1) \
20169+ ST(i + 2, 2) \
20170+ ST(i + 3, 3) \
20171
20172
20173 PF0(0)
20174 PF0(2)
20175
20176 " .align 32 ;\n"
20177- " 1: ;\n"
20178+ " 1: ;\n"
20179
20180 BLOCK(0)
20181 BLOCK(4)
20182 BLOCK(8)
20183 BLOCK(12)
20184
20185- " addq %[inc], %[p1] ;\n"
20186- " addq %[inc], %[p2] ;\n"
20187+ " addq %[inc], %[p1] ;\n"
20188+ " addq %[inc], %[p2] ;\n"
20189 " decl %[cnt] ; jnz 1b"
20190 : [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines)
20191- : [inc] "r" (256UL)
20192- : "memory");
20193+ : [inc] "r" (256UL)
20194+ : "memory");
20195
20196 XMMS_RESTORE;
20197 }
20198@@ -117,52 +121,52 @@ xor_sse_3(unsigned long bytes, unsigned
20199
20200 XMMS_SAVE;
20201
20202- __asm__ __volatile__ (
20203+ asm volatile(
20204 #undef BLOCK
20205 #define BLOCK(i) \
20206 PF1(i) \
20207- PF1(i+2) \
20208- LD(i,0) \
20209- LD(i+1,1) \
20210- LD(i+2,2) \
20211- LD(i+3,3) \
20212+ PF1(i + 2) \
20213+ LD(i, 0) \
20214+ LD(i + 1, 1) \
20215+ LD(i + 2, 2) \
20216+ LD(i + 3, 3) \
20217 PF2(i) \
20218- PF2(i+2) \
20219- PF0(i+4) \
20220- PF0(i+6) \
20221- XO1(i,0) \
20222- XO1(i+1,1) \
20223- XO1(i+2,2) \
20224- XO1(i+3,3) \
20225- XO2(i,0) \
20226- XO2(i+1,1) \
20227- XO2(i+2,2) \
20228- XO2(i+3,3) \
20229- ST(i,0) \
20230- ST(i+1,1) \
20231- ST(i+2,2) \
20232- ST(i+3,3) \
20233+ PF2(i + 2) \
20234+ PF0(i + 4) \
20235+ PF0(i + 6) \
20236+ XO1(i, 0) \
20237+ XO1(i + 1, 1) \
20238+ XO1(i + 2, 2) \
20239+ XO1(i + 3, 3) \
20240+ XO2(i, 0) \
20241+ XO2(i + 1, 1) \
20242+ XO2(i + 2, 2) \
20243+ XO2(i + 3, 3) \
20244+ ST(i, 0) \
20245+ ST(i + 1, 1) \
20246+ ST(i + 2, 2) \
20247+ ST(i + 3, 3) \
20248
20249
20250 PF0(0)
20251 PF0(2)
20252
20253 " .align 32 ;\n"
20254- " 1: ;\n"
20255+ " 1: ;\n"
20256
20257 BLOCK(0)
20258 BLOCK(4)
20259 BLOCK(8)
20260 BLOCK(12)
20261
20262- " addq %[inc], %[p1] ;\n"
20263- " addq %[inc], %[p2] ;\n"
20264- " addq %[inc], %[p3] ;\n"
20265+ " addq %[inc], %[p1] ;\n"
20266+ " addq %[inc], %[p2] ;\n"
20267+ " addq %[inc], %[p3] ;\n"
20268 " decl %[cnt] ; jnz 1b"
20269 : [cnt] "+r" (lines),
20270 [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
20271 : [inc] "r" (256UL)
20272- : "memory");
20273+ : "memory");
20274 XMMS_RESTORE;
20275 }
20276
20277@@ -171,64 +175,64 @@ xor_sse_4(unsigned long bytes, unsigned
20278 unsigned long *p3, unsigned long *p4)
20279 {
20280 unsigned int lines = bytes >> 8;
20281- xmm_store_t xmm_save[4];
20282+ xmm_store_t xmm_save[4];
20283 unsigned long cr0;
20284
20285 XMMS_SAVE;
20286
20287- __asm__ __volatile__ (
20288+ asm volatile(
20289 #undef BLOCK
20290 #define BLOCK(i) \
20291 PF1(i) \
20292- PF1(i+2) \
20293- LD(i,0) \
20294- LD(i+1,1) \
20295- LD(i+2,2) \
20296- LD(i+3,3) \
20297+ PF1(i + 2) \
20298+ LD(i, 0) \
20299+ LD(i + 1, 1) \
20300+ LD(i + 2, 2) \
20301+ LD(i + 3, 3) \
20302 PF2(i) \
20303- PF2(i+2) \
20304- XO1(i,0) \
20305- XO1(i+1,1) \
20306- XO1(i+2,2) \
20307- XO1(i+3,3) \
20308+ PF2(i + 2) \
20309+ XO1(i, 0) \
20310+ XO1(i + 1, 1) \
20311+ XO1(i + 2, 2) \
20312+ XO1(i + 3, 3) \
20313 PF3(i) \
20314- PF3(i+2) \
20315- PF0(i+4) \
20316- PF0(i+6) \
20317- XO2(i,0) \
20318- XO2(i+1,1) \
20319- XO2(i+2,2) \
20320- XO2(i+3,3) \
20321- XO3(i,0) \
20322- XO3(i+1,1) \
20323- XO3(i+2,2) \
20324- XO3(i+3,3) \
20325- ST(i,0) \
20326- ST(i+1,1) \
20327- ST(i+2,2) \
20328- ST(i+3,3) \
20329+ PF3(i + 2) \
20330+ PF0(i + 4) \
20331+ PF0(i + 6) \
20332+ XO2(i, 0) \
20333+ XO2(i + 1, 1) \
20334+ XO2(i + 2, 2) \
20335+ XO2(i + 3, 3) \
20336+ XO3(i, 0) \
20337+ XO3(i + 1, 1) \
20338+ XO3(i + 2, 2) \
20339+ XO3(i + 3, 3) \
20340+ ST(i, 0) \
20341+ ST(i + 1, 1) \
20342+ ST(i + 2, 2) \
20343+ ST(i + 3, 3) \
20344
20345
20346 PF0(0)
20347 PF0(2)
20348
20349 " .align 32 ;\n"
20350- " 1: ;\n"
20351+ " 1: ;\n"
20352
20353 BLOCK(0)
20354 BLOCK(4)
20355 BLOCK(8)
20356 BLOCK(12)
20357
20358- " addq %[inc], %[p1] ;\n"
20359- " addq %[inc], %[p2] ;\n"
20360- " addq %[inc], %[p3] ;\n"
20361- " addq %[inc], %[p4] ;\n"
20362+ " addq %[inc], %[p1] ;\n"
20363+ " addq %[inc], %[p2] ;\n"
20364+ " addq %[inc], %[p3] ;\n"
20365+ " addq %[inc], %[p4] ;\n"
20366 " decl %[cnt] ; jnz 1b"
20367 : [cnt] "+c" (lines),
20368 [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
20369 : [inc] "r" (256UL)
20370- : "memory" );
20371+ : "memory" );
20372
20373 XMMS_RESTORE;
20374 }
20375@@ -237,70 +241,70 @@ static void
20376 xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
20377 unsigned long *p3, unsigned long *p4, unsigned long *p5)
20378 {
20379- unsigned int lines = bytes >> 8;
20380+ unsigned int lines = bytes >> 8;
20381 xmm_store_t xmm_save[4];
20382 unsigned long cr0;
20383
20384 XMMS_SAVE;
20385
20386- __asm__ __volatile__ (
20387+ asm volatile(
20388 #undef BLOCK
20389 #define BLOCK(i) \
20390 PF1(i) \
20391- PF1(i+2) \
20392- LD(i,0) \
20393- LD(i+1,1) \
20394- LD(i+2,2) \
20395- LD(i+3,3) \
20396+ PF1(i + 2) \
20397+ LD(i, 0) \
20398+ LD(i + 1, 1) \
20399+ LD(i + 2, 2) \
20400+ LD(i + 3, 3) \
20401 PF2(i) \
20402- PF2(i+2) \
20403- XO1(i,0) \
20404- XO1(i+1,1) \
20405- XO1(i+2,2) \
20406- XO1(i+3,3) \
20407+ PF2(i + 2) \
20408+ XO1(i, 0) \
20409+ XO1(i + 1, 1) \
20410+ XO1(i + 2, 2) \
20411+ XO1(i + 3, 3) \
20412 PF3(i) \
20413- PF3(i+2) \
20414- XO2(i,0) \
20415- XO2(i+1,1) \
20416- XO2(i+2,2) \
20417- XO2(i+3,3) \
20418+ PF3(i + 2) \
20419+ XO2(i, 0) \
20420+ XO2(i + 1, 1) \
20421+ XO2(i + 2, 2) \
20422+ XO2(i + 3, 3) \
20423 PF4(i) \
20424- PF4(i+2) \
20425- PF0(i+4) \
20426- PF0(i+6) \
20427- XO3(i,0) \
20428- XO3(i+1,1) \
20429- XO3(i+2,2) \
20430- XO3(i+3,3) \
20431- XO4(i,0) \
20432- XO4(i+1,1) \
20433- XO4(i+2,2) \
20434- XO4(i+3,3) \
20435- ST(i,0) \
20436- ST(i+1,1) \
20437- ST(i+2,2) \
20438- ST(i+3,3) \
20439+ PF4(i + 2) \
20440+ PF0(i + 4) \
20441+ PF0(i + 6) \
20442+ XO3(i, 0) \
20443+ XO3(i + 1, 1) \
20444+ XO3(i + 2, 2) \
20445+ XO3(i + 3, 3) \
20446+ XO4(i, 0) \
20447+ XO4(i + 1, 1) \
20448+ XO4(i + 2, 2) \
20449+ XO4(i + 3, 3) \
20450+ ST(i, 0) \
20451+ ST(i + 1, 1) \
20452+ ST(i + 2, 2) \
20453+ ST(i + 3, 3) \
20454
20455
20456 PF0(0)
20457 PF0(2)
20458
20459 " .align 32 ;\n"
20460- " 1: ;\n"
20461+ " 1: ;\n"
20462
20463 BLOCK(0)
20464 BLOCK(4)
20465 BLOCK(8)
20466 BLOCK(12)
20467
20468- " addq %[inc], %[p1] ;\n"
20469- " addq %[inc], %[p2] ;\n"
20470- " addq %[inc], %[p3] ;\n"
20471- " addq %[inc], %[p4] ;\n"
20472- " addq %[inc], %[p5] ;\n"
20473+ " addq %[inc], %[p1] ;\n"
20474+ " addq %[inc], %[p2] ;\n"
20475+ " addq %[inc], %[p3] ;\n"
20476+ " addq %[inc], %[p4] ;\n"
20477+ " addq %[inc], %[p5] ;\n"
20478 " decl %[cnt] ; jnz 1b"
20479 : [cnt] "+c" (lines),
20480- [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4),
20481+ [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4),
20482 [p5] "+r" (p5)
20483 : [inc] "r" (256UL)
20484 : "memory");
20485@@ -309,18 +313,18 @@ xor_sse_5(unsigned long bytes, unsigned
20486 }
20487
20488 static struct xor_block_template xor_block_sse = {
20489- .name = "generic_sse",
20490- .do_2 = xor_sse_2,
20491- .do_3 = xor_sse_3,
20492- .do_4 = xor_sse_4,
20493- .do_5 = xor_sse_5,
20494+ .name = "generic_sse",
20495+ .do_2 = xor_sse_2,
20496+ .do_3 = xor_sse_3,
20497+ .do_4 = xor_sse_4,
20498+ .do_5 = xor_sse_5,
20499 };
20500
20501 #undef XOR_TRY_TEMPLATES
20502-#define XOR_TRY_TEMPLATES \
20503- do { \
20504- xor_speed(&xor_block_sse); \
20505- } while (0)
20506+#define XOR_TRY_TEMPLATES \
20507+do { \
20508+ xor_speed(&xor_block_sse); \
20509+} while (0)
20510
20511 /* We force the use of the SSE xor block because it can write around L2.
20512 We may also be able to load into the L1 only depending on how the cpu
20513--- sle11-2009-05-14.orig/include/asm-x86/scatterlist.h 2009-05-14 10:56:29.000000000 +0200
20514+++ sle11-2009-05-14/include/asm-x86/scatterlist.h 2009-03-16 16:38:05.000000000 +0100
20515@@ -24,7 +24,7 @@ struct scatterlist {
20516 * returns.
20517 */
20518 #define sg_dma_address(sg) ((sg)->dma_address)
20519-#ifdef CONFIG_X86_32
20520+#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
20521 # define sg_dma_len(sg) ((sg)->length)
20522 #else
20523 # define sg_dma_len(sg) ((sg)->dma_length)
20524--- sle11-2009-05-14.orig/include/linux/page-flags.h 2009-03-16 16:33:40.000000000 +0100
20525+++ sle11-2009-05-14/include/linux/page-flags.h 2009-03-16 16:38:05.000000000 +0100
20526@@ -278,18 +278,25 @@ static inline void SetPageUptodate(struc
20527
20528 CLEARPAGEFLAG(Uptodate, uptodate)
20529
20530-#define PageForeign(page) test_bit(PG_foreign, &(page)->flags)
20531-#define SetPageForeign(_page, dtor) do { \
20532- set_bit(PG_foreign, &(_page)->flags); \
20533- BUG_ON((dtor) == (void (*)(struct page *, unsigned int))0); \
20534- (_page)->index = (long)(dtor); \
20535-} while (0)
20536-#define ClearPageForeign(page) do { \
20537- clear_bit(PG_foreign, &(page)->flags); \
20538- (page)->index = 0; \
20539-} while (0)
20540-#define PageForeignDestructor(_page, order) \
20541- ((void (*)(struct page *, unsigned int))(_page)->index)(_page, order)
20542+#ifdef CONFIG_XEN
20543+TESTPAGEFLAG(Foreign, foreign)
20544+static inline void SetPageForeign(struct page *page,
20545+ void (*dtor)(struct page *, unsigned int))
20546+{
20547+ BUG_ON(!dtor);
20548+ set_bit(PG_foreign, &page->flags);
20549+ page->index = (long)dtor;
20550+}
20551+static inline void ClearPageForeign(struct page *page)
20552+{
20553+ clear_bit(PG_foreign, &page->flags);
20554+ page->index = 0;
20555+}
20556+static inline void PageForeignDestructor(struct page *page, unsigned int order)
20557+{
20558+ ((void (*)(struct page *, unsigned int))page->index)(page, order);
20559+}
20560+#endif
20561
20562 extern void cancel_dirty_page(struct page *page, unsigned int account_size);
20563
20564--- sle11-2009-05-14.orig/include/xen/balloon.h 2008-11-25 12:35:56.000000000 +0100
20565+++ sle11-2009-05-14/include/xen/balloon.h 2009-03-16 16:38:05.000000000 +0100
20566@@ -31,9 +31,12 @@
20567 * IN THE SOFTWARE.
20568 */
20569
20570-#ifndef __ASM_BALLOON_H__
20571-#define __ASM_BALLOON_H__
20572+#ifndef __XEN_BALLOON_H__
20573+#define __XEN_BALLOON_H__
20574
20575+#include <linux/spinlock.h>
20576+
20577+#if !defined(CONFIG_PARAVIRT_XEN) || defined(HAVE_XEN_PLATFORM_COMPAT_H)
20578 /*
20579 * Inform the balloon driver that it should allow some slop for device-driver
20580 * memory activities.
20581@@ -53,5 +56,6 @@ void balloon_release_driver_page(struct
20582 extern spinlock_t balloon_lock;
20583 #define balloon_lock(__flags) spin_lock_irqsave(&balloon_lock, __flags)
20584 #define balloon_unlock(__flags) spin_unlock_irqrestore(&balloon_lock, __flags)
20585+#endif
20586
20587-#endif /* __ASM_BALLOON_H__ */
20588+#endif /* __XEN_BALLOON_H__ */
20589--- sle11-2009-05-14.orig/include/xen/interface/grant_table.h 2008-11-25 12:22:34.000000000 +0100
20590+++ sle11-2009-05-14/include/xen/interface/grant_table.h 2009-03-16 16:38:05.000000000 +0100
20591@@ -193,6 +193,7 @@ struct gnttab_map_grant_ref {
20592 grant_handle_t handle;
20593 uint64_t dev_bus_addr;
20594 };
20595+DEFINE_XEN_GUEST_HANDLE_STRUCT(gnttab_map_grant_ref);
20596 typedef struct gnttab_map_grant_ref gnttab_map_grant_ref_t;
20597 DEFINE_XEN_GUEST_HANDLE(gnttab_map_grant_ref_t);
20598
20599@@ -216,6 +217,7 @@ struct gnttab_unmap_grant_ref {
20600 /* OUT parameters. */
20601 int16_t status; /* GNTST_* */
20602 };
20603+DEFINE_XEN_GUEST_HANDLE_STRUCT(gnttab_unmap_grant_ref);
20604 typedef struct gnttab_unmap_grant_ref gnttab_unmap_grant_ref_t;
20605 DEFINE_XEN_GUEST_HANDLE(gnttab_unmap_grant_ref_t);
20606
20607@@ -237,6 +239,7 @@ struct gnttab_setup_table {
20608 int16_t status; /* GNTST_* */
20609 XEN_GUEST_HANDLE(ulong) frame_list;
20610 };
20611+DEFINE_XEN_GUEST_HANDLE_STRUCT(gnttab_setup_table);
20612 typedef struct gnttab_setup_table gnttab_setup_table_t;
20613 DEFINE_XEN_GUEST_HANDLE(gnttab_setup_table_t);
20614
20615@@ -251,6 +254,7 @@ struct gnttab_dump_table {
20616 /* OUT parameters. */
20617 int16_t status; /* GNTST_* */
20618 };
20619+DEFINE_XEN_GUEST_HANDLE_STRUCT(gnttab_dump_table);
20620 typedef struct gnttab_dump_table gnttab_dump_table_t;
20621 DEFINE_XEN_GUEST_HANDLE(gnttab_dump_table_t);
20622
20623@@ -271,6 +275,7 @@ struct gnttab_transfer {
20624 /* OUT parameters. */
20625 int16_t status;
20626 };
20627+DEFINE_XEN_GUEST_HANDLE_STRUCT(gnttab_transfer);
20628 typedef struct gnttab_transfer gnttab_transfer_t;
20629 DEFINE_XEN_GUEST_HANDLE(gnttab_transfer_t);
20630
20631@@ -314,6 +319,7 @@ typedef struct gnttab_copy {
20632 /* OUT parameters. */
20633 int16_t status;
20634 } gnttab_copy_t;
20635+DEFINE_XEN_GUEST_HANDLE_STRUCT(gnttab_copy);
20636 DEFINE_XEN_GUEST_HANDLE(gnttab_copy_t);
20637
20638 /*
20639@@ -332,6 +338,7 @@ struct gnttab_query_size {
20640 uint32_t max_nr_frames;
20641 int16_t status; /* GNTST_* */
20642 };
20643+DEFINE_XEN_GUEST_HANDLE_STRUCT(gnttab_query_size);
20644 typedef struct gnttab_query_size gnttab_query_size_t;
20645 DEFINE_XEN_GUEST_HANDLE(gnttab_query_size_t);
20646
20647--- sle11-2009-05-14.orig/include/xen/interface/io/fbif.h 2008-11-25 12:35:56.000000000 +0100
20648+++ sle11-2009-05-14/include/xen/interface/io/fbif.h 2009-03-16 16:38:05.000000000 +0100
20649@@ -150,7 +150,12 @@ struct xenfb_page
20650 * framebuffer with a max resolution of 12,800x10,240. Should
20651 * be enough for a while with room leftover for expansion.
20652 */
20653+#ifndef CONFIG_PARAVIRT_XEN
20654 unsigned long pd[256];
20655+#else
20656+ /* Two directory pages should be enough for a while. */
20657+ unsigned long pd[2];
20658+#endif
20659 };
20660
20661 /*
20662--- sle11-2009-05-14.orig/include/xen/interface/memory.h 2009-02-16 16:17:21.000000000 +0100
20663+++ sle11-2009-05-14/include/xen/interface/memory.h 2009-03-16 16:38:05.000000000 +0100
20664@@ -62,7 +62,7 @@ struct xen_memory_reservation {
20665 * OUT: GMFN bases of extents that were allocated
20666 * (NB. This command also updates the mach_to_phys translation table)
20667 */
20668- XEN_GUEST_HANDLE(ulong) extent_start;
20669+ XEN_GUEST_HANDLE(xen_pfn_t) extent_start;
20670
20671 /* Number of extents, and size/alignment of each (2^extent_order pages). */
20672 xen_ulong_t nr_extents;
20673@@ -82,7 +82,6 @@ struct xen_memory_reservation {
20674 domid_t domid;
20675
20676 };
20677-DEFINE_XEN_GUEST_HANDLE_STRUCT(xen_memory_reservation);
20678 typedef struct xen_memory_reservation xen_memory_reservation_t;
20679 DEFINE_XEN_GUEST_HANDLE(xen_memory_reservation_t);
20680
20681@@ -168,7 +167,11 @@ struct xen_machphys_mfn_list {
20682 * any large discontiguities in the machine address space, 2MB gaps in
20683 * the machphys table will be represented by an MFN base of zero.
20684 */
20685+#ifndef CONFIG_PARAVIRT_XEN
20686 XEN_GUEST_HANDLE(xen_pfn_t) extent_start;
20687+#else
20688+ ulong extent_start;
20689+#endif
20690
20691 /*
20692 * Number of extents written to the above array. This will be smaller
20693@@ -176,7 +179,6 @@ struct xen_machphys_mfn_list {
20694 */
20695 unsigned int nr_extents;
20696 };
20697-DEFINE_XEN_GUEST_HANDLE_STRUCT(xen_machphys_mfn_list);
20698 typedef struct xen_machphys_mfn_list xen_machphys_mfn_list_t;
20699 DEFINE_XEN_GUEST_HANDLE(xen_machphys_mfn_list_t);
20700
20701@@ -216,7 +218,6 @@ struct xen_add_to_physmap {
20702 /* GPFN where the source mapping page should appear. */
20703 xen_pfn_t gpfn;
20704 };
20705-DEFINE_XEN_GUEST_HANDLE_STRUCT(xen_add_to_physmap);
20706 typedef struct xen_add_to_physmap xen_add_to_physmap_t;
20707 DEFINE_XEN_GUEST_HANDLE(xen_add_to_physmap_t);
20708
20709@@ -249,13 +250,21 @@ struct xen_translate_gpfn_list {
20710 xen_ulong_t nr_gpfns;
20711
20712 /* List of GPFNs to translate. */
20713+#ifndef CONFIG_PARAVIRT_XEN
20714 XEN_GUEST_HANDLE(xen_pfn_t) gpfn_list;
20715+#else
20716+ ulong gpfn_list;
20717+#endif
20718
20719 /*
20720 * Output list to contain MFN translations. May be the same as the input
20721 * list (in which case each input GPFN is overwritten with the output MFN).
20722 */
20723+#ifndef CONFIG_PARAVIRT_XEN
20724 XEN_GUEST_HANDLE(xen_pfn_t) mfn_list;
20725+#else
20726+ ulong mfn_list;
20727+#endif
20728 };
20729 DEFINE_XEN_GUEST_HANDLE_STRUCT(xen_translate_gpfn_list);
20730 typedef struct xen_translate_gpfn_list xen_translate_gpfn_list_t;
20731--- sle11-2009-05-14.orig/include/xen/interface/vcpu.h 2008-11-25 12:35:56.000000000 +0100
20732+++ sle11-2009-05-14/include/xen/interface/vcpu.h 2009-03-16 16:38:05.000000000 +0100
20733@@ -85,6 +85,7 @@ struct vcpu_runstate_info {
20734 */
20735 uint64_t time[4];
20736 };
20737+DEFINE_XEN_GUEST_HANDLE_STRUCT(vcpu_runstate_info);
20738 typedef struct vcpu_runstate_info vcpu_runstate_info_t;
20739 DEFINE_XEN_GUEST_HANDLE(vcpu_runstate_info_t);
20740
20741@@ -140,6 +141,7 @@ DEFINE_XEN_GUEST_HANDLE(vcpu_register_ru
20742 struct vcpu_set_periodic_timer {
20743 uint64_t period_ns;
20744 };
20745+DEFINE_XEN_GUEST_HANDLE_STRUCT(vcpu_set_periodic_timer);
20746 typedef struct vcpu_set_periodic_timer vcpu_set_periodic_timer_t;
20747 DEFINE_XEN_GUEST_HANDLE(vcpu_set_periodic_timer_t);
20748
20749@@ -153,6 +155,7 @@ struct vcpu_set_singleshot_timer {
20750 uint64_t timeout_abs_ns; /* Absolute system time value in nanoseconds. */
20751 uint32_t flags; /* VCPU_SSHOTTMR_??? */
20752 };
20753+DEFINE_XEN_GUEST_HANDLE_STRUCT(vcpu_set_singleshot_timer);
20754 typedef struct vcpu_set_singleshot_timer vcpu_set_singleshot_timer_t;
20755 DEFINE_XEN_GUEST_HANDLE(vcpu_set_singleshot_timer_t);
20756
20757@@ -176,6 +179,7 @@ struct vcpu_register_vcpu_info {
20758 uint32_t offset; /* offset within page */
20759 uint32_t rsvd; /* unused */
20760 };
20761+DEFINE_XEN_GUEST_HANDLE_STRUCT(vcpu_register_vcpu_info);
20762 typedef struct vcpu_register_vcpu_info vcpu_register_vcpu_info_t;
20763 DEFINE_XEN_GUEST_HANDLE(vcpu_register_vcpu_info_t);
20764
20765--- sle11-2009-05-14.orig/lib/swiotlb-xen.c 2009-03-16 16:33:40.000000000 +0100
20766+++ sle11-2009-05-14/lib/swiotlb-xen.c 2009-03-16 16:38:05.000000000 +0100
20767@@ -20,6 +20,7 @@
20768 #include <linux/ctype.h>
20769 #include <linux/init.h>
20770 #include <linux/bootmem.h>
20771+#include <linux/iommu-helper.h>
20772 #include <linux/highmem.h>
20773 #include <asm/io.h>
20774 #include <asm/pci.h>
20775@@ -288,15 +289,6 @@ __sync_single(struct phys_addr buffer, c
20776 }
20777 }
20778
20779-static inline unsigned int is_span_boundary(unsigned int index,
20780- unsigned int nslots,
20781- unsigned long offset_slots,
20782- unsigned long max_slots)
20783-{
20784- unsigned long offset = (offset_slots + index) & (max_slots - 1);
20785- return offset + nslots > max_slots;
20786-}
20787-
20788 /*
20789 * Allocates bounce buffer and returns its kernel virtual address.
20790 */
20791@@ -335,61 +327,53 @@ map_single(struct device *hwdev, struct
20792 * request and allocate a buffer from that IO TLB pool.
20793 */
20794 spin_lock_irqsave(&io_tlb_lock, flags);
20795- {
20796- index = ALIGN(io_tlb_index, stride);
20797- if (index >= iotlb_nslabs)
20798- index = 0;
20799- wrap = index;
20800+ index = ALIGN(io_tlb_index, stride);
20801+ if (index >= iotlb_nslabs)
20802+ index = 0;
20803+ wrap = index;
20804
20805- do {
20806- while (is_span_boundary(index, nslots, offset_slots,
20807- max_slots)) {
20808- index += stride;
20809- if (index >= iotlb_nslabs)
20810- index = 0;
20811- if (index == wrap)
20812- goto not_found;
20813- }
20814+ do {
20815+ while (iommu_is_span_boundary(index, nslots, offset_slots,
20816+ max_slots)) {
20817+ index += stride;
20818+ if (index >= iotlb_nslabs)
20819+ index = 0;
20820+ if (index == wrap)
20821+ goto not_found;
20822+ }
20823+
20824+ /*
20825+ * If we find a slot that indicates we have 'nslots' number of
20826+ * contiguous buffers, we allocate the buffers from that slot
20827+ * and mark the entries as '0' indicating unavailable.
20828+ */
20829+ if (io_tlb_list[index] >= nslots) {
20830+ int count = 0;
20831+
20832+ for (i = index; i < (int) (index + nslots); i++)
20833+ io_tlb_list[i] = 0;
20834+ for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE - 1) && io_tlb_list[i]; i--)
20835+ io_tlb_list[i] = ++count;
20836+ dma_addr = iotlb_virt_start + (index << IO_TLB_SHIFT);
20837
20838 /*
20839- * If we find a slot that indicates we have 'nslots'
20840- * number of contiguous buffers, we allocate the
20841- * buffers from that slot and mark the entries as '0'
20842- * indicating unavailable.
20843+ * Update the indices to avoid searching in the next
20844+ * round.
20845 */
20846- if (io_tlb_list[index] >= nslots) {
20847- int count = 0;
20848-
20849- for (i = index; i < (int)(index + nslots); i++)
20850- io_tlb_list[i] = 0;
20851- for (i = index - 1;
20852- (OFFSET(i, IO_TLB_SEGSIZE) !=
20853- IO_TLB_SEGSIZE -1) && io_tlb_list[i];
20854- i--)
20855- io_tlb_list[i] = ++count;
20856- dma_addr = iotlb_virt_start +
20857- (index << IO_TLB_SHIFT);
20858-
20859- /*
20860- * Update the indices to avoid searching in
20861- * the next round.
20862- */
20863- io_tlb_index =
20864- ((index + nslots) < iotlb_nslabs
20865- ? (index + nslots) : 0);
20866+ io_tlb_index = ((index + nslots) < iotlb_nslabs
20867+ ? (index + nslots) : 0);
20868
20869- goto found;
20870- }
20871- index += stride;
20872- if (index >= iotlb_nslabs)
20873- index = 0;
20874- } while (index != wrap);
20875+ goto found;
20876+ }
20877+ index += stride;
20878+ if (index >= iotlb_nslabs)
20879+ index = 0;
20880+ } while (index != wrap);
20881
20882- not_found:
20883- spin_unlock_irqrestore(&io_tlb_lock, flags);
20884- return NULL;
20885- }
20886- found:
20887+not_found:
20888+ spin_unlock_irqrestore(&io_tlb_lock, flags);
20889+ return NULL;
20890+found:
20891 spin_unlock_irqrestore(&io_tlb_lock, flags);
20892
20893 /*
20894@@ -502,11 +486,13 @@ swiotlb_full(struct device *dev, size_t
20895 * Once the device is given the dma address, the device owns this memory until
20896 * either swiotlb_unmap_single or swiotlb_dma_sync_single is performed.
20897 */
20898-dma_addr_t
20899-swiotlb_map_single(struct device *hwdev, void *ptr, size_t size, int dir)
20900-{
20901- dma_addr_t dev_addr = gnttab_dma_map_page(virt_to_page(ptr)) +
20902- offset_in_page(ptr);
20903+static dma_addr_t
20904+_swiotlb_map_single(struct device *hwdev, phys_addr_t paddr, size_t size,
20905+ int dir, struct dma_attrs *attrs)
20906+{
20907+ struct page *page = pfn_to_page(paddr >> PAGE_SHIFT);
20908+ dma_addr_t dev_addr = gnttab_dma_map_page(page) +
20909+ offset_in_page(paddr);
20910 void *map;
20911 struct phys_addr buffer;
20912
20913@@ -517,7 +503,7 @@ swiotlb_map_single(struct device *hwdev,
20914 * we can safely return the device addr and not worry about bounce
20915 * buffering it.
20916 */
20917- if (!range_straddles_page_boundary(__pa(ptr), size) &&
20918+ if (!range_straddles_page_boundary(paddr, size) &&
20919 !address_needs_mapping(hwdev, dev_addr))
20920 return dev_addr;
20921
20922@@ -525,8 +511,8 @@ swiotlb_map_single(struct device *hwdev,
20923 * Oh well, have to allocate and map a bounce buffer.
20924 */
20925 gnttab_dma_unmap_page(dev_addr);
20926- buffer.page = virt_to_page(ptr);
20927- buffer.offset = (unsigned long)ptr & ~PAGE_MASK;
20928+ buffer.page = page;
20929+ buffer.offset = offset_in_page(paddr);
20930 map = map_single(hwdev, buffer, size, dir);
20931 if (!map) {
20932 swiotlb_full(hwdev, size, dir, 1);
20933@@ -537,6 +523,26 @@ swiotlb_map_single(struct device *hwdev,
20934 return dev_addr;
20935 }
20936
20937+dma_addr_t
20938+swiotlb_map_single_attrs(struct device *hwdev, void *ptr, size_t size,
20939+ int dir, struct dma_attrs *attrs)
20940+{
20941+ return _swiotlb_map_single(hwdev, virt_to_phys(ptr), size, dir, attrs);
20942+}
20943+EXPORT_SYMBOL(swiotlb_map_single_attrs);
20944+
20945+dma_addr_t
20946+swiotlb_map_single(struct device *hwdev, void *ptr, size_t size, int dir)
20947+{
20948+ return _swiotlb_map_single(hwdev, virt_to_phys(ptr), size, dir, NULL);
20949+}
20950+
20951+dma_addr_t
20952+swiotlb_map_single_phys(struct device *hwdev, phys_addr_t paddr, size_t size, int dir)
20953+{
20954+ return _swiotlb_map_single(hwdev, paddr, size, dir, NULL);
20955+}
20956+
20957 /*
20958 * Unmap a single streaming mode DMA translation. The dma_addr and size must
20959 * match what was provided for in a previous swiotlb_map_single call. All
20960@@ -546,8 +552,8 @@ swiotlb_map_single(struct device *hwdev,
20961 * whatever the device wrote there.
20962 */
20963 void
20964-swiotlb_unmap_single(struct device *hwdev, dma_addr_t dev_addr, size_t size,
20965- int dir)
20966+swiotlb_unmap_single_attrs(struct device *hwdev, dma_addr_t dev_addr,
20967+ size_t size, int dir, struct dma_attrs *attrs)
20968 {
20969 BUG_ON(dir == DMA_NONE);
20970 if (in_swiotlb_aperture(dev_addr))
20971@@ -555,7 +561,14 @@ swiotlb_unmap_single(struct device *hwde
20972 else
20973 gnttab_dma_unmap_page(dev_addr);
20974 }
20975+EXPORT_SYMBOL(swiotlb_unmap_single_attrs);
20976
20977+void
20978+swiotlb_unmap_single(struct device *hwdev, dma_addr_t dev_addr, size_t size,
20979+ int dir)
20980+{
20981+ return swiotlb_unmap_single_attrs(hwdev, dev_addr, size, dir, NULL);
20982+}
20983 /*
20984 * Make physical memory consistent for a single streaming mode DMA translation
20985 * after a transfer.
20986@@ -584,6 +597,26 @@ swiotlb_sync_single_for_device(struct de
20987 sync_single(hwdev, bus_to_virt(dev_addr), size, dir);
20988 }
20989
20990+void
20991+swiotlb_sync_single_range_for_cpu(struct device *hwdev, dma_addr_t dev_addr,
20992+ unsigned long offset, size_t size, int dir)
20993+{
20994+ BUG_ON(dir == DMA_NONE);
20995+ if (in_swiotlb_aperture(dev_addr))
20996+ sync_single(hwdev, bus_to_virt(dev_addr + offset), size, dir);
20997+}
20998+
20999+void
21000+swiotlb_sync_single_range_for_device(struct device *hwdev, dma_addr_t dev_addr,
21001+ unsigned long offset, size_t size, int dir)
21002+{
21003+ BUG_ON(dir == DMA_NONE);
21004+ if (in_swiotlb_aperture(dev_addr))
21005+ sync_single(hwdev, bus_to_virt(dev_addr + offset), size, dir);
21006+}
21007+
21008+void swiotlb_unmap_sg_attrs(struct device *, struct scatterlist *, int, int,
21009+ struct dma_attrs *);
21010 /*
21011 * Map a set of buffers described by scatterlist in streaming mode for DMA.
21012 * This is the scatter-gather version of the above swiotlb_map_single
21013@@ -601,8 +634,8 @@ swiotlb_sync_single_for_device(struct de
21014 * same here.
21015 */
21016 int
21017-swiotlb_map_sg(struct device *hwdev, struct scatterlist *sgl, int nelems,
21018- int dir)
21019+swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems,
21020+ int dir, struct dma_attrs *attrs)
21021 {
21022 struct scatterlist *sg;
21023 struct phys_addr buffer;
21024@@ -626,7 +659,8 @@ swiotlb_map_sg(struct device *hwdev, str
21025 /* Don't panic here, we expect map_sg users
21026 to do proper error handling. */
21027 swiotlb_full(hwdev, sg->length, dir, 0);
21028- swiotlb_unmap_sg(hwdev, sgl, i, dir);
21029+ swiotlb_unmap_sg_attrs(hwdev, sgl, i, dir,
21030+ attrs);
21031 sgl[0].dma_length = 0;
21032 return 0;
21033 }
21034@@ -637,14 +671,22 @@ swiotlb_map_sg(struct device *hwdev, str
21035 }
21036 return nelems;
21037 }
21038+EXPORT_SYMBOL(swiotlb_map_sg_attrs);
21039+
21040+int
21041+swiotlb_map_sg(struct device *hwdev, struct scatterlist *sgl, int nelems,
21042+ int dir)
21043+{
21044+ return swiotlb_map_sg_attrs(hwdev, sgl, nelems, dir, NULL);
21045+}
21046
21047 /*
21048 * Unmap a set of streaming mode DMA translations. Again, cpu read rules
21049 * concerning calls here are the same as for swiotlb_unmap_single() above.
21050 */
21051 void
21052-swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nelems,
21053- int dir)
21054+swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl,
21055+ int nelems, int dir, struct dma_attrs *attrs)
21056 {
21057 struct scatterlist *sg;
21058 int i;
21059@@ -659,6 +701,14 @@ swiotlb_unmap_sg(struct device *hwdev, s
21060 gnttab_dma_unmap_page(sg->dma_address);
21061 }
21062 }
21063+EXPORT_SYMBOL(swiotlb_unmap_sg_attrs);
21064+
21065+void
21066+swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nelems,
21067+ int dir)
21068+{
21069+ return swiotlb_unmap_sg_attrs(hwdev, sgl, nelems, dir, NULL);
21070+}
21071
21072 /*
21073 * Make physical memory consistent for a set of streaming mode DMA translations
21074@@ -699,46 +749,6 @@ swiotlb_sync_sg_for_device(struct device
21075 }
21076 }
21077
21078-#ifdef CONFIG_HIGHMEM
21079-
21080-dma_addr_t
21081-swiotlb_map_page(struct device *hwdev, struct page *page,
21082- unsigned long offset, size_t size,
21083- enum dma_data_direction direction)
21084-{
21085- struct phys_addr buffer;
21086- dma_addr_t dev_addr;
21087- char *map;
21088-
21089- dev_addr = gnttab_dma_map_page(page) + offset;
21090- if (address_needs_mapping(hwdev, dev_addr)) {
21091- gnttab_dma_unmap_page(dev_addr);
21092- buffer.page = page;
21093- buffer.offset = offset;
21094- map = map_single(hwdev, buffer, size, direction);
21095- if (!map) {
21096- swiotlb_full(hwdev, size, direction, 1);
21097- map = io_tlb_overflow_buffer;
21098- }
21099- dev_addr = (dma_addr_t)virt_to_bus(map);
21100- }
21101-
21102- return dev_addr;
21103-}
21104-
21105-void
21106-swiotlb_unmap_page(struct device *hwdev, dma_addr_t dma_address,
21107- size_t size, enum dma_data_direction direction)
21108-{
21109- BUG_ON(direction == DMA_NONE);
21110- if (in_swiotlb_aperture(dma_address))
21111- unmap_single(hwdev, bus_to_virt(dma_address), size, direction);
21112- else
21113- gnttab_dma_unmap_page(dma_address);
21114-}
21115-
21116-#endif
21117-
21118 int
21119 swiotlb_dma_mapping_error(dma_addr_t dma_addr)
21120 {