1 From: kernel.org
2 Subject: 2.6.26
3 Patch-mainline: 2.6.26
4
5 Acked-by: Jeff Mahoney <jeffm@suse.com>
6 Automatically created from "patches.kernel.org/patch-2.6.26" by xen-port-patches.py
7
8 --- sle11-2009-10-16.orig/arch/x86/Kconfig 2009-03-16 16:33:40.000000000 +0100
9 +++ sle11-2009-10-16/arch/x86/Kconfig 2009-03-16 16:38:05.000000000 +0100
10 @@ -28,7 +28,7 @@ config X86
11 select HAVE_DYNAMIC_FTRACE
12 select HAVE_FTRACE
13 select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) && !XEN
14 - select HAVE_ARCH_KGDB if !X86_VOYAGER
15 + select HAVE_ARCH_KGDB if !X86_VOYAGER && !XEN
16 select HAVE_ARCH_TRACEHOOK
17 select HAVE_GENERIC_DMA_COHERENT if X86_32
18 select HAVE_EFFICIENT_UNALIGNED_ACCESS
19 @@ -486,6 +486,7 @@ config PARAVIRT_DEBUG
20
21 config MEMTEST
22 bool "Memtest"
23 + depends on !XEN
24 help
25 This option adds a kernel parameter 'memtest', which allows memtest
26 to be set.
27 @@ -1007,7 +1008,7 @@ config X86_PAE
28 config DIRECT_GBPAGES
29 bool "Enable 1GB pages for kernel pagetables" if EMBEDDED
30 default y
31 - depends on X86_64
32 + depends on X86_64 && !XEN
33 help
34 Allow the kernel linear mapping to use 1GB pages on CPUs that
35 support it. This can improve the kernel's performance a tiny bit by
36 @@ -1349,8 +1350,7 @@ source kernel/Kconfig.hz
37
38 config KEXEC
39 bool "kexec system call"
40 - depends on X86_BIOS_REBOOT
41 - depends on !XEN_UNPRIVILEGED_GUEST
42 + depends on X86_BIOS_REBOOT || (XEN && !XEN_UNPRIVILEGED_GUEST)
43 help
44 kexec is a system call that implements the ability to shutdown your
45 current kernel, and to start another kernel. It is like a reboot
46 @@ -1948,6 +1948,4 @@ source "crypto/Kconfig"
47
48 source "arch/x86/kvm/Kconfig"
49
50 -source "drivers/xen/Kconfig"
51 -
52 source "lib/Kconfig"
53 --- sle11-2009-10-16.orig/arch/x86/ia32/ia32entry-xen.S 2009-03-16 16:33:40.000000000 +0100
54 +++ sle11-2009-10-16/arch/x86/ia32/ia32entry-xen.S 2009-03-16 16:38:05.000000000 +0100
55 @@ -129,12 +129,14 @@ sysenter_tracesys:
56 SAVE_REST
57 CLEAR_RREGS
58 movq %r9,R9(%rsp)
59 - movq $-ENOSYS,RAX(%rsp) /* really needed? */
60 + movq $-ENOSYS,RAX(%rsp)/* ptrace can change this for a bad syscall */
61 movq %rsp,%rdi /* &pt_regs -> arg1 */
62 call syscall_trace_enter
63 LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */
64 RESTORE_REST
65 xchgl %ebp,%r9d
66 + cmpl $(IA32_NR_syscalls-1),%eax
67 + ja int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */
68 jmp sysenter_do_call
69 CFI_ENDPROC
70 ENDPROC(ia32_sysenter_target)
71 @@ -200,13 +202,15 @@ cstar_tracesys:
72 SAVE_REST
73 CLEAR_RREGS
74 movq %r9,R9(%rsp)
75 - movq $-ENOSYS,RAX(%rsp) /* really needed? */
76 + movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
77 movq %rsp,%rdi /* &pt_regs -> arg1 */
78 call syscall_trace_enter
79 LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */
80 RESTORE_REST
81 xchgl %ebp,%r9d
82 movl RSP-ARGOFFSET(%rsp), %r8d
83 + cmpl $(IA32_NR_syscalls-1),%eax
84 + ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */
85 jmp cstar_do_call
86 END(ia32_cstar_target)
87
88 @@ -264,7 +268,7 @@ ENTRY(ia32_syscall)
89 jnz ia32_tracesys
90 ia32_do_syscall:
91 cmpl $(IA32_NR_syscalls-1),%eax
92 - ja ia32_badsys
93 + ja int_ret_from_sys_call /* ia32_tracesys has set RAX(%rsp) */
94 IA32_ARG_FIXUP
95 call *ia32_sys_call_table(,%rax,8) # xxx: rip relative
96 ia32_sysret:
97 @@ -274,7 +278,7 @@ ia32_sysret:
98 ia32_tracesys:
99 SAVE_REST
100 CLEAR_RREGS
101 - movq $-ENOSYS,RAX(%rsp) /* really needed? */
102 + movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
103 movq %rsp,%rdi /* &pt_regs -> arg1 */
104 call syscall_trace_enter
105 LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */
106 @@ -365,7 +369,7 @@ ia32_sys_call_table:
107 .quad sys_setuid16
108 .quad sys_getuid16
109 .quad compat_sys_stime /* stime */ /* 25 */
110 - .quad sys32_ptrace /* ptrace */
111 + .quad compat_sys_ptrace /* ptrace */
112 .quad sys_alarm
113 .quad sys_fstat /* (old)fstat */
114 .quad sys_pause
115 --- sle11-2009-10-16.orig/arch/x86/kernel/Makefile 2009-03-16 16:33:40.000000000 +0100
116 +++ sle11-2009-10-16/arch/x86/kernel/Makefile 2009-03-16 16:38:05.000000000 +0100
117 @@ -122,8 +122,7 @@ ifeq ($(CONFIG_X86_64),y)
118
119 obj-$(CONFIG_XEN) += nmi_64.o
120 time_64-$(CONFIG_XEN) += time_32.o
121 - pci-dma_64-$(CONFIG_XEN) += pci-dma_32.o
122 endif
123
124 -disabled-obj-$(CONFIG_XEN) := early-quirks.o hpet.o i8253.o i8259_$(BITS).o reboot.o \
125 - smpboot_$(BITS).o tsc_$(BITS).o tsc_sync.o
126 +disabled-obj-$(CONFIG_XEN) := crash.o early-quirks.o hpet.o i8253.o i8259_$(BITS).o \
127 + pci-swiotlb_64.o reboot.o smpboot.o tlb_$(BITS).o tsc_$(BITS).o tsc_sync.o vsmp_64.o
128 --- sle11-2009-10-16.orig/arch/x86/kernel/acpi/Makefile 2008-12-01 11:11:08.000000000 +0100
129 +++ sle11-2009-10-16/arch/x86/kernel/acpi/Makefile 2009-03-16 16:38:05.000000000 +0100
130 @@ -15,4 +15,4 @@ $(obj)/wakeup_rm.o: $(obj)/realmode/w
131 $(obj)/realmode/wakeup.bin: FORCE
132 $(Q)$(MAKE) $(build)=$(obj)/realmode
133
134 -disabled-obj-$(CONFIG_XEN) := cstate.o wakeup_$(BITS).o
135 +disabled-obj-$(CONFIG_XEN) := cstate.o wakeup_%.o
136 --- sle11-2009-10-16.orig/arch/x86/kernel/acpi/boot.c 2009-03-16 16:33:40.000000000 +0100
137 +++ sle11-2009-10-16/arch/x86/kernel/acpi/boot.c 2009-08-26 11:55:26.000000000 +0200
138 @@ -251,6 +251,7 @@ static int __init acpi_parse_madt(struct
139
140 static void __cpuinit acpi_register_lapic(int id, u8 enabled)
141 {
142 +#ifndef CONFIG_XEN
143 unsigned int ver = 0;
144
145 if (!enabled) {
146 @@ -264,6 +265,7 @@ static void __cpuinit acpi_register_lapi
147 #endif
148
149 generic_processor_info(id, ver);
150 +#endif
151 }
152
153 static int __init
154 @@ -774,6 +776,7 @@ static int __init acpi_parse_fadt(struct
155 * returns 0 on success, < 0 on error
156 */
157
158 +#ifndef CONFIG_XEN
159 static void __init acpi_register_lapic_address(unsigned long address)
160 {
161 mp_lapic_addr = address;
162 @@ -787,6 +790,9 @@ static void __init acpi_register_lapic_a
163 #endif
164 }
165 }
166 +#else
167 +#define acpi_register_lapic_address(address)
168 +#endif
169
170 static int __init early_acpi_parse_madt_lapic_addr_ovr(void)
171 {
172 --- sle11-2009-10-16.orig/arch/x86/kernel/acpi/sleep-xen.c 2009-03-16 16:33:40.000000000 +0100
173 +++ sle11-2009-10-16/arch/x86/kernel/acpi/sleep-xen.c 2009-03-16 16:38:05.000000000 +0100
174 @@ -10,15 +10,19 @@
175 #include <linux/dmi.h>
176 #include <linux/cpumask.h>
177
178 -#include <asm/smp.h>
179 +#include "realmode/wakeup.h"
180 +#include "sleep.h"
181
182 #ifndef CONFIG_ACPI_PV_SLEEP
183 -/* address in low memory of the wakeup routine. */
184 -unsigned long acpi_wakeup_address = 0;
185 +unsigned long acpi_wakeup_address;
186 unsigned long acpi_realmode_flags;
187 -extern char wakeup_start, wakeup_end;
188
189 -extern unsigned long acpi_copy_wakeup_routine(unsigned long);
190 +/* address in low memory of the wakeup routine. */
191 +static unsigned long acpi_realmode;
192 +
193 +#ifdef CONFIG_64BIT
194 +static char temp_stack[10240];
195 +#endif
196 #endif
197
198 /**
199 @@ -26,17 +30,69 @@ extern unsigned long acpi_copy_wakeup_ro
200 *
201 * Create an identity mapped page table and copy the wakeup routine to
202 * low memory.
203 + *
204 + * Note that this is too late to change acpi_wakeup_address.
205 */
206 int acpi_save_state_mem(void)
207 {
208 #ifndef CONFIG_ACPI_PV_SLEEP
209 - if (!acpi_wakeup_address) {
210 - printk(KERN_ERR "Could not allocate memory during boot, S3 disabled\n");
211 + struct wakeup_header *header;
212 +
213 + if (!acpi_realmode) {
214 + printk(KERN_ERR "Could not allocate memory during boot, "
215 + "S3 disabled\n");
216 return -ENOMEM;
217 }
218 - memcpy((void *)acpi_wakeup_address, &wakeup_start,
219 - &wakeup_end - &wakeup_start);
220 - acpi_copy_wakeup_routine(acpi_wakeup_address);
221 + memcpy((void *)acpi_realmode, &wakeup_code_start, WAKEUP_SIZE);
222 +
223 + header = (struct wakeup_header *)(acpi_realmode + HEADER_OFFSET);
224 + if (header->signature != 0x51ee1111) {
225 + printk(KERN_ERR "wakeup header does not match\n");
226 + return -EINVAL;
227 + }
228 +
229 + header->video_mode = saved_video_mode;
230 +
231 + header->wakeup_jmp_seg = acpi_wakeup_address >> 4;
232 + /* GDT[0]: GDT self-pointer */
233 + header->wakeup_gdt[0] =
234 + (u64)(sizeof(header->wakeup_gdt) - 1) +
235 + ((u64)(acpi_wakeup_address +
236 + ((char *)&header->wakeup_gdt - (char *)acpi_realmode))
237 + << 16);
238 + /* GDT[1]: real-mode-like code segment */
239 + header->wakeup_gdt[1] = (0x009bULL << 40) +
240 + ((u64)acpi_wakeup_address << 16) + 0xffff;
241 + /* GDT[2]: real-mode-like data segment */
242 + header->wakeup_gdt[2] = (0x0093ULL << 40) +
243 + ((u64)acpi_wakeup_address << 16) + 0xffff;
244 +
245 +#ifndef CONFIG_64BIT
246 + store_gdt((struct desc_ptr *)&header->pmode_gdt);
247 +
248 + header->pmode_efer_low = nx_enabled;
249 + if (header->pmode_efer_low & 1) {
250 + /* This is strange, why not save efer, always? */
251 + rdmsr(MSR_EFER, header->pmode_efer_low,
252 + header->pmode_efer_high);
253 + }
254 +#endif /* !CONFIG_64BIT */
255 +
256 + header->pmode_cr0 = read_cr0();
257 + header->pmode_cr4 = read_cr4();
258 + header->realmode_flags = acpi_realmode_flags;
259 + header->real_magic = 0x12345678;
260 +
261 +#ifndef CONFIG_64BIT
262 + header->pmode_entry = (u32)&wakeup_pmode_return;
263 + header->pmode_cr3 = (u32)(swsusp_pg_dir - __PAGE_OFFSET);
264 + saved_magic = 0x12345678;
265 +#else /* CONFIG_64BIT */
266 + header->trampoline_segment = setup_trampoline() >> 4;
267 + init_rsp = (unsigned long)temp_stack + 4096;
268 + initial_code = (unsigned long)wakeup_long64;
269 + saved_magic = 0x123456789abcdef0;
270 +#endif /* CONFIG_64BIT */
271 #endif
272
273 return 0;
274 @@ -61,15 +117,20 @@ void acpi_restore_state_mem(void)
275 void __init acpi_reserve_bootmem(void)
276 {
277 #ifndef CONFIG_ACPI_PV_SLEEP
278 - if ((&wakeup_end - &wakeup_start) > PAGE_SIZE*2) {
279 + if ((&wakeup_code_end - &wakeup_code_start) > WAKEUP_SIZE) {
280 printk(KERN_ERR
281 "ACPI: Wakeup code way too big, S3 disabled.\n");
282 return;
283 }
284
285 - acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE*2);
286 - if (!acpi_wakeup_address)
287 + acpi_realmode = (unsigned long)alloc_bootmem_low(WAKEUP_SIZE);
288 +
289 + if (!acpi_realmode) {
290 printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n");
291 + return;
292 + }
293 +
294 + acpi_wakeup_address = virt_to_phys((void *)acpi_realmode);
295 #endif
296 }
297
298 --- sle11-2009-10-16.orig/arch/x86/kernel/cpu/common-xen.c 2009-03-16 16:33:40.000000000 +0100
299 +++ sle11-2009-10-16/arch/x86/kernel/cpu/common-xen.c 2009-03-16 16:38:05.000000000 +0100
300 @@ -5,7 +5,6 @@
301 #include <linux/module.h>
302 #include <linux/percpu.h>
303 #include <linux/bootmem.h>
304 -#include <asm/semaphore.h>
305 #include <asm/processor.h>
306 #include <asm/i387.h>
307 #include <asm/msr.h>
308 @@ -13,6 +12,7 @@
309 #include <asm/mmu_context.h>
310 #include <asm/mtrr.h>
311 #include <asm/mce.h>
312 +#include <asm/pat.h>
313 #ifdef CONFIG_X86_LOCAL_APIC
314 #include <asm/mpspec.h>
315 #include <asm/apic.h>
316 @@ -69,9 +69,9 @@ __u32 cleared_cpu_caps[NCAPINTS] __cpuin
317 static int cachesize_override __cpuinitdata = -1;
318 static int disable_x86_serial_nr __cpuinitdata = 1;
319
320 -struct cpu_dev * cpu_devs[X86_VENDOR_NUM] = {};
321 +struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
322
323 -static void __cpuinit default_init(struct cpuinfo_x86 * c)
324 +static void __cpuinit default_init(struct cpuinfo_x86 *c)
325 {
326 /* Not much we can do here... */
327 /* Check if at least it has cpuid */
328 @@ -88,11 +88,11 @@ static struct cpu_dev __cpuinitdata defa
329 .c_init = default_init,
330 .c_vendor = "Unknown",
331 };
332 -static struct cpu_dev * this_cpu __cpuinitdata = &default_cpu;
333 +static struct cpu_dev *this_cpu __cpuinitdata = &default_cpu;
334
335 static int __init cachesize_setup(char *str)
336 {
337 - get_option (&str, &cachesize_override);
338 + get_option(&str, &cachesize_override);
339 return 1;
340 }
341 __setup("cachesize=", cachesize_setup);
342 @@ -114,12 +114,12 @@ int __cpuinit get_model_name(struct cpui
343 /* Intel chips right-justify this string for some dumb reason;
344 undo that brain damage */
345 p = q = &c->x86_model_id[0];
346 - while ( *p == ' ' )
347 + while (*p == ' ')
348 p++;
349 - if ( p != q ) {
350 - while ( *p )
351 + if (p != q) {
352 + while (*p)
353 *q++ = *p++;
354 - while ( q <= &c->x86_model_id[48] )
355 + while (q <= &c->x86_model_id[48])
356 *q++ = '\0'; /* Zero-pad the rest */
357 }
358
359 @@ -137,7 +137,7 @@ void __cpuinit display_cacheinfo(struct
360 cpuid(0x80000005, &dummy, &dummy, &ecx, &edx);
361 printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
362 edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
363 - c->x86_cache_size=(ecx>>24)+(edx>>24);
364 + c->x86_cache_size = (ecx>>24)+(edx>>24);
365 }
366
367 if (n < 0x80000006) /* Some chips just has a large L1. */
368 @@ -145,16 +145,16 @@ void __cpuinit display_cacheinfo(struct
369
370 ecx = cpuid_ecx(0x80000006);
371 l2size = ecx >> 16;
372 -
373 +
374 /* do processor-specific cache resizing */
375 if (this_cpu->c_size_cache)
376 - l2size = this_cpu->c_size_cache(c,l2size);
377 + l2size = this_cpu->c_size_cache(c, l2size);
378
379 /* Allow user to override all this if necessary. */
380 if (cachesize_override != -1)
381 l2size = cachesize_override;
382
383 - if ( l2size == 0 )
384 + if (l2size == 0)
385 return; /* Again, no L2 cache is possible */
386
387 c->x86_cache_size = l2size;
388 @@ -163,16 +163,19 @@ void __cpuinit display_cacheinfo(struct
389 l2size, ecx & 0xFF);
390 }
391
392 -/* Naming convention should be: <Name> [(<Codename>)] */
393 -/* This table only is used unless init_<vendor>() below doesn't set it; */
394 -/* in particular, if CPUID levels 0x80000002..4 are supported, this isn't used */
395 +/*
396 + * Naming convention should be: <Name> [(<Codename>)]
397 + * This table only is used unless init_<vendor>() below doesn't set it;
398 + * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used
399 + *
400 + */
401
402 /* Look up CPU names by table lookup. */
403 static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c)
404 {
405 struct cpu_model_info *info;
406
407 - if ( c->x86_model >= 16 )
408 + if (c->x86_model >= 16)
409 return NULL; /* Range check */
410
411 if (!this_cpu)
412 @@ -197,9 +200,9 @@ static void __cpuinit get_cpu_vendor(str
413
414 for (i = 0; i < X86_VENDOR_NUM; i++) {
415 if (cpu_devs[i]) {
416 - if (!strcmp(v,cpu_devs[i]->c_ident[0]) ||
417 - (cpu_devs[i]->c_ident[1] &&
418 - !strcmp(v,cpu_devs[i]->c_ident[1]))) {
419 + if (!strcmp(v, cpu_devs[i]->c_ident[0]) ||
420 + (cpu_devs[i]->c_ident[1] &&
421 + !strcmp(v, cpu_devs[i]->c_ident[1]))) {
422 c->x86_vendor = i;
423 if (!early)
424 this_cpu = cpu_devs[i];
425 @@ -217,7 +220,7 @@ static void __cpuinit get_cpu_vendor(str
426 }
427
428
429 -static int __init x86_fxsr_setup(char * s)
430 +static int __init x86_fxsr_setup(char *s)
431 {
432 setup_clear_cpu_cap(X86_FEATURE_FXSR);
433 setup_clear_cpu_cap(X86_FEATURE_XMM);
434 @@ -226,7 +229,7 @@ static int __init x86_fxsr_setup(char *
435 __setup("nofxsr", x86_fxsr_setup);
436
437
438 -static int __init x86_sep_setup(char * s)
439 +static int __init x86_sep_setup(char *s)
440 {
441 setup_clear_cpu_cap(X86_FEATURE_SEP);
442 return 1;
443 @@ -315,12 +318,15 @@ static void __cpuinit early_get_cap(stru
444
445 }
446
447 -/* Do minimum CPU detection early.
448 - Fields really needed: vendor, cpuid_level, family, model, mask, cache alignment.
449 - The others are not touched to avoid unwanted side effects.
450 -
451 - WARNING: this function is only called on the BP. Don't add code here
452 - that is supposed to run on all CPUs. */
453 +/*
454 + * Do minimum CPU detection early.
455 + * Fields really needed: vendor, cpuid_level, family, model, mask,
456 + * cache alignment.
457 + * The others are not touched to avoid unwanted side effects.
458 + *
459 + * WARNING: this function is only called on the BP. Don't add code here
460 + * that is supposed to run on all CPUs.
461 + */
462 static void __init early_cpu_detect(void)
463 {
464 struct cpuinfo_x86 *c = &boot_cpu_data;
465 @@ -335,19 +341,14 @@ static void __init early_cpu_detect(void
466
467 get_cpu_vendor(c, 1);
468
469 - switch (c->x86_vendor) {
470 - case X86_VENDOR_AMD:
471 - early_init_amd(c);
472 - break;
473 - case X86_VENDOR_INTEL:
474 - early_init_intel(c);
475 - break;
476 - }
477 + if (c->x86_vendor != X86_VENDOR_UNKNOWN &&
478 + cpu_devs[c->x86_vendor]->c_early_init)
479 + cpu_devs[c->x86_vendor]->c_early_init(c);
480
481 early_get_cap(c);
482 }
483
484 -static void __cpuinit generic_identify(struct cpuinfo_x86 * c)
485 +static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
486 {
487 u32 tfms, xlvl;
488 unsigned int ebx;
489 @@ -358,13 +359,12 @@ static void __cpuinit generic_identify(s
490 (unsigned int *)&c->x86_vendor_id[0],
491 (unsigned int *)&c->x86_vendor_id[8],
492 (unsigned int *)&c->x86_vendor_id[4]);
493 -
494 +
495 get_cpu_vendor(c, 0);
496 /* Initialize the standard set of capabilities */
497 /* Note that the vendor-specific code below might override */
498 -
499 /* Intel-defined flags: level 0x00000001 */
500 - if ( c->cpuid_level >= 0x00000001 ) {
501 + if (c->cpuid_level >= 0x00000001) {
502 u32 capability, excap;
503 cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
504 c->x86_capability[0] = capability;
505 @@ -376,12 +376,14 @@ static void __cpuinit generic_identify(s
506 if (c->x86 >= 0x6)
507 c->x86_model += ((tfms >> 16) & 0xF) << 4;
508 c->x86_mask = tfms & 15;
509 + c->initial_apicid = (ebx >> 24) & 0xFF;
510 #ifdef CONFIG_X86_HT
511 - c->apicid = phys_pkg_id((ebx >> 24) & 0xFF, 0);
512 + c->apicid = phys_pkg_id(c->initial_apicid, 0);
513 + c->phys_proc_id = c->initial_apicid;
514 #else
515 - c->apicid = (ebx >> 24) & 0xFF;
516 + c->apicid = c->initial_apicid;
517 #endif
518 - if (c->x86_capability[0] & (1<<19))
519 + if (test_cpu_cap(c, X86_FEATURE_CLFLSH))
520 c->x86_clflush_size = ((ebx >> 8) & 0xff) * 8;
521 } else {
522 /* Have CPUID level 0 only - unheard of */
523 @@ -390,33 +392,30 @@ static void __cpuinit generic_identify(s
524
525 /* AMD-defined flags: level 0x80000001 */
526 xlvl = cpuid_eax(0x80000000);
527 - if ( (xlvl & 0xffff0000) == 0x80000000 ) {
528 - if ( xlvl >= 0x80000001 ) {
529 + if ((xlvl & 0xffff0000) == 0x80000000) {
530 + if (xlvl >= 0x80000001) {
531 c->x86_capability[1] = cpuid_edx(0x80000001);
532 c->x86_capability[6] = cpuid_ecx(0x80000001);
533 }
534 - if ( xlvl >= 0x80000004 )
535 + if (xlvl >= 0x80000004)
536 get_model_name(c); /* Default name */
537 }
538
539 init_scattered_cpuid_features(c);
540 }
541
542 -#ifdef CONFIG_X86_HT
543 - c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff;
544 -#endif
545 }
546
547 static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
548 {
549 - if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr ) {
550 + if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr) {
551 /* Disable processor serial number */
552 - unsigned long lo,hi;
553 - rdmsr(MSR_IA32_BBL_CR_CTL,lo,hi);
554 + unsigned long lo, hi;
555 + rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
556 lo |= 0x200000;
557 - wrmsr(MSR_IA32_BBL_CR_CTL,lo,hi);
558 + wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
559 printk(KERN_NOTICE "CPU serial number disabled.\n");
560 - clear_bit(X86_FEATURE_PN, c->x86_capability);
561 + clear_cpu_cap(c, X86_FEATURE_PN);
562
563 /* Disabling the serial number may affect the cpuid level */
564 c->cpuid_level = cpuid_eax(0);
565 @@ -451,9 +450,11 @@ void __cpuinit identify_cpu(struct cpuin
566 memset(&c->x86_capability, 0, sizeof c->x86_capability);
567
568 if (!have_cpuid_p()) {
569 - /* First of all, decide if this is a 486 or higher */
570 - /* It's a 486 if we can modify the AC flag */
571 - if ( flag_is_changeable_p(X86_EFLAGS_AC) )
572 + /*
573 + * First of all, decide if this is a 486 or higher
574 + * It's a 486 if we can modify the AC flag
575 + */
576 + if (flag_is_changeable_p(X86_EFLAGS_AC))
577 c->x86 = 4;
578 else
579 c->x86 = 3;
580 @@ -486,10 +487,10 @@ void __cpuinit identify_cpu(struct cpuin
581 */
582
583 /* If the model name is still unset, do table lookup. */
584 - if ( !c->x86_model_id[0] ) {
585 + if (!c->x86_model_id[0]) {
586 char *p;
587 p = table_lookup_model(c);
588 - if ( p )
589 + if (p)
590 strcpy(c->x86_model_id, p);
591 else
592 /* Last resort... */
593 @@ -503,9 +504,9 @@ void __cpuinit identify_cpu(struct cpuin
594 * common between the CPUs. The first time this routine gets
595 * executed, c == &boot_cpu_data.
596 */
597 - if ( c != &boot_cpu_data ) {
598 + if (c != &boot_cpu_data) {
599 /* AND the already accumulated flags with these */
600 - for ( i = 0 ; i < NCAPINTS ; i++ )
601 + for (i = 0 ; i < NCAPINTS ; i++)
602 boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
603 }
604
605 @@ -549,7 +550,7 @@ void __cpuinit detect_ht(struct cpuinfo_
606
607 if (smp_num_siblings == 1) {
608 printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
609 - } else if (smp_num_siblings > 1 ) {
610 + } else if (smp_num_siblings > 1) {
611
612 if (smp_num_siblings > NR_CPUS) {
613 printk(KERN_WARNING "CPU: Unsupported number of the "
614 @@ -559,7 +560,7 @@ void __cpuinit detect_ht(struct cpuinfo_
615 }
616
617 index_msb = get_count_order(smp_num_siblings);
618 - c->phys_proc_id = phys_pkg_id((ebx >> 24) & 0xFF, index_msb);
619 + c->phys_proc_id = phys_pkg_id(c->initial_apicid, index_msb);
620
621 printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
622 c->phys_proc_id);
623 @@ -570,7 +571,7 @@ void __cpuinit detect_ht(struct cpuinfo_
624
625 core_bits = get_count_order(c->x86_max_cores);
626
627 - c->cpu_core_id = phys_pkg_id((ebx >> 24) & 0xFF, index_msb) &
628 + c->cpu_core_id = phys_pkg_id(c->initial_apicid, index_msb) &
629 ((1 << core_bits) - 1);
630
631 if (c->x86_max_cores > 1)
632 @@ -604,7 +605,7 @@ void __cpuinit print_cpu_info(struct cpu
633 else
634 printk("%s", c->x86_model_id);
635
636 - if (c->x86_mask || c->cpuid_level >= 0)
637 + if (c->x86_mask || c->cpuid_level >= 0)
638 printk(" stepping %02x\n", c->x86_mask);
639 else
640 printk("\n");
641 @@ -623,24 +624,17 @@ __setup("clearcpuid=", setup_disablecpui
642
643 cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
644
645 -/* This is hacky. :)
646 - * We're emulating future behavior.
647 - * In the future, the cpu-specific init functions will be called implicitly
648 - * via the magic of initcalls.
649 - * They will insert themselves into the cpu_devs structure.
650 - * Then, when cpu_init() is called, we can just iterate over that array.
651 - */
652 void __init early_cpu_init(void)
653 {
654 - intel_cpu_init();
655 - cyrix_init_cpu();
656 - nsc_init_cpu();
657 - amd_init_cpu();
658 - centaur_init_cpu();
659 - transmeta_init_cpu();
660 - nexgen_init_cpu();
661 - umc_init_cpu();
662 + struct cpu_vendor_dev *cvdev;
663 +
664 + for (cvdev = __x86cpuvendor_start ;
665 + cvdev < __x86cpuvendor_end ;
666 + cvdev++)
667 + cpu_devs[cvdev->vendor] = cvdev->cpu_dev;
668 +
669 early_cpu_detect();
670 + validate_pat_support(&boot_cpu_data);
671 }
672
673 /* Make sure %fs is initialized properly in idle threads */
674 @@ -685,7 +679,7 @@ void __cpuinit cpu_init(void)
675 int cpu = smp_processor_id();
676 struct task_struct *curr = current;
677 #ifndef CONFIG_X86_NO_TSS
678 - struct tss_struct * t = &per_cpu(init_tss, cpu);
679 + struct tss_struct *t = &per_cpu(init_tss, cpu);
680 #endif
681 struct thread_struct *thread = &curr->thread;
682
683 @@ -738,7 +732,7 @@ void __cpuinit cpu_init(void)
684 mxcsr_feature_mask_init();
685 }
686
687 -#ifdef CONFIG_HOTPLUG_CPU
688 +#if defined(CONFIG_HOTPLUG_CPU) && !defined(CONFIG_XEN)
689 void __cpuinit cpu_uninit(void)
690 {
691 int cpu = raw_smp_processor_id();
692 --- sle11-2009-10-16.orig/arch/x86/kernel/cpu/mtrr/main-xen.c 2009-03-16 16:33:40.000000000 +0100
693 +++ sle11-2009-10-16/arch/x86/kernel/cpu/mtrr/main-xen.c 2009-03-16 16:38:05.000000000 +0100
694 @@ -35,6 +35,8 @@ struct mtrr_ops *mtrr_if = &generic_mtrr
695 unsigned int num_var_ranges;
696 unsigned int mtrr_usage_table[MAX_VAR_RANGES];
697
698 +static u64 tom2;
699 +
700 static void __init set_num_var_ranges(void)
701 {
702 struct xen_platform_op op;
703 @@ -162,8 +164,144 @@ mtrr_del(int reg, unsigned long base, un
704 EXPORT_SYMBOL(mtrr_add);
705 EXPORT_SYMBOL(mtrr_del);
706
707 +/*
708 + * Returns the effective MTRR type for the region
709 + * Error returns:
710 + * - 0xFE - when the range is "not entirely covered" by _any_ var range MTRR
711 + * - 0xFF - when MTRR is not enabled
712 + */
713 +u8 mtrr_type_lookup(u64 start, u64 end)
714 +{
715 + int i, error;
716 + u64 start_mfn, end_mfn, base_mfn, top_mfn;
717 + u8 prev_match, curr_match;
718 + struct xen_platform_op op;
719 +
720 + if (!is_initial_xendomain())
721 + return MTRR_TYPE_WRBACK;
722 +
723 + if (!num_var_ranges)
724 + return 0xFF;
725 +
726 + start_mfn = start >> PAGE_SHIFT;
727 + /* Make end inclusive end, instead of exclusive */
728 + end_mfn = --end >> PAGE_SHIFT;
729 +
730 + /* Look in fixed ranges. Just return the type as per start */
731 + if (start_mfn < 0x100) {
732 +#if 0//todo
733 + op.cmd = XENPF_read_memtype;
734 + op.u.read_memtype.reg = ???;
735 + error = HYPERVISOR_platform_op(&op);
736 + if (!error)
737 + return op.u.read_memtype.type;
738 +#endif
739 + return MTRR_TYPE_UNCACHABLE;
740 + }
741 +
742 + /*
743 + * Look in variable ranges
744 + * Look of multiple ranges matching this address and pick type
745 + * as per MTRR precedence
746 + */
747 + prev_match = 0xFF;
748 + for (i = 0; i < num_var_ranges; ++i) {
749 + op.cmd = XENPF_read_memtype;
750 + op.u.read_memtype.reg = i;
751 + error = HYPERVISOR_platform_op(&op);
752 +
753 + if (error || !op.u.read_memtype.nr_mfns)
754 + continue;
755 +
756 + base_mfn = op.u.read_memtype.mfn;
757 + top_mfn = base_mfn + op.u.read_memtype.nr_mfns - 1;
758 +
759 + if (base_mfn > end_mfn || start_mfn > top_mfn) {
760 + continue;
761 + }
762 +
763 + if (base_mfn > start_mfn || end_mfn > top_mfn) {
764 + return 0xFE;
765 + }
766 +
767 + curr_match = op.u.read_memtype.type;
768 + if (prev_match == 0xFF) {
769 + prev_match = curr_match;
770 + continue;
771 + }
772 +
773 + if (prev_match == MTRR_TYPE_UNCACHABLE ||
774 + curr_match == MTRR_TYPE_UNCACHABLE) {
775 + return MTRR_TYPE_UNCACHABLE;
776 + }
777 +
778 + if ((prev_match == MTRR_TYPE_WRBACK &&
779 + curr_match == MTRR_TYPE_WRTHROUGH) ||
780 + (prev_match == MTRR_TYPE_WRTHROUGH &&
781 + curr_match == MTRR_TYPE_WRBACK)) {
782 + prev_match = MTRR_TYPE_WRTHROUGH;
783 + curr_match = MTRR_TYPE_WRTHROUGH;
784 + }
785 +
786 + if (prev_match != curr_match) {
787 + return MTRR_TYPE_UNCACHABLE;
788 + }
789 + }
790 +
791 + if (tom2) {
792 + if (start >= (1ULL<<32) && (end < tom2))
793 + return MTRR_TYPE_WRBACK;
794 + }
795 +
796 + if (prev_match != 0xFF)
797 + return prev_match;
798 +
799 +#if 0//todo
800 + op.cmd = XENPF_read_def_memtype;
801 + error = HYPERVISOR_platform_op(&op);
802 + if (!error)
803 + return op.u.read_def_memtype.type;
804 +#endif
805 + return MTRR_TYPE_UNCACHABLE;
806 +}
807 +
808 +/*
809 + * Newer AMD K8s and later CPUs have a special magic MSR way to force WB
810 + * for memory >4GB. Check for that here.
811 + * Note this won't check if the MTRRs < 4GB where the magic bit doesn't
812 + * apply to are wrong, but so far we don't know of any such case in the wild.
813 + */
814 +#define Tom2Enabled (1U << 21)
815 +#define Tom2ForceMemTypeWB (1U << 22)
816 +
817 +int __init amd_special_default_mtrr(void)
818 +{
819 + u32 l, h;
820 +
821 + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
822 + return 0;
823 + if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11)
824 + return 0;
825 + /* In case some hypervisor doesn't pass SYSCFG through */
826 + if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0)
827 + return 0;
828 + /*
829 + * Memory between 4GB and top of mem is forced WB by this magic bit.
830 + * Reserved before K8RevF, but should be zero there.
831 + */
832 + if ((l & (Tom2Enabled | Tom2ForceMemTypeWB)) ==
833 + (Tom2Enabled | Tom2ForceMemTypeWB))
834 + return 1;
835 + return 0;
836 +}
837 +
838 void __init mtrr_bp_init(void)
839 {
840 + if (amd_special_default_mtrr()) {
841 + /* TOP_MEM2 */
842 + rdmsrl(MSR_K8_TOP_MEM2, tom2);
843 + tom2 &= 0xffffff8000000ULL;
844 + }
845 }
846
847 void mtrr_ap_init(void)
848 --- sle11-2009-10-16.orig/arch/x86/kernel/e820_32-xen.c 2009-03-16 16:33:40.000000000 +0100
849 +++ sle11-2009-10-16/arch/x86/kernel/e820_32-xen.c 2009-03-16 16:38:05.000000000 +0100
850 @@ -469,7 +469,7 @@ int __init sanitize_e820_map(struct e820
851 * thinkpad 560x, for example, does not cooperate with the memory
852 * detection code.)
853 */
854 -int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
855 +int __init copy_e820_map(struct e820entry *biosmap, int nr_map)
856 {
857 #ifndef CONFIG_XEN
858 /* Only one memory region (or negative)? Ignore it */
859 @@ -480,33 +480,17 @@ int __init copy_e820_map(struct e820entr
860 #endif
861
862 do {
863 - unsigned long long start = biosmap->addr;
864 - unsigned long long size = biosmap->size;
865 - unsigned long long end = start + size;
866 - unsigned long type = biosmap->type;
867 + u64 start = biosmap->addr;
868 + u64 size = biosmap->size;
869 + u64 end = start + size;
870 + u32 type = biosmap->type;
871
872 /* Overflow in 64 bits? Ignore the memory map. */
873 if (start > end)
874 return -1;
875
876 -#ifndef CONFIG_XEN
877 - /*
878 - * Some BIOSes claim RAM in the 640k - 1M region.
879 - * Not right. Fix it up.
880 - */
881 - if (type == E820_RAM) {
882 - if (start < 0x100000ULL && end > 0xA0000ULL) {
883 - if (start < 0xA0000ULL)
884 - add_memory_region(start, 0xA0000ULL-start, type);
885 - if (end <= 0x100000ULL)
886 - continue;
887 - start = 0x100000ULL;
888 - size = end - start;
889 - }
890 - }
891 -#endif
892 add_memory_region(start, size, type);
893 - } while (biosmap++,--nr_map);
894 + } while (biosmap++, --nr_map);
895
896 #ifdef CONFIG_XEN
897 if (is_initial_xendomain()) {
898 @@ -528,7 +512,7 @@ int __init copy_e820_map(struct e820entr
899 /*
900 * Find the highest page frame number we have available
901 */
902 -void __init find_max_pfn(void)
903 +void __init propagate_e820_map(void)
904 {
905 int i;
906
907 @@ -814,7 +798,7 @@ static int __init parse_memmap(char *arg
908 * size before original memory map is
909 * reset.
910 */
911 - find_max_pfn();
912 + propagate_e820_map();
913 saved_max_pfn = max_pfn;
914 #endif
915 e820.nr_map = 0;
916 --- sle11-2009-10-16.orig/arch/x86/kernel/e820_64-xen.c 2009-03-16 16:33:40.000000000 +0100
917 +++ sle11-2009-10-16/arch/x86/kernel/e820_64-xen.c 2009-03-16 16:38:05.000000000 +0100
918 @@ -40,11 +40,11 @@ struct e820map machine_e820;
919 unsigned long end_pfn;
920
921 /*
922 - * end_pfn only includes RAM, while end_pfn_map includes all e820 entries.
923 - * The direct mapping extends to end_pfn_map, so that we can directly access
924 + * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
925 + * The direct mapping extends to max_pfn_mapped, so that we can directly access
926 * apertures, ACPI and other tables without having to play with fixmaps.
927 */
928 -unsigned long end_pfn_map;
929 +unsigned long max_pfn_mapped;
930
931 /*
932 * Last pfn which the user wants to use.
933 @@ -63,8 +63,8 @@ struct early_res {
934 static struct early_res early_res[MAX_EARLY_RES] __initdata = {
935 #ifndef CONFIG_XEN
936 { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */
937 -#ifdef CONFIG_SMP
938 - { SMP_TRAMPOLINE_BASE, SMP_TRAMPOLINE_BASE + 2*PAGE_SIZE, "SMP_TRAMPOLINE" },
939 +#ifdef CONFIG_X86_TRAMPOLINE
940 + { TRAMPOLINE_BASE, TRAMPOLINE_BASE + 2 * PAGE_SIZE, "TRAMPOLINE" },
941 #endif
942 #endif
943 {}
944 @@ -89,19 +89,47 @@ void __init reserve_early(unsigned long
945 strncpy(r->name, name, sizeof(r->name) - 1);
946 }
947
948 -void __init early_res_to_bootmem(void)
949 +void __init free_early(unsigned long start, unsigned long end)
950 +{
951 + struct early_res *r;
952 + int i, j;
953 +
954 + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
955 + r = &early_res[i];
956 + if (start == r->start && end == r->end)
957 + break;
958 + }
959 + if (i >= MAX_EARLY_RES || !early_res[i].end)
960 + panic("free_early on not reserved area: %lx-%lx!", start, end);
961 +
962 + for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++)
963 + ;
964 +
965 + memmove(&early_res[i], &early_res[i + 1],
966 + (j - 1 - i) * sizeof(struct early_res));
967 +
968 + early_res[j - 1].end = 0;
969 +}
970 +
971 +void __init early_res_to_bootmem(unsigned long start, unsigned long end)
972 {
973 int i;
974 + unsigned long final_start, final_end;
975 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
976 struct early_res *r = &early_res[i];
977 - printk(KERN_INFO "early res: %d [%lx-%lx] %s\n", i,
978 - r->start, r->end - 1, r->name);
979 - reserve_bootmem_generic(r->start, r->end - r->start);
980 + final_start = max(start, r->start);
981 + final_end = min(end, r->end);
982 + if (final_start >= final_end)
983 + continue;
984 + printk(KERN_INFO " early res: %d [%lx-%lx] %s\n", i,
985 + final_start, final_end - 1, r->name);
986 + reserve_bootmem_generic(final_start, final_end - final_start);
987 }
988 }
989
990 /* Check for already reserved areas */
991 -static inline int bad_addr(unsigned long *addrp, unsigned long size)
992 +static inline int __init
993 +bad_addr(unsigned long *addrp, unsigned long size, unsigned long align)
994 {
995 int i;
996 unsigned long addr = *addrp, last;
997 @@ -111,7 +139,7 @@ again:
998 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
999 struct early_res *r = &early_res[i];
1000 if (last >= r->start && addr < r->end) {
1001 - *addrp = addr = r->end;
1002 + *addrp = addr = round_up(r->end, align);
1003 changed = 1;
1004 goto again;
1005 }
1006 @@ -119,6 +147,40 @@ again:
1007 return changed;
1008 }
1009
1010 +/* Check for already reserved areas */
1011 +static inline int __init
1012 +bad_addr_size(unsigned long *addrp, unsigned long *sizep, unsigned long align)
1013 +{
1014 + int i;
1015 + unsigned long addr = *addrp, last;
1016 + unsigned long size = *sizep;
1017 + int changed = 0;
1018 +again:
1019 + last = addr + size;
1020 + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
1021 + struct early_res *r = &early_res[i];
1022 + if (last > r->start && addr < r->start) {
1023 + size = r->start - addr;
1024 + changed = 1;
1025 + goto again;
1026 + }
1027 + if (last > r->end && addr < r->end) {
1028 + addr = round_up(r->end, align);
1029 + size = last - addr;
1030 + changed = 1;
1031 + goto again;
1032 + }
1033 + if (last <= r->end && addr >= r->start) {
1034 + (*sizep)++;
1035 + return 0;
1036 + }
1037 + }
1038 + if (changed) {
1039 + *addrp = addr;
1040 + *sizep = size;
1041 + }
1042 + return changed;
1043 +}
1044 /*
1045 * This function checks if any part of the range <start,end> is mapped
1046 * with type.
1047 @@ -194,26 +256,27 @@ int __init e820_all_mapped(unsigned long
1048 * Find a free area with specified alignment in a specific range.
1049 */
1050 unsigned long __init find_e820_area(unsigned long start, unsigned long end,
1051 - unsigned size, unsigned long align)
1052 + unsigned long size, unsigned long align)
1053 {
1054 int i;
1055 - unsigned long mask = ~(align - 1);
1056
1057 for (i = 0; i < e820.nr_map; i++) {
1058 struct e820entry *ei = &e820.map[i];
1059 - unsigned long addr = ei->addr, last;
1060 + unsigned long addr, last;
1061 + unsigned long ei_last;
1062
1063 if (ei->type != E820_RAM)
1064 continue;
1065 + addr = round_up(ei->addr, align);
1066 + ei_last = ei->addr + ei->size;
1067 if (addr < start)
1068 - addr = start;
1069 - if (addr > ei->addr + ei->size)
1070 + addr = round_up(start, align);
1071 + if (addr >= ei_last)
1072 continue;
1073 - while (bad_addr(&addr, size) && addr+size <= ei->addr+ei->size)
1074 + while (bad_addr(&addr, size, align) && addr+size <= ei_last)
1075 ;
1076 - addr = (addr + align - 1) & mask;
1077 last = addr + size;
1078 - if (last > ei->addr + ei->size)
1079 + if (last > ei_last)
1080 continue;
1081 if (last > end)
1082 continue;
1083 @@ -223,6 +286,40 @@ unsigned long __init find_e820_area(unsi
1084 }
1085
1086 /*
1087 + * Find next free range after *start
1088 + */
1089 +unsigned long __init find_e820_area_size(unsigned long start,
1090 + unsigned long *sizep,
1091 + unsigned long align)
1092 +{
1093 + int i;
1094 +
1095 + for (i = 0; i < e820.nr_map; i++) {
1096 + struct e820entry *ei = &e820.map[i];
1097 + unsigned long addr, last;
1098 + unsigned long ei_last;
1099 +
1100 + if (ei->type != E820_RAM)
1101 + continue;
1102 + addr = round_up(ei->addr, align);
1103 + ei_last = ei->addr + ei->size;
1104 + if (addr < start)
1105 + addr = round_up(start, align);
1106 + if (addr >= ei_last)
1107 + continue;
1108 + *sizep = ei_last - addr;
1109 + while (bad_addr_size(&addr, sizep, align) &&
1110 + addr + *sizep <= ei_last)
1111 + ;
1112 + last = addr + *sizep;
1113 + if (last > ei_last)
1114 + continue;
1115 + return addr;
1116 + }
1117 + return -1UL;
1118 +
1119 +}
1120 +/*
1121 * Find the highest page frame number we have available
1122 */
1123 unsigned long __init e820_end_of_ram(void)
1124 @@ -231,31 +328,29 @@ unsigned long __init e820_end_of_ram(voi
1125
1126 end_pfn = find_max_pfn_with_active_regions();
1127
1128 - if (end_pfn > end_pfn_map)
1129 - end_pfn_map = end_pfn;
1130 - if (end_pfn_map > MAXMEM>>PAGE_SHIFT)
1131 - end_pfn_map = MAXMEM>>PAGE_SHIFT;
1132 + if (end_pfn > max_pfn_mapped)
1133 + max_pfn_mapped = end_pfn;
1134 + if (max_pfn_mapped > MAXMEM>>PAGE_SHIFT)
1135 + max_pfn_mapped = MAXMEM>>PAGE_SHIFT;
1136 if (end_pfn > end_user_pfn)
1137 end_pfn = end_user_pfn;
1138 - if (end_pfn > end_pfn_map)
1139 - end_pfn = end_pfn_map;
1140 + if (end_pfn > max_pfn_mapped)
1141 + end_pfn = max_pfn_mapped;
1142
1143 - printk(KERN_INFO "end_pfn_map = %lu\n", end_pfn_map);
1144 + printk(KERN_INFO "max_pfn_mapped = %lu\n", max_pfn_mapped);
1145 return end_pfn;
1146 }
1147
1148 /*
1149 * Mark e820 reserved areas as busy for the resource manager.
1150 */
1151 -void __init e820_reserve_resources(struct e820entry *e820, int nr_map,
1152 - struct resource *code_resource,
1153 - struct resource *data_resource,
1154 - struct resource *bss_resource)
1155 +void __init e820_reserve_resources(struct e820entry *e820, int nr_map)
1156 {
1157 int i;
1158 + struct resource *res;
1159 +
1160 + res = alloc_bootmem_low(sizeof(struct resource) * nr_map);
1161 for (i = 0; i < nr_map; i++) {
1162 - struct resource *res;
1163 - res = alloc_bootmem_low(sizeof(struct resource));
1164 switch (e820[i].type) {
1165 case E820_RAM: res->name = "System RAM"; break;
1166 case E820_ACPI: res->name = "ACPI Tables"; break;
1167 @@ -265,26 +360,8 @@ void __init e820_reserve_resources(struc
1168 res->start = e820[i].addr;
1169 res->end = res->start + e820[i].size - 1;
1170 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
1171 - request_resource(&iomem_resource, res);
1172 - if (e820[i].type == E820_RAM) {
1173 - /*
1174 - * We don't know which RAM region contains kernel data,
1175 - * so we try it repeatedly and let the resource manager
1176 - * test it.
1177 - */
1178 -#ifndef CONFIG_XEN
1179 - request_resource(res, code_resource);
1180 - request_resource(res, data_resource);
1181 - request_resource(res, bss_resource);
1182 -#endif
1183 -#ifdef CONFIG_KEXEC
1184 - if (crashk_res.start != crashk_res.end)
1185 - request_resource(res, &crashk_res);
1186 -#ifdef CONFIG_XEN
1187 - xen_machine_kexec_register_resources(res);
1188 -#endif
1189 -#endif
1190 - }
1191 + insert_resource(&iomem_resource, res);
1192 + res++;
1193 }
1194 }
1195
1196 @@ -338,9 +415,9 @@ static int __init e820_find_active_regio
1197 if (*ei_startpfn >= *ei_endpfn)
1198 return 0;
1199
1200 - /* Check if end_pfn_map should be updated */
1201 - if (ei->type != E820_RAM && *ei_endpfn > end_pfn_map)
1202 - end_pfn_map = *ei_endpfn;
1203 + /* Check if max_pfn_mapped should be updated */
1204 + if (ei->type != E820_RAM && *ei_endpfn > max_pfn_mapped)
1205 + max_pfn_mapped = *ei_endpfn;
1206
1207 /* Skip if map is outside the node */
1208 if (ei->type != E820_RAM || *ei_endpfn <= start_pfn ||
1209 @@ -667,10 +744,10 @@ static int __init copy_e820_map(struct e
1210 #endif
1211
1212 do {
1213 - unsigned long start = biosmap->addr;
1214 - unsigned long size = biosmap->size;
1215 - unsigned long end = start + size;
1216 - unsigned long type = biosmap->type;
1217 + u64 start = biosmap->addr;
1218 + u64 size = biosmap->size;
1219 + u64 end = start + size;
1220 + u32 type = biosmap->type;
1221
1222 /* Overflow in 64 bits? Ignore the memory map. */
1223 if (start > end)
1224 @@ -801,7 +878,7 @@ static int __init parse_memmap_opt(char
1225 saved_max_pfn = e820_end_of_ram();
1226 remove_all_active_ranges();
1227 #endif
1228 - end_pfn_map = 0;
1229 + max_pfn_mapped = 0;
1230 e820.nr_map = 0;
1231 userdef = 1;
1232 return 0;
1233 --- sle11-2009-10-16.orig/arch/x86/kernel/early_printk-xen.c 2009-03-16 16:33:40.000000000 +0100
1234 +++ sle11-2009-10-16/arch/x86/kernel/early_printk-xen.c 2009-09-24 10:29:16.000000000 +0200
1235 @@ -13,7 +13,7 @@
1236 #define VGABASE (__ISA_IO_base + 0xb8000)
1237
1238 static int max_ypos = 25, max_xpos = 80;
1239 -static int current_ypos = 25, current_xpos = 0;
1240 +static int current_ypos = 25, current_xpos;
1241
1242 static void early_vga_write(struct console *con, const char *str, unsigned n)
1243 {
1244 @@ -108,12 +108,12 @@ static __init void early_serial_init(cha
1245
1246 if (*s) {
1247 unsigned port;
1248 - if (!strncmp(s,"0x",2)) {
1249 + if (!strncmp(s, "0x", 2)) {
1250 early_serial_base = simple_strtoul(s, &e, 16);
1251 } else {
1252 static int bases[] = { 0x3f8, 0x2f8 };
1253
1254 - if (!strncmp(s,"ttyS",4))
1255 + if (!strncmp(s, "ttyS", 4))
1256 s += 4;
1257 port = simple_strtoul(s, &e, 10);
1258 if (port > 1 || s == e)
1259 @@ -223,7 +223,7 @@ static struct console simnow_console = {
1260
1261 /* Direct interface for emergencies */
1262 static struct console *early_console = &early_vga_console;
1263 -static int early_console_initialized = 0;
1264 +static int early_console_initialized;
1265
1266 void early_printk(const char *fmt, ...)
1267 {
1268 @@ -231,9 +231,9 @@ void early_printk(const char *fmt, ...)
1269 int n;
1270 va_list ap;
1271
1272 - va_start(ap,fmt);
1273 - n = vscnprintf(buf,512,fmt,ap);
1274 - early_console->write(early_console,buf,n);
1275 + va_start(ap, fmt);
1276 + n = vscnprintf(buf, 512, fmt, ap);
1277 + early_console->write(early_console, buf, n);
1278 va_end(ap);
1279 }
1280
1281 @@ -259,16 +259,16 @@ static int __init setup_early_printk(cha
1282 early_console = &early_serial_console;
1283 } else if (!strncmp(buf, "vga", 3)) {
1284 #ifndef CONFIG_XEN
1285 - && boot_params.screen_info.orig_video_isVGA == 1) {
1286 + && boot_params.screen_info.orig_video_isVGA == 1) {
1287 max_xpos = boot_params.screen_info.orig_video_cols;
1288 max_ypos = boot_params.screen_info.orig_video_lines;
1289 current_ypos = boot_params.screen_info.orig_y;
1290 #endif
1291 early_console = &early_vga_console;
1292 - } else if (!strncmp(buf, "simnow", 6)) {
1293 - simnow_init(buf + 6);
1294 - early_console = &simnow_console;
1295 - keep_early = 1;
1296 + } else if (!strncmp(buf, "simnow", 6)) {
1297 + simnow_init(buf + 6);
1298 + early_console = &simnow_console;
1299 + keep_early = 1;
1300 #ifdef CONFIG_XEN
1301 } else if (!strncmp(buf, "xen", 3)) {
1302 early_console = &xenboot_console;
1303 --- sle11-2009-10-16.orig/arch/x86/kernel/entry_32-xen.S 2009-05-14 11:18:32.000000000 +0200
1304 +++ sle11-2009-10-16/arch/x86/kernel/entry_32-xen.S 2009-03-16 16:38:05.000000000 +0100
1305 @@ -1,5 +1,4 @@
1306 /*
1307 - * linux/arch/i386/entry.S
1308 *
1309 * Copyright (C) 1991, 1992 Linus Torvalds
1310 */
1311 @@ -51,6 +50,7 @@
1312 #include <asm/desc.h>
1313 #include <asm/percpu.h>
1314 #include <asm/dwarf2.h>
1315 +#include <asm/processor-flags.h>
1316 #include "irq_vectors.h"
1317 #include <xen/interface/xen.h>
1318
1319 @@ -69,12 +69,6 @@
1320
1321 #define nr_syscalls ((syscall_table_size)/4)
1322
1323 -CF_MASK = 0x00000001
1324 -TF_MASK = 0x00000100
1325 -IF_MASK = 0x00000200
1326 -DF_MASK = 0x00000400
1327 -NT_MASK = 0x00004000
1328 -VM_MASK = 0x00020000
1329 /* Pseudo-eflags. */
1330 NMI_MASK = 0x80000000
1331
1332 @@ -87,7 +81,7 @@ NMI_MASK = 0x80000000
1333
1334 .macro TRACE_IRQS_IRET
1335 #ifdef CONFIG_TRACE_IRQFLAGS
1336 - testl $IF_MASK,PT_EFLAGS(%esp) # interrupts off?
1337 + testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off?
1338 jz 1f
1339 TRACE_IRQS_ON
1340 1:
1341 @@ -249,7 +243,7 @@ ret_from_intr:
1342 check_userspace:
1343 movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS
1344 movb PT_CS(%esp), %al
1345 - andl $(VM_MASK | SEGMENT_RPL_MASK), %eax
1346 + andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax
1347 cmpl $USER_RPL, %eax
1348 jb resume_kernel # not returning to v8086 or userspace
1349
1350 @@ -258,6 +252,7 @@ ENTRY(resume_userspace)
1351 DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
1352 # setting need_resched or sigpending
1353 # between sampling and the iret
1354 + TRACE_IRQS_OFF
1355 movl TI_flags(%ebp), %ecx
1356 andl $_TIF_WORK_MASK, %ecx # is there any work to be done on
1357 # int/exception return?
1358 @@ -274,7 +269,7 @@ need_resched:
1359 movl TI_flags(%ebp), %ecx # need_resched set ?
1360 testb $_TIF_NEED_RESCHED, %cl
1361 jz restore_all
1362 - testl $IF_MASK,PT_EFLAGS(%esp) # interrupts off (exception path) ?
1363 + testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off (exception path) ?
1364 jz restore_all
1365 call preempt_schedule_irq
1366 jmp need_resched
1367 @@ -299,10 +294,10 @@ ENTRY(ia32_sysenter_target)
1368 movl SYSENTER_stack_sp0(%esp),%esp
1369 sysenter_past_esp:
1370 /*
1371 - * No need to follow this irqs on/off section: the syscall
1372 - * disabled irqs and here we enable it straight after entry:
1373 + * Interrupts are disabled here, but we can't trace it until
1374 + * enough kernel state to call TRACE_IRQS_OFF can be called - but
1375 + * we immediately enable interrupts at that point anyway.
1376 */
1377 - ENABLE_INTERRUPTS(CLBR_NONE)
1378 pushl $(__USER_DS)
1379 CFI_ADJUST_CFA_OFFSET 4
1380 /*CFI_REL_OFFSET ss, 0*/
1381 @@ -310,6 +305,7 @@ sysenter_past_esp:
1382 CFI_ADJUST_CFA_OFFSET 4
1383 CFI_REL_OFFSET esp, 0
1384 pushfl
1385 + orl $X86_EFLAGS_IF, (%esp)
1386 CFI_ADJUST_CFA_OFFSET 4
1387 pushl $(__USER_CS)
1388 CFI_ADJUST_CFA_OFFSET 4
1389 @@ -323,6 +319,11 @@ sysenter_past_esp:
1390 CFI_ADJUST_CFA_OFFSET 4
1391 CFI_REL_OFFSET eip, 0
1392
1393 + pushl %eax
1394 + CFI_ADJUST_CFA_OFFSET 4
1395 + SAVE_ALL
1396 + ENABLE_INTERRUPTS(CLBR_NONE)
1397 +
1398 /*
1399 * Load the potential sixth argument from user stack.
1400 * Careful about security.
1401 @@ -330,14 +331,12 @@ sysenter_past_esp:
1402 cmpl $__PAGE_OFFSET-3,%ebp
1403 jae syscall_fault
1404 1: movl (%ebp),%ebp
1405 + movl %ebp,PT_EBP(%esp)
1406 .section __ex_table,"a"
1407 .align 4
1408 .long 1b,syscall_fault
1409 .previous
1410
1411 - pushl %eax
1412 - CFI_ADJUST_CFA_OFFSET 4
1413 - SAVE_ALL
1414 GET_THREAD_INFO(%ebp)
1415 test_tif %ebp
1416 jnz syscall_trace_entry
1417 @@ -414,7 +413,7 @@ syscall_exit:
1418 # setting need_resched or sigpending
1419 # between sampling and the iret
1420 TRACE_IRQS_OFF
1421 - testl $TF_MASK,PT_EFLAGS(%esp) # If tracing set singlestep flag on exit
1422 + testl $X86_EFLAGS_TF,PT_EFLAGS(%esp) # If tracing set singlestep flag on exit
1423 jz no_singlestep
1424 orl $_TIF_SINGLESTEP,TI_flags(%ebp)
1425 no_singlestep:
1426 @@ -430,7 +429,7 @@ restore_all:
1427 # See comments in process.c:copy_thread() for details.
1428 movb PT_OLDSS(%esp), %ah
1429 movb PT_CS(%esp), %al
1430 - andl $(VM_MASK | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
1431 + andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
1432 cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
1433 CFI_REMEMBER_STATE
1434 je ldt_ss # returning to user-space with LDT SS
1435 @@ -438,7 +437,7 @@ restore_nocheck:
1436 #else
1437 restore_nocheck:
1438 movl PT_EFLAGS(%esp), %eax
1439 - testl $(VM_MASK|NMI_MASK), %eax
1440 + testl $(X86_EFLAGS_VM|NMI_MASK), %eax
1441 CFI_REMEMBER_STATE
1442 jnz hypervisor_iret
1443 shr $9, %eax # EAX[0] == IRET_EFLAGS.IF
1444 @@ -456,7 +455,7 @@ restore_nocheck_notrace:
1445 irq_return:
1446 INTERRUPT_RETURN
1447 .section .fixup,"ax"
1448 -iret_exc:
1449 +ENTRY(iret_exc)
1450 pushl $0 # no error code
1451 pushl $do_iret_error
1452 jmp error_code
1453 @@ -560,7 +559,7 @@ work_resched:
1454 work_notifysig: # deal with pending signals and
1455 # notify-resume requests
1456 #ifdef CONFIG_VM86
1457 - testl $VM_MASK, PT_EFLAGS(%esp)
1458 + testl $X86_EFLAGS_VM, PT_EFLAGS(%esp)
1459 movl %esp, %eax
1460 jne work_notifysig_v86 # returning to kernel-space or
1461 # vm86-space
1462 @@ -617,9 +616,6 @@ END(syscall_exit_work)
1463
1464 RING0_INT_FRAME # can't unwind into user space anyway
1465 syscall_fault:
1466 - pushl %eax # save orig_eax
1467 - CFI_ADJUST_CFA_OFFSET 4
1468 - SAVE_ALL
1469 GET_THREAD_INFO(%ebp)
1470 movl $-EFAULT,PT_EAX(%esp)
1471 jmp resume_userspace
1472 --- sle11-2009-10-16.orig/arch/x86/kernel/entry_64-xen.S 2009-03-16 16:33:40.000000000 +0100
1473 +++ sle11-2009-10-16/arch/x86/kernel/entry_64-xen.S 2009-03-16 16:38:05.000000000 +0100
1474 @@ -338,19 +338,17 @@ badsys:
1475 /* Do syscall tracing */
1476 tracesys:
1477 SAVE_REST
1478 - movq $-ENOSYS,RAX(%rsp)
1479 + movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
1480 FIXUP_TOP_OF_STACK %rdi
1481 movq %rsp,%rdi
1482 call syscall_trace_enter
1483 LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */
1484 RESTORE_REST
1485 cmpq $__NR_syscall_max,%rax
1486 - movq $-ENOSYS,%rcx
1487 - cmova %rcx,%rax
1488 - ja 1f
1489 + ja int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */
1490 movq %r10,%rcx /* fixup for C */
1491 call *sys_call_table(,%rax,8)
1492 -1: movq %rax,RAX-ARGOFFSET(%rsp)
1493 + movq %rax,RAX-ARGOFFSET(%rsp)
1494 /* Use IRET because user could have changed frame */
1495
1496 /*
1497 --- sle11-2009-10-16.orig/arch/x86/kernel/genapic_64-xen.c 2009-03-16 16:33:40.000000000 +0100
1498 +++ sle11-2009-10-16/arch/x86/kernel/genapic_64-xen.c 2009-03-16 16:38:05.000000000 +0100
1499 @@ -15,6 +15,7 @@
1500 #include <linux/kernel.h>
1501 #include <linux/ctype.h>
1502 #include <linux/init.h>
1503 +#include <linux/hardirq.h>
1504
1505 #include <asm/smp.h>
1506 #include <asm/ipi.h>
1507 @@ -24,17 +25,12 @@
1508 #include <acpi/acpi_bus.h>
1509 #endif
1510
1511 -/* which logical CPU number maps to which CPU (physical APIC ID) */
1512 #ifndef CONFIG_XEN
1513 -u16 x86_cpu_to_apicid_init[NR_CPUS] __initdata
1514 - = { [0 ... NR_CPUS-1] = BAD_APICID };
1515 -void *x86_cpu_to_apicid_early_ptr;
1516 -#endif
1517 -DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID;
1518 -EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid);
1519 +DEFINE_PER_CPU(int, x2apic_extra_bits);
1520
1521 -#ifndef CONFIG_XEN
1522 struct genapic __read_mostly *genapic = &apic_flat;
1523 +
1524 +static enum uv_system_type uv_system_type;
1525 #else
1526 extern struct genapic apic_xen;
1527 struct genapic __read_mostly *genapic = &apic_xen;
1528 @@ -47,6 +43,9 @@ struct genapic __read_mostly *genapic =
1529 void __init setup_apic_routing(void)
1530 {
1531 #ifndef CONFIG_XEN
1532 + if (uv_system_type == UV_NON_UNIQUE_APIC)
1533 + genapic = &apic_x2apic_uv_x;
1534 + else
1535 #ifdef CONFIG_ACPI
1536 /*
1537 * Quirk: some x86_64 machines can only use physical APIC mode
1538 @@ -59,7 +58,7 @@ void __init setup_apic_routing(void)
1539 else
1540 #endif
1541
1542 - if (cpus_weight(cpu_possible_map) <= 8)
1543 + if (num_possible_cpus() <= 8)
1544 genapic = &apic_flat;
1545 else
1546 genapic = &apic_physflat;
1547 @@ -85,3 +84,41 @@ void send_IPI_self(int vector)
1548 xen_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
1549 #endif
1550 }
1551 +
1552 +int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
1553 +{
1554 +#ifndef CONFIG_XEN
1555 + if (!strcmp(oem_id, "SGI")) {
1556 + if (!strcmp(oem_table_id, "UVL"))
1557 + uv_system_type = UV_LEGACY_APIC;
1558 + else if (!strcmp(oem_table_id, "UVX"))
1559 + uv_system_type = UV_X2APIC;
1560 + else if (!strcmp(oem_table_id, "UVH"))
1561 + uv_system_type = UV_NON_UNIQUE_APIC;
1562 + }
1563 +#endif
1564 + return 0;
1565 +}
1566 +
1567 +#ifndef CONFIG_XEN
1568 +unsigned int read_apic_id(void)
1569 +{
1570 + unsigned int id;
1571 +
1572 + WARN_ON(preemptible() && num_online_cpus() > 1);
1573 + id = apic_read(APIC_ID);
1574 + if (uv_system_type >= UV_X2APIC)
1575 + id |= __get_cpu_var(x2apic_extra_bits);
1576 + return id;
1577 +}
1578 +
1579 +enum uv_system_type get_uv_system_type(void)
1580 +{
1581 + return uv_system_type;
1582 +}
1583 +
1584 +int is_uv_system(void)
1585 +{
1586 + return uv_system_type != UV_NONE;
1587 +}
1588 +#endif
1589 --- sle11-2009-10-16.orig/arch/x86/kernel/genapic_xen_64.c 2008-12-15 11:27:22.000000000 +0100
1590 +++ sle11-2009-10-16/arch/x86/kernel/genapic_xen_64.c 2009-03-16 16:38:05.000000000 +0100
1591 @@ -72,9 +72,7 @@ static cpumask_t xen_target_cpus(void)
1592
1593 static cpumask_t xen_vector_allocation_domain(int cpu)
1594 {
1595 - cpumask_t domain = CPU_MASK_NONE;
1596 - cpu_set(cpu, domain);
1597 - return domain;
1598 + return cpumask_of_cpu(cpu);
1599 }
1600
1601 /*
1602 --- sle11-2009-10-16.orig/arch/x86/kernel/head64-xen.c 2009-03-16 16:33:40.000000000 +0100
1603 +++ sle11-2009-10-16/arch/x86/kernel/head64-xen.c 2009-03-16 16:38:05.000000000 +0100
1604 @@ -17,6 +17,7 @@
1605 #include <linux/string.h>
1606 #include <linux/percpu.h>
1607 #include <linux/start_kernel.h>
1608 +#include <linux/io.h>
1609 #include <linux/module.h>
1610
1611 #include <asm/processor.h>
1612 @@ -29,6 +30,7 @@
1613 #include <asm/sections.h>
1614 #include <asm/kdebug.h>
1615 #include <asm/e820.h>
1616 +#include <asm/bios_ebda.h>
1617
1618 unsigned long start_pfn;
1619
1620 @@ -75,34 +77,75 @@ EXPORT_SYMBOL(machine_to_phys_mapping);
1621 unsigned int machine_to_phys_order;
1622 EXPORT_SYMBOL(machine_to_phys_order);
1623
1624 -#define EBDA_ADDR_POINTER 0x40E
1625 +#define BIOS_LOWMEM_KILOBYTES 0x413
1626
1627 -static __init void reserve_ebda(void)
1628 +/*
1629 + * The BIOS places the EBDA/XBDA at the top of conventional
1630 + * memory, and usually decreases the reported amount of
1631 + * conventional memory (int 0x12) too. This also contains a
1632 + * workaround for Dell systems that neglect to reserve EBDA.
1633 + * The same workaround also avoids a problem with the AMD768MPX
1634 + * chipset: reserve a page before VGA to prevent PCI prefetch
1635 + * into it (errata #56). Usually the page is reserved anyways,
1636 + * unless you have no PS/2 mouse plugged in.
1637 + */
1638 +static void __init reserve_ebda_region(void)
1639 {
1640 #ifndef CONFIG_XEN
1641 - unsigned ebda_addr, ebda_size;
1642 + unsigned int lowmem, ebda_addr;
1643
1644 - /*
1645 - * there is a real-mode segmented pointer pointing to the
1646 - * 4K EBDA area at 0x40E
1647 - */
1648 - ebda_addr = *(unsigned short *)__va(EBDA_ADDR_POINTER);
1649 - ebda_addr <<= 4;
1650 -
1651 - if (!ebda_addr)
1652 + /* To determine the position of the EBDA and the */
1653 + /* end of conventional memory, we need to look at */
1654 + /* the BIOS data area. In a paravirtual environment */
1655 + /* that area is absent. We'll just have to assume */
1656 + /* that the paravirt case can handle memory setup */
1657 + /* correctly, without our help. */
1658 + if (paravirt_enabled())
1659 return;
1660
1661 - ebda_size = *(unsigned short *)__va(ebda_addr);
1662 + /* end of low (conventional) memory */
1663 + lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
1664 + lowmem <<= 10;
1665 +
1666 + /* start of EBDA area */
1667 + ebda_addr = get_bios_ebda();
1668 +
1669 + /* Fixup: bios puts an EBDA in the top 64K segment */
1670 + /* of conventional memory, but does not adjust lowmem. */
1671 + if ((lowmem - ebda_addr) <= 0x10000)
1672 + lowmem = ebda_addr;
1673 +
1674 + /* Fixup: bios does not report an EBDA at all. */
1675 + /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
1676 + if ((ebda_addr == 0) && (lowmem >= 0x9f000))
1677 + lowmem = 0x9f000;
1678 +
1679 + /* Paranoia: should never happen, but... */
1680 + if ((lowmem == 0) || (lowmem >= 0x100000))
1681 + lowmem = 0x9f000;
1682
1683 - /* Round EBDA up to pages */
1684 - if (ebda_size == 0)
1685 - ebda_size = 1;
1686 - ebda_size <<= 10;
1687 - ebda_size = round_up(ebda_size + (ebda_addr & ~PAGE_MASK), PAGE_SIZE);
1688 - if (ebda_size > 64*1024)
1689 - ebda_size = 64*1024;
1690 + /* reserve all memory between lowmem and the 1MB mark */
1691 + reserve_early(lowmem, 0x100000, "BIOS reserved");
1692 +#endif
1693 +}
1694
1695 - reserve_early(ebda_addr, ebda_addr + ebda_size, "EBDA");
1696 +static void __init reserve_setup_data(void)
1697 +{
1698 +#ifndef CONFIG_XEN
1699 + struct setup_data *data;
1700 + unsigned long pa_data;
1701 + char buf[32];
1702 +
1703 + if (boot_params.hdr.version < 0x0209)
1704 + return;
1705 + pa_data = boot_params.hdr.setup_data;
1706 + while (pa_data) {
1707 + data = early_ioremap(pa_data, sizeof(*data));
1708 + sprintf(buf, "setup data %x", data->type);
1709 + reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf);
1710 + pa_data = data->next;
1711 + early_iounmap(data, sizeof(*data));
1712 + }
1713 #endif
1714 }
1715
1716 @@ -112,6 +155,19 @@ void __init x86_64_start_kernel(char * r
1717 unsigned long machine_to_phys_nr_ents;
1718 int i;
1719
1720 + /*
1721 + * Build-time sanity checks on the kernel image and module
1722 + * area mappings. (these are purely build-time and produce no code)
1723 + */
1724 + BUILD_BUG_ON(MODULES_VADDR < KERNEL_IMAGE_START);
1725 + BUILD_BUG_ON(MODULES_VADDR-KERNEL_IMAGE_START < KERNEL_IMAGE_SIZE);
1726 + BUILD_BUG_ON(MODULES_LEN + KERNEL_IMAGE_SIZE > 2*PUD_SIZE);
1727 + BUILD_BUG_ON((KERNEL_IMAGE_START & ~PMD_MASK) != 0);
1728 + BUILD_BUG_ON((MODULES_VADDR & ~PMD_MASK) != 0);
1729 + BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
1730 + BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
1731 + (__START_KERNEL & PGDIR_MASK)));
1732 +
1733 xen_setup_features();
1734
1735 xen_start_info = (struct start_info *)real_mode_data;
1736 @@ -140,7 +196,7 @@ void __init x86_64_start_kernel(char * r
1737 /* Cleanup the over mapped high alias */
1738 cleanup_highmap();
1739
1740 - for (i = 0; i < IDT_ENTRIES; i++) {
1741 + for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) {
1742 #ifdef CONFIG_EARLY_PRINTK
1743 set_intr_gate(i, &early_idt_handlers[i]);
1744 #else
1745 @@ -163,7 +219,8 @@ void __init x86_64_start_kernel(char * r
1746 reserve_early(round_up(__pa_symbol(&_end), PAGE_SIZE),
1747 start_pfn << PAGE_SHIFT, "Xen provided");
1748
1749 - reserve_ebda();
1750 + reserve_ebda_region();
1751 + reserve_setup_data();
1752
1753 /*
1754 * At this point everything still needed from the boot loader
1755 --- sle11-2009-10-16.orig/arch/x86/kernel/head_32-xen.S 2009-03-16 16:33:40.000000000 +0100
1756 +++ sle11-2009-10-16/arch/x86/kernel/head_32-xen.S 2009-03-16 16:38:05.000000000 +0100
1757 @@ -69,7 +69,7 @@ ENTRY(startup_32)
1758 cld # gcc2 wants the direction flag cleared at all times
1759
1760 pushl $0 # fake return address for unwinder
1761 - jmp start_kernel
1762 + jmp i386_start_kernel
1763
1764 #define HYPERCALL_PAGE_OFFSET 0x1000
1765 .org HYPERCALL_PAGE_OFFSET
1766 --- sle11-2009-10-16.orig/arch/x86/kernel/init_task-xen.c 2009-03-16 16:33:40.000000000 +0100
1767 +++ sle11-2009-10-16/arch/x86/kernel/init_task-xen.c 2009-03-16 16:38:05.000000000 +0100
1768 @@ -11,7 +11,6 @@
1769 #include <asm/desc.h>
1770
1771 static struct fs_struct init_fs = INIT_FS;
1772 -static struct files_struct init_files = INIT_FILES;
1773 static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
1774 static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
1775 #ifdef CONFIG_X86_XEN
1776 --- sle11-2009-10-16.orig/arch/x86/kernel/io_apic_32-xen.c 2009-03-16 16:33:40.000000000 +0100
1777 +++ sle11-2009-10-16/arch/x86/kernel/io_apic_32-xen.c 2009-03-16 16:38:05.000000000 +0100
1778 @@ -88,6 +88,16 @@ int sis_apic_bug = -1;
1779 */
1780 int nr_ioapic_registers[MAX_IO_APICS];
1781
1782 +/* I/O APIC entries */
1783 +struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
1784 +int nr_ioapics;
1785 +
1786 +/* MP IRQ source entries */
1787 +struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
1788 +
1789 +/* # of MP IRQ source entries */
1790 +int mp_irq_entries;
1791 +
1792 static int disable_timer_pin_1 __initdata;
1793
1794 /*
1795 @@ -863,10 +873,7 @@ static int __init find_isa_irq_pin(int i
1796 for (i = 0; i < mp_irq_entries; i++) {
1797 int lbus = mp_irqs[i].mpc_srcbus;
1798
1799 - if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
1800 - mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
1801 - mp_bus_id_to_type[lbus] == MP_BUS_MCA
1802 - ) &&
1803 + if (test_bit(lbus, mp_bus_not_pci) &&
1804 (mp_irqs[i].mpc_irqtype == type) &&
1805 (mp_irqs[i].mpc_srcbusirq == irq))
1806
1807 @@ -882,10 +889,7 @@ static int __init find_isa_irq_apic(int
1808 for (i = 0; i < mp_irq_entries; i++) {
1809 int lbus = mp_irqs[i].mpc_srcbus;
1810
1811 - if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
1812 - mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
1813 - mp_bus_id_to_type[lbus] == MP_BUS_MCA
1814 - ) &&
1815 + if (test_bit(lbus, mp_bus_not_pci) &&
1816 (mp_irqs[i].mpc_irqtype == type) &&
1817 (mp_irqs[i].mpc_srcbusirq == irq))
1818 break;
1819 @@ -926,7 +930,7 @@ int IO_APIC_get_PCI_irq_vector(int bus,
1820 mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
1821 break;
1822
1823 - if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) &&
1824 + if (!test_bit(lbus, mp_bus_not_pci) &&
1825 !mp_irqs[i].mpc_irqtype &&
1826 (bus == lbus) &&
1827 (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
1828 @@ -977,6 +981,7 @@ void __init setup_ioapic_dest(void)
1829 #endif /* !CONFIG_XEN */
1830 #endif
1831
1832 +#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
1833 /*
1834 * EISA Edge/Level control register, ELCR
1835 */
1836 @@ -990,6 +995,13 @@ static int EISA_ELCR(unsigned int irq)
1837 "Broken MPtable reports ISA irq %d\n", irq);
1838 return 0;
1839 }
1840 +#endif
1841 +
1842 +/* ISA interrupts are always polarity zero edge triggered,
1843 + * when listed as conforming in the MP table. */
1844 +
1845 +#define default_ISA_trigger(idx) (0)
1846 +#define default_ISA_polarity(idx) (0)
1847
1848 /* EISA interrupts are always polarity zero and can be edge or level
1849 * trigger depending on the ELCR value. If an interrupt is listed as
1850 @@ -997,13 +1009,7 @@ static int EISA_ELCR(unsigned int irq)
1851 * be read in from the ELCR */
1852
1853 #define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq))
1854 -#define default_EISA_polarity(idx) (0)
1855 -
1856 -/* ISA interrupts are always polarity zero edge triggered,
1857 - * when listed as conforming in the MP table. */
1858 -
1859 -#define default_ISA_trigger(idx) (0)
1860 -#define default_ISA_polarity(idx) (0)
1861 +#define default_EISA_polarity(idx) default_ISA_polarity(idx)
1862
1863 /* PCI interrupts are always polarity one level triggered,
1864 * when listed as conforming in the MP table. */
1865 @@ -1015,7 +1021,7 @@ static int EISA_ELCR(unsigned int irq)
1866 * when listed as conforming in the MP table. */
1867
1868 #define default_MCA_trigger(idx) (1)
1869 -#define default_MCA_polarity(idx) (0)
1870 +#define default_MCA_polarity(idx) default_ISA_polarity(idx)
1871
1872 static int MPBIOS_polarity(int idx)
1873 {
1874 @@ -1029,35 +1035,9 @@ static int MPBIOS_polarity(int idx)
1875 {
1876 case 0: /* conforms, ie. bus-type dependent polarity */
1877 {
1878 - switch (mp_bus_id_to_type[bus])
1879 - {
1880 - case MP_BUS_ISA: /* ISA pin */
1881 - {
1882 - polarity = default_ISA_polarity(idx);
1883 - break;
1884 - }
1885 - case MP_BUS_EISA: /* EISA pin */
1886 - {
1887 - polarity = default_EISA_polarity(idx);
1888 - break;
1889 - }
1890 - case MP_BUS_PCI: /* PCI pin */
1891 - {
1892 - polarity = default_PCI_polarity(idx);
1893 - break;
1894 - }
1895 - case MP_BUS_MCA: /* MCA pin */
1896 - {
1897 - polarity = default_MCA_polarity(idx);
1898 - break;
1899 - }
1900 - default:
1901 - {
1902 - printk(KERN_WARNING "broken BIOS!!\n");
1903 - polarity = 1;
1904 - break;
1905 - }
1906 - }
1907 + polarity = test_bit(bus, mp_bus_not_pci)?
1908 + default_ISA_polarity(idx):
1909 + default_PCI_polarity(idx);
1910 break;
1911 }
1912 case 1: /* high active */
1913 @@ -1098,11 +1078,15 @@ static int MPBIOS_trigger(int idx)
1914 {
1915 case 0: /* conforms, ie. bus-type dependent */
1916 {
1917 + trigger = test_bit(bus, mp_bus_not_pci)?
1918 + default_ISA_trigger(idx):
1919 + default_PCI_trigger(idx);
1920 +#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
1921 switch (mp_bus_id_to_type[bus])
1922 {
1923 case MP_BUS_ISA: /* ISA pin */
1924 {
1925 - trigger = default_ISA_trigger(idx);
1926 + /* set before the switch */
1927 break;
1928 }
1929 case MP_BUS_EISA: /* EISA pin */
1930 @@ -1112,7 +1096,7 @@ static int MPBIOS_trigger(int idx)
1931 }
1932 case MP_BUS_PCI: /* PCI pin */
1933 {
1934 - trigger = default_PCI_trigger(idx);
1935 + /* set before the switch */
1936 break;
1937 }
1938 case MP_BUS_MCA: /* MCA pin */
1939 @@ -1127,6 +1111,7 @@ static int MPBIOS_trigger(int idx)
1940 break;
1941 }
1942 }
1943 +#endif
1944 break;
1945 }
1946 case 1: /* edge */
1947 @@ -1176,39 +1161,22 @@ static int pin_2_irq(int idx, int apic,
1948 if (mp_irqs[idx].mpc_dstirq != pin)
1949 printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
1950
1951 - switch (mp_bus_id_to_type[bus])
1952 - {
1953 - case MP_BUS_ISA: /* ISA pin */
1954 - case MP_BUS_EISA:
1955 - case MP_BUS_MCA:
1956 - {
1957 - irq = mp_irqs[idx].mpc_srcbusirq;
1958 - break;
1959 - }
1960 - case MP_BUS_PCI: /* PCI pin */
1961 - {
1962 - /*
1963 - * PCI IRQs are mapped in order
1964 - */
1965 - i = irq = 0;
1966 - while (i < apic)
1967 - irq += nr_ioapic_registers[i++];
1968 - irq += pin;
1969 -
1970 - /*
1971 - * For MPS mode, so far only needed by ES7000 platform
1972 - */
1973 - if (ioapic_renumber_irq)
1974 - irq = ioapic_renumber_irq(apic, irq);
1975 + if (test_bit(bus, mp_bus_not_pci))
1976 + irq = mp_irqs[idx].mpc_srcbusirq;
1977 + else {
1978 + /*
1979 + * PCI IRQs are mapped in order
1980 + */
1981 + i = irq = 0;
1982 + while (i < apic)
1983 + irq += nr_ioapic_registers[i++];
1984 + irq += pin;
1985
1986 - break;
1987 - }
1988 - default:
1989 - {
1990 - printk(KERN_ERR "unknown bus type %d.\n",bus);
1991 - irq = 0;
1992 - break;
1993 - }
1994 + /*
1995 + * For MPS mode, so far only needed by ES7000 platform
1996 + */
1997 + if (ioapic_renumber_irq)
1998 + irq = ioapic_renumber_irq(apic, irq);
1999 }
2000
2001 /*
2002 @@ -1314,7 +1282,6 @@ static void __init setup_IO_APIC_irqs(vo
2003 {
2004 struct IO_APIC_route_entry entry;
2005 int apic, pin, idx, irq, first_notcon = 1, vector;
2006 - unsigned long flags;
2007
2008 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
2009
2010 @@ -1380,9 +1347,7 @@ static void __init setup_IO_APIC_irqs(vo
2011 if (!apic && (irq < 16))
2012 disable_8259A_irq(irq);
2013 }
2014 - spin_lock_irqsave(&ioapic_lock, flags);
2015 - __ioapic_write_entry(apic, pin, entry);
2016 - spin_unlock_irqrestore(&ioapic_lock, flags);
2017 + ioapic_write_entry(apic, pin, entry);
2018 }
2019 }
2020
2021 @@ -1577,8 +1542,8 @@ void /*__init*/ print_local_APIC(void *
2022
2023 printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
2024 smp_processor_id(), hard_smp_processor_id());
2025 - v = apic_read(APIC_ID);
2026 - printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(v));
2027 + printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v,
2028 + GET_APIC_ID(read_apic_id()));
2029 v = apic_read(APIC_LVR);
2030 printk(KERN_INFO "... APIC VERSION: %08x\n", v);
2031 ver = GET_APIC_VERSION(v);
2032 @@ -1791,7 +1756,7 @@ void disable_IO_APIC(void)
2033 entry.delivery_mode = dest_ExtINT; /* ExtInt */
2034 entry.vector = 0;
2035 entry.dest.physical.physical_dest =
2036 - GET_APIC_ID(apic_read(APIC_ID));
2037 + GET_APIC_ID(read_apic_id());
2038
2039 /*
2040 * Add it to the IO-APIC irq-routing table:
2041 @@ -2090,8 +2055,7 @@ static inline void init_IO_APIC_traps(vo
2042 * 0x80, because int 0x80 is hm, kind of importantish. ;)
2043 */
2044 for (irq = 0; irq < NR_IRQS ; irq++) {
2045 - int tmp = irq;
2046 - if (IO_APIC_IRQ(tmp) && !irq_vector[tmp]) {
2047 + if (IO_APIC_IRQ(irq) && !irq_vector[irq]) {
2048 /*
2049 * Hmm.. We don't have an entry for this,
2050 * so default to an old-fashioned 8259
2051 @@ -2166,7 +2130,7 @@ static void __init setup_nmi(void)
2052 * cycles as some i82489DX-based boards have glue logic that keeps the
2053 * 8259A interrupt line asserted until INTA. --macro
2054 */
2055 -static inline void unlock_ExtINT_logic(void)
2056 +static inline void __init unlock_ExtINT_logic(void)
2057 {
2058 int apic, pin, i;
2059 struct IO_APIC_route_entry entry0, entry1;
2060 @@ -2218,8 +2182,6 @@ static inline void unlock_ExtINT_logic(v
2061 ioapic_write_entry(apic, pin, entry0);
2062 }
2063
2064 -int timer_uses_ioapic_pin_0;
2065 -
2066 /*
2067 * This code may look a bit paranoid, but it's supposed to cooperate with
2068 * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ
2069 @@ -2259,9 +2221,6 @@ static inline void __init check_timer(vo
2070 pin2 = ioapic_i8259.pin;
2071 apic2 = ioapic_i8259.apic;
2072
2073 - if (pin1 == 0)
2074 - timer_uses_ioapic_pin_0 = 1;
2075 -
2076 printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
2077 vector, apic1, pin1, apic2, pin2);
2078
2079 @@ -2555,6 +2514,7 @@ void destroy_irq(unsigned int irq)
2080 dynamic_irq_cleanup(irq);
2081
2082 spin_lock_irqsave(&vector_lock, flags);
2083 + clear_bit(irq_vector[irq], used_vectors);
2084 irq_vector[irq] = 0;
2085 spin_unlock_irqrestore(&vector_lock, flags);
2086 }
2087 @@ -2871,7 +2831,6 @@ int __init io_apic_get_redir_entries (in
2088 int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low)
2089 {
2090 struct IO_APIC_route_entry entry;
2091 - unsigned long flags;
2092
2093 if (!IO_APIC_IRQ(irq)) {
2094 printk(KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
2095 @@ -2912,9 +2871,7 @@ int io_apic_set_pci_routing (int ioapic,
2096 if (!ioapic && (irq < 16))
2097 disable_8259A_irq(irq);
2098
2099 - spin_lock_irqsave(&ioapic_lock, flags);
2100 - __ioapic_write_entry(ioapic, pin, entry);
2101 - spin_unlock_irqrestore(&ioapic_lock, flags);
2102 + ioapic_write_entry(ioapic, pin, entry);
2103
2104 return 0;
2105 }
2106 --- sle11-2009-10-16.orig/arch/x86/kernel/io_apic_64-xen.c 2009-03-16 16:33:40.000000000 +0100
2107 +++ sle11-2009-10-16/arch/x86/kernel/io_apic_64-xen.c 2009-03-16 16:38:05.000000000 +0100
2108 @@ -43,13 +43,15 @@
2109 #include <asm/smp.h>
2110 #include <asm/desc.h>
2111 #include <asm/proto.h>
2112 -#include <asm/mach_apic.h>
2113 #include <asm/acpi.h>
2114 #include <asm/dma.h>
2115 #include <asm/nmi.h>
2116 #include <asm/msidef.h>
2117 #include <asm/hypertransport.h>
2118
2119 +#include <mach_ipi.h>
2120 +#include <mach_apic.h>
2121 +
2122 struct irq_cfg {
2123 #ifndef CONFIG_XEN
2124 cpumask_t domain;
2125 @@ -101,6 +103,16 @@ DEFINE_SPINLOCK(vector_lock);
2126 */
2127 int nr_ioapic_registers[MAX_IO_APICS];
2128
2129 +/* I/O APIC entries */
2130 +struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
2131 +int nr_ioapics;
2132 +
2133 +/* MP IRQ source entries */
2134 +struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
2135 +
2136 +/* # of MP IRQ source entries */
2137 +int mp_irq_entries;
2138 +
2139 /*
2140 * Rough estimation of how many shared IRQs there are, can
2141 * be changed anytime.
2142 @@ -181,11 +193,10 @@ static inline void io_apic_modify(unsign
2143 writel(value, &io_apic->data);
2144 }
2145
2146 -static int io_apic_level_ack_pending(unsigned int irq)
2147 +static bool io_apic_level_ack_pending(unsigned int irq)
2148 {
2149 struct irq_pin_list *entry;
2150 unsigned long flags;
2151 - int pending = 0;
2152
2153 spin_lock_irqsave(&ioapic_lock, flags);
2154 entry = irq_2_pin + irq;
2155 @@ -198,13 +209,17 @@ static int io_apic_level_ack_pending(uns
2156 break;
2157 reg = io_apic_read(entry->apic, 0x10 + pin*2);
2158 /* Is the remote IRR bit set? */
2159 - pending |= (reg >> 14) & 1;
2160 + if ((reg >> 14) & 1) {
2161 + spin_unlock_irqrestore(&ioapic_lock, flags);
2162 + return true;
2163 + }
2164 if (!entry->next)
2165 break;
2166 entry = irq_2_pin + entry->next;
2167 }
2168 spin_unlock_irqrestore(&ioapic_lock, flags);
2169 - return pending;
2170 +
2171 + return false;
2172 }
2173 #endif
2174
2175 @@ -762,7 +777,7 @@ static void __clear_irq_vector(int irq)
2176 per_cpu(vector_irq, cpu)[vector] = -1;
2177
2178 cfg->vector = 0;
2179 - cfg->domain = CPU_MASK_NONE;
2180 + cpus_clear(cfg->domain);
2181 }
2182
2183 void __setup_vector_irq(int cpu)
2184 @@ -899,9 +914,8 @@ static void __init setup_IO_APIC_irqs(vo
2185 static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
2186 {
2187 struct IO_APIC_route_entry entry;
2188 - unsigned long flags;
2189
2190 - memset(&entry,0,sizeof(entry));
2191 + memset(&entry, 0, sizeof(entry));
2192
2193 disable_8259A_irq(0);
2194
2195 @@ -929,10 +943,7 @@ static void __init setup_ExtINT_IRQ0_pin
2196 /*
2197 * Add it to the IO-APIC irq-routing table:
2198 */
2199 - spin_lock_irqsave(&ioapic_lock, flags);
2200 - io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
2201 - io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
2202 - spin_unlock_irqrestore(&ioapic_lock, flags);
2203 + ioapic_write_entry(apic, pin, entry);
2204
2205 enable_8259A_irq(0);
2206 }
2207 @@ -1061,8 +1072,7 @@ void __apicdebuginit print_local_APIC(vo
2208
2209 printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
2210 smp_processor_id(), hard_smp_processor_id());
2211 - v = apic_read(APIC_ID);
2212 - printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(v));
2213 + printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(read_apic_id()));
2214 v = apic_read(APIC_LVR);
2215 printk(KERN_INFO "... APIC VERSION: %08x\n", v);
2216 ver = GET_APIC_VERSION(v);
2217 @@ -1260,7 +1270,7 @@ void disable_IO_APIC(void)
2218 entry.dest_mode = 0; /* Physical */
2219 entry.delivery_mode = dest_ExtINT; /* ExtInt */
2220 entry.vector = 0;
2221 - entry.dest = GET_APIC_ID(apic_read(APIC_ID));
2222 + entry.dest = GET_APIC_ID(read_apic_id());
2223
2224 /*
2225 * Add it to the IO-APIC irq-routing table:
2226 @@ -1353,9 +1363,7 @@ static int ioapic_retrigger_irq(unsigned
2227 unsigned long flags;
2228
2229 spin_lock_irqsave(&vector_lock, flags);
2230 - cpus_clear(mask);
2231 - cpu_set(first_cpu(cfg->domain), mask);
2232 -
2233 + mask = cpumask_of_cpu(first_cpu(cfg->domain));
2234 send_IPI_mask(mask, cfg->vector);
2235 spin_unlock_irqrestore(&vector_lock, flags);
2236
2237 @@ -1519,8 +1527,7 @@ static inline void init_IO_APIC_traps(vo
2238 * 0x80, because int 0x80 is hm, kind of importantish. ;)
2239 */
2240 for (irq = 0; irq < NR_IRQS ; irq++) {
2241 - int tmp = irq;
2242 - if (IO_APIC_IRQ(tmp) && !irq_cfg[tmp].vector) {
2243 + if (IO_APIC_IRQ(irq) && !irq_cfg[irq].vector) {
2244 /*
2245 * Hmm.. We don't have an entry for this,
2246 * so default to an old-fashioned 8259
2247 @@ -1597,22 +1604,19 @@ static void __init setup_nmi(void)
2248 * cycles as some i82489DX-based boards have glue logic that keeps the
2249 * 8259A interrupt line asserted until INTA. --macro
2250 */
2251 -static inline void unlock_ExtINT_logic(void)
2252 +static inline void __init unlock_ExtINT_logic(void)
2253 {
2254 int apic, pin, i;
2255 struct IO_APIC_route_entry entry0, entry1;
2256 unsigned char save_control, save_freq_select;
2257 - unsigned long flags;
2258
2259 pin = find_isa_irq_pin(8, mp_INT);
2260 apic = find_isa_irq_apic(8, mp_INT);
2261 if (pin == -1)
2262 return;
2263
2264 - spin_lock_irqsave(&ioapic_lock, flags);
2265 - *(((int *)&entry0) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
2266 - *(((int *)&entry0) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
2267 - spin_unlock_irqrestore(&ioapic_lock, flags);
2268 + entry0 = ioapic_read_entry(apic, pin);
2269 +
2270 clear_IO_APIC_pin(apic, pin);
2271
2272 memset(&entry1, 0, sizeof(entry1));
2273 @@ -1625,10 +1629,7 @@ static inline void unlock_ExtINT_logic(v
2274 entry1.trigger = 0;
2275 entry1.vector = 0;
2276
2277 - spin_lock_irqsave(&ioapic_lock, flags);
2278 - io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry1) + 1));
2279 - io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry1) + 0));
2280 - spin_unlock_irqrestore(&ioapic_lock, flags);
2281 + ioapic_write_entry(apic, pin, entry1);
2282
2283 save_control = CMOS_READ(RTC_CONTROL);
2284 save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
2285 @@ -1647,10 +1648,7 @@ static inline void unlock_ExtINT_logic(v
2286 CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
2287 clear_IO_APIC_pin(apic, pin);
2288
2289 - spin_lock_irqsave(&ioapic_lock, flags);
2290 - io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry0) + 1));
2291 - io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry0) + 0));
2292 - spin_unlock_irqrestore(&ioapic_lock, flags);
2293 + ioapic_write_entry(apic, pin, entry0);
2294 }
2295
2296 /*
2297 @@ -2327,7 +2325,6 @@ static struct resource * __init ioapic_s
2298 res = (void *)mem;
2299
2300 if (mem != NULL) {
2301 - memset(mem, 0, n);
2302 mem += sizeof(struct resource) * nr_ioapics;
2303
2304 for (i = 0; i < nr_ioapics; i++) {
2305 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
2306 +++ sle11-2009-10-16/arch/x86/kernel/ipi-xen.c 2009-03-16 16:38:05.000000000 +0100
2307 @@ -0,0 +1,232 @@
2308 +#include <linux/cpumask.h>
2309 +#include <linux/interrupt.h>
2310 +#include <linux/init.h>
2311 +
2312 +#include <linux/mm.h>
2313 +#include <linux/delay.h>
2314 +#include <linux/spinlock.h>
2315 +#include <linux/kernel_stat.h>
2316 +#include <linux/mc146818rtc.h>
2317 +#include <linux/cache.h>
2318 +#include <linux/interrupt.h>
2319 +#include <linux/cpu.h>
2320 +#include <linux/module.h>
2321 +
2322 +#include <asm/smp.h>
2323 +#include <asm/mtrr.h>
2324 +#include <asm/tlbflush.h>
2325 +#include <asm/mmu_context.h>
2326 +#include <asm/apic.h>
2327 +#include <asm/proto.h>
2328 +
2329 +#ifdef CONFIG_X86_32
2330 +#ifndef CONFIG_XEN
2331 +#include <mach_apic.h>
2332 +/*
2333 + * the following functions deal with sending IPIs between CPUs.
2334 + *
2335 + * We use 'broadcast', CPU->CPU IPIs and self-IPIs too.
2336 + */
2337 +
2338 +static inline int __prepare_ICR(unsigned int shortcut, int vector)
2339 +{
2340 + unsigned int icr = shortcut | APIC_DEST_LOGICAL;
2341 +
2342 + switch (vector) {
2343 + default:
2344 + icr |= APIC_DM_FIXED | vector;
2345 + break;
2346 + case NMI_VECTOR:
2347 + icr |= APIC_DM_NMI;
2348 + break;
2349 + }
2350 + return icr;
2351 +}
2352 +
2353 +static inline int __prepare_ICR2(unsigned int mask)
2354 +{
2355 + return SET_APIC_DEST_FIELD(mask);
2356 +}
2357 +#else
2358 +#include <xen/evtchn.h>
2359 +
2360 +DECLARE_PER_CPU(int, ipi_to_irq[NR_IPIS]);
2361 +
2362 +static inline void __send_IPI_one(unsigned int cpu, int vector)
2363 +{
2364 + int irq = per_cpu(ipi_to_irq, cpu)[vector];
2365 + BUG_ON(irq < 0);
2366 + notify_remote_via_irq(irq);
2367 +}
2368 +#endif
2369 +
2370 +void __send_IPI_shortcut(unsigned int shortcut, int vector)
2371 +{
2372 +#ifndef CONFIG_XEN
2373 + /*
2374 + * Subtle. In the case of the 'never do double writes' workaround
2375 + * we have to lock out interrupts to be safe. As we don't care
2376 + * of the value read we use an atomic rmw access to avoid costly
2377 + * cli/sti. Otherwise we use an even cheaper single atomic write
2378 + * to the APIC.
2379 + */
2380 + unsigned int cfg;
2381 +
2382 + /*
2383 + * Wait for idle.
2384 + */
2385 + apic_wait_icr_idle();
2386 +
2387 + /*
2388 + * No need to touch the target chip field
2389 + */
2390 + cfg = __prepare_ICR(shortcut, vector);
2391 +
2392 + /*
2393 + * Send the IPI. The write to APIC_ICR fires this off.
2394 + */
2395 + apic_write_around(APIC_ICR, cfg);
2396 +#else
2397 + int cpu;
2398 +
2399 + switch (shortcut) {
2400 + case APIC_DEST_SELF:
2401 + __send_IPI_one(smp_processor_id(), vector);
2402 + break;
2403 + case APIC_DEST_ALLBUT:
2404 + for_each_online_cpu(cpu)
2405 + if (cpu != smp_processor_id())
2406 + __send_IPI_one(cpu, vector);
2407 + break;
2408 + default:
2409 + printk("XXXXXX __send_IPI_shortcut %08x vector %d\n", shortcut,
2410 + vector);
2411 + break;
2412 + }
2413 +#endif
2414 +}
2415 +
2416 +void send_IPI_self(int vector)
2417 +{
2418 + __send_IPI_shortcut(APIC_DEST_SELF, vector);
2419 +}
2420 +
2421 +#ifndef CONFIG_XEN
2422 +/*
2423 + * This is used to send an IPI with no shorthand notation (the destination is
2424 + * specified in bits 56 to 63 of the ICR).
2425 + */
2426 +static inline void __send_IPI_dest_field(unsigned long mask, int vector)
2427 +{
2428 + unsigned long cfg;
2429 +
2430 + /*
2431 + * Wait for idle.
2432 + */
2433 + if (unlikely(vector == NMI_VECTOR))
2434 + safe_apic_wait_icr_idle();
2435 + else
2436 + apic_wait_icr_idle();
2437 +
2438 + /*
2439 + * prepare target chip field
2440 + */
2441 + cfg = __prepare_ICR2(mask);
2442 + apic_write_around(APIC_ICR2, cfg);
2443 +
2444 + /*
2445 + * program the ICR
2446 + */
2447 + cfg = __prepare_ICR(0, vector);
2448 +
2449 + /*
2450 + * Send the IPI. The write to APIC_ICR fires this off.
2451 + */
2452 + apic_write_around(APIC_ICR, cfg);
2453 +}
2454 +#endif
2455 +
2456 +/*
2457 + * This is only used on smaller machines.
2458 + */
2459 +void send_IPI_mask_bitmask(cpumask_t cpumask, int vector)
2460 +{
2461 +#ifndef CONFIG_XEN
2462 + unsigned long mask = cpus_addr(cpumask)[0];
2463 +#else
2464 + cpumask_t mask;
2465 + unsigned int cpu;
2466 +#endif
2467 + unsigned long flags;
2468 +
2469 + local_irq_save(flags);
2470 +#ifndef CONFIG_XEN
2471 + WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]);
2472 + __send_IPI_dest_field(mask, vector);
2473 +#else
2474 + cpus_andnot(mask, cpumask, cpu_online_map);
2475 + WARN_ON(!cpus_empty(mask));
2476 + for_each_online_cpu(cpu)
2477 + if (cpu_isset(cpu, cpumask))
2478 + __send_IPI_one(cpu, vector);
2479 +#endif
2480 + local_irq_restore(flags);
2481 +}
2482 +
2483 +void send_IPI_mask_sequence(cpumask_t mask, int vector)
2484 +{
2485 +#ifndef CONFIG_XEN
2486 + unsigned long flags;
2487 + unsigned int query_cpu;
2488 +
2489 + /*
2490 + * Hack. The clustered APIC addressing mode doesn't allow us to send
2491 + * to an arbitrary mask, so I do a unicasts to each CPU instead. This
2492 + * should be modified to do 1 message per cluster ID - mbligh
2493 + */
2494 +
2495 + local_irq_save(flags);
2496 + for_each_possible_cpu(query_cpu) {
2497 + if (cpu_isset(query_cpu, mask)) {
2498 + __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu),
2499 + vector);
2500 + }
2501 + }
2502 + local_irq_restore(flags);
2503 +#else
2504 + send_IPI_mask_bitmask(mask, vector);
2505 +#endif
2506 +}
2507 +
2508 +/* must come after the send_IPI functions above for inlining */
2509 +#include <mach_ipi.h>
2510 +
2511 +#ifndef CONFIG_XEN
2512 +static int convert_apicid_to_cpu(int apic_id)
2513 +{
2514 + int i;
2515 +
2516 + for_each_possible_cpu(i) {
2517 + if (per_cpu(x86_cpu_to_apicid, i) == apic_id)
2518 + return i;
2519 + }
2520 + return -1;
2521 +}
2522 +
2523 +int safe_smp_processor_id(void)
2524 +{
2525 + int apicid, cpuid;
2526 +
2527 + if (!boot_cpu_has(X86_FEATURE_APIC))
2528 + return 0;
2529 +
2530 + apicid = hard_smp_processor_id();
2531 + if (apicid == BAD_APICID)
2532 + return 0;
2533 +
2534 + cpuid = convert_apicid_to_cpu(apicid);
2535 +
2536 + return cpuid >= 0 ? cpuid : 0;
2537 +}
2538 +#endif
2539 +#endif
2540 --- sle11-2009-10-16.orig/arch/x86/kernel/irq_32-xen.c 2009-03-16 16:33:40.000000000 +0100
2541 +++ sle11-2009-10-16/arch/x86/kernel/irq_32-xen.c 2009-03-16 16:38:05.000000000 +0100
2542 @@ -79,7 +79,7 @@ unsigned int do_IRQ(struct pt_regs *regs
2543
2544 if (unlikely((unsigned)irq >= NR_IRQS)) {
2545 printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
2546 - __FUNCTION__, irq);
2547 + __func__, irq);
2548 BUG();
2549 }
2550
2551 @@ -134,7 +134,7 @@ unsigned int do_IRQ(struct pt_regs *regs
2552 : "=a" (arg1), "=d" (arg2), "=b" (bx)
2553 : "0" (irq), "1" (desc), "2" (isp),
2554 "D" (desc->handle_irq)
2555 - : "memory", "cc"
2556 + : "memory", "cc", "ecx"
2557 );
2558 } else
2559 #endif
2560 @@ -190,8 +190,6 @@ void irq_ctx_exit(int cpu)
2561 hardirq_ctx[cpu] = NULL;
2562 }
2563
2564 -extern asmlinkage void __do_softirq(void);
2565 -
2566 asmlinkage void do_softirq(void)
2567 {
2568 unsigned long flags;
2569 --- sle11-2009-10-16.orig/arch/x86/kernel/machine_kexec_64.c 2009-03-16 16:33:40.000000000 +0100
2570 +++ sle11-2009-10-16/arch/x86/kernel/machine_kexec_64.c 2009-03-16 16:38:05.000000000 +0100
2571 @@ -120,8 +120,6 @@ int __init machine_kexec_setup_resources
2572 return 0;
2573 }
2574
2575 -void machine_kexec_register_resources(struct resource *res) { ; }
2576 -
2577 #else /* CONFIG_XEN */
2578
2579 #define x__pmd(x) __pmd(x)
2580 --- sle11-2009-10-16.orig/arch/x86/kernel/microcode-xen.c 2009-03-16 16:33:40.000000000 +0100
2581 +++ sle11-2009-10-16/arch/x86/kernel/microcode-xen.c 2009-03-16 16:38:05.000000000 +0100
2582 @@ -162,7 +162,7 @@ static int request_microcode(void)
2583 c->x86, c->x86_model, c->x86_mask);
2584 error = request_firmware(&firmware, name, &microcode_pdev->dev);
2585 if (error) {
2586 - pr_debug("ucode data file %s load failed\n", name);
2587 + pr_debug("microcode: ucode data file %s load failed\n", name);
2588 return error;
2589 }
2590
2591 --- sle11-2009-10-16.orig/arch/x86/kernel/mmconf-fam10h_64.c 2009-10-28 14:55:03.000000000 +0100
2592 +++ sle11-2009-10-16/arch/x86/kernel/mmconf-fam10h_64.c 2009-03-16 16:38:05.000000000 +0100
2593 @@ -219,6 +219,16 @@ void __cpuinit fam10h_check_enable_mmcfg
2594 val |= fam10h_pci_mmconf_base | (8 << FAM10H_MMIO_CONF_BUSRANGE_SHIFT) |
2595 FAM10H_MMIO_CONF_ENABLE;
2596 wrmsrl(address, val);
2597 +
2598 +#ifdef CONFIG_XEN
2599 + {
2600 + u64 val2;
2601 +
2602 + rdmsrl(address, val2);
2603 + if (val2 != val)
2604 + pci_probe &= ~PCI_CHECK_ENABLE_AMD_MMCONF;
2605 + }
2606 +#endif
2607 }
2608
2609 static int __devinit set_check_enable_amd_mmconf(const struct dmi_system_id *d)
2610 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
2611 +++ sle11-2009-10-16/arch/x86/kernel/mpparse-xen.c 2009-03-16 16:38:05.000000000 +0100
2612 @@ -0,0 +1,1101 @@
2613 +/*
2614 + * Intel Multiprocessor Specification 1.1 and 1.4
2615 + * compliant MP-table parsing routines.
2616 + *
2617 + * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
2618 + * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
2619 + * (c) 2008 Alexey Starikovskiy <astarikovskiy@suse.de>
2620 + */
2621 +
2622 +#include <linux/mm.h>
2623 +#include <linux/init.h>
2624 +#include <linux/delay.h>
2625 +#include <linux/bootmem.h>
2626 +#include <linux/kernel_stat.h>
2627 +#include <linux/mc146818rtc.h>
2628 +#include <linux/bitops.h>
2629 +#include <linux/acpi.h>
2630 +#include <linux/module.h>
2631 +
2632 +#include <asm/smp.h>
2633 +#include <asm/mtrr.h>
2634 +#include <asm/mpspec.h>
2635 +#include <asm/pgalloc.h>
2636 +#include <asm/io_apic.h>
2637 +#include <asm/proto.h>
2638 +#include <asm/acpi.h>
2639 +#include <asm/bios_ebda.h>
2640 +
2641 +#include <mach_apic.h>
2642 +#ifdef CONFIG_X86_32
2643 +#include <mach_apicdef.h>
2644 +#include <mach_mpparse.h>
2645 +#endif
2646 +
2647 +/* Have we found an MP table */
2648 +int smp_found_config;
2649 +
2650 +/*
2651 + * Various Linux-internal data structures created from the
2652 + * MP-table.
2653 + */
2654 +#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
2655 +int mp_bus_id_to_type[MAX_MP_BUSSES];
2656 +#endif
2657 +
2658 +DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
2659 +int mp_bus_id_to_pci_bus[MAX_MP_BUSSES] = {[0 ... MAX_MP_BUSSES - 1] = -1 };
2660 +
2661 +static int mp_current_pci_id;
2662 +
2663 +int pic_mode;
2664 +
2665 +/*
2666 + * Intel MP BIOS table parsing routines:
2667 + */
2668 +
2669 +/*
2670 + * Checksum an MP configuration block.
2671 + */
2672 +
2673 +static int __init mpf_checksum(unsigned char *mp, int len)
2674 +{
2675 + int sum = 0;
2676 +
2677 + while (len--)
2678 + sum += *mp++;
2679 +
2680 + return sum & 0xFF;
2681 +}
2682 +
2683 +#ifdef CONFIG_X86_NUMAQ
2684 +/*
2685 + * Have to match translation table entries to main table entries by counter
2686 + * hence the mpc_record variable .... can't see a less disgusting way of
2687 + * doing this ....
2688 + */
2689 +
2690 +static int mpc_record;
2691 +static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY]
2692 + __cpuinitdata;
2693 +#endif
2694 +
2695 +static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
2696 +{
2697 +#ifndef CONFIG_XEN
2698 + int apicid;
2699 + char *bootup_cpu = "";
2700 +
2701 + if (!(m->mpc_cpuflag & CPU_ENABLED)) {
2702 + disabled_cpus++;
2703 + return;
2704 + }
2705 +#ifdef CONFIG_X86_NUMAQ
2706 + apicid = mpc_apic_id(m, translation_table[mpc_record]);
2707 +#else
2708 + apicid = m->mpc_apicid;
2709 +#endif
2710 + if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
2711 + bootup_cpu = " (Bootup-CPU)";
2712 + boot_cpu_physical_apicid = m->mpc_apicid;
2713 + }
2714 +
2715 + printk(KERN_INFO "Processor #%d%s\n", m->mpc_apicid, bootup_cpu);
2716 + generic_processor_info(apicid, m->mpc_apicver);
2717 +#else /* CONFIG_XEN */
2718 + num_processors++;
2719 +#endif
2720 +}
2721 +
2722 +static void __init MP_bus_info(struct mpc_config_bus *m)
2723 +{
2724 + char str[7];
2725 +
2726 + memcpy(str, m->mpc_bustype, 6);
2727 + str[6] = 0;
2728 +
2729 +#ifdef CONFIG_X86_NUMAQ
2730 + mpc_oem_bus_info(m, str, translation_table[mpc_record]);
2731 +#else
2732 + Dprintk("Bus #%d is %s\n", m->mpc_busid, str);
2733 +#endif
2734 +
2735 +#if MAX_MP_BUSSES < 256
2736 + if (m->mpc_busid >= MAX_MP_BUSSES) {
2737 + printk(KERN_WARNING "MP table busid value (%d) for bustype %s "
2738 + " is too large, max. supported is %d\n",
2739 + m->mpc_busid, str, MAX_MP_BUSSES - 1);
2740 + return;
2741 + }
2742 +#endif
2743 +
2744 + if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA) - 1) == 0) {
2745 + set_bit(m->mpc_busid, mp_bus_not_pci);
2746 +#if defined(CONFIG_EISA) || defined (CONFIG_MCA)
2747 + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
2748 +#endif
2749 + } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) {
2750 +#ifdef CONFIG_X86_NUMAQ
2751 + mpc_oem_pci_bus(m, translation_table[mpc_record]);
2752 +#endif
2753 + clear_bit(m->mpc_busid, mp_bus_not_pci);
2754 + mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
2755 + mp_current_pci_id++;
2756 +#if defined(CONFIG_EISA) || defined (CONFIG_MCA)
2757 + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
2758 + } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) {
2759 + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA;
2760 + } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA) - 1) == 0) {
2761 + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA;
2762 +#endif
2763 + } else
2764 + printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
2765 +}
2766 +
2767 +#ifdef CONFIG_X86_IO_APIC
2768 +
2769 +static int bad_ioapic(unsigned long address)
2770 +{
2771 + if (nr_ioapics >= MAX_IO_APICS) {
2772 + printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
2773 + "(found %d)\n", MAX_IO_APICS, nr_ioapics);
2774 + panic("Recompile kernel with bigger MAX_IO_APICS!\n");
2775 + }
2776 + if (!address) {
2777 + printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
2778 + " found in table, skipping!\n");
2779 + return 1;
2780 + }
2781 + return 0;
2782 +}
2783 +
2784 +static void __init MP_ioapic_info(struct mpc_config_ioapic *m)
2785 +{
2786 + if (!(m->mpc_flags & MPC_APIC_USABLE))
2787 + return;
2788 +
2789 + printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n",
2790 + m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
2791 +
2792 + if (bad_ioapic(m->mpc_apicaddr))
2793 + return;
2794 +
2795 + mp_ioapics[nr_ioapics] = *m;
2796 + nr_ioapics++;
2797 +}
2798 +
2799 +static void __init MP_intsrc_info(struct mpc_config_intsrc *m)
2800 +{
2801 + mp_irqs[mp_irq_entries] = *m;
2802 + Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
2803 + " IRQ %02x, APIC ID %x, APIC INT %02x\n",
2804 + m->mpc_irqtype, m->mpc_irqflag & 3,
2805 + (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
2806 + m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
2807 + if (++mp_irq_entries == MAX_IRQ_SOURCES)
2808 + panic("Max # of irq sources exceeded!!\n");
2809 +}
2810 +
2811 +#endif
2812 +
2813 +static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m)
2814 +{
2815 + Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
2816 + " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
2817 + m->mpc_irqtype, m->mpc_irqflag & 3,
2818 + (m->mpc_irqflag >> 2) & 3, m->mpc_srcbusid,
2819 + m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
2820 +}
2821 +
2822 +#ifdef CONFIG_X86_NUMAQ
2823 +static void __init MP_translation_info(struct mpc_config_translation *m)
2824 +{
2825 + printk(KERN_INFO
2826 + "Translation: record %d, type %d, quad %d, global %d, local %d\n",
2827 + mpc_record, m->trans_type, m->trans_quad, m->trans_global,
2828 + m->trans_local);
2829 +
2830 + if (mpc_record >= MAX_MPC_ENTRY)
2831 + printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
2832 + else
2833 + translation_table[mpc_record] = m; /* stash this for later */
2834 + if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
2835 + node_set_online(m->trans_quad);
2836 +}
2837 +
2838 +/*
2839 + * Read/parse the MPC oem tables
2840 + */
2841 +
2842 +static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable,
2843 + unsigned short oemsize)
2844 +{
2845 + int count = sizeof(*oemtable); /* the header size */
2846 + unsigned char *oemptr = ((unsigned char *)oemtable) + count;
2847 +
2848 + mpc_record = 0;
2849 + printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n",
2850 + oemtable);
2851 + if (memcmp(oemtable->oem_signature, MPC_OEM_SIGNATURE, 4)) {
2852 + printk(KERN_WARNING
2853 + "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
2854 + oemtable->oem_signature[0], oemtable->oem_signature[1],
2855 + oemtable->oem_signature[2], oemtable->oem_signature[3]);
2856 + return;
2857 + }
2858 + if (mpf_checksum((unsigned char *)oemtable, oemtable->oem_length)) {
2859 + printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
2860 + return;
2861 + }
2862 + while (count < oemtable->oem_length) {
2863 + switch (*oemptr) {
2864 + case MP_TRANSLATION:
2865 + {
2866 + struct mpc_config_translation *m =
2867 + (struct mpc_config_translation *)oemptr;
2868 + MP_translation_info(m);
2869 + oemptr += sizeof(*m);
2870 + count += sizeof(*m);
2871 + ++mpc_record;
2872 + break;
2873 + }
2874 + default:
2875 + {
2876 + printk(KERN_WARNING
2877 + "Unrecognised OEM table entry type! - %d\n",
2878 + (int)*oemptr);
2879 + return;
2880 + }
2881 + }
2882 + }
2883 +}
2884 +
2885 +static inline void mps_oem_check(struct mp_config_table *mpc, char *oem,
2886 + char *productid)
2887 +{
2888 + if (strncmp(oem, "IBM NUMA", 8))
2889 + printk("Warning! May not be a NUMA-Q system!\n");
2890 + if (mpc->mpc_oemptr)
2891 + smp_read_mpc_oem((struct mp_config_oemtable *)mpc->mpc_oemptr,
2892 + mpc->mpc_oemsize);
2893 +}
2894 +#endif /* CONFIG_X86_NUMAQ */
2895 +
2896 +/*
2897 + * Read/parse the MPC
2898 + */
2899 +
2900 +static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
2901 +{
2902 + char str[16];
2903 + char oem[10];
2904 + int count = sizeof(*mpc);
2905 + unsigned char *mpt = ((unsigned char *)mpc) + count;
2906 +
2907 + if (memcmp(mpc->mpc_signature, MPC_SIGNATURE, 4)) {
2908 + printk(KERN_ERR "MPTABLE: bad signature [%c%c%c%c]!\n",
2909 + mpc->mpc_signature[0], mpc->mpc_signature[1],
2910 + mpc->mpc_signature[2], mpc->mpc_signature[3]);
2911 + return 0;
2912 + }
2913 + if (mpf_checksum((unsigned char *)mpc, mpc->mpc_length)) {
2914 + printk(KERN_ERR "MPTABLE: checksum error!\n");
2915 + return 0;
2916 + }
2917 + if (mpc->mpc_spec != 0x01 && mpc->mpc_spec != 0x04) {
2918 + printk(KERN_ERR "MPTABLE: bad table version (%d)!!\n",
2919 + mpc->mpc_spec);
2920 + return 0;
2921 + }
2922 + if (!mpc->mpc_lapic) {
2923 + printk(KERN_ERR "MPTABLE: null local APIC address!\n");
2924 + return 0;
2925 + }
2926 + memcpy(oem, mpc->mpc_oem, 8);
2927 + oem[8] = 0;
2928 + printk(KERN_INFO "MPTABLE: OEM ID: %s ", oem);
2929 +
2930 + memcpy(str, mpc->mpc_productid, 12);
2931 + str[12] = 0;
2932 + printk("Product ID: %s ", str);
2933 +
2934 +#ifdef CONFIG_X86_32
2935 + mps_oem_check(mpc, oem, str);
2936 +#endif
2937 + printk(KERN_INFO "MPTABLE: Product ID: %s ", str);
2938 +
2939 + printk(KERN_INFO "MPTABLE: APIC at: 0x%X\n", mpc->mpc_lapic);
2940 +
2941 + /* save the local APIC address, it might be non-default */
2942 + if (!acpi_lapic)
2943 + mp_lapic_addr = mpc->mpc_lapic;
2944 +
2945 + if (early)
2946 + return 1;
2947 +
2948 + /*
2949 + * Now process the configuration blocks.
2950 + */
2951 +#ifdef CONFIG_X86_NUMAQ
2952 + mpc_record = 0;
2953 +#endif
2954 + while (count < mpc->mpc_length) {
2955 + switch (*mpt) {
2956 + case MP_PROCESSOR:
2957 + {
2958 + struct mpc_config_processor *m =
2959 + (struct mpc_config_processor *)mpt;
2960 + /* ACPI may have already provided this data */
2961 + if (!acpi_lapic)
2962 + MP_processor_info(m);
2963 + mpt += sizeof(*m);
2964 + count += sizeof(*m);
2965 + break;
2966 + }
2967 + case MP_BUS:
2968 + {
2969 + struct mpc_config_bus *m =
2970 + (struct mpc_config_bus *)mpt;
2971 + MP_bus_info(m);
2972 + mpt += sizeof(*m);
2973 + count += sizeof(*m);
2974 + break;
2975 + }
2976 + case MP_IOAPIC:
2977 + {
2978 +#ifdef CONFIG_X86_IO_APIC
2979 + struct mpc_config_ioapic *m =
2980 + (struct mpc_config_ioapic *)mpt;
2981 + MP_ioapic_info(m);
2982 +#endif
2983 + mpt += sizeof(struct mpc_config_ioapic);
2984 + count += sizeof(struct mpc_config_ioapic);
2985 + break;
2986 + }
2987 + case MP_INTSRC:
2988 + {
2989 +#ifdef CONFIG_X86_IO_APIC
2990 + struct mpc_config_intsrc *m =
2991 + (struct mpc_config_intsrc *)mpt;
2992 +
2993 + MP_intsrc_info(m);
2994 +#endif
2995 + mpt += sizeof(struct mpc_config_intsrc);
2996 + count += sizeof(struct mpc_config_intsrc);
2997 + break;
2998 + }
2999 + case MP_LINTSRC:
3000 + {
3001 + struct mpc_config_lintsrc *m =
3002 + (struct mpc_config_lintsrc *)mpt;
3003 + MP_lintsrc_info(m);
3004 + mpt += sizeof(*m);
3005 + count += sizeof(*m);
3006 + break;
3007 + }
3008 + default:
3009 + /* wrong mptable */
3010 + printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n");
3011 + printk(KERN_ERR "type %x\n", *mpt);
3012 + print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16,
3013 + 1, mpc, mpc->mpc_length, 1);
3014 + count = mpc->mpc_length;
3015 + break;
3016 + }
3017 +#ifdef CONFIG_X86_NUMAQ
3018 + ++mpc_record;
3019 +#endif
3020 + }
3021 + setup_apic_routing();
3022 + if (!num_processors)
3023 + printk(KERN_ERR "MPTABLE: no processors registered!\n");
3024 + return num_processors;
3025 +}
3026 +
3027 +#ifdef CONFIG_X86_IO_APIC
3028 +
3029 +static int __init ELCR_trigger(unsigned int irq)
3030 +{
3031 + unsigned int port;
3032 +
3033 + port = 0x4d0 + (irq >> 3);
3034 + return (inb(port) >> (irq & 7)) & 1;
3035 +}
3036 +
3037 +static void __init construct_default_ioirq_mptable(int mpc_default_type)
3038 +{
3039 + struct mpc_config_intsrc intsrc;
3040 + int i;
3041 + int ELCR_fallback = 0;
3042 +
3043 + intsrc.mpc_type = MP_INTSRC;
3044 + intsrc.mpc_irqflag = 0; /* conforming */
3045 + intsrc.mpc_srcbus = 0;
3046 + intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
3047 +
3048 + intsrc.mpc_irqtype = mp_INT;
3049 +
3050 + /*
3051 + * If true, we have an ISA/PCI system with no IRQ entries
3052 + * in the MP table. To prevent the PCI interrupts from being set up
3053 + * incorrectly, we try to use the ELCR. The sanity check to see if
3054 + * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can
3055 + * never be level sensitive, so we simply see if the ELCR agrees.
3056 + * If it does, we assume it's valid.
3057 + */
3058 + if (mpc_default_type == 5) {
3059 + printk(KERN_INFO "ISA/PCI bus type with no IRQ information... "
3060 + "falling back to ELCR\n");
3061 +
3062 + if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) ||
3063 + ELCR_trigger(13))
3064 + printk(KERN_ERR "ELCR contains invalid data... "
3065 + "not using ELCR\n");
3066 + else {
3067 + printk(KERN_INFO
3068 + "Using ELCR to identify PCI interrupts\n");
3069 + ELCR_fallback = 1;
3070 + }
3071 + }
3072 +
3073 + for (i = 0; i < 16; i++) {
3074 + switch (mpc_default_type) {
3075 + case 2:
3076 + if (i == 0 || i == 13)
3077 + continue; /* IRQ0 & IRQ13 not connected */
3078 + /* fall through */
3079 + default:
3080 + if (i == 2)
3081 + continue; /* IRQ2 is never connected */
3082 + }
3083 +
3084 + if (ELCR_fallback) {
3085 + /*
3086 + * If the ELCR indicates a level-sensitive interrupt, we
3087 + * copy that information over to the MP table in the
3088 + * irqflag field (level sensitive, active high polarity).
3089 + */
3090 + if (ELCR_trigger(i))
3091 + intsrc.mpc_irqflag = 13;
3092 + else
3093 + intsrc.mpc_irqflag = 0;
3094 + }
3095 +
3096 + intsrc.mpc_srcbusirq = i;
3097 + intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */
3098 + MP_intsrc_info(&intsrc);
3099 + }
3100 +
3101 + intsrc.mpc_irqtype = mp_ExtINT;
3102 + intsrc.mpc_srcbusirq = 0;
3103 + intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */
3104 + MP_intsrc_info(&intsrc);
3105 +}
3106 +
3107 +#endif
3108 +
3109 +static inline void __init construct_default_ISA_mptable(int mpc_default_type)
3110 +{
3111 + struct mpc_config_processor processor;
3112 + struct mpc_config_bus bus;
3113 +#ifdef CONFIG_X86_IO_APIC
3114 + struct mpc_config_ioapic ioapic;
3115 +#endif
3116 + struct mpc_config_lintsrc lintsrc;
3117 + int linttypes[2] = { mp_ExtINT, mp_NMI };
3118 + int i;
3119 +
3120 + /*
3121 + * local APIC has default address
3122 + */
3123 + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
3124 +
3125 + /*
3126 + * 2 CPUs, numbered 0 & 1.
3127 + */
3128 + processor.mpc_type = MP_PROCESSOR;
3129 + /* Either an integrated APIC or a discrete 82489DX. */
3130 + processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
3131 + processor.mpc_cpuflag = CPU_ENABLED;
3132 + processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
3133 + (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
3134 + processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
3135 + processor.mpc_reserved[0] = 0;
3136 + processor.mpc_reserved[1] = 0;
3137 + for (i = 0; i < 2; i++) {
3138 + processor.mpc_apicid = i;
3139 + MP_processor_info(&processor);
3140 + }
3141 +
3142 + bus.mpc_type = MP_BUS;
3143 + bus.mpc_busid = 0;
3144 + switch (mpc_default_type) {
3145 + default:
3146 + printk(KERN_ERR "???\nUnknown standard configuration %d\n",
3147 + mpc_default_type);
3148 + /* fall through */
3149 + case 1:
3150 + case 5:
3151 + memcpy(bus.mpc_bustype, "ISA ", 6);
3152 + break;
3153 + case 2:
3154 + case 6:
3155 + case 3:
3156 + memcpy(bus.mpc_bustype, "EISA ", 6);
3157 + break;
3158 + case 4:
3159 + case 7:
3160 + memcpy(bus.mpc_bustype, "MCA ", 6);
3161 + }
3162 + MP_bus_info(&bus);
3163 + if (mpc_default_type > 4) {
3164 + bus.mpc_busid = 1;
3165 + memcpy(bus.mpc_bustype, "PCI ", 6);
3166 + MP_bus_info(&bus);
3167 + }
3168 +
3169 +#ifdef CONFIG_X86_IO_APIC
3170 + ioapic.mpc_type = MP_IOAPIC;
3171 + ioapic.mpc_apicid = 2;
3172 + ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
3173 + ioapic.mpc_flags = MPC_APIC_USABLE;
3174 + ioapic.mpc_apicaddr = 0xFEC00000;
3175 + MP_ioapic_info(&ioapic);
3176 +
3177 + /*
3178 + * We set up most of the low 16 IO-APIC pins according to MPS rules.
3179 + */
3180 + construct_default_ioirq_mptable(mpc_default_type);
3181 +#endif
3182 + lintsrc.mpc_type = MP_LINTSRC;
3183 + lintsrc.mpc_irqflag = 0; /* conforming */
3184 + lintsrc.mpc_srcbusid = 0;
3185 + lintsrc.mpc_srcbusirq = 0;
3186 + lintsrc.mpc_destapic = MP_APIC_ALL;
3187 + for (i = 0; i < 2; i++) {
3188 + lintsrc.mpc_irqtype = linttypes[i];
3189 + lintsrc.mpc_destapiclint = i;
3190 + MP_lintsrc_info(&lintsrc);
3191 + }
3192 +}
3193 +
3194 +static struct intel_mp_floating *mpf_found;
3195 +
3196 +/*
3197 + * Scan the memory blocks for an SMP configuration block.
3198 + */
3199 +static void __init __get_smp_config(unsigned early)
3200 +{
3201 + struct intel_mp_floating *mpf = mpf_found;
3202 +
3203 + if (acpi_lapic && early)
3204 + return;
3205 + /*
3206 + * ACPI supports both logical (e.g. Hyper-Threading) and physical
3207 + * processors, where MPS only supports physical.
3208 + */
3209 + if (acpi_lapic && acpi_ioapic) {
3210 + printk(KERN_INFO "Using ACPI (MADT) for SMP configuration "
3211 + "information\n");
3212 + return;
3213 + } else if (acpi_lapic)
3214 + printk(KERN_INFO "Using ACPI for processor (LAPIC) "
3215 + "configuration information\n");
3216 +
3217 + printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n",
3218 + mpf->mpf_specification);
3219 +#ifdef CONFIG_X86_32
3220 + if (mpf->mpf_feature2 & (1 << 7)) {
3221 + printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
3222 + pic_mode = 1;
3223 + } else {
3224 + printk(KERN_INFO " Virtual Wire compatibility mode.\n");
3225 + pic_mode = 0;
3226 + }
3227 +#endif
3228 + /*
3229 + * Now see if we need to read further.
3230 + */
3231 + if (mpf->mpf_feature1 != 0) {
3232 + if (early) {
3233 + /*
3234 + * local APIC has default address
3235 + */
3236 + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
3237 + return;
3238 + }
3239 +
3240 + printk(KERN_INFO "Default MP configuration #%d\n",
3241 + mpf->mpf_feature1);
3242 + construct_default_ISA_mptable(mpf->mpf_feature1);
3243 +
3244 + } else if (mpf->mpf_physptr) {
3245 +
3246 + /*
3247 + * Read the physical hardware table. Anything here will
3248 + * override the defaults.
3249 + */
3250 + if (!smp_read_mpc(isa_bus_to_virt(mpf->mpf_physptr), early)) {
3251 + smp_found_config = 0;
3252 + printk(KERN_ERR
3253 + "BIOS bug, MP table errors detected!...\n");
3254 + printk(KERN_ERR "... disabling SMP support. "
3255 + "(tell your hw vendor)\n");
3256 + return;
3257 + }
3258 +
3259 + if (early)
3260 + return;
3261 +#ifdef CONFIG_X86_IO_APIC
3262 + /*
3263 + * If there are no explicit MP IRQ entries, then we are
3264 + * broken. We set up most of the low 16 IO-APIC pins to
3265 + * ISA defaults and hope it will work.
3266 + */
3267 + if (!mp_irq_entries) {
3268 + struct mpc_config_bus bus;
3269 +
3270 + printk(KERN_ERR "BIOS bug, no explicit IRQ entries, "
3271 + "using default mptable. "
3272 + "(tell your hw vendor)\n");
3273 +
3274 + bus.mpc_type = MP_BUS;
3275 + bus.mpc_busid = 0;
3276 + memcpy(bus.mpc_bustype, "ISA ", 6);
3277 + MP_bus_info(&bus);
3278 +
3279 + construct_default_ioirq_mptable(0);
3280 + }
3281 +#endif
3282 + } else
3283 + BUG();
3284 +
3285 + if (!early)
3286 + printk(KERN_INFO "Processors: %d\n", num_processors);
3287 + /*
3288 + * Only use the first configuration found.
3289 + */
3290 +}
3291 +
3292 +void __init early_get_smp_config(void)
3293 +{
3294 + __get_smp_config(1);
3295 +}
3296 +
3297 +void __init get_smp_config(void)
3298 +{
3299 + __get_smp_config(0);
3300 +}
3301 +
3302 +static int __init smp_scan_config(unsigned long base, unsigned long length,
3303 + unsigned reserve)
3304 +{
3305 + unsigned int *bp = isa_bus_to_virt(base);
3306 + struct intel_mp_floating *mpf;
3307 +
3308 + Dprintk("Scan SMP from %p for %ld bytes.\n", bp, length);
3309 + BUILD_BUG_ON(sizeof(*mpf) != 16);
3310 +
3311 + while (length > 0) {
3312 + mpf = (struct intel_mp_floating *)bp;
3313 + if ((*bp == SMP_MAGIC_IDENT) &&
3314 + (mpf->mpf_length == 1) &&
3315 + !mpf_checksum((unsigned char *)bp, 16) &&
3316 + ((mpf->mpf_specification == 1)
3317 + || (mpf->mpf_specification == 4))) {
3318 +
3319 + smp_found_config = 1;
3320 + mpf_found = mpf;
3321 +#ifdef CONFIG_X86_32
3322 +#ifndef CONFIG_XEN
3323 + printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
3324 + mpf, virt_to_phys(mpf));
3325 + reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE,
3326 + BOOTMEM_DEFAULT);
3327 + if (mpf->mpf_physptr) {
3328 + /*
3329 + * We cannot access to MPC table to compute
3330 + * table size yet, as only few megabytes from
3331 + * the bottom is mapped now.
3332 + * PC-9800's MPC table places on the very last
3333 + * of physical memory; so that simply reserving
3334 + * PAGE_SIZE from mpg->mpf_physptr yields BUG()
3335 + * in reserve_bootmem.
3336 + */
3337 + unsigned long size = PAGE_SIZE;
3338 + unsigned long end = max_low_pfn * PAGE_SIZE;
3339 + if (mpf->mpf_physptr + size > end)
3340 + size = end - mpf->mpf_physptr;
3341 + reserve_bootmem(mpf->mpf_physptr, size,
3342 + BOOTMEM_DEFAULT);
3343 + }
3344 +#else
3345 + printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
3346 + mpf, ((void *)bp - isa_bus_to_virt(base)) + base);
3347 +#endif
3348 +#elif !defined(CONFIG_XEN)
3349 + if (!reserve)
3350 + return 1;
3351 +
3352 + reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE);
3353 + if (mpf->mpf_physptr)
3354 + reserve_bootmem_generic(mpf->mpf_physptr,
3355 + PAGE_SIZE);
3356 +#endif
3357 + return 1;
3358 + }
3359 + bp += 4;
3360 + length -= 16;
3361 + }
3362 + return 0;
3363 +}
3364 +
3365 +static void __init __find_smp_config(unsigned reserve)
3366 +{
3367 +#ifndef CONFIG_XEN
3368 + unsigned int address;
3369 +#endif
3370 +
3371 + /*
3372 + * FIXME: Linux assumes you have 640K of base ram..
3373 + * this continues the error...
3374 + *
3375 + * 1) Scan the bottom 1K for a signature
3376 + * 2) Scan the top 1K of base RAM
3377 + * 3) Scan the 64K of bios
3378 + */
3379 + if (smp_scan_config(0x0, 0x400, reserve) ||
3380 + smp_scan_config(639 * 0x400, 0x400, reserve) ||
3381 + smp_scan_config(0xF0000, 0x10000, reserve))
3382 + return;
3383 + /*
3384 + * If it is an SMP machine we should know now, unless the
3385 + * configuration is in an EISA/MCA bus machine with an
3386 + * extended bios data area.
3387 + *
3388 + * there is a real-mode segmented pointer pointing to the
3389 + * 4K EBDA area at 0x40E, calculate and scan it here.
3390 + *
3391 + * NOTE! There are Linux loaders that will corrupt the EBDA
3392 + * area, and as such this kind of SMP config may be less
3393 + * trustworthy, simply because the SMP table may have been
3394 + * stomped on during early boot. These loaders are buggy and
3395 + * should be fixed.
3396 + *
3397 + * MP1.4 SPEC states to only scan first 1K of 4K EBDA.
3398 + */
3399 +
3400 +#ifndef CONFIG_XEN
3401 + address = get_bios_ebda();
3402 + if (address)
3403 + smp_scan_config(address, 0x400, reserve);
3404 +#endif
3405 +}
3406 +
3407 +void __init early_find_smp_config(void)
3408 +{
3409 + __find_smp_config(0);
3410 +}
3411 +
3412 +void __init find_smp_config(void)
3413 +{
3414 + __find_smp_config(1);
3415 +}
3416 +
3417 +/* --------------------------------------------------------------------------
3418 + ACPI-based MP Configuration
3419 + -------------------------------------------------------------------------- */
3420 +
3421 +/*
3422 + * Keep this outside and initialized to 0, for !CONFIG_ACPI builds:
3423 + */
3424 +int es7000_plat;
3425 +
3426 +#ifdef CONFIG_ACPI
3427 +
3428 +#ifdef CONFIG_X86_IO_APIC
3429 +
3430 +#define MP_ISA_BUS 0
3431 +
3432 +extern struct mp_ioapic_routing mp_ioapic_routing[MAX_IO_APICS];
3433 +
3434 +static int mp_find_ioapic(int gsi)
3435 +{
3436 + int i = 0;
3437 +
3438 + /* Find the IOAPIC that manages this GSI. */
3439 + for (i = 0; i < nr_ioapics; i++) {
3440 + if ((gsi >= mp_ioapic_routing[i].gsi_base)
3441 + && (gsi <= mp_ioapic_routing[i].gsi_end))
3442 + return i;
3443 + }
3444 +
3445 + printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
3446 + return -1;
3447 +}
3448 +
3449 +static u8 __init uniq_ioapic_id(u8 id)
3450 +{
3451 +#ifdef CONFIG_X86_32
3452 + if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
3453 + !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
3454 + return io_apic_get_unique_id(nr_ioapics, id);
3455 + else
3456 + return id;
3457 +#else
3458 + int i;
3459 + DECLARE_BITMAP(used, 256);
3460 + bitmap_zero(used, 256);
3461 + for (i = 0; i < nr_ioapics; i++) {
3462 + struct mpc_config_ioapic *ia = &mp_ioapics[i];
3463 + __set_bit(ia->mpc_apicid, used);
3464 + }
3465 + if (!test_bit(id, used))
3466 + return id;
3467 + return find_first_zero_bit(used, 256);
3468 +#endif
3469 +}
3470 +
3471 +void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
3472 +{
3473 + int idx = 0;
3474 +
3475 + if (bad_ioapic(address))
3476 + return;
3477 +
3478 + idx = nr_ioapics;
3479 +
3480 + mp_ioapics[idx].mpc_type = MP_IOAPIC;
3481 + mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
3482 + mp_ioapics[idx].mpc_apicaddr = address;
3483 +
3484 +#ifndef CONFIG_XEN
3485 + set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
3486 +#endif
3487 + mp_ioapics[idx].mpc_apicid = uniq_ioapic_id(id);
3488 +#ifdef CONFIG_X86_32
3489 + mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
3490 +#else
3491 + mp_ioapics[idx].mpc_apicver = 0;
3492 +#endif
3493 + /*
3494 + * Build basic GSI lookup table to facilitate gsi->io_apic lookups
3495 + * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
3496 + */
3497 + mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
3498 + mp_ioapic_routing[idx].gsi_base = gsi_base;
3499 + mp_ioapic_routing[idx].gsi_end = gsi_base +
3500 + io_apic_get_redir_entries(idx);
3501 +
3502 + printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
3503 + "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
3504 + mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
3505 + mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end);
3506 +
3507 + nr_ioapics++;
3508 +}
3509 +
3510 +void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
3511 +{
3512 + struct mpc_config_intsrc intsrc;
3513 + int ioapic = -1;
3514 + int pin = -1;
3515 +
3516 + /*
3517 + * Convert 'gsi' to 'ioapic.pin'.
3518 + */
3519 + ioapic = mp_find_ioapic(gsi);
3520 + if (ioapic < 0)
3521 + return;
3522 + pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
3523 +
3524 + /*
3525 + * TBD: This check is for faulty timer entries, where the override
3526 + * erroneously sets the trigger to level, resulting in a HUGE
3527 + * increase of timer interrupts!
3528 + */
3529 + if ((bus_irq == 0) && (trigger == 3))
3530 + trigger = 1;
3531 +
3532 + intsrc.mpc_type = MP_INTSRC;
3533 + intsrc.mpc_irqtype = mp_INT;
3534 + intsrc.mpc_irqflag = (trigger << 2) | polarity;
3535 + intsrc.mpc_srcbus = MP_ISA_BUS;
3536 + intsrc.mpc_srcbusirq = bus_irq; /* IRQ */
3537 + intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */
3538 + intsrc.mpc_dstirq = pin; /* INTIN# */
3539 +
3540 + MP_intsrc_info(&intsrc);
3541 +}
3542 +
3543 +void __init mp_config_acpi_legacy_irqs(void)
3544 +{
3545 + struct mpc_config_intsrc intsrc;
3546 + int i = 0;
3547 + int ioapic = -1;
3548 +
3549 +#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
3550 + /*
3551 + * Fabricate the legacy ISA bus (bus #31).
3552 + */
3553 + mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
3554 +#endif
3555 + set_bit(MP_ISA_BUS, mp_bus_not_pci);
3556 + Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
3557 +
3558 + /*
3559 + * Older generations of ES7000 have no legacy identity mappings
3560 + */
3561 + if (es7000_plat == 1)
3562 + return;
3563 +
3564 + /*
3565 + * Locate the IOAPIC that manages the ISA IRQs (0-15).
3566 + */
3567 + ioapic = mp_find_ioapic(0);
3568 + if (ioapic < 0)
3569 + return;
3570 +
3571 + intsrc.mpc_type = MP_INTSRC;
3572 + intsrc.mpc_irqflag = 0; /* Conforming */
3573 + intsrc.mpc_srcbus = MP_ISA_BUS;
3574 +#ifdef CONFIG_X86_IO_APIC
3575 + intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
3576 +#endif
3577 + /*
3578 + * Use the default configuration for the IRQs 0-15. Unless
3579 + * overridden by (MADT) interrupt source override entries.
3580 + */
3581 + for (i = 0; i < 16; i++) {
3582 + int idx;
3583 +
3584 + for (idx = 0; idx < mp_irq_entries; idx++) {
3585 + struct mpc_config_intsrc *irq = mp_irqs + idx;
3586 +
3587 + /* Do we already have a mapping for this ISA IRQ? */
3588 + if (irq->mpc_srcbus == MP_ISA_BUS
3589 + && irq->mpc_srcbusirq == i)
3590 + break;
3591 +
3592 + /* Do we already have a mapping for this IOAPIC pin */
3593 + if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
3594 + (irq->mpc_dstirq == i))
3595 + break;
3596 + }
3597 +
3598 + if (idx != mp_irq_entries) {
3599 + printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
3600 + continue; /* IRQ already used */
3601 + }
3602 +
3603 + intsrc.mpc_irqtype = mp_INT;
3604 + intsrc.mpc_srcbusirq = i; /* Identity mapped */
3605 + intsrc.mpc_dstirq = i;
3606 +
3607 + MP_intsrc_info(&intsrc);
3608 + }
3609 +}
3610 +
3611 +int mp_register_gsi(u32 gsi, int triggering, int polarity)
3612 +{
3613 + int ioapic;
3614 + int ioapic_pin;
3615 +#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
3616 +#define MAX_GSI_NUM 4096
3617 +#define IRQ_COMPRESSION_START 64
3618 +
3619 + static int pci_irq = IRQ_COMPRESSION_START;
3620 + /*
3621 + * Mapping between Global System Interrupts, which
3622 + * represent all possible interrupts, and IRQs
3623 + * assigned to actual devices.
3624 + */
3625 + static int gsi_to_irq[MAX_GSI_NUM];
3626 +#else
3627 +
3628 + if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
3629 + return gsi;
3630 +#endif
3631 +
3632 + /* Don't set up the ACPI SCI because it's already set up */
3633 + if (acpi_gbl_FADT.sci_interrupt == gsi)
3634 + return gsi;
3635 +
3636 + ioapic = mp_find_ioapic(gsi);
3637 + if (ioapic < 0) {
3638 + printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
3639 + return gsi;
3640 + }
3641 +
3642 + ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
3643 +
3644 +#ifndef CONFIG_X86_32
3645 + if (ioapic_renumber_irq)
3646 + gsi = ioapic_renumber_irq(ioapic, gsi);
3647 +#endif
3648 +
3649 + /*
3650 + * Avoid pin reprogramming. PRTs typically include entries
3651 + * with redundant pin->gsi mappings (but unique PCI devices);
3652 + * we only program the IOAPIC on the first.
3653 + */
3654 + if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
3655 + printk(KERN_ERR "Invalid reference to IOAPIC pin "
3656 + "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
3657 + ioapic_pin);
3658 + return gsi;
3659 + }
3660 + if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) {
3661 + Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
3662 + mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
3663 +#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
3664 + return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
3665 +#else
3666 + return gsi;
3667 +#endif
3668 + }
3669 +
3670 + set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed);
3671 +#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
3672 + /*
3673 + * For GSI >= 64, use IRQ compression
3674 + */
3675 + if ((gsi >= IRQ_COMPRESSION_START)
3676 + && (triggering == ACPI_LEVEL_SENSITIVE)) {
3677 + /*
3678 + * For PCI devices assign IRQs in order, avoiding gaps
3679 + * due to unused I/O APIC pins.
3680 + */
3681 + int irq = gsi;
3682 + if (gsi < MAX_GSI_NUM) {
3683 + /*
3684 + * Retain the VIA chipset work-around (gsi > 15), but
3685 + * avoid a problem where the 8254 timer (IRQ0) is setup
3686 + * via an override (so it's not on pin 0 of the ioapic),
3687 + * and at the same time, the pin 0 interrupt is a PCI
3688 + * type. The gsi > 15 test could cause these two pins
3689 + * to be shared as IRQ0, and they are not shareable.
3690 + * So test for this condition, and if necessary, avoid
3691 + * the pin collision.
3692 + */
3693 + gsi = pci_irq++;
3694 + /*
3695 + * Don't assign IRQ used by ACPI SCI
3696 + */
3697 + if (gsi == acpi_gbl_FADT.sci_interrupt)
3698 + gsi = pci_irq++;
3699 + gsi_to_irq[irq] = gsi;
3700 + } else {
3701 + printk(KERN_ERR "GSI %u is too high\n", gsi);
3702 + return gsi;
3703 + }
3704 + }
3705 +#endif
3706 + io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
3707 + triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
3708 + polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
3709 + return gsi;
3710 +}
3711 +
3712 +#endif /* CONFIG_X86_IO_APIC */
3713 +#endif /* CONFIG_ACPI */
3714 --- sle11-2009-10-16.orig/arch/x86/kernel/mpparse_32-xen.c 2009-03-16 16:33:40.000000000 +0100
3715 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
3716 @@ -1,1161 +0,0 @@
3717 -/*
3718 - * Intel Multiprocessor Specification 1.1 and 1.4
3719 - * compliant MP-table parsing routines.
3720 - *
3721 - * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
3722 - * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
3723 - *
3724 - * Fixes
3725 - * Erich Boleyn : MP v1.4 and additional changes.
3726 - * Alan Cox : Added EBDA scanning
3727 - * Ingo Molnar : various cleanups and rewrites
3728 - * Maciej W. Rozycki: Bits for default MP configurations
3729 - * Paul Diefenbaugh: Added full ACPI support
3730 - */
3731 -
3732 -#include <linux/mm.h>
3733 -#include <linux/init.h>
3734 -#include <linux/acpi.h>
3735 -#include <linux/delay.h>
3736 -#include <linux/bootmem.h>
3737 -#include <linux/kernel_stat.h>
3738 -#include <linux/mc146818rtc.h>
3739 -#include <linux/bitops.h>
3740 -
3741 -#include <asm/smp.h>
3742 -#include <asm/acpi.h>
3743 -#include <asm/mtrr.h>
3744 -#include <asm/mpspec.h>
3745 -#include <asm/io_apic.h>
3746 -
3747 -#include <mach_apic.h>
3748 -#include <mach_apicdef.h>
3749 -#include <mach_mpparse.h>
3750 -#include <bios_ebda.h>
3751 -
3752 -/* Have we found an MP table */
3753 -int smp_found_config;
3754 -unsigned int __cpuinitdata maxcpus = NR_CPUS;
3755 -
3756 -/*
3757 - * Various Linux-internal data structures created from the
3758 - * MP-table.
3759 - */
3760 -int apic_version [MAX_APICS];
3761 -int mp_bus_id_to_type [MAX_MP_BUSSES];
3762 -int mp_bus_id_to_node [MAX_MP_BUSSES];
3763 -int mp_bus_id_to_local [MAX_MP_BUSSES];
3764 -int quad_local_to_mp_bus_id [NR_CPUS/4][4];
3765 -int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
3766 -static int mp_current_pci_id;
3767 -
3768 -/* I/O APIC entries */
3769 -struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
3770 -
3771 -/* # of MP IRQ source entries */
3772 -struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
3773 -
3774 -/* MP IRQ source entries */
3775 -int mp_irq_entries;
3776 -
3777 -int nr_ioapics;
3778 -
3779 -int pic_mode;
3780 -unsigned long mp_lapic_addr;
3781 -
3782 -unsigned int def_to_bigsmp = 0;
3783 -
3784 -/* Processor that is doing the boot up */
3785 -unsigned int boot_cpu_physical_apicid = -1U;
3786 -/* Internal processor count */
3787 -unsigned int num_processors;
3788 -
3789 -/* Bitmask of physically existing CPUs */
3790 -physid_mask_t phys_cpu_present_map;
3791 -
3792 -u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
3793 -
3794 -/*
3795 - * Intel MP BIOS table parsing routines:
3796 - */
3797 -
3798 -
3799 -/*
3800 - * Checksum an MP configuration block.
3801 - */
3802 -
3803 -static int __init mpf_checksum(unsigned char *mp, int len)
3804 -{
3805 - int sum = 0;
3806 -
3807 - while (len--)
3808 - sum += *mp++;
3809 -
3810 - return sum & 0xFF;
3811 -}
3812 -
3813 -/*
3814 - * Have to match translation table entries to main table entries by counter
3815 - * hence the mpc_record variable .... can't see a less disgusting way of
3816 - * doing this ....
3817 - */
3818 -
3819 -static int mpc_record;
3820 -static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY] __cpuinitdata;
3821 -
3822 -#ifndef CONFIG_XEN
3823 -static void __cpuinit MP_processor_info (struct mpc_config_processor *m)
3824 -{
3825 - int ver, apicid;
3826 - physid_mask_t phys_cpu;
3827 -
3828 - if (!(m->mpc_cpuflag & CPU_ENABLED))
3829 - return;
3830 -
3831 - apicid = mpc_apic_id(m, translation_table[mpc_record]);
3832 -
3833 - if (m->mpc_featureflag&(1<<0))
3834 - Dprintk(" Floating point unit present.\n");
3835 - if (m->mpc_featureflag&(1<<7))
3836 - Dprintk(" Machine Exception supported.\n");
3837 - if (m->mpc_featureflag&(1<<8))
3838 - Dprintk(" 64 bit compare & exchange supported.\n");
3839 - if (m->mpc_featureflag&(1<<9))
3840 - Dprintk(" Internal APIC present.\n");
3841 - if (m->mpc_featureflag&(1<<11))
3842 - Dprintk(" SEP present.\n");
3843 - if (m->mpc_featureflag&(1<<12))
3844 - Dprintk(" MTRR present.\n");
3845 - if (m->mpc_featureflag&(1<<13))
3846 - Dprintk(" PGE present.\n");
3847 - if (m->mpc_featureflag&(1<<14))
3848 - Dprintk(" MCA present.\n");
3849 - if (m->mpc_featureflag&(1<<15))
3850 - Dprintk(" CMOV present.\n");
3851 - if (m->mpc_featureflag&(1<<16))
3852 - Dprintk(" PAT present.\n");
3853 - if (m->mpc_featureflag&(1<<17))
3854 - Dprintk(" PSE present.\n");
3855 - if (m->mpc_featureflag&(1<<18))
3856 - Dprintk(" PSN present.\n");
3857 - if (m->mpc_featureflag&(1<<19))
3858 - Dprintk(" Cache Line Flush Instruction present.\n");
3859 - /* 20 Reserved */
3860 - if (m->mpc_featureflag&(1<<21))
3861 - Dprintk(" Debug Trace and EMON Store present.\n");
3862 - if (m->mpc_featureflag&(1<<22))
3863 - Dprintk(" ACPI Thermal Throttle Registers present.\n");
3864 - if (m->mpc_featureflag&(1<<23))
3865 - Dprintk(" MMX present.\n");
3866 - if (m->mpc_featureflag&(1<<24))
3867 - Dprintk(" FXSR present.\n");
3868 - if (m->mpc_featureflag&(1<<25))
3869 - Dprintk(" XMM present.\n");
3870 - if (m->mpc_featureflag&(1<<26))
3871 - Dprintk(" Willamette New Instructions present.\n");
3872 - if (m->mpc_featureflag&(1<<27))
3873 - Dprintk(" Self Snoop present.\n");
3874 - if (m->mpc_featureflag&(1<<28))
3875 - Dprintk(" HT present.\n");
3876 - if (m->mpc_featureflag&(1<<29))
3877 - Dprintk(" Thermal Monitor present.\n");
3878 - /* 30, 31 Reserved */
3879 -
3880 -
3881 - if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
3882 - Dprintk(" Bootup CPU\n");
3883 - boot_cpu_physical_apicid = m->mpc_apicid;
3884 - }
3885 -
3886 - ver = m->mpc_apicver;
3887 -
3888 - /*
3889 - * Validate version
3890 - */
3891 - if (ver == 0x0) {
3892 - printk(KERN_WARNING "BIOS bug, APIC version is 0 for CPU#%d! "
3893 - "fixing up to 0x10. (tell your hw vendor)\n",
3894 - m->mpc_apicid);
3895 - ver = 0x10;
3896 - }
3897 - apic_version[m->mpc_apicid] = ver;
3898 -
3899 - phys_cpu = apicid_to_cpu_present(apicid);
3900 - physids_or(phys_cpu_present_map, phys_cpu_present_map, phys_cpu);
3901 -
3902 - if (num_processors >= NR_CPUS) {
3903 - printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
3904 - " Processor ignored.\n", NR_CPUS);
3905 - return;
3906 - }
3907 -
3908 - if (num_processors >= maxcpus) {
3909 - printk(KERN_WARNING "WARNING: maxcpus limit of %i reached."
3910 - " Processor ignored.\n", maxcpus);
3911 - return;
3912 - }
3913 -
3914 - cpu_set(num_processors, cpu_possible_map);
3915 - num_processors++;
3916 -
3917 - /*
3918 - * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y
3919 - * but we need to work other dependencies like SMP_SUSPEND etc
3920 - * before this can be done without some confusion.
3921 - * if (CPU_HOTPLUG_ENABLED || num_processors > 8)
3922 - * - Ashok Raj <ashok.raj@intel.com>
3923 - */
3924 - if (num_processors > 8) {
3925 - switch (boot_cpu_data.x86_vendor) {
3926 - case X86_VENDOR_INTEL:
3927 - if (!APIC_XAPIC(ver)) {
3928 - def_to_bigsmp = 0;
3929 - break;
3930 - }
3931 - /* If P4 and above fall through */
3932 - case X86_VENDOR_AMD:
3933 - def_to_bigsmp = 1;
3934 - }
3935 - }
3936 - bios_cpu_apicid[num_processors - 1] = m->mpc_apicid;
3937 -}
3938 -#else
3939 -static void __cpuinit MP_processor_info (struct mpc_config_processor *m)
3940 -{
3941 - num_processors++;
3942 -}
3943 -#endif /* CONFIG_XEN */
3944 -
3945 -static void __init MP_bus_info (struct mpc_config_bus *m)
3946 -{
3947 - char str[7];
3948 -
3949 - memcpy(str, m->mpc_bustype, 6);
3950 - str[6] = 0;
3951 -
3952 - mpc_oem_bus_info(m, str, translation_table[mpc_record]);
3953 -
3954 -#if MAX_MP_BUSSES < 256
3955 - if (m->mpc_busid >= MAX_MP_BUSSES) {
3956 - printk(KERN_WARNING "MP table busid value (%d) for bustype %s "
3957 - " is too large, max. supported is %d\n",
3958 - m->mpc_busid, str, MAX_MP_BUSSES - 1);
3959 - return;
3960 - }
3961 -#endif
3962 -
3963 - if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA)-1) == 0) {
3964 - mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
3965 - } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA)-1) == 0) {
3966 - mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA;
3967 - } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI)-1) == 0) {
3968 - mpc_oem_pci_bus(m, translation_table[mpc_record]);
3969 - mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
3970 - mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
3971 - mp_current_pci_id++;
3972 - } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA)-1) == 0) {
3973 - mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA;
3974 - } else {
3975 - printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
3976 - }
3977 -}
3978 -
3979 -static void __init MP_ioapic_info (struct mpc_config_ioapic *m)
3980 -{
3981 - if (!(m->mpc_flags & MPC_APIC_USABLE))
3982 - return;
3983 -
3984 - printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n",
3985 - m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
3986 - if (nr_ioapics >= MAX_IO_APICS) {
3987 - printk(KERN_CRIT "Max # of I/O APICs (%d) exceeded (found %d).\n",
3988 - MAX_IO_APICS, nr_ioapics);
3989 - panic("Recompile kernel with bigger MAX_IO_APICS!.\n");
3990 - }
3991 - if (!m->mpc_apicaddr) {
3992 - printk(KERN_ERR "WARNING: bogus zero I/O APIC address"
3993 - " found in MP table, skipping!\n");
3994 - return;
3995 - }
3996 - mp_ioapics[nr_ioapics] = *m;
3997 - nr_ioapics++;
3998 -}
3999 -
4000 -static void __init MP_intsrc_info (struct mpc_config_intsrc *m)
4001 -{
4002 - mp_irqs [mp_irq_entries] = *m;
4003 - Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
4004 - " IRQ %02x, APIC ID %x, APIC INT %02x\n",
4005 - m->mpc_irqtype, m->mpc_irqflag & 3,
4006 - (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
4007 - m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
4008 - if (++mp_irq_entries == MAX_IRQ_SOURCES)
4009 - panic("Max # of irq sources exceeded!!\n");
4010 -}
4011 -
4012 -static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m)
4013 -{
4014 - Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
4015 - " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
4016 - m->mpc_irqtype, m->mpc_irqflag & 3,
4017 - (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid,
4018 - m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
4019 -}
4020 -
4021 -#ifdef CONFIG_X86_NUMAQ
4022 -static void __init MP_translation_info (struct mpc_config_translation *m)
4023 -{
4024 - printk(KERN_INFO "Translation: record %d, type %d, quad %d, global %d, local %d\n", mpc_record, m->trans_type, m->trans_quad, m->trans_global, m->trans_local);
4025 -
4026 - if (mpc_record >= MAX_MPC_ENTRY)
4027 - printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
4028 - else
4029 - translation_table[mpc_record] = m; /* stash this for later */
4030 - if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
4031 - node_set_online(m->trans_quad);
4032 -}
4033 -
4034 -/*
4035 - * Read/parse the MPC oem tables
4036 - */
4037 -
4038 -static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable, \
4039 - unsigned short oemsize)
4040 -{
4041 - int count = sizeof (*oemtable); /* the header size */
4042 - unsigned char *oemptr = ((unsigned char *)oemtable)+count;
4043 -
4044 - mpc_record = 0;
4045 - printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n", oemtable);
4046 - if (memcmp(oemtable->oem_signature,MPC_OEM_SIGNATURE,4))
4047 - {
4048 - printk(KERN_WARNING "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
4049 - oemtable->oem_signature[0],
4050 - oemtable->oem_signature[1],
4051 - oemtable->oem_signature[2],
4052 - oemtable->oem_signature[3]);
4053 - return;
4054 - }
4055 - if (mpf_checksum((unsigned char *)oemtable,oemtable->oem_length))
4056 - {
4057 - printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
4058 - return;
4059 - }
4060 - while (count < oemtable->oem_length) {
4061 - switch (*oemptr) {
4062 - case MP_TRANSLATION:
4063 - {
4064 - struct mpc_config_translation *m=
4065 - (struct mpc_config_translation *)oemptr;
4066 - MP_translation_info(m);
4067 - oemptr += sizeof(*m);
4068 - count += sizeof(*m);
4069 - ++mpc_record;
4070 - break;
4071 - }
4072 - default:
4073 - {
4074 - printk(KERN_WARNING "Unrecognised OEM table entry type! - %d\n", (int) *oemptr);
4075 - return;
4076 - }
4077 - }
4078 - }
4079 -}
4080 -
4081 -static inline void mps_oem_check(struct mp_config_table *mpc, char *oem,
4082 - char *productid)
4083 -{
4084 - if (strncmp(oem, "IBM NUMA", 8))
4085 - printk("Warning! May not be a NUMA-Q system!\n");
4086 - if (mpc->mpc_oemptr)
4087 - smp_read_mpc_oem((struct mp_config_oemtable *) mpc->mpc_oemptr,
4088 - mpc->mpc_oemsize);
4089 -}
4090 -#endif /* CONFIG_X86_NUMAQ */
4091 -
4092 -/*
4093 - * Read/parse the MPC
4094 - */
4095 -
4096 -static int __init smp_read_mpc(struct mp_config_table *mpc)
4097 -{
4098 - char str[16];
4099 - char oem[10];
4100 - int count=sizeof(*mpc);
4101 - unsigned char *mpt=((unsigned char *)mpc)+count;
4102 -
4103 - if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) {
4104 - printk(KERN_ERR "SMP mptable: bad signature [0x%x]!\n",
4105 - *(u32 *)mpc->mpc_signature);
4106 - return 0;
4107 - }
4108 - if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) {
4109 - printk(KERN_ERR "SMP mptable: checksum error!\n");
4110 - return 0;
4111 - }
4112 - if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) {
4113 - printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n",
4114 - mpc->mpc_spec);
4115 - return 0;
4116 - }
4117 - if (!mpc->mpc_lapic) {
4118 - printk(KERN_ERR "SMP mptable: null local APIC address!\n");
4119 - return 0;
4120 - }
4121 - memcpy(oem,mpc->mpc_oem,8);
4122 - oem[8]=0;
4123 - printk(KERN_INFO "OEM ID: %s ",oem);
4124 -
4125 - memcpy(str,mpc->mpc_productid,12);
4126 - str[12]=0;
4127 - printk("Product ID: %s ",str);
4128 -
4129 - mps_oem_check(mpc, oem, str);
4130 -
4131 - printk("APIC at: 0x%X\n", mpc->mpc_lapic);
4132 -
4133 - /*
4134 - * Save the local APIC address (it might be non-default) -- but only
4135 - * if we're not using ACPI.
4136 - */
4137 - if (!acpi_lapic)
4138 - mp_lapic_addr = mpc->mpc_lapic;
4139 -
4140 - /*
4141 - * Now process the configuration blocks.
4142 - */
4143 - mpc_record = 0;
4144 - while (count < mpc->mpc_length) {
4145 - switch(*mpt) {
4146 - case MP_PROCESSOR:
4147 - {
4148 - struct mpc_config_processor *m=
4149 - (struct mpc_config_processor *)mpt;
4150 - /* ACPI may have already provided this data */
4151 - if (!acpi_lapic)
4152 - MP_processor_info(m);
4153 - mpt += sizeof(*m);
4154 - count += sizeof(*m);
4155 - break;
4156 - }
4157 - case MP_BUS:
4158 - {
4159 - struct mpc_config_bus *m=
4160 - (struct mpc_config_bus *)mpt;
4161 - MP_bus_info(m);
4162 - mpt += sizeof(*m);
4163 - count += sizeof(*m);
4164 - break;
4165 - }
4166 - case MP_IOAPIC:
4167 - {
4168 - struct mpc_config_ioapic *m=
4169 - (struct mpc_config_ioapic *)mpt;
4170 - MP_ioapic_info(m);
4171 - mpt+=sizeof(*m);
4172 - count+=sizeof(*m);
4173 - break;
4174 - }
4175 - case MP_INTSRC:
4176 - {
4177 - struct mpc_config_intsrc *m=
4178 - (struct mpc_config_intsrc *)mpt;
4179 -
4180 - MP_intsrc_info(m);
4181 - mpt+=sizeof(*m);
4182 - count+=sizeof(*m);
4183 - break;
4184 - }
4185 - case MP_LINTSRC:
4186 - {
4187 - struct mpc_config_lintsrc *m=
4188 - (struct mpc_config_lintsrc *)mpt;
4189 - MP_lintsrc_info(m);
4190 - mpt+=sizeof(*m);
4191 - count+=sizeof(*m);
4192 - break;
4193 - }
4194 - default:
4195 - {
4196 - count = mpc->mpc_length;
4197 - break;
4198 - }
4199 - }
4200 - ++mpc_record;
4201 - }
4202 - setup_apic_routing();
4203 - if (!num_processors)
4204 - printk(KERN_ERR "SMP mptable: no processors registered!\n");
4205 - return num_processors;
4206 -}
4207 -
4208 -static int __init ELCR_trigger(unsigned int irq)
4209 -{
4210 - unsigned int port;
4211 -
4212 - port = 0x4d0 + (irq >> 3);
4213 - return (inb(port) >> (irq & 7)) & 1;
4214 -}
4215 -
4216 -static void __init construct_default_ioirq_mptable(int mpc_default_type)
4217 -{
4218 - struct mpc_config_intsrc intsrc;
4219 - int i;
4220 - int ELCR_fallback = 0;
4221 -
4222 - intsrc.mpc_type = MP_INTSRC;
4223 - intsrc.mpc_irqflag = 0; /* conforming */
4224 - intsrc.mpc_srcbus = 0;
4225 - intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
4226 -
4227 - intsrc.mpc_irqtype = mp_INT;
4228 -
4229 - /*
4230 - * If true, we have an ISA/PCI system with no IRQ entries
4231 - * in the MP table. To prevent the PCI interrupts from being set up
4232 - * incorrectly, we try to use the ELCR. The sanity check to see if
4233 - * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can
4234 - * never be level sensitive, so we simply see if the ELCR agrees.
4235 - * If it does, we assume it's valid.
4236 - */
4237 - if (mpc_default_type == 5) {
4238 - printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n");
4239 -
4240 - if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13))
4241 - printk(KERN_WARNING "ELCR contains invalid data... not using ELCR\n");
4242 - else {
4243 - printk(KERN_INFO "Using ELCR to identify PCI interrupts\n");
4244 - ELCR_fallback = 1;
4245 - }
4246 - }
4247 -
4248 - for (i = 0; i < 16; i++) {
4249 - switch (mpc_default_type) {
4250 - case 2:
4251 - if (i == 0 || i == 13)
4252 - continue; /* IRQ0 & IRQ13 not connected */
4253 - /* fall through */
4254 - default:
4255 - if (i == 2)
4256 - continue; /* IRQ2 is never connected */
4257 - }
4258 -
4259 - if (ELCR_fallback) {
4260 - /*
4261 - * If the ELCR indicates a level-sensitive interrupt, we
4262 - * copy that information over to the MP table in the
4263 - * irqflag field (level sensitive, active high polarity).
4264 - */
4265 - if (ELCR_trigger(i))
4266 - intsrc.mpc_irqflag = 13;
4267 - else
4268 - intsrc.mpc_irqflag = 0;
4269 - }
4270 -
4271 - intsrc.mpc_srcbusirq = i;
4272 - intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */
4273 - MP_intsrc_info(&intsrc);
4274 - }
4275 -
4276 - intsrc.mpc_irqtype = mp_ExtINT;
4277 - intsrc.mpc_srcbusirq = 0;
4278 - intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */
4279 - MP_intsrc_info(&intsrc);
4280 -}
4281 -
4282 -static inline void __init construct_default_ISA_mptable(int mpc_default_type)
4283 -{
4284 - struct mpc_config_processor processor;
4285 - struct mpc_config_bus bus;
4286 - struct mpc_config_ioapic ioapic;
4287 - struct mpc_config_lintsrc lintsrc;
4288 - int linttypes[2] = { mp_ExtINT, mp_NMI };
4289 - int i;
4290 -
4291 - /*
4292 - * local APIC has default address
4293 - */
4294 - mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
4295 -
4296 - /*
4297 - * 2 CPUs, numbered 0 & 1.
4298 - */
4299 - processor.mpc_type = MP_PROCESSOR;
4300 - /* Either an integrated APIC or a discrete 82489DX. */
4301 - processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
4302 - processor.mpc_cpuflag = CPU_ENABLED;
4303 - processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
4304 - (boot_cpu_data.x86_model << 4) |
4305 - boot_cpu_data.x86_mask;
4306 - processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
4307 - processor.mpc_reserved[0] = 0;
4308 - processor.mpc_reserved[1] = 0;
4309 - for (i = 0; i < 2; i++) {
4310 - processor.mpc_apicid = i;
4311 - MP_processor_info(&processor);
4312 - }
4313 -
4314 - bus.mpc_type = MP_BUS;
4315 - bus.mpc_busid = 0;
4316 - switch (mpc_default_type) {
4317 - default:
4318 - printk("???\n");
4319 - printk(KERN_ERR "Unknown standard configuration %d\n",
4320 - mpc_default_type);
4321 - /* fall through */
4322 - case 1:
4323 - case 5:
4324 - memcpy(bus.mpc_bustype, "ISA ", 6);
4325 - break;
4326 - case 2:
4327 - case 6:
4328 - case 3:
4329 - memcpy(bus.mpc_bustype, "EISA ", 6);
4330 - break;
4331 - case 4:
4332 - case 7:
4333 - memcpy(bus.mpc_bustype, "MCA ", 6);
4334 - }
4335 - MP_bus_info(&bus);
4336 - if (mpc_default_type > 4) {
4337 - bus.mpc_busid = 1;
4338 - memcpy(bus.mpc_bustype, "PCI ", 6);
4339 - MP_bus_info(&bus);
4340 - }
4341 -
4342 - ioapic.mpc_type = MP_IOAPIC;
4343 - ioapic.mpc_apicid = 2;
4344 - ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
4345 - ioapic.mpc_flags = MPC_APIC_USABLE;
4346 - ioapic.mpc_apicaddr = 0xFEC00000;
4347 - MP_ioapic_info(&ioapic);
4348 -
4349 - /*
4350 - * We set up most of the low 16 IO-APIC pins according to MPS rules.
4351 - */
4352 - construct_default_ioirq_mptable(mpc_default_type);
4353 -
4354 - lintsrc.mpc_type = MP_LINTSRC;
4355 - lintsrc.mpc_irqflag = 0; /* conforming */
4356 - lintsrc.mpc_srcbusid = 0;
4357 - lintsrc.mpc_srcbusirq = 0;
4358 - lintsrc.mpc_destapic = MP_APIC_ALL;
4359 - for (i = 0; i < 2; i++) {
4360 - lintsrc.mpc_irqtype = linttypes[i];
4361 - lintsrc.mpc_destapiclint = i;
4362 - MP_lintsrc_info(&lintsrc);
4363 - }
4364 -}
4365 -
4366 -static struct intel_mp_floating *mpf_found;
4367 -
4368 -/*
4369 - * Scan the memory blocks for an SMP configuration block.
4370 - */
4371 -void __init get_smp_config (void)
4372 -{
4373 - struct intel_mp_floating *mpf = mpf_found;
4374 -
4375 - /*
4376 - * ACPI supports both logical (e.g. Hyper-Threading) and physical
4377 - * processors, where MPS only supports physical.
4378 - */
4379 - if (acpi_lapic && acpi_ioapic) {
4380 - printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n");
4381 - return;
4382 - }
4383 - else if (acpi_lapic)
4384 - printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n");
4385 -
4386 - printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification);
4387 - if (mpf->mpf_feature2 & (1<<7)) {
4388 - printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
4389 - pic_mode = 1;
4390 - } else {
4391 - printk(KERN_INFO " Virtual Wire compatibility mode.\n");
4392 - pic_mode = 0;
4393 - }
4394 -
4395 - /*
4396 - * Now see if we need to read further.
4397 - */
4398 - if (mpf->mpf_feature1 != 0) {
4399 -
4400 - printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1);
4401 - construct_default_ISA_mptable(mpf->mpf_feature1);
4402 -
4403 - } else if (mpf->mpf_physptr) {
4404 -
4405 - /*
4406 - * Read the physical hardware table. Anything here will
4407 - * override the defaults.
4408 - */
4409 - if (!smp_read_mpc(isa_bus_to_virt(mpf->mpf_physptr))) {
4410 - smp_found_config = 0;
4411 - printk(KERN_ERR "BIOS bug, MP table errors detected!...\n");
4412 - printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n");
4413 - return;
4414 - }
4415 - /*
4416 - * If there are no explicit MP IRQ entries, then we are
4417 - * broken. We set up most of the low 16 IO-APIC pins to
4418 - * ISA defaults and hope it will work.
4419 - */
4420 - if (!mp_irq_entries) {
4421 - struct mpc_config_bus bus;
4422 -
4423 - printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n");
4424 -
4425 - bus.mpc_type = MP_BUS;
4426 - bus.mpc_busid = 0;
4427 - memcpy(bus.mpc_bustype, "ISA ", 6);
4428 - MP_bus_info(&bus);
4429 -
4430 - construct_default_ioirq_mptable(0);
4431 - }
4432 -
4433 - } else
4434 - BUG();
4435 -
4436 - printk(KERN_INFO "Processors: %d\n", num_processors);
4437 - /*
4438 - * Only use the first configuration found.
4439 - */
4440 -}
4441 -
4442 -static int __init smp_scan_config (unsigned long base, unsigned long length)
4443 -{
4444 - unsigned long *bp = isa_bus_to_virt(base);
4445 - struct intel_mp_floating *mpf;
4446 -
4447 - printk(KERN_INFO "Scan SMP from %p for %ld bytes.\n", bp,length);
4448 - if (sizeof(*mpf) != 16)
4449 - printk("Error: MPF size\n");
4450 -
4451 - while (length > 0) {
4452 - mpf = (struct intel_mp_floating *)bp;
4453 - if ((*bp == SMP_MAGIC_IDENT) &&
4454 - (mpf->mpf_length == 1) &&
4455 - !mpf_checksum((unsigned char *)bp, 16) &&
4456 - ((mpf->mpf_specification == 1)
4457 - || (mpf->mpf_specification == 4)) ) {
4458 -
4459 - smp_found_config = 1;
4460 -#ifndef CONFIG_XEN
4461 - printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
4462 - mpf, virt_to_phys(mpf));
4463 - reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE,
4464 - BOOTMEM_DEFAULT);
4465 - if (mpf->mpf_physptr) {
4466 - /*
4467 - * We cannot access to MPC table to compute
4468 - * table size yet, as only few megabytes from
4469 - * the bottom is mapped now.
4470 - * PC-9800's MPC table places on the very last
4471 - * of physical memory; so that simply reserving
4472 - * PAGE_SIZE from mpg->mpf_physptr yields BUG()
4473 - * in reserve_bootmem.
4474 - */
4475 - unsigned long size = PAGE_SIZE;
4476 - unsigned long end = max_low_pfn * PAGE_SIZE;
4477 - if (mpf->mpf_physptr + size > end)
4478 - size = end - mpf->mpf_physptr;
4479 - reserve_bootmem(mpf->mpf_physptr, size,
4480 - BOOTMEM_DEFAULT);
4481 - }
4482 -#else
4483 - printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
4484 - mpf, ((void *)bp - isa_bus_to_virt(base)) + base);
4485 -#endif
4486 -
4487 - mpf_found = mpf;
4488 - return 1;
4489 - }
4490 - bp += 4;
4491 - length -= 16;
4492 - }
4493 - return 0;
4494 -}
4495 -
4496 -void __init find_smp_config (void)
4497 -{
4498 -#ifndef CONFIG_XEN
4499 - unsigned int address;
4500 -#endif
4501 -
4502 - /*
4503 - * FIXME: Linux assumes you have 640K of base ram..
4504 - * this continues the error...
4505 - *
4506 - * 1) Scan the bottom 1K for a signature
4507 - * 2) Scan the top 1K of base RAM
4508 - * 3) Scan the 64K of bios
4509 - */
4510 - if (smp_scan_config(0x0,0x400) ||
4511 - smp_scan_config(639*0x400,0x400) ||
4512 - smp_scan_config(0xF0000,0x10000))
4513 - return;
4514 - /*
4515 - * If it is an SMP machine we should know now, unless the
4516 - * configuration is in an EISA/MCA bus machine with an
4517 - * extended bios data area.
4518 - *
4519 - * there is a real-mode segmented pointer pointing to the
4520 - * 4K EBDA area at 0x40E, calculate and scan it here.
4521 - *
4522 - * NOTE! There are Linux loaders that will corrupt the EBDA
4523 - * area, and as such this kind of SMP config may be less
4524 - * trustworthy, simply because the SMP table may have been
4525 - * stomped on during early boot. These loaders are buggy and
4526 - * should be fixed.
4527 - *
4528 - * MP1.4 SPEC states to only scan first 1K of 4K EBDA.
4529 - */
4530 -
4531 -#ifndef CONFIG_XEN
4532 - address = get_bios_ebda();
4533 - if (address)
4534 - smp_scan_config(address, 0x400);
4535 -#endif
4536 -}
4537 -
4538 -int es7000_plat;
4539 -
4540 -/* --------------------------------------------------------------------------
4541 - ACPI-based MP Configuration
4542 - -------------------------------------------------------------------------- */
4543 -
4544 -#ifdef CONFIG_ACPI
4545 -
4546 -void __init mp_register_lapic_address(u64 address)
4547 -{
4548 -#ifndef CONFIG_XEN
4549 - mp_lapic_addr = (unsigned long) address;
4550 -
4551 - set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
4552 -
4553 - if (boot_cpu_physical_apicid == -1U)
4554 - boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
4555 -
4556 - Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid);
4557 -#endif
4558 -}
4559 -
4560 -void __cpuinit mp_register_lapic (u8 id, u8 enabled)
4561 -{
4562 - struct mpc_config_processor processor;
4563 - int boot_cpu = 0;
4564 -
4565 - if (MAX_APICS - id <= 0) {
4566 - printk(KERN_WARNING "Processor #%d invalid (max %d)\n",
4567 - id, MAX_APICS);
4568 - return;
4569 - }
4570 -
4571 - if (id == boot_cpu_physical_apicid)
4572 - boot_cpu = 1;
4573 -
4574 -#ifndef CONFIG_XEN
4575 - processor.mpc_type = MP_PROCESSOR;
4576 - processor.mpc_apicid = id;
4577 - processor.mpc_apicver = GET_APIC_VERSION(apic_read(APIC_LVR));
4578 - processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0);
4579 - processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0);
4580 - processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
4581 - (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
4582 - processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
4583 - processor.mpc_reserved[0] = 0;
4584 - processor.mpc_reserved[1] = 0;
4585 -#endif
4586 -
4587 - MP_processor_info(&processor);
4588 -}
4589 -
4590 -#ifdef CONFIG_X86_IO_APIC
4591 -
4592 -#define MP_ISA_BUS 0
4593 -#define MP_MAX_IOAPIC_PIN 127
4594 -
4595 -static struct mp_ioapic_routing {
4596 - int apic_id;
4597 - int gsi_base;
4598 - int gsi_end;
4599 - u32 pin_programmed[4];
4600 -} mp_ioapic_routing[MAX_IO_APICS];
4601 -
4602 -static int mp_find_ioapic (int gsi)
4603 -{
4604 - int i = 0;
4605 -
4606 - /* Find the IOAPIC that manages this GSI. */
4607 - for (i = 0; i < nr_ioapics; i++) {
4608 - if ((gsi >= mp_ioapic_routing[i].gsi_base)
4609 - && (gsi <= mp_ioapic_routing[i].gsi_end))
4610 - return i;
4611 - }
4612 -
4613 - printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
4614 -
4615 - return -1;
4616 -}
4617 -
4618 -void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base)
4619 -{
4620 - int idx = 0;
4621 - int tmpid;
4622 -
4623 - if (nr_ioapics >= MAX_IO_APICS) {
4624 - printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
4625 - "(found %d)\n", MAX_IO_APICS, nr_ioapics);
4626 - panic("Recompile kernel with bigger MAX_IO_APICS!\n");
4627 - }
4628 - if (!address) {
4629 - printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
4630 - " found in MADT table, skipping!\n");
4631 - return;
4632 - }
4633 -
4634 - idx = nr_ioapics++;
4635 -
4636 - mp_ioapics[idx].mpc_type = MP_IOAPIC;
4637 - mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
4638 - mp_ioapics[idx].mpc_apicaddr = address;
4639 -
4640 -#ifndef CONFIG_XEN
4641 - set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
4642 -#endif
4643 - if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
4644 - && !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
4645 - tmpid = io_apic_get_unique_id(idx, id);
4646 - else
4647 - tmpid = id;
4648 - if (tmpid == -1) {
4649 - nr_ioapics--;
4650 - return;
4651 - }
4652 - mp_ioapics[idx].mpc_apicid = tmpid;
4653 - mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
4654 -
4655 - /*
4656 - * Build basic GSI lookup table to facilitate gsi->io_apic lookups
4657 - * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
4658 - */
4659 - mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
4660 - mp_ioapic_routing[idx].gsi_base = gsi_base;
4661 - mp_ioapic_routing[idx].gsi_end = gsi_base +
4662 - io_apic_get_redir_entries(idx);
4663 -
4664 - printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
4665 - "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
4666 - mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
4667 - mp_ioapic_routing[idx].gsi_base,
4668 - mp_ioapic_routing[idx].gsi_end);
4669 -}
4670 -
4671 -void __init
4672 -mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
4673 -{
4674 - struct mpc_config_intsrc intsrc;
4675 - int ioapic = -1;
4676 - int pin = -1;
4677 -
4678 - /*
4679 - * Convert 'gsi' to 'ioapic.pin'.
4680 - */
4681 - ioapic = mp_find_ioapic(gsi);
4682 - if (ioapic < 0)
4683 - return;
4684 - pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
4685 -
4686 - /*
4687 - * TBD: This check is for faulty timer entries, where the override
4688 - * erroneously sets the trigger to level, resulting in a HUGE
4689 - * increase of timer interrupts!
4690 - */
4691 - if ((bus_irq == 0) && (trigger == 3))
4692 - trigger = 1;
4693 -
4694 - intsrc.mpc_type = MP_INTSRC;
4695 - intsrc.mpc_irqtype = mp_INT;
4696 - intsrc.mpc_irqflag = (trigger << 2) | polarity;
4697 - intsrc.mpc_srcbus = MP_ISA_BUS;
4698 - intsrc.mpc_srcbusirq = bus_irq; /* IRQ */
4699 - intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */
4700 - intsrc.mpc_dstirq = pin; /* INTIN# */
4701 -
4702 - Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n",
4703 - intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
4704 - (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
4705 - intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq);
4706 -
4707 - mp_irqs[mp_irq_entries] = intsrc;
4708 - if (++mp_irq_entries == MAX_IRQ_SOURCES)
4709 - panic("Max # of irq sources exceeded!\n");
4710 -}
4711 -
4712 -void __init mp_config_acpi_legacy_irqs (void)
4713 -{
4714 - struct mpc_config_intsrc intsrc;
4715 - int i = 0;
4716 - int ioapic = -1;
4717 -
4718 - /*
4719 - * Fabricate the legacy ISA bus (bus #31).
4720 - */
4721 - mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
4722 - Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
4723 -
4724 - /*
4725 - * Older generations of ES7000 have no legacy identity mappings
4726 - */
4727 - if (es7000_plat == 1)
4728 - return;
4729 -
4730 - /*
4731 - * Locate the IOAPIC that manages the ISA IRQs (0-15).
4732 - */
4733 - ioapic = mp_find_ioapic(0);
4734 - if (ioapic < 0)
4735 - return;
4736 -
4737 - intsrc.mpc_type = MP_INTSRC;
4738 - intsrc.mpc_irqflag = 0; /* Conforming */
4739 - intsrc.mpc_srcbus = MP_ISA_BUS;
4740 - intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
4741 -
4742 - /*
4743 - * Use the default configuration for the IRQs 0-15. Unless
4744 - * overridden by (MADT) interrupt source override entries.
4745 - */
4746 - for (i = 0; i < 16; i++) {
4747 - int idx;
4748 -
4749 - for (idx = 0; idx < mp_irq_entries; idx++) {
4750 - struct mpc_config_intsrc *irq = mp_irqs + idx;
4751 -
4752 - /* Do we already have a mapping for this ISA IRQ? */
4753 - if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i)
4754 - break;
4755 -
4756 - /* Do we already have a mapping for this IOAPIC pin */
4757 - if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
4758 - (irq->mpc_dstirq == i))
4759 - break;
4760 - }
4761 -
4762 - if (idx != mp_irq_entries) {
4763 - printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
4764 - continue; /* IRQ already used */
4765 - }
4766 -
4767 - intsrc.mpc_irqtype = mp_INT;
4768 - intsrc.mpc_srcbusirq = i; /* Identity mapped */
4769 - intsrc.mpc_dstirq = i;
4770 -
4771 - Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, "
4772 - "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
4773 - (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
4774 - intsrc.mpc_srcbusirq, intsrc.mpc_dstapic,
4775 - intsrc.mpc_dstirq);
4776 -
4777 - mp_irqs[mp_irq_entries] = intsrc;
4778 - if (++mp_irq_entries == MAX_IRQ_SOURCES)
4779 - panic("Max # of irq sources exceeded!\n");
4780 - }
4781 -}
4782 -
4783 -#define MAX_GSI_NUM 4096
4784 -#define IRQ_COMPRESSION_START 64
4785 -
4786 -int mp_register_gsi(u32 gsi, int triggering, int polarity)
4787 -{
4788 - int ioapic = -1;
4789 - int ioapic_pin = 0;
4790 - int idx, bit = 0;
4791 - static int pci_irq = IRQ_COMPRESSION_START;
4792 - /*
4793 - * Mapping between Global System Interrupts, which
4794 - * represent all possible interrupts, and IRQs
4795 - * assigned to actual devices.
4796 - */
4797 - static int gsi_to_irq[MAX_GSI_NUM];
4798 -
4799 - /* Don't set up the ACPI SCI because it's already set up */
4800 - if (acpi_gbl_FADT.sci_interrupt == gsi)
4801 - return gsi;
4802 -
4803 - ioapic = mp_find_ioapic(gsi);
4804 - if (ioapic < 0) {
4805 - printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
4806 - return gsi;
4807 - }
4808 -
4809 - ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
4810 -
4811 - if (ioapic_renumber_irq)
4812 - gsi = ioapic_renumber_irq(ioapic, gsi);
4813 -
4814 - /*
4815 - * Avoid pin reprogramming. PRTs typically include entries
4816 - * with redundant pin->gsi mappings (but unique PCI devices);
4817 - * we only program the IOAPIC on the first.
4818 - */
4819 - bit = ioapic_pin % 32;
4820 - idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32);
4821 - if (idx > 3) {
4822 - printk(KERN_ERR "Invalid reference to IOAPIC pin "
4823 - "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
4824 - ioapic_pin);
4825 - return gsi;
4826 - }
4827 - if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
4828 - Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
4829 - mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
4830 - return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
4831 - }
4832 -
4833 - mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
4834 -
4835 - /*
4836 - * For GSI >= 64, use IRQ compression
4837 - */
4838 - if ((gsi >= IRQ_COMPRESSION_START)
4839 - && (triggering == ACPI_LEVEL_SENSITIVE)) {
4840 - /*
4841 - * For PCI devices assign IRQs in order, avoiding gaps
4842 - * due to unused I/O APIC pins.
4843 - */
4844 - int irq = gsi;
4845 - if (gsi < MAX_GSI_NUM) {
4846 - /*
4847 - * Retain the VIA chipset work-around (gsi > 15), but
4848 - * avoid a problem where the 8254 timer (IRQ0) is setup
4849 - * via an override (so it's not on pin 0 of the ioapic),
4850 - * and at the same time, the pin 0 interrupt is a PCI
4851 - * type. The gsi > 15 test could cause these two pins
4852 - * to be shared as IRQ0, and they are not shareable.
4853 - * So test for this condition, and if necessary, avoid
4854 - * the pin collision.
4855 - */
4856 - if (gsi > 15 || (gsi == 0 && !timer_uses_ioapic_pin_0))
4857 - gsi = pci_irq++;
4858 - /*
4859 - * Don't assign IRQ used by ACPI SCI
4860 - */
4861 - if (gsi == acpi_gbl_FADT.sci_interrupt)
4862 - gsi = pci_irq++;
4863 - gsi_to_irq[irq] = gsi;
4864 - } else {
4865 - printk(KERN_ERR "GSI %u is too high\n", gsi);
4866 - return gsi;
4867 - }
4868 - }
4869 -
4870 - io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
4871 - triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
4872 - polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
4873 - return gsi;
4874 -}
4875 -
4876 -#endif /* CONFIG_X86_IO_APIC */
4877 -#endif /* CONFIG_ACPI */
4878 --- sle11-2009-10-16.orig/arch/x86/kernel/mpparse_64-xen.c 2009-03-16 16:33:40.000000000 +0100
4879 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
4880 @@ -1,879 +0,0 @@
4881 -/*
4882 - * Intel Multiprocessor Specification 1.1 and 1.4
4883 - * compliant MP-table parsing routines.
4884 - *
4885 - * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
4886 - * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
4887 - *
4888 - * Fixes
4889 - * Erich Boleyn : MP v1.4 and additional changes.
4890 - * Alan Cox : Added EBDA scanning
4891 - * Ingo Molnar : various cleanups and rewrites
4892 - * Maciej W. Rozycki: Bits for default MP configurations
4893 - * Paul Diefenbaugh: Added full ACPI support
4894 - */
4895 -
4896 -#include <linux/mm.h>
4897 -#include <linux/init.h>
4898 -#include <linux/delay.h>
4899 -#include <linux/bootmem.h>
4900 -#include <linux/kernel_stat.h>
4901 -#include <linux/mc146818rtc.h>
4902 -#include <linux/acpi.h>
4903 -#include <linux/module.h>
4904 -
4905 -#include <asm/smp.h>
4906 -#include <asm/mtrr.h>
4907 -#include <asm/mpspec.h>
4908 -#include <asm/pgalloc.h>
4909 -#include <asm/io_apic.h>
4910 -#include <asm/proto.h>
4911 -#include <asm/acpi.h>
4912 -
4913 -/* Have we found an MP table */
4914 -int smp_found_config;
4915 -
4916 -/*
4917 - * Various Linux-internal data structures created from the
4918 - * MP-table.
4919 - */
4920 -DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
4921 -int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
4922 -
4923 -static int mp_current_pci_id = 0;
4924 -/* I/O APIC entries */
4925 -struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
4926 -
4927 -/* # of MP IRQ source entries */
4928 -struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
4929 -
4930 -/* MP IRQ source entries */
4931 -int mp_irq_entries;
4932 -
4933 -int nr_ioapics;
4934 -unsigned long mp_lapic_addr = 0;
4935 -
4936 -
4937 -
4938 -/* Processor that is doing the boot up */
4939 -unsigned int boot_cpu_id = -1U;
4940 -EXPORT_SYMBOL(boot_cpu_id);
4941 -
4942 -/* Internal processor count */
4943 -unsigned int num_processors;
4944 -
4945 -unsigned disabled_cpus __cpuinitdata;
4946 -
4947 -/* Bitmask of physically existing CPUs */
4948 -physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE;
4949 -
4950 -#ifndef CONFIG_XEN
4951 -u16 x86_bios_cpu_apicid_init[NR_CPUS] __initdata
4952 - = { [0 ... NR_CPUS-1] = BAD_APICID };
4953 -void *x86_bios_cpu_apicid_early_ptr;
4954 -#endif
4955 -DEFINE_PER_CPU(u16, x86_bios_cpu_apicid) = BAD_APICID;
4956 -EXPORT_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
4957 -
4958 -
4959 -/*
4960 - * Intel MP BIOS table parsing routines:
4961 - */
4962 -
4963 -/*
4964 - * Checksum an MP configuration block.
4965 - */
4966 -
4967 -static int __init mpf_checksum(unsigned char *mp, int len)
4968 -{
4969 - int sum = 0;
4970 -
4971 - while (len--)
4972 - sum += *mp++;
4973 -
4974 - return sum & 0xFF;
4975 -}
4976 -
4977 -#ifndef CONFIG_XEN
4978 -static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
4979 -{
4980 - int cpu;
4981 - cpumask_t tmp_map;
4982 - char *bootup_cpu = "";
4983 -
4984 - if (!(m->mpc_cpuflag & CPU_ENABLED)) {
4985 - disabled_cpus++;
4986 - return;
4987 - }
4988 - if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
4989 - bootup_cpu = " (Bootup-CPU)";
4990 - boot_cpu_id = m->mpc_apicid;
4991 - }
4992 -
4993 - printk(KERN_INFO "Processor #%d%s\n", m->mpc_apicid, bootup_cpu);
4994 -
4995 - if (num_processors >= NR_CPUS) {
4996 - printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
4997 - " Processor ignored.\n", NR_CPUS);
4998 - return;
4999 - }
5000 -
5001 - num_processors++;
5002 - cpus_complement(tmp_map, cpu_present_map);
5003 - cpu = first_cpu(tmp_map);
5004 -
5005 - physid_set(m->mpc_apicid, phys_cpu_present_map);
5006 - if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
5007 - /*
5008 - * x86_bios_cpu_apicid is required to have processors listed
5009 - * in same order as logical cpu numbers. Hence the first
5010 - * entry is BSP, and so on.
5011 - */
5012 - cpu = 0;
5013 - }
5014 - /* are we being called early in kernel startup? */
5015 - if (x86_cpu_to_apicid_early_ptr) {
5016 - u16 *cpu_to_apicid = x86_cpu_to_apicid_early_ptr;
5017 - u16 *bios_cpu_apicid = x86_bios_cpu_apicid_early_ptr;
5018 -
5019 - cpu_to_apicid[cpu] = m->mpc_apicid;
5020 - bios_cpu_apicid[cpu] = m->mpc_apicid;
5021 - } else {
5022 - per_cpu(x86_cpu_to_apicid, cpu) = m->mpc_apicid;
5023 - per_cpu(x86_bios_cpu_apicid, cpu) = m->mpc_apicid;
5024 - }
5025 -
5026 - cpu_set(cpu, cpu_possible_map);
5027 - cpu_set(cpu, cpu_present_map);
5028 -}
5029 -#else
5030 -static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
5031 -{
5032 - num_processors++;
5033 -}
5034 -#endif /* CONFIG_XEN */
5035 -
5036 -static void __init MP_bus_info (struct mpc_config_bus *m)
5037 -{
5038 - char str[7];
5039 -
5040 - memcpy(str, m->mpc_bustype, 6);
5041 - str[6] = 0;
5042 - Dprintk("Bus #%d is %s\n", m->mpc_busid, str);
5043 -
5044 - if (strncmp(str, "ISA", 3) == 0) {
5045 - set_bit(m->mpc_busid, mp_bus_not_pci);
5046 - } else if (strncmp(str, "PCI", 3) == 0) {
5047 - clear_bit(m->mpc_busid, mp_bus_not_pci);
5048 - mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
5049 - mp_current_pci_id++;
5050 - } else {
5051 - printk(KERN_ERR "Unknown bustype %s\n", str);
5052 - }
5053 -}
5054 -
5055 -static int bad_ioapic(unsigned long address)
5056 -{
5057 - if (nr_ioapics >= MAX_IO_APICS) {
5058 - printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
5059 - "(found %d)\n", MAX_IO_APICS, nr_ioapics);
5060 - panic("Recompile kernel with bigger MAX_IO_APICS!\n");
5061 - }
5062 - if (!address) {
5063 - printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
5064 - " found in table, skipping!\n");
5065 - return 1;
5066 - }
5067 - return 0;
5068 -}
5069 -
5070 -static void __init MP_ioapic_info (struct mpc_config_ioapic *m)
5071 -{
5072 - if (!(m->mpc_flags & MPC_APIC_USABLE))
5073 - return;
5074 -
5075 - printk("I/O APIC #%d at 0x%X.\n",
5076 - m->mpc_apicid, m->mpc_apicaddr);
5077 -
5078 - if (bad_ioapic(m->mpc_apicaddr))
5079 - return;
5080 -
5081 - mp_ioapics[nr_ioapics] = *m;
5082 - nr_ioapics++;
5083 -}
5084 -
5085 -static void __init MP_intsrc_info (struct mpc_config_intsrc *m)
5086 -{
5087 - mp_irqs [mp_irq_entries] = *m;
5088 - Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
5089 - " IRQ %02x, APIC ID %x, APIC INT %02x\n",
5090 - m->mpc_irqtype, m->mpc_irqflag & 3,
5091 - (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
5092 - m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
5093 - if (++mp_irq_entries >= MAX_IRQ_SOURCES)
5094 - panic("Max # of irq sources exceeded!!\n");
5095 -}
5096 -
5097 -static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m)
5098 -{
5099 - Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
5100 - " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
5101 - m->mpc_irqtype, m->mpc_irqflag & 3,
5102 - (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid,
5103 - m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
5104 -}
5105 -
5106 -/*
5107 - * Read/parse the MPC
5108 - */
5109 -
5110 -static int __init smp_read_mpc(struct mp_config_table *mpc)
5111 -{
5112 - char str[16];
5113 - int count=sizeof(*mpc);
5114 - unsigned char *mpt=((unsigned char *)mpc)+count;
5115 -
5116 - if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) {
5117 - printk("MPTABLE: bad signature [%c%c%c%c]!\n",
5118 - mpc->mpc_signature[0],
5119 - mpc->mpc_signature[1],
5120 - mpc->mpc_signature[2],
5121 - mpc->mpc_signature[3]);
5122 - return 0;
5123 - }
5124 - if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) {
5125 - printk("MPTABLE: checksum error!\n");
5126 - return 0;
5127 - }
5128 - if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) {
5129 - printk(KERN_ERR "MPTABLE: bad table version (%d)!!\n",
5130 - mpc->mpc_spec);
5131 - return 0;
5132 - }
5133 - if (!mpc->mpc_lapic) {
5134 - printk(KERN_ERR "MPTABLE: null local APIC address!\n");
5135 - return 0;
5136 - }
5137 - memcpy(str,mpc->mpc_oem,8);
5138 - str[8] = 0;
5139 - printk(KERN_INFO "MPTABLE: OEM ID: %s ",str);
5140 -
5141 - memcpy(str,mpc->mpc_productid,12);
5142 - str[12] = 0;
5143 - printk("MPTABLE: Product ID: %s ",str);
5144 -
5145 - printk("MPTABLE: APIC at: 0x%X\n",mpc->mpc_lapic);
5146 -
5147 - /* save the local APIC address, it might be non-default */
5148 - if (!acpi_lapic)
5149 - mp_lapic_addr = mpc->mpc_lapic;
5150 -
5151 - /*
5152 - * Now process the configuration blocks.
5153 - */
5154 - while (count < mpc->mpc_length) {
5155 - switch(*mpt) {
5156 - case MP_PROCESSOR:
5157 - {
5158 - struct mpc_config_processor *m=
5159 - (struct mpc_config_processor *)mpt;
5160 - if (!acpi_lapic)
5161 - MP_processor_info(m);
5162 - mpt += sizeof(*m);
5163 - count += sizeof(*m);
5164 - break;
5165 - }
5166 - case MP_BUS:
5167 - {
5168 - struct mpc_config_bus *m=
5169 - (struct mpc_config_bus *)mpt;
5170 - MP_bus_info(m);
5171 - mpt += sizeof(*m);
5172 - count += sizeof(*m);
5173 - break;
5174 - }
5175 - case MP_IOAPIC:
5176 - {
5177 - struct mpc_config_ioapic *m=
5178 - (struct mpc_config_ioapic *)mpt;
5179 - MP_ioapic_info(m);
5180 - mpt += sizeof(*m);
5181 - count += sizeof(*m);
5182 - break;
5183 - }
5184 - case MP_INTSRC:
5185 - {
5186 - struct mpc_config_intsrc *m=
5187 - (struct mpc_config_intsrc *)mpt;
5188 -
5189 - MP_intsrc_info(m);
5190 - mpt += sizeof(*m);
5191 - count += sizeof(*m);
5192 - break;
5193 - }
5194 - case MP_LINTSRC:
5195 - {
5196 - struct mpc_config_lintsrc *m=
5197 - (struct mpc_config_lintsrc *)mpt;
5198 - MP_lintsrc_info(m);
5199 - mpt += sizeof(*m);
5200 - count += sizeof(*m);
5201 - break;
5202 - }
5203 - }
5204 - }
5205 - setup_apic_routing();
5206 - if (!num_processors)
5207 - printk(KERN_ERR "MPTABLE: no processors registered!\n");
5208 - return num_processors;
5209 -}
5210 -
5211 -static int __init ELCR_trigger(unsigned int irq)
5212 -{
5213 - unsigned int port;
5214 -
5215 - port = 0x4d0 + (irq >> 3);
5216 - return (inb(port) >> (irq & 7)) & 1;
5217 -}
5218 -
5219 -static void __init construct_default_ioirq_mptable(int mpc_default_type)
5220 -{
5221 - struct mpc_config_intsrc intsrc;
5222 - int i;
5223 - int ELCR_fallback = 0;
5224 -
5225 - intsrc.mpc_type = MP_INTSRC;
5226 - intsrc.mpc_irqflag = 0; /* conforming */
5227 - intsrc.mpc_srcbus = 0;
5228 - intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
5229 -
5230 - intsrc.mpc_irqtype = mp_INT;
5231 -
5232 - /*
5233 - * If true, we have an ISA/PCI system with no IRQ entries
5234 - * in the MP table. To prevent the PCI interrupts from being set up
5235 - * incorrectly, we try to use the ELCR. The sanity check to see if
5236 - * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can
5237 - * never be level sensitive, so we simply see if the ELCR agrees.
5238 - * If it does, we assume it's valid.
5239 - */
5240 - if (mpc_default_type == 5) {
5241 - printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n");
5242 -
5243 - if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13))
5244 - printk(KERN_ERR "ELCR contains invalid data... not using ELCR\n");
5245 - else {
5246 - printk(KERN_INFO "Using ELCR to identify PCI interrupts\n");
5247 - ELCR_fallback = 1;
5248 - }
5249 - }
5250 -
5251 - for (i = 0; i < 16; i++) {
5252 - switch (mpc_default_type) {
5253 - case 2:
5254 - if (i == 0 || i == 13)
5255 - continue; /* IRQ0 & IRQ13 not connected */
5256 - /* fall through */
5257 - default:
5258 - if (i == 2)
5259 - continue; /* IRQ2 is never connected */
5260 - }
5261 -
5262 - if (ELCR_fallback) {
5263 - /*
5264 - * If the ELCR indicates a level-sensitive interrupt, we
5265 - * copy that information over to the MP table in the
5266 - * irqflag field (level sensitive, active high polarity).
5267 - */
5268 - if (ELCR_trigger(i))
5269 - intsrc.mpc_irqflag = 13;
5270 - else
5271 - intsrc.mpc_irqflag = 0;
5272 - }
5273 -
5274 - intsrc.mpc_srcbusirq = i;
5275 - intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */
5276 - MP_intsrc_info(&intsrc);
5277 - }
5278 -
5279 - intsrc.mpc_irqtype = mp_ExtINT;
5280 - intsrc.mpc_srcbusirq = 0;
5281 - intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */
5282 - MP_intsrc_info(&intsrc);
5283 -}
5284 -
5285 -static inline void __init construct_default_ISA_mptable(int mpc_default_type)
5286 -{
5287 - struct mpc_config_processor processor;
5288 - struct mpc_config_bus bus;
5289 - struct mpc_config_ioapic ioapic;
5290 - struct mpc_config_lintsrc lintsrc;
5291 - int linttypes[2] = { mp_ExtINT, mp_NMI };
5292 - int i;
5293 -
5294 - /*
5295 - * local APIC has default address
5296 - */
5297 - mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
5298 -
5299 - /*
5300 - * 2 CPUs, numbered 0 & 1.
5301 - */
5302 - processor.mpc_type = MP_PROCESSOR;
5303 - processor.mpc_apicver = 0;
5304 - processor.mpc_cpuflag = CPU_ENABLED;
5305 - processor.mpc_cpufeature = 0;
5306 - processor.mpc_featureflag = 0;
5307 - processor.mpc_reserved[0] = 0;
5308 - processor.mpc_reserved[1] = 0;
5309 - for (i = 0; i < 2; i++) {
5310 - processor.mpc_apicid = i;
5311 - MP_processor_info(&processor);
5312 - }
5313 -
5314 - bus.mpc_type = MP_BUS;
5315 - bus.mpc_busid = 0;
5316 - switch (mpc_default_type) {
5317 - default:
5318 - printk(KERN_ERR "???\nUnknown standard configuration %d\n",
5319 - mpc_default_type);
5320 - /* fall through */
5321 - case 1:
5322 - case 5:
5323 - memcpy(bus.mpc_bustype, "ISA ", 6);
5324 - break;
5325 - }
5326 - MP_bus_info(&bus);
5327 - if (mpc_default_type > 4) {
5328 - bus.mpc_busid = 1;
5329 - memcpy(bus.mpc_bustype, "PCI ", 6);
5330 - MP_bus_info(&bus);
5331 - }
5332 -
5333 - ioapic.mpc_type = MP_IOAPIC;
5334 - ioapic.mpc_apicid = 2;
5335 - ioapic.mpc_apicver = 0;
5336 - ioapic.mpc_flags = MPC_APIC_USABLE;
5337 - ioapic.mpc_apicaddr = 0xFEC00000;
5338 - MP_ioapic_info(&ioapic);
5339 -
5340 - /*
5341 - * We set up most of the low 16 IO-APIC pins according to MPS rules.
5342 - */
5343 - construct_default_ioirq_mptable(mpc_default_type);
5344 -
5345 - lintsrc.mpc_type = MP_LINTSRC;
5346 - lintsrc.mpc_irqflag = 0; /* conforming */
5347 - lintsrc.mpc_srcbusid = 0;
5348 - lintsrc.mpc_srcbusirq = 0;
5349 - lintsrc.mpc_destapic = MP_APIC_ALL;
5350 - for (i = 0; i < 2; i++) {
5351 - lintsrc.mpc_irqtype = linttypes[i];
5352 - lintsrc.mpc_destapiclint = i;
5353 - MP_lintsrc_info(&lintsrc);
5354 - }
5355 -}
5356 -
5357 -static struct intel_mp_floating *mpf_found;
5358 -
5359 -/*
5360 - * Scan the memory blocks for an SMP configuration block.
5361 - */
5362 -void __init get_smp_config (void)
5363 -{
5364 - struct intel_mp_floating *mpf = mpf_found;
5365 -
5366 - /*
5367 - * ACPI supports both logical (e.g. Hyper-Threading) and physical
5368 - * processors, where MPS only supports physical.
5369 - */
5370 - if (acpi_lapic && acpi_ioapic) {
5371 - printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n");
5372 - return;
5373 - }
5374 - else if (acpi_lapic)
5375 - printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n");
5376 -
5377 - printk("Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification);
5378 -
5379 - /*
5380 - * Now see if we need to read further.
5381 - */
5382 - if (mpf->mpf_feature1 != 0) {
5383 -
5384 - printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1);
5385 - construct_default_ISA_mptable(mpf->mpf_feature1);
5386 -
5387 - } else if (mpf->mpf_physptr) {
5388 -
5389 - /*
5390 - * Read the physical hardware table. Anything here will
5391 - * override the defaults.
5392 - */
5393 - if (!smp_read_mpc(isa_bus_to_virt(mpf->mpf_physptr))) {
5394 - smp_found_config = 0;
5395 - printk(KERN_ERR "BIOS bug, MP table errors detected!...\n");
5396 - printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n");
5397 - return;
5398 - }
5399 - /*
5400 - * If there are no explicit MP IRQ entries, then we are
5401 - * broken. We set up most of the low 16 IO-APIC pins to
5402 - * ISA defaults and hope it will work.
5403 - */
5404 - if (!mp_irq_entries) {
5405 - struct mpc_config_bus bus;
5406 -
5407 - printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n");
5408 -
5409 - bus.mpc_type = MP_BUS;
5410 - bus.mpc_busid = 0;
5411 - memcpy(bus.mpc_bustype, "ISA ", 6);
5412 - MP_bus_info(&bus);
5413 -
5414 - construct_default_ioirq_mptable(0);
5415 - }
5416 -
5417 - } else
5418 - BUG();
5419 -
5420 - printk(KERN_INFO "Processors: %d\n", num_processors);
5421 - /*
5422 - * Only use the first configuration found.
5423 - */
5424 -}
5425 -
5426 -static int __init smp_scan_config (unsigned long base, unsigned long length)
5427 -{
5428 - extern void __bad_mpf_size(void);
5429 - unsigned int *bp = isa_bus_to_virt(base);
5430 - struct intel_mp_floating *mpf;
5431 -
5432 - Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length);
5433 - if (sizeof(*mpf) != 16)
5434 - __bad_mpf_size();
5435 -
5436 - while (length > 0) {
5437 - mpf = (struct intel_mp_floating *)bp;
5438 - if ((*bp == SMP_MAGIC_IDENT) &&
5439 - (mpf->mpf_length == 1) &&
5440 - !mpf_checksum((unsigned char *)bp, 16) &&
5441 - ((mpf->mpf_specification == 1)
5442 - || (mpf->mpf_specification == 4)) ) {
5443 -
5444 - smp_found_config = 1;
5445 - mpf_found = mpf;
5446 - return 1;
5447 - }
5448 - bp += 4;
5449 - length -= 16;
5450 - }
5451 - return 0;
5452 -}
5453 -
5454 -void __init find_smp_config(void)
5455 -{
5456 - unsigned int address;
5457 -
5458 - /*
5459 - * FIXME: Linux assumes you have 640K of base ram..
5460 - * this continues the error...
5461 - *
5462 - * 1) Scan the bottom 1K for a signature
5463 - * 2) Scan the top 1K of base RAM
5464 - * 3) Scan the 64K of bios
5465 - */
5466 - if (smp_scan_config(0x0,0x400) ||
5467 - smp_scan_config(639*0x400,0x400) ||
5468 - smp_scan_config(0xF0000,0x10000))
5469 - return;
5470 - /*
5471 - * If it is an SMP machine we should know now.
5472 - *
5473 - * there is a real-mode segmented pointer pointing to the
5474 - * 4K EBDA area at 0x40E, calculate and scan it here.
5475 - *
5476 - * NOTE! There are Linux loaders that will corrupt the EBDA
5477 - * area, and as such this kind of SMP config may be less
5478 - * trustworthy, simply because the SMP table may have been
5479 - * stomped on during early boot. These loaders are buggy and
5480 - * should be fixed.
5481 - */
5482 -
5483 - address = *(unsigned short *)phys_to_virt(0x40E);
5484 - address <<= 4;
5485 - if (smp_scan_config(address, 0x1000))
5486 - return;
5487 -
5488 - /* If we have come this far, we did not find an MP table */
5489 - printk(KERN_INFO "No mptable found.\n");
5490 -}
5491 -
5492 -/* --------------------------------------------------------------------------
5493 - ACPI-based MP Configuration
5494 - -------------------------------------------------------------------------- */
5495 -
5496 -#ifdef CONFIG_ACPI
5497 -
5498 -void __init mp_register_lapic_address(u64 address)
5499 -{
5500 -#ifndef CONFIG_XEN
5501 - mp_lapic_addr = (unsigned long) address;
5502 - set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
5503 - if (boot_cpu_id == -1U)
5504 - boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID));
5505 -#endif
5506 -}
5507 -
5508 -void __cpuinit mp_register_lapic (u8 id, u8 enabled)
5509 -{
5510 - struct mpc_config_processor processor;
5511 - int boot_cpu = 0;
5512 -
5513 - if (id == boot_cpu_id)
5514 - boot_cpu = 1;
5515 -
5516 -#ifndef CONFIG_XEN
5517 - processor.mpc_type = MP_PROCESSOR;
5518 - processor.mpc_apicid = id;
5519 - processor.mpc_apicver = 0;
5520 - processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0);
5521 - processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0);
5522 - processor.mpc_cpufeature = 0;
5523 - processor.mpc_featureflag = 0;
5524 - processor.mpc_reserved[0] = 0;
5525 - processor.mpc_reserved[1] = 0;
5526 -#endif
5527 -
5528 - MP_processor_info(&processor);
5529 -}
5530 -
5531 -#define MP_ISA_BUS 0
5532 -#define MP_MAX_IOAPIC_PIN 127
5533 -
5534 -static struct mp_ioapic_routing {
5535 - int apic_id;
5536 - int gsi_start;
5537 - int gsi_end;
5538 - u32 pin_programmed[4];
5539 -} mp_ioapic_routing[MAX_IO_APICS];
5540 -
5541 -static int mp_find_ioapic(int gsi)
5542 -{
5543 - int i = 0;
5544 -
5545 - /* Find the IOAPIC that manages this GSI. */
5546 - for (i = 0; i < nr_ioapics; i++) {
5547 - if ((gsi >= mp_ioapic_routing[i].gsi_start)
5548 - && (gsi <= mp_ioapic_routing[i].gsi_end))
5549 - return i;
5550 - }
5551 -
5552 - printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
5553 - return -1;
5554 -}
5555 -
5556 -static u8 uniq_ioapic_id(u8 id)
5557 -{
5558 - int i;
5559 - DECLARE_BITMAP(used, 256);
5560 - bitmap_zero(used, 256);
5561 - for (i = 0; i < nr_ioapics; i++) {
5562 - struct mpc_config_ioapic *ia = &mp_ioapics[i];
5563 - __set_bit(ia->mpc_apicid, used);
5564 - }
5565 - if (!test_bit(id, used))
5566 - return id;
5567 - return find_first_zero_bit(used, 256);
5568 -}
5569 -
5570 -void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base)
5571 -{
5572 - int idx = 0;
5573 -
5574 - if (bad_ioapic(address))
5575 - return;
5576 -
5577 - idx = nr_ioapics;
5578 -
5579 - mp_ioapics[idx].mpc_type = MP_IOAPIC;
5580 - mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
5581 - mp_ioapics[idx].mpc_apicaddr = address;
5582 -
5583 -#ifndef CONFIG_XEN
5584 - set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
5585 -#endif
5586 - mp_ioapics[idx].mpc_apicid = uniq_ioapic_id(id);
5587 - mp_ioapics[idx].mpc_apicver = 0;
5588 -
5589 - /*
5590 - * Build basic IRQ lookup table to facilitate gsi->io_apic lookups
5591 - * and to prevent reprogramming of IOAPIC pins (PCI IRQs).
5592 - */
5593 - mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
5594 - mp_ioapic_routing[idx].gsi_start = gsi_base;
5595 - mp_ioapic_routing[idx].gsi_end = gsi_base +
5596 - io_apic_get_redir_entries(idx);
5597 -
5598 - printk(KERN_INFO "IOAPIC[%d]: apic_id %d, address 0x%x, "
5599 - "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
5600 - mp_ioapics[idx].mpc_apicaddr,
5601 - mp_ioapic_routing[idx].gsi_start,
5602 - mp_ioapic_routing[idx].gsi_end);
5603 -
5604 - nr_ioapics++;
5605 -}
5606 -
5607 -void __init
5608 -mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
5609 -{
5610 - struct mpc_config_intsrc intsrc;
5611 - int ioapic = -1;
5612 - int pin = -1;
5613 -
5614 - /*
5615 - * Convert 'gsi' to 'ioapic.pin'.
5616 - */
5617 - ioapic = mp_find_ioapic(gsi);
5618 - if (ioapic < 0)
5619 - return;
5620 - pin = gsi - mp_ioapic_routing[ioapic].gsi_start;
5621 -
5622 - /*
5623 - * TBD: This check is for faulty timer entries, where the override
5624 - * erroneously sets the trigger to level, resulting in a HUGE
5625 - * increase of timer interrupts!
5626 - */
5627 - if ((bus_irq == 0) && (trigger == 3))
5628 - trigger = 1;
5629 -
5630 - intsrc.mpc_type = MP_INTSRC;
5631 - intsrc.mpc_irqtype = mp_INT;
5632 - intsrc.mpc_irqflag = (trigger << 2) | polarity;
5633 - intsrc.mpc_srcbus = MP_ISA_BUS;
5634 - intsrc.mpc_srcbusirq = bus_irq; /* IRQ */
5635 - intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */
5636 - intsrc.mpc_dstirq = pin; /* INTIN# */
5637 -
5638 - Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n",
5639 - intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
5640 - (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
5641 - intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq);
5642 -
5643 - mp_irqs[mp_irq_entries] = intsrc;
5644 - if (++mp_irq_entries == MAX_IRQ_SOURCES)
5645 - panic("Max # of irq sources exceeded!\n");
5646 -}
5647 -
5648 -void __init mp_config_acpi_legacy_irqs(void)
5649 -{
5650 - struct mpc_config_intsrc intsrc;
5651 - int i = 0;
5652 - int ioapic = -1;
5653 -
5654 - /*
5655 - * Fabricate the legacy ISA bus (bus #31).
5656 - */
5657 - set_bit(MP_ISA_BUS, mp_bus_not_pci);
5658 -
5659 - /*
5660 - * Locate the IOAPIC that manages the ISA IRQs (0-15).
5661 - */
5662 - ioapic = mp_find_ioapic(0);
5663 - if (ioapic < 0)
5664 - return;
5665 -
5666 - intsrc.mpc_type = MP_INTSRC;
5667 - intsrc.mpc_irqflag = 0; /* Conforming */
5668 - intsrc.mpc_srcbus = MP_ISA_BUS;
5669 - intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
5670 -
5671 - /*
5672 - * Use the default configuration for the IRQs 0-15. Unless
5673 - * overridden by (MADT) interrupt source override entries.
5674 - */
5675 - for (i = 0; i < 16; i++) {
5676 - int idx;
5677 -
5678 - for (idx = 0; idx < mp_irq_entries; idx++) {
5679 - struct mpc_config_intsrc *irq = mp_irqs + idx;
5680 -
5681 - /* Do we already have a mapping for this ISA IRQ? */
5682 - if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i)
5683 - break;
5684 -
5685 - /* Do we already have a mapping for this IOAPIC pin */
5686 - if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
5687 - (irq->mpc_dstirq == i))
5688 - break;
5689 - }
5690 -
5691 - if (idx != mp_irq_entries) {
5692 - printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
5693 - continue; /* IRQ already used */
5694 - }
5695 -
5696 - intsrc.mpc_irqtype = mp_INT;
5697 - intsrc.mpc_srcbusirq = i; /* Identity mapped */
5698 - intsrc.mpc_dstirq = i;
5699 -
5700 - Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, "
5701 - "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
5702 - (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
5703 - intsrc.mpc_srcbusirq, intsrc.mpc_dstapic,
5704 - intsrc.mpc_dstirq);
5705 -
5706 - mp_irqs[mp_irq_entries] = intsrc;
5707 - if (++mp_irq_entries == MAX_IRQ_SOURCES)
5708 - panic("Max # of irq sources exceeded!\n");
5709 - }
5710 -}
5711 -
5712 -int mp_register_gsi(u32 gsi, int triggering, int polarity)
5713 -{
5714 - int ioapic = -1;
5715 - int ioapic_pin = 0;
5716 - int idx, bit = 0;
5717 -
5718 - if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
5719 - return gsi;
5720 -
5721 - /* Don't set up the ACPI SCI because it's already set up */
5722 - if (acpi_gbl_FADT.sci_interrupt == gsi)
5723 - return gsi;
5724 -
5725 - ioapic = mp_find_ioapic(gsi);
5726 - if (ioapic < 0) {
5727 - printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
5728 - return gsi;
5729 - }
5730 -
5731 - ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_start;
5732 -
5733 - /*
5734 - * Avoid pin reprogramming. PRTs typically include entries
5735 - * with redundant pin->gsi mappings (but unique PCI devices);
5736 - * we only program the IOAPIC on the first.
5737 - */
5738 - bit = ioapic_pin % 32;
5739 - idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32);
5740 - if (idx > 3) {
5741 - printk(KERN_ERR "Invalid reference to IOAPIC pin "
5742 - "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
5743 - ioapic_pin);
5744 - return gsi;
5745 - }
5746 - if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
5747 - Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
5748 - mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
5749 - return gsi;
5750 - }
5751 -
5752 - mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
5753 -
5754 - io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
5755 - triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
5756 - polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
5757 - return gsi;
5758 -}
5759 -#endif /*CONFIG_ACPI*/
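The ELCR probe dropped above reads I/O ports 0x4d0/0x4d1 as a 16-bit mask, one bit per ISA IRQ, where a set bit means level triggered; IRQ0/1/2/13 can never legitimately be level, which is the sanity check used before trusting the data. A minimal stand-alone C sketch of that bit lookup (not kernel code; the inb() reads are replaced by a caller-supplied snapshot, and the sample values are hypothetical):

#include <stdio.h>

/* ELCR: port 0x4d0 covers IRQ0-7, port 0x4d1 covers IRQ8-15;
 * a set bit means the IRQ is level sensitive. */
static int elcr_trigger(unsigned int irq, const unsigned char elcr[2])
{
	return (elcr[irq >> 3] >> (irq & 7)) & 1;
}

int main(void)
{
	/* hypothetical snapshot: IRQ9 and IRQ11 level triggered */
	const unsigned char elcr[2] = { 0x00, 0x0a };
	unsigned int irq;

	for (irq = 0; irq < 16; irq++)
		printf("IRQ%-2u %s\n", irq,
		       elcr_trigger(irq, elcr) ? "level" : "edge");
	return 0;
}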
5760 --- sle11-2009-10-16.orig/arch/x86/kernel/pci-dma-xen.c 2009-03-16 16:33:40.000000000 +0100
5761 +++ sle11-2009-10-16/arch/x86/kernel/pci-dma-xen.c 2009-10-22 11:31:59.000000000 +0200
5762 @@ -1,283 +1,251 @@
5763 -/*
5764 - * Dynamic DMA mapping support.
5765 - *
5766 - * On i386 there is no hardware dynamic DMA address translation,
5767 - * so consistent alloc/free are merely page allocation/freeing.
5768 - * The rest of the dynamic DMA mapping interface is implemented
5769 - * in asm/pci.h.
5770 - */
5771 -
5772 -#include <linux/types.h>
5773 -#include <linux/mm.h>
5774 -#include <linux/string.h>
5775 +#include <linux/dma-mapping.h>
5776 +#include <linux/dmar.h>
5777 +#include <linux/bootmem.h>
5778 #include <linux/pci.h>
5779 -#include <linux/module.h>
5780 -#include <linux/version.h>
5781 -#include <asm/io.h>
5782 -#include <xen/balloon.h>
5783 -#include <xen/gnttab.h>
5784 -#include <asm/swiotlb.h>
5785 -#include <asm/tlbflush.h>
5786 -#include <asm/swiotlb_32.h>
5787 -#include <asm/gnttab_dma.h>
5788 -#include <asm/bug.h>
5789
5790 -#ifdef __x86_64__
5791 -#include <asm/iommu.h>
5792 +#include <asm/proto.h>
5793 +#include <asm/dma.h>
5794 +#include <asm/gart.h>
5795 +#include <asm/calgary.h>
5796 +
5797 +int forbid_dac __read_mostly;
5798 +EXPORT_SYMBOL(forbid_dac);
5799 +
5800 +const struct dma_mapping_ops *dma_ops;
5801 +EXPORT_SYMBOL(dma_ops);
5802 +
5803 +static int iommu_sac_force __read_mostly;
5804 +
5805 +#ifdef CONFIG_IOMMU_DEBUG
5806 +int panic_on_overflow __read_mostly = 1;
5807 +int force_iommu __read_mostly = 1;
5808 +#else
5809 +int panic_on_overflow __read_mostly = 0;
5810 +int force_iommu __read_mostly = 0;
5811 +#endif
5812
5813 int iommu_merge __read_mostly = 0;
5814 -EXPORT_SYMBOL(iommu_merge);
5815
5816 -dma_addr_t bad_dma_address __read_mostly;
5817 -EXPORT_SYMBOL(bad_dma_address);
5818 +int no_iommu __read_mostly;
5819 +/* Set this to 1 if there is a HW IOMMU in the system */
5820 +int iommu_detected __read_mostly = 0;
5821
5822 /* This tells the BIO block layer to assume merging. Default to off
5823 because we cannot guarantee merging later. */
5824 int iommu_bio_merge __read_mostly = 0;
5825 EXPORT_SYMBOL(iommu_bio_merge);
5826
5827 -int force_iommu __read_mostly= 0;
5828 +dma_addr_t bad_dma_address __read_mostly = 0;
5829 +EXPORT_SYMBOL(bad_dma_address);
5830
5831 -__init int iommu_setup(char *p)
5832 -{
5833 - return 1;
5834 -}
5835 +/* Dummy device used for NULL arguments (normally ISA). Better would
5836 + be probably a smaller DMA mask, but this is bug-to-bug compatible
5837 + to older i386. */
5838 +struct device fallback_dev = {
5839 + .bus_id = "fallback device",
5840 + .coherent_dma_mask = DMA_32BIT_MASK,
5841 + .dma_mask = &fallback_dev.coherent_dma_mask,
5842 +};
5843
5844 -void __init pci_iommu_alloc(void)
5845 +int dma_set_mask(struct device *dev, u64 mask)
5846 {
5847 -#ifdef CONFIG_SWIOTLB
5848 - pci_swiotlb_init();
5849 -#endif
5850 -}
5851 + if (!dev->dma_mask || !dma_supported(dev, mask))
5852 + return -EIO;
5853 +
5854 + *dev->dma_mask = mask;
5855
5856 -static int __init pci_iommu_init(void)
5857 -{
5858 - no_iommu_init();
5859 return 0;
5860 }
5861 +EXPORT_SYMBOL(dma_set_mask);
5862
5863 -/* Must execute after PCI subsystem */
5864 -fs_initcall(pci_iommu_init);
5865 -#endif
5866 -
5867 -struct dma_coherent_mem {
5868 - void *virt_base;
5869 - u32 device_base;
5870 - int size;
5871 - int flags;
5872 - unsigned long *bitmap;
5873 -};
5874 -
5875 -#define IOMMU_BUG_ON(test) \
5876 -do { \
5877 - if (unlikely(test)) { \
5878 - printk(KERN_ALERT "Fatal DMA error! " \
5879 - "Please use 'swiotlb=force'\n"); \
5880 - BUG(); \
5881 - } \
5882 -} while (0)
5883 +#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
5884 +static __initdata void *dma32_bootmem_ptr;
5885 +static unsigned long dma32_bootmem_size __initdata = (128ULL<<20);
5886
5887 -static int check_pages_physically_contiguous(unsigned long pfn,
5888 - unsigned int offset,
5889 - size_t length)
5890 +static int __init parse_dma32_size_opt(char *p)
5891 {
5892 - unsigned long next_mfn;
5893 - int i;
5894 - int nr_pages;
5895 -
5896 - next_mfn = pfn_to_mfn(pfn);
5897 - nr_pages = (offset + length + PAGE_SIZE-1) >> PAGE_SHIFT;
5898 -
5899 - for (i = 1; i < nr_pages; i++) {
5900 - if (pfn_to_mfn(++pfn) != ++next_mfn)
5901 - return 0;
5902 - }
5903 - return 1;
5904 + if (!p)
5905 + return -EINVAL;
5906 + dma32_bootmem_size = memparse(p, &p);
5907 + return 0;
5908 }
5909 +early_param("dma32_size", parse_dma32_size_opt);
5910
5911 -int range_straddles_page_boundary(paddr_t p, size_t size)
5912 +void __init dma32_reserve_bootmem(void)
5913 {
5914 - unsigned long pfn = p >> PAGE_SHIFT;
5915 - unsigned int offset = p & ~PAGE_MASK;
5916 + unsigned long size, align;
5917 + if (end_pfn <= MAX_DMA32_PFN)
5918 + return;
5919
5920 - return ((offset + size > PAGE_SIZE) &&
5921 - !check_pages_physically_contiguous(pfn, offset, size));
5922 + align = 64ULL<<20;
5923 + size = round_up(dma32_bootmem_size, align);
5924 + dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align,
5925 + __pa(MAX_DMA_ADDRESS));
5926 + if (dma32_bootmem_ptr)
5927 + dma32_bootmem_size = size;
5928 + else
5929 + dma32_bootmem_size = 0;
5930 }
5931 -
5932 -int
5933 -dma_map_sg(struct device *hwdev, struct scatterlist *sgl, int nents,
5934 - enum dma_data_direction direction)
5935 +static void __init dma32_free_bootmem(void)
5936 {
5937 - int i, rc;
5938 + int node;
5939 +
5940 + if (end_pfn <= MAX_DMA32_PFN)
5941 + return;
5942
5943 - BUG_ON(!valid_dma_direction(direction));
5944 - WARN_ON(nents == 0 || sgl->length == 0);
5945 + if (!dma32_bootmem_ptr)
5946 + return;
5947
5948 - if (swiotlb) {
5949 - rc = swiotlb_map_sg(hwdev, sgl, nents, direction);
5950 - } else {
5951 - struct scatterlist *sg;
5952 -
5953 - for_each_sg(sgl, sg, nents, i) {
5954 - BUG_ON(!sg_page(sg));
5955 - sg->dma_address =
5956 - gnttab_dma_map_page(sg_page(sg)) + sg->offset;
5957 - sg->dma_length = sg->length;
5958 - IOMMU_BUG_ON(address_needs_mapping(
5959 - hwdev, sg->dma_address));
5960 - IOMMU_BUG_ON(range_straddles_page_boundary(
5961 - page_to_pseudophys(sg_page(sg)) + sg->offset,
5962 - sg->length));
5963 - }
5964 - rc = nents;
5965 - }
5966 + for_each_online_node(node)
5967 + free_bootmem_node(NODE_DATA(node), __pa(dma32_bootmem_ptr),
5968 + dma32_bootmem_size);
5969
5970 - flush_write_buffers();
5971 - return rc;
5972 + dma32_bootmem_ptr = NULL;
5973 + dma32_bootmem_size = 0;
5974 }
5975 -EXPORT_SYMBOL(dma_map_sg);
5976 +#else
5977 +#define dma32_free_bootmem() ((void)0)
5978 +#endif
5979
5980 -void
5981 -dma_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nents,
5982 - enum dma_data_direction direction)
5983 -{
5984 - int i;
5985 +static const struct dma_mapping_ops swiotlb_dma_ops = {
5986 + .mapping_error = swiotlb_dma_mapping_error,
5987 + .map_single = swiotlb_map_single_phys,
5988 + .unmap_single = swiotlb_unmap_single,
5989 + .sync_single_for_cpu = swiotlb_sync_single_for_cpu,
5990 + .sync_single_for_device = swiotlb_sync_single_for_device,
5991 + .sync_single_range_for_cpu = swiotlb_sync_single_range_for_cpu,
5992 + .sync_single_range_for_device = swiotlb_sync_single_range_for_device,
5993 + .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu,
5994 + .sync_sg_for_device = swiotlb_sync_sg_for_device,
5995 + .map_sg = swiotlb_map_sg,
5996 + .unmap_sg = swiotlb_unmap_sg,
5997 + .dma_supported = swiotlb_dma_supported
5998 +};
5999
6000 - BUG_ON(!valid_dma_direction(direction));
6001 - if (swiotlb)
6002 - swiotlb_unmap_sg(hwdev, sgl, nents, direction);
6003 - else {
6004 - struct scatterlist *sg;
6005 +void __init pci_iommu_alloc(void)
6006 +{
6007 + /* free the range so iommu could get some range less than 4G */
6008 + dma32_free_bootmem();
6009 + /*
6010 + * The order of these functions is important for
6011 + * fall-back/fail-over reasons
6012 + */
6013 +#ifdef CONFIG_GART_IOMMU
6014 + gart_iommu_hole_init();
6015 +#endif
6016
6017 - for_each_sg(sgl, sg, nents, i)
6018 - gnttab_dma_unmap_page(sg->dma_address);
6019 - }
6020 -}
6021 -EXPORT_SYMBOL(dma_unmap_sg);
6022 +#ifdef CONFIG_CALGARY_IOMMU
6023 + detect_calgary();
6024 +#endif
6025
6026 -#ifdef CONFIG_HIGHMEM
6027 -dma_addr_t
6028 -dma_map_page(struct device *dev, struct page *page, unsigned long offset,
6029 - size_t size, enum dma_data_direction direction)
6030 -{
6031 - dma_addr_t dma_addr;
6032 + detect_intel_iommu();
6033
6034 - BUG_ON(!valid_dma_direction(direction));
6035 +#ifdef CONFIG_SWIOTLB
6036 + swiotlb_init();
6037 if (swiotlb) {
6038 - dma_addr = swiotlb_map_page(
6039 - dev, page, offset, size, direction);
6040 - } else {
6041 - dma_addr = gnttab_dma_map_page(page) + offset;
6042 - IOMMU_BUG_ON(address_needs_mapping(dev, dma_addr));
6043 + printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n");
6044 + dma_ops = &swiotlb_dma_ops;
6045 }
6046 -
6047 - return dma_addr;
6048 +#endif
6049 }
6050 -EXPORT_SYMBOL(dma_map_page);
6051
6052 -void
6053 -dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size,
6054 - enum dma_data_direction direction)
6055 +/*
6056 + * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter
6057 + * documentation.
6058 + */
6059 +static __init int iommu_setup(char *p)
6060 {
6061 - BUG_ON(!valid_dma_direction(direction));
6062 - if (swiotlb)
6063 - swiotlb_unmap_page(dev, dma_address, size, direction);
6064 - else
6065 - gnttab_dma_unmap_page(dma_address);
6066 -}
6067 -EXPORT_SYMBOL(dma_unmap_page);
6068 -#endif /* CONFIG_HIGHMEM */
6069 + iommu_merge = 1;
6070
6071 -int
6072 -dma_mapping_error(dma_addr_t dma_addr)
6073 -{
6074 - if (swiotlb)
6075 - return swiotlb_dma_mapping_error(dma_addr);
6076 - return 0;
6077 -}
6078 -EXPORT_SYMBOL(dma_mapping_error);
6079 + if (!p)
6080 + return -EINVAL;
6081
6082 -int
6083 -dma_supported(struct device *dev, u64 mask)
6084 -{
6085 - if (swiotlb)
6086 - return swiotlb_dma_supported(dev, mask);
6087 - /*
6088 - * By default we'll BUG when an infeasible DMA is requested, and
6089 - * request swiotlb=force (see IOMMU_BUG_ON).
6090 - */
6091 - return 1;
6092 -}
6093 -EXPORT_SYMBOL(dma_supported);
6094 + while (*p) {
6095 + if (!strncmp(p, "off", 3))
6096 + no_iommu = 1;
6097 + /* gart_parse_options has more force support */
6098 + if (!strncmp(p, "force", 5))
6099 + force_iommu = 1;
6100 + if (!strncmp(p, "noforce", 7)) {
6101 + iommu_merge = 0;
6102 + force_iommu = 0;
6103 + }
6104
6105 -void *dma_alloc_coherent(struct device *dev, size_t size,
6106 - dma_addr_t *dma_handle, gfp_t gfp)
6107 -{
6108 - void *ret;
6109 - struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
6110 - unsigned int order = get_order(size);
6111 - unsigned long vstart;
6112 - u64 mask;
6113 + if (!strncmp(p, "biomerge", 8)) {
6114 + iommu_bio_merge = 4096;
6115 + iommu_merge = 1;
6116 + force_iommu = 1;
6117 + }
6118 + if (!strncmp(p, "panic", 5))
6119 + panic_on_overflow = 1;
6120 + if (!strncmp(p, "nopanic", 7))
6121 + panic_on_overflow = 0;
6122 + if (!strncmp(p, "merge", 5)) {
6123 + iommu_merge = 1;
6124 + force_iommu = 1;
6125 + }
6126 + if (!strncmp(p, "nomerge", 7))
6127 + iommu_merge = 0;
6128 + if (!strncmp(p, "forcesac", 8))
6129 + iommu_sac_force = 1;
6130 + if (!strncmp(p, "allowdac", 8))
6131 + forbid_dac = 0;
6132 + if (!strncmp(p, "nodac", 5))
6133 + forbid_dac = -1;
6134 + if (!strncmp(p, "usedac", 6)) {
6135 + forbid_dac = -1;
6136 + return 1;
6137 + }
6138 +#ifdef CONFIG_SWIOTLB
6139 + if (!strncmp(p, "soft", 4))
6140 + swiotlb = 1;
6141 +#endif
6142
6143 - /* ignore region specifiers */
6144 - gfp &= ~(__GFP_DMA | __GFP_HIGHMEM);
6145 +#ifdef CONFIG_GART_IOMMU
6146 + gart_parse_options(p);
6147 +#endif
6148
6149 - if (mem) {
6150 - int page = bitmap_find_free_region(mem->bitmap, mem->size,
6151 - order);
6152 - if (page >= 0) {
6153 - *dma_handle = mem->device_base + (page << PAGE_SHIFT);
6154 - ret = mem->virt_base + (page << PAGE_SHIFT);
6155 - memset(ret, 0, size);
6156 - return ret;
6157 - }
6158 - if (mem->flags & DMA_MEMORY_EXCLUSIVE)
6159 - return NULL;
6160 +#ifdef CONFIG_CALGARY_IOMMU
6161 + if (!strncmp(p, "calgary", 7))
6162 + use_calgary = 1;
6163 +#endif /* CONFIG_CALGARY_IOMMU */
6164 +
6165 + p += strcspn(p, ",");
6166 + if (*p == ',')
6167 + ++p;
6168 }
6169 + return 0;
6170 +}
6171 +early_param("iommu", iommu_setup);
6172
6173 - if (dev == NULL || (dev->coherent_dma_mask < 0xffffffff))
6174 - gfp |= GFP_DMA;
6175 -
6176 - vstart = __get_free_pages(gfp, order);
6177 - ret = (void *)vstart;
6178 +static int check_pages_physically_contiguous(unsigned long pfn,
6179 + unsigned int offset,
6180 + size_t length)
6181 +{
6182 + unsigned long next_mfn;
6183 + int i;
6184 + int nr_pages;
6185
6186 - if (dev != NULL && dev->coherent_dma_mask)
6187 - mask = dev->coherent_dma_mask;
6188 - else
6189 - mask = 0xffffffff;
6190 + next_mfn = pfn_to_mfn(pfn);
6191 + nr_pages = (offset + length + PAGE_SIZE-1) >> PAGE_SHIFT;
6192
6193 - if (ret != NULL) {
6194 - if (xen_create_contiguous_region(vstart, order,
6195 - fls64(mask)) != 0) {
6196 - free_pages(vstart, order);
6197 - return NULL;
6198 - }
6199 - memset(ret, 0, size);
6200 - *dma_handle = virt_to_bus(ret);
6201 + for (i = 1; i < nr_pages; i++) {
6202 + if (pfn_to_mfn(++pfn) != ++next_mfn)
6203 + return 0;
6204 }
6205 - return ret;
6206 + return 1;
6207 }
6208 -EXPORT_SYMBOL(dma_alloc_coherent);
6209
6210 -void dma_free_coherent(struct device *dev, size_t size,
6211 - void *vaddr, dma_addr_t dma_handle)
6212 +int range_straddles_page_boundary(paddr_t p, size_t size)
6213 {
6214 - struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
6215 - int order = get_order(size);
6216 -
6217 - WARN_ON(irqs_disabled()); /* for portability */
6218 - if (mem && vaddr >= mem->virt_base && vaddr < (mem->virt_base + (mem->size << PAGE_SHIFT))) {
6219 - int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
6220 + unsigned long pfn = p >> PAGE_SHIFT;
6221 + unsigned int offset = p & ~PAGE_MASK;
6222
6223 - bitmap_release_region(mem->bitmap, page, order);
6224 - } else {
6225 - xen_destroy_contiguous_region((unsigned long)vaddr, order);
6226 - free_pages((unsigned long)vaddr, order);
6227 - }
6228 + return ((offset + size > PAGE_SIZE) &&
6229 + !check_pages_physically_contiguous(pfn, offset, size));
6230 }
6231 -EXPORT_SYMBOL(dma_free_coherent);
6232
6233 -#ifdef ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY
6234 +#ifdef CONFIG_X86_32
6235 int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
6236 dma_addr_t device_addr, size_t size, int flags)
6237 {
6238 @@ -327,8 +295,8 @@ EXPORT_SYMBOL(dma_declare_coherent_memor
6239 void dma_release_declared_memory(struct device *dev)
6240 {
6241 struct dma_coherent_mem *mem = dev->dma_mem;
6242 -
6243 - if(!mem)
6244 +
6245 + if (!mem)
6246 return;
6247 dev->dma_mem = NULL;
6248 iounmap(mem->virt_base);
6249 @@ -341,8 +309,10 @@ void *dma_mark_declared_memory_occupied(
6250 dma_addr_t device_addr, size_t size)
6251 {
6252 struct dma_coherent_mem *mem = dev->dma_mem;
6253 - int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1) >> PAGE_SHIFT;
6254 int pos, err;
6255 + int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1);
6256 +
6257 + pages >>= PAGE_SHIFT;
6258
6259 if (!mem)
6260 return ERR_PTR(-EINVAL);
6261 @@ -354,103 +324,270 @@ void *dma_mark_declared_memory_occupied(
6262 return mem->virt_base + (pos << PAGE_SHIFT);
6263 }
6264 EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
6265 -#endif /* ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY */
6266 -
6267 -#if defined(CONFIG_PCI) && !defined(CONFIG_XEN)
6268 -/* Many VIA bridges seem to corrupt data for DAC. Disable it here */
6269
6270 -int forbid_dac;
6271 -EXPORT_SYMBOL(forbid_dac);
6272 -
6273 -static __devinit void via_no_dac(struct pci_dev *dev)
6274 +static int dma_alloc_from_coherent_mem(struct device *dev, ssize_t size,
6275 + dma_addr_t *dma_handle, void **ret)
6276 {
6277 - if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) {
6278 - printk(KERN_INFO "PCI: VIA PCI bridge detected. Disabling DAC.\n");
6279 - forbid_dac = 1;
6280 + struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
6281 + int order = get_order(size);
6282 +
6283 + if (mem) {
6284 + int page = bitmap_find_free_region(mem->bitmap, mem->size,
6285 + order);
6286 + if (page >= 0) {
6287 + *dma_handle = mem->device_base + (page << PAGE_SHIFT);
6288 + *ret = mem->virt_base + (page << PAGE_SHIFT);
6289 + memset(*ret, 0, size);
6290 + }
6291 + if (mem->flags & DMA_MEMORY_EXCLUSIVE)
6292 + *ret = NULL;
6293 }
6294 + return (mem != NULL);
6295 }
6296 -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac);
6297
6298 -static int check_iommu(char *s)
6299 +static int dma_release_coherent(struct device *dev, int order, void *vaddr)
6300 {
6301 - if (!strcmp(s, "usedac")) {
6302 - forbid_dac = -1;
6303 + struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
6304 +
6305 + if (mem && vaddr >= mem->virt_base && vaddr <
6306 + (mem->virt_base + (mem->size << PAGE_SHIFT))) {
6307 + int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
6308 +
6309 + bitmap_release_region(mem->bitmap, page, order);
6310 return 1;
6311 }
6312 return 0;
6313 }
6314 -__setup("iommu=", check_iommu);
6315 +#else
6316 +#define dma_alloc_from_coherent_mem(dev, size, handle, ret) (0)
6317 +#define dma_release_coherent(dev, order, vaddr) (0)
6318 +#endif /* CONFIG_X86_32 */
6319 +
6320 +int dma_supported(struct device *dev, u64 mask)
6321 +{
6322 +#ifdef CONFIG_PCI
6323 + if (mask > 0xffffffff && forbid_dac > 0) {
6324 + printk(KERN_INFO "PCI: Disallowing DAC for device %s\n",
6325 + dev->bus_id);
6326 + return 0;
6327 + }
6328 #endif
6329
6330 -dma_addr_t
6331 -dma_map_single(struct device *dev, void *ptr, size_t size,
6332 - enum dma_data_direction direction)
6333 + if (dma_ops->dma_supported)
6334 + return dma_ops->dma_supported(dev, mask);
6335 +
6336 + /* Copied from i386. Doesn't make much sense, because it will
6337 + only work for pci_alloc_coherent.
6338 + The caller just has to use GFP_DMA in this case. */
6339 + if (mask < DMA_24BIT_MASK)
6340 + return 0;
6341 +
6342 + /* Tell the device to use SAC when IOMMU force is on. This
6343 + allows the driver to use cheaper accesses in some cases.
6344 +
6345 + Problem with this is that if we overflow the IOMMU area and
6346 + return DAC as fallback address the device may not handle it
6347 + correctly.
6348 +
6349 + As a special case some controllers have a 39bit address
6350 + mode that is as efficient as 32bit (aic79xx). Don't force
6351 + SAC for these. Assume all masks <= 40 bits are of this
6352 + type. Normally this doesn't make any difference, but gives
6353 + more gentle handling of IOMMU overflow. */
6354 + if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) {
6355 + printk(KERN_INFO "%s: Force SAC with mask %Lx\n",
6356 + dev->bus_id, mask);
6357 + return 0;
6358 + }
6359 +
6360 + return 1;
6361 +}
6362 +EXPORT_SYMBOL(dma_supported);
6363 +
6364 +/* Allocate DMA memory on node near device */
6365 +static struct page *
6366 +dma_alloc_pages(struct device *dev, gfp_t gfp, unsigned order)
6367 {
6368 - dma_addr_t dma;
6369 + int node;
6370
6371 - BUG_ON(!valid_dma_direction(direction));
6372 - WARN_ON(size == 0);
6373 + node = dev_to_node(dev);
6374
6375 - if (swiotlb) {
6376 - dma = swiotlb_map_single(dev, ptr, size, direction);
6377 - } else {
6378 - dma = gnttab_dma_map_page(virt_to_page(ptr)) +
6379 - offset_in_page(ptr);
6380 - IOMMU_BUG_ON(range_straddles_page_boundary(__pa(ptr), size));
6381 - IOMMU_BUG_ON(address_needs_mapping(dev, dma));
6382 - }
6383 -
6384 - flush_write_buffers();
6385 - return dma;
6386 -}
6387 -EXPORT_SYMBOL(dma_map_single);
6388 -
6389 -void
6390 -dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
6391 - enum dma_data_direction direction)
6392 -{
6393 - BUG_ON(!valid_dma_direction(direction));
6394 - if (swiotlb)
6395 - swiotlb_unmap_single(dev, dma_addr, size, direction);
6396 - else
6397 - gnttab_dma_unmap_page(dma_addr);
6398 + return alloc_pages_node(node, gfp, order);
6399 +}
6400 +
6401 +/*
6402 + * Allocate memory for a coherent mapping.
6403 + */
6404 +void *
6405 +dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
6406 + gfp_t gfp)
6407 +{
6408 + void *memory = NULL;
6409 + struct page *page;
6410 + unsigned long dma_mask = 0;
6411 + int noretry = 0;
6412 + unsigned int order = get_order(size);
6413 +
6414 + /* ignore region specifiers */
6415 + gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
6416 +
6417 + if (dma_alloc_from_coherent_mem(dev, size, dma_handle, &memory))
6418 + return memory;
6419 +
6420 + if (!dev) {
6421 + dev = &fallback_dev;
6422 + gfp |= GFP_DMA;
6423 + }
6424 + dma_mask = dev->coherent_dma_mask;
6425 + if (dma_mask == 0)
6426 + dma_mask = (gfp & GFP_DMA) ? DMA_24BIT_MASK : DMA_32BIT_MASK;
6427 +
6428 + /* Device not DMA able */
6429 + if (dev->dma_mask == NULL)
6430 + return NULL;
6431 +
6432 +#ifdef CONFIG_XEN
6433 + gfp &= ~(__GFP_DMA | __GFP_DMA32);
6434 +#else
6435 + /* Don't invoke OOM killer or retry in lower 16MB DMA zone */
6436 + if (gfp & __GFP_DMA)
6437 + noretry = 1;
6438 +
6439 +#ifdef CONFIG_X86_64
6440 + /* Why <=? Even when the mask is smaller than 4GB it is often
6441 + larger than 16MB and in this case we have a chance of
6442 + finding fitting memory in the next higher zone first. If
6443 + not retry with true GFP_DMA. -AK */
6444 + if (dma_mask <= DMA_32BIT_MASK && !(gfp & GFP_DMA)) {
6445 + gfp |= GFP_DMA32;
6446 +#endif
6447 +
6448 + again:
6449 +#endif
6450 + page = dma_alloc_pages(dev,
6451 + noretry ? gfp | __GFP_NORETRY : gfp, order);
6452 + if (page == NULL)
6453 + return NULL;
6454 +
6455 +#ifndef CONFIG_XEN
6456 + {
6457 + int high, mmu;
6458 + dma_addr_t bus = page_to_phys(page);
6459 + memory = page_address(page);
6460 + high = (bus + size) >= dma_mask;
6461 + mmu = high;
6462 + if (force_iommu && !(gfp & GFP_DMA))
6463 + mmu = 1;
6464 + else if (high) {
6465 + free_pages((unsigned long)memory, order);
6466 +
6467 + /* Don't use the 16MB ZONE_DMA unless absolutely
6468 + needed. It's better to use remapping first. */
6469 + if (dma_mask < DMA_32BIT_MASK && !(gfp & GFP_DMA)) {
6470 + gfp = (gfp & ~GFP_DMA32) | GFP_DMA;
6471 + goto again;
6472 + }
6473 +
6474 + /* Let low level make its own zone decisions */
6475 + gfp &= ~(GFP_DMA32|GFP_DMA);
6476 +
6477 + if (dma_ops->alloc_coherent)
6478 + return dma_ops->alloc_coherent(dev, size,
6479 + dma_handle, gfp);
6480 + return NULL;
6481 + }
6482 +
6483 + memset(memory, 0, size);
6484 + if (!mmu) {
6485 + *dma_handle = bus;
6486 + return memory;
6487 + }
6488 + }
6489 +
6490 + if (dma_ops->alloc_coherent) {
6491 + free_pages((unsigned long)memory, order);
6492 + gfp &= ~(GFP_DMA|GFP_DMA32);
6493 + return dma_ops->alloc_coherent(dev, size, dma_handle, gfp);
6494 + }
6495 +
6496 + if (dma_ops->map_simple) {
6497 + *dma_handle = dma_ops->map_simple(dev, virt_to_bus(memory),
6498 + size,
6499 + PCI_DMA_BIDIRECTIONAL);
6500 + if (*dma_handle != bad_dma_address)
6501 + return memory;
6502 + }
6503 +#else
6504 + memory = page_address(page);
6505 + if (xen_create_contiguous_region((unsigned long)memory, order,
6506 + fls64(dma_mask)) == 0) {
6507 + memset(memory, 0, size);
6508 + *dma_handle = virt_to_bus(memory);
6509 + return memory;
6510 + }
6511 +#endif
6512 +
6513 + if (panic_on_overflow)
6514 + panic("dma_alloc_coherent: IOMMU overflow by %lu bytes\n",
6515 + (unsigned long)size);
6516 + free_pages((unsigned long)memory, order);
6517 + return NULL;
6518 }
6519 -EXPORT_SYMBOL(dma_unmap_single);
6520 +EXPORT_SYMBOL(dma_alloc_coherent);
6521
6522 -void
6523 -dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size,
6524 - enum dma_data_direction direction)
6525 +/*
6526 + * Unmap coherent memory.
6527 + * The caller must ensure that the device has finished accessing the mapping.
6528 + */
6529 +void dma_free_coherent(struct device *dev, size_t size,
6530 + void *vaddr, dma_addr_t bus)
6531 {
6532 - if (swiotlb)
6533 - swiotlb_sync_single_for_cpu(dev, dma_handle, size, direction);
6534 + int order = get_order(size);
6535 + WARN_ON(irqs_disabled()); /* for portability */
6536 + if (dma_release_coherent(dev, order, vaddr))
6537 + return;
6538 +#ifndef CONFIG_XEN
6539 + if (dma_ops->unmap_single)
6540 + dma_ops->unmap_single(dev, bus, size, 0);
6541 +#endif
6542 + xen_destroy_contiguous_region((unsigned long)vaddr, order);
6543 + free_pages((unsigned long)vaddr, order);
6544 }
6545 -EXPORT_SYMBOL(dma_sync_single_for_cpu);
6546 +EXPORT_SYMBOL(dma_free_coherent);
6547
6548 -void
6549 -dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size,
6550 - enum dma_data_direction direction)
6551 +static int __init pci_iommu_init(void)
6552 {
6553 - if (swiotlb)
6554 - swiotlb_sync_single_for_device(dev, dma_handle, size, direction);
6555 +#ifdef CONFIG_CALGARY_IOMMU
6556 + calgary_iommu_init();
6557 +#endif
6558 +
6559 + intel_iommu_init();
6560 +
6561 +#ifdef CONFIG_GART_IOMMU
6562 + gart_iommu_init();
6563 +#endif
6564 +
6565 + no_iommu_init();
6566 + return 0;
6567 }
6568 -EXPORT_SYMBOL(dma_sync_single_for_device);
6569
6570 -void
6571 -dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems,
6572 - enum dma_data_direction direction)
6573 +void pci_iommu_shutdown(void)
6574 {
6575 - if (swiotlb)
6576 - swiotlb_sync_sg_for_cpu(dev,sg,nelems,direction);
6577 - flush_write_buffers();
6578 + gart_iommu_shutdown();
6579 }
6580 -EXPORT_SYMBOL(dma_sync_sg_for_cpu);
6581 +/* Must execute after PCI subsystem */
6582 +fs_initcall(pci_iommu_init);
6583 +
6584 +#ifdef CONFIG_PCI
6585 +/* Many VIA bridges seem to corrupt data for DAC. Disable it here */
6586
6587 -void
6588 -dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems,
6589 - enum dma_data_direction direction)
6590 +static __devinit void via_no_dac(struct pci_dev *dev)
6591 {
6592 - if (swiotlb)
6593 - swiotlb_sync_sg_for_device(dev,sg,nelems,direction);
6594 - flush_write_buffers();
6595 + if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) {
6596 + printk(KERN_INFO "PCI: VIA PCI bridge detected."
6597 + "Disabling DAC.\n");
6598 + forbid_dac = 1;
6599 + }
6600 }
6601 -EXPORT_SYMBOL(dma_sync_sg_for_device);
6602 +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac);
6603 +#endif
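The rewritten pci-dma-xen.c above replaces the open-coded swiotlb checks with a dma_mapping_ops table selected once at init (dma_ops = &swiotlb_dma_ops, or the nommu ops added in the next file). A stand-alone C sketch of that dispatch pattern; the two backends here are hypothetical stand-ins, not the real swiotlb/gnttab implementations:

#include <stdio.h>
#include <stddef.h>

/* One ops table per backend, one global pointer chosen at init. */
struct dma_mapping_ops {
	const char *name;
	unsigned long (*map_single)(void *ptr, size_t size);
};

static unsigned long swiotlb_map(void *ptr, size_t size)
{
	(void)size;			/* hypothetical bounce-buffer backend */
	return (unsigned long)ptr ^ 0x80000000UL;
}

static unsigned long nommu_map(void *ptr, size_t size)
{
	(void)size;			/* hypothetical 1:1 backend */
	return (unsigned long)ptr;
}

static const struct dma_mapping_ops swiotlb_ops = { "swiotlb", swiotlb_map };
static const struct dma_mapping_ops nommu_ops = { "nommu", nommu_map };
static const struct dma_mapping_ops *dma_ops;

int main(void)
{
	char buf[64];
	int have_swiotlb = 1;		/* assumption: bounce buffers set up */

	dma_ops = have_swiotlb ? &swiotlb_ops : &nommu_ops;
	printf("using %s ops, bus address %#lx\n", dma_ops->name,
	       dma_ops->map_single(buf, sizeof(buf)));
	return 0;
}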
6604 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
6605 +++ sle11-2009-10-16/arch/x86/kernel/pci-nommu-xen.c 2009-03-16 16:38:05.000000000 +0100
6606 @@ -0,0 +1,103 @@
6607 +#include <linux/dma-mapping.h>
6608 +#include <linux/dmar.h>
6609 +#include <linux/bootmem.h>
6610 +#include <linux/pci.h>
6611 +
6612 +#include <xen/gnttab.h>
6613 +
6614 +#include <asm/proto.h>
6615 +#include <asm/dma.h>
6616 +#include <asm/swiotlb.h>
6617 +#include <asm/tlbflush.h>
6618 +#include <asm/gnttab_dma.h>
6619 +#include <asm/bug.h>
6620 +
6621 +#define IOMMU_BUG_ON(test) \
6622 +do { \
6623 + if (unlikely(test)) { \
6624 + printk(KERN_ALERT "Fatal DMA error! " \
6625 + "Please use 'swiotlb=force'\n"); \
6626 + BUG(); \
6627 + } \
6628 +} while (0)
6629 +
6630 +static int
6631 +gnttab_map_sg(struct device *hwdev, struct scatterlist *sgl, int nents,
6632 + int direction)
6633 +{
6634 + unsigned int i;
6635 + struct scatterlist *sg;
6636 +
6637 + WARN_ON(nents == 0 || sgl->length == 0);
6638 +
6639 + for_each_sg(sgl, sg, nents, i) {
6640 + BUG_ON(!sg_page(sg));
6641 + sg->dma_address =
6642 + gnttab_dma_map_page(sg_page(sg)) + sg->offset;
6643 + sg->dma_length = sg->length;
6644 + IOMMU_BUG_ON(address_needs_mapping(
6645 + hwdev, sg->dma_address));
6646 + IOMMU_BUG_ON(range_straddles_page_boundary(
6647 + page_to_pseudophys(sg_page(sg)) + sg->offset,
6648 + sg->length));
6649 + }
6650 +
6651 + return nents;
6652 +}
6653 +
6654 +static void
6655 +gnttab_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nents,
6656 + int direction)
6657 +{
6658 + unsigned int i;
6659 + struct scatterlist *sg;
6660 +
6661 + for_each_sg(sgl, sg, nents, i)
6662 + gnttab_dma_unmap_page(sg->dma_address);
6663 +}
6664 +
6665 +static dma_addr_t
6666 +gnttab_map_single(struct device *dev, phys_addr_t paddr, size_t size,
6667 + int direction)
6668 +{
6669 + dma_addr_t dma;
6670 +
6671 + WARN_ON(size == 0);
6672 +
6673 + dma = gnttab_dma_map_page(pfn_to_page(paddr >> PAGE_SHIFT)) +
6674 + offset_in_page(paddr);
6675 + IOMMU_BUG_ON(range_straddles_page_boundary(paddr, size));
6676 + IOMMU_BUG_ON(address_needs_mapping(dev, dma));
6677 +
6678 + return dma;
6679 +}
6680 +
6681 +static void
6682 +gnttab_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
6683 + int direction)
6684 +{
6685 + gnttab_dma_unmap_page(dma_addr);
6686 +}
6687 +
6688 +static int nommu_mapping_error(dma_addr_t dma_addr)
6689 +{
6690 + return (dma_addr == bad_dma_address);
6691 +}
6692 +
6693 +static const struct dma_mapping_ops nommu_dma_ops = {
6694 + .map_single = gnttab_map_single,
6695 + .unmap_single = gnttab_unmap_single,
6696 + .map_sg = gnttab_map_sg,
6697 + .unmap_sg = gnttab_unmap_sg,
6698 + .dma_supported = swiotlb_dma_supported,
6699 + .mapping_error = nommu_mapping_error
6700 +};
6701 +
6702 +void __init no_iommu_init(void)
6703 +{
6704 + if (dma_ops)
6705 + return;
6706 +
6707 + force_iommu = 0; /* no HW IOMMU */
6708 + dma_ops = &nommu_dma_ops;
6709 +}
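gnttab_map_single()/gnttab_map_sg() above can only hand a device one machine-contiguous range, hence the IOMMU_BUG_ON(range_straddles_page_boundary(...)) checks. A user-space sketch of that test, assuming 4 KiB pages and a hypothetical pfn-to-mfn table in place of the real p2m lookup:

#include <stdio.h>
#include <stddef.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PAGE_MASK	(~(PAGE_SIZE - 1))

/* hypothetical p2m table: pseudo-physical frame -> machine frame */
static const unsigned long p2m[] = { 100, 101, 250, 251 };

static unsigned long pfn_to_mfn(unsigned long pfn)
{
	return p2m[pfn];
}

/* A buffer is only a problem if it crosses a page boundary and the
 * backing frames are not machine-contiguous. */
static int range_straddles_page_boundary(unsigned long paddr, size_t size)
{
	unsigned long pfn = paddr >> PAGE_SHIFT;
	unsigned long offset = paddr & ~PAGE_MASK;
	unsigned long nr_pages = (offset + size + PAGE_SIZE - 1) >> PAGE_SHIFT;
	unsigned long i, next_mfn = pfn_to_mfn(pfn);

	if (offset + size <= PAGE_SIZE)
		return 0;
	for (i = 1; i < nr_pages; i++)
		if (pfn_to_mfn(++pfn) != ++next_mfn)
			return 1;
	return 0;
}

int main(void)
{
	/* pfn 0->1 is machine-contiguous (100,101), pfn 1->2 is not */
	printf("%d\n", range_straddles_page_boundary(0x0800, 0x1000)); /* 0 */
	printf("%d\n", range_straddles_page_boundary(0x1800, 0x1000)); /* 1 */
	return 0;
}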
6710 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
6711 +++ sle11-2009-10-16/arch/x86/kernel/process-xen.c 2009-03-16 16:38:05.000000000 +0100
6712 @@ -0,0 +1,188 @@
6713 +#include <linux/errno.h>
6714 +#include <linux/kernel.h>
6715 +#include <linux/mm.h>
6716 +#include <linux/smp.h>
6717 +#include <linux/slab.h>
6718 +#include <linux/sched.h>
6719 +#include <linux/module.h>
6720 +#include <linux/pm.h>
6721 +
6722 +struct kmem_cache *task_xstate_cachep;
6723 +
6724 +int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
6725 +{
6726 + *dst = *src;
6727 + if (src->thread.xstate) {
6728 + dst->thread.xstate = kmem_cache_alloc(task_xstate_cachep,
6729 + GFP_KERNEL);
6730 + if (!dst->thread.xstate)
6731 + return -ENOMEM;
6732 + WARN_ON((unsigned long)dst->thread.xstate & 15);
6733 + memcpy(dst->thread.xstate, src->thread.xstate, xstate_size);
6734 + }
6735 + return 0;
6736 +}
6737 +
6738 +void free_thread_xstate(struct task_struct *tsk)
6739 +{
6740 + if (tsk->thread.xstate) {
6741 + kmem_cache_free(task_xstate_cachep, tsk->thread.xstate);
6742 + tsk->thread.xstate = NULL;
6743 + }
6744 +}
6745 +
6746 +void free_thread_info(struct thread_info *ti)
6747 +{
6748 + free_thread_xstate(ti->task);
6749 + free_pages((unsigned long)ti, get_order(THREAD_SIZE));
6750 +}
6751 +
6752 +void arch_task_cache_init(void)
6753 +{
6754 + task_xstate_cachep =
6755 + kmem_cache_create("task_xstate", xstate_size,
6756 + __alignof__(union thread_xstate),
6757 + SLAB_PANIC, NULL);
6758 +}
6759 +
6760 +static void do_nothing(void *unused)
6761 +{
6762 +}
6763 +
6764 +/*
6765 + * cpu_idle_wait - Used to ensure that all the CPUs discard old value of
6766 + * pm_idle and update to new pm_idle value. Required while changing pm_idle
6767 + * handler on SMP systems.
6768 + *
6769 + * Caller must have changed pm_idle to the new value before the call. Old
6770 + * pm_idle value will not be used by any CPU after the return of this function.
6771 + */
6772 +void cpu_idle_wait(void)
6773 +{
6774 + smp_mb();
6775 + /* kick all the CPUs so that they exit out of pm_idle */
6776 + smp_call_function(do_nothing, NULL, 0, 1);
6777 +}
6778 +EXPORT_SYMBOL_GPL(cpu_idle_wait);
6779 +
6780 +#ifndef CONFIG_XEN
6781 +/*
6782 + * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
6783 + * which can obviate IPI to trigger checking of need_resched.
6784 + * We execute MONITOR against need_resched and enter optimized wait state
6785 + * through MWAIT. Whenever someone changes need_resched, we would be woken
6786 + * up from MWAIT (without an IPI).
6787 + *
6788 + * New with Core Duo processors, MWAIT can take some hints based on CPU
6789 + * capability.
6790 + */
6791 +void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
6792 +{
6793 + if (!need_resched()) {
6794 + __monitor((void *)&current_thread_info()->flags, 0, 0);
6795 + smp_mb();
6796 + if (!need_resched())
6797 + __mwait(ax, cx);
6798 + }
6799 +}
6800 +
6801 +/* Default MONITOR/MWAIT with no hints, used for default C1 state */
6802 +static void mwait_idle(void)
6803 +{
6804 + if (!need_resched()) {
6805 + __monitor((void *)&current_thread_info()->flags, 0, 0);
6806 + smp_mb();
6807 + if (!need_resched())
6808 + __sti_mwait(0, 0);
6809 + else
6810 + local_irq_enable();
6811 + } else
6812 + local_irq_enable();
6813 +}
6814 +#endif
6815 +
6816 +/*
6817 + * On SMP it's slightly faster (but much more power-consuming!)
6818 + * to poll the ->work.need_resched flag instead of waiting for the
6819 + * cross-CPU IPI to arrive. Use this option with caution.
6820 + */
6821 +static void poll_idle(void)
6822 +{
6823 + local_irq_enable();
6824 + cpu_relax();
6825 +}
6826 +
6827 +#ifndef CONFIG_XEN
6828 +/*
6829 + * mwait selection logic:
6830 + *
6831 + * It depends on the CPU. For AMD CPUs that support MWAIT this is
6832 + * wrong. Family 0x10 and 0x11 CPUs will enter C1 on HLT. Powersavings
6833 + * then depend on a clock divisor and current Pstate of the core. If
6834 + * all cores of a processor are in halt state (C1) the processor can
6835 + * enter the C1E (C1 enhanced) state. If mwait is used this will never
6836 + * happen.
6837 + *
6838 + * idle=mwait overrides this decision and forces the usage of mwait.
6839 + */
6840 +static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
6841 +{
6842 + if (force_mwait)
6843 + return 1;
6844 +
6845 + if (c->x86_vendor == X86_VENDOR_AMD) {
6846 + switch(c->x86) {
6847 + case 0x10:
6848 + case 0x11:
6849 + return 0;
6850 + }
6851 + }
6852 + return 1;
6853 +}
6854 +#endif
6855 +
6856 +void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
6857 +{
6858 +#ifndef CONFIG_XEN
6859 + static int selected;
6860 +
6861 + if (selected)
6862 + return;
6863 +#ifdef CONFIG_X86_SMP
6864 + if (pm_idle == poll_idle && smp_num_siblings > 1) {
6865 + printk(KERN_WARNING "WARNING: polling idle and HT enabled,"
6866 + " performance may degrade.\n");
6867 + }
6868 +#endif
6869 + if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) {
6870 + /*
6871 + * Skip, if setup has overridden idle.
6872 + * One CPU supports mwait => All CPUs supports mwait
6873 + */
6874 + if (!pm_idle) {
6875 + printk(KERN_INFO "using mwait in idle threads.\n");
6876 + pm_idle = mwait_idle;
6877 + }
6878 + }
6879 + selected = 1;
6880 +#endif
6881 +}
6882 +
6883 +static int __init idle_setup(char *str)
6884 +{
6885 + if (!strcmp(str, "poll")) {
6886 + printk("using polling idle threads.\n");
6887 + pm_idle = poll_idle;
6888 + }
6889 +#ifndef CONFIG_XEN
6890 + else if (!strcmp(str, "mwait"))
6891 + force_mwait = 1;
6892 +#endif
6893 + else
6894 + return -1;
6895 +
6896 + boot_option_idle_override = 1;
6897 + return 0;
6898 +}
6899 +early_param("idle", idle_setup);
6900 +
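The mwait_usable() logic added above keeps AMD family 0x10/0x11 CPUs on HLT so an all-idle package can still drop into C1E, unless idle=mwait forces MWAIT. A minimal sketch of that decision with the vendor/family passed in rather than read from cpuinfo_x86:

#include <stdio.h>

enum { VENDOR_INTEL, VENDOR_AMD };

/* idle=mwait (force) wins; otherwise AMD families 0x10 and 0x11 fall
 * back to HLT so the C1E state stays reachable. */
static int mwait_usable(int vendor, int family, int force_mwait)
{
	if (force_mwait)
		return 1;
	if (vendor == VENDOR_AMD && (family == 0x10 || family == 0x11))
		return 0;
	return 1;
}

int main(void)
{
	printf("AMD fam 0x10:             %d\n", mwait_usable(VENDOR_AMD, 0x10, 0));
	printf("AMD fam 0x10, idle=mwait: %d\n", mwait_usable(VENDOR_AMD, 0x10, 1));
	printf("Intel fam 6:              %d\n", mwait_usable(VENDOR_INTEL, 6, 0));
	return 0;
}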
6901 --- sle11-2009-10-16.orig/arch/x86/kernel/process_32-xen.c 2009-03-16 16:33:40.000000000 +0100
6902 +++ sle11-2009-10-16/arch/x86/kernel/process_32-xen.c 2009-03-16 16:38:05.000000000 +0100
6903 @@ -36,6 +36,7 @@
6904 #include <linux/personality.h>
6905 #include <linux/tick.h>
6906 #include <linux/percpu.h>
6907 +#include <linux/prctl.h>
6908
6909 #include <asm/uaccess.h>
6910 #include <asm/pgtable.h>
6911 @@ -45,7 +46,6 @@
6912 #include <asm/processor.h>
6913 #include <asm/i387.h>
6914 #include <asm/desc.h>
6915 -#include <asm/vm86.h>
6916 #ifdef CONFIG_MATH_EMULATION
6917 #include <asm/math_emu.h>
6918 #endif
6919 @@ -102,16 +102,6 @@ void enable_hlt(void)
6920
6921 EXPORT_SYMBOL(enable_hlt);
6922
6923 -/*
6924 - * On SMP it's slightly faster (but much more power-consuming!)
6925 - * to poll the ->work.need_resched flag instead of waiting for the
6926 - * cross-CPU IPI to arrive. Use this option with caution.
6927 - */
6928 -static void poll_idle(void)
6929 -{
6930 - cpu_relax();
6931 -}
6932 -
6933 static void xen_idle(void)
6934 {
6935 current_thread_info()->status &= ~TS_POLLING;
6936 @@ -121,20 +111,10 @@ static void xen_idle(void)
6937 */
6938 smp_mb();
6939
6940 - local_irq_disable();
6941 - if (!need_resched()) {
6942 - ktime_t t0, t1;
6943 - u64 t0n, t1n;
6944 -
6945 - t0 = ktime_get();
6946 - t0n = ktime_to_ns(t0);
6947 + if (!need_resched())
6948 safe_halt(); /* enables interrupts racelessly */
6949 - local_irq_disable();
6950 - t1 = ktime_get();
6951 - t1n = ktime_to_ns(t1);
6952 - sched_clock_idle_wakeup_event(t1n - t0n);
6953 - }
6954 - local_irq_enable();
6955 + else
6956 + local_irq_enable();
6957 current_thread_info()->status |= TS_POLLING;
6958 }
6959 #ifdef CONFIG_APM_MODULE
6960 @@ -142,7 +122,6 @@ EXPORT_SYMBOL(default_idle);
6961 #endif
6962
6963 #ifdef CONFIG_HOTPLUG_CPU
6964 -extern cpumask_t cpu_initialized;
6965 static inline void play_dead(void)
6966 {
6967 idle_task_exit();
6968 @@ -187,6 +166,7 @@ void cpu_idle(void)
6969 if (cpu_is_offline(cpu))
6970 play_dead();
6971
6972 + local_irq_disable();
6973 __get_cpu_var(irq_stat).idle_timestamp = jiffies;
6974 idle();
6975 }
6976 @@ -197,44 +177,6 @@ void cpu_idle(void)
6977 }
6978 }
6979
6980 -static void do_nothing(void *unused)
6981 -{
6982 -}
6983 -
6984 -/*
6985 - * cpu_idle_wait - Used to ensure that all the CPUs discard old value of
6986 - * pm_idle and update to new pm_idle value. Required while changing pm_idle
6987 - * handler on SMP systems.
6988 - *
6989 - * Caller must have changed pm_idle to the new value before the call. Old
6990 - * pm_idle value will not be used by any CPU after the return of this function.
6991 - */
6992 -void cpu_idle_wait(void)
6993 -{
6994 - smp_mb();
6995 - /* kick all the CPUs so that they exit out of pm_idle */
6996 - smp_call_function(do_nothing, NULL, 0, 1);
6997 -}
6998 -EXPORT_SYMBOL_GPL(cpu_idle_wait);
6999 -
7000 -void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
7001 -{
7002 -}
7003 -
7004 -static int __init idle_setup(char *str)
7005 -{
7006 - if (!strcmp(str, "poll")) {
7007 - printk("using polling idle threads.\n");
7008 - pm_idle = poll_idle;
7009 - }
7010 - else
7011 - return -1;
7012 -
7013 - boot_option_idle_override = 1;
7014 - return 0;
7015 -}
7016 -early_param("idle", idle_setup);
7017 -
7018 void __show_registers(struct pt_regs *regs, int all)
7019 {
7020 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
7021 @@ -260,7 +202,7 @@ void __show_registers(struct pt_regs *re
7022 init_utsname()->version);
7023
7024 printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n",
7025 - 0xffff & regs->cs, regs->ip, regs->flags,
7026 + (u16)regs->cs, regs->ip, regs->flags,
7027 smp_processor_id());
7028 print_symbol("EIP is at %s\n", regs->ip);
7029
7030 @@ -269,8 +211,7 @@ void __show_registers(struct pt_regs *re
7031 printk("ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n",
7032 regs->si, regs->di, regs->bp, sp);
7033 printk(" DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n",
7034 - regs->ds & 0xffff, regs->es & 0xffff,
7035 - regs->fs & 0xffff, gs, ss);
7036 + (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss);
7037
7038 if (!all)
7039 return;
7040 @@ -367,6 +308,7 @@ void flush_thread(void)
7041 /*
7042 * Forget coprocessor state..
7043 */
7044 + tsk->fpu_counter = 0;
7045 clear_fpu(tsk);
7046 clear_used_math();
7047 }
7048 @@ -437,11 +379,30 @@ int copy_thread(int nr, unsigned long cl
7049 return err;
7050 }
7051
7052 -#ifdef CONFIG_SECCOMP
7053 +void
7054 +start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
7055 +{
7056 + __asm__("movl %0, %%gs" :: "r"(0));
7057 + regs->fs = 0;
7058 + set_fs(USER_DS);
7059 + regs->ds = __USER_DS;
7060 + regs->es = __USER_DS;
7061 + regs->ss = __USER_DS;
7062 + regs->cs = __USER_CS;
7063 + regs->ip = new_ip;
7064 + regs->sp = new_sp;
7065 + /*
7066 + * Free the old FP and other extended state
7067 + */
7068 + free_thread_xstate(current);
7069 +}
7070 +EXPORT_SYMBOL_GPL(start_thread);
7071 +
7072 static void hard_disable_TSC(void)
7073 {
7074 write_cr4(read_cr4() | X86_CR4_TSD);
7075 }
7076 +
7077 void disable_TSC(void)
7078 {
7079 preempt_disable();
7080 @@ -453,11 +414,47 @@ void disable_TSC(void)
7081 hard_disable_TSC();
7082 preempt_enable();
7083 }
7084 +
7085 static void hard_enable_TSC(void)
7086 {
7087 write_cr4(read_cr4() & ~X86_CR4_TSD);
7088 }
7089 -#endif /* CONFIG_SECCOMP */
7090 +
7091 +static void enable_TSC(void)
7092 +{
7093 + preempt_disable();
7094 + if (test_and_clear_thread_flag(TIF_NOTSC))
7095 + /*
7096 + * Must flip the CPU state synchronously with
7097 + * TIF_NOTSC in the current running context.
7098 + */
7099 + hard_enable_TSC();
7100 + preempt_enable();
7101 +}
7102 +
7103 +int get_tsc_mode(unsigned long adr)
7104 +{
7105 + unsigned int val;
7106 +
7107 + if (test_thread_flag(TIF_NOTSC))
7108 + val = PR_TSC_SIGSEGV;
7109 + else
7110 + val = PR_TSC_ENABLE;
7111 +
7112 + return put_user(val, (unsigned int __user *)adr);
7113 +}
7114 +
7115 +int set_tsc_mode(unsigned int val)
7116 +{
7117 + if (val == PR_TSC_SIGSEGV)
7118 + disable_TSC();
7119 + else if (val == PR_TSC_ENABLE)
7120 + enable_TSC();
7121 + else
7122 + return -EINVAL;
7123 +
7124 + return 0;
7125 +}
7126
7127 static noinline void
7128 __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p)
7129 @@ -473,12 +470,12 @@ __switch_to_xtra(struct task_struct *pre
7130 /* we clear debugctl to make sure DS
7131 * is not in use when we change it */
7132 debugctl = 0;
7133 - wrmsrl(MSR_IA32_DEBUGCTLMSR, 0);
7134 + update_debugctlmsr(0);
7135 wrmsr(MSR_IA32_DS_AREA, next->ds_area_msr, 0);
7136 }
7137
7138 if (next->debugctlmsr != debugctl)
7139 - wrmsr(MSR_IA32_DEBUGCTLMSR, next->debugctlmsr, 0);
7140 + update_debugctlmsr(next->debugctlmsr);
7141
7142 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
7143 set_debugreg(next->debugreg0, 0);
7144 @@ -490,7 +487,6 @@ __switch_to_xtra(struct task_struct *pre
7145 set_debugreg(next->debugreg7, 7);
7146 }
7147
7148 -#ifdef CONFIG_SECCOMP
7149 if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
7150 test_tsk_thread_flag(next_p, TIF_NOTSC)) {
7151 /* prev and next are different */
7152 @@ -499,7 +495,6 @@ __switch_to_xtra(struct task_struct *pre
7153 else
7154 hard_enable_TSC();
7155 }
7156 -#endif
7157
7158 #ifdef X86_BTS
7159 if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
7160 @@ -637,7 +632,7 @@ struct task_struct * __switch_to(struct
7161
7162 /* we're going to use this soon, after a few expensive things */
7163 if (next_p->fpu_counter > 5)
7164 - prefetch(&next->i387.fxsave);
7165 + prefetch(next->xstate);
7166
7167 /*
7168 * Now maybe handle debug registers
7169 @@ -658,8 +653,11 @@ struct task_struct * __switch_to(struct
7170 /* If the task has used fpu the last 5 timeslices, just do a full
7171 * restore of the math state immediately to avoid the trap; the
7172 * chances of needing FPU soon are obviously high now
7173 + *
7174 + * tsk_used_math() checks prevent calling math_state_restore(),
7175 + * which can sleep in the case of !tsk_used_math()
7176 */
7177 - if (next_p->fpu_counter > 5)
7178 + if (tsk_used_math(next_p) && next_p->fpu_counter > 5)
7179 math_state_restore();
7180
7181 /*
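
[Editorial note on the hunk above] The get_tsc_mode()/set_tsc_mode() helpers added in this hunk (and mirrored in the 64-bit file below) back the PR_GET_TSC/PR_SET_TSC prctl interface introduced in 2.6.26: selecting PR_TSC_SIGSEGV sets TIF_NOTSC, and __switch_to_xtra() then flips CR4.TSD so a later RDTSC in that task faults. A minimal userspace sketch of that interface, assuming the PR_*_TSC constants from a 2.6.26 <linux/prctl.h> (illustrative only, not part of this patch):

#include <stdio.h>
#include <sys/prctl.h>
#include <linux/prctl.h>

int main(void)
{
	int mode = 0;

	/* Request SIGSEGV on RDTSC for this task (sets TIF_NOTSC) */
	if (prctl(PR_SET_TSC, PR_TSC_SIGSEGV, 0, 0, 0))
		perror("PR_SET_TSC");

	/* Read the mode back; the kernel stores it via put_user() */
	if (prctl(PR_GET_TSC, &mode, 0, 0, 0))
		perror("PR_GET_TSC");
	else
		printf("tsc mode: %d\n", mode);

	return 0;
}
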
7182 --- sle11-2009-10-16.orig/arch/x86/kernel/process_64-xen.c 2009-03-16 16:33:40.000000000 +0100
7183 +++ sle11-2009-10-16/arch/x86/kernel/process_64-xen.c 2009-03-16 16:38:05.000000000 +0100
7184 @@ -39,6 +39,7 @@
7185 #include <linux/kprobes.h>
7186 #include <linux/kdebug.h>
7187 #include <linux/tick.h>
7188 +#include <linux/prctl.h>
7189
7190 #include <asm/uaccess.h>
7191 #include <asm/pgtable.h>
7192 @@ -102,17 +103,6 @@ void exit_idle(void)
7193 __exit_idle();
7194 }
7195
7196 -/*
7197 - * On SMP it's slightly faster (but much more power-consuming!)
7198 - * to poll the ->need_resched flag instead of waiting for the
7199 - * cross-CPU IPI to arrive. Use this option with caution.
7200 - */
7201 -static void poll_idle(void)
7202 -{
7203 - local_irq_enable();
7204 - cpu_relax();
7205 -}
7206 -
7207 static void xen_idle(void)
7208 {
7209 current_thread_info()->status &= ~TS_POLLING;
7210 @@ -121,20 +111,10 @@ static void xen_idle(void)
7211 * test NEED_RESCHED:
7212 */
7213 smp_mb();
7214 - local_irq_disable();
7215 - if (!need_resched()) {
7216 - ktime_t t0, t1;
7217 - u64 t0n, t1n;
7218 -
7219 - t0 = ktime_get();
7220 - t0n = ktime_to_ns(t0);
7221 + if (!need_resched())
7222 safe_halt(); /* enables interrupts racelessly */
7223 - local_irq_disable();
7224 - t1 = ktime_get();
7225 - t1n = ktime_to_ns(t1);
7226 - sched_clock_idle_wakeup_event(t1n - t0n);
7227 - }
7228 - local_irq_enable();
7229 + else
7230 + local_irq_enable();
7231 current_thread_info()->status |= TS_POLLING;
7232 }
7233
7234 @@ -195,45 +175,6 @@ void cpu_idle(void)
7235 }
7236 }
7237
7238 -static void do_nothing(void *unused)
7239 -{
7240 -}
7241 -
7242 -/*
7243 - * cpu_idle_wait - Used to ensure that all the CPUs discard old value of
7244 - * pm_idle and update to new pm_idle value. Required while changing pm_idle
7245 - * handler on SMP systems.
7246 - *
7247 - * Caller must have changed pm_idle to the new value before the call. Old
7248 - * pm_idle value will not be used by any CPU after the return of this function.
7249 - */
7250 -void cpu_idle_wait(void)
7251 -{
7252 - smp_mb();
7253 - /* kick all the CPUs so that they exit out of pm_idle */
7254 - smp_call_function(do_nothing, NULL, 0, 1);
7255 -}
7256 -EXPORT_SYMBOL_GPL(cpu_idle_wait);
7257 -
7258 -void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
7259 -{
7260 -}
7261 -
7262 -static int __init idle_setup(char *str)
7263 -{
7264 - if (!strcmp(str, "poll")) {
7265 - printk("using polling idle threads.\n");
7266 - pm_idle = poll_idle;
7267 - } else if (!strcmp(str, "mwait"))
7268 - force_mwait = 1;
7269 - else
7270 - return -1;
7271 -
7272 - boot_option_idle_override = 1;
7273 - return 0;
7274 -}
7275 -early_param("idle", idle_setup);
7276 -
7277 /* Prints also some state that isn't saved in the pt_regs */
7278 void __show_regs(struct pt_regs * regs)
7279 {
7280 @@ -360,6 +301,7 @@ void flush_thread(void)
7281 /*
7282 * Forget coprocessor state..
7283 */
7284 + tsk->fpu_counter = 0;
7285 clear_fpu(tsk);
7286 clear_used_math();
7287 }
7288 @@ -472,6 +414,83 @@ out:
7289 return err;
7290 }
7291
7292 +void
7293 +start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
7294 +{
7295 + asm volatile("movl %0, %%fs; movl %0, %%es; movl %0, %%ds" :: "r"(0));
7296 + load_gs_index(0);
7297 + regs->ip = new_ip;
7298 + regs->sp = new_sp;
7299 + write_pda(oldrsp, new_sp);
7300 + regs->cs = __USER_CS;
7301 + regs->ss = __USER_DS;
7302 + regs->flags = 0x200;
7303 + set_fs(USER_DS);
7304 + /*
7305 + * Free the old FP and other extended state
7306 + */
7307 + free_thread_xstate(current);
7308 +}
7309 +EXPORT_SYMBOL_GPL(start_thread);
7310 +
7311 +static void hard_disable_TSC(void)
7312 +{
7313 + write_cr4(read_cr4() | X86_CR4_TSD);
7314 +}
7315 +
7316 +void disable_TSC(void)
7317 +{
7318 + preempt_disable();
7319 + if (!test_and_set_thread_flag(TIF_NOTSC))
7320 + /*
7321 + * Must flip the CPU state synchronously with
7322 + * TIF_NOTSC in the current running context.
7323 + */
7324 + hard_disable_TSC();
7325 + preempt_enable();
7326 +}
7327 +
7328 +static void hard_enable_TSC(void)
7329 +{
7330 + write_cr4(read_cr4() & ~X86_CR4_TSD);
7331 +}
7332 +
7333 +static void enable_TSC(void)
7334 +{
7335 + preempt_disable();
7336 + if (test_and_clear_thread_flag(TIF_NOTSC))
7337 + /*
7338 + * Must flip the CPU state synchronously with
7339 + * TIF_NOTSC in the current running context.
7340 + */
7341 + hard_enable_TSC();
7342 + preempt_enable();
7343 +}
7344 +
7345 +int get_tsc_mode(unsigned long adr)
7346 +{
7347 + unsigned int val;
7348 +
7349 + if (test_thread_flag(TIF_NOTSC))
7350 + val = PR_TSC_SIGSEGV;
7351 + else
7352 + val = PR_TSC_ENABLE;
7353 +
7354 + return put_user(val, (unsigned int __user *)adr);
7355 +}
7356 +
7357 +int set_tsc_mode(unsigned int val)
7358 +{
7359 + if (val == PR_TSC_SIGSEGV)
7360 + disable_TSC();
7361 + else if (val == PR_TSC_ENABLE)
7362 + enable_TSC();
7363 + else
7364 + return -EINVAL;
7365 +
7366 + return 0;
7367 +}
7368 +
7369 /*
7370 * This special macro can be used to load a debugging register
7371 */
7372 @@ -491,12 +510,12 @@ static inline void __switch_to_xtra(stru
7373 /* we clear debugctl to make sure DS
7374 * is not in use when we change it */
7375 debugctl = 0;
7376 - wrmsrl(MSR_IA32_DEBUGCTLMSR, 0);
7377 + update_debugctlmsr(0);
7378 wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr);
7379 }
7380
7381 if (next->debugctlmsr != debugctl)
7382 - wrmsrl(MSR_IA32_DEBUGCTLMSR, next->debugctlmsr);
7383 + update_debugctlmsr(next->debugctlmsr);
7384
7385 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
7386 loaddebug(next, 0);
7387 @@ -508,6 +527,15 @@ static inline void __switch_to_xtra(stru
7388 loaddebug(next, 7);
7389 }
7390
7391 + if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
7392 + test_tsk_thread_flag(next_p, TIF_NOTSC)) {
7393 + /* prev and next are different */
7394 + if (test_tsk_thread_flag(next_p, TIF_NOTSC))
7395 + hard_disable_TSC();
7396 + else
7397 + hard_enable_TSC();
7398 + }
7399 +
7400 #ifdef X86_BTS
7401 if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
7402 ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
7403 @@ -547,7 +575,7 @@ __switch_to(struct task_struct *prev_p,
7404
7405 /* we're going to use this soon, after a few expensive things */
7406 if (next_p->fpu_counter>5)
7407 - prefetch(&next->i387.fxsave);
7408 + prefetch(next->xstate);
7409
7410 /*
7411 * This is basically '__unlazy_fpu', except that we queue a
7412 @@ -680,8 +708,11 @@ __switch_to(struct task_struct *prev_p,
7413 /* If the task has used fpu the last 5 timeslices, just do a full
7414 * restore of the math state immediately to avoid the trap; the
7415 * chances of needing FPU soon are obviously high now
7416 + *
7417 + * tsk_used_math() checks prevent calling math_state_restore(),
7418 + * which can sleep in the case of !tsk_used_math()
7419 */
7420 - if (next_p->fpu_counter>5)
7421 + if (tsk_used_math(next_p) && next_p->fpu_counter > 5)
7422 math_state_restore();
7423 return prev_p;
7424 }
7425 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
7426 +++ sle11-2009-10-16/arch/x86/kernel/setup-xen.c 2009-03-16 16:38:05.000000000 +0100
7427 @@ -0,0 +1,141 @@
7428 +#include <linux/kernel.h>
7429 +#include <linux/module.h>
7430 +#include <linux/init.h>
7431 +#include <linux/bootmem.h>
7432 +#include <linux/percpu.h>
7433 +#include <asm/smp.h>
7434 +#include <asm/percpu.h>
7435 +#include <asm/sections.h>
7436 +#include <asm/processor.h>
7437 +#include <asm/setup.h>
7438 +#include <asm/topology.h>
7439 +#include <asm/mpspec.h>
7440 +#include <asm/apicdef.h>
7441 +
7442 +#ifdef CONFIG_X86_LOCAL_APIC
7443 +unsigned int num_processors;
7444 +unsigned disabled_cpus __cpuinitdata;
7445 +/* Processor that is doing the boot up */
7446 +unsigned int boot_cpu_physical_apicid = -1U;
7447 +EXPORT_SYMBOL(boot_cpu_physical_apicid);
7448 +
7449 +DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID;
7450 +EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid);
7451 +
7452 +/* Bitmask of physically existing CPUs */
7453 +physid_mask_t phys_cpu_present_map;
7454 +#endif
7455 +
7456 +#if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_X86_SMP)
7457 +/*
7458 + * Copy data used in early init routines from the initial arrays to the
7459 + * per cpu data areas. These arrays then become expendable and the
7460 + * *_early_ptr's are zeroed indicating that the static arrays are gone.
7461 + */
7462 +static void __init setup_per_cpu_maps(void)
7463 +{
7464 +#ifndef CONFIG_XEN
7465 + int cpu;
7466 +
7467 + for_each_possible_cpu(cpu) {
7468 + per_cpu(x86_cpu_to_apicid, cpu) = x86_cpu_to_apicid_init[cpu];
7469 + per_cpu(x86_bios_cpu_apicid, cpu) =
7470 + x86_bios_cpu_apicid_init[cpu];
7471 +#ifdef CONFIG_NUMA
7472 + per_cpu(x86_cpu_to_node_map, cpu) =
7473 + x86_cpu_to_node_map_init[cpu];
7474 +#endif
7475 + }
7476 +
7477 + /* indicate the early static arrays will soon be gone */
7478 + x86_cpu_to_apicid_early_ptr = NULL;
7479 + x86_bios_cpu_apicid_early_ptr = NULL;
7480 +#ifdef CONFIG_NUMA
7481 + x86_cpu_to_node_map_early_ptr = NULL;
7482 +#endif
7483 +#endif
7484 +}
7485 +
7486 +#ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP
7487 +cpumask_t *cpumask_of_cpu_map __read_mostly;
7488 +EXPORT_SYMBOL(cpumask_of_cpu_map);
7489 +
7490 +/* requires nr_cpu_ids to be initialized */
7491 +static void __init setup_cpumask_of_cpu(void)
7492 +{
7493 + int i;
7494 +
7495 + /* alloc_bootmem zeroes memory */
7496 + cpumask_of_cpu_map = alloc_bootmem_low(sizeof(cpumask_t) * nr_cpu_ids);
7497 + for (i = 0; i < nr_cpu_ids; i++)
7498 + cpu_set(i, cpumask_of_cpu_map[i]);
7499 +}
7500 +#else
7501 +static inline void setup_cpumask_of_cpu(void) { }
7502 +#endif
7503 +
7504 +#ifdef CONFIG_X86_32
7505 +/*
7506 + * Great future not-so-futuristic plan: make i386 and x86_64 do it
7507 + * the same way
7508 + */
7509 +unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
7510 +EXPORT_SYMBOL(__per_cpu_offset);
7511 +#endif
7512 +
7513 +/*
7514 + * Great future plan:
7515 + * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
7516 + * Always point %gs to its beginning
7517 + */
7518 +void __init setup_per_cpu_areas(void)
7519 +{
7520 + int i, highest_cpu = 0;
7521 + unsigned long size;
7522 +
7523 +#ifdef CONFIG_HOTPLUG_CPU
7524 + prefill_possible_map();
7525 +#endif
7526 +
7527 + /* Copy section for each CPU (we discard the original) */
7528 + size = PERCPU_ENOUGH_ROOM;
7529 + printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n",
7530 + size);
7531 +
7532 + for_each_possible_cpu(i) {
7533 + char *ptr;
7534 +#ifndef CONFIG_NEED_MULTIPLE_NODES
7535 + ptr = alloc_bootmem_pages(size);
7536 +#else
7537 + int node = early_cpu_to_node(i);
7538 + if (!node_online(node) || !NODE_DATA(node)) {
7539 + ptr = alloc_bootmem_pages(size);
7540 + printk(KERN_INFO
7541 + "cpu %d has no node or node-local memory\n", i);
7542 + }
7543 + else
7544 + ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
7545 +#endif
7546 + if (!ptr)
7547 + panic("Cannot allocate cpu data for CPU %d\n", i);
7548 +#ifdef CONFIG_X86_64
7549 + cpu_pda(i)->data_offset = ptr - __per_cpu_start;
7550 +#else
7551 + __per_cpu_offset[i] = ptr - __per_cpu_start;
7552 +#endif
7553 + memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
7554 +
7555 + highest_cpu = i;
7556 + }
7557 +
7558 + nr_cpu_ids = highest_cpu + 1;
7559 + printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d\n", NR_CPUS, nr_cpu_ids);
7560 +
7561 + /* Setup percpu data maps */
7562 + setup_per_cpu_maps();
7563 +
7564 + /* Setup cpumask_of_cpu map */
7565 + setup_cpumask_of_cpu();
7566 +}
7567 +
7568 +#endif
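
[Editorial note on the file above] setup_per_cpu_areas() in the new setup-xen.c allocates one copy of the .data.percpu template per possible CPU and records each copy's offset (cpu_pda(i)->data_offset on 64-bit, __per_cpu_offset[i] on 32-bit); per_cpu() and get_cpu_var() later resolve a per-CPU variable as its link-time address plus that offset. A hedged consumer-side sketch using the standard per-cpu API of this kernel generation (demo_hits, demo_hit and demo_total are hypothetical names, not code from the patch):

#include <linux/percpu.h>
#include <linux/cpumask.h>
#include <linux/smp.h>

/* One counter per CPU, laid out in the .data.percpu template copied above */
static DEFINE_PER_CPU(unsigned long, demo_hits);

static void demo_hit(void)
{
	/* get_cpu_var() disables preemption and adds this CPU's offset */
	get_cpu_var(demo_hits)++;
	put_cpu_var(demo_hits);
}

static unsigned long demo_total(void)
{
	unsigned long sum = 0;
	int cpu;

	/* per_cpu(var, cpu) reaches another CPU's copy through its offset */
	for_each_possible_cpu(cpu)
		sum += per_cpu(demo_hits, cpu);

	return sum;
}
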
7569 --- sle11-2009-10-16.orig/arch/x86/kernel/setup64-xen.c 2009-03-16 16:33:40.000000000 +0100
7570 +++ sle11-2009-10-16/arch/x86/kernel/setup64-xen.c 2009-03-16 16:38:05.000000000 +0100
7571 @@ -15,6 +15,7 @@
7572 #include <linux/bootmem.h>
7573 #include <linux/bitops.h>
7574 #include <linux/module.h>
7575 +#include <linux/kgdb.h>
7576 #include <asm/pda.h>
7577 #include <asm/pgtable.h>
7578 #include <asm/processor.h>
7579 @@ -27,6 +28,7 @@
7580 #include <asm/proto.h>
7581 #include <asm/sections.h>
7582 #include <asm/setup.h>
7583 +#include <asm/genapic.h>
7584 #ifdef CONFIG_XEN
7585 #include <asm/hypervisor.h>
7586 #endif
7587 @@ -81,8 +83,8 @@ int force_personality32 = 0;
7588 Control non executable heap for 32bit processes.
7589 To control the stack too use noexec=off
7590
7591 -on PROT_READ does not imply PROT_EXEC for 32bit processes
7592 -off PROT_READ implies PROT_EXEC (default)
7593 +on PROT_READ does not imply PROT_EXEC for 32bit processes (default)
7594 +off PROT_READ implies PROT_EXEC
7595 */
7596 static int __init nonx32_setup(char *str)
7597 {
7598 @@ -94,85 +96,6 @@ static int __init nonx32_setup(char *str
7599 }
7600 __setup("noexec32=", nonx32_setup);
7601
7602 -/*
7603 - * Copy data used in early init routines from the initial arrays to the
7604 - * per cpu data areas. These arrays then become expendable and the
7605 - * *_early_ptr's are zeroed indicating that the static arrays are gone.
7606 - */
7607 -static void __init setup_per_cpu_maps(void)
7608 -{
7609 -#ifndef CONFIG_XEN
7610 - int cpu;
7611 -
7612 - for_each_possible_cpu(cpu) {
7613 -#ifdef CONFIG_SMP
7614 - if (per_cpu_offset(cpu)) {
7615 -#endif
7616 - per_cpu(x86_cpu_to_apicid, cpu) =
7617 - x86_cpu_to_apicid_init[cpu];
7618 - per_cpu(x86_bios_cpu_apicid, cpu) =
7619 - x86_bios_cpu_apicid_init[cpu];
7620 -#ifdef CONFIG_NUMA
7621 - per_cpu(x86_cpu_to_node_map, cpu) =
7622 - x86_cpu_to_node_map_init[cpu];
7623 -#endif
7624 -#ifdef CONFIG_SMP
7625 - }
7626 - else
7627 - printk(KERN_NOTICE "per_cpu_offset zero for cpu %d\n",
7628 - cpu);
7629 -#endif
7630 - }
7631 -
7632 - /* indicate the early static arrays will soon be gone */
7633 - x86_cpu_to_apicid_early_ptr = NULL;
7634 - x86_bios_cpu_apicid_early_ptr = NULL;
7635 -#ifdef CONFIG_NUMA
7636 - x86_cpu_to_node_map_early_ptr = NULL;
7637 -#endif
7638 -#endif
7639 -}
7640 -
7641 -/*
7642 - * Great future plan:
7643 - * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
7644 - * Always point %gs to its beginning
7645 - */
7646 -void __init setup_per_cpu_areas(void)
7647 -{
7648 - int i;
7649 - unsigned long size;
7650 -
7651 -#ifdef CONFIG_HOTPLUG_CPU
7652 - prefill_possible_map();
7653 -#endif
7654 -
7655 - /* Copy section for each CPU (we discard the original) */
7656 - size = PERCPU_ENOUGH_ROOM;
7657 -
7658 - printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n", size);
7659 - for_each_cpu_mask (i, cpu_possible_map) {
7660 - char *ptr;
7661 -#ifndef CONFIG_NEED_MULTIPLE_NODES
7662 - ptr = alloc_bootmem_pages(size);
7663 -#else
7664 - int node = early_cpu_to_node(i);
7665 -
7666 - if (!node_online(node) || !NODE_DATA(node))
7667 - ptr = alloc_bootmem_pages(size);
7668 - else
7669 - ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
7670 -#endif
7671 - if (!ptr)
7672 - panic("Cannot allocate cpu data for CPU %d\n", i);
7673 - cpu_pda(i)->data_offset = ptr - __per_cpu_start;
7674 - memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
7675 - }
7676 -
7677 - /* setup percpu data maps early */
7678 - setup_per_cpu_maps();
7679 -}
7680 -
7681 #ifdef CONFIG_XEN
7682 static void __init_refok switch_pt(int cpu)
7683 {
7684 @@ -410,6 +333,17 @@ void __cpuinit cpu_init (void)
7685 #endif
7686 load_LDT(&init_mm.context);
7687
7688 +#ifdef CONFIG_KGDB
7689 + /*
7690 + * If the kgdb is connected no debug regs should be altered. This
7691 + * is only applicable when KGDB and a KGDB I/O module are built
7692 + * into the kernel and you are using early debugging with
7693 + * kgdbwait. KGDB will control the kernel HW breakpoint registers.
7694 + */
7695 + if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
7696 + arch_kgdb_ops.correct_hw_break();
7697 + else {
7698 +#endif
7699 /*
7700 * Clear all 6 debug registers:
7701 */
7702 @@ -420,10 +354,17 @@ void __cpuinit cpu_init (void)
7703 set_debugreg(0UL, 3);
7704 set_debugreg(0UL, 6);
7705 set_debugreg(0UL, 7);
7706 +#ifdef CONFIG_KGDB
7707 + /* If the kgdb is connected no debug regs should be altered. */
7708 + }
7709 +#endif
7710
7711 fpu_init();
7712
7713 asm ("pushfq; popq %0" : "=rm" (kernel_eflags));
7714 if (raw_irqs_disabled())
7715 kernel_eflags &= ~X86_EFLAGS_IF;
7716 +
7717 + if (is_uv_system())
7718 + uv_cpu_init();
7719 }
7720 --- sle11-2009-10-16.orig/arch/x86/kernel/setup_32-xen.c 2009-03-16 16:33:40.000000000 +0100
7721 +++ sle11-2009-10-16/arch/x86/kernel/setup_32-xen.c 2009-03-16 16:38:05.000000000 +0100
7722 @@ -39,6 +39,7 @@
7723 #include <linux/efi.h>
7724 #include <linux/init.h>
7725 #include <linux/edd.h>
7726 +#include <linux/iscsi_ibft.h>
7727 #include <linux/nodemask.h>
7728 #include <linux/kernel.h>
7729 #include <linux/percpu.h>
7730 @@ -49,6 +50,7 @@
7731 #include <linux/pfn.h>
7732 #include <linux/pci.h>
7733 #include <linux/init_ohci1394_dma.h>
7734 +#include <linux/kvm_para.h>
7735
7736 #include <video/edid.h>
7737
7738 @@ -70,8 +72,9 @@
7739 #include <xen/firmware.h>
7740 #include <xen/xencons.h>
7741 #include <setup_arch.h>
7742 -#include <bios_ebda.h>
7743 +#include <asm/bios_ebda.h>
7744 #include <asm/cacheflush.h>
7745 +#include <asm/processor.h>
7746
7747 #ifdef CONFIG_XEN
7748 #include <xen/interface/kexec.h>
7749 @@ -136,7 +139,12 @@ static struct resource standard_io_resou
7750 }, {
7751 .name = "keyboard",
7752 .start = 0x0060,
7753 - .end = 0x006f,
7754 + .end = 0x0060,
7755 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
7756 +}, {
7757 + .name = "keyboard",
7758 + .start = 0x0064,
7759 + .end = 0x0064,
7760 .flags = IORESOURCE_BUSY | IORESOURCE_IO
7761 }, {
7762 .name = "dma page reg",
7763 @@ -166,6 +174,8 @@ struct cpuinfo_x86 new_cpu_data __cpuini
7764 struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
7765 EXPORT_SYMBOL(boot_cpu_data);
7766
7767 +unsigned int def_to_bigsmp;
7768 +
7769 #ifndef CONFIG_X86_PAE
7770 unsigned long mmu_cr4_features;
7771 #else
7772 @@ -204,7 +214,7 @@ EXPORT_SYMBOL(ist_info);
7773 extern void early_cpu_init(void);
7774 extern int root_mountflags;
7775
7776 -unsigned long saved_videomode;
7777 +unsigned long saved_video_mode;
7778
7779 #define RAMDISK_IMAGE_START_MASK 0x07FF
7780 #define RAMDISK_PROMPT_FLAG 0x8000
7781 @@ -259,7 +269,7 @@ static inline void copy_edd(void)
7782 }
7783 #endif
7784
7785 -int __initdata user_defined_memmap = 0;
7786 +int __initdata user_defined_memmap;
7787
7788 /*
7789 * "mem=nopentium" disables the 4MB page tables.
7790 @@ -420,20 +430,59 @@ unsigned long __init find_max_low_pfn(vo
7791 }
7792
7793 #ifndef CONFIG_XEN
7794 +#define BIOS_LOWMEM_KILOBYTES 0x413
7795 +
7796 /*
7797 - * workaround for Dell systems that neglect to reserve EBDA
7798 + * The BIOS places the EBDA/XBDA at the top of conventional
7799 + * memory, and usually decreases the reported amount of
7800 + * conventional memory (int 0x12) too. This also contains a
7801 + * workaround for Dell systems that neglect to reserve EBDA.
7802 + * The same workaround also avoids a problem with the AMD768MPX
7803 + * chipset: reserve a page before VGA to prevent PCI prefetch
7804 + * into it (errata #56). Usually the page is reserved anyways,
7805 + * unless you have no PS/2 mouse plugged in.
7806 */
7807 static void __init reserve_ebda_region(void)
7808 {
7809 - unsigned int addr;
7810 - addr = get_bios_ebda();
7811 - if (addr)
7812 - reserve_bootmem(addr, PAGE_SIZE, BOOTMEM_DEFAULT);
7813 + unsigned int lowmem, ebda_addr;
7814 +
7815 + /* To determine the position of the EBDA and the */
7816 + /* end of conventional memory, we need to look at */
7817 + /* the BIOS data area. In a paravirtual environment */
7818 + /* that area is absent. We'll just have to assume */
7819 + /* that the paravirt case can handle memory setup */
7820 + /* correctly, without our help. */
7821 + if (paravirt_enabled())
7822 + return;
7823 +
7824 + /* end of low (conventional) memory */
7825 + lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
7826 + lowmem <<= 10;
7827 +
7828 + /* start of EBDA area */
7829 + ebda_addr = get_bios_ebda();
7830 +
7831 + /* Fixup: bios puts an EBDA in the top 64K segment */
7832 + /* of conventional memory, but does not adjust lowmem. */
7833 + if ((lowmem - ebda_addr) <= 0x10000)
7834 + lowmem = ebda_addr;
7835 +
7836 + /* Fixup: bios does not report an EBDA at all. */
7837 + /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
7838 + if ((ebda_addr == 0) && (lowmem >= 0x9f000))
7839 + lowmem = 0x9f000;
7840 +
7841 + /* Paranoia: should never happen, but... */
7842 + if ((lowmem == 0) || (lowmem >= 0x100000))
7843 + lowmem = 0x9f000;
7844 +
7845 + /* reserve all memory between lowmem and the 1MB mark */
7846 + reserve_bootmem(lowmem, 0x100000 - lowmem, BOOTMEM_DEFAULT);
7847 }
7848 #endif
7849
7850 #ifndef CONFIG_NEED_MULTIPLE_NODES
7851 -void __init setup_bootmem_allocator(void);
7852 +static void __init setup_bootmem_allocator(void);
7853 static unsigned long __init setup_memory(void)
7854 {
7855 /*
7856 @@ -469,7 +518,7 @@ static unsigned long __init setup_memory
7857 return max_low_pfn;
7858 }
7859
7860 -void __init zone_sizes_init(void)
7861 +static void __init zone_sizes_init(void)
7862 {
7863 unsigned long max_zone_pfns[MAX_NR_ZONES];
7864 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
7865 @@ -521,10 +570,16 @@ static void __init reserve_crashkernel(v
7866 (unsigned long)(crash_size >> 20),
7867 (unsigned long)(crash_base >> 20),
7868 (unsigned long)(total_mem >> 20));
7869 +
7870 + if (reserve_bootmem(crash_base, crash_size,
7871 + BOOTMEM_EXCLUSIVE) < 0) {
7872 + printk(KERN_INFO "crashkernel reservation "
7873 + "failed - memory is in use\n");
7874 + return;
7875 + }
7876 +
7877 crashk_res.start = crash_base;
7878 crashk_res.end = crash_base + crash_size - 1;
7879 - reserve_bootmem(crash_base, crash_size,
7880 - BOOTMEM_DEFAULT);
7881 } else
7882 printk(KERN_INFO "crashkernel reservation failed - "
7883 "you have to specify a base address\n");
7884 @@ -658,16 +713,9 @@ void __init setup_bootmem_allocator(void
7885 */
7886 reserve_bootmem(0, PAGE_SIZE, BOOTMEM_DEFAULT);
7887
7888 - /* reserve EBDA region, it's a 4K region */
7889 + /* reserve EBDA region */
7890 reserve_ebda_region();
7891
7892 - /* could be an AMD 768MPX chipset. Reserve a page before VGA to prevent
7893 - PCI prefetch into it (errata #56). Usually the page is reserved anyways,
7894 - unless you have no PS/2 mouse plugged in. */
7895 - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
7896 - boot_cpu_data.x86 == 6)
7897 - reserve_bootmem(0xa0000 - 4096, 4096, BOOTMEM_DEFAULT);
7898 -
7899 #ifdef CONFIG_SMP
7900 /*
7901 * But first pinch a few for the stack/trampoline stuff
7902 @@ -689,6 +737,8 @@ void __init setup_bootmem_allocator(void
7903 #endif
7904 numa_kva_reserve();
7905 reserve_crashkernel();
7906 +
7907 + reserve_ibft_region();
7908 }
7909
7910 /*
7911 @@ -724,6 +774,18 @@ char * __init __attribute__((weak)) memo
7912 return machine_specific_memory_setup();
7913 }
7914
7915 +#ifdef CONFIG_NUMA
7916 +/*
7917 + * In the golden day, when everything among i386 and x86_64 will be
7918 + * integrated, this will not live here
7919 + */
7920 +void *x86_cpu_to_node_map_early_ptr;
7921 +int x86_cpu_to_node_map_init[NR_CPUS] = {
7922 + [0 ... NR_CPUS-1] = NUMA_NO_NODE
7923 +};
7924 +DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE;
7925 +#endif
7926 +
7927 /*
7928 * Determine if we were loaded by an EFI loader. If so, then we have also been
7929 * passed the efi memmap, systab, etc., so we should use these data structures
7930 @@ -773,7 +835,7 @@ void __init setup_arch(char **cmdline_p)
7931 copy_edid();
7932 apm_info.bios = boot_params.apm_bios_info;
7933 ist_info = boot_params.ist_info;
7934 - saved_videomode = boot_params.hdr.vid_mode;
7935 + saved_video_mode = boot_params.hdr.vid_mode;
7936 if( boot_params.sys_desc_table.length != 0 ) {
7937 set_mca_bus(boot_params.sys_desc_table.table[3] & 0x2);
7938 machine_id = boot_params.sys_desc_table.table[0];
7939 @@ -840,15 +902,19 @@ void __init setup_arch(char **cmdline_p)
7940 efi_init();
7941
7942 /* update e820 for memory not covered by WB MTRRs */
7943 - find_max_pfn();
7944 + propagate_e820_map();
7945 mtrr_bp_init();
7946 #ifndef CONFIG_XEN
7947 if (mtrr_trim_uncached_memory(max_pfn))
7948 - find_max_pfn();
7949 + propagate_e820_map();
7950 #endif
7951
7952 max_low_pfn = setup_memory();
7953
7954 +#ifdef CONFIG_KVM_CLOCK
7955 + kvmclock_init();
7956 +#endif
7957 +
7958 #ifdef CONFIG_VMI
7959 /*
7960 * Must be after max_low_pfn is determined, and before kernel
7961 @@ -856,6 +922,7 @@ void __init setup_arch(char **cmdline_p)
7962 */
7963 vmi_init();
7964 #endif
7965 + kvm_guest_init();
7966
7967 /*
7968 * NOTE: before this point _nobody_ is allowed to allocate
7969 @@ -977,6 +1044,18 @@ void __init setup_arch(char **cmdline_p)
7970
7971 io_delay_init();
7972
7973 +#if defined(CONFIG_X86_SMP) && !defined(CONFIG_XEN)
7974 + /*
7975 + * setup to use the early static init tables during kernel startup
7976 + * X86_SMP will exclude sub-arches that don't deal well with it.
7977 + */
7978 + x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init;
7979 + x86_bios_cpu_apicid_early_ptr = (void *)x86_bios_cpu_apicid_init;
7980 +#ifdef CONFIG_NUMA
7981 + x86_cpu_to_node_map_early_ptr = (void *)x86_cpu_to_node_map_init;
7982 +#endif
7983 +#endif
7984 +
7985 #ifdef CONFIG_X86_GENERICARCH
7986 generic_apic_probe();
7987 #endif
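
[Editorial note on the file above] A worked example of the reserve_ebda_region() arithmetic introduced earlier in this file: on a machine reporting 639 KB of conventional memory, the BIOS word at 0x413 holds 0x27F, so lowmem = 0x27F << 10 = 0x9FC00. If get_bios_ebda() also returns 0x9FC00, the gap lowmem - ebda_addr is 0 (<= 0x10000), lowmem stays 0x9FC00, and reserve_bootmem(0x9FC00, 0x100000 - 0x9FC00, BOOTMEM_DEFAULT) takes the EBDA plus the legacy VGA/ROM hole below 1 MB out of the bootmem allocator in one call, which is why the old 4 KB-only EBDA reservation and the separate AMD768MPX page reservation are dropped later in this file.
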
7988 --- sle11-2009-10-16.orig/arch/x86/kernel/setup_64-xen.c 2009-03-16 16:33:40.000000000 +0100
7989 +++ sle11-2009-10-16/arch/x86/kernel/setup_64-xen.c 2009-03-16 16:38:05.000000000 +0100
7990 @@ -29,18 +29,22 @@
7991 #include <linux/crash_dump.h>
7992 #include <linux/root_dev.h>
7993 #include <linux/pci.h>
7994 +#include <asm/pci-direct.h>
7995 #include <linux/efi.h>
7996 #include <linux/acpi.h>
7997 #include <linux/kallsyms.h>
7998 #include <linux/edd.h>
7999 +#include <linux/iscsi_ibft.h>
8000 #include <linux/mmzone.h>
8001 #include <linux/kexec.h>
8002 #include <linux/cpufreq.h>
8003 #include <linux/dmi.h>
8004 #include <linux/dma-mapping.h>
8005 #include <linux/ctype.h>
8006 +#include <linux/sort.h>
8007 #include <linux/uaccess.h>
8008 #include <linux/init_ohci1394_dma.h>
8009 +#include <linux/kvm_para.h>
8010
8011 #include <asm/mtrr.h>
8012 #include <asm/uaccess.h>
8013 @@ -58,7 +62,6 @@
8014 #include <asm/mmu_context.h>
8015 #include <asm/proto.h>
8016 #include <asm/setup.h>
8017 -#include <asm/mach_apic.h>
8018 #include <asm/numa.h>
8019 #include <asm/sections.h>
8020 #include <asm/dmi.h>
8021 @@ -66,6 +69,9 @@
8022 #include <asm/mce.h>
8023 #include <asm/ds.h>
8024 #include <asm/topology.h>
8025 +#include <asm/pat.h>
8026 +
8027 +#include <mach_apic.h>
8028 #ifdef CONFIG_XEN
8029 #include <linux/percpu.h>
8030 #include <xen/interface/physdev.h>
8031 @@ -149,7 +155,7 @@ extern int root_mountflags;
8032
8033 char __initdata command_line[COMMAND_LINE_SIZE];
8034
8035 -struct resource standard_io_resources[] = {
8036 +static struct resource standard_io_resources[] = {
8037 { .name = "dma1", .start = 0x00, .end = 0x1f,
8038 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
8039 { .name = "pic1", .start = 0x20, .end = 0x21,
8040 @@ -158,7 +164,9 @@ struct resource standard_io_resources[]
8041 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
8042 { .name = "timer1", .start = 0x50, .end = 0x53,
8043 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
8044 - { .name = "keyboard", .start = 0x60, .end = 0x6f,
8045 + { .name = "keyboard", .start = 0x60, .end = 0x60,
8046 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
8047 + { .name = "keyboard", .start = 0x64, .end = 0x64,
8048 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
8049 { .name = "dma page reg", .start = 0x80, .end = 0x8f,
8050 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
8051 @@ -224,8 +232,10 @@ contig_initmem_init(unsigned long start_
8052 e820_register_active_regions(0, start_pfn, end_pfn);
8053 #ifdef CONFIG_XEN
8054 free_bootmem_with_active_regions(0, xen_start_info->nr_pages);
8055 + early_res_to_bootmem(0, xen_start_info->nr_pages<<PAGE_SHIFT);
8056 #else
8057 free_bootmem_with_active_regions(0, end_pfn);
8058 + early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
8059 #endif
8060 reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
8061 }
8062 @@ -290,6 +300,7 @@ static void __init reserve_crashkernel(v
8063 (unsigned long)(total_mem >> 20));
8064 crashk_res.start = crash_base;
8065 crashk_res.end = crash_base + crash_size - 1;
8066 + insert_resource(&iomem_resource, &crashk_res);
8067 }
8068 }
8069 #else
8070 @@ -306,6 +317,40 @@ void __attribute__((weak)) __init memory
8071 machine_specific_memory_setup();
8072 }
8073
8074 +static void __init parse_setup_data(void)
8075 +{
8076 + struct setup_data *data;
8077 + unsigned long pa_data;
8078 +
8079 + if (boot_params.hdr.version < 0x0209)
8080 + return;
8081 + pa_data = boot_params.hdr.setup_data;
8082 + while (pa_data) {
8083 + data = early_ioremap(pa_data, PAGE_SIZE);
8084 + switch (data->type) {
8085 + default:
8086 + break;
8087 + }
8088 +#ifndef CONFIG_DEBUG_BOOT_PARAMS
8089 + free_early(pa_data, pa_data+sizeof(*data)+data->len);
8090 +#endif
8091 + pa_data = data->next;
8092 + early_iounmap(data, PAGE_SIZE);
8093 + }
8094 +}
8095 +
8096 +#ifdef CONFIG_PCI_MMCONFIG
8097 +extern void __cpuinit fam10h_check_enable_mmcfg(void);
8098 +extern void __init check_enable_amd_mmconf_dmi(void);
8099 +#else
8100 +void __cpuinit fam10h_check_enable_mmcfg(void)
8101 +{
8102 +}
8103 +void __init check_enable_amd_mmconf_dmi(void)
8104 +{
8105 +}
8106 +#endif
8107 +
8108 /*
8109 * setup_arch - architecture-specific boot-time initializations
8110 *
8111 @@ -389,6 +434,8 @@ void __init setup_arch(char **cmdline_p)
8112 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
8113 *cmdline_p = command_line;
8114
8115 + parse_setup_data();
8116 +
8117 parse_early_param();
8118
8119 #ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
8120 @@ -398,6 +445,13 @@ void __init setup_arch(char **cmdline_p)
8121
8122 finish_e820_parsing();
8123
8124 +#ifndef CONFIG_XEN
8125 + /* after parse_early_param, so could debug it */
8126 + insert_resource(&iomem_resource, &code_resource);
8127 + insert_resource(&iomem_resource, &data_resource);
8128 + insert_resource(&iomem_resource, &bss_resource);
8129 +#endif
8130 +
8131 early_gart_iommu_check();
8132
8133 e820_register_active_regions(0, 0, -1UL);
8134 @@ -420,15 +474,23 @@ void __init setup_arch(char **cmdline_p)
8135
8136 check_efer();
8137
8138 - init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT));
8139 + max_pfn_mapped = init_memory_mapping(0, (max_pfn_mapped << PAGE_SHIFT));
8140 if (efi_enabled)
8141 efi_init();
8142
8143 +#ifndef CONFIG_XEN
8144 + vsmp_init();
8145 +#endif
8146 +
8147 if (is_initial_xendomain())
8148 dmi_scan_machine();
8149
8150 io_delay_init();
8151
8152 +#ifdef CONFIG_KVM_CLOCK
8153 + kvmclock_init();
8154 +#endif
8155 +
8156 #if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
8157 /* setup to use the early static init tables during kernel startup */
8158 x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init;
8159 @@ -459,9 +521,9 @@ void __init setup_arch(char **cmdline_p)
8160 contig_initmem_init(0, end_pfn);
8161 #endif
8162
8163 - early_res_to_bootmem();
8164 -
8165 #ifndef CONFIG_XEN
8166 + dma32_reserve_bootmem();
8167 +
8168 #ifdef CONFIG_ACPI_SLEEP
8169 /*
8170 * Reserve low memory region for sleep support.
8171 @@ -487,16 +549,17 @@ void __init setup_arch(char **cmdline_p)
8172 unsigned long end_of_mem = end_pfn << PAGE_SHIFT;
8173
8174 if (ramdisk_end <= end_of_mem) {
8175 -#ifndef CONFIG_XEN
8176 - reserve_bootmem_generic(ramdisk_image, ramdisk_size);
8177 -#endif
8178 + /*
8179 + * don't need to reserve again, already reserved early
8180 + * in x86_64_start_kernel, and early_res_to_bootmem
8181 + * convert that to reserved in bootmem
8182 + */
8183 initrd_start = ramdisk_image + PAGE_OFFSET;
8184 initrd_end = initrd_start+ramdisk_size;
8185 #ifdef CONFIG_XEN
8186 initrd_below_start_ok = 1;
8187 #endif
8188 } else {
8189 - /* Assumes everything on node 0 */
8190 free_bootmem(ramdisk_image, ramdisk_size);
8191 printk(KERN_ERR "initrd extends beyond end of memory "
8192 "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
8193 @@ -506,6 +569,9 @@ void __init setup_arch(char **cmdline_p)
8194 }
8195 #endif
8196 reserve_crashkernel();
8197 +
8198 + reserve_ibft_region();
8199 +
8200 paging_init();
8201 map_vsyscall();
8202 #ifdef CONFIG_X86_LOCAL_APIC
8203 @@ -633,16 +699,16 @@ void __init setup_arch(char **cmdline_p)
8204 prefill_possible_map();
8205 #endif
8206
8207 + kvm_guest_init();
8208 +
8209 /*
8210 * We trust e820 completely. No explicit ROM probing in memory.
8211 */
8212 #ifdef CONFIG_XEN
8213 if (is_initial_xendomain())
8214 - e820_reserve_resources(machine_e820.map, machine_e820.nr_map,
8215 - &code_resource, &data_resource, &bss_resource);
8216 + e820_reserve_resources(machine_e820.map, machine_e820.nr_map);
8217 #else
8218 - e820_reserve_resources(e820.map, e820.nr_map,
8219 - &code_resource, &data_resource, &bss_resource);
8220 + e820_reserve_resources(e820.map, e820.nr_map);
8221 e820_mark_nosave_regions();
8222 #endif
8223
8224 @@ -690,6 +756,9 @@ void __init setup_arch(char **cmdline_p)
8225 #endif
8226
8227 #endif /* !CONFIG_XEN */
8228 +
8229 + /* do this before identify_cpu for boot cpu */
8230 + check_enable_amd_mmconf_dmi();
8231 }
8232
8233 #ifdef CONFIG_XEN
8234 @@ -786,9 +855,9 @@ static void __cpuinit amd_detect_cmp(str
8235 bits = c->x86_coreid_bits;
8236
8237 /* Low order bits define the core id (index of core in socket) */
8238 - c->cpu_core_id = c->phys_proc_id & ((1 << bits)-1);
8239 - /* Convert the APIC ID into the socket ID */
8240 - c->phys_proc_id = phys_pkg_id(bits);
8241 + c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
8242 + /* Convert the initial APIC ID into the socket ID */
8243 + c->phys_proc_id = c->initial_apicid >> bits;
8244
8245 #ifdef CONFIG_NUMA
8246 node = c->phys_proc_id;
8247 @@ -805,7 +874,7 @@ static void __cpuinit amd_detect_cmp(str
8248 If that doesn't result in a usable node fall back to the
8249 path for the previous case. */
8250
8251 - int ht_nodeid = apicid - (cpu_data(0).phys_proc_id << bits);
8252 + int ht_nodeid = c->initial_apicid;
8253
8254 if (ht_nodeid >= 0 &&
8255 apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
8256 @@ -913,7 +982,7 @@ static void __cpuinit init_amd(struct cp
8257
8258 /* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
8259 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
8260 - clear_bit(0*32+31, (unsigned long *)&c->x86_capability);
8261 + clear_cpu_cap(c, 0*32+31);
8262
8263 /* On C+ stepping K8 rep microcode works well for copy/memset */
8264 level = cpuid_eax(1);
8265 @@ -955,9 +1024,25 @@ static void __cpuinit init_amd(struct cp
8266 /* MFENCE stops RDTSC speculation */
8267 set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
8268
8269 + if (c->x86 == 0x10)
8270 + fam10h_check_enable_mmcfg();
8271 +
8272 #ifndef CONFIG_XEN
8273 if (amd_apic_timer_broken())
8274 disable_apic_timer = 1;
8275 +
8276 + if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
8277 + unsigned long long tseg;
8278 +
8279 + /*
8280 + * Split up direct mapping around the TSEG SMM area.
8281 + * Don't do it for gbpages because there seems very little
8282 + * benefit in doing so.
8283 + */
8284 + if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg) &&
8285 + (tseg >> PMD_SHIFT) < (max_pfn_mapped >> (PMD_SHIFT-PAGE_SHIFT)))
8286 + set_memory_4k((unsigned long)__va(tseg), 1);
8287 + }
8288 #endif
8289 }
8290
8291 @@ -1051,7 +1136,7 @@ static void __cpuinit early_init_intel(s
8292 {
8293 if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
8294 (c->x86 == 0x6 && c->x86_model >= 0x0e))
8295 - set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
8296 + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
8297 }
8298
8299 static void __cpuinit init_intel(struct cpuinfo_x86 *c)
8300 @@ -1094,9 +1179,6 @@ static void __cpuinit init_intel(struct
8301
8302 if (c->x86 == 15)
8303 c->x86_cache_alignment = c->x86_clflush_size * 2;
8304 - if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
8305 - (c->x86 == 0x6 && c->x86_model >= 0x0e))
8306 - set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
8307 if (c->x86 == 6)
8308 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
8309 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
8310 @@ -1105,6 +1187,32 @@ static void __cpuinit init_intel(struct
8311 srat_detect_node();
8312 }
8313
8314 +static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
8315 +{
8316 + if (c->x86 == 0x6 && c->x86_model >= 0xf)
8317 + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
8318 +}
8319 +
8320 +static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
8321 +{
8322 + /* Cache sizes */
8323 + unsigned n;
8324 +
8325 + n = c->extended_cpuid_level;
8326 + if (n >= 0x80000008) {
8327 + unsigned eax = cpuid_eax(0x80000008);
8328 + c->x86_virt_bits = (eax >> 8) & 0xff;
8329 + c->x86_phys_bits = eax & 0xff;
8330 + }
8331 +
8332 + if (c->x86 == 0x6 && c->x86_model >= 0xf) {
8333 + c->x86_cache_alignment = c->x86_clflush_size * 2;
8334 + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
8335 + set_cpu_cap(c, X86_FEATURE_REP_GOOD);
8336 + }
8337 + set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
8338 +}
8339 +
8340 static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
8341 {
8342 char *v = c->x86_vendor_id;
8343 @@ -1113,6 +1221,8 @@ static void __cpuinit get_cpu_vendor(str
8344 c->x86_vendor = X86_VENDOR_AMD;
8345 else if (!strcmp(v, "GenuineIntel"))
8346 c->x86_vendor = X86_VENDOR_INTEL;
8347 + else if (!strcmp(v, "CentaurHauls"))
8348 + c->x86_vendor = X86_VENDOR_CENTAUR;
8349 else
8350 c->x86_vendor = X86_VENDOR_UNKNOWN;
8351 }
8352 @@ -1160,15 +1270,16 @@ static void __cpuinit early_identify_cpu
8353 c->x86 += (tfms >> 20) & 0xff;
8354 if (c->x86 >= 0x6)
8355 c->x86_model += ((tfms >> 16) & 0xF) << 4;
8356 - if (c->x86_capability[0] & (1<<19))
8357 + if (test_cpu_cap(c, X86_FEATURE_CLFLSH))
8358 c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
8359 } else {
8360 /* Have CPUID level 0 only - unheard of */
8361 c->x86 = 4;
8362 }
8363
8364 + c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xff;
8365 #ifdef CONFIG_SMP
8366 - c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff;
8367 + c->phys_proc_id = c->initial_apicid;
8368 #endif
8369 /* AMD-defined flags: level 0x80000001 */
8370 xlvl = cpuid_eax(0x80000000);
8371 @@ -1201,8 +1312,12 @@ static void __cpuinit early_identify_cpu
8372 case X86_VENDOR_INTEL:
8373 early_init_intel(c);
8374 break;
8375 + case X86_VENDOR_CENTAUR:
8376 + early_init_centaur(c);
8377 + break;
8378 }
8379
8380 + validate_pat_support(c);
8381 }
8382
8383 /*
8384 @@ -1237,6 +1352,10 @@ void __cpuinit identify_cpu(struct cpuin
8385 init_intel(c);
8386 break;
8387
8388 + case X86_VENDOR_CENTAUR:
8389 + init_centaur(c);
8390 + break;
8391 +
8392 case X86_VENDOR_UNKNOWN:
8393 default:
8394 display_cacheinfo(c);
8395 @@ -1266,14 +1385,24 @@ void __cpuinit identify_cpu(struct cpuin
8396 #endif
8397 select_idle_routine(c);
8398
8399 - if (c != &boot_cpu_data)
8400 - mtrr_ap_init();
8401 #ifdef CONFIG_NUMA
8402 numa_add_cpu(smp_processor_id());
8403 #endif
8404
8405 }
8406
8407 +void __cpuinit identify_boot_cpu(void)
8408 +{
8409 + identify_cpu(&boot_cpu_data);
8410 +}
8411 +
8412 +void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
8413 +{
8414 + BUG_ON(c == &boot_cpu_data);
8415 + identify_cpu(c);
8416 + mtrr_ap_init();
8417 +}
8418 +
8419 static __init int setup_noclflush(char *arg)
8420 {
8421 setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
8422 @@ -1302,123 +1431,3 @@ static __init int setup_disablecpuid(cha
8423 return 1;
8424 }
8425 __setup("clearcpuid=", setup_disablecpuid);
8426 -
8427 -/*
8428 - * Get CPU information for use by the procfs.
8429 - */
8430 -
8431 -static int show_cpuinfo(struct seq_file *m, void *v)
8432 -{
8433 - struct cpuinfo_x86 *c = v;
8434 - int cpu = 0, i;
8435 -
8436 -#ifdef CONFIG_SMP
8437 - cpu = c->cpu_index;
8438 -#endif
8439 -
8440 - seq_printf(m, "processor\t: %u\n"
8441 - "vendor_id\t: %s\n"
8442 - "cpu family\t: %d\n"
8443 - "model\t\t: %d\n"
8444 - "model name\t: %s\n",
8445 - (unsigned)cpu,
8446 - c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown",
8447 - c->x86,
8448 - (int)c->x86_model,
8449 - c->x86_model_id[0] ? c->x86_model_id : "unknown");
8450 -
8451 - if (c->x86_mask || c->cpuid_level >= 0)
8452 - seq_printf(m, "stepping\t: %d\n", c->x86_mask);
8453 - else
8454 - seq_printf(m, "stepping\t: unknown\n");
8455 -
8456 - if (cpu_has(c, X86_FEATURE_TSC)) {
8457 - unsigned int freq = cpufreq_quick_get((unsigned)cpu);
8458 -
8459 - if (!freq)
8460 - freq = cpu_khz;
8461 - seq_printf(m, "cpu MHz\t\t: %u.%03u\n",
8462 - freq / 1000, (freq % 1000));
8463 - }
8464 -
8465 - /* Cache size */
8466 - if (c->x86_cache_size >= 0)
8467 - seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size);
8468 -
8469 -#ifdef CONFIG_SMP
8470 - if (smp_num_siblings * c->x86_max_cores > 1) {
8471 - seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
8472 - seq_printf(m, "siblings\t: %d\n",
8473 - cpus_weight(per_cpu(cpu_core_map, cpu)));
8474 - seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
8475 - seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
8476 - }
8477 -#endif
8478 -
8479 - seq_printf(m,
8480 - "fpu\t\t: yes\n"
8481 - "fpu_exception\t: yes\n"
8482 - "cpuid level\t: %d\n"
8483 - "wp\t\t: yes\n"
8484 - "flags\t\t:",
8485 - c->cpuid_level);
8486 -
8487 - for (i = 0; i < 32*NCAPINTS; i++)
8488 - if (cpu_has(c, i) && x86_cap_flags[i] != NULL)
8489 - seq_printf(m, " %s", x86_cap_flags[i]);
8490 -
8491 - seq_printf(m, "\nbogomips\t: %lu.%02lu\n",
8492 - c->loops_per_jiffy/(500000/HZ),
8493 - (c->loops_per_jiffy/(5000/HZ)) % 100);
8494 -
8495 - if (c->x86_tlbsize > 0)
8496 - seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize);
8497 - seq_printf(m, "clflush size\t: %d\n", c->x86_clflush_size);
8498 - seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment);
8499 -
8500 - seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n",
8501 - c->x86_phys_bits, c->x86_virt_bits);
8502 -
8503 - seq_printf(m, "power management:");
8504 - for (i = 0; i < 32; i++) {
8505 - if (c->x86_power & (1 << i)) {
8506 - if (i < ARRAY_SIZE(x86_power_flags) &&
8507 - x86_power_flags[i])
8508 - seq_printf(m, "%s%s",
8509 - x86_power_flags[i][0]?" ":"",
8510 - x86_power_flags[i]);
8511 - else
8512 - seq_printf(m, " [%d]", i);
8513 - }
8514 - }
8515 -
8516 - seq_printf(m, "\n\n");
8517 -
8518 - return 0;
8519 -}
8520 -
8521 -static void *c_start(struct seq_file *m, loff_t *pos)
8522 -{
8523 - if (*pos == 0) /* just in case, cpu 0 is not the first */
8524 - *pos = first_cpu(cpu_online_map);
8525 - if ((*pos) < NR_CPUS && cpu_online(*pos))
8526 - return &cpu_data(*pos);
8527 - return NULL;
8528 -}
8529 -
8530 -static void *c_next(struct seq_file *m, void *v, loff_t *pos)
8531 -{
8532 - *pos = next_cpu(*pos, cpu_online_map);
8533 - return c_start(m, pos);
8534 -}
8535 -
8536 -static void c_stop(struct seq_file *m, void *v)
8537 -{
8538 -}
8539 -
8540 -const struct seq_operations cpuinfo_op = {
8541 - .start = c_start,
8542 - .next = c_next,
8543 - .stop = c_stop,
8544 - .show = show_cpuinfo,
8545 -};
8546 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
8547 +++ sle11-2009-10-16/arch/x86/kernel/smp-xen.c 2009-03-16 16:38:05.000000000 +0100
8548 @@ -0,0 +1,329 @@
8549 +/*
8550 + * Intel SMP support routines.
8551 + *
8552 + * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
8553 + * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
8554 + * (c) 2002,2003 Andi Kleen, SuSE Labs.
8555 + *
8556 + * i386 and x86_64 integration by Glauber Costa <gcosta@redhat.com>
8557 + *
8558 + * This code is released under the GNU General Public License version 2 or
8559 + * later.
8560 + */
8561 +
8562 +#include <linux/init.h>
8563 +
8564 +#include <linux/mm.h>
8565 +#include <linux/delay.h>
8566 +#include <linux/spinlock.h>
8567 +#include <linux/kernel_stat.h>
8568 +#include <linux/mc146818rtc.h>
8569 +#include <linux/cache.h>
8570 +#include <linux/interrupt.h>
8571 +#include <linux/cpu.h>
8572 +
8573 +#include <asm/mtrr.h>
8574 +#include <asm/tlbflush.h>
8575 +#include <asm/mmu_context.h>
8576 +#include <asm/proto.h>
8577 +#include <mach_ipi.h>
8578 +#include <xen/evtchn.h>
8579 +/*
8580 + * Some notes on x86 processor bugs affecting SMP operation:
8581 + *
8582 + * Pentium, Pentium Pro, II, III (and all CPUs) have bugs.
8583 + * The Linux implications for SMP are handled as follows:
8584 + *
8585 + * Pentium III / [Xeon]
8586 + * None of the E1AP-E3AP errata are visible to the user.
8587 + *
8588 + * E1AP. see PII A1AP
8589 + * E2AP. see PII A2AP
8590 + * E3AP. see PII A3AP
8591 + *
8592 + * Pentium II / [Xeon]
8593 + * None of the A1AP-A3AP errata are visible to the user.
8594 + *
8595 + * A1AP. see PPro 1AP
8596 + * A2AP. see PPro 2AP
8597 + * A3AP. see PPro 7AP
8598 + *
8599 + * Pentium Pro
8600 + * None of 1AP-9AP errata are visible to the normal user,
8601 + * except occasional delivery of 'spurious interrupt' as trap #15.
8602 + * This is very rare and a non-problem.
8603 + *
8604 + * 1AP. Linux maps APIC as non-cacheable
8605 + * 2AP. worked around in hardware
8606 + * 3AP. fixed in C0 and above steppings microcode update.
8607 + * Linux does not use excessive STARTUP_IPIs.
8608 + * 4AP. worked around in hardware
8609 + * 5AP. symmetric IO mode (normal Linux operation) not affected.
8610 + * 'noapic' mode has vector 0xf filled out properly.
8611 + * 6AP. 'noapic' mode might be affected - fixed in later steppings
8612 + *	7AP.	We do not assume writes to the LVT deasserting IRQs
8613 + * 8AP. We do not enable low power mode (deep sleep) during MP bootup
8614 + * 9AP. We do not use mixed mode
8615 + *
8616 + * Pentium
8617 + * There is a marginal case where REP MOVS on 100MHz SMP
8618 + * machines with B stepping processors can fail. XXX should provide
8619 + * an L1cache=Writethrough or L1cache=off option.
8620 + *
8621 + * B stepping CPUs may hang. There are hardware work arounds
8622 + * for this. We warn about it in case your board doesn't have the work
8623 + * arounds. Basically that's so I can tell anyone with a B stepping
8624 + * CPU and SMP problems "tough".
8625 + *
8626 + * Specific items [From Pentium Processor Specification Update]
8627 + *
8628 + * 1AP. Linux doesn't use remote read
8629 + * 2AP. Linux doesn't trust APIC errors
8630 + * 3AP. We work around this
8631 + * 4AP. Linux never generated 3 interrupts of the same priority
8632 + * to cause a lost local interrupt.
8633 + * 5AP. Remote read is never used
8634 + * 6AP. not affected - worked around in hardware
8635 + * 7AP. not affected - worked around in hardware
8636 + * 8AP. worked around in hardware - we get explicit CS errors if not
8637 + * 9AP. only 'noapic' mode affected. Might generate spurious
8638 + * interrupts, we log only the first one and count the
8639 + * rest silently.
8640 + * 10AP. not affected - worked around in hardware
8641 + * 11AP. Linux reads the APIC between writes to avoid this, as per
8642 + * the documentation. Make sure you preserve this as it affects
8643 + * the C stepping chips too.
8644 + * 12AP. not affected - worked around in hardware
8645 + * 13AP. not affected - worked around in hardware
8646 + * 14AP. we always deassert INIT during bootup
8647 + * 15AP. not affected - worked around in hardware
8648 + * 16AP. not affected - worked around in hardware
8649 + * 17AP. not affected - worked around in hardware
8650 + * 18AP. not affected - worked around in hardware
8651 + * 19AP. not affected - worked around in BIOS
8652 + *
8653 + * If this sounds worrying believe me these bugs are either ___RARE___,
8654 + * or are signal timing bugs worked around in hardware and there's
8655 + * about nothing of note with C stepping upwards.
8656 + */
8657 +
8658 +/*
8659 + * this function sends a 'reschedule' IPI to another CPU.
8660 + * it goes straight through and wastes no time serializing
8661 + * anything. Worst case is that we lose a reschedule ...
8662 + */
8663 +void xen_smp_send_reschedule(int cpu)
8664 +{
8665 + if (unlikely(cpu_is_offline(cpu))) {
8666 + WARN_ON(1);
8667 + return;
8668 + }
8669 + send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
8670 +}
8671 +
8672 +/*
8673 + * Structure and data for smp_call_function(). This is designed to minimise
8674 + * static memory requirements. It also looks cleaner.
8675 + */
8676 +static DEFINE_SPINLOCK(call_lock);
8677 +
8678 +struct call_data_struct {
8679 + void (*func) (void *info);
8680 + void *info;
8681 + atomic_t started;
8682 + atomic_t finished;
8683 + int wait;
8684 +};
8685 +
8686 +void lock_ipi_call_lock(void)
8687 +{
8688 + spin_lock_irq(&call_lock);
8689 +}
8690 +
8691 +void unlock_ipi_call_lock(void)
8692 +{
8693 + spin_unlock_irq(&call_lock);
8694 +}
8695 +
8696 +static struct call_data_struct *call_data;
8697 +
8698 +static void __smp_call_function(void (*func) (void *info), void *info,
8699 + int nonatomic, int wait)
8700 +{
8701 + struct call_data_struct data;
8702 + int cpus = num_online_cpus() - 1;
8703 +
8704 + if (!cpus)
8705 + return;
8706 +
8707 + data.func = func;
8708 + data.info = info;
8709 + atomic_set(&data.started, 0);
8710 + data.wait = wait;
8711 + if (wait)
8712 + atomic_set(&data.finished, 0);
8713 +
8714 + call_data = &data;
8715 + mb();
8716 +
8717 + /* Send a message to all other CPUs and wait for them to respond */
8718 + send_IPI_allbutself(CALL_FUNCTION_VECTOR);
8719 +
8720 + /* Wait for response */
8721 + while (atomic_read(&data.started) != cpus)
8722 + cpu_relax();
8723 +
8724 + if (wait)
8725 + while (atomic_read(&data.finished) != cpus)
8726 + cpu_relax();
8727 +}
8728 +
8729 +
8730 +/**
8731 + * smp_call_function_mask(): Run a function on a set of other CPUs.
8732 + * @mask: The set of cpus to run on. Must not include the current cpu.
8733 + * @func: The function to run. This must be fast and non-blocking.
8734 + * @info: An arbitrary pointer to pass to the function.
8735 + * @wait: If true, wait (atomically) until function has completed on other CPUs.
8736 + *
8737 + * Returns 0 on success, else a negative status code.
8738 + *
8739 + * If @wait is true, then returns once @func has returned; otherwise
8740 + * it returns just before the target cpu calls @func.
8741 + *
8742 + * You must not call this function with disabled interrupts or from a
8743 + * hardware interrupt handler or from a bottom half handler.
8744 + */
8745 +int
8746 +xen_smp_call_function_mask(cpumask_t mask,
8747 + void (*func)(void *), void *info,
8748 + int wait)
8749 +{
8750 + struct call_data_struct data;
8751 + cpumask_t allbutself;
8752 + int cpus;
8753 +
8754 + /* Can deadlock when called with interrupts disabled */
8755 + WARN_ON(irqs_disabled());
8756 +
8757 + /* Holding any lock stops cpus from going down. */
8758 + spin_lock(&call_lock);
8759 +
8760 + allbutself = cpu_online_map;
8761 + cpu_clear(smp_processor_id(), allbutself);
8762 +
8763 + cpus_and(mask, mask, allbutself);
8764 + cpus = cpus_weight(mask);
8765 +
8766 + if (!cpus) {
8767 + spin_unlock(&call_lock);
8768 + return 0;
8769 + }
8770 +
8771 + data.func = func;
8772 + data.info = info;
8773 + atomic_set(&data.started, 0);
8774 + data.wait = wait;
8775 + if (wait)
8776 + atomic_set(&data.finished, 0);
8777 +
8778 + call_data = &data;
8779 + wmb();
8780 +
8781 + /* Send a message to other CPUs */
8782 + if (cpus_equal(mask, allbutself) &&
8783 + cpus_equal(cpu_online_map, cpu_callout_map))
8784 + send_IPI_allbutself(CALL_FUNCTION_VECTOR);
8785 + else
8786 + send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
8787 +
8788 + /* Wait for response */
8789 + while (atomic_read(&data.started) != cpus)
8790 + cpu_relax();
8791 +
8792 + if (wait)
8793 + while (atomic_read(&data.finished) != cpus)
8794 + cpu_relax();
8795 + spin_unlock(&call_lock);
8796 +
8797 + return 0;
8798 +}
8799 +
8800 +static void stop_this_cpu(void *dummy)
8801 +{
8802 + local_irq_disable();
8803 + /*
8804 + * Remove this CPU:
8805 + */
8806 + cpu_clear(smp_processor_id(), cpu_online_map);
8807 + disable_all_local_evtchn();
8808 + if (hlt_works(smp_processor_id()))
8809 + for (;;) halt();
8810 + for (;;);
8811 +}
8812 +
8813 +/*
8814 + * this function calls the 'stop' function on all other CPUs in the system.
8815 + */
8816 +
8817 +void xen_smp_send_stop(void)
8818 +{
8819 + int nolock;
8820 + unsigned long flags;
8821 +
8822 + /* Don't deadlock on the call lock in panic */
8823 + nolock = !spin_trylock(&call_lock);
8824 + local_irq_save(flags);
8825 + __smp_call_function(stop_this_cpu, NULL, 0, 0);
8826 + if (!nolock)
8827 + spin_unlock(&call_lock);
8828 + disable_all_local_evtchn();
8829 + local_irq_restore(flags);
8830 +}
8831 +
8832 +/*
8833 + * Reschedule call back. Nothing to do,
8834 + * all the work is done automatically when
8835 + * we return from the interrupt.
8836 + */
8837 +irqreturn_t smp_reschedule_interrupt(int irq, void *dev_id)
8838 +{
8839 +#ifdef CONFIG_X86_32
8840 + __get_cpu_var(irq_stat).irq_resched_count++;
8841 +#else
8842 + add_pda(irq_resched_count, 1);
8843 +#endif
8844 + return IRQ_HANDLED;
8845 +}
8846 +
8847 +irqreturn_t smp_call_function_interrupt(int irq, void *dev_id)
8848 +{
8849 + void (*func) (void *info) = call_data->func;
8850 + void *info = call_data->info;
8851 + int wait = call_data->wait;
8852 +
8853 + /*
8854 + * Notify initiating CPU that I've grabbed the data and am
8855 + * about to execute the function
8856 + */
8857 + mb();
8858 + atomic_inc(&call_data->started);
8859 + /*
8860 + * At this point the info structure may be out of scope unless wait==1
8861 + */
8862 + irq_enter();
8863 + (*func)(info);
8864 +#ifdef CONFIG_X86_32
8865 + __get_cpu_var(irq_stat).irq_call_count++;
8866 +#else
8867 + add_pda(irq_call_count, 1);
8868 +#endif
8869 + irq_exit();
8870 +
8871 + if (wait) {
8872 + mb();
8873 + atomic_inc(&call_data->finished);
8874 + }
8875 +
8876 + return IRQ_HANDLED;
8877 +}
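
[Editorial note on the file above] xen_smp_call_function_mask() in the new smp-xen.c follows the classic call_data handshake: the caller publishes a call_data_struct, raises CALL_FUNCTION_VECTOR towards the target CPUs' event channels, and spins on the started/finished counters that smp_call_function_interrupt() bumps on each target. A hedged caller-side sketch against the smp_call_function_mask() interface of this kernel generation (run_on_others and bump_counter are hypothetical; the callback runs in interrupt context and must not sleep):

#include <linux/cpumask.h>
#include <linux/smp.h>
#include <asm/atomic.h>

static void bump_counter(void *info)
{
	/* Executed on every target CPU from the call-function IPI handler */
	atomic_inc((atomic_t *)info);
}

static int run_on_others(atomic_t *counter)
{
	cpumask_t targets = cpu_online_map;
	int this_cpu = get_cpu();	/* pin: the mask must not include us */
	int ret;

	cpu_clear(this_cpu, targets);

	/* wait=1: return only after every target has finished bump_counter() */
	ret = smp_call_function_mask(targets, bump_counter, counter, 1);

	put_cpu();
	return ret;
}
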
8878 --- sle11-2009-10-16.orig/arch/x86/kernel/smp_32-xen.c 2009-03-16 16:33:40.000000000 +0100
8879 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
8880 @@ -1,647 +0,0 @@
8881 -/*
8882 - * Intel SMP support routines.
8883 - *
8884 - * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
8885 - * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
8886 - *
8887 - * This code is released under the GNU General Public License version 2 or
8888 - * later.
8889 - */
8890 -
8891 -#include <linux/init.h>
8892 -
8893 -#include <linux/mm.h>
8894 -#include <linux/delay.h>
8895 -#include <linux/spinlock.h>
8896 -#include <linux/kernel_stat.h>
8897 -#include <linux/mc146818rtc.h>
8898 -#include <linux/cache.h>
8899 -#include <linux/interrupt.h>
8900 -#include <linux/cpu.h>
8901 -#include <linux/module.h>
8902 -
8903 -#include <asm/mtrr.h>
8904 -#include <asm/tlbflush.h>
8905 -#include <asm/mmu_context.h>
8906 -#if 0
8907 -#include <mach_apic.h>
8908 -#endif
8909 -#include <xen/evtchn.h>
8910 -
8911 -/*
8912 - * Some notes on x86 processor bugs affecting SMP operation:
8913 - *
8914 - * Pentium, Pentium Pro, II, III (and all CPUs) have bugs.
8915 - * The Linux implications for SMP are handled as follows:
8916 - *
8917 - * Pentium III / [Xeon]
8918 - * None of the E1AP-E3AP errata are visible to the user.
8919 - *
8920 - * E1AP. see PII A1AP
8921 - * E2AP. see PII A2AP
8922 - * E3AP. see PII A3AP
8923 - *
8924 - * Pentium II / [Xeon]
8925 - * None of the A1AP-A3AP errata are visible to the user.
8926 - *
8927 - * A1AP. see PPro 1AP
8928 - * A2AP. see PPro 2AP
8929 - * A3AP. see PPro 7AP
8930 - *
8931 - * Pentium Pro
8932 - * None of 1AP-9AP errata are visible to the normal user,
8933 - * except occasional delivery of 'spurious interrupt' as trap #15.
8934 - * This is very rare and a non-problem.
8935 - *
8936 - * 1AP. Linux maps APIC as non-cacheable
8937 - * 2AP. worked around in hardware
8938 - * 3AP. fixed in C0 and above steppings microcode update.
8939 - * Linux does not use excessive STARTUP_IPIs.
8940 - * 4AP. worked around in hardware
8941 - * 5AP. symmetric IO mode (normal Linux operation) not affected.
8942 - * 'noapic' mode has vector 0xf filled out properly.
8943 - * 6AP. 'noapic' mode might be affected - fixed in later steppings
8944 - * 7AP. We do not assume writes to the LVT deassering IRQs
8945 - * 8AP. We do not enable low power mode (deep sleep) during MP bootup
8946 - * 9AP. We do not use mixed mode
8947 - *
8948 - * Pentium
8949 - * There is a marginal case where REP MOVS on 100MHz SMP
8950 - * machines with B stepping processors can fail. XXX should provide
8951 - * an L1cache=Writethrough or L1cache=off option.
8952 - *
8953 - * B stepping CPUs may hang. There are hardware work arounds
8954 - * for this. We warn about it in case your board doesn't have the work
8955 - * arounds. Basically that's so I can tell anyone with a B stepping
8956 - * CPU and SMP problems "tough".
8957 - *
8958 - * Specific items [From Pentium Processor Specification Update]
8959 - *
8960 - * 1AP. Linux doesn't use remote read
8961 - * 2AP. Linux doesn't trust APIC errors
8962 - * 3AP. We work around this
8963 - * 4AP. Linux never generated 3 interrupts of the same priority
8964 - * to cause a lost local interrupt.
8965 - * 5AP. Remote read is never used
8966 - * 6AP. not affected - worked around in hardware
8967 - * 7AP. not affected - worked around in hardware
8968 - * 8AP. worked around in hardware - we get explicit CS errors if not
8969 - * 9AP. only 'noapic' mode affected. Might generate spurious
8970 - * interrupts, we log only the first one and count the
8971 - * rest silently.
8972 - * 10AP. not affected - worked around in hardware
8973 - * 11AP. Linux reads the APIC between writes to avoid this, as per
8974 - * the documentation. Make sure you preserve this as it affects
8975 - * the C stepping chips too.
8976 - * 12AP. not affected - worked around in hardware
8977 - * 13AP. not affected - worked around in hardware
8978 - * 14AP. we always deassert INIT during bootup
8979 - * 15AP. not affected - worked around in hardware
8980 - * 16AP. not affected - worked around in hardware
8981 - * 17AP. not affected - worked around in hardware
8982 - * 18AP. not affected - worked around in hardware
8983 - * 19AP. not affected - worked around in BIOS
8984 - *
8985 - * If this sounds worrying believe me these bugs are either ___RARE___,
8986 - * or are signal timing bugs worked around in hardware and there's
8987 - * about nothing of note with C stepping upwards.
8988 - */
8989 -
8990 -DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate) ____cacheline_aligned = { &init_mm, 0, };
8991 -
8992 -/*
8993 - * the following functions deal with sending IPIs between CPUs.
8994 - *
8995 - * We use 'broadcast', CPU->CPU IPIs and self-IPIs too.
8996 - */
8997 -
8998 -static inline int __prepare_ICR (unsigned int shortcut, int vector)
8999 -{
9000 - unsigned int icr = shortcut | APIC_DEST_LOGICAL;
9001 -
9002 - switch (vector) {
9003 - default:
9004 - icr |= APIC_DM_FIXED | vector;
9005 - break;
9006 - case NMI_VECTOR:
9007 - icr |= APIC_DM_NMI;
9008 - break;
9009 - }
9010 - return icr;
9011 -}
9012 -
9013 -static inline int __prepare_ICR2 (unsigned int mask)
9014 -{
9015 - return SET_APIC_DEST_FIELD(mask);
9016 -}
9017 -
9018 -DECLARE_PER_CPU(int, ipi_to_irq[NR_IPIS]);
9019 -
9020 -static inline void __send_IPI_one(unsigned int cpu, int vector)
9021 -{
9022 - int irq = per_cpu(ipi_to_irq, cpu)[vector];
9023 - BUG_ON(irq < 0);
9024 - notify_remote_via_irq(irq);
9025 -}
9026 -
9027 -void __send_IPI_shortcut(unsigned int shortcut, int vector)
9028 -{
9029 - int cpu;
9030 -
9031 - switch (shortcut) {
9032 - case APIC_DEST_SELF:
9033 - __send_IPI_one(smp_processor_id(), vector);
9034 - break;
9035 - case APIC_DEST_ALLBUT:
9036 - for (cpu = 0; cpu < NR_CPUS; ++cpu) {
9037 - if (cpu == smp_processor_id())
9038 - continue;
9039 - if (cpu_isset(cpu, cpu_online_map)) {
9040 - __send_IPI_one(cpu, vector);
9041 - }
9042 - }
9043 - break;
9044 - default:
9045 - printk("XXXXXX __send_IPI_shortcut %08x vector %d\n", shortcut,
9046 - vector);
9047 - break;
9048 - }
9049 -}
9050 -
9051 -void send_IPI_self(int vector)
9052 -{
9053 - __send_IPI_shortcut(APIC_DEST_SELF, vector);
9054 -}
9055 -
9056 -/*
9057 - * This is only used on smaller machines.
9058 - */
9059 -void send_IPI_mask_bitmask(cpumask_t mask, int vector)
9060 -{
9061 - unsigned long flags;
9062 - unsigned int cpu;
9063 -
9064 - local_irq_save(flags);
9065 - WARN_ON(cpus_addr(mask)[0] & ~cpus_addr(cpu_online_map)[0]);
9066 -
9067 - for (cpu = 0; cpu < NR_CPUS; ++cpu) {
9068 - if (cpu_isset(cpu, mask)) {
9069 - __send_IPI_one(cpu, vector);
9070 - }
9071 - }
9072 -
9073 - local_irq_restore(flags);
9074 -}
9075 -
9076 -void send_IPI_mask_sequence(cpumask_t mask, int vector)
9077 -{
9078 -
9079 - send_IPI_mask_bitmask(mask, vector);
9080 -}
9081 -
9082 -#include <mach_ipi.h> /* must come after the send_IPI functions above for inlining */
9083 -
9084 -#if 0 /* XEN */
9085 -/*
9086 - * Smarter SMP flushing macros.
9087 - * c/o Linus Torvalds.
9088 - *
9089 - * These mean you can really definitely utterly forget about
9090 - * writing to user space from interrupts. (Its not allowed anyway).
9091 - *
9092 - * Optimizations Manfred Spraul <manfred@colorfullife.com>
9093 - */
9094 -
9095 -static cpumask_t flush_cpumask;
9096 -static struct mm_struct * flush_mm;
9097 -static unsigned long flush_va;
9098 -static DEFINE_SPINLOCK(tlbstate_lock);
9099 -
9100 -/*
9101 - * We cannot call mmdrop() because we are in interrupt context,
9102 - * instead update mm->cpu_vm_mask.
9103 - *
9104 - * We need to reload %cr3 since the page tables may be going
9105 - * away from under us..
9106 - */
9107 -void leave_mm(int cpu)
9108 -{
9109 - if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
9110 - BUG();
9111 - cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask);
9112 - load_cr3(swapper_pg_dir);
9113 -}
9114 -EXPORT_SYMBOL_GPL(leave_mm);
9115 -
9116 -/*
9117 - *
9118 - * The flush IPI assumes that a thread switch happens in this order:
9119 - * [cpu0: the cpu that switches]
9120 - * 1) switch_mm() either 1a) or 1b)
9121 - * 1a) thread switch to a different mm
9122 - * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
9123 - * Stop ipi delivery for the old mm. This is not synchronized with
9124 - * the other cpus, but smp_invalidate_interrupt ignore flush ipis
9125 - * for the wrong mm, and in the worst case we perform a superfluous
9126 - * tlb flush.
9127 - * 1a2) set cpu_tlbstate to TLBSTATE_OK
9128 - * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
9129 - * was in lazy tlb mode.
9130 - * 1a3) update cpu_tlbstate[].active_mm
9131 - * Now cpu0 accepts tlb flushes for the new mm.
9132 - * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
9133 - * Now the other cpus will send tlb flush ipis.
9134 - * 1a4) change cr3.
9135 - * 1b) thread switch without mm change
9136 - * cpu_tlbstate[].active_mm is correct, cpu0 already handles
9137 - * flush ipis.
9138 - * 1b1) set cpu_tlbstate to TLBSTATE_OK
9139 - * 1b2) test_and_set the cpu bit in cpu_vm_mask.
9140 - * Atomically set the bit [other cpus will start sending flush ipis],
9141 - * and test the bit.
9142 - * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
9143 - * 2) switch %%esp, ie current
9144 - *
9145 - * The interrupt must handle 2 special cases:
9146 - * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
9147 - * - the cpu performs speculative tlb reads, i.e. even if the cpu only
9148 - * runs in kernel space, the cpu could load tlb entries for user space
9149 - * pages.
9150 - *
9151 - * The good news is that cpu_tlbstate is local to each cpu, no
9152 - * write/read ordering problems.
9153 - */
9154 -
9155 -/*
9156 - * TLB flush IPI:
9157 - *
9158 - * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
9159 - * 2) Leave the mm if we are in the lazy tlb mode.
9160 - */
9161 -
9162 -irqreturn_t smp_invalidate_interrupt(int irq, void *dev_id)
9163 -{
9164 - unsigned long cpu;
9165 -
9166 - cpu = get_cpu();
9167 -
9168 - if (!cpu_isset(cpu, flush_cpumask))
9169 - goto out;
9170 - /*
9171 - * This was a BUG() but until someone can quote me the
9172 - * line from the intel manual that guarantees an IPI to
9173 - * multiple CPUs is retried _only_ on the erroring CPUs
9174 - * its staying as a return
9175 - *
9176 - * BUG();
9177 - */
9178 -
9179 - if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) {
9180 - if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) {
9181 - if (flush_va == TLB_FLUSH_ALL)
9182 - local_flush_tlb();
9183 - else
9184 - __flush_tlb_one(flush_va);
9185 - } else
9186 - leave_mm(cpu);
9187 - }
9188 - smp_mb__before_clear_bit();
9189 - cpu_clear(cpu, flush_cpumask);
9190 - smp_mb__after_clear_bit();
9191 -out:
9192 - put_cpu_no_resched();
9193 - __get_cpu_var(irq_stat).irq_tlb_count++;
9194 -
9195 - return IRQ_HANDLED;
9196 -}
9197 -
9198 -void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
9199 - unsigned long va)
9200 -{
9201 - cpumask_t cpumask = *cpumaskp;
9202 -
9203 - /*
9204 - * A couple of (to be removed) sanity checks:
9205 - *
9206 - * - current CPU must not be in mask
9207 - * - mask must exist :)
9208 - */
9209 - BUG_ON(cpus_empty(cpumask));
9210 - BUG_ON(cpu_isset(smp_processor_id(), cpumask));
9211 - BUG_ON(!mm);
9212 -
9213 -#ifdef CONFIG_HOTPLUG_CPU
9214 - /* If a CPU which we ran on has gone down, OK. */
9215 - cpus_and(cpumask, cpumask, cpu_online_map);
9216 - if (unlikely(cpus_empty(cpumask)))
9217 - return;
9218 -#endif
9219 -
9220 - /*
9221 - * i'm not happy about this global shared spinlock in the
9222 - * MM hot path, but we'll see how contended it is.
9223 - * AK: x86-64 has a faster method that could be ported.
9224 - */
9225 - spin_lock(&tlbstate_lock);
9226 -
9227 - flush_mm = mm;
9228 - flush_va = va;
9229 - cpus_or(flush_cpumask, cpumask, flush_cpumask);
9230 - /*
9231 - * We have to send the IPI only to
9232 - * CPUs affected.
9233 - */
9234 - send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR);
9235 -
9236 - while (!cpus_empty(flush_cpumask))
9237 - /* nothing. lockup detection does not belong here */
9238 - cpu_relax();
9239 -
9240 - flush_mm = NULL;
9241 - flush_va = 0;
9242 - spin_unlock(&tlbstate_lock);
9243 -}
9244 -
9245 -void flush_tlb_current_task(void)
9246 -{
9247 - struct mm_struct *mm = current->mm;
9248 - cpumask_t cpu_mask;
9249 -
9250 - preempt_disable();
9251 - cpu_mask = mm->cpu_vm_mask;
9252 - cpu_clear(smp_processor_id(), cpu_mask);
9253 -
9254 - local_flush_tlb();
9255 - if (!cpus_empty(cpu_mask))
9256 - flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
9257 - preempt_enable();
9258 -}
9259 -
9260 -void flush_tlb_mm (struct mm_struct * mm)
9261 -{
9262 - cpumask_t cpu_mask;
9263 -
9264 - preempt_disable();
9265 - cpu_mask = mm->cpu_vm_mask;
9266 - cpu_clear(smp_processor_id(), cpu_mask);
9267 -
9268 - if (current->active_mm == mm) {
9269 - if (current->mm)
9270 - local_flush_tlb();
9271 - else
9272 - leave_mm(smp_processor_id());
9273 - }
9274 - if (!cpus_empty(cpu_mask))
9275 - flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
9276 -
9277 - preempt_enable();
9278 -}
9279 -
9280 -void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
9281 -{
9282 - struct mm_struct *mm = vma->vm_mm;
9283 - cpumask_t cpu_mask;
9284 -
9285 - preempt_disable();
9286 - cpu_mask = mm->cpu_vm_mask;
9287 - cpu_clear(smp_processor_id(), cpu_mask);
9288 -
9289 - if (current->active_mm == mm) {
9290 - if(current->mm)
9291 - __flush_tlb_one(va);
9292 - else
9293 - leave_mm(smp_processor_id());
9294 - }
9295 -
9296 - if (!cpus_empty(cpu_mask))
9297 - flush_tlb_others(cpu_mask, mm, va);
9298 -
9299 - preempt_enable();
9300 -}
9301 -EXPORT_SYMBOL(flush_tlb_page);
9302 -
9303 -static void do_flush_tlb_all(void* info)
9304 -{
9305 - unsigned long cpu = smp_processor_id();
9306 -
9307 - __flush_tlb_all();
9308 - if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_LAZY)
9309 - leave_mm(cpu);
9310 -}
9311 -
9312 -void flush_tlb_all(void)
9313 -{
9314 - on_each_cpu(do_flush_tlb_all, NULL, 1, 1);
9315 -}
9316 -
9317 -#endif /* XEN */
9318 -
9319 -/*
9320 - * this function sends a 'reschedule' IPI to another CPU.
9321 - * it goes straight through and wastes no time serializing
9322 - * anything. Worst case is that we lose a reschedule ...
9323 - */
9324 -void xen_smp_send_reschedule(int cpu)
9325 -{
9326 - WARN_ON(cpu_is_offline(cpu));
9327 - send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
9328 -}
9329 -
9330 -/*
9331 - * Structure and data for smp_call_function(). This is designed to minimise
9332 - * static memory requirements. It also looks cleaner.
9333 - */
9334 -static DEFINE_SPINLOCK(call_lock);
9335 -
9336 -struct call_data_struct {
9337 - void (*func) (void *info);
9338 - void *info;
9339 - atomic_t started;
9340 - atomic_t finished;
9341 - int wait;
9342 -};
9343 -
9344 -void lock_ipi_call_lock(void)
9345 -{
9346 - spin_lock_irq(&call_lock);
9347 -}
9348 -
9349 -void unlock_ipi_call_lock(void)
9350 -{
9351 - spin_unlock_irq(&call_lock);
9352 -}
9353 -
9354 -static struct call_data_struct *call_data;
9355 -
9356 -static void __smp_call_function(void (*func) (void *info), void *info,
9357 - int nonatomic, int wait)
9358 -{
9359 - struct call_data_struct data;
9360 - int cpus = num_online_cpus() - 1;
9361 -
9362 - if (!cpus)
9363 - return;
9364 -
9365 - data.func = func;
9366 - data.info = info;
9367 - atomic_set(&data.started, 0);
9368 - data.wait = wait;
9369 - if (wait)
9370 - atomic_set(&data.finished, 0);
9371 -
9372 - call_data = &data;
9373 - mb();
9374 -
9375 - /* Send a message to all other CPUs and wait for them to respond */
9376 - send_IPI_allbutself(CALL_FUNCTION_VECTOR);
9377 -
9378 - /* Wait for response */
9379 - while (atomic_read(&data.started) != cpus)
9380 - cpu_relax();
9381 -
9382 - if (wait)
9383 - while (atomic_read(&data.finished) != cpus)
9384 - cpu_relax();
9385 -}
9386 -
9387 -
9388 -/**
9389 - * smp_call_function_mask(): Run a function on a set of other CPUs.
9390 - * @mask: The set of cpus to run on. Must not include the current cpu.
9391 - * @func: The function to run. This must be fast and non-blocking.
9392 - * @info: An arbitrary pointer to pass to the function.
9393 - * @wait: If true, wait (atomically) until function has completed on other CPUs.
9394 - *
9395 - * Returns 0 on success, else a negative status code.
9396 - *
9397 - * If @wait is true, then returns once @func has returned; otherwise
9398 - * it returns just before the target cpu calls @func.
9399 - *
9400 - * You must not call this function with disabled interrupts or from a
9401 - * hardware interrupt handler or from a bottom half handler.
9402 - */
9403 -int
9404 -xen_smp_call_function_mask(cpumask_t mask,
9405 - void (*func)(void *), void *info,
9406 - int wait)
9407 -{
9408 - struct call_data_struct data;
9409 - cpumask_t allbutself;
9410 - int cpus;
9411 -
9412 - /* Can deadlock when called with interrupts disabled */
9413 - WARN_ON(irqs_disabled());
9414 -
9415 - /* Holding any lock stops cpus from going down. */
9416 - spin_lock(&call_lock);
9417 -
9418 - allbutself = cpu_online_map;
9419 - cpu_clear(smp_processor_id(), allbutself);
9420 -
9421 - cpus_and(mask, mask, allbutself);
9422 - cpus = cpus_weight(mask);
9423 -
9424 - if (!cpus) {
9425 - spin_unlock(&call_lock);
9426 - return 0;
9427 - }
9428 -
9429 - data.func = func;
9430 - data.info = info;
9431 - atomic_set(&data.started, 0);
9432 - data.wait = wait;
9433 - if (wait)
9434 - atomic_set(&data.finished, 0);
9435 -
9436 - call_data = &data;
9437 - mb();
9438 -
9439 - /* Send a message to other CPUs */
9440 - if (cpus_equal(mask, allbutself))
9441 - send_IPI_allbutself(CALL_FUNCTION_VECTOR);
9442 - else
9443 - send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
9444 -
9445 - /* Wait for response */
9446 - while (atomic_read(&data.started) != cpus)
9447 - cpu_relax();
9448 -
9449 - if (wait)
9450 - while (atomic_read(&data.finished) != cpus)
9451 - cpu_relax();
9452 - spin_unlock(&call_lock);
9453 -
9454 - return 0;
9455 -}
9456 -
9457 -static void stop_this_cpu (void * dummy)
9458 -{
9459 - local_irq_disable();
9460 - /*
9461 - * Remove this CPU:
9462 - */
9463 - cpu_clear(smp_processor_id(), cpu_online_map);
9464 - disable_all_local_evtchn();
9465 - if (cpu_data(smp_processor_id()).hlt_works_ok)
9466 - for(;;) halt();
9467 - for (;;);
9468 -}
9469 -
9470 -/*
9471 - * this function calls the 'stop' function on all other CPUs in the system.
9472 - */
9473 -
9474 -void xen_smp_send_stop(void)
9475 -{
9476 - /* Don't deadlock on the call lock in panic */
9477 - int nolock = !spin_trylock(&call_lock);
9478 - unsigned long flags;
9479 -
9480 - local_irq_save(flags);
9481 - __smp_call_function(stop_this_cpu, NULL, 0, 0);
9482 - if (!nolock)
9483 - spin_unlock(&call_lock);
9484 - disable_all_local_evtchn();
9485 - local_irq_restore(flags);
9486 -}
9487 -
9488 -/*
9489 - * Reschedule call back. Nothing to do,
9490 - * all the work is done automatically when
9491 - * we return from the interrupt.
9492 - */
9493 -irqreturn_t smp_reschedule_interrupt(int irq, void *dev_id)
9494 -{
9495 - __get_cpu_var(irq_stat).irq_resched_count++;
9496 -
9497 - return IRQ_HANDLED;
9498 -}
9499 -
9500 -#include <linux/kallsyms.h>
9501 -irqreturn_t smp_call_function_interrupt(int irq, void *dev_id)
9502 -{
9503 - void (*func) (void *info) = call_data->func;
9504 - void *info = call_data->info;
9505 - int wait = call_data->wait;
9506 -
9507 - /*
9508 - * Notify initiating CPU that I've grabbed the data and am
9509 - * about to execute the function
9510 - */
9511 - mb();
9512 - atomic_inc(&call_data->started);
9513 - /*
9514 - * At this point the info structure may be out of scope unless wait==1
9515 - */
9516 - irq_enter();
9517 - (*func)(info);
9518 - __get_cpu_var(irq_stat).irq_call_count++;
9519 - irq_exit();
9520 -
9521 - if (wait) {
9522 - mb();
9523 - atomic_inc(&call_data->finished);
9524 - }
9525 -
9526 - return IRQ_HANDLED;
9527 -}
9528 --- sle11-2009-10-16.orig/arch/x86/kernel/smp_64-xen.c 2009-03-16 16:33:40.000000000 +0100
9529 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
9530 @@ -1,554 +0,0 @@
9531 -/*
9532 - * Intel SMP support routines.
9533 - *
9534 - * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
9535 - * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
9536 - * (c) 2002,2003 Andi Kleen, SuSE Labs.
9537 - *
9538 - * This code is released under the GNU General Public License version 2 or
9539 - * later.
9540 - */
9541 -
9542 -#include <linux/init.h>
9543 -
9544 -#include <linux/mm.h>
9545 -#include <linux/delay.h>
9546 -#include <linux/spinlock.h>
9547 -#include <linux/smp.h>
9548 -#include <linux/kernel_stat.h>
9549 -#include <linux/mc146818rtc.h>
9550 -#include <linux/interrupt.h>
9551 -
9552 -#include <asm/mtrr.h>
9553 -#include <asm/pgalloc.h>
9554 -#include <asm/tlbflush.h>
9555 -#include <asm/mach_apic.h>
9556 -#include <asm/mmu_context.h>
9557 -#include <asm/proto.h>
9558 -#include <asm/apicdef.h>
9559 -#include <asm/idle.h>
9560 -#ifdef CONFIG_XEN
9561 -#include <xen/evtchn.h>
9562 -#endif
9563 -
9564 -#ifndef CONFIG_XEN
9565 -/*
9566 - * Smarter SMP flushing macros.
9567 - * c/o Linus Torvalds.
9568 - *
9569 - * These mean you can really definitely utterly forget about
9570 - * writing to user space from interrupts. (Its not allowed anyway).
9571 - *
9572 - * Optimizations Manfred Spraul <manfred@colorfullife.com>
9573 - *
9574 - * More scalable flush, from Andi Kleen
9575 - *
9576 - * To avoid global state use 8 different call vectors.
9577 - * Each CPU uses a specific vector to trigger flushes on other
9578 - * CPUs. Depending on the received vector the target CPUs look into
9579 - * the right per cpu variable for the flush data.
9580 - *
9581 - * With more than 8 CPUs they are hashed to the 8 available
9582 - * vectors. The limited global vector space forces us to this right now.
9583 - * In future when interrupts are split into per CPU domains this could be
9584 - * fixed, at the cost of triggering multiple IPIs in some cases.
9585 - */
9586 -
9587 -union smp_flush_state {
9588 - struct {
9589 - cpumask_t flush_cpumask;
9590 - struct mm_struct *flush_mm;
9591 - unsigned long flush_va;
9592 - spinlock_t tlbstate_lock;
9593 - };
9594 - char pad[SMP_CACHE_BYTES];
9595 -} ____cacheline_aligned;
9596 -
9597 -/* State is put into the per CPU data section, but padded
9598 - to a full cache line because other CPUs can access it and we don't
9599 - want false sharing in the per cpu data segment. */
9600 -static DEFINE_PER_CPU(union smp_flush_state, flush_state);
9601 -
9602 -/*
9603 - * We cannot call mmdrop() because we are in interrupt context,
9604 - * instead update mm->cpu_vm_mask.
9605 - */
9606 -void leave_mm(int cpu)
9607 -{
9608 - if (read_pda(mmu_state) == TLBSTATE_OK)
9609 - BUG();
9610 - cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask);
9611 - load_cr3(swapper_pg_dir);
9612 -}
9613 -EXPORT_SYMBOL_GPL(leave_mm);
9614 -
9615 -/*
9616 - *
9617 - * The flush IPI assumes that a thread switch happens in this order:
9618 - * [cpu0: the cpu that switches]
9619 - * 1) switch_mm() either 1a) or 1b)
9620 - * 1a) thread switch to a different mm
9621 - * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
9622 - * Stop ipi delivery for the old mm. This is not synchronized with
9623 - * the other cpus, but smp_invalidate_interrupt ignore flush ipis
9624 - * for the wrong mm, and in the worst case we perform a superfluous
9625 - * tlb flush.
9626 - * 1a2) set cpu mmu_state to TLBSTATE_OK
9627 - * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
9628 - * was in lazy tlb mode.
9629 - * 1a3) update cpu active_mm
9630 - * Now cpu0 accepts tlb flushes for the new mm.
9631 - * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
9632 - * Now the other cpus will send tlb flush ipis.
9633 - * 1a4) change cr3.
9634 - * 1b) thread switch without mm change
9635 - * cpu active_mm is correct, cpu0 already handles
9636 - * flush ipis.
9637 - * 1b1) set cpu mmu_state to TLBSTATE_OK
9638 - * 1b2) test_and_set the cpu bit in cpu_vm_mask.
9639 - * Atomically set the bit [other cpus will start sending flush ipis],
9640 - * and test the bit.
9641 - * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
9642 - * 2) switch %%esp, ie current
9643 - *
9644 - * The interrupt must handle 2 special cases:
9645 - * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
9646 - * - the cpu performs speculative tlb reads, i.e. even if the cpu only
9647 - * runs in kernel space, the cpu could load tlb entries for user space
9648 - * pages.
9649 - *
9650 - * The good news is that cpu mmu_state is local to each cpu, no
9651 - * write/read ordering problems.
9652 - */
9653 -
9654 -/*
9655 - * TLB flush IPI:
9656 - *
9657 - * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
9658 - * 2) Leave the mm if we are in the lazy tlb mode.
9659 - *
9660 - * Interrupts are disabled.
9661 - */
9662 -
9663 -asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs)
9664 -{
9665 - int cpu;
9666 - int sender;
9667 - union smp_flush_state *f;
9668 -
9669 - cpu = smp_processor_id();
9670 - /*
9671 - * orig_rax contains the negated interrupt vector.
9672 - * Use that to determine where the sender put the data.
9673 - */
9674 - sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START;
9675 - f = &per_cpu(flush_state, sender);
9676 -
9677 - if (!cpu_isset(cpu, f->flush_cpumask))
9678 - goto out;
9679 - /*
9680 - * This was a BUG() but until someone can quote me the
9681 - * line from the intel manual that guarantees an IPI to
9682 - * multiple CPUs is retried _only_ on the erroring CPUs
9683 - * its staying as a return
9684 - *
9685 - * BUG();
9686 - */
9687 -
9688 - if (f->flush_mm == read_pda(active_mm)) {
9689 - if (read_pda(mmu_state) == TLBSTATE_OK) {
9690 - if (f->flush_va == TLB_FLUSH_ALL)
9691 - local_flush_tlb();
9692 - else
9693 - __flush_tlb_one(f->flush_va);
9694 - } else
9695 - leave_mm(cpu);
9696 - }
9697 -out:
9698 - ack_APIC_irq();
9699 - cpu_clear(cpu, f->flush_cpumask);
9700 - add_pda(irq_tlb_count, 1);
9701 -}
9702 -
9703 -void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
9704 - unsigned long va)
9705 -{
9706 - int sender;
9707 - union smp_flush_state *f;
9708 - cpumask_t cpumask = *cpumaskp;
9709 -
9710 - /* Caller has disabled preemption */
9711 - sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
9712 - f = &per_cpu(flush_state, sender);
9713 -
9714 - /*
9715 - * Could avoid this lock when
9716 - * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
9717 - * probably not worth checking this for a cache-hot lock.
9718 - */
9719 - spin_lock(&f->tlbstate_lock);
9720 -
9721 - f->flush_mm = mm;
9722 - f->flush_va = va;
9723 - cpus_or(f->flush_cpumask, cpumask, f->flush_cpumask);
9724 -
9725 - /*
9726 - * We have to send the IPI only to
9727 - * CPUs affected.
9728 - */
9729 - send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR_START + sender);
9730 -
9731 - while (!cpus_empty(f->flush_cpumask))
9732 - cpu_relax();
9733 -
9734 - f->flush_mm = NULL;
9735 - f->flush_va = 0;
9736 - spin_unlock(&f->tlbstate_lock);
9737 -}
9738 -
9739 -int __cpuinit init_smp_flush(void)
9740 -{
9741 - int i;
9742 -
9743 - for_each_cpu_mask(i, cpu_possible_map) {
9744 - spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock);
9745 - }
9746 - return 0;
9747 -}
9748 -core_initcall(init_smp_flush);
9749 -
9750 -void flush_tlb_current_task(void)
9751 -{
9752 - struct mm_struct *mm = current->mm;
9753 - cpumask_t cpu_mask;
9754 -
9755 - preempt_disable();
9756 - cpu_mask = mm->cpu_vm_mask;
9757 - cpu_clear(smp_processor_id(), cpu_mask);
9758 -
9759 - local_flush_tlb();
9760 - if (!cpus_empty(cpu_mask))
9761 - flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
9762 - preempt_enable();
9763 -}
9764 -
9765 -void flush_tlb_mm (struct mm_struct * mm)
9766 -{
9767 - cpumask_t cpu_mask;
9768 -
9769 - preempt_disable();
9770 - cpu_mask = mm->cpu_vm_mask;
9771 - cpu_clear(smp_processor_id(), cpu_mask);
9772 -
9773 - if (current->active_mm == mm) {
9774 - if (current->mm)
9775 - local_flush_tlb();
9776 - else
9777 - leave_mm(smp_processor_id());
9778 - }
9779 - if (!cpus_empty(cpu_mask))
9780 - flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
9781 -
9782 - preempt_enable();
9783 -}
9784 -
9785 -void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
9786 -{
9787 - struct mm_struct *mm = vma->vm_mm;
9788 - cpumask_t cpu_mask;
9789 -
9790 - preempt_disable();
9791 - cpu_mask = mm->cpu_vm_mask;
9792 - cpu_clear(smp_processor_id(), cpu_mask);
9793 -
9794 - if (current->active_mm == mm) {
9795 - if(current->mm)
9796 - __flush_tlb_one(va);
9797 - else
9798 - leave_mm(smp_processor_id());
9799 - }
9800 -
9801 - if (!cpus_empty(cpu_mask))
9802 - flush_tlb_others(cpu_mask, mm, va);
9803 -
9804 - preempt_enable();
9805 -}
9806 -
9807 -static void do_flush_tlb_all(void* info)
9808 -{
9809 - unsigned long cpu = smp_processor_id();
9810 -
9811 - __flush_tlb_all();
9812 - if (read_pda(mmu_state) == TLBSTATE_LAZY)
9813 - leave_mm(cpu);
9814 -}
9815 -
9816 -void flush_tlb_all(void)
9817 -{
9818 - on_each_cpu(do_flush_tlb_all, NULL, 1, 1);
9819 -}
9820 -#endif /* Xen */
9821 -
9822 -/*
9823 - * this function sends a 'reschedule' IPI to another CPU.
9824 - * it goes straight through and wastes no time serializing
9825 - * anything. Worst case is that we lose a reschedule ...
9826 - */
9827 -
9828 -void smp_send_reschedule(int cpu)
9829 -{
9830 - send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
9831 -}
9832 -
9833 -/*
9834 - * Structure and data for smp_call_function(). This is designed to minimise
9835 - * static memory requirements. It also looks cleaner.
9836 - */
9837 -static DEFINE_SPINLOCK(call_lock);
9838 -
9839 -struct call_data_struct {
9840 - void (*func) (void *info);
9841 - void *info;
9842 - atomic_t started;
9843 - atomic_t finished;
9844 - int wait;
9845 -};
9846 -
9847 -static struct call_data_struct * call_data;
9848 -
9849 -void lock_ipi_call_lock(void)
9850 -{
9851 - spin_lock_irq(&call_lock);
9852 -}
9853 -
9854 -void unlock_ipi_call_lock(void)
9855 -{
9856 - spin_unlock_irq(&call_lock);
9857 -}
9858 -
9859 -/*
9860 - * this function sends a 'generic call function' IPI to all other CPU
9861 - * of the system defined in the mask.
9862 - */
9863 -static int __smp_call_function_mask(cpumask_t mask,
9864 - void (*func)(void *), void *info,
9865 - int wait)
9866 -{
9867 - struct call_data_struct data;
9868 - cpumask_t allbutself;
9869 - int cpus;
9870 -
9871 - allbutself = cpu_online_map;
9872 - cpu_clear(smp_processor_id(), allbutself);
9873 -
9874 - cpus_and(mask, mask, allbutself);
9875 - cpus = cpus_weight(mask);
9876 -
9877 - if (!cpus)
9878 - return 0;
9879 -
9880 - data.func = func;
9881 - data.info = info;
9882 - atomic_set(&data.started, 0);
9883 - data.wait = wait;
9884 - if (wait)
9885 - atomic_set(&data.finished, 0);
9886 -
9887 - call_data = &data;
9888 - wmb();
9889 -
9890 - /* Send a message to other CPUs */
9891 - if (cpus_equal(mask, allbutself))
9892 - send_IPI_allbutself(CALL_FUNCTION_VECTOR);
9893 - else
9894 - send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
9895 -
9896 - /* Wait for response */
9897 - while (atomic_read(&data.started) != cpus)
9898 - cpu_relax();
9899 -
9900 - if (!wait)
9901 - return 0;
9902 -
9903 - while (atomic_read(&data.finished) != cpus)
9904 - cpu_relax();
9905 -
9906 - return 0;
9907 -}
9908 -/**
9909 - * smp_call_function_mask(): Run a function on a set of other CPUs.
9910 - * @mask: The set of cpus to run on. Must not include the current cpu.
9911 - * @func: The function to run. This must be fast and non-blocking.
9912 - * @info: An arbitrary pointer to pass to the function.
9913 - * @wait: If true, wait (atomically) until function has completed on other CPUs.
9914 - *
9915 - * Returns 0 on success, else a negative status code.
9916 - *
9917 - * If @wait is true, then returns once @func has returned; otherwise
9918 - * it returns just before the target cpu calls @func.
9919 - *
9920 - * You must not call this function with disabled interrupts or from a
9921 - * hardware interrupt handler or from a bottom half handler.
9922 - */
9923 -int smp_call_function_mask(cpumask_t mask,
9924 - void (*func)(void *), void *info,
9925 - int wait)
9926 -{
9927 - int ret;
9928 -
9929 - /* Can deadlock when called with interrupts disabled */
9930 - WARN_ON(irqs_disabled());
9931 -
9932 - spin_lock(&call_lock);
9933 - ret = __smp_call_function_mask(mask, func, info, wait);
9934 - spin_unlock(&call_lock);
9935 - return ret;
9936 -}
9937 -EXPORT_SYMBOL(smp_call_function_mask);
9938 -
9939 -/*
9940 - * smp_call_function_single - Run a function on a specific CPU
9941 - * @func: The function to run. This must be fast and non-blocking.
9942 - * @info: An arbitrary pointer to pass to the function.
9943 - * @nonatomic: Currently unused.
9944 - * @wait: If true, wait until function has completed on other CPUs.
9945 - *
9946 - * Retrurns 0 on success, else a negative status code.
9947 - *
9948 - * Does not return until the remote CPU is nearly ready to execute <func>
9949 - * or is or has executed.
9950 - */
9951 -
9952 -int smp_call_function_single (int cpu, void (*func) (void *info), void *info,
9953 - int nonatomic, int wait)
9954 -{
9955 - /* prevent preemption and reschedule on another processor */
9956 - int ret, me = get_cpu();
9957 -
9958 - /* Can deadlock when called with interrupts disabled */
9959 - WARN_ON(irqs_disabled());
9960 -
9961 - if (cpu == me) {
9962 - local_irq_disable();
9963 - func(info);
9964 - local_irq_enable();
9965 - put_cpu();
9966 - return 0;
9967 - }
9968 -
9969 - ret = smp_call_function_mask(cpumask_of_cpu(cpu), func, info, wait);
9970 -
9971 - put_cpu();
9972 - return ret;
9973 -}
9974 -EXPORT_SYMBOL(smp_call_function_single);
9975 -
9976 -/*
9977 - * smp_call_function - run a function on all other CPUs.
9978 - * @func: The function to run. This must be fast and non-blocking.
9979 - * @info: An arbitrary pointer to pass to the function.
9980 - * @nonatomic: currently unused.
9981 - * @wait: If true, wait (atomically) until function has completed on other
9982 - * CPUs.
9983 - *
9984 - * Returns 0 on success, else a negative status code. Does not return until
9985 - * remote CPUs are nearly ready to execute func or are or have executed.
9986 - *
9987 - * You must not call this function with disabled interrupts or from a
9988 - * hardware interrupt handler or from a bottom half handler.
9989 - * Actually there are a few legal cases, like panic.
9990 - */
9991 -int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
9992 - int wait)
9993 -{
9994 - return smp_call_function_mask(cpu_online_map, func, info, wait);
9995 -}
9996 -EXPORT_SYMBOL(smp_call_function);
9997 -
9998 -static void stop_this_cpu(void *dummy)
9999 -{
10000 - local_irq_disable();
10001 - /*
10002 - * Remove this CPU:
10003 - */
10004 - cpu_clear(smp_processor_id(), cpu_online_map);
10005 - disable_all_local_evtchn();
10006 - for (;;)
10007 - halt();
10008 -}
10009 -
10010 -void smp_send_stop(void)
10011 -{
10012 - int nolock;
10013 - unsigned long flags;
10014 -
10015 -#ifndef CONFIG_XEN
10016 - if (reboot_force)
10017 - return;
10018 -#endif
10019 -
10020 - /* Don't deadlock on the call lock in panic */
10021 - nolock = !spin_trylock(&call_lock);
10022 - local_irq_save(flags);
10023 - __smp_call_function_mask(cpu_online_map, stop_this_cpu, NULL, 0);
10024 - if (!nolock)
10025 - spin_unlock(&call_lock);
10026 - disable_all_local_evtchn();
10027 - local_irq_restore(flags);
10028 -}
10029 -
10030 -/*
10031 - * Reschedule call back. Nothing to do,
10032 - * all the work is done automatically when
10033 - * we return from the interrupt.
10034 - */
10035 -#ifndef CONFIG_XEN
10036 -asmlinkage void smp_reschedule_interrupt(void)
10037 -#else
10038 -asmlinkage irqreturn_t smp_reschedule_interrupt(int irq, void *ctx)
10039 -#endif
10040 -{
10041 -#ifndef CONFIG_XEN
10042 - ack_APIC_irq();
10043 -#endif
10044 - add_pda(irq_resched_count, 1);
10045 -#ifdef CONFIG_XEN
10046 - return IRQ_HANDLED;
10047 -#endif
10048 -}
10049 -
10050 -#ifndef CONFIG_XEN
10051 -asmlinkage void smp_call_function_interrupt(void)
10052 -#else
10053 -asmlinkage irqreturn_t smp_call_function_interrupt(int irq, void *ctx)
10054 -#endif
10055 -{
10056 - void (*func) (void *info) = call_data->func;
10057 - void *info = call_data->info;
10058 - int wait = call_data->wait;
10059 -
10060 -#ifndef CONFIG_XEN
10061 - ack_APIC_irq();
10062 -#endif
10063 - /*
10064 - * Notify initiating CPU that I've grabbed the data and am
10065 - * about to execute the function
10066 - */
10067 - mb();
10068 - atomic_inc(&call_data->started);
10069 - /*
10070 - * At this point the info structure may be out of scope unless wait==1
10071 - */
10072 - exit_idle();
10073 - irq_enter();
10074 - (*func)(info);
10075 - add_pda(irq_call_count, 1);
10076 - irq_exit();
10077 - if (wait) {
10078 - mb();
10079 - atomic_inc(&call_data->finished);
10080 - }
10081 -#ifdef CONFIG_XEN
10082 - return IRQ_HANDLED;
10083 -#endif
10084 -}
10085 --- sle11-2009-10-16.orig/arch/x86/kernel/time_32-xen.c 2009-10-28 14:58:05.000000000 +0100
10086 +++ sle11-2009-10-16/arch/x86/kernel/time_32-xen.c 2009-10-28 14:58:12.000000000 +0100
10087 @@ -700,8 +700,6 @@ int xen_update_persistent_clock(void)
10088 return 0;
10089 }
10090
10091 -extern void (*late_time_init)(void);
10092 -
10093 /* Dynamically-mapped IRQ. */
10094 DEFINE_PER_CPU(int, timer_irq);
10095
10096 --- sle11-2009-10-16.orig/arch/x86/kernel/traps_32-xen.c 2009-03-16 16:33:40.000000000 +0100
10097 +++ sle11-2009-10-16/arch/x86/kernel/traps_32-xen.c 2009-03-16 16:38:05.000000000 +0100
10098 @@ -9,26 +9,28 @@
10099 * 'Traps.c' handles hardware traps and faults after we have saved some
10100 * state in 'asm.s'.
10101 */
10102 -#include <linux/sched.h>
10103 +#include <linux/interrupt.h>
10104 +#include <linux/kallsyms.h>
10105 +#include <linux/spinlock.h>
10106 +#include <linux/highmem.h>
10107 +#include <linux/kprobes.h>
10108 +#include <linux/uaccess.h>
10109 +#include <linux/utsname.h>
10110 +#include <linux/kdebug.h>
10111 #include <linux/kernel.h>
10112 +#include <linux/module.h>
10113 +#include <linux/ptrace.h>
10114 #include <linux/string.h>
10115 +#include <linux/unwind.h>
10116 +#include <linux/delay.h>
10117 #include <linux/errno.h>
10118 +#include <linux/kexec.h>
10119 +#include <linux/sched.h>
10120 #include <linux/timer.h>
10121 -#include <linux/mm.h>
10122 #include <linux/init.h>
10123 -#include <linux/delay.h>
10124 -#include <linux/spinlock.h>
10125 -#include <linux/interrupt.h>
10126 -#include <linux/highmem.h>
10127 -#include <linux/kallsyms.h>
10128 -#include <linux/ptrace.h>
10129 -#include <linux/utsname.h>
10130 -#include <linux/kprobes.h>
10131 -#include <linux/kexec.h>
10132 -#include <linux/unwind.h>
10133 -#include <linux/uaccess.h>
10134 -#include <linux/nmi.h>
10135 #include <linux/bug.h>
10136 +#include <linux/nmi.h>
10137 +#include <linux/mm.h>
10138
10139 #ifdef CONFIG_EISA
10140 #include <linux/ioport.h>
10141 @@ -43,21 +45,18 @@
10142 #include <linux/edac.h>
10143 #endif
10144
10145 +#include <asm/arch_hooks.h>
10146 +#include <asm/stacktrace.h>
10147 #include <asm/processor.h>
10148 -#include <asm/system.h>
10149 -#include <asm/io.h>
10150 -#include <asm/atomic.h>
10151 #include <asm/debugreg.h>
10152 +#include <asm/atomic.h>
10153 +#include <asm/system.h>
10154 +#include <asm/unwind.h>
10155 #include <asm/desc.h>
10156 #include <asm/i387.h>
10157 #include <asm/nmi.h>
10158 -#include <asm/unwind.h>
10159 #include <asm/smp.h>
10160 -#include <asm/arch_hooks.h>
10161 -#include <linux/kdebug.h>
10162 -#include <asm/stacktrace.h>
10163 -
10164 -#include <linux/module.h>
10165 +#include <asm/io.h>
10166
10167 #include "mach_traps.h"
10168
10169 @@ -71,7 +70,7 @@ EXPORT_SYMBOL_GPL(used_vectors);
10170 asmlinkage int system_call(void);
10171
10172 /* Do we ignore FPU interrupts ? */
10173 -char ignore_fpu_irq = 0;
10174 +char ignore_fpu_irq;
10175
10176 #ifndef CONFIG_X86_NO_IDT
10177 /*
10178 @@ -113,12 +112,13 @@ static unsigned int code_bytes = 64;
10179 void printk_address(unsigned long address, int reliable)
10180 {
10181 #ifdef CONFIG_KALLSYMS
10182 - unsigned long offset = 0, symsize;
10183 + char namebuf[KSYM_NAME_LEN];
10184 + unsigned long offset = 0;
10185 + unsigned long symsize;
10186 const char *symname;
10187 - char *modname;
10188 - char *delim = ":";
10189 - char namebuf[128];
10190 char reliab[4] = "";
10191 + char *delim = ":";
10192 + char *modname;
10193
10194 symname = kallsyms_lookup(address, &symsize, &offset,
10195 &modname, namebuf);
10196 @@ -146,13 +146,14 @@ static inline int valid_stack_ptr(struct
10197
10198 /* The form of the top of the frame on the stack */
10199 struct stack_frame {
10200 - struct stack_frame *next_frame;
10201 - unsigned long return_address;
10202 + struct stack_frame *next_frame;
10203 + unsigned long return_address;
10204 };
10205
10206 -static inline unsigned long print_context_stack(struct thread_info *tinfo,
10207 - unsigned long *stack, unsigned long bp,
10208 - const struct stacktrace_ops *ops, void *data)
10209 +static inline unsigned long
10210 +print_context_stack(struct thread_info *tinfo,
10211 + unsigned long *stack, unsigned long bp,
10212 + const struct stacktrace_ops *ops, void *data)
10213 {
10214 struct stack_frame *frame = (struct stack_frame *)bp;
10215
10216 @@ -174,7 +175,7 @@ static inline unsigned long print_contex
10217 return bp;
10218 }
10219
10220 -#define MSG(msg) ops->warning(data, msg)
10221 +#define MSG(msg) ops->warning(data, msg)
10222
10223 void dump_trace(struct task_struct *task, struct pt_regs *regs,
10224 unsigned long *stack, unsigned long bp,
10225 @@ -185,6 +186,7 @@ void dump_trace(struct task_struct *task
10226
10227 if (!stack) {
10228 unsigned long dummy;
10229 +
10230 stack = &dummy;
10231 if (task != current)
10232 stack = (unsigned long *)task->thread.sp;
10233 @@ -194,7 +196,7 @@ void dump_trace(struct task_struct *task
10234 if (!bp) {
10235 if (task == current) {
10236 /* Grab bp right from our regs */
10237 - asm ("movl %%ebp, %0" : "=r" (bp) : );
10238 + asm("movl %%ebp, %0" : "=r" (bp) :);
10239 } else {
10240 /* bp is the last reg pushed by switch_to */
10241 bp = *(unsigned long *) task->thread.sp;
10242 @@ -204,15 +206,18 @@ void dump_trace(struct task_struct *task
10243
10244 while (1) {
10245 struct thread_info *context;
10246 +
10247 context = (struct thread_info *)
10248 ((unsigned long)stack & (~(THREAD_SIZE - 1)));
10249 bp = print_context_stack(context, stack, bp, ops, data);
10250 - /* Should be after the line below, but somewhere
10251 - in early boot context comes out corrupted and we
10252 - can't reference it -AK */
10253 + /*
10254 + * Should be after the line below, but somewhere
10255 + * in early boot context comes out corrupted and we
10256 + * can't reference it:
10257 + */
10258 if (ops->stack(data, "IRQ") < 0)
10259 break;
10260 - stack = (unsigned long*)context->previous_esp;
10261 + stack = (unsigned long *)context->previous_esp;
10262 if (!stack)
10263 break;
10264 touch_nmi_watchdog();
10265 @@ -251,15 +256,15 @@ static void print_trace_address(void *da
10266 }
10267
10268 static const struct stacktrace_ops print_trace_ops = {
10269 - .warning = print_trace_warning,
10270 - .warning_symbol = print_trace_warning_symbol,
10271 - .stack = print_trace_stack,
10272 - .address = print_trace_address,
10273 + .warning = print_trace_warning,
10274 + .warning_symbol = print_trace_warning_symbol,
10275 + .stack = print_trace_stack,
10276 + .address = print_trace_address,
10277 };
10278
10279 static void
10280 show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
10281 - unsigned long *stack, unsigned long bp, char *log_lvl)
10282 + unsigned long *stack, unsigned long bp, char *log_lvl)
10283 {
10284 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
10285 printk("%s =======================\n", log_lvl);
10286 @@ -271,21 +276,22 @@ void show_trace(struct task_struct *task
10287 show_trace_log_lvl(task, regs, stack, bp, "");
10288 }
10289
10290 -static void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
10291 - unsigned long *sp, unsigned long bp, char *log_lvl)
10292 +static void
10293 +show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
10294 + unsigned long *sp, unsigned long bp, char *log_lvl)
10295 {
10296 unsigned long *stack;
10297 int i;
10298
10299 if (sp == NULL) {
10300 if (task)
10301 - sp = (unsigned long*)task->thread.sp;
10302 + sp = (unsigned long *)task->thread.sp;
10303 else
10304 sp = (unsigned long *)&sp;
10305 }
10306
10307 stack = sp;
10308 - for(i = 0; i < kstack_depth_to_print; i++) {
10309 + for (i = 0; i < kstack_depth_to_print; i++) {
10310 if (kstack_end(stack))
10311 break;
10312 if (i && ((i % 8) == 0))
10313 @@ -293,6 +299,7 @@ static void show_stack_log_lvl(struct ta
10314 printk("%08lx ", *stack++);
10315 }
10316 printk("\n%sCall Trace:\n", log_lvl);
10317 +
10318 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
10319 }
10320
10321 @@ -307,8 +314,8 @@ void show_stack(struct task_struct *task
10322 */
10323 void dump_stack(void)
10324 {
10325 - unsigned long stack;
10326 unsigned long bp = 0;
10327 + unsigned long stack;
10328
10329 #ifdef CONFIG_FRAME_POINTER
10330 if (!bp)
10331 @@ -320,6 +327,7 @@ void dump_stack(void)
10332 init_utsname()->release,
10333 (int)strcspn(init_utsname()->version, " "),
10334 init_utsname()->version);
10335 +
10336 show_trace(current, NULL, &stack, bp);
10337 }
10338
10339 @@ -331,6 +339,7 @@ void show_registers(struct pt_regs *regs
10340
10341 print_modules();
10342 __show_registers(regs, 0);
10343 +
10344 printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)",
10345 TASK_COMM_LEN, current->comm, task_pid_nr(current),
10346 current_thread_info(), current, task_thread_info(current));
10347 @@ -339,10 +348,10 @@ void show_registers(struct pt_regs *regs
10348 * time of the fault..
10349 */
10350 if (!user_mode_vm(regs)) {
10351 - u8 *ip;
10352 unsigned int code_prologue = code_bytes * 43 / 64;
10353 unsigned int code_len = code_bytes;
10354 unsigned char c;
10355 + u8 *ip;
10356
10357 printk("\n" KERN_EMERG "Stack: ");
10358 show_stack_log_lvl(NULL, regs, &regs->sp, 0, KERN_EMERG);
10359 @@ -369,7 +378,7 @@ void show_registers(struct pt_regs *regs
10360 }
10361 }
10362 printk("\n");
10363 -}
10364 +}
10365
10366 int is_valid_bugaddr(unsigned long ip)
10367 {
10368 @@ -385,10 +394,10 @@ int is_valid_bugaddr(unsigned long ip)
10369
10370 static int die_counter;
10371
10372 -int __kprobes __die(const char * str, struct pt_regs * regs, long err)
10373 +int __kprobes __die(const char *str, struct pt_regs *regs, long err)
10374 {
10375 - unsigned long sp;
10376 unsigned short ss;
10377 + unsigned long sp;
10378
10379 printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
10380 #ifdef CONFIG_PREEMPT
10381 @@ -403,8 +412,8 @@ int __kprobes __die(const char * str, st
10382 printk("\n");
10383
10384 if (notify_die(DIE_OOPS, str, regs, err,
10385 - current->thread.trap_no, SIGSEGV) !=
10386 - NOTIFY_STOP) {
10387 + current->thread.trap_no, SIGSEGV) != NOTIFY_STOP) {
10388 +
10389 show_registers(regs);
10390 /* Executive summary in case the oops scrolled away */
10391 sp = (unsigned long) (&regs->sp);
10392 @@ -416,17 +425,18 @@ int __kprobes __die(const char * str, st
10393 printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
10394 print_symbol("%s", regs->ip);
10395 printk(" SS:ESP %04x:%08lx\n", ss, sp);
10396 +
10397 return 0;
10398 - } else {
10399 - return 1;
10400 }
10401 +
10402 + return 1;
10403 }
10404
10405 /*
10406 - * This is gone through when something in the kernel has done something bad and
10407 - * is about to be terminated.
10408 + * This is gone through when something in the kernel has done something bad
10409 + * and is about to be terminated:
10410 */
10411 -void die(const char * str, struct pt_regs * regs, long err)
10412 +void die(const char *str, struct pt_regs *regs, long err)
10413 {
10414 static struct {
10415 raw_spinlock_t lock;
10416 @@ -448,8 +458,9 @@ void die(const char * str, struct pt_reg
10417 die.lock_owner = smp_processor_id();
10418 die.lock_owner_depth = 0;
10419 bust_spinlocks(1);
10420 - } else
10421 + } else {
10422 raw_local_irq_save(flags);
10423 + }
10424
10425 if (++die.lock_owner_depth < 3) {
10426 report_bug(regs->ip, regs);
10427 @@ -482,19 +493,20 @@ void die(const char * str, struct pt_reg
10428 do_exit(SIGSEGV);
10429 }
10430
10431 -static inline void die_if_kernel(const char * str, struct pt_regs * regs, long err)
10432 +static inline void
10433 +die_if_kernel(const char *str, struct pt_regs *regs, long err)
10434 {
10435 if (!user_mode_vm(regs))
10436 die(str, regs, err);
10437 }
10438
10439 -static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86,
10440 - struct pt_regs * regs, long error_code,
10441 - siginfo_t *info)
10442 +static void __kprobes
10443 +do_trap(int trapnr, int signr, char *str, int vm86, struct pt_regs *regs,
10444 + long error_code, siginfo_t *info)
10445 {
10446 struct task_struct *tsk = current;
10447
10448 - if (regs->flags & VM_MASK) {
10449 + if (regs->flags & X86_VM_MASK) {
10450 if (vm86)
10451 goto vm86_trap;
10452 goto trap_signal;
10453 @@ -503,109 +515,112 @@ static void __kprobes do_trap(int trapnr
10454 if (!user_mode(regs))
10455 goto kernel_trap;
10456
10457 - trap_signal: {
10458 - /*
10459 - * We want error_code and trap_no set for userspace faults and
10460 - * kernelspace faults which result in die(), but not
10461 - * kernelspace faults which are fixed up. die() gives the
10462 - * process no chance to handle the signal and notice the
10463 - * kernel fault information, so that won't result in polluting
10464 - * the information about previously queued, but not yet
10465 - * delivered, faults. See also do_general_protection below.
10466 - */
10467 - tsk->thread.error_code = error_code;
10468 - tsk->thread.trap_no = trapnr;
10469 +trap_signal:
10470 + /*
10471 + * We want error_code and trap_no set for userspace faults and
10472 + * kernelspace faults which result in die(), but not
10473 + * kernelspace faults which are fixed up. die() gives the
10474 + * process no chance to handle the signal and notice the
10475 + * kernel fault information, so that won't result in polluting
10476 + * the information about previously queued, but not yet
10477 + * delivered, faults. See also do_general_protection below.
10478 + */
10479 + tsk->thread.error_code = error_code;
10480 + tsk->thread.trap_no = trapnr;
10481
10482 - if (info)
10483 - force_sig_info(signr, info, tsk);
10484 - else
10485 - force_sig(signr, tsk);
10486 - return;
10487 - }
10488 + if (info)
10489 + force_sig_info(signr, info, tsk);
10490 + else
10491 + force_sig(signr, tsk);
10492 + return;
10493
10494 - kernel_trap: {
10495 - if (!fixup_exception(regs)) {
10496 - tsk->thread.error_code = error_code;
10497 - tsk->thread.trap_no = trapnr;
10498 - die(str, regs, error_code);
10499 - }
10500 - return;
10501 +kernel_trap:
10502 + if (!fixup_exception(regs)) {
10503 + tsk->thread.error_code = error_code;
10504 + tsk->thread.trap_no = trapnr;
10505 + die(str, regs, error_code);
10506 }
10507 + return;
10508
10509 - vm86_trap: {
10510 - int ret = handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, trapnr);
10511 - if (ret) goto trap_signal;
10512 - return;
10513 - }
10514 +vm86_trap:
10515 + if (handle_vm86_trap((struct kernel_vm86_regs *) regs,
10516 + error_code, trapnr))
10517 + goto trap_signal;
10518 + return;
10519 }
10520
10521 -#define DO_ERROR(trapnr, signr, str, name) \
10522 -void do_##name(struct pt_regs * regs, long error_code) \
10523 -{ \
10524 - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
10525 - == NOTIFY_STOP) \
10526 - return; \
10527 - do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \
10528 -}
10529 -
10530 -#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr, irq) \
10531 -void do_##name(struct pt_regs * regs, long error_code) \
10532 -{ \
10533 - siginfo_t info; \
10534 - if (irq) \
10535 - local_irq_enable(); \
10536 - info.si_signo = signr; \
10537 - info.si_errno = 0; \
10538 - info.si_code = sicode; \
10539 - info.si_addr = (void __user *)siaddr; \
10540 - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
10541 - == NOTIFY_STOP) \
10542 - return; \
10543 - do_trap(trapnr, signr, str, 0, regs, error_code, &info); \
10544 -}
10545 -
10546 -#define DO_VM86_ERROR(trapnr, signr, str, name) \
10547 -void do_##name(struct pt_regs * regs, long error_code) \
10548 -{ \
10549 - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
10550 - == NOTIFY_STOP) \
10551 - return; \
10552 - do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \
10553 -}
10554 -
10555 -#define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
10556 -void do_##name(struct pt_regs * regs, long error_code) \
10557 -{ \
10558 - siginfo_t info; \
10559 - info.si_signo = signr; \
10560 - info.si_errno = 0; \
10561 - info.si_code = sicode; \
10562 - info.si_addr = (void __user *)siaddr; \
10563 - trace_hardirqs_fixup(); \
10564 - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
10565 - == NOTIFY_STOP) \
10566 - return; \
10567 - do_trap(trapnr, signr, str, 1, regs, error_code, &info); \
10568 +#define DO_ERROR(trapnr, signr, str, name) \
10569 +void do_##name(struct pt_regs *regs, long error_code) \
10570 +{ \
10571 + trace_hardirqs_fixup(); \
10572 + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
10573 + == NOTIFY_STOP) \
10574 + return; \
10575 + do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \
10576 +}
10577 +
10578 +#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr, irq) \
10579 +void do_##name(struct pt_regs *regs, long error_code) \
10580 +{ \
10581 + siginfo_t info; \
10582 + if (irq) \
10583 + local_irq_enable(); \
10584 + info.si_signo = signr; \
10585 + info.si_errno = 0; \
10586 + info.si_code = sicode; \
10587 + info.si_addr = (void __user *)siaddr; \
10588 + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
10589 + == NOTIFY_STOP) \
10590 + return; \
10591 + do_trap(trapnr, signr, str, 0, regs, error_code, &info); \
10592 +}
10593 +
10594 +#define DO_VM86_ERROR(trapnr, signr, str, name) \
10595 +void do_##name(struct pt_regs *regs, long error_code) \
10596 +{ \
10597 + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
10598 + == NOTIFY_STOP) \
10599 + return; \
10600 + do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \
10601 +}
10602 +
10603 +#define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
10604 +void do_##name(struct pt_regs *regs, long error_code) \
10605 +{ \
10606 + siginfo_t info; \
10607 + info.si_signo = signr; \
10608 + info.si_errno = 0; \
10609 + info.si_code = sicode; \
10610 + info.si_addr = (void __user *)siaddr; \
10611 + trace_hardirqs_fixup(); \
10612 + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
10613 + == NOTIFY_STOP) \
10614 + return; \
10615 + do_trap(trapnr, signr, str, 1, regs, error_code, &info); \
10616 }
10617
10618 -DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
10619 +DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
10620 #ifndef CONFIG_KPROBES
10621 -DO_VM86_ERROR( 3, SIGTRAP, "int3", int3)
10622 +DO_VM86_ERROR(3, SIGTRAP, "int3", int3)
10623 #endif
10624 -DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow)
10625 -DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds)
10626 -DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0)
10627 -DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
10628 +DO_VM86_ERROR(4, SIGSEGV, "overflow", overflow)
10629 +DO_VM86_ERROR(5, SIGSEGV, "bounds", bounds)
10630 +DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0)
10631 +DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
10632 DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
10633 DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
10634 DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
10635 DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0)
10636 -DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0, 1)
10637 +DO_ERROR_INFO(32, SIGILL, "iret exception", iret_error, ILL_BADSTK, 0, 1)
10638
10639 void __kprobes do_general_protection(struct pt_regs * regs,
10640 long error_code)
10641 {
10642 - if (regs->flags & VM_MASK)
10643 + struct thread_struct *thread;
10644 +
10645 + thread = &current->thread;
10646 +
10647 + if (regs->flags & X86_VM_MASK)
10648 goto gp_in_vm86;
10649
10650 if (!user_mode(regs))
10651 @@ -613,6 +628,7 @@ void __kprobes do_general_protection(str
10652
10653 current->thread.error_code = error_code;
10654 current->thread.trap_no = 13;
10655 +
10656 if (show_unhandled_signals && unhandled_signal(current, SIGSEGV) &&
10657 printk_ratelimit()) {
10658 printk(KERN_INFO
10659 @@ -642,22 +658,25 @@ gp_in_kernel:
10660 }
10661 }
10662
10663 -static __kprobes void
10664 -mem_parity_error(unsigned char reason, struct pt_regs * regs)
10665 +static notrace __kprobes void
10666 +mem_parity_error(unsigned char reason, struct pt_regs *regs)
10667 {
10668 - printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on "
10669 - "CPU %d.\n", reason, smp_processor_id());
10670 - printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
10671 + printk(KERN_EMERG
10672 + "Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
10673 + reason, smp_processor_id());
10674 +
10675 + printk(KERN_EMERG
10676 + "You have some hardware problem, likely on the PCI bus.\n");
10677
10678 #if defined(CONFIG_EDAC)
10679 - if(edac_handler_set()) {
10680 + if (edac_handler_set()) {
10681 edac_atomic_assert_error();
10682 return;
10683 }
10684 #endif
10685
10686 if (panic_on_unrecovered_nmi)
10687 - panic("NMI: Not continuing");
10688 + panic("NMI: Not continuing");
10689
10690 printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
10691
10692 @@ -665,8 +684,8 @@ mem_parity_error(unsigned char reason, s
10693 clear_mem_error(reason);
10694 }
10695
10696 -static __kprobes void
10697 -io_check_error(unsigned char reason, struct pt_regs * regs)
10698 +static notrace __kprobes void
10699 +io_check_error(unsigned char reason, struct pt_regs *regs)
10700 {
10701 printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n");
10702 show_registers(regs);
10703 @@ -675,38 +694,43 @@ io_check_error(unsigned char reason, str
10704 clear_io_check_error(reason);
10705 }
10706
10707 -static __kprobes void
10708 -unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
10709 +static notrace __kprobes void
10710 +unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
10711 {
10712 + if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
10713 + return;
10714 #ifdef CONFIG_MCA
10715 - /* Might actually be able to figure out what the guilty party
10716 - * is. */
10717 - if( MCA_bus ) {
10718 + /*
10719 + * Might actually be able to figure out what the guilty party
10720 + * is:
10721 + */
10722 + if (MCA_bus) {
10723 mca_handle_nmi();
10724 return;
10725 }
10726 #endif
10727 - printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on "
10728 - "CPU %d.\n", reason, smp_processor_id());
10729 + printk(KERN_EMERG
10730 + "Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
10731 + reason, smp_processor_id());
10732 +
10733 printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n");
10734 if (panic_on_unrecovered_nmi)
10735 - panic("NMI: Not continuing");
10736 + panic("NMI: Not continuing");
10737
10738 printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
10739 }
10740
10741 static DEFINE_SPINLOCK(nmi_print_lock);
10742
10743 -void __kprobes die_nmi(struct pt_regs *regs, const char *msg)
10744 +void notrace __kprobes die_nmi(struct pt_regs *regs, const char *msg)
10745 {
10746 - if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 2, SIGINT) ==
10747 - NOTIFY_STOP)
10748 + if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 2, SIGINT) == NOTIFY_STOP)
10749 return;
10750
10751 spin_lock(&nmi_print_lock);
10752 /*
10753 * We are in trouble anyway, lets at least try
10754 - * to get a message out.
10755 + * to get a message out:
10756 */
10757 bust_spinlocks(1);
10758 printk(KERN_EMERG "%s", msg);
10759 @@ -717,9 +741,10 @@ void __kprobes die_nmi(struct pt_regs *r
10760 spin_unlock(&nmi_print_lock);
10761 bust_spinlocks(0);
10762
10763 - /* If we are in kernel we are probably nested up pretty bad
10764 - * and might aswell get out now while we still can.
10765 - */
10766 + /*
10767 + * If we are in kernel we are probably nested up pretty bad
10768 + * and might aswell get out now while we still can:
10769 + */
10770 if (!user_mode_vm(regs)) {
10771 current->thread.trap_no = 2;
10772 crash_kexec(regs);
10773 @@ -728,14 +753,14 @@ void __kprobes die_nmi(struct pt_regs *r
10774 do_exit(SIGSEGV);
10775 }
10776
10777 -static __kprobes void default_do_nmi(struct pt_regs * regs)
10778 +static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
10779 {
10780 unsigned char reason = 0;
10781
10782 - /* Only the BSP gets external NMIs from the system. */
10783 + /* Only the BSP gets external NMIs from the system: */
10784 if (!smp_processor_id())
10785 reason = get_nmi_reason();
10786 -
10787 +
10788 if (!(reason & 0xc0)) {
10789 if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
10790 == NOTIFY_STOP)
10791 @@ -748,8 +773,10 @@ static __kprobes void default_do_nmi(str
10792 if (nmi_watchdog_tick(regs, reason))
10793 return;
10794 if (!do_nmi_callback(regs, smp_processor_id()))
10795 -#endif
10796 unknown_nmi_error(reason, regs);
10797 +#else
10798 + unknown_nmi_error(reason, regs);
10799 +#endif
10800
10801 return;
10802 }
10803 @@ -761,14 +788,14 @@ static __kprobes void default_do_nmi(str
10804 io_check_error(reason, regs);
10805 /*
10806 * Reassert NMI in case it became active meanwhile
10807 - * as it's edge-triggered.
10808 + * as it's edge-triggered:
10809 */
10810 reassert_nmi();
10811 }
10812
10813 static int ignore_nmis;
10814
10815 -__kprobes void do_nmi(struct pt_regs * regs, long error_code)
10816 +notrace __kprobes void do_nmi(struct pt_regs *regs, long error_code)
10817 {
10818 int cpu;
10819
10820 @@ -804,9 +831,12 @@ void __kprobes do_int3(struct pt_regs *r
10821 if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
10822 == NOTIFY_STOP)
10823 return;
10824 - /* This is an interrupt gate, because kprobes wants interrupts
10825 - disabled. Normal trap handlers don't. */
10826 + /*
10827 + * This is an interrupt gate, because kprobes wants interrupts
10828 + * disabled. Normal trap handlers don't.
10829 + */
10830 restore_interrupts(regs);
10831 +
10832 do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL);
10833 }
10834 #endif
10835 @@ -821,7 +851,7 @@ void __kprobes do_int3(struct pt_regs *r
10836 * from user space. Such code must not hold kernel locks (since it
10837 * can equally take a page fault), therefore it is safe to call
10838 * force_sig_info even though that claims and releases locks.
10839 - *
10840 + *
10841 * Code in ./signal.c ensures that the debug control register
10842 * is restored before we deliver any signal, and therefore that
10843 * user code runs with the correct debug control register even though
10844 @@ -833,10 +863,10 @@ void __kprobes do_int3(struct pt_regs *r
10845 * find every occurrence of the TF bit that could be saved away even
10846 * by user code)
10847 */
10848 -void __kprobes do_debug(struct pt_regs * regs, long error_code)
10849 +void __kprobes do_debug(struct pt_regs *regs, long error_code)
10850 {
10851 - unsigned int condition;
10852 struct task_struct *tsk = current;
10853 + unsigned int condition;
10854
10855 trace_hardirqs_fixup();
10856
10857 @@ -861,7 +891,7 @@ void __kprobes do_debug(struct pt_regs *
10858 goto clear_dr7;
10859 }
10860
10861 - if (regs->flags & VM_MASK)
10862 + if (regs->flags & X86_VM_MASK)
10863 goto debug_vm86;
10864
10865 /* Save debug status register where ptrace can see it */
10866 @@ -884,7 +914,8 @@ void __kprobes do_debug(struct pt_regs *
10867 /* Ok, finally something we can handle */
10868 send_sigtrap(tsk, regs, error_code);
10869
10870 - /* Disable additional traps. They'll be re-enabled when
10871 + /*
10872 + * Disable additional traps. They'll be re-enabled when
10873 * the signal is delivered.
10874 */
10875 clear_dr7:
10876 @@ -897,7 +928,7 @@ debug_vm86:
10877
10878 clear_TF_reenable:
10879 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
10880 - regs->flags &= ~TF_MASK;
10881 + regs->flags &= ~X86_EFLAGS_TF;
10882 return;
10883 }
10884
10885 @@ -908,9 +939,10 @@ clear_TF_reenable:
10886 */
10887 void math_error(void __user *ip)
10888 {
10889 - struct task_struct * task;
10890 + struct task_struct *task;
10891 + unsigned short cwd;
10892 + unsigned short swd;
10893 siginfo_t info;
10894 - unsigned short cwd, swd;
10895
10896 /*
10897 * Save the info for the exception handler and clear the error.
10898 @@ -936,36 +968,36 @@ void math_error(void __user *ip)
10899 cwd = get_fpu_cwd(task);
10900 swd = get_fpu_swd(task);
10901 switch (swd & ~cwd & 0x3f) {
10902 - case 0x000: /* No unmasked exception */
10903 - return;
10904 - default: /* Multiple exceptions */
10905 - break;
10906 - case 0x001: /* Invalid Op */
10907 - /*
10908 - * swd & 0x240 == 0x040: Stack Underflow
10909 - * swd & 0x240 == 0x240: Stack Overflow
10910 - * User must clear the SF bit (0x40) if set
10911 - */
10912 - info.si_code = FPE_FLTINV;
10913 - break;
10914 - case 0x002: /* Denormalize */
10915 - case 0x010: /* Underflow */
10916 - info.si_code = FPE_FLTUND;
10917 - break;
10918 - case 0x004: /* Zero Divide */
10919 - info.si_code = FPE_FLTDIV;
10920 - break;
10921 - case 0x008: /* Overflow */
10922 - info.si_code = FPE_FLTOVF;
10923 - break;
10924 - case 0x020: /* Precision */
10925 - info.si_code = FPE_FLTRES;
10926 - break;
10927 + case 0x000: /* No unmasked exception */
10928 + return;
10929 + default: /* Multiple exceptions */
10930 + break;
10931 + case 0x001: /* Invalid Op */
10932 + /*
10933 + * swd & 0x240 == 0x040: Stack Underflow
10934 + * swd & 0x240 == 0x240: Stack Overflow
10935 + * User must clear the SF bit (0x40) if set
10936 + */
10937 + info.si_code = FPE_FLTINV;
10938 + break;
10939 + case 0x002: /* Denormalize */
10940 + case 0x010: /* Underflow */
10941 + info.si_code = FPE_FLTUND;
10942 + break;
10943 + case 0x004: /* Zero Divide */
10944 + info.si_code = FPE_FLTDIV;
10945 + break;
10946 + case 0x008: /* Overflow */
10947 + info.si_code = FPE_FLTOVF;
10948 + break;
10949 + case 0x020: /* Precision */
10950 + info.si_code = FPE_FLTRES;
10951 + break;
10952 }
10953 force_sig_info(SIGFPE, &info, task);
10954 }
10955
10956 -void do_coprocessor_error(struct pt_regs * regs, long error_code)
10957 +void do_coprocessor_error(struct pt_regs *regs, long error_code)
10958 {
10959 ignore_fpu_irq = 1;
10960 math_error((void __user *)regs->ip);
10961 @@ -973,9 +1005,9 @@ void do_coprocessor_error(struct pt_regs
10962
10963 static void simd_math_error(void __user *ip)
10964 {
10965 - struct task_struct * task;
10966 - siginfo_t info;
10967 + struct task_struct *task;
10968 unsigned short mxcsr;
10969 + siginfo_t info;
10970
10971 /*
10972 * Save the info for the exception handler and clear the error.
10973 @@ -996,84 +1028,82 @@ static void simd_math_error(void __user
10974 */
10975 mxcsr = get_fpu_mxcsr(task);
10976 switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
10977 - case 0x000:
10978 - default:
10979 - break;
10980 - case 0x001: /* Invalid Op */
10981 - info.si_code = FPE_FLTINV;
10982 - break;
10983 - case 0x002: /* Denormalize */
10984 - case 0x010: /* Underflow */
10985 - info.si_code = FPE_FLTUND;
10986 - break;
10987 - case 0x004: /* Zero Divide */
10988 - info.si_code = FPE_FLTDIV;
10989 - break;
10990 - case 0x008: /* Overflow */
10991 - info.si_code = FPE_FLTOVF;
10992 - break;
10993 - case 0x020: /* Precision */
10994 - info.si_code = FPE_FLTRES;
10995 - break;
10996 + case 0x000:
10997 + default:
10998 + break;
10999 + case 0x001: /* Invalid Op */
11000 + info.si_code = FPE_FLTINV;
11001 + break;
11002 + case 0x002: /* Denormalize */
11003 + case 0x010: /* Underflow */
11004 + info.si_code = FPE_FLTUND;
11005 + break;
11006 + case 0x004: /* Zero Divide */
11007 + info.si_code = FPE_FLTDIV;
11008 + break;
11009 + case 0x008: /* Overflow */
11010 + info.si_code = FPE_FLTOVF;
11011 + break;
11012 + case 0x020: /* Precision */
11013 + info.si_code = FPE_FLTRES;
11014 + break;
11015 }
11016 force_sig_info(SIGFPE, &info, task);
11017 }
11018
11019 -void do_simd_coprocessor_error(struct pt_regs * regs,
11020 - long error_code)
11021 +void do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
11022 {
11023 if (cpu_has_xmm) {
11024 /* Handle SIMD FPU exceptions on PIII+ processors. */
11025 ignore_fpu_irq = 1;
11026 simd_math_error((void __user *)regs->ip);
11027 - } else {
11028 - /*
11029 - * Handle strange cache flush from user space exception
11030 - * in all other cases. This is undocumented behaviour.
11031 - */
11032 - if (regs->flags & VM_MASK) {
11033 - handle_vm86_fault((struct kernel_vm86_regs *)regs,
11034 - error_code);
11035 - return;
11036 - }
11037 - current->thread.trap_no = 19;
11038 - current->thread.error_code = error_code;
11039 - die_if_kernel("cache flush denied", regs, error_code);
11040 - force_sig(SIGSEGV, current);
11041 + return;
11042 + }
11043 + /*
11044 + * Handle strange cache flush from user space exception
11045 + * in all other cases. This is undocumented behaviour.
11046 + */
11047 + if (regs->flags & X86_VM_MASK) {
11048 + handle_vm86_fault((struct kernel_vm86_regs *)regs, error_code);
11049 + return;
11050 }
11051 + current->thread.trap_no = 19;
11052 + current->thread.error_code = error_code;
11053 + die_if_kernel("cache flush denied", regs, error_code);
11054 + force_sig(SIGSEGV, current);
11055 }
11056
11057 #ifndef CONFIG_XEN
11058 -void do_spurious_interrupt_bug(struct pt_regs * regs,
11059 - long error_code)
11060 +void do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
11061 {
11062 #if 0
11063 /* No need to warn about this any longer. */
11064 - printk("Ignoring P6 Local APIC Spurious Interrupt Bug...\n");
11065 + printk(KERN_INFO "Ignoring P6 Local APIC Spurious Interrupt Bug...\n");
11066 #endif
11067 }
11068
11069 -unsigned long patch_espfix_desc(unsigned long uesp,
11070 - unsigned long kesp)
11071 +unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp)
11072 {
11073 struct desc_struct *gdt = __get_cpu_var(gdt_page).gdt;
11074 unsigned long base = (kesp - uesp) & -THREAD_SIZE;
11075 unsigned long new_kesp = kesp - base;
11076 unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT;
11077 __u64 desc = *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS];
11078 +
11079 /* Set up base for espfix segment */
11080 - desc &= 0x00f0ff0000000000ULL;
11081 - desc |= ((((__u64)base) << 16) & 0x000000ffffff0000ULL) |
11082 + desc &= 0x00f0ff0000000000ULL;
11083 + desc |= ((((__u64)base) << 16) & 0x000000ffffff0000ULL) |
11084 ((((__u64)base) << 32) & 0xff00000000000000ULL) |
11085 ((((__u64)lim_pages) << 32) & 0x000f000000000000ULL) |
11086 (lim_pages & 0xffff);
11087 *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS] = desc;
11088 +
11089 return new_kesp;
11090 }
11091 #endif
11092
11093 /*
11094 - * 'math_state_restore()' saves the current math information in the
11095 + * 'math_state_restore()' saves the current math information in the
11096 * old math state array, and gets the new ones from the current task
11097 *
11098 * Careful.. There are problems with IBM-designed IRQ13 behaviour.
11099 @@ -1087,9 +1117,22 @@ asmlinkage void math_state_restore(void)
11100 struct thread_info *thread = current_thread_info();
11101 struct task_struct *tsk = thread->task;
11102
11103 + if (!tsk_used_math(tsk)) {
11104 + local_irq_enable();
11105 + /*
11106 + * does a slab alloc which can sleep
11107 + */
11108 + if (init_fpu(tsk)) {
11109 + /*
11110 + * ran out of memory!
11111 + */
11112 + do_group_exit(SIGKILL);
11113 + return;
11114 + }
11115 + local_irq_disable();
11116 + }
11117 +
11118 /* NB. 'clts' is done for us by Xen during virtual trap. */
11119 - if (!tsk_used_math(tsk))
11120 - init_fpu(tsk);
11121 restore_fpu(tsk);
11122 thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */
11123 tsk->fpu_counter++;
11124 @@ -1100,15 +1143,15 @@ EXPORT_SYMBOL_GPL(math_state_restore);
11125
11126 asmlinkage void math_emulate(long arg)
11127 {
11128 - printk(KERN_EMERG "math-emulation not enabled and no coprocessor found.\n");
11129 - printk(KERN_EMERG "killing %s.\n",current->comm);
11130 - force_sig(SIGFPE,current);
11131 + printk(KERN_EMERG
11132 + "math-emulation not enabled and no coprocessor found.\n");
11133 + printk(KERN_EMERG "killing %s.\n", current->comm);
11134 + force_sig(SIGFPE, current);
11135 schedule();
11136 }
11137
11138 #endif /* CONFIG_MATH_EMULATION */
11139
11140 -
11141 /*
11142 * NB. All these are "trap gates" (i.e. events_mask isn't set) except
11143 * for those that specify <dpl>|4 in the second field.
11144 @@ -1146,25 +1189,21 @@ void __init trap_init(void)
11145 if (ret)
11146 printk("HYPERVISOR_set_trap_table failed: error %d\n", ret);
11147
11148 - /*
11149 - * Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned.
11150 - * Generate a build-time error if the alignment is wrong.
11151 - */
11152 - BUILD_BUG_ON(offsetof(struct task_struct, thread.i387.fxsave) & 15);
11153 if (cpu_has_fxsr) {
11154 printk(KERN_INFO "Enabling fast FPU save and restore... ");
11155 set_in_cr4(X86_CR4_OSFXSR);
11156 printk("done.\n");
11157 }
11158 if (cpu_has_xmm) {
11159 - printk(KERN_INFO "Enabling unmasked SIMD FPU exception "
11160 - "support... ");
11161 + printk(KERN_INFO
11162 + "Enabling unmasked SIMD FPU exception support... ");
11163 set_in_cr4(X86_CR4_OSXMMEXCPT);
11164 printk("done.\n");
11165 }
11166
11167 + init_thread_xstate();
11168 /*
11169 - * Should be a barrier for any external CPU state.
11170 + * Should be a barrier for any external CPU state:
11171 */
11172 cpu_init();
11173 }
11174 @@ -1183,6 +1222,7 @@ void __cpuinit smp_trap_init(trap_info_t
11175 static int __init kstack_setup(char *s)
11176 {
11177 kstack_depth_to_print = simple_strtoul(s, NULL, 0);
11178 +
11179 return 1;
11180 }
11181 __setup("kstack=", kstack_setup);
11182 --- sle11-2009-10-16.orig/arch/x86/kernel/traps_64-xen.c 2009-03-16 16:33:40.000000000 +0100
11183 +++ sle11-2009-10-16/arch/x86/kernel/traps_64-xen.c 2009-03-16 16:38:05.000000000 +0100
11184 @@ -33,6 +33,8 @@
11185 #include <linux/kdebug.h>
11186 #include <linux/utsname.h>
11187
11188 +#include <mach_traps.h>
11189 +
11190 #if defined(CONFIG_EDAC)
11191 #include <linux/edac.h>
11192 #endif
11193 @@ -601,10 +603,16 @@ void die(const char * str, struct pt_reg
11194 }
11195
11196 #if defined(CONFIG_X86_LOCAL_APIC) || defined(CONFIG_SYSCTL)
11197 -void __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic)
11198 +notrace __kprobes void
11199 +die_nmi(char *str, struct pt_regs *regs, int do_panic)
11200 {
11201 - unsigned long flags = oops_begin();
11202 + unsigned long flags;
11203 +
11204 + if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) ==
11205 + NOTIFY_STOP)
11206 + return;
11207
11208 + flags = oops_begin();
11209 /*
11210 * We are in trouble anyway, lets at least try
11211 * to get a message out.
11212 @@ -769,7 +777,7 @@ asmlinkage void __kprobes do_general_pro
11213 die("general protection fault", regs, error_code);
11214 }
11215
11216 -static __kprobes void
11217 +static notrace __kprobes void
11218 mem_parity_error(unsigned char reason, struct pt_regs * regs)
11219 {
11220 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
11221 @@ -792,7 +800,7 @@ mem_parity_error(unsigned char reason, s
11222 clear_mem_error(reason);
11223 }
11224
11225 -static __kprobes void
11226 +static notrace __kprobes void
11227 io_check_error(unsigned char reason, struct pt_regs * regs)
11228 {
11229 printk("NMI: IOCK error (debug interrupt?)\n");
11230 @@ -802,9 +810,11 @@ io_check_error(unsigned char reason, str
11231 clear_io_check_error(reason);
11232 }
11233
11234 -static __kprobes void
11235 +static notrace __kprobes void
11236 unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
11237 {
11238 + if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
11239 + return;
11240 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
11241 reason);
11242 printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n");
11243 @@ -817,7 +827,7 @@ unknown_nmi_error(unsigned char reason,
11244
11245 /* Runs on IST stack. This code must keep interrupts off all the time.
11246 Nested NMIs are prevented by the CPU. */
11247 -asmlinkage __kprobes void default_do_nmi(struct pt_regs *regs)
11248 +asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs)
11249 {
11250 unsigned char reason = 0;
11251 int cpu;
11252 @@ -1117,11 +1127,25 @@ asmlinkage void __attribute__((weak)) mc
11253 asmlinkage void math_state_restore(void)
11254 {
11255 struct task_struct *me = current;
11256 +
11257 + if (!used_math()) {
11258 + local_irq_enable();
11259 + /*
11260 + * does a slab alloc which can sleep
11261 + */
11262 + if (init_fpu(me)) {
11263 + /*
11264 + * ran out of memory!
11265 + */
11266 + do_group_exit(SIGKILL);
11267 + return;
11268 + }
11269 + local_irq_disable();
11270 + }
11271 +
11272 /* clts(); */ /* 'clts' is done for us by Xen during virtual trap. */
11273
11274 - if (!used_math())
11275 - init_fpu(me);
11276 - restore_fpu_checking(&me->thread.i387.fxsave);
11277 + restore_fpu_checking(&me->thread.xstate->fxsave);
11278 task_thread_info(me)->status |= TS_USEDFPU;
11279 me->fpu_counter++;
11280 }
11281 @@ -1168,6 +1192,10 @@ void __init trap_init(void)
11282 printk("HYPERVISOR_set_trap_table failed: error %d\n", ret);
11283
11284 /*
11285 + * initialize the per thread extended state:
11286 + */
11287 + init_thread_xstate();
11288 + /*
11289 * Should be a barrier for any external CPU state.
11290 */
11291 cpu_init();
11292 --- sle11-2009-10-16.orig/arch/x86/kernel/vsyscall_64-xen.c 2009-03-16 16:33:40.000000000 +0100
11293 +++ sle11-2009-10-16/arch/x86/kernel/vsyscall_64-xen.c 2009-03-16 16:38:05.000000000 +0100
11294 @@ -216,7 +216,7 @@ vgetcpu(unsigned *cpu, unsigned *node, s
11295 return 0;
11296 }
11297
11298 -long __vsyscall(3) venosys_1(void)
11299 +static long __vsyscall(3) venosys_1(void)
11300 {
11301 return -ENOSYS;
11302 }
11303 --- sle11-2009-10-16.orig/arch/x86/mm/fault-xen.c 2009-03-16 16:33:40.000000000 +0100
11304 +++ sle11-2009-10-16/arch/x86/mm/fault-xen.c 2009-03-16 16:38:05.000000000 +0100
11305 @@ -510,6 +510,11 @@ static int vmalloc_fault(unsigned long a
11306 unsigned long pgd_paddr;
11307 pmd_t *pmd_k;
11308 pte_t *pte_k;
11309 +
11310 + /* Make sure we are in vmalloc area */
11311 + if (!(address >= VMALLOC_START && address < VMALLOC_END))
11312 + return -1;
11313 +
11314 /*
11315 * Synchronize this task's top level page-table
11316 * with the 'reference' page table.
11317 @@ -670,7 +675,7 @@ void __kprobes do_page_fault(struct pt_r
11318 #ifdef CONFIG_X86_32
11319 /* It's safe to allow irq's after cr2 has been saved and the vmalloc
11320 fault has been handled. */
11321 - if (regs->flags & (X86_EFLAGS_IF|VM_MASK))
11322 + if (regs->flags & (X86_EFLAGS_IF | X86_VM_MASK))
11323 local_irq_enable();
11324
11325 /*
11326 @@ -1017,9 +1022,5 @@ void vmalloc_sync_all(void)
11327 if (address == start)
11328 start = address + PGDIR_SIZE;
11329 }
11330 - /* Check that there is no need to do the same for the modules area. */
11331 - BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
11332 - BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
11333 - (__START_KERNEL & PGDIR_MASK)));
11334 #endif
11335 }
11336 --- sle11-2009-10-16.orig/arch/x86/mm/highmem_32-xen.c 2009-03-16 16:33:40.000000000 +0100
11337 +++ sle11-2009-10-16/arch/x86/mm/highmem_32-xen.c 2009-03-16 16:38:05.000000000 +0100
11338 @@ -200,6 +200,8 @@ EXPORT_SYMBOL(kmap);
11339 EXPORT_SYMBOL(kunmap);
11340 EXPORT_SYMBOL(kmap_atomic);
11341 EXPORT_SYMBOL(kunmap_atomic);
11342 +#ifdef CONFIG_HIGHPTE
11343 EXPORT_SYMBOL(kmap_atomic_to_page);
11344 +#endif
11345 EXPORT_SYMBOL(clear_highpage);
11346 EXPORT_SYMBOL(copy_highpage);
11347 --- sle11-2009-10-16.orig/arch/x86/mm/init_32-xen.c 2009-03-16 16:33:40.000000000 +0100
11348 +++ sle11-2009-10-16/arch/x86/mm/init_32-xen.c 2009-03-16 16:38:05.000000000 +0100
11349 @@ -1,5 +1,4 @@
11350 /*
11351 - * linux/arch/i386/mm/init.c
11352 *
11353 * Copyright (C) 1995 Linus Torvalds
11354 *
11355 @@ -22,6 +21,7 @@
11356 #include <linux/init.h>
11357 #include <linux/highmem.h>
11358 #include <linux/pagemap.h>
11359 +#include <linux/pci.h>
11360 #include <linux/pfn.h>
11361 #include <linux/poison.h>
11362 #include <linux/bootmem.h>
11363 @@ -54,6 +54,8 @@
11364
11365 unsigned int __VMALLOC_RESERVE = 128 << 20;
11366
11367 +unsigned long max_pfn_mapped;
11368 +
11369 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
11370 unsigned long highstart_pfn, highend_pfn;
11371
11372 @@ -73,7 +75,7 @@ static pmd_t * __init one_md_table_init(
11373 if (!(__pgd_val(*pgd) & _PAGE_PRESENT)) {
11374 pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
11375
11376 - paravirt_alloc_pd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
11377 + paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
11378 make_lowmem_page_readonly(pmd_table, XENFEAT_writable_page_tables);
11379 set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
11380 pud = pud_offset(pgd, 0);
11381 @@ -107,7 +109,7 @@ static pte_t * __init one_page_table_ini
11382 (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE);
11383 }
11384
11385 - paravirt_alloc_pt(&init_mm, __pa(page_table) >> PAGE_SHIFT);
11386 + paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT);
11387 make_lowmem_page_readonly(page_table,
11388 XENFEAT_writable_page_tables);
11389 set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
11390 @@ -209,8 +211,13 @@ static void __init kernel_physical_mappi
11391 /*
11392 * Map with big pages if possible, otherwise
11393 * create normal page tables:
11394 + *
11395 + * Don't use a large page for the first 2/4MB of memory
11396 + * because there are often fixed size MTRRs in there
11397 + * and overlapping MTRRs into large pages can cause
11398 + * slowdowns.
11399 */
11400 - if (cpu_has_pse) {
11401 + if (cpu_has_pse && !(pgd_idx == 0 && pmd_idx == 0)) {
11402 unsigned int addr2;
11403 pgprot_t prot = PAGE_KERNEL_LARGE;
11404
11405 @@ -224,6 +231,7 @@ static void __init kernel_physical_mappi
11406 set_pmd(pmd, pfn_pmd(pfn, prot));
11407
11408 pfn += PTRS_PER_PTE;
11409 + max_pfn_mapped = pfn;
11410 continue;
11411 }
11412 pte = one_page_table_init(pmd);
11413 @@ -241,6 +249,7 @@ static void __init kernel_physical_mappi
11414
11415 set_pte(pte, pfn_pte(pfn, prot));
11416 }
11417 + max_pfn_mapped = pfn;
11418 pte_ofs = 0;
11419 }
11420 pmd_idx = 0;
11421 @@ -262,6 +271,25 @@ static inline int page_kills_ppro(unsign
11422
11423 #endif
11424
11425 +/*
11426 + * devmem_is_allowed() checks to see if /dev/mem access to a certain address
11427 + * is valid. The argument is a physical page number.
11428 + *
11429 + *
11430 + * On x86, access has to be given to the first megabyte of ram because that area
11431 + * contains bios code and data regions used by X and dosemu and similar apps.
11432 + * Access has to be given to non-kernel-ram areas as well, these contain the PCI
11433 + * mmio resources as well as potential bios/acpi data regions.
11434 + */
11435 +int devmem_is_allowed(unsigned long pagenr)
11436 +{
11437 + if (pagenr <= 256)
11438 + return 1;
11439 + if (mfn_to_local_pfn(pagenr) >= max_pfn)
11440 + return 1;
11441 + return 0;
11442 +}
11443 +
11444 #ifdef CONFIG_HIGHMEM
11445 pte_t *kmap_pte;
11446 pgprot_t kmap_prot;
11447 @@ -303,48 +331,18 @@ static void __init permanent_kmaps_init(
11448 pkmap_page_table = pte;
11449 }
11450
11451 -static void __meminit free_new_highpage(struct page *page, int pfn)
11452 -{
11453 - init_page_count(page);
11454 - if (pfn < xen_start_info->nr_pages)
11455 - __free_page(page);
11456 - totalhigh_pages++;
11457 -}
11458 -
11459 void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro)
11460 {
11461 if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) {
11462 ClearPageReserved(page);
11463 - free_new_highpage(page, pfn);
11464 + init_page_count(page);
11465 + if (pfn < xen_start_info->nr_pages)
11466 + __free_page(page);
11467 + totalhigh_pages++;
11468 } else
11469 SetPageReserved(page);
11470 }
11471
11472 -static int __meminit
11473 -add_one_highpage_hotplug(struct page *page, unsigned long pfn)
11474 -{
11475 - free_new_highpage(page, pfn);
11476 - totalram_pages++;
11477 -#ifdef CONFIG_FLATMEM
11478 - max_mapnr = max(pfn, max_mapnr);
11479 -#endif
11480 - num_physpages++;
11481 -
11482 - return 0;
11483 -}
11484 -
11485 -/*
11486 - * Not currently handling the NUMA case.
11487 - * Assuming single node and all memory that
11488 - * has been added dynamically that would be
11489 - * onlined here is in HIGHMEM.
11490 - */
11491 -void __meminit online_page(struct page *page)
11492 -{
11493 - ClearPageReserved(page);
11494 - add_one_highpage_hotplug(page, page_to_pfn(page));
11495 -}
11496 -
11497 #ifndef CONFIG_NUMA
11498 static void __init set_highmem_pages_init(int bad_ppro)
11499 {
11500 @@ -459,15 +457,13 @@ void zap_low_mappings(void)
11501 {
11502 int i;
11503
11504 - save_pg_dir();
11505 -
11506 /*
11507 * Zap initial low-memory mappings.
11508 *
11509 * Note that "pgd_clear()" doesn't do it for
11510 * us, because pgd_clear() is a no-op on i386.
11511 */
11512 - for (i = 0; i < USER_PTRS_PER_PGD; i++) {
11513 + for (i = 0; i < KERNEL_PGD_BOUNDARY; i++) {
11514 #if defined(CONFIG_X86_PAE) && !defined(CONFIG_XEN)
11515 set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page)));
11516 #else
11517 @@ -572,9 +568,9 @@ void __init paging_init(void)
11518
11519 /*
11520 * Test if the WP bit works in supervisor mode. It isn't supported on 386's
11521 - * and also on some strange 486's (NexGen etc.). All 586+'s are OK. This
11522 - * used to involve black magic jumps to work around some nasty CPU bugs,
11523 - * but fortunately the switch to using exceptions got rid of all that.
11524 + * and also on some strange 486's. All 586+'s are OK. This used to involve
11525 + * black magic jumps to work around some nasty CPU bugs, but fortunately the
11526 + * switch to using exceptions got rid of all that.
11527 */
11528 static void __init test_wp_bit(void)
11529 {
11530 @@ -605,9 +601,7 @@ void __init mem_init(void)
11531 int tmp, bad_ppro;
11532 unsigned long pfn;
11533
11534 -#if defined(CONFIG_SWIOTLB)
11535 - swiotlb_init();
11536 -#endif
11537 + pci_iommu_alloc();
11538
11539 #ifdef CONFIG_FLATMEM
11540 BUG_ON(!mem_map);
11541 @@ -710,16 +704,8 @@ void __init mem_init(void)
11542 test_wp_bit();
11543
11544 cpa_init();
11545 -
11546 - /*
11547 - * Subtle. SMP is doing it's boot stuff late (because it has to
11548 - * fork idle threads) - but it also needs low mappings for the
11549 - * protected-mode entry to work. We zap these entries only after
11550 - * the WP-bit has been tested.
11551 - */
11552 -#ifndef CONFIG_SMP
11553 + save_pg_dir();
11554 zap_low_mappings();
11555 -#endif
11556
11557 SetPagePinned(virt_to_page(init_mm.pgd));
11558 }
11559 @@ -769,25 +755,17 @@ void mark_rodata_ro(void)
11560 unsigned long start = PFN_ALIGN(_text);
11561 unsigned long size = PFN_ALIGN(_etext) - start;
11562
11563 -#ifndef CONFIG_KPROBES
11564 -#ifdef CONFIG_HOTPLUG_CPU
11565 - /* It must still be possible to apply SMP alternatives. */
11566 - if (num_possible_cpus() <= 1)
11567 -#endif
11568 - {
11569 - set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
11570 - printk(KERN_INFO "Write protecting the kernel text: %luk\n",
11571 - size >> 10);
11572 + set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
11573 + printk(KERN_INFO "Write protecting the kernel text: %luk\n",
11574 + size >> 10);
11575
11576 #ifdef CONFIG_CPA_DEBUG
11577 - printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n",
11578 - start, start+size);
11579 - set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT);
11580 + printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n",
11581 + start, start+size);
11582 + set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT);
11583
11584 - printk(KERN_INFO "Testing CPA: write protecting again\n");
11585 - set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
11586 -#endif
11587 - }
11588 + printk(KERN_INFO "Testing CPA: write protecting again\n");
11589 + set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
11590 #endif
11591 start += size;
11592 size = (unsigned long)__end_rodata - start;
11593 --- sle11-2009-10-16.orig/arch/x86/mm/init_64-xen.c 2009-03-16 16:33:40.000000000 +0100
11594 +++ sle11-2009-10-16/arch/x86/mm/init_64-xen.c 2009-03-16 16:38:05.000000000 +0100
11595 @@ -52,9 +52,6 @@
11596
11597 #include <xen/features.h>
11598
11599 -const struct dma_mapping_ops *dma_ops;
11600 -EXPORT_SYMBOL(dma_ops);
11601 -
11602 #if CONFIG_XEN_COMPAT <= 0x030002
11603 unsigned int __kernel_page_user;
11604 EXPORT_SYMBOL(__kernel_page_user);
11605 @@ -68,6 +65,28 @@ extern unsigned long start_pfn;
11606 extern pmd_t level2_fixmap_pgt[PTRS_PER_PMD];
11607 extern pte_t level1_fixmap_pgt[PTRS_PER_PTE];
11608
11609 +#ifndef CONFIG_XEN
11610 +int direct_gbpages __meminitdata
11611 +#ifdef CONFIG_DIRECT_GBPAGES
11612 + = 1
11613 +#endif
11614 +;
11615 +
11616 +static int __init parse_direct_gbpages_off(char *arg)
11617 +{
11618 + direct_gbpages = 0;
11619 + return 0;
11620 +}
11621 +early_param("nogbpages", parse_direct_gbpages_off);
11622 +
11623 +static int __init parse_direct_gbpages_on(char *arg)
11624 +{
11625 + direct_gbpages = 1;
11626 + return 0;
11627 +}
11628 +early_param("gbpages", parse_direct_gbpages_on);
11629 +#endif
11630 +
11631 /*
11632 * Use this until direct mapping is established, i.e. before __va() is
11633 * available in init_memory_mapping().
11634 @@ -135,9 +154,6 @@ void show_mem(void)
11635
11636 printk(KERN_INFO "Mem-info:\n");
11637 show_free_areas();
11638 - printk(KERN_INFO "Free swap: %6ldkB\n",
11639 - nr_swap_pages << (PAGE_SHIFT-10));
11640 -
11641 for_each_online_pgdat(pgdat) {
11642 for (i = 0; i < pgdat->node_spanned_pages; ++i) {
11643 /*
11644 @@ -328,7 +344,7 @@ void __init cleanup_highmap(void)
11645 pmd_t *last_pmd = pmd + PTRS_PER_PMD;
11646
11647 for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) {
11648 - if (!pmd_present(*pmd))
11649 + if (pmd_none(*pmd))
11650 continue;
11651 if (vaddr < (unsigned long) _text || vaddr > end)
11652 set_pmd(pmd, __pmd(0));
11653 @@ -337,8 +353,7 @@ void __init cleanup_highmap(void)
11654 #endif
11655
11656 /* NOTE: this is meant to be run only at boot */
11657 -void __init
11658 -__set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
11659 +void __init __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
11660 {
11661 unsigned long address = __fix_to_virt(idx);
11662
11663 @@ -463,7 +478,7 @@ __meminit void early_iounmap(void *addr,
11664 }
11665 #endif
11666
11667 -static void __meminit
11668 +static unsigned long __meminit
11669 phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
11670 {
11671 int i = pmd_index(address);
11672 @@ -503,21 +518,26 @@ phys_pmd_init(pmd_t *pmd_page, unsigned
11673 set_pmd(pmd, __pmd(pte_phys | _KERNPG_TABLE));
11674 }
11675 }
11676 + return address;
11677 }
11678
11679 -static void __meminit
11680 +static unsigned long __meminit
11681 phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
11682 {
11683 pmd_t *pmd = pmd_offset(pud, 0);
11684 + unsigned long last_map_addr;
11685 +
11686 spin_lock(&init_mm.page_table_lock);
11687 - phys_pmd_init(pmd, address, end);
11688 + last_map_addr = phys_pmd_init(pmd, address, end);
11689 spin_unlock(&init_mm.page_table_lock);
11690 __flush_tlb_all();
11691 + return last_map_addr;
11692 }
11693
11694 -static void __meminit
11695 +static unsigned long __meminit
11696 phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
11697 {
11698 + unsigned long last_map_addr = end;
11699 int i = pud_index(addr);
11700
11701 for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) {
11702 @@ -529,7 +549,15 @@ phys_pud_init(pud_t *pud_page, unsigned
11703 break;
11704
11705 if (__pud_val(*pud)) {
11706 - phys_pmd_update(pud, addr, end);
11707 + if (!pud_large(*pud))
11708 + last_map_addr = phys_pmd_update(pud, addr, end);
11709 + continue;
11710 + }
11711 +
11712 + if (direct_gbpages) {
11713 + set_pte((pte_t *)pud,
11714 + pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
11715 + last_map_addr = (addr & PUD_MASK) + PUD_SIZE;
11716 continue;
11717 }
11718
11719 @@ -537,12 +565,14 @@ phys_pud_init(pud_t *pud_page, unsigned
11720
11721 spin_lock(&init_mm.page_table_lock);
11722 *pud = __pud(pmd_phys | _KERNPG_TABLE);
11723 - phys_pmd_init(pmd, addr, end);
11724 + last_map_addr = phys_pmd_init(pmd, addr, end);
11725 spin_unlock(&init_mm.page_table_lock);
11726
11727 early_make_page_readonly(pmd, XENFEAT_writable_page_tables);
11728 }
11729 __flush_tlb_all();
11730 +
11731 + return last_map_addr >> PAGE_SHIFT;
11732 }
11733
11734 void __init xen_init_pt(void)
11735 @@ -754,16 +784,138 @@ static void __init xen_finish_init_mappi
11736 table_end = start_pfn;
11737 }
11738
11739 +static void __init init_gbpages(void)
11740 +{
11741 +#ifndef CONFIG_XEN
11742 + if (direct_gbpages && cpu_has_gbpages)
11743 + printk(KERN_INFO "Using GB pages for direct mapping\n");
11744 + else
11745 + direct_gbpages = 0;
11746 +#endif
11747 +}
11748 +
11749 +#ifdef CONFIG_MEMTEST_BOOTPARAM
11750 +
11751 +static void __init memtest(unsigned long start_phys, unsigned long size,
11752 + unsigned pattern)
11753 +{
11754 + unsigned long i;
11755 + unsigned long *start;
11756 + unsigned long start_bad;
11757 + unsigned long last_bad;
11758 + unsigned long val;
11759 + unsigned long start_phys_aligned;
11760 + unsigned long count;
11761 + unsigned long incr;
11762 +
11763 + switch (pattern) {
11764 + case 0:
11765 + val = 0UL;
11766 + break;
11767 + case 1:
11768 + val = -1UL;
11769 + break;
11770 + case 2:
11771 + val = 0x5555555555555555UL;
11772 + break;
11773 + case 3:
11774 + val = 0xaaaaaaaaaaaaaaaaUL;
11775 + break;
11776 + default:
11777 + return;
11778 + }
11779 +
11780 + incr = sizeof(unsigned long);
11781 + start_phys_aligned = ALIGN(start_phys, incr);
11782 + count = (size - (start_phys_aligned - start_phys))/incr;
11783 + start = __va(start_phys_aligned);
11784 + start_bad = 0;
11785 + last_bad = 0;
11786 +
11787 + for (i = 0; i < count; i++)
11788 + start[i] = val;
11789 + for (i = 0; i < count; i++, start++, start_phys_aligned += incr) {
11790 + if (*start != val) {
11791 + if (start_phys_aligned == last_bad + incr) {
11792 + last_bad += incr;
11793 + } else {
11794 + if (start_bad) {
11795 + printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved",
11796 + val, start_bad, last_bad + incr);
11797 + reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
11798 + }
11799 + start_bad = last_bad = start_phys_aligned;
11800 + }
11801 + }
11802 + }
11803 + if (start_bad) {
11804 + printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved",
11805 + val, start_bad, last_bad + incr);
11806 + reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
11807 + }
11808 +
11809 +}
11810 +
11811 +static int memtest_pattern __initdata = CONFIG_MEMTEST_BOOTPARAM_VALUE;
11812 +
11813 +static int __init parse_memtest(char *arg)
11814 +{
11815 + if (arg)
11816 + memtest_pattern = simple_strtoul(arg, NULL, 0);
11817 + return 0;
11818 +}
11819 +
11820 +early_param("memtest", parse_memtest);
11821 +
11822 +static void __init early_memtest(unsigned long start, unsigned long end)
11823 +{
11824 + u64 t_start, t_size;
11825 + unsigned pattern;
11826 +
11827 + if (!memtest_pattern)
11828 + return;
11829 +
11830 + printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern);
11831 + for (pattern = 0; pattern < memtest_pattern; pattern++) {
11832 + t_start = start;
11833 + t_size = 0;
11834 + while (t_start < end) {
11835 + t_start = find_e820_area_size(t_start, &t_size, 1);
11836 +
11837 + /* done ? */
11838 + if (t_start >= end)
11839 + break;
11840 + if (t_start + t_size > end)
11841 + t_size = end - t_start;
11842 +
11843 + printk(KERN_CONT "\n %016llx - %016llx pattern %d",
11844 + (unsigned long long)t_start,
11845 + (unsigned long long)t_start + t_size, pattern);
11846 +
11847 + memtest(t_start, t_size, pattern);
11848 +
11849 + t_start += t_size;
11850 + }
11851 + }
11852 + printk(KERN_CONT "\n");
11853 +}
11854 +#else
11855 +static void __init early_memtest(unsigned long start, unsigned long end)
11856 +{
11857 +}
11858 +#endif
11859 +
11860 /*
11861 * Setup the direct mapping of the physical memory at PAGE_OFFSET.
11862 * This runs before bootmem is initialized and gets pages directly from
11863 * the physical memory. To access them they are temporarily mapped.
11864 */
11865 -void __init_refok init_memory_mapping(unsigned long start, unsigned long end)
11866 +unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned long end)
11867 {
11868 - unsigned long next;
11869 + unsigned long next, last_map_addr = end;
11870 + unsigned long start_phys = start, end_phys = end;
11871
11872 - pr_debug("init_memory_mapping\n");
11873 + printk(KERN_INFO "init_memory_mapping\n");
11874
11875 /*
11876 * Find space for the kernel direct mapping tables.
11877 @@ -772,8 +924,10 @@ void __init_refok init_memory_mapping(un
11878 * memory mapped. Unfortunately this is done currently before the
11879 * nodes are discovered.
11880 */
11881 - if (!after_bootmem)
11882 + if (!after_bootmem) {
11883 + init_gbpages();
11884 find_early_table_space(end);
11885 + }
11886
11887 start = (unsigned long)__va(start);
11888 end = (unsigned long)__va(end);
11889 @@ -790,7 +944,7 @@ void __init_refok init_memory_mapping(un
11890 next = start + PGDIR_SIZE;
11891 if (next > end)
11892 next = end;
11893 - phys_pud_init(pud, __pa(start), __pa(next));
11894 + last_map_addr = phys_pud_init(pud, __pa(start), __pa(next));
11895 if (!after_bootmem) {
11896 early_make_page_readonly(pud, XENFEAT_writable_page_tables);
11897 set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
11898 @@ -807,6 +961,11 @@ void __init_refok init_memory_mapping(un
11899 if (!after_bootmem)
11900 reserve_early(table_start << PAGE_SHIFT,
11901 table_end << PAGE_SHIFT, "PGTABLE");
11902 +
11903 + if (!after_bootmem)
11904 + early_memtest(start_phys, end_phys);
11905 +
11906 + return last_map_addr;
11907 }
11908
11909 #ifndef CONFIG_NUMA
11910 @@ -830,15 +989,6 @@ void __init paging_init(void)
11911 /*
11912 * Memory hotplug specific functions
11913 */
11914 -void online_page(struct page *page)
11915 -{
11916 - ClearPageReserved(page);
11917 - init_page_count(page);
11918 - __free_page(page);
11919 - totalram_pages++;
11920 - num_physpages++;
11921 -}
11922 -
11923 #ifdef CONFIG_MEMORY_HOTPLUG
11924 /*
11925 * Memory is added always to NORMAL zone. This means you will never get
11926 @@ -848,11 +998,13 @@ int arch_add_memory(int nid, u64 start,
11927 {
11928 struct pglist_data *pgdat = NODE_DATA(nid);
11929 struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
11930 - unsigned long start_pfn = start >> PAGE_SHIFT;
11931 + unsigned long last_mapped_pfn, start_pfn = start >> PAGE_SHIFT;
11932 unsigned long nr_pages = size >> PAGE_SHIFT;
11933 int ret;
11934
11935 - init_memory_mapping(start, start + size-1);
11936 + last_mapped_pfn = init_memory_mapping(start, start + size-1);
11937 + if (last_mapped_pfn > max_pfn_mapped)
11938 + max_pfn_mapped = last_mapped_pfn;
11939
11940 ret = __add_pages(zone, start_pfn, nr_pages);
11941 WARN_ON(1);
11942 @@ -871,6 +1023,26 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to
11943
11944 #endif /* CONFIG_MEMORY_HOTPLUG */
11945
11946 +/*
11947 + * devmem_is_allowed() checks to see if /dev/mem access to a certain address
11948 + * is valid. The argument is a physical page number.
11949 + *
11950 + *
11951 + * On x86, access has to be given to the first megabyte of ram because that area
11952 + * contains bios code and data regions used by X and dosemu and similar apps.
11953 + * Access has to be given to non-kernel-ram areas as well, these contain the PCI
11954 + * mmio resources as well as potential bios/acpi data regions.
11955 + */
11956 +int devmem_is_allowed(unsigned long pagenr)
11957 +{
11958 + if (pagenr <= 256)
11959 + return 1;
11960 + if (mfn_to_local_pfn(pagenr) >= max_pfn)
11961 + return 1;
11962 + return 0;
11963 +}
11964 +
11965 +
11966 static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel,
11967 kcore_modules, kcore_vsyscall;
11968
11969 @@ -979,24 +1151,7 @@ EXPORT_SYMBOL_GPL(rodata_test_data);
11970
11971 void mark_rodata_ro(void)
11972 {
11973 - unsigned long start = (unsigned long)_stext, end;
11974 -
11975 -#ifdef CONFIG_HOTPLUG_CPU
11976 - /* It must still be possible to apply SMP alternatives. */
11977 - if (num_possible_cpus() > 1)
11978 - start = (unsigned long)_etext;
11979 -#endif
11980 -
11981 -#ifdef CONFIG_KPROBES
11982 - start = (unsigned long)__start_rodata;
11983 -#endif
11984 -
11985 - end = (unsigned long)__end_rodata;
11986 - start = (start + PAGE_SIZE - 1) & PAGE_MASK;
11987 - end &= PAGE_MASK;
11988 - if (end <= start)
11989 - return;
11990 -
11991 + unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata);
11992
11993 printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
11994 (end - start) >> 10);
11995 @@ -1019,6 +1174,7 @@ void mark_rodata_ro(void)
11996 set_memory_ro(start, (end-start) >> PAGE_SHIFT);
11997 #endif
11998 }
11999 +
12000 #endif
12001
12002 #ifdef CONFIG_BLK_DEV_INITRD
12003 @@ -1031,7 +1187,7 @@ void free_initrd_mem(unsigned long start
12004 void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
12005 {
12006 #ifdef CONFIG_NUMA
12007 - int nid = phys_to_nid(phys);
12008 + int nid, next_nid;
12009 #endif
12010 unsigned long pfn = phys >> PAGE_SHIFT;
12011
12012 @@ -1040,7 +1196,7 @@ void __init reserve_bootmem_generic(unsi
12013 * This can happen with kdump kernels when accessing
12014 * firmware tables:
12015 */
12016 - if (pfn < end_pfn_map)
12017 + if (pfn < max_pfn_mapped)
12018 return;
12019
12020 printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
12021 @@ -1050,10 +1206,16 @@ void __init reserve_bootmem_generic(unsi
12022
12023 /* Should check here against the e820 map to avoid double free */
12024 #ifdef CONFIG_NUMA
12025 - reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT);
12026 + nid = phys_to_nid(phys);
12027 + next_nid = phys_to_nid(phys + len - 1);
12028 + if (nid == next_nid)
12029 + reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT);
12030 + else
12031 + reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
12032 #else
12033 reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
12034 #endif
12035 +
12036 #ifndef CONFIG_XEN
12037 if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
12038 dma_reserve += len / PAGE_SIZE;
12039 @@ -1149,6 +1311,10 @@ const char *arch_vma_name(struct vm_area
12040 /*
12041 * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
12042 */
12043 +static long __meminitdata addr_start, addr_end;
12044 +static void __meminitdata *p_start, *p_end;
12045 +static int __meminitdata node_start;
12046 +
12047 int __meminit
12048 vmemmap_populate(struct page *start_page, unsigned long size, int node)
12049 {
12050 @@ -1183,12 +1349,32 @@ vmemmap_populate(struct page *start_page
12051 PAGE_KERNEL_LARGE);
12052 set_pmd(pmd, __pmd_ma(__pte_val(entry)));
12053
12054 - printk(KERN_DEBUG " [%lx-%lx] PMD ->%p on node %d\n",
12055 - addr, addr + PMD_SIZE - 1, p, node);
12056 + /* check to see if we have contiguous blocks */
12057 + if (p_end != p || node_start != node) {
12058 + if (p_start)
12059 + printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
12060 + addr_start, addr_end-1, p_start, p_end-1, node_start);
12061 + addr_start = addr;
12062 + node_start = node;
12063 + p_start = p;
12064 + }
12065 + addr_end = addr + PMD_SIZE;
12066 + p_end = p + PMD_SIZE;
12067 } else {
12068 vmemmap_verify((pte_t *)pmd, node, addr, next);
12069 }
12070 }
12071 return 0;
12072 }
12073 +
12074 +void __meminit vmemmap_populate_print_last(void)
12075 +{
12076 + if (p_start) {
12077 + printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
12078 + addr_start, addr_end-1, p_start, p_end-1, node_start);
12079 + p_start = NULL;
12080 + p_end = NULL;
12081 + node_start = 0;
12082 + }
12083 +}
12084 #endif
12085 --- sle11-2009-10-16.orig/arch/x86/mm/ioremap-xen.c 2009-03-16 16:33:40.000000000 +0100
12086 +++ sle11-2009-10-16/arch/x86/mm/ioremap-xen.c 2009-03-16 16:38:05.000000000 +0100
12087 @@ -20,14 +20,11 @@
12088 #include <asm/pgtable.h>
12089 #include <asm/tlbflush.h>
12090 #include <asm/pgalloc.h>
12091 +#include <asm/pat.h>
12092
12093 -enum ioremap_mode {
12094 - IOR_MODE_UNCACHED,
12095 - IOR_MODE_CACHED,
12096 -};
12097 -
12098 -#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
12099 +#ifdef CONFIG_X86_64
12100
12101 +#ifndef CONFIG_XEN
12102 unsigned long __phys_addr(unsigned long x)
12103 {
12104 if (x >= __START_KERNEL_map)
12105 @@ -35,6 +32,19 @@ unsigned long __phys_addr(unsigned long
12106 return x - PAGE_OFFSET;
12107 }
12108 EXPORT_SYMBOL(__phys_addr);
12109 +#endif
12110 +
12111 +static inline int phys_addr_valid(unsigned long addr)
12112 +{
12113 + return addr < (1UL << boot_cpu_data.x86_phys_bits);
12114 +}
12115 +
12116 +#else
12117 +
12118 +static inline int phys_addr_valid(unsigned long addr)
12119 +{
12120 + return 1;
12121 +}
12122
12123 #endif
12124
12125 @@ -92,7 +102,8 @@ static int __direct_remap_pfn_range(stru
12126 * Fill in the machine address: PTE ptr is done later by
12127 * apply_to_page_range().
12128 */
12129 - v->val = __pte_val(pfn_pte_ma(mfn, prot)) | _PAGE_IO;
12130 + pgprot_val(prot) |= _PAGE_IO;
12131 + v->val = __pte_val(pte_mkspecial(pfn_pte_ma(mfn, prot)));
12132
12133 mfn++;
12134 address += PAGE_SIZE;
12135 @@ -189,10 +200,9 @@ int touch_pte_range(struct mm_struct *mm
12136
12137 EXPORT_SYMBOL(touch_pte_range);
12138
12139 -#ifdef CONFIG_X86_32
12140 int page_is_ram(unsigned long pagenr)
12141 {
12142 - unsigned long addr, end;
12143 + resource_size_t addr, end;
12144 int i;
12145
12146 #ifndef CONFIG_XEN
12147 @@ -228,31 +238,51 @@ int page_is_ram(unsigned long pagenr)
12148 }
12149 return 0;
12150 }
12151 -#endif
12152
12153 /*
12154 * Fix up the linear direct mapping of the kernel to avoid cache attribute
12155 * conflicts.
12156 */
12157 static int ioremap_change_attr(unsigned long vaddr, unsigned long size,
12158 - enum ioremap_mode mode)
12159 + unsigned long prot_val)
12160 {
12161 unsigned long nrpages = size >> PAGE_SHIFT;
12162 int err;
12163
12164 - switch (mode) {
12165 - case IOR_MODE_UNCACHED:
12166 + switch (prot_val) {
12167 + case _PAGE_CACHE_UC:
12168 default:
12169 - err = set_memory_uc(vaddr, nrpages);
12170 + err = _set_memory_uc(vaddr, nrpages);
12171 + break;
12172 + case _PAGE_CACHE_WC:
12173 + err = _set_memory_wc(vaddr, nrpages);
12174 break;
12175 - case IOR_MODE_CACHED:
12176 - err = set_memory_wb(vaddr, nrpages);
12177 + case _PAGE_CACHE_WB:
12178 + err = _set_memory_wb(vaddr, nrpages);
12179 break;
12180 }
12181
12182 return err;
12183 }
12184
12185 +int ioremap_check_change_attr(unsigned long mfn, unsigned long size,
12186 + unsigned long prot_val)
12187 +{
12188 + unsigned long sz;
12189 + int rc;
12190 +
12191 + for (sz = rc = 0; sz < size && !rc; ++mfn, sz += PAGE_SIZE) {
12192 + unsigned long pfn = mfn_to_local_pfn(mfn);
12193 +
12194 + if (pfn >= max_pfn_mapped)
12195 + continue;
12196 + rc = ioremap_change_attr((unsigned long)__va(pfn << PAGE_SHIFT),
12197 + PAGE_SIZE, prot_val);
12198 + }
12199 +
12200 + return rc;
12201 +}
12202 +
12203 /*
12204 * Remap an arbitrary physical address space into the kernel virtual
12205 * address space. Needed when the kernel wants to access high addresses
12206 @@ -262,12 +292,15 @@ static int ioremap_change_attr(unsigned
12207 * have to convert them into an offset in a page-aligned mapping, but the
12208 * caller shouldn't need to know that small detail.
12209 */
12210 -static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size,
12211 - enum ioremap_mode mode)
12212 +static void __iomem *__ioremap_caller(resource_size_t phys_addr,
12213 + unsigned long size, unsigned long prot_val, void *caller)
12214 {
12215 - unsigned long mfn, offset, last_addr, vaddr;
12216 + unsigned long mfn, offset, vaddr;
12217 + resource_size_t last_addr;
12218 struct vm_struct *area;
12219 + unsigned long new_prot_val;
12220 pgprot_t prot;
12221 + int retval;
12222 domid_t domid = DOMID_IO;
12223
12224 /* Don't allow wraparound or zero size */
12225 @@ -275,6 +308,13 @@ static void __iomem *__ioremap(resource_
12226 if (!size || last_addr < phys_addr)
12227 return NULL;
12228
12229 + if (!phys_addr_valid(phys_addr)) {
12230 + printk(KERN_WARNING "ioremap: invalid physical address %llx\n",
12231 + (unsigned long long)phys_addr);
12232 + WARN_ON_ONCE(1);
12233 + return NULL;
12234 + }
12235 +
12236 /*
12237 * Don't remap the low PCI/ISA area, it's always mapped..
12238 */
12239 @@ -287,55 +327,86 @@ static void __iomem *__ioremap(resource_
12240 for (mfn = PFN_DOWN(phys_addr); mfn < PFN_UP(last_addr); mfn++) {
12241 unsigned long pfn = mfn_to_local_pfn(mfn);
12242
12243 - if (pfn >= max_pfn)
12244 - continue;
12245 + if (pfn_valid(pfn)) {
12246 + if (!PageReserved(pfn_to_page(pfn)))
12247 + return NULL;
12248 + domid = DOMID_SELF;
12249 + }
12250 + }
12251 + WARN_ON_ONCE(domid == DOMID_SELF);
12252
12253 - domid = DOMID_SELF;
12254 + /*
12255 + * Mappings have to be page-aligned
12256 + */
12257 + offset = phys_addr & ~PAGE_MASK;
12258 + phys_addr &= PAGE_MASK;
12259 + size = PAGE_ALIGN(last_addr+1) - phys_addr;
12260
12261 - if (pfn >= max_pfn_mapped) /* bogus */
12262 - continue;
12263 + retval = reserve_memtype(phys_addr, phys_addr + size,
12264 + prot_val, &new_prot_val);
12265 + if (retval) {
12266 + pr_debug("Warning: reserve_memtype returned %d\n", retval);
12267 + return NULL;
12268 + }
12269
12270 - if (pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn)))
12271 + if (prot_val != new_prot_val) {
12272 + /*
12273 + * Do not fallback to certain memory types with certain
12274 + * requested type:
12275 + * - request is uc-, return cannot be write-back
12276 + * - request is uc-, return cannot be write-combine
12277 + * - request is write-combine, return cannot be write-back
12278 + */
12279 + if ((prot_val == _PAGE_CACHE_UC_MINUS &&
12280 + (new_prot_val == _PAGE_CACHE_WB ||
12281 + new_prot_val == _PAGE_CACHE_WC)) ||
12282 + (prot_val == _PAGE_CACHE_WC &&
12283 + new_prot_val == _PAGE_CACHE_WB)) {
12284 + pr_debug(
12285 + "ioremap error for 0x%llx-0x%llx, requested 0x%lx, got 0x%lx\n",
12286 + (unsigned long long)phys_addr,
12287 + (unsigned long long)(phys_addr + size),
12288 + prot_val, new_prot_val);
12289 + free_memtype(phys_addr, phys_addr + size);
12290 return NULL;
12291 + }
12292 + prot_val = new_prot_val;
12293 }
12294
12295 - switch (mode) {
12296 - case IOR_MODE_UNCACHED:
12297 + switch (prot_val) {
12298 + case _PAGE_CACHE_UC:
12299 default:
12300 - /*
12301 - * FIXME: we will use UC MINUS for now, as video fb drivers
12302 - * depend on it. Upcoming ioremap_wc() will fix this behavior.
12303 - */
12304 + prot = PAGE_KERNEL_NOCACHE;
12305 + break;
12306 + case _PAGE_CACHE_UC_MINUS:
12307 prot = PAGE_KERNEL_UC_MINUS;
12308 break;
12309 - case IOR_MODE_CACHED:
12310 + case _PAGE_CACHE_WC:
12311 + prot = PAGE_KERNEL_WC;
12312 + break;
12313 + case _PAGE_CACHE_WB:
12314 prot = PAGE_KERNEL;
12315 break;
12316 }
12317
12318 /*
12319 - * Mappings have to be page-aligned
12320 - */
12321 - offset = phys_addr & ~PAGE_MASK;
12322 - phys_addr &= PAGE_MASK;
12323 - size = PAGE_ALIGN(last_addr+1) - phys_addr;
12324 -
12325 - /*
12326 * Ok, go for it..
12327 */
12328 - area = get_vm_area(size, VM_IOREMAP | (mode << 20));
12329 + area = get_vm_area_caller(size, VM_IOREMAP, caller);
12330 if (!area)
12331 return NULL;
12332 area->phys_addr = phys_addr;
12333 vaddr = (unsigned long) area->addr;
12334 if (__direct_remap_pfn_range(&init_mm, vaddr, PFN_DOWN(phys_addr),
12335 size, prot, domid)) {
12336 + free_memtype(phys_addr, phys_addr + size);
12337 free_vm_area(area);
12338 return NULL;
12339 }
12340
12341 - if (ioremap_change_attr(vaddr, size, mode) < 0) {
12342 - iounmap((void __iomem *) vaddr);
12343 + if (ioremap_change_attr(vaddr, size, prot_val) < 0) {
12344 + free_memtype(phys_addr, phys_addr + size);
12345 + vunmap(area->addr);
12346 return NULL;
12347 }
12348
12349 @@ -365,16 +436,72 @@ static void __iomem *__ioremap(resource_
12350 */
12351 void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size)
12352 {
12353 - return __ioremap(phys_addr, size, IOR_MODE_UNCACHED);
12354 + /*
12355 + * Ideally, this should be:
12356 + * pat_wc_enabled ? _PAGE_CACHE_UC : _PAGE_CACHE_UC_MINUS;
12357 + *
12358 + * Till we fix all X drivers to use ioremap_wc(), we will use
12359 + * UC MINUS.
12360 + */
12361 + unsigned long val = _PAGE_CACHE_UC_MINUS;
12362 +
12363 + return __ioremap_caller(phys_addr, size, val,
12364 + __builtin_return_address(0));
12365 }
12366 EXPORT_SYMBOL(ioremap_nocache);
12367
12368 +/**
12369 + * ioremap_wc - map memory into CPU space write combined
12370 + * @offset: bus address of the memory
12371 + * @size: size of the resource to map
12372 + *
12373 + * This version of ioremap ensures that the memory is marked write combining.
12374 + * Write combining allows faster writes to some hardware devices.
12375 + *
12376 + * Must be freed with iounmap.
12377 + */
12378 +void __iomem *ioremap_wc(unsigned long phys_addr, unsigned long size)
12379 +{
12380 + if (pat_wc_enabled)
12381 + return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC,
12382 + __builtin_return_address(0));
12383 + else
12384 + return ioremap_nocache(phys_addr, size);
12385 +}
12386 +EXPORT_SYMBOL(ioremap_wc);
12387 +
12388 void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
12389 {
12390 - return __ioremap(phys_addr, size, IOR_MODE_CACHED);
12391 + return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WB,
12392 + __builtin_return_address(0));
12393 }
12394 EXPORT_SYMBOL(ioremap_cache);
12395
12396 +#ifndef CONFIG_XEN
12397 +static void __iomem *ioremap_default(resource_size_t phys_addr,
12398 + unsigned long size)
12399 +{
12400 + unsigned long flags;
12401 + void *ret;
12402 + int err;
12403 +
12404 + /*
12405 + * - WB for WB-able memory and no other conflicting mappings
12406 + * - UC_MINUS for non-WB-able memory with no other conflicting mappings
12407 + * - Inherit from confliting mappings otherwise
12408 + */
12409 + err = reserve_memtype(phys_addr, phys_addr + size, -1, &flags);
12410 + if (err < 0)
12411 + return NULL;
12412 +
12413 + ret = (void *) __ioremap_caller(phys_addr, size, flags,
12414 + __builtin_return_address(0));
12415 +
12416 + free_memtype(phys_addr, phys_addr + size);
12417 + return (void __iomem *)ret;
12418 +}
12419 +#endif
12420 +
12421 /**
12422 * iounmap - Free a IO remapping
12423 * @addr: virtual address from ioremap_*
12424 @@ -417,15 +544,7 @@ void iounmap(volatile void __iomem *addr
12425 return;
12426 }
12427
12428 - if ((p->flags >> 20) != IOR_MODE_CACHED) {
12429 - unsigned long n = get_vm_area_size(p) >> PAGE_SHIFT;
12430 - unsigned long mfn = p->phys_addr;
12431 - unsigned long va = (unsigned long)addr;
12432 -
12433 - for (; n > 0; n--, mfn++, va += PAGE_SIZE)
12434 - if (mfn_to_local_pfn(mfn) < max_pfn)
12435 - set_memory_wb(va, 1);
12436 - }
12437 + free_memtype(p->phys_addr, p->phys_addr + get_vm_area_size(p));
12438
12439 /* Finally remove it */
12440 o = remove_vm_area((void *)addr);
12441 @@ -434,6 +553,37 @@ void iounmap(volatile void __iomem *addr
12442 }
12443 EXPORT_SYMBOL(iounmap);
12444
12445 +#ifndef CONFIG_XEN
12446 +/*
12447 + * Convert a physical pointer to a virtual kernel pointer for /dev/mem
12448 + * access
12449 + */
12450 +void *xlate_dev_mem_ptr(unsigned long phys)
12451 +{
12452 + void *addr;
12453 + unsigned long start = phys & PAGE_MASK;
12454 +
12455 + /* If page is RAM, we can use __va. Otherwise ioremap and unmap. */
12456 + if (page_is_ram(start >> PAGE_SHIFT))
12457 + return __va(phys);
12458 +
12459 + addr = (void *)ioremap_default(start, PAGE_SIZE);
12460 + if (addr)
12461 + addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK));
12462 +
12463 + return addr;
12464 +}
12465 +
12466 +void unxlate_dev_mem_ptr(unsigned long phys, void *addr)
12467 +{
12468 + if (page_is_ram(phys >> PAGE_SHIFT))
12469 + return;
12470 +
12471 + iounmap((void __iomem *)((unsigned long)addr & PAGE_MASK));
12472 + return;
12473 +}
12474 +#endif
12475 +
12476 int __initdata early_ioremap_debug;
12477
12478 static int __init early_ioremap_debug_setup(char *str)
12479 @@ -445,8 +595,8 @@ static int __init early_ioremap_debug_se
12480 early_param("early_ioremap_debug", early_ioremap_debug_setup);
12481
12482 static __initdata int after_paging_init;
12483 -static __initdata pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)]
12484 - __attribute__((aligned(PAGE_SIZE)));
12485 +static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)]
12486 + __section(.bss.page_aligned);
12487
12488 #ifdef CONFIG_X86_32
12489 static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
12490 @@ -461,8 +611,8 @@ static inline pmd_t * __init early_iorem
12491 }
12492 #else
12493 #define early_ioremap_pmd early_get_pmd
12494 +#undef make_lowmem_page_readonly
12495 #define make_lowmem_page_readonly early_make_page_readonly
12496 -#define make_lowmem_page_writable make_page_writable
12497 #endif
12498
12499 static inline pte_t * __init early_ioremap_pte(unsigned long addr)
12500 @@ -512,7 +662,7 @@ void __init early_ioremap_clear(void)
12501 pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
12502 pmd_clear(pmd);
12503 make_lowmem_page_writable(bm_pte, XENFEAT_writable_page_tables);
12504 - /* paravirt_release_pt(__pa(bm_pte) >> PAGE_SHIFT); */
12505 + /* paravirt_release_pte(__pa(bm_pte) >> PAGE_SHIFT); */
12506 __flush_tlb_all();
12507 }
12508
12509 @@ -654,10 +804,11 @@ void __init early_iounmap(void *addr, un
12510 unsigned long offset;
12511 unsigned int nrpages;
12512 enum fixed_addresses idx;
12513 - unsigned int nesting;
12514 + int nesting;
12515
12516 nesting = --early_ioremap_nested;
12517 - WARN_ON(nesting < 0);
12518 + if (WARN_ON(nesting < 0))
12519 + return;
12520
12521 if (early_ioremap_debug) {
12522 printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr,
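Editorial note (illustration only, not part of the patch): the ioremap_wc() helper added in the ioremap hunks above lets a driver ask for a write-combined MMIO mapping without caring whether PAT is usable; with pat_wc_enabled clear it quietly falls back to the UC- mapping from ioremap_nocache(). A minimal, hedged sketch of a caller -- fb_base and fb_len are hypothetical resource values for an aperture, not names from this patch:

	void __iomem *fb;

	/* fb_base/fb_len: assumed bus address and length of an MMIO aperture. */
	fb = ioremap_wc(fb_base, fb_len);
	if (!fb)
		return -ENOMEM;
	/* ... streaming writes to the aperture may now be write-combined ... */
	iounmap(fb);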
12523 --- sle11-2009-10-16.orig/arch/x86/mm/pageattr-xen.c 2009-03-16 16:37:14.000000000 +0100
12524 +++ sle11-2009-10-16/arch/x86/mm/pageattr-xen.c 2009-03-16 16:38:05.000000000 +0100
12525 @@ -9,6 +9,8 @@
12526 #include <linux/slab.h>
12527 #include <linux/mm.h>
12528 #include <linux/interrupt.h>
12529 +#include <linux/seq_file.h>
12530 +#include <linux/debugfs.h>
12531
12532 #include <asm/e820.h>
12533 #include <asm/processor.h>
12534 @@ -17,370 +19,7 @@
12535 #include <asm/uaccess.h>
12536 #include <asm/pgalloc.h>
12537 #include <asm/proto.h>
12538 -#include <asm/mmu_context.h>
12539 -
12540 -#ifndef CONFIG_X86_64
12541 -#define TASK_SIZE64 TASK_SIZE
12542 -#endif
12543 -
12544 -static void _pin_lock(struct mm_struct *mm, int lock) {
12545 - if (lock)
12546 - spin_lock(&mm->page_table_lock);
12547 -#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
12548 - /* While mm->page_table_lock protects us against insertions and
12549 - * removals of higher level page table pages, it doesn't protect
12550 - * against updates of pte-s. Such updates, however, require the
12551 - * pte pages to be in consistent state (unpinned+writable or
12552 - * pinned+readonly). The pinning and attribute changes, however
12553 - * cannot be done atomically, which is why such updates must be
12554 - * prevented from happening concurrently.
12555 - * Note that no pte lock can ever elsewhere be acquired nesting
12556 - * with an already acquired one in the same mm, or with the mm's
12557 - * page_table_lock already acquired, as that would break in the
12558 - * non-split case (where all these are actually resolving to the
12559 - * one page_table_lock). Thus acquiring all of them here is not
12560 - * going to result in dead locks, and the order of acquires
12561 - * doesn't matter.
12562 - */
12563 - {
12564 - pgd_t *pgd = mm->pgd;
12565 - unsigned g;
12566 -
12567 - for (g = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
12568 - pud_t *pud;
12569 - unsigned u;
12570 -
12571 - if (pgd_none(*pgd))
12572 - continue;
12573 - pud = pud_offset(pgd, 0);
12574 - for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
12575 - pmd_t *pmd;
12576 - unsigned m;
12577 -
12578 - if (pud_none(*pud))
12579 - continue;
12580 - pmd = pmd_offset(pud, 0);
12581 - for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
12582 - spinlock_t *ptl;
12583 -
12584 - if (pmd_none(*pmd))
12585 - continue;
12586 - ptl = pte_lockptr(0, pmd);
12587 - if (lock)
12588 - spin_lock(ptl);
12589 - else
12590 - spin_unlock(ptl);
12591 - }
12592 - }
12593 - }
12594 - }
12595 -#endif
12596 - if (!lock)
12597 - spin_unlock(&mm->page_table_lock);
12598 -}
12599 -#define pin_lock(mm) _pin_lock(mm, 1)
12600 -#define pin_unlock(mm) _pin_lock(mm, 0)
12601 -
12602 -#define PIN_BATCH sizeof(void *)
12603 -static DEFINE_PER_CPU(multicall_entry_t[PIN_BATCH], pb_mcl);
12604 -
12605 -static inline unsigned int pgd_walk_set_prot(struct page *page, pgprot_t flags,
12606 - unsigned int cpu, unsigned int seq)
12607 -{
12608 - unsigned long pfn = page_to_pfn(page);
12609 -
12610 - if (PageHighMem(page)) {
12611 - if (pgprot_val(flags) & _PAGE_RW)
12612 - ClearPagePinned(page);
12613 - else
12614 - SetPagePinned(page);
12615 - } else {
12616 - MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
12617 - (unsigned long)__va(pfn << PAGE_SHIFT),
12618 - pfn_pte(pfn, flags), 0);
12619 - if (unlikely(++seq == PIN_BATCH)) {
12620 - if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
12621 - PIN_BATCH, NULL)))
12622 - BUG();
12623 - seq = 0;
12624 - }
12625 - }
12626 -
12627 - return seq;
12628 -}
12629 -
12630 -static void pgd_walk(pgd_t *pgd_base, pgprot_t flags)
12631 -{
12632 - pgd_t *pgd = pgd_base;
12633 - pud_t *pud;
12634 - pmd_t *pmd;
12635 - int g,u,m;
12636 - unsigned int cpu, seq;
12637 - multicall_entry_t *mcl;
12638 -
12639 - if (xen_feature(XENFEAT_auto_translated_physmap))
12640 - return;
12641 -
12642 - cpu = get_cpu();
12643 -
12644 - /*
12645 - * Cannot iterate up to USER_PTRS_PER_PGD on x86-64 as these pagetables
12646 - * may not be the 'current' task's pagetables (e.g., current may be
12647 - * 32-bit, but the pagetables may be for a 64-bit task).
12648 - * Subtracting 1 from TASK_SIZE64 means the loop limit is correct
12649 - * regardless of whether TASK_SIZE64 is a multiple of PGDIR_SIZE.
12650 - */
12651 - for (g = 0, seq = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
12652 - if (pgd_none(*pgd))
12653 - continue;
12654 - pud = pud_offset(pgd, 0);
12655 - if (PTRS_PER_PUD > 1) /* not folded */
12656 - seq = pgd_walk_set_prot(virt_to_page(pud),flags,cpu,seq);
12657 - for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
12658 - if (pud_none(*pud))
12659 - continue;
12660 - pmd = pmd_offset(pud, 0);
12661 - if (PTRS_PER_PMD > 1) /* not folded */
12662 - seq = pgd_walk_set_prot(virt_to_page(pmd),flags,cpu,seq);
12663 - for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
12664 - if (pmd_none(*pmd))
12665 - continue;
12666 - seq = pgd_walk_set_prot(pmd_page(*pmd),flags,cpu,seq);
12667 - }
12668 - }
12669 - }
12670 -
12671 - mcl = per_cpu(pb_mcl, cpu);
12672 -#ifdef CONFIG_X86_64
12673 - if (unlikely(seq > PIN_BATCH - 2)) {
12674 - if (unlikely(HYPERVISOR_multicall_check(mcl, seq, NULL)))
12675 - BUG();
12676 - seq = 0;
12677 - }
12678 - MULTI_update_va_mapping(mcl + seq,
12679 - (unsigned long)__user_pgd(pgd_base),
12680 - pfn_pte(virt_to_phys(__user_pgd(pgd_base))>>PAGE_SHIFT, flags),
12681 - 0);
12682 - MULTI_update_va_mapping(mcl + seq + 1,
12683 - (unsigned long)pgd_base,
12684 - pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
12685 - UVMF_TLB_FLUSH);
12686 - if (unlikely(HYPERVISOR_multicall_check(mcl, seq + 2, NULL)))
12687 - BUG();
12688 -#else
12689 - if (likely(seq != 0)) {
12690 - MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
12691 - (unsigned long)pgd_base,
12692 - pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
12693 - UVMF_TLB_FLUSH);
12694 - if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
12695 - seq + 1, NULL)))
12696 - BUG();
12697 - } else if(HYPERVISOR_update_va_mapping((unsigned long)pgd_base,
12698 - pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
12699 - UVMF_TLB_FLUSH))
12700 - BUG();
12701 -#endif
12702 -
12703 - put_cpu();
12704 -}
12705 -
12706 -static void __pgd_pin(pgd_t *pgd)
12707 -{
12708 - pgd_walk(pgd, PAGE_KERNEL_RO);
12709 - kmap_flush_unused();
12710 - xen_pgd_pin(__pa(pgd)); /* kernel */
12711 -#ifdef CONFIG_X86_64
12712 - xen_pgd_pin(__pa(__user_pgd(pgd))); /* user */
12713 -#endif
12714 - SetPagePinned(virt_to_page(pgd));
12715 -}
12716 -
12717 -static void __pgd_unpin(pgd_t *pgd)
12718 -{
12719 - xen_pgd_unpin(__pa(pgd));
12720 -#ifdef CONFIG_X86_64
12721 - xen_pgd_unpin(__pa(__user_pgd(pgd)));
12722 -#endif
12723 - pgd_walk(pgd, PAGE_KERNEL);
12724 - ClearPagePinned(virt_to_page(pgd));
12725 -}
12726 -
12727 -void pgd_test_and_unpin(pgd_t *pgd)
12728 -{
12729 - if (PagePinned(virt_to_page(pgd)))
12730 - __pgd_unpin(pgd);
12731 -}
12732 -
12733 -void mm_pin(struct mm_struct *mm)
12734 -{
12735 - if (xen_feature(XENFEAT_writable_page_tables))
12736 - return;
12737 -
12738 - pin_lock(mm);
12739 - __pgd_pin(mm->pgd);
12740 - pin_unlock(mm);
12741 -}
12742 -
12743 -void mm_unpin(struct mm_struct *mm)
12744 -{
12745 - if (xen_feature(XENFEAT_writable_page_tables))
12746 - return;
12747 -
12748 - pin_lock(mm);
12749 - __pgd_unpin(mm->pgd);
12750 - pin_unlock(mm);
12751 -}
12752 -
12753 -void mm_pin_all(void)
12754 -{
12755 - struct page *page;
12756 - unsigned long flags;
12757 -
12758 - if (xen_feature(XENFEAT_writable_page_tables))
12759 - return;
12760 -
12761 - /*
12762 - * Allow uninterrupted access to the pgd_list. Also protects
12763 - * __pgd_pin() by disabling preemption.
12764 - * All other CPUs must be at a safe point (e.g., in stop_machine
12765 - * or offlined entirely).
12766 - */
12767 - spin_lock_irqsave(&pgd_lock, flags);
12768 - list_for_each_entry(page, &pgd_list, lru) {
12769 - if (!PagePinned(page))
12770 - __pgd_pin((pgd_t *)page_address(page));
12771 - }
12772 - spin_unlock_irqrestore(&pgd_lock, flags);
12773 -}
12774 -
12775 -void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
12776 -{
12777 - if (!PagePinned(virt_to_page(mm->pgd)))
12778 - mm_pin(mm);
12779 -}
12780 -
12781 -void arch_exit_mmap(struct mm_struct *mm)
12782 -{
12783 - struct task_struct *tsk = current;
12784 -
12785 - task_lock(tsk);
12786 -
12787 - /*
12788 - * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
12789 - * *much* faster this way, as no tlb flushes means bigger wrpt batches.
12790 - */
12791 - if (tsk->active_mm == mm) {
12792 - tsk->active_mm = &init_mm;
12793 - atomic_inc(&init_mm.mm_count);
12794 -
12795 - switch_mm(mm, &init_mm, tsk);
12796 -
12797 - atomic_dec(&mm->mm_count);
12798 - BUG_ON(atomic_read(&mm->mm_count) == 0);
12799 - }
12800 -
12801 - task_unlock(tsk);
12802 -
12803 - if (PagePinned(virt_to_page(mm->pgd))
12804 - && atomic_read(&mm->mm_count) == 1
12805 - && !mm->context.has_foreign_mappings)
12806 - mm_unpin(mm);
12807 -}
12808 -
12809 -static void _pte_free(struct page *page, unsigned int order)
12810 -{
12811 - BUG_ON(order);
12812 - __pte_free(page);
12813 -}
12814 -
12815 -pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
12816 -{
12817 - struct page *pte;
12818 -
12819 -#ifdef CONFIG_HIGHPTE
12820 - pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
12821 -#else
12822 - pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
12823 -#endif
12824 - if (pte) {
12825 - pgtable_page_ctor(pte);
12826 - SetPageForeign(pte, _pte_free);
12827 - init_page_count(pte);
12828 - }
12829 - return pte;
12830 -}
12831 -
12832 -void __pte_free(pgtable_t pte)
12833 -{
12834 - if (!PageHighMem(pte)) {
12835 - unsigned long va = (unsigned long)page_address(pte);
12836 - unsigned int level;
12837 - pte_t *ptep = lookup_address(va, &level);
12838 -
12839 - BUG_ON(!ptep || level != PG_LEVEL_4K || !pte_present(*ptep));
12840 - if (!pte_write(*ptep)
12841 - && HYPERVISOR_update_va_mapping(va,
12842 - mk_pte(pte, PAGE_KERNEL),
12843 - 0))
12844 - BUG();
12845 - } else
12846 -#ifdef CONFIG_HIGHPTE
12847 - ClearPagePinned(pte);
12848 -#else
12849 - BUG();
12850 -#endif
12851 -
12852 - ClearPageForeign(pte);
12853 - init_page_count(pte);
12854 - pgtable_page_dtor(pte);
12855 - __free_page(pte);
12856 -}
12857 -
12858 -#if PAGETABLE_LEVELS >= 3
12859 -static void _pmd_free(struct page *page, unsigned int order)
12860 -{
12861 - BUG_ON(order);
12862 - __pmd_free(page);
12863 -}
12864 -
12865 -pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address)
12866 -{
12867 - struct page *pmd;
12868 -
12869 - pmd = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
12870 - if (!pmd)
12871 - return NULL;
12872 - SetPageForeign(pmd, _pmd_free);
12873 - init_page_count(pmd);
12874 - return page_address(pmd);
12875 -}
12876 -
12877 -void __pmd_free(pgtable_t pmd)
12878 -{
12879 - unsigned long va = (unsigned long)page_address(pmd);
12880 - unsigned int level;
12881 - pte_t *ptep = lookup_address(va, &level);
12882 -
12883 - BUG_ON(!ptep || level != PG_LEVEL_4K || !pte_present(*ptep));
12884 - if (!pte_write(*ptep)
12885 - && HYPERVISOR_update_va_mapping(va, mk_pte(pmd, PAGE_KERNEL), 0))
12886 - BUG();
12887 -
12888 - ClearPageForeign(pmd);
12889 - init_page_count(pmd);
12890 - __free_page(pmd);
12891 -}
12892 -#endif
12893 -
12894 -/* blktap and gntdev need this, as otherwise they would implicitly (and
12895 - * needlessly, as they never use it) reference init_mm. */
12896 -pte_t xen_ptep_get_and_clear_full(struct vm_area_struct *vma,
12897 - unsigned long addr, pte_t *ptep, int full)
12898 -{
12899 - return ptep_get_and_clear_full(vma->vm_mm, addr, ptep, full);
12900 -}
12901 -EXPORT_SYMBOL_GPL(xen_ptep_get_and_clear_full);
12902 +#include <asm/pat.h>
12903
12904 /*
12905 * The current flushing context - we pass it instead of 5 arguments:
12906 @@ -392,6 +31,7 @@ struct cpa_data {
12907 int numpages;
12908 int flushtlb;
12909 unsigned long pfn;
12910 + unsigned force_split : 1;
12911 };
12912
12913 #ifdef CONFIG_X86_64
12914 @@ -637,6 +277,9 @@ try_preserve_large_page(pte_t *kpte, uns
12915 int i, do_split = 1;
12916 unsigned int level;
12917
12918 + if (cpa->force_split)
12919 + return 1;
12920 +
12921 spin_lock_irqsave(&pgd_lock, flags);
12922 /*
12923 * Check for races, another CPU might have split this page
12924 @@ -856,9 +499,7 @@ static int split_large_page(pte_t *kpte,
12925 goto out_unlock;
12926
12927 pbase = (pte_t *)page_address(base);
12928 -#ifdef CONFIG_X86_32
12929 - paravirt_alloc_pt(&init_mm, page_to_pfn(base));
12930 -#endif
12931 + paravirt_alloc_pte(&init_mm, page_to_pfn(base));
12932 ref_prot = pte_pgprot(pte_clrhuge(*kpte));
12933
12934 #ifdef CONFIG_X86_64
12935 @@ -919,7 +560,7 @@ static int __change_page_attr(struct cpa
12936 repeat:
12937 kpte = lookup_address(address, &level);
12938 if (!kpte)
12939 - return primary ? -EINVAL : 0;
12940 + return 0;
12941
12942 old_pte = *kpte;
12943 if (!__pte_val(old_pte)) {
12944 @@ -1078,7 +719,8 @@ static inline int cache_attr(pgprot_t at
12945 }
12946
12947 static int change_page_attr_set_clr(unsigned long addr, int numpages,
12948 - pgprot_t mask_set, pgprot_t mask_clr)
12949 + pgprot_t mask_set, pgprot_t mask_clr,
12950 + int force_split)
12951 {
12952 struct cpa_data cpa;
12953 int ret, cache, checkalias;
12954 @@ -1089,7 +731,7 @@ static int change_page_attr_set_clr(unsi
12955 */
12956 mask_set = canon_pgprot(mask_set);
12957 mask_clr = canon_pgprot(mask_clr);
12958 - if (!pgprot_val(mask_set) && !pgprot_val(mask_clr))
12959 + if (!pgprot_val(mask_set) && !pgprot_val(mask_clr) && !force_split)
12960 return 0;
12961
12962 /* Ensure we are PAGE_SIZE aligned */
12963 @@ -1106,6 +748,7 @@ static int change_page_attr_set_clr(unsi
12964 cpa.mask_set = mask_set;
12965 cpa.mask_clr = mask_clr;
12966 cpa.flushtlb = 0;
12967 + cpa.force_split = force_split;
12968
12969 /* No alias checking for _NX bit modifications */
12970 checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
12971 @@ -1144,26 +787,67 @@ out:
12972 static inline int change_page_attr_set(unsigned long addr, int numpages,
12973 pgprot_t mask)
12974 {
12975 - return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0));
12976 + return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0);
12977 }
12978
12979 static inline int change_page_attr_clear(unsigned long addr, int numpages,
12980 pgprot_t mask)
12981 {
12982 - return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask);
12983 + return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0);
12984 }
12985
12986 -int set_memory_uc(unsigned long addr, int numpages)
12987 +int _set_memory_uc(unsigned long addr, int numpages)
12988 {
12989 + /*
12990 + * for now UC MINUS. see comments in ioremap_nocache()
12991 + */
12992 return change_page_attr_set(addr, numpages,
12993 - __pgprot(_PAGE_PCD));
12994 + __pgprot(_PAGE_CACHE_UC_MINUS));
12995 +}
12996 +
12997 +int set_memory_uc(unsigned long addr, int numpages)
12998 +{
12999 + /*
13000 + * for now UC MINUS. see comments in ioremap_nocache()
13001 + */
13002 + if (reserve_memtype(addr, addr + numpages * PAGE_SIZE,
13003 + _PAGE_CACHE_UC_MINUS, NULL))
13004 + return -EINVAL;
13005 +
13006 + return _set_memory_uc(addr, numpages);
13007 }
13008 EXPORT_SYMBOL(set_memory_uc);
13009
13010 -int set_memory_wb(unsigned long addr, int numpages)
13011 +int _set_memory_wc(unsigned long addr, int numpages)
13012 +{
13013 + return change_page_attr_set(addr, numpages,
13014 + __pgprot(_PAGE_CACHE_WC));
13015 +}
13016 +
13017 +int set_memory_wc(unsigned long addr, int numpages)
13018 +{
13019 + if (!pat_wc_enabled)
13020 + return set_memory_uc(addr, numpages);
13021 +
13022 + if (reserve_memtype(addr, addr + numpages * PAGE_SIZE,
13023 + _PAGE_CACHE_WC, NULL))
13024 + return -EINVAL;
13025 +
13026 + return _set_memory_wc(addr, numpages);
13027 +}
13028 +EXPORT_SYMBOL(set_memory_wc);
13029 +
13030 +int _set_memory_wb(unsigned long addr, int numpages)
13031 {
13032 return change_page_attr_clear(addr, numpages,
13033 - __pgprot(_PAGE_PCD | _PAGE_PWT));
13034 + __pgprot(_PAGE_CACHE_MASK));
13035 +}
13036 +
13037 +int set_memory_wb(unsigned long addr, int numpages)
13038 +{
13039 + free_memtype(addr, addr + numpages * PAGE_SIZE);
13040 +
13041 + return _set_memory_wb(addr, numpages);
13042 }
13043 EXPORT_SYMBOL(set_memory_wb);
13044
13045 @@ -1194,6 +878,12 @@ int set_memory_np(unsigned long addr, in
13046 return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_PRESENT));
13047 }
13048
13049 +int set_memory_4k(unsigned long addr, int numpages)
13050 +{
13051 + return change_page_attr_set_clr(addr, numpages, __pgprot(0),
13052 + __pgprot(0), 1);
13053 +}
13054 +
13055 int set_pages_uc(struct page *page, int numpages)
13056 {
13057 unsigned long addr = (unsigned long)page_address(page);
13058 @@ -1303,6 +993,45 @@ void kernel_map_pages(struct page *page,
13059 cpa_fill_pool(NULL);
13060 }
13061
13062 +#ifdef CONFIG_DEBUG_FS
13063 +static int dpa_show(struct seq_file *m, void *v)
13064 +{
13065 + seq_puts(m, "DEBUG_PAGEALLOC\n");
13066 + seq_printf(m, "pool_size : %lu\n", pool_size);
13067 + seq_printf(m, "pool_pages : %lu\n", pool_pages);
13068 + seq_printf(m, "pool_low : %lu\n", pool_low);
13069 + seq_printf(m, "pool_used : %lu\n", pool_used);
13070 + seq_printf(m, "pool_failed : %lu\n", pool_failed);
13071 +
13072 + return 0;
13073 +}
13074 +
13075 +static int dpa_open(struct inode *inode, struct file *filp)
13076 +{
13077 + return single_open(filp, dpa_show, NULL);
13078 +}
13079 +
13080 +static const struct file_operations dpa_fops = {
13081 + .open = dpa_open,
13082 + .read = seq_read,
13083 + .llseek = seq_lseek,
13084 + .release = single_release,
13085 +};
13086 +
13087 +static int __init debug_pagealloc_proc_init(void)
13088 +{
13089 + struct dentry *de;
13090 +
13091 + de = debugfs_create_file("debug_pagealloc", 0600, NULL, NULL,
13092 + &dpa_fops);
13093 + if (!de)
13094 + return -ENOMEM;
13095 +
13096 + return 0;
13097 +}
13098 +__initcall(debug_pagealloc_proc_init);
13099 +#endif
13100 +
13101 #ifdef CONFIG_HIBERNATION
13102
13103 bool kernel_page_present(struct page *page)
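Editorial note (illustration only, not part of the patch): with the pageattr rework above, set_memory_wc() first takes a memtype reservation via reserve_memtype() and only then changes the page attributes, while set_memory_wb() is what releases that reservation again. A hedged sketch of the expected pairing, where addr is the kernel virtual address of a page-aligned buffer and npages its size in pages (both hypothetical):

	int err;

	err = set_memory_wc(addr, npages);	/* reserves the range, maps it WC (or UC- without PAT) */
	if (err)
		return err;
	/* ... use the buffer with write-combining ... */
	set_memory_wb(addr, npages);		/* free_memtype() + restore write-back */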
13104 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
13105 +++ sle11-2009-10-16/arch/x86/mm/pat-xen.c 2009-03-16 16:38:05.000000000 +0100
13106 @@ -0,0 +1,602 @@
13107 +/*
13108 + * Handle caching attributes in page tables (PAT)
13109 + *
13110 + * Authors: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
13111 + * Suresh B Siddha <suresh.b.siddha@intel.com>
13112 + *
13113 + * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen.
13114 + */
13115 +
13116 +#include <linux/mm.h>
13117 +#include <linux/kernel.h>
13118 +#include <linux/gfp.h>
13119 +#include <linux/fs.h>
13120 +#include <linux/bootmem.h>
13121 +
13122 +#include <asm/msr.h>
13123 +#include <asm/tlbflush.h>
13124 +#include <asm/processor.h>
13125 +#include <asm/page.h>
13126 +#include <asm/pgtable.h>
13127 +#include <asm/pat.h>
13128 +#include <asm/e820.h>
13129 +#include <asm/cacheflush.h>
13130 +#include <asm/fcntl.h>
13131 +#include <asm/mtrr.h>
13132 +#include <asm/io.h>
13133 +
13134 +#ifdef CONFIG_X86_PAT
13135 +int __read_mostly pat_wc_enabled = 1;
13136 +
13137 +void __cpuinit pat_disable(char *reason)
13138 +{
13139 + pat_wc_enabled = 0;
13140 + printk(KERN_INFO "%s\n", reason);
13141 +}
13142 +
13143 +static int __init nopat(char *str)
13144 +{
13145 + pat_disable("PAT support disabled.");
13146 + return 0;
13147 +}
13148 +early_param("nopat", nopat);
13149 +#endif
13150 +
13151 +static u64 __read_mostly boot_pat_state;
13152 +
13153 +enum {
13154 + PAT_UC = 0, /* uncached */
13155 + PAT_WC = 1, /* Write combining */
13156 + PAT_WT = 4, /* Write Through */
13157 + PAT_WP = 5, /* Write Protected */
13158 + PAT_WB = 6, /* Write Back (default) */
13159 + PAT_UC_MINUS = 7, /* UC, but can be overridden by MTRR */
13160 +};
13161 +
13162 +#define PAT(x,y) ((u64)PAT_ ## y << ((x)*8))
13163 +
13164 +void pat_init(void)
13165 +{
13166 + u64 pat;
13167 +
13168 + if (!pat_wc_enabled)
13169 + return;
13170 +
13171 + /* Paranoia check. */
13172 + if (!cpu_has_pat) {
13173 + printk(KERN_ERR "PAT enabled, but CPU feature cleared\n");
13174 + /*
13175 + * Panic if this happens on the secondary CPU, and we
13176 + * switched to PAT on the boot CPU. We have no way to
13177 + * undo PAT.
13178 + */
13179 + BUG_ON(boot_pat_state);
13180 + }
13181 +
13182 +#ifndef CONFIG_XEN
13183 + /* Set PWT to Write-Combining. All other bits stay the same */
13184 + /*
13185 + * PTE encoding used in Linux:
13186 + * PAT
13187 + * |PCD
13188 + * ||PWT
13189 + * |||
13190 + * 000 WB _PAGE_CACHE_WB
13191 + * 001 WC _PAGE_CACHE_WC
13192 + * 010 UC- _PAGE_CACHE_UC_MINUS
13193 + * 011 UC _PAGE_CACHE_UC
13194 + * PAT bit unused
13195 + */
13196 + pat = PAT(0,WB) | PAT(1,WC) | PAT(2,UC_MINUS) | PAT(3,UC) |
13197 + PAT(4,WB) | PAT(5,WC) | PAT(6,UC_MINUS) | PAT(7,UC);
13198 +
13199 + /* Boot CPU check */
13200 + if (!boot_pat_state)
13201 + rdmsrl(MSR_IA32_CR_PAT, boot_pat_state);
13202 +
13203 + wrmsrl(MSR_IA32_CR_PAT, pat);
13204 +#else
13205 + /*
13206 + * PAT settings are part of the hypervisor interface, and their
13207 + * assignment cannot be changed.
13208 + */
13209 + rdmsrl(MSR_IA32_CR_PAT, pat);
13210 + if (!boot_pat_state)
13211 + boot_pat_state = pat;
13212 +#endif
13213 + printk(KERN_INFO "x86 PAT enabled: cpu %d, old 0x%Lx, new 0x%Lx\n",
13214 + smp_processor_id(), boot_pat_state, pat);
13215 +}
13216 +
13217 +#undef PAT
13218 +
13219 +static char *cattr_name(unsigned long flags)
13220 +{
13221 + switch (flags & _PAGE_CACHE_MASK) {
13222 + case _PAGE_CACHE_UC: return "uncached";
13223 + case _PAGE_CACHE_UC_MINUS: return "uncached-minus";
13224 + case _PAGE_CACHE_WB: return "write-back";
13225 + case _PAGE_CACHE_WC: return "write-combining";
13226 + case _PAGE_CACHE_WP: return "write-protected";
13227 + case _PAGE_CACHE_WT: return "write-through";
13228 + default: return "broken";
13229 + }
13230 +}
13231 +
13232 +/*
13233 + * The global memtype list keeps track of memory type for specific
13234 + * physical memory areas. Conflicting memory types in different
13235 + * mappings can cause CPU cache corruption. To avoid this we keep track.
13236 + *
13237 + * The list is sorted based on starting address and can contain multiple
13238 + * entries for each address (this allows reference counting for overlapping
13239 + * areas). All the aliases have the same cache attributes of course.
13240 + * Zero attributes are represented as holes.
13241 + *
13242 + * Currently the data structure is a list because the number of mappings
13243 + * are expected to be relatively small. If this should be a problem
13244 + * it could be changed to a rbtree or similar.
13245 + *
13246 + * memtype_lock protects the whole list.
13247 + */
13248 +
13249 +struct memtype {
13250 + u64 start;
13251 + u64 end;
13252 + unsigned long type;
13253 + struct list_head nd;
13254 +};
13255 +
13256 +static LIST_HEAD(memtype_list);
13257 +static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */
13258 +
13259 +/*
13260 + * Does intersection of PAT memory type and MTRR memory type and returns
13261 + * the resulting memory type as PAT understands it.
13262 + * (Type in pat and mtrr will not have the same value)
13263 + * The intersection is based on "Effective Memory Type" tables in IA-32
13264 + * SDM vol 3a
13265 + */
13266 +static int pat_x_mtrr_type(u64 start, u64 end, unsigned long prot,
13267 + unsigned long *ret_prot)
13268 +{
13269 + unsigned long pat_type;
13270 + u8 mtrr_type;
13271 +
13272 + pat_type = prot & _PAGE_CACHE_MASK;
13273 + prot &= (~_PAGE_CACHE_MASK);
13274 +
13275 + /*
13276 + * We return the PAT request directly for types where PAT takes
13277 + * precedence with respect to MTRR and for UC_MINUS.
13278 + * Consistency checks with other PAT requests is done later
13279 + * while going through memtype list.
13280 + */
13281 + if (pat_type == _PAGE_CACHE_WC) {
13282 + *ret_prot = prot | _PAGE_CACHE_WC;
13283 + return 0;
13284 + } else if (pat_type == _PAGE_CACHE_UC_MINUS) {
13285 + *ret_prot = prot | _PAGE_CACHE_UC_MINUS;
13286 + return 0;
13287 + } else if (pat_type == _PAGE_CACHE_UC) {
13288 + *ret_prot = prot | _PAGE_CACHE_UC;
13289 + return 0;
13290 + }
13291 +
13292 + /*
13293 + * Look for MTRR hint to get the effective type in case where PAT
13294 + * request is for WB.
13295 + */
13296 + mtrr_type = mtrr_type_lookup(start, end);
13297 +
13298 + if (mtrr_type == MTRR_TYPE_UNCACHABLE) {
13299 + *ret_prot = prot | _PAGE_CACHE_UC;
13300 + } else if (mtrr_type == MTRR_TYPE_WRCOMB) {
13301 + *ret_prot = prot | _PAGE_CACHE_WC;
13302 + } else {
13303 + *ret_prot = prot | _PAGE_CACHE_WB;
13304 + }
13305 +
13306 + return 0;
13307 +}
13308 +
13309 +/*
13310 + * req_type typically has one of the:
13311 + * - _PAGE_CACHE_WB
13312 + * - _PAGE_CACHE_WC
13313 + * - _PAGE_CACHE_UC_MINUS
13314 + * - _PAGE_CACHE_UC
13315 + *
13316 + * req_type will have a special case value '-1', when the requester wants to inherit
13317 + * the memory type from MTRR (if WB) or an existing PAT mapping, defaulting to UC_MINUS.
13318 + *
13319 + * If ret_type is NULL, function will return an error if it cannot reserve the
13320 + * region with req_type. If ret_type is non-null, function will return
13321 + * available type in ret_type in case of no error. In case of any error
13322 + * it will return a negative return value.
13323 + */
13324 +int reserve_memtype(u64 start, u64 end, unsigned long req_type,
13325 + unsigned long *ret_type)
13326 +{
13327 + struct memtype *new_entry = NULL;
13328 + struct memtype *parse;
13329 + unsigned long actual_type;
13330 + int err = 0;
13331 +
13332 + /* Only track when pat_wc_enabled */
13333 + if (!pat_wc_enabled) {
13334 + /* This is identical to page table setting without PAT */
13335 + if (ret_type) {
13336 + if (req_type == -1) {
13337 + *ret_type = _PAGE_CACHE_WB;
13338 + } else {
13339 + *ret_type = req_type;
13340 + }
13341 + }
13342 + return 0;
13343 + }
13344 +
13345 + /* Low ISA region is always mapped WB in page table. No need to track */
13346 + if (start >= ISA_START_ADDRESS && (end - 1) <= ISA_END_ADDRESS) {
13347 + if (ret_type)
13348 + *ret_type = _PAGE_CACHE_WB;
13349 +
13350 + return 0;
13351 + }
13352 +
13353 + if (req_type == -1) {
13354 + /*
13355 + * Call mtrr_lookup to get the type hint. This is an
13356 + * optimization for /dev/mem mmap'ers into WB memory (BIOS
13357 + * tools and ACPI tools). Use WB request for WB memory and use
13358 + * UC_MINUS otherwise.
13359 + */
13360 + u8 mtrr_type = mtrr_type_lookup(start, end);
13361 +
13362 + if (mtrr_type == MTRR_TYPE_WRBACK) {
13363 + req_type = _PAGE_CACHE_WB;
13364 + actual_type = _PAGE_CACHE_WB;
13365 + } else {
13366 + req_type = _PAGE_CACHE_UC_MINUS;
13367 + actual_type = _PAGE_CACHE_UC_MINUS;
13368 + }
13369 + } else {
13370 + req_type &= _PAGE_CACHE_MASK;
13371 + err = pat_x_mtrr_type(start, end, req_type, &actual_type);
13372 + }
13373 +
13374 + if (err) {
13375 + if (ret_type)
13376 + *ret_type = actual_type;
13377 +
13378 + return -EINVAL;
13379 + }
13380 +
13381 + new_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL);
13382 + if (!new_entry)
13383 + return -ENOMEM;
13384 +
13385 + new_entry->start = start;
13386 + new_entry->end = end;
13387 + new_entry->type = actual_type;
13388 +
13389 + if (ret_type)
13390 + *ret_type = actual_type;
13391 +
13392 + spin_lock(&memtype_lock);
13393 +
13394 + /* Search for existing mapping that overlaps the current range */
13395 + list_for_each_entry(parse, &memtype_list, nd) {
13396 + struct memtype *saved_ptr;
13397 +
13398 + if (parse->start >= end) {
13399 + pr_debug("New Entry\n");
13400 + list_add(&new_entry->nd, parse->nd.prev);
13401 + new_entry = NULL;
13402 + break;
13403 + }
13404 +
13405 + if (start <= parse->start && end >= parse->start) {
13406 + if (actual_type != parse->type && ret_type) {
13407 + actual_type = parse->type;
13408 + *ret_type = actual_type;
13409 + new_entry->type = actual_type;
13410 + }
13411 +
13412 + if (actual_type != parse->type) {
13413 + printk(
13414 + KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
13415 + current->comm, current->pid,
13416 + start, end,
13417 + cattr_name(actual_type),
13418 + cattr_name(parse->type));
13419 + err = -EBUSY;
13420 + break;
13421 + }
13422 +
13423 + saved_ptr = parse;
13424 + /*
13425 + * Check to see whether the request overlaps more
13426 + * than one entry in the list
13427 + */
13428 + list_for_each_entry_continue(parse, &memtype_list, nd) {
13429 + if (end <= parse->start) {
13430 + break;
13431 + }
13432 +
13433 + if (actual_type != parse->type) {
13434 + printk(
13435 + KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
13436 + current->comm, current->pid,
13437 + start, end,
13438 + cattr_name(actual_type),
13439 + cattr_name(parse->type));
13440 + err = -EBUSY;
13441 + break;
13442 + }
13443 + }
13444 +
13445 + if (err) {
13446 + break;
13447 + }
13448 +
13449 + pr_debug("Overlap at 0x%Lx-0x%Lx\n",
13450 + saved_ptr->start, saved_ptr->end);
13451 + /* No conflict. Go ahead and add this new entry */
13452 + list_add(&new_entry->nd, saved_ptr->nd.prev);
13453 + new_entry = NULL;
13454 + break;
13455 + }
13456 +
13457 + if (start < parse->end) {
13458 + if (actual_type != parse->type && ret_type) {
13459 + actual_type = parse->type;
13460 + *ret_type = actual_type;
13461 + new_entry->type = actual_type;
13462 + }
13463 +
13464 + if (actual_type != parse->type) {
13465 + printk(
13466 + KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
13467 + current->comm, current->pid,
13468 + start, end,
13469 + cattr_name(actual_type),
13470 + cattr_name(parse->type));
13471 + err = -EBUSY;
13472 + break;
13473 + }
13474 +
13475 + saved_ptr = parse;
13476 + /*
13477 + * Check to see whether the request overlaps more
13478 + * than one entry in the list
13479 + */
13480 + list_for_each_entry_continue(parse, &memtype_list, nd) {
13481 + if (end <= parse->start) {
13482 + break;
13483 + }
13484 +
13485 + if (actual_type != parse->type) {
13486 + printk(
13487 + KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
13488 + current->comm, current->pid,
13489 + start, end,
13490 + cattr_name(actual_type),
13491 + cattr_name(parse->type));
13492 + err = -EBUSY;
13493 + break;
13494 + }
13495 + }
13496 +
13497 + if (err) {
13498 + break;
13499 + }
13500 +
13501 + pr_debug("Overlap at 0x%Lx-0x%Lx\n",
13502 + saved_ptr->start, saved_ptr->end);
13503 + /* No conflict. Go ahead and add this new entry */
13504 + list_add(&new_entry->nd, &saved_ptr->nd);
13505 + new_entry = NULL;
13506 + break;
13507 + }
13508 + }
13509 +
13510 + if (err) {
13511 + printk(KERN_INFO
13512 + "reserve_memtype failed 0x%Lx-0x%Lx, track %s, req %s\n",
13513 + start, end, cattr_name(new_entry->type),
13514 + cattr_name(req_type));
13515 + kfree(new_entry);
13516 + spin_unlock(&memtype_lock);
13517 + return err;
13518 + }
13519 +
13520 + if (new_entry) {
13521 + /* No conflict. Not yet added to the list. Add to the tail */
13522 + list_add_tail(&new_entry->nd, &memtype_list);
13523 + pr_debug("New Entry\n");
13524 + }
13525 +
13526 + if (ret_type) {
13527 + pr_debug(
13528 + "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n",
13529 + start, end, cattr_name(actual_type),
13530 + cattr_name(req_type), cattr_name(*ret_type));
13531 + } else {
13532 + pr_debug(
13533 + "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s\n",
13534 + start, end, cattr_name(actual_type),
13535 + cattr_name(req_type));
13536 + }
13537 +
13538 + spin_unlock(&memtype_lock);
13539 + return err;
13540 +}
13541 +
13542 +int free_memtype(u64 start, u64 end)
13543 +{
13544 + struct memtype *ml;
13545 + int err = -EINVAL;
13546 +
13547 + /* Only track when pat_wc_enabled */
13548 + if (!pat_wc_enabled) {
13549 + return 0;
13550 + }
13551 +
13552 + /* Low ISA region is always mapped WB. No need to track */
13553 + if (start >= ISA_START_ADDRESS && end <= ISA_END_ADDRESS) {
13554 + return 0;
13555 + }
13556 +
13557 + spin_lock(&memtype_lock);
13558 + list_for_each_entry(ml, &memtype_list, nd) {
13559 + if (ml->start == start && ml->end == end) {
13560 + list_del(&ml->nd);
13561 + kfree(ml);
13562 + err = 0;
13563 + break;
13564 + }
13565 + }
13566 + spin_unlock(&memtype_lock);
13567 +
13568 + if (err) {
13569 + printk(KERN_INFO "%s:%d freeing invalid memtype %Lx-%Lx\n",
13570 + current->comm, current->pid, start, end);
13571 + }
13572 +
13573 + pr_debug("free_memtype request 0x%Lx-0x%Lx\n", start, end);
13574 + return err;
13575 +}
13576 +
13577 +
13578 +/*
13579 + * /dev/mem mmap interface. The memtype used for mapping varies:
13580 + * - Use UC for mappings with O_SYNC flag
13581 + * - Without O_SYNC flag, if there is any conflict in reserve_memtype,
13582 + * inherit the memtype from existing mapping.
13583 + * - Else use UC_MINUS memtype (for backward compatibility with existing
13584 + * X drivers).
13585 + */
13586 +pgprot_t phys_mem_access_prot(struct file *file, unsigned long mfn,
13587 + unsigned long size, pgprot_t vma_prot)
13588 +{
13589 + return vma_prot;
13590 +}
13591 +
13592 +#ifdef CONFIG_NONPROMISC_DEVMEM
13593 +/* This check is done in drivers/char/mem.c in case of NONPROMISC_DEVMEM */
13594 +static inline int range_is_allowed(unsigned long mfn, unsigned long size)
13595 +{
13596 + return 1;
13597 +}
13598 +#else
13599 +static inline int range_is_allowed(unsigned long mfn, unsigned long size)
13600 +{
13601 + u64 from = ((u64)mfn) << PAGE_SHIFT;
13602 + u64 to = from + size;
13603 + u64 cursor = from;
13604 +
13605 + while (cursor < to) {
13606 + if (!devmem_is_allowed(mfn)) {
13607 + printk(KERN_INFO
13608 + "Program %s tried to access /dev/mem between %Lx->%Lx.\n",
13609 + current->comm, from, to);
13610 + return 0;
13611 + }
13612 + cursor += PAGE_SIZE;
13613 + mfn++;
13614 + }
13615 + return 1;
13616 +}
13617 +#endif /* CONFIG_NONPROMISC_DEVMEM */
13618 +
13619 +int phys_mem_access_prot_allowed(struct file *file, unsigned long mfn,
13620 + unsigned long size, pgprot_t *vma_prot)
13621 +{
13622 + u64 addr = (u64)mfn << PAGE_SHIFT;
13623 + unsigned long flags = _PAGE_CACHE_UC_MINUS;
13624 + int retval;
13625 +
13626 + if (!range_is_allowed(mfn, size))
13627 + return 0;
13628 +
13629 + if (file->f_flags & O_SYNC) {
13630 + flags = _PAGE_CACHE_UC;
13631 + }
13632 +
13633 +#ifndef CONFIG_X86_32
13634 +#ifndef CONFIG_XEN /* Xen sets correct MTRR type on non-RAM for us. */
13635 + /*
13636 + * On the PPro and successors, the MTRRs are used to set
13637 + * memory types for physical addresses outside main memory,
13638 + * so blindly setting UC or PWT on those pages is wrong.
13639 + * For Pentiums and earlier, the surround logic should disable
13640 + * caching for the high addresses through the KEN pin, but
13641 + * we maintain the tradition of paranoia in this code.
13642 + */
13643 + if (!pat_wc_enabled &&
13644 + ! ( test_bit(X86_FEATURE_MTRR, boot_cpu_data.x86_capability) ||
13645 + test_bit(X86_FEATURE_K6_MTRR, boot_cpu_data.x86_capability) ||
13646 + test_bit(X86_FEATURE_CYRIX_ARR, boot_cpu_data.x86_capability) ||
13647 + test_bit(X86_FEATURE_CENTAUR_MCR, boot_cpu_data.x86_capability)) &&
13648 + (pfn << PAGE_SHIFT) >= __pa(high_memory)) {
13649 + flags = _PAGE_CACHE_UC;
13650 + }
13651 +#endif
13652 +#endif
13653 +
13654 + /*
13655 + * With O_SYNC, we can only take UC mapping. Fail if we cannot.
13656 + * Without O_SYNC, we want to get
13657 + * - WB for WB-able memory and no other conflicting mappings
13658 + * - UC_MINUS for non-WB-able memory with no other conflicting mappings
13659 + * - Inherit from conflicting mappings otherwise
13660 + */
13661 + if (flags != _PAGE_CACHE_UC_MINUS) {
13662 + retval = reserve_memtype(addr, addr + size, flags, NULL);
13663 + } else {
13664 + retval = reserve_memtype(addr, addr + size, -1, &flags);
13665 + }
13666 +
13667 + if (retval < 0)
13668 + return 0;
13669 +
13670 + if (ioremap_check_change_attr(mfn, size, flags) < 0) {
13671 + free_memtype(addr, addr + size);
13672 + printk(KERN_INFO
13673 + "%s:%d /dev/mem ioremap_change_attr failed %s for %Lx-%Lx\n",
13674 + current->comm, current->pid,
13675 + cattr_name(flags),
13676 + addr, addr + size);
13677 + return 0;
13678 + }
13679 +
13680 + *vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) |
13681 + flags);
13682 + return 1;
13683 +}
13684 +
13685 +void map_devmem(unsigned long mfn, unsigned long size, pgprot_t vma_prot)
13686 +{
13687 + u64 addr = (u64)mfn << PAGE_SHIFT;
13688 + unsigned long flags;
13689 + unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK);
13690 +
13691 + reserve_memtype(addr, addr + size, want_flags, &flags);
13692 + if (flags != want_flags) {
13693 + printk(KERN_INFO
13694 + "%s:%d /dev/mem expected mapping type %s for %Lx-%Lx, got %s\n",
13695 + current->comm, current->pid,
13696 + cattr_name(want_flags),
13697 + addr, (unsigned long long)(addr + size),
13698 + cattr_name(flags));
13699 + }
13700 +}
13701 +
13702 +void unmap_devmem(unsigned long mfn, unsigned long size, pgprot_t vma_prot)
13703 +{
13704 + u64 addr = (u64)mfn << PAGE_SHIFT;
13705 +
13706 + free_memtype(addr, addr + size);
13707 +}
13708 +
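Editorial note (illustration only, not part of the patch): the memtype list introduced in pat-xen.c above reference-counts overlapping reservations of the same cache type and rejects conflicting ones. A hedged sketch of the semantics, assuming PAT is enabled and using a made-up MMIO range well above the ISA hole:

	u64 base = 0xd0000000ULL, top = base + 0x1000;
	int err;

	/* Two identical UC- reservations stack: each adds its own list entry. */
	err = reserve_memtype(base, top, _PAGE_CACHE_UC_MINUS, NULL);
	err = reserve_memtype(base, top, _PAGE_CACHE_UC_MINUS, NULL);

	/* A conflicting WC request over the same range fails with -EBUSY. */
	err = reserve_memtype(base, top, _PAGE_CACHE_WC, NULL);

	/* Each successful reservation is dropped by an exact-range free. */
	free_memtype(base, top);
	free_memtype(base, top);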
13709 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
13710 +++ sle11-2009-10-16/arch/x86/mm/pgtable-xen.c 2009-03-16 16:38:05.000000000 +0100
13711 @@ -0,0 +1,709 @@
13712 +#include <linux/mm.h>
13713 +#include <linux/module.h>
13714 +#include <xen/features.h>
13715 +#include <asm/pgalloc.h>
13716 +#include <asm/pgtable.h>
13717 +#include <asm/tlb.h>
13718 +#include <asm/hypervisor.h>
13719 +#include <asm/mmu_context.h>
13720 +
13721 +pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
13722 +{
13723 + pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
13724 + if (pte)
13725 + make_lowmem_page_readonly(pte, XENFEAT_writable_page_tables);
13726 + return pte;
13727 +}
13728 +
13729 +static void _pte_free(struct page *page, unsigned int order)
13730 +{
13731 + BUG_ON(order);
13732 + __pte_free(page);
13733 +}
13734 +
13735 +pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
13736 +{
13737 + struct page *pte;
13738 +
13739 +#ifdef CONFIG_HIGHPTE
13740 + pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
13741 +#else
13742 + pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
13743 +#endif
13744 + if (pte) {
13745 + pgtable_page_ctor(pte);
13746 + SetPageForeign(pte, _pte_free);
13747 + init_page_count(pte);
13748 + }
13749 + return pte;
13750 +}
13751 +
13752 +void __pte_free(pgtable_t pte)
13753 +{
13754 + if (!PageHighMem(pte)) {
13755 + unsigned long va = (unsigned long)page_address(pte);
13756 + unsigned int level;
13757 + pte_t *ptep = lookup_address(va, &level);
13758 +
13759 + BUG_ON(!ptep || level != PG_LEVEL_4K || !pte_present(*ptep));
13760 + if (!pte_write(*ptep)
13761 + && HYPERVISOR_update_va_mapping(va,
13762 + mk_pte(pte, PAGE_KERNEL),
13763 + 0))
13764 + BUG();
13765 + } else
13766 +#ifdef CONFIG_HIGHPTE
13767 + ClearPagePinned(pte);
13768 +#else
13769 + BUG();
13770 +#endif
13771 +
13772 + ClearPageForeign(pte);
13773 + init_page_count(pte);
13774 + pgtable_page_dtor(pte);
13775 + __free_page(pte);
13776 +}
13777 +
13778 +void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
13779 +{
13780 + pgtable_page_dtor(pte);
13781 + paravirt_release_pte(page_to_pfn(pte));
13782 + tlb_remove_page(tlb, pte);
13783 +}
13784 +
13785 +#if PAGETABLE_LEVELS > 2
13786 +static void _pmd_free(struct page *page, unsigned int order)
13787 +{
13788 + BUG_ON(order);
13789 + __pmd_free(page);
13790 +}
13791 +
13792 +pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address)
13793 +{
13794 + struct page *pmd;
13795 +
13796 + pmd = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
13797 + if (!pmd)
13798 + return NULL;
13799 + SetPageForeign(pmd, _pmd_free);
13800 + init_page_count(pmd);
13801 + return page_address(pmd);
13802 +}
13803 +
13804 +void __pmd_free(pgtable_t pmd)
13805 +{
13806 + unsigned long va = (unsigned long)page_address(pmd);
13807 + unsigned int level;
13808 + pte_t *ptep = lookup_address(va, &level);
13809 +
13810 + BUG_ON(!ptep || level != PG_LEVEL_4K || !pte_present(*ptep));
13811 + if (!pte_write(*ptep)
13812 + && HYPERVISOR_update_va_mapping(va, mk_pte(pmd, PAGE_KERNEL), 0))
13813 + BUG();
13814 +
13815 + ClearPageForeign(pmd);
13816 + init_page_count(pmd);
13817 + __free_page(pmd);
13818 +}
13819 +
13820 +void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
13821 +{
13822 + paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);
13823 + tlb_remove_page(tlb, virt_to_page(pmd));
13824 +}
13825 +
13826 +#if PAGETABLE_LEVELS > 3
13827 +void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
13828 +{
13829 + paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
13830 + tlb_remove_page(tlb, virt_to_page(pud));
13831 +}
13832 +#endif /* PAGETABLE_LEVELS > 3 */
13833 +#endif /* PAGETABLE_LEVELS > 2 */
13834 +
13835 +#ifndef CONFIG_X86_64
13836 +#define TASK_SIZE64 TASK_SIZE
13837 +#endif
13838 +
13839 +static void _pin_lock(struct mm_struct *mm, int lock) {
13840 + if (lock)
13841 + spin_lock(&mm->page_table_lock);
13842 +#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
13843 + /* While mm->page_table_lock protects us against insertions and
13844 + * removals of higher level page table pages, it doesn't protect
13845 + * against updates of pte-s. Such updates, however, require the
13846 + * pte pages to be in consistent state (unpinned+writable or
13847 + * pinned+readonly). The pinning and attribute changes, however
13848 + * cannot be done atomically, which is why such updates must be
13849 + * prevented from happening concurrently.
13850 + * Note that no pte lock can ever elsewhere be acquired nesting
13851 + * with an already acquired one in the same mm, or with the mm's
13852 + * page_table_lock already acquired, as that would break in the
13853 + * non-split case (where all these are actually resolving to the
13854 + * one page_table_lock). Thus acquiring all of them here is not
13855 + * going to result in dead locks, and the order of acquires
13856 + * doesn't matter.
13857 + */
13858 + {
13859 + pgd_t *pgd = mm->pgd;
13860 + unsigned g;
13861 +
13862 + for (g = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
13863 + pud_t *pud;
13864 + unsigned u;
13865 +
13866 + if (pgd_none(*pgd))
13867 + continue;
13868 + pud = pud_offset(pgd, 0);
13869 + for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
13870 + pmd_t *pmd;
13871 + unsigned m;
13872 +
13873 + if (pud_none(*pud))
13874 + continue;
13875 + pmd = pmd_offset(pud, 0);
13876 + for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
13877 + spinlock_t *ptl;
13878 +
13879 + if (pmd_none(*pmd))
13880 + continue;
13881 + ptl = pte_lockptr(0, pmd);
13882 + if (lock)
13883 + spin_lock(ptl);
13884 + else
13885 + spin_unlock(ptl);
13886 + }
13887 + }
13888 + }
13889 + }
13890 +#endif
13891 + if (!lock)
13892 + spin_unlock(&mm->page_table_lock);
13893 +}
13894 +#define pin_lock(mm) _pin_lock(mm, 1)
13895 +#define pin_unlock(mm) _pin_lock(mm, 0)
13896 +
13897 +#define PIN_BATCH sizeof(void *)
13898 +static DEFINE_PER_CPU(multicall_entry_t[PIN_BATCH], pb_mcl);
13899 +
13900 +static inline unsigned int pgd_walk_set_prot(struct page *page, pgprot_t flags,
13901 + unsigned int cpu, unsigned int seq)
13902 +{
13903 + unsigned long pfn = page_to_pfn(page);
13904 +
13905 + if (PageHighMem(page)) {
13906 + if (pgprot_val(flags) & _PAGE_RW)
13907 + ClearPagePinned(page);
13908 + else
13909 + SetPagePinned(page);
13910 + } else {
13911 + MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
13912 + (unsigned long)__va(pfn << PAGE_SHIFT),
13913 + pfn_pte(pfn, flags), 0);
13914 + if (unlikely(++seq == PIN_BATCH)) {
13915 + if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
13916 + PIN_BATCH, NULL)))
13917 + BUG();
13918 + seq = 0;
13919 + }
13920 + }
13921 +
13922 + return seq;
13923 +}
13924 +
13925 +static void pgd_walk(pgd_t *pgd_base, pgprot_t flags)
13926 +{
13927 + pgd_t *pgd = pgd_base;
13928 + pud_t *pud;
13929 + pmd_t *pmd;
13930 + int g,u,m;
13931 + unsigned int cpu, seq;
13932 + multicall_entry_t *mcl;
13933 +
13934 + if (xen_feature(XENFEAT_auto_translated_physmap))
13935 + return;
13936 +
13937 + cpu = get_cpu();
13938 +
13939 + /*
13940 + * Cannot iterate up to USER_PTRS_PER_PGD on x86-64 as these pagetables
13941 + * may not be the 'current' task's pagetables (e.g., current may be
13942 + * 32-bit, but the pagetables may be for a 64-bit task).
13943 + * Subtracting 1 from TASK_SIZE64 means the loop limit is correct
13944 + * regardless of whether TASK_SIZE64 is a multiple of PGDIR_SIZE.
13945 + */
13946 + for (g = 0, seq = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
13947 + if (pgd_none(*pgd))
13948 + continue;
13949 + pud = pud_offset(pgd, 0);
13950 + if (PTRS_PER_PUD > 1) /* not folded */
13951 + seq = pgd_walk_set_prot(virt_to_page(pud),flags,cpu,seq);
13952 + for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
13953 + if (pud_none(*pud))
13954 + continue;
13955 + pmd = pmd_offset(pud, 0);
13956 + if (PTRS_PER_PMD > 1) /* not folded */
13957 + seq = pgd_walk_set_prot(virt_to_page(pmd),flags,cpu,seq);
13958 + for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
13959 + if (pmd_none(*pmd))
13960 + continue;
13961 + seq = pgd_walk_set_prot(pmd_page(*pmd),flags,cpu,seq);
13962 + }
13963 + }
13964 + }
13965 +
13966 + mcl = per_cpu(pb_mcl, cpu);
13967 +#ifdef CONFIG_X86_64
13968 + if (unlikely(seq > PIN_BATCH - 2)) {
13969 + if (unlikely(HYPERVISOR_multicall_check(mcl, seq, NULL)))
13970 + BUG();
13971 + seq = 0;
13972 + }
13973 + MULTI_update_va_mapping(mcl + seq,
13974 + (unsigned long)__user_pgd(pgd_base),
13975 + pfn_pte(virt_to_phys(__user_pgd(pgd_base))>>PAGE_SHIFT, flags),
13976 + 0);
13977 + MULTI_update_va_mapping(mcl + seq + 1,
13978 + (unsigned long)pgd_base,
13979 + pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
13980 + UVMF_TLB_FLUSH);
13981 + if (unlikely(HYPERVISOR_multicall_check(mcl, seq + 2, NULL)))
13982 + BUG();
13983 +#else
13984 + if (likely(seq != 0)) {
13985 + MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
13986 + (unsigned long)pgd_base,
13987 + pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
13988 + UVMF_TLB_FLUSH);
13989 + if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
13990 + seq + 1, NULL)))
13991 + BUG();
13992 + } else if(HYPERVISOR_update_va_mapping((unsigned long)pgd_base,
13993 + pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
13994 + UVMF_TLB_FLUSH))
13995 + BUG();
13996 +#endif
13997 +
13998 + put_cpu();
13999 +}
14000 +
14001 +static void __pgd_pin(pgd_t *pgd)
14002 +{
14003 + pgd_walk(pgd, PAGE_KERNEL_RO);
14004 + kmap_flush_unused();
14005 + xen_pgd_pin(__pa(pgd)); /* kernel */
14006 +#ifdef CONFIG_X86_64
14007 + xen_pgd_pin(__pa(__user_pgd(pgd))); /* user */
14008 +#endif
14009 + SetPagePinned(virt_to_page(pgd));
14010 +}
14011 +
14012 +static void __pgd_unpin(pgd_t *pgd)
14013 +{
14014 + xen_pgd_unpin(__pa(pgd));
14015 +#ifdef CONFIG_X86_64
14016 + xen_pgd_unpin(__pa(__user_pgd(pgd)));
14017 +#endif
14018 + pgd_walk(pgd, PAGE_KERNEL);
14019 + ClearPagePinned(virt_to_page(pgd));
14020 +}
14021 +
14022 +static void pgd_test_and_unpin(pgd_t *pgd)
14023 +{
14024 + if (PagePinned(virt_to_page(pgd)))
14025 + __pgd_unpin(pgd);
14026 +}
14027 +
14028 +void mm_pin(struct mm_struct *mm)
14029 +{
14030 + if (xen_feature(XENFEAT_writable_page_tables))
14031 + return;
14032 +
14033 + pin_lock(mm);
14034 + __pgd_pin(mm->pgd);
14035 + pin_unlock(mm);
14036 +}
14037 +
14038 +void mm_unpin(struct mm_struct *mm)
14039 +{
14040 + if (xen_feature(XENFEAT_writable_page_tables))
14041 + return;
14042 +
14043 + pin_lock(mm);
14044 + __pgd_unpin(mm->pgd);
14045 + pin_unlock(mm);
14046 +}
14047 +
14048 +void mm_pin_all(void)
14049 +{
14050 + struct page *page;
14051 + unsigned long flags;
14052 +
14053 + if (xen_feature(XENFEAT_writable_page_tables))
14054 + return;
14055 +
14056 + /*
14057 + * Allow uninterrupted access to the pgd_list. Also protects
14058 + * __pgd_pin() by disabling preemption.
14059 + * All other CPUs must be at a safe point (e.g., in stop_machine
14060 + * or offlined entirely).
14061 + */
14062 + spin_lock_irqsave(&pgd_lock, flags);
14063 + list_for_each_entry(page, &pgd_list, lru) {
14064 + if (!PagePinned(page))
14065 + __pgd_pin((pgd_t *)page_address(page));
14066 + }
14067 + spin_unlock_irqrestore(&pgd_lock, flags);
14068 +}
14069 +
14070 +void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
14071 +{
14072 + if (!PagePinned(virt_to_page(mm->pgd)))
14073 + mm_pin(mm);
14074 +}
14075 +
14076 +void arch_exit_mmap(struct mm_struct *mm)
14077 +{
14078 + struct task_struct *tsk = current;
14079 +
14080 + task_lock(tsk);
14081 +
14082 + /*
14083 + * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
14084 + * *much* faster this way, as no tlb flushes means bigger wrpt batches.
14085 + */
14086 + if (tsk->active_mm == mm) {
14087 + tsk->active_mm = &init_mm;
14088 + atomic_inc(&init_mm.mm_count);
14089 +
14090 + switch_mm(mm, &init_mm, tsk);
14091 +
14092 + atomic_dec(&mm->mm_count);
14093 + BUG_ON(atomic_read(&mm->mm_count) == 0);
14094 + }
14095 +
14096 + task_unlock(tsk);
14097 +
14098 + if (PagePinned(virt_to_page(mm->pgd))
14099 + && atomic_read(&mm->mm_count) == 1
14100 + && !mm->context.has_foreign_mappings)
14101 + mm_unpin(mm);
14102 +}
14103 +
14104 +static inline void pgd_list_add(pgd_t *pgd)
14105 +{
14106 + struct page *page = virt_to_page(pgd);
14107 +
14108 + list_add(&page->lru, &pgd_list);
14109 +}
14110 +
14111 +static inline void pgd_list_del(pgd_t *pgd)
14112 +{
14113 + struct page *page = virt_to_page(pgd);
14114 +
14115 + list_del(&page->lru);
14116 +}
14117 +
14118 +#define UNSHARED_PTRS_PER_PGD \
14119 + (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
14120 +
14121 +static void pgd_ctor(void *p)
14122 +{
14123 + pgd_t *pgd = p;
14124 + unsigned long flags;
14125 +
14126 + pgd_test_and_unpin(pgd);
14127 +
14128 + /* Clear usermode parts of PGD */
14129 + memset(pgd, 0, KERNEL_PGD_BOUNDARY*sizeof(pgd_t));
14130 +
14131 + spin_lock_irqsave(&pgd_lock, flags);
14132 +
14133 + /* If the pgd points to a shared pagetable level (either the
14134 + ptes in non-PAE, or shared PMD in PAE), then just copy the
14135 + references from swapper_pg_dir. */
14136 + if (PAGETABLE_LEVELS == 2 ||
14137 + (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD) ||
14138 + PAGETABLE_LEVELS == 4) {
14139 + clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
14140 + swapper_pg_dir + KERNEL_PGD_BOUNDARY,
14141 + KERNEL_PGD_PTRS);
14142 + paravirt_alloc_pmd_clone(__pa(pgd) >> PAGE_SHIFT,
14143 + __pa(swapper_pg_dir) >> PAGE_SHIFT,
14144 + KERNEL_PGD_BOUNDARY,
14145 + KERNEL_PGD_PTRS);
14146 + }
14147 +
14148 +#ifdef CONFIG_X86_64
14149 + /* set level3_user_pgt for vsyscall area */
14150 + __user_pgd(pgd)[pgd_index(VSYSCALL_START)] =
14151 + __pgd(__pa_symbol(level3_user_pgt) | _PAGE_TABLE);
14152 +#endif
14153 +
14154 +#ifndef CONFIG_X86_PAE
14155 + /* list required to sync kernel mapping updates */
14156 + if (!SHARED_KERNEL_PMD)
14157 + pgd_list_add(pgd);
14158 +#endif
14159 +
14160 + spin_unlock_irqrestore(&pgd_lock, flags);
14161 +}
14162 +
14163 +static void pgd_dtor(void *pgd)
14164 +{
14165 + unsigned long flags; /* can be called from interrupt context */
14166 +
14167 + if (!SHARED_KERNEL_PMD) {
14168 + spin_lock_irqsave(&pgd_lock, flags);
14169 + pgd_list_del(pgd);
14170 + spin_unlock_irqrestore(&pgd_lock, flags);
14171 + }
14172 +
14173 + pgd_test_and_unpin(pgd);
14174 +}
14175 +
14176 +/*
14177 + * List of all pgd's needed for non-PAE so it can invalidate entries
14178 + * in both cached and uncached pgd's; not needed for PAE since the
14179 + * kernel pmd is shared. If PAE were not to share the pmd a similar
14180 + * tactic would be needed. This is essentially codepath-based locking
14181 + * against pageattr.c; it is the unique case in which a valid change
14182 + * of kernel pagetables can't be lazily synchronized by vmalloc faults.
14183 + * vmalloc faults work because attached pagetables are never freed.
14184 + * -- wli
14185 + */
14186 +
14187 +#ifdef CONFIG_X86_PAE
14188 +/*
14189 + * Mop up any pmd pages which may still be attached to the pgd.
14190 + * Normally they will be freed by munmap/exit_mmap, but any pmd we
14191 + * preallocate which never got a corresponding vma will need to be
14192 + * freed manually.
14193 + */
14194 +static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
14195 +{
14196 + int i;
14197 +
14198 + for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) {
14199 + pgd_t pgd = pgdp[i];
14200 +
14201 + if (__pgd_val(pgd) != 0) {
14202 + pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
14203 +
14204 + pgdp[i] = xen_make_pgd(0);
14205 +
14206 + paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
14207 + pmd_free(mm, pmd);
14208 + }
14209 + }
14210 +
14211 + if (!xen_feature(XENFEAT_pae_pgdir_above_4gb))
14212 + xen_destroy_contiguous_region((unsigned long)pgdp, 0);
14213 +}
14214 +
14215 +/*
14216 + * In PAE mode, we need to do a cr3 reload (=tlb flush) when
14217 + * updating the top-level pagetable entries to guarantee the
14218 + * processor notices the update. Since this is expensive, and
14219 + * all 4 top-level entries are used almost immediately in a
14220 + * new process's life, we just pre-populate them here.
14221 + *
14222 + * Also, if we're in a paravirt environment where the kernel pmd is
14223 + * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
14224 + * and initialize the kernel pmds here.
14225 + */
14226 +static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
14227 +{
14228 + pud_t *pud;
14229 + pmd_t *pmds[UNSHARED_PTRS_PER_PGD];
14230 + unsigned long addr, flags;
14231 + int i;
14232 +
14233 + /*
14234 + * We can race save/restore (if we sleep during a GFP_KERNEL memory
14235 + * allocation). We therefore store virtual addresses of pmds as they
14236 + * do not change across save/restore, and poke the machine addresses
14237 + * into the pgdir under the pgd_lock.
14238 + */
14239 + for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD; i++, addr += PUD_SIZE) {
14240 + pmds[i] = pmd_alloc_one(mm, addr);
14241 + if (!pmds[i])
14242 + goto out_oom;
14243 + }
14244 +
14245 + spin_lock_irqsave(&pgd_lock, flags);
14246 +
14247 + /* Protect against save/restore: move below 4GB under pgd_lock. */
14248 + if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)
14249 + && xen_create_contiguous_region((unsigned long)pgd, 0, 32)) {
14250 + spin_unlock_irqrestore(&pgd_lock, flags);
14251 +out_oom:
14252 + while (i--)
14253 + pmd_free(mm, pmds[i]);
14254 + return 0;
14255 + }
14256 +
14257 + /* Copy kernel pmd contents and write-protect the new pmds. */
14258 + pud = pud_offset(pgd, 0);
14259 + for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD;
14260 + i++, pud++, addr += PUD_SIZE) {
14261 + if (i >= KERNEL_PGD_BOUNDARY) {
14262 + memcpy(pmds[i],
14263 + (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
14264 + sizeof(pmd_t) * PTRS_PER_PMD);
14265 + make_lowmem_page_readonly(
14266 + pmds[i], XENFEAT_writable_page_tables);
14267 + }
14268 +
14269 + /* It is safe to poke machine addresses of pmds under the pgd_lock. */
14270 + pud_populate(mm, pud, pmds[i]);
14271 + }
14272 +
14273 + /* List required to sync kernel mapping updates and
14274 + * to pin/unpin on save/restore. */
14275 + pgd_list_add(pgd);
14276 +
14277 + spin_unlock_irqrestore(&pgd_lock, flags);
14278 +
14279 + return 1;
14280 +}
14281 +
14282 +void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
14283 +{
14284 + struct page *page = virt_to_page(pmd);
14285 + unsigned long pfn = page_to_pfn(page);
14286 +
14287 + paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
14288 +
14289 + /* Note: almost everything apart from _PAGE_PRESENT is
14290 + reserved at the pmd (PDPT) level. */
14291 + if (PagePinned(virt_to_page(mm->pgd))) {
14292 + BUG_ON(PageHighMem(page));
14293 + BUG_ON(HYPERVISOR_update_va_mapping(
14294 + (unsigned long)__va(pfn << PAGE_SHIFT),
14295 + pfn_pte(pfn, PAGE_KERNEL_RO), 0));
14296 + set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
14297 + } else
14298 + *pudp = __pud(__pa(pmd) | _PAGE_PRESENT);
14299 +
14300 + /*
14301 + * According to Intel App note "TLBs, Paging-Structure Caches,
14302 + * and Their Invalidation", April 2007, document 317080-001,
14303 + * section 8.1: in PAE mode we explicitly have to flush the
14304 + * TLB via cr3 if the top-level pgd is changed...
14305 + */
14306 + if (mm == current->active_mm)
14307 + xen_tlb_flush();
14308 +}
14309 +#else /* !CONFIG_X86_PAE */
14310 +/* No need to prepopulate any pagetable entries in non-PAE modes. */
14311 +static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
14312 +{
14313 + return 1;
14314 +}
14315 +
14316 +static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgd)
14317 +{
14318 +}
14319 +#endif /* CONFIG_X86_PAE */
14320 +
14321 +#ifdef CONFIG_X86_64
14322 +/* We allocate two contiguous pages for kernel and user. */
14323 +#define PGD_ORDER 1
14324 +#else
14325 +#define PGD_ORDER 0
14326 +#endif
14327 +
14328 +pgd_t *pgd_alloc(struct mm_struct *mm)
14329 +{
14330 + pgd_t *pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, PGD_ORDER);
14331 +
14332 + /* so that alloc_pd can use it */
14333 + mm->pgd = pgd;
14334 + if (pgd)
14335 + pgd_ctor(pgd);
14336 +
14337 + if (pgd && !pgd_prepopulate_pmd(mm, pgd)) {
14338 + free_pages((unsigned long)pgd, PGD_ORDER);
14339 + pgd = NULL;
14340 + }
14341 +
14342 + return pgd;
14343 +}
14344 +
14345 +void pgd_free(struct mm_struct *mm, pgd_t *pgd)
14346 +{
14347 + /*
14348 + * After this the pgd should not be pinned for the duration of this
14349 + * function's execution. We should never sleep and thus never race:
14350 + * 1. User pmds will not become write-protected under our feet due
14351 + * to a concurrent mm_pin_all().
14352 + * 2. The machine addresses in PGD entries will not become invalid
14353 + * due to a concurrent save/restore.
14354 + */
14355 + pgd_dtor(pgd);
14356 +
14357 + pgd_mop_up_pmds(mm, pgd);
14358 + free_pages((unsigned long)pgd, PGD_ORDER);
14359 +}
14360 +
14361 +/* blktap and gntdev need this, as otherwise they would implicitly (and
14362 + * needlessly, as they never use it) reference init_mm. */
14363 +pte_t xen_ptep_get_and_clear_full(struct vm_area_struct *vma,
14364 + unsigned long addr, pte_t *ptep, int full)
14365 +{
14366 + return ptep_get_and_clear_full(vma->vm_mm, addr, ptep, full);
14367 +}
14368 +EXPORT_SYMBOL_GPL(xen_ptep_get_and_clear_full);
14369 +
14370 +int ptep_set_access_flags(struct vm_area_struct *vma,
14371 + unsigned long address, pte_t *ptep,
14372 + pte_t entry, int dirty)
14373 +{
14374 + int changed = !pte_same(*ptep, entry);
14375 +
14376 + if (changed && dirty) {
14377 + if (likely(vma->vm_mm == current->mm)) {
14378 + if (HYPERVISOR_update_va_mapping(address,
14379 + entry,
14380 + (unsigned long)vma->vm_mm->cpu_vm_mask.bits|
14381 + UVMF_INVLPG|UVMF_MULTI))
14382 + BUG();
14383 + } else {
14384 + xen_l1_entry_update(ptep, entry);
14385 + flush_tlb_page(vma, address);
14386 + }
14387 + }
14388 +
14389 + return changed;
14390 +}
14391 +
14392 +int ptep_test_and_clear_young(struct vm_area_struct *vma,
14393 + unsigned long addr, pte_t *ptep)
14394 +{
14395 + int ret = 0;
14396 +
14397 + if (pte_young(*ptep))
14398 + ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
14399 + &ptep->pte);
14400 +
14401 + if (ret)
14402 + pte_update(vma->vm_mm, addr, ptep);
14403 +
14404 + return ret;
14405 +}
14406 +
14407 +int ptep_clear_flush_young(struct vm_area_struct *vma,
14408 + unsigned long address, pte_t *ptep)
14409 +{
14410 + pte_t pte = *ptep;
14411 + int young = pte_young(pte);
14412 +
14413 + pte = pte_mkold(pte);
14414 + if (PagePinned(virt_to_page(vma->vm_mm->pgd)))
14415 + ptep_set_access_flags(vma, address, ptep, pte, young);
14416 + else if (young)
14417 + ptep->pte_low = pte.pte_low;
14418 +
14419 + return young;
14420 +}
14421 --- sle11-2009-10-16.orig/arch/x86/mm/pgtable_32-xen.c 2009-03-16 16:33:40.000000000 +0100
14422 +++ sle11-2009-10-16/arch/x86/mm/pgtable_32-xen.c 2009-03-16 16:38:05.000000000 +0100
14423 @@ -1,7 +1,3 @@
14424 -/*
14425 - * linux/arch/i386/mm/pgtable.c
14426 - */
14427 -
14428 #include <linux/sched.h>
14429 #include <linux/kernel.h>
14430 #include <linux/errno.h>
14431 @@ -41,7 +37,6 @@ void show_mem(void)
14432
14433 printk(KERN_INFO "Mem-info:\n");
14434 show_free_areas();
14435 - printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
14436 for_each_online_pgdat(pgdat) {
14437 pgdat_resize_lock(pgdat, &flags);
14438 for (i = 0; i < pgdat->node_spanned_pages; ++i) {
14439 @@ -157,243 +152,6 @@ void __init reserve_top_address(unsigned
14440 __VMALLOC_RESERVE += reserve;
14441 }
14442
14443 -pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
14444 -{
14445 - pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
14446 - if (pte)
14447 - make_lowmem_page_readonly(pte, XENFEAT_writable_page_tables);
14448 - return pte;
14449 -}
14450 -
14451 -/*
14452 - * List of all pgd's needed for non-PAE so it can invalidate entries
14453 - * in both cached and uncached pgd's; not needed for PAE since the
14454 - * kernel pmd is shared. If PAE were not to share the pmd a similar
14455 - * tactic would be needed. This is essentially codepath-based locking
14456 - * against pageattr.c; it is the unique case in which a valid change
14457 - * of kernel pagetables can't be lazily synchronized by vmalloc faults.
14458 - * vmalloc faults work because attached pagetables are never freed.
14459 - * -- wli
14460 - */
14461 -static inline void pgd_list_add(pgd_t *pgd)
14462 -{
14463 - struct page *page = virt_to_page(pgd);
14464 -
14465 - list_add(&page->lru, &pgd_list);
14466 -}
14467 -
14468 -static inline void pgd_list_del(pgd_t *pgd)
14469 -{
14470 - struct page *page = virt_to_page(pgd);
14471 -
14472 - list_del(&page->lru);
14473 -}
14474 -
14475 -#define UNSHARED_PTRS_PER_PGD \
14476 - (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD)
14477 -
14478 -static void pgd_ctor(void *p)
14479 -{
14480 - pgd_t *pgd = p;
14481 - unsigned long flags;
14482 -
14483 - pgd_test_and_unpin(pgd);
14484 -
14485 - /* Clear usermode parts of PGD */
14486 - memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
14487 -
14488 - spin_lock_irqsave(&pgd_lock, flags);
14489 -
14490 - /* If the pgd points to a shared pagetable level (either the
14491 - ptes in non-PAE, or shared PMD in PAE), then just copy the
14492 - references from swapper_pg_dir. */
14493 - if (PAGETABLE_LEVELS == 2 ||
14494 - (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD)) {
14495 - clone_pgd_range(pgd + USER_PTRS_PER_PGD,
14496 - swapper_pg_dir + USER_PTRS_PER_PGD,
14497 - KERNEL_PGD_PTRS);
14498 - paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
14499 - __pa(swapper_pg_dir) >> PAGE_SHIFT,
14500 - USER_PTRS_PER_PGD,
14501 - KERNEL_PGD_PTRS);
14502 - }
14503 -
14504 - /* list required to sync kernel mapping updates */
14505 - if (PAGETABLE_LEVELS == 2)
14506 - pgd_list_add(pgd);
14507 -
14508 - spin_unlock_irqrestore(&pgd_lock, flags);
14509 -}
14510 -
14511 -static void pgd_dtor(void *pgd)
14512 -{
14513 - unsigned long flags; /* can be called from interrupt context */
14514 -
14515 - if (!SHARED_KERNEL_PMD) {
14516 - spin_lock_irqsave(&pgd_lock, flags);
14517 - pgd_list_del(pgd);
14518 - spin_unlock_irqrestore(&pgd_lock, flags);
14519 - }
14520 -
14521 - pgd_test_and_unpin(pgd);
14522 -}
14523 -
14524 -#ifdef CONFIG_X86_PAE
14525 -/*
14526 - * Mop up any pmd pages which may still be attached to the pgd.
14527 - * Normally they will be freed by munmap/exit_mmap, but any pmd we
14528 - * preallocate which never got a corresponding vma will need to be
14529 - * freed manually.
14530 - */
14531 -static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
14532 -{
14533 - int i;
14534 -
14535 - for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) {
14536 - pgd_t pgd = pgdp[i];
14537 -
14538 - if (__pgd_val(pgd) != 0) {
14539 - pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
14540 -
14541 - pgdp[i] = xen_make_pgd(0);
14542 -
14543 - paravirt_release_pd(pgd_val(pgd) >> PAGE_SHIFT);
14544 - pmd_free(mm, pmd);
14545 - }
14546 - }
14547 -}
14548 -
14549 -/*
14550 - * In PAE mode, we need to do a cr3 reload (=tlb flush) when
14551 - * updating the top-level pagetable entries to guarantee the
14552 - * processor notices the update. Since this is expensive, and
14553 - * all 4 top-level entries are used almost immediately in a
14554 - * new process's life, we just pre-populate them here.
14555 - *
14556 - * Also, if we're in a paravirt environment where the kernel pmd is
14557 - * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
14558 - * and initialize the kernel pmds here.
14559 - */
14560 -static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
14561 -{
14562 - pud_t *pud;
14563 - pmd_t *pmds[UNSHARED_PTRS_PER_PGD];
14564 - unsigned long addr, flags;
14565 - int i;
14566 -
14567 - /*
14568 - * We can race save/restore (if we sleep during a GFP_KERNEL memory
14569 - * allocation). We therefore store virtual addresses of pmds as they
14570 - * do not change across save/restore, and poke the machine addresses
14571 - * into the pgdir under the pgd_lock.
14572 - */
14573 - for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD; i++, addr += PUD_SIZE) {
14574 - pmds[i] = pmd_alloc_one(mm, addr);
14575 - if (!pmds[i])
14576 - goto out_oom;
14577 - }
14578 -
14579 - spin_lock_irqsave(&pgd_lock, flags);
14580 -
14581 - /* Protect against save/restore: move below 4GB under pgd_lock. */
14582 - if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)
14583 - && xen_create_contiguous_region((unsigned long)pgd, 0, 32)) {
14584 - spin_unlock_irqrestore(&pgd_lock, flags);
14585 -out_oom:
14586 - while (i--)
14587 - pmd_free(mm, pmds[i]);
14588 - return 0;
14589 - }
14590 -
14591 - /* Copy kernel pmd contents and write-protect the new pmds. */
14592 - pud = pud_offset(pgd, 0);
14593 - for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD;
14594 - i++, pud++, addr += PUD_SIZE) {
14595 - if (i >= USER_PTRS_PER_PGD) {
14596 - memcpy(pmds[i],
14597 - (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
14598 - sizeof(pmd_t) * PTRS_PER_PMD);
14599 - make_lowmem_page_readonly(
14600 - pmds[i], XENFEAT_writable_page_tables);
14601 - }
14602 -
14603 - /* It is safe to poke machine addresses of pmds under the pgd_lock. */
14604 - pud_populate(mm, pud, pmds[i]);
14605 - }
14606 -
14607 - /* List required to sync kernel mapping updates and
14608 - * to pin/unpin on save/restore. */
14609 - pgd_list_add(pgd);
14610 -
14611 - spin_unlock_irqrestore(&pgd_lock, flags);
14612 -
14613 - return 1;
14614 -}
14615 -#else /* !CONFIG_X86_PAE */
14616 -/* No need to prepopulate any pagetable entries in non-PAE modes. */
14617 -static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
14618 -{
14619 - return 1;
14620 -}
14621 -
14622 -static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
14623 -{
14624 -}
14625 -#endif /* CONFIG_X86_PAE */
14626 -
14627 -pgd_t *pgd_alloc(struct mm_struct *mm)
14628 -{
14629 - pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
14630 -
14631 - /* so that alloc_pd can use it */
14632 - mm->pgd = pgd;
14633 - if (pgd)
14634 - pgd_ctor(pgd);
14635 -
14636 - if (pgd && !pgd_prepopulate_pmd(mm, pgd)) {
14637 - free_page((unsigned long)pgd);
14638 - pgd = NULL;
14639 - }
14640 -
14641 - return pgd;
14642 -}
14643 -
14644 -void pgd_free(struct mm_struct *mm, pgd_t *pgd)
14645 -{
14646 - /*
14647 - * After this the pgd should not be pinned for the duration of this
14648 - * function's execution. We should never sleep and thus never race:
14649 - * 1. User pmds will not become write-protected under our feet due
14650 - * to a concurrent mm_pin_all().
14651 - * 2. The machine addresses in PGD entries will not become invalid
14652 - * due to a concurrent save/restore.
14653 - */
14654 - pgd_dtor(pgd);
14655 -
14656 - if (PTRS_PER_PMD > 1 && !xen_feature(XENFEAT_pae_pgdir_above_4gb))
14657 - xen_destroy_contiguous_region((unsigned long)pgd, 0);
14658 -
14659 - pgd_mop_up_pmds(mm, pgd);
14660 - free_page((unsigned long)pgd);
14661 -}
14662 -
14663 -void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
14664 -{
14665 - pgtable_page_dtor(pte);
14666 - paravirt_release_pt(page_to_pfn(pte));
14667 - tlb_remove_page(tlb, pte);
14668 -}
14669 -
14670 -#ifdef CONFIG_X86_PAE
14671 -
14672 -void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
14673 -{
14674 - paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
14675 - tlb_remove_page(tlb, virt_to_page(pmd));
14676 -}
14677 -
14678 -#endif
14679 -
14680 void make_lowmem_page_readonly(void *va, unsigned int feature)
14681 {
14682 pte_t *pte;
14683 --- sle11-2009-10-16.orig/arch/x86/pci/i386.c 2009-10-28 14:55:03.000000000 +0100
14684 +++ sle11-2009-10-16/arch/x86/pci/i386.c 2009-10-08 12:08:34.000000000 +0200
14685 @@ -338,10 +338,14 @@ int pci_mmap_page_range(struct pci_dev *
14686 flags);
14687 }
14688
14689 +#ifndef CONFIG_XEN
14690 if (((vma->vm_pgoff < max_low_pfn_mapped) ||
14691 (vma->vm_pgoff >= (1UL<<(32 - PAGE_SHIFT)) &&
14692 vma->vm_pgoff < max_pfn_mapped)) &&
14693 ioremap_change_attr((unsigned long)__va(addr), len, flags)) {
14694 +#else
14695 + if (ioremap_check_change_attr(vma->vm_pgoff, len, flags)) {
14696 +#endif
14697 free_memtype(addr, addr + len);
14698 return -EINVAL;
14699 }
14700 --- sle11-2009-10-16.orig/arch/x86/pci/irq-xen.c 2009-03-16 16:33:40.000000000 +0100
14701 +++ sle11-2009-10-16/arch/x86/pci/irq-xen.c 2009-03-16 16:38:05.000000000 +0100
14702 @@ -140,9 +140,11 @@ static void __init pirq_peer_trick(void)
14703 busmap[e->bus] = 1;
14704 }
14705 for(i = 1; i < 256; i++) {
14706 + int node;
14707 if (!busmap[i] || pci_find_bus(0, i))
14708 continue;
14709 - if (pci_scan_bus_with_sysdata(i))
14710 + node = get_mp_bus_to_node(i);
14711 + if (pci_scan_bus_on_node(i, &pci_root_ops, node))
14712 printk(KERN_INFO "PCI: Discovered primary peer "
14713 "bus %02x [IRQ]\n", i);
14714 }
14715 @@ -204,7 +206,7 @@ static int pirq_ali_get(struct pci_dev *
14716 {
14717 static const unsigned char irqmap[16] = { 0, 9, 3, 10, 4, 5, 7, 6, 1, 11, 0, 12, 0, 14, 0, 15 };
14718
14719 - WARN_ON_ONCE(pirq >= 16);
14720 + WARN_ON_ONCE(pirq > 16);
14721 return irqmap[read_config_nybble(router, 0x48, pirq-1)];
14722 }
14723
14724 @@ -213,7 +215,7 @@ static int pirq_ali_set(struct pci_dev *
14725 static const unsigned char irqmap[16] = { 0, 8, 0, 2, 4, 5, 7, 6, 0, 1, 3, 9, 11, 0, 13, 15 };
14726 unsigned int val = irqmap[irq];
14727
14728 - WARN_ON_ONCE(pirq >= 16);
14729 + WARN_ON_ONCE(pirq > 16);
14730 if (val) {
14731 write_config_nybble(router, 0x48, pirq-1, val);
14732 return 1;
14733 @@ -264,7 +266,7 @@ static int pirq_via586_get(struct pci_de
14734 {
14735 static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 };
14736
14737 - WARN_ON_ONCE(pirq >= 5);
14738 + WARN_ON_ONCE(pirq > 5);
14739 return read_config_nybble(router, 0x55, pirqmap[pirq-1]);
14740 }
14741
14742 @@ -272,7 +274,7 @@ static int pirq_via586_set(struct pci_de
14743 {
14744 static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 };
14745
14746 - WARN_ON_ONCE(pirq >= 5);
14747 + WARN_ON_ONCE(pirq > 5);
14748 write_config_nybble(router, 0x55, pirqmap[pirq-1], irq);
14749 return 1;
14750 }
14751 @@ -286,7 +288,7 @@ static int pirq_ite_get(struct pci_dev *
14752 {
14753 static const unsigned char pirqmap[4] = { 1, 0, 2, 3 };
14754
14755 - WARN_ON_ONCE(pirq >= 4);
14756 + WARN_ON_ONCE(pirq > 4);
14757 return read_config_nybble(router,0x43, pirqmap[pirq-1]);
14758 }
14759
14760 @@ -294,7 +296,7 @@ static int pirq_ite_set(struct pci_dev *
14761 {
14762 static const unsigned char pirqmap[4] = { 1, 0, 2, 3 };
14763
14764 - WARN_ON_ONCE(pirq >= 4);
14765 + WARN_ON_ONCE(pirq > 4);
14766 write_config_nybble(router, 0x43, pirqmap[pirq-1], irq);
14767 return 1;
14768 }
14769 @@ -623,6 +625,13 @@ static __init int via_router_probe(struc
14770 */
14771 device = PCI_DEVICE_ID_VIA_8235;
14772 break;
14773 + case PCI_DEVICE_ID_VIA_8237:
14774 + /**
14775 + * Asus a7v600 bios wrongly reports 8237
14776 + * as 586-compatible
14777 + */
14778 + device = PCI_DEVICE_ID_VIA_8237;
14779 + break;
14780 }
14781 }
14782
14783 --- sle11-2009-10-16.orig/arch/x86/vdso/vdso32-setup-xen.c 2009-03-16 16:33:40.000000000 +0100
14784 +++ sle11-2009-10-16/arch/x86/vdso/vdso32-setup-xen.c 2009-03-16 16:38:05.000000000 +0100
14785 @@ -164,7 +164,7 @@ static __init void relocate_vdso(Elf32_E
14786 Elf32_Shdr *shdr;
14787 int i;
14788
14789 - BUG_ON(memcmp(ehdr->e_ident, ELFMAG, 4) != 0 ||
14790 + BUG_ON(memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0 ||
14791 !elf_check_arch_ia32(ehdr) ||
14792 ehdr->e_type != ET_DYN);
14793
14794 @@ -233,8 +233,12 @@ void syscall32_cpu_init(void)
14795 BUG();
14796 #endif
14797
14798 - if (use_sysenter < 0)
14799 - use_sysenter = (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL);
14800 + if (use_sysenter < 0) {
14801 + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
14802 + use_sysenter = 1;
14803 + if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR)
14804 + use_sysenter = 1;
14805 + }
14806 }
14807
14808 #define compat_uses_vma 1
14809 @@ -337,8 +341,6 @@ int __init sysenter_setup(void)
14810
14811 #ifdef CONFIG_X86_32
14812 gate_vma_init();
14813 -
14814 - printk("Compat vDSO mapped to %08lx.\n", __fix_to_virt(FIX_VDSO));
14815 #endif
14816
14817 #if defined(CONFIG_X86_64) && CONFIG_XEN_COMPAT < 0x030200
14818 @@ -383,6 +385,9 @@ int arch_setup_additional_pages(struct l
14819 int ret = 0;
14820 bool compat;
14821
14822 + if (vdso_enabled == VDSO_DISABLED)
14823 + return 0;
14824 +
14825 down_write(&mm->mmap_sem);
14826
14827 /* Test compat mode once here, in case someone
14828 --- sle11-2009-10-16.orig/drivers/acpi/processor_core.c 2009-08-26 11:52:33.000000000 +0200
14829 +++ sle11-2009-10-16/drivers/acpi/processor_core.c 2009-08-26 11:54:44.000000000 +0200
14830 @@ -666,7 +666,7 @@ static int acpi_processor_get_info(struc
14831 * of /proc/cpuinfo
14832 */
14833 status = acpi_evaluate_object(pr->handle, "_SUN", NULL, &buffer);
14834 - if (ACPI_SUCCESS(status))
14835 + if (ACPI_SUCCESS(status) && pr->id != -1)
14836 arch_fix_phys_package_id(pr->id, object.integer.value);
14837
14838 return 0;
14839 --- sle11-2009-10-16.orig/drivers/firmware/iscsi_ibft.c 2009-10-28 14:55:03.000000000 +0100
14840 +++ sle11-2009-10-16/drivers/firmware/iscsi_ibft.c 2009-09-24 09:54:51.000000000 +0200
14841 @@ -943,7 +943,7 @@ static int __init ibft_init(void)
14842
14843 if (ibft_addr) {
14844 printk(KERN_INFO "iBFT detected at 0x%lx.\n",
14845 - virt_to_phys((void *)ibft_addr));
14846 + isa_virt_to_bus(ibft_addr));
14847
14848 rc = ibft_check_device();
14849 if (rc)
14850 --- sle11-2009-10-16.orig/drivers/firmware/iscsi_ibft_find.c 2009-10-28 14:55:03.000000000 +0100
14851 +++ sle11-2009-10-16/drivers/firmware/iscsi_ibft_find.c 2009-09-24 09:52:18.000000000 +0200
14852 @@ -65,10 +65,10 @@ void __init reserve_ibft_region(void)
14853 * so skip that area */
14854 if (pos == VGA_MEM)
14855 pos += VGA_SIZE;
14856 - virt = phys_to_virt(pos);
14857 + virt = isa_bus_to_virt(pos);
14858 if (memcmp(virt, IBFT_SIGN, IBFT_SIGN_LEN) == 0) {
14859 unsigned long *addr =
14860 - (unsigned long *)phys_to_virt(pos + 4);
14861 + (unsigned long *)isa_bus_to_virt(pos + 4);
14862 len = *addr;
14863 /* if the length of the table extends past 1M,
14864 * the table cannot be valid. */
14865 @@ -78,6 +78,8 @@ void __init reserve_ibft_region(void)
14866 }
14867 }
14868 }
14869 +#ifndef CONFIG_XEN
14870 if (ibft_addr)
14871 reserve_bootmem(pos, PAGE_ALIGN(len), BOOTMEM_DEFAULT);
14872 +#endif
14873 }
14874 --- sle11-2009-10-16.orig/drivers/input/xen-kbdfront.c 2009-10-28 14:55:03.000000000 +0100
14875 +++ sle11-2009-10-16/drivers/input/xen-kbdfront.c 2009-03-16 16:38:05.000000000 +0100
14876 @@ -325,7 +325,6 @@ static struct xenbus_device_id xenkbd_id
14877
14878 static struct xenbus_driver xenkbd = {
14879 .name = "vkbd",
14880 - .owner = THIS_MODULE,
14881 .ids = xenkbd_ids,
14882 .probe = xenkbd_probe,
14883 .remove = xenkbd_remove,
14884 --- sle11-2009-10-16.orig/drivers/oprofile/cpu_buffer.c 2009-03-12 16:15:32.000000000 +0100
14885 +++ sle11-2009-10-16/drivers/oprofile/cpu_buffer.c 2009-03-16 16:38:05.000000000 +0100
14886 @@ -341,7 +341,7 @@ void oprofile_add_mode(int cpu_mode)
14887
14888 int oprofile_add_domain_switch(int32_t domain_id)
14889 {
14890 - struct oprofile_cpu_buffer * cpu_buf = &cpu_buffer[smp_processor_id()];
14891 + struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(cpu_buffer);
14892
14893 /* should have space for switching into and out of domain
14894 (2 slots each) plus one sample and one cpu mode switch */
14895 --- sle11-2009-10-16.orig/drivers/pci/msi-xen.c 2009-03-16 16:33:40.000000000 +0100
14896 +++ sle11-2009-10-16/drivers/pci/msi-xen.c 2009-03-16 16:38:05.000000000 +0100
14897 @@ -583,7 +583,7 @@ int pci_enable_msi(struct pci_dev* dev)
14898 EXPORT_SYMBOL(pci_enable_msi);
14899
14900 extern void pci_frontend_disable_msi(struct pci_dev* dev);
14901 -void pci_disable_msi(struct pci_dev* dev)
14902 +void pci_msi_shutdown(struct pci_dev* dev)
14903 {
14904 int pirq;
14905
14906 @@ -612,6 +612,10 @@ void pci_disable_msi(struct pci_dev* dev
14907 pci_intx_for_msi(dev, 1);
14908 dev->msi_enabled = 0;
14909 }
14910 +void pci_disable_msi(struct pci_dev* dev)
14911 +{
14912 + pci_msi_shutdown(dev);
14913 +}
14914 EXPORT_SYMBOL(pci_disable_msi);
14915
14916 /**
14917 @@ -714,7 +718,7 @@ int pci_enable_msix(struct pci_dev* dev,
14918 EXPORT_SYMBOL(pci_enable_msix);
14919
14920 extern void pci_frontend_disable_msix(struct pci_dev* dev);
14921 -void pci_disable_msix(struct pci_dev* dev)
14922 +void pci_msix_shutdown(struct pci_dev* dev)
14923 {
14924 if (!pci_msi_enable)
14925 return;
14926 @@ -751,6 +755,10 @@ void pci_disable_msix(struct pci_dev* de
14927 pci_intx_for_msi(dev, 1);
14928 dev->msix_enabled = 0;
14929 }
14930 +void pci_disable_msix(struct pci_dev* dev)
14931 +{
14932 + pci_msix_shutdown(dev);
14933 +}
14934 EXPORT_SYMBOL(pci_disable_msix);
14935
14936 /**
14937 --- sle11-2009-10-16.orig/drivers/video/Kconfig 2009-06-04 10:18:21.000000000 +0200
14938 +++ sle11-2009-10-16/drivers/video/Kconfig 2009-03-16 16:38:05.000000000 +0100
14939 @@ -2029,7 +2029,7 @@ config FB_VIRTUAL
14940
14941 config XEN_FBDEV_FRONTEND
14942 tristate "Xen virtual frame buffer support"
14943 - depends on FB && XEN
14944 + depends on FB && PARAVIRT_XEN
14945 select FB_SYS_FILLRECT
14946 select FB_SYS_COPYAREA
14947 select FB_SYS_IMAGEBLIT
14948 --- sle11-2009-10-16.orig/drivers/video/xen-fbfront.c 2009-10-28 14:55:03.000000000 +0100
14949 +++ sle11-2009-10-16/drivers/video/xen-fbfront.c 2009-03-16 16:38:05.000000000 +0100
14950 @@ -670,7 +670,6 @@ static struct xenbus_device_id xenfb_ids
14951
14952 static struct xenbus_driver xenfb = {
14953 .name = "vfb",
14954 - .owner = THIS_MODULE,
14955 .ids = xenfb_ids,
14956 .probe = xenfb_probe,
14957 .remove = xenfb_remove,
14958 --- sle11-2009-10-16.orig/drivers/xen/Kconfig 2009-03-04 11:28:34.000000000 +0100
14959 +++ sle11-2009-10-16/drivers/xen/Kconfig 2009-03-16 16:38:05.000000000 +0100
14960 @@ -2,8 +2,6 @@
14961 # This Kconfig describe xen options
14962 #
14963
14964 -mainmenu "Xen Configuration"
14965 -
14966 config XEN
14967 bool
14968
14969 --- sle11-2009-10-16.orig/drivers/xen/Makefile 2009-02-16 16:17:21.000000000 +0100
14970 +++ sle11-2009-10-16/drivers/xen/Makefile 2009-03-16 16:38:05.000000000 +0100
14971 @@ -1,5 +1,8 @@
14972 -obj-$(CONFIG_PARAVIRT_XEN) += grant-table.o
14973 +obj-$(CONFIG_PARAVIRT_XEN) += grant-table.o features.o events.o
14974 +xen-xencomm-$(CONFIG_PARAVIRT_XEN) := xencomm.o
14975 +xen-balloon-$(CONFIG_PARAVIRT_XEN) := balloon.o
14976
14977 +xen-balloon-$(CONFIG_XEN) := balloon/
14978 obj-$(CONFIG_XEN) += core/
14979 obj-$(CONFIG_XEN) += console/
14980 obj-$(CONFIG_XEN) += evtchn/
14981 @@ -7,7 +10,8 @@ obj-y += xenbus/
14982 obj-$(CONFIG_XEN) += char/
14983
14984 obj-$(CONFIG_XEN) += util.o
14985 -obj-$(CONFIG_XEN_BALLOON) += balloon/
14986 +obj-$(CONFIG_XEN_XENCOMM) += $(xen-xencomm-y)
14987 +obj-$(CONFIG_XEN_BALLOON) += $(xen-balloon-y)
14988 obj-$(CONFIG_XEN_BLKDEV_BACKEND) += blkback/
14989 obj-$(CONFIG_XEN_BLKDEV_TAP) += blktap/
14990 obj-$(CONFIG_XEN_NETDEV_BACKEND) += netback/
14991 --- sle11-2009-10-16.orig/drivers/xen/blkfront/blkfront.c 2009-03-24 10:12:53.000000000 +0100
14992 +++ sle11-2009-10-16/drivers/xen/blkfront/blkfront.c 2009-05-19 10:38:53.000000000 +0200
14993 @@ -285,7 +285,11 @@ static void backend_changed(struct xenbu
14994 break;
14995
14996 case XenbusStateClosing:
14997 - bd = bdget(info->dev);
14998 + if (!info->gd) {
14999 + xenbus_frontend_closed(dev);
15000 + break;
15001 + }
15002 + bd = bdget_disk(info->gd, 0);
15003 if (bd == NULL)
15004 xenbus_dev_fatal(dev, -ENODEV, "bdget failed");
15005
15006 --- sle11-2009-10-16.orig/drivers/xen/blkfront/block.h 2009-03-24 10:11:58.000000000 +0100
15007 +++ sle11-2009-10-16/drivers/xen/blkfront/block.h 2009-03-16 16:38:05.000000000 +0100
15008 @@ -96,7 +96,6 @@ struct blk_shadow {
15009 struct blkfront_info
15010 {
15011 struct xenbus_device *xbdev;
15012 - dev_t dev;
15013 struct gendisk *gd;
15014 int vdevice;
15015 blkif_vdev_t handle;
15016 --- sle11-2009-10-16.orig/drivers/xen/blkfront/vbd.c 2009-02-16 16:17:21.000000000 +0100
15017 +++ sle11-2009-10-16/drivers/xen/blkfront/vbd.c 2009-03-16 16:38:05.000000000 +0100
15018 @@ -246,17 +246,32 @@ xlvbd_init_blk_queue(struct gendisk *gd,
15019 return 0;
15020 }
15021
15022 -static int
15023 -xlvbd_alloc_gendisk(int major, int minor, blkif_sector_t capacity, int vdevice,
15024 - u16 vdisk_info, u16 sector_size,
15025 - struct blkfront_info *info)
15026 +int
15027 +xlvbd_add(blkif_sector_t capacity, int vdevice, u16 vdisk_info,
15028 + u16 sector_size, struct blkfront_info *info)
15029 {
15030 + int major, minor;
15031 struct gendisk *gd;
15032 struct xlbd_major_info *mi;
15033 int nr_minors = 1;
15034 int err = -ENODEV;
15035 unsigned int offset;
15036
15037 + if ((vdevice>>EXT_SHIFT) > 1) {
15038 + /* this is above the extended range; something is wrong */
15039 + printk(KERN_WARNING "blkfront: vdevice 0x%x is above the extended range; ignoring\n", vdevice);
15040 + return -ENODEV;
15041 + }
15042 +
15043 + if (!VDEV_IS_EXTENDED(vdevice)) {
15044 + major = BLKIF_MAJOR(vdevice);
15045 + minor = BLKIF_MINOR(vdevice);
15046 + }
15047 + else {
15048 + major = 202;
15049 + minor = BLKIF_MINOR_EXT(vdevice);
15050 + }
15051 +
15052 BUG_ON(info->gd != NULL);
15053 BUG_ON(info->mi != NULL);
15054 BUG_ON(info->rq != NULL);
15055 @@ -337,41 +352,6 @@ xlvbd_alloc_gendisk(int major, int minor
15056 return err;
15057 }
15058
15059 -int
15060 -xlvbd_add(blkif_sector_t capacity, int vdevice, u16 vdisk_info,
15061 - u16 sector_size, struct blkfront_info *info)
15062 -{
15063 - struct block_device *bd;
15064 - int err = 0;
15065 - int major, minor;
15066 -
15067 - if ((vdevice>>EXT_SHIFT) > 1) {
15068 - /* this is above the extended range; something is wrong */
15069 - printk(KERN_WARNING "blkfront: vdevice 0x%x is above the extended range; ignoring\n", vdevice);
15070 - return -ENODEV;
15071 - }
15072 -
15073 - if (!VDEV_IS_EXTENDED(vdevice)) {
15074 - major = BLKIF_MAJOR(vdevice);
15075 - minor = BLKIF_MINOR(vdevice);
15076 - }
15077 - else {
15078 - major = 202;
15079 - minor = BLKIF_MINOR_EXT(vdevice);
15080 - }
15081 -
15082 - info->dev = MKDEV(major, minor);
15083 - bd = bdget(info->dev);
15084 - if (bd == NULL)
15085 - return -ENODEV;
15086 -
15087 - err = xlvbd_alloc_gendisk(major, minor, capacity, vdevice, vdisk_info,
15088 - sector_size, info);
15089 -
15090 - bdput(bd);
15091 - return err;
15092 -}
15093 -
15094 void
15095 xlvbd_del(struct blkfront_info *info)
15096 {
15097 --- sle11-2009-10-16.orig/drivers/xen/blktap/blktap.c 2009-04-20 11:38:54.000000000 +0200
15098 +++ sle11-2009-10-16/drivers/xen/blktap/blktap.c 2009-04-20 11:40:14.000000000 +0200
15099 @@ -111,6 +111,7 @@ typedef struct tap_blkif {
15100 unsigned long mode; /*current switching mode */
15101 int minor; /*Minor number for tapdisk device */
15102 pid_t pid; /*tapdisk process id */
15103 + struct pid_namespace *pid_ns; /*... and its corresponding namespace */
15104 enum { RUNNING, CLEANSHUTDOWN } status; /*Detect a clean userspace
15105 shutdown */
15106 unsigned long *idx_map; /*Record the user ring id to kern
15107 @@ -299,16 +300,14 @@ struct tap_vma_priv {
15108 struct page *map[];
15109 };
15110
15111 -static struct page *blktap_nopage(struct vm_area_struct *vma,
15112 - unsigned long address,
15113 - int *type)
15114 +static int blktap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
15115 {
15116 /*
15117 * if the page has not been mapped in by the driver then return
15118 - * NOPAGE_SIGBUS to the domain.
15119 + * VM_FAULT_SIGBUS to the domain.
15120 */
15121
15122 - return NOPAGE_SIGBUS;
15123 + return VM_FAULT_SIGBUS;
15124 }
15125
15126 static pte_t blktap_clear_pte(struct vm_area_struct *vma,
15127 @@ -404,7 +403,7 @@ static void blktap_vma_close(struct vm_a
15128 }
15129
15130 struct vm_operations_struct blktap_vm_ops = {
15131 - nopage: blktap_nopage,
15132 + fault: blktap_fault,
15133 zap_pte: blktap_clear_pte,
15134 close: blktap_vma_close,
15135 };
15136 @@ -498,9 +497,8 @@ found:
15137 tapfds[minor] = info;
15138
15139 if ((class = get_xen_class()) != NULL)
15140 - class_device_create(class, NULL,
15141 - MKDEV(blktap_major, minor), NULL,
15142 - "blktap%d", minor);
15143 + device_create(class, NULL, MKDEV(blktap_major, minor),
15144 + "blktap%d", minor);
15145 }
15146
15147 out:
15148 @@ -542,7 +540,7 @@ void signal_tapdisk(int idx)
15149 return;
15150
15151 if (info->pid > 0) {
15152 - ptask = find_task_by_pid(info->pid);
15153 + ptask = find_task_by_pid_ns(info->pid, info->pid_ns);
15154 if (ptask)
15155 info->status = CLEANSHUTDOWN;
15156 }
15157 @@ -770,8 +768,9 @@ static int blktap_ioctl(struct inode *in
15158 {
15159 if (info) {
15160 info->pid = (pid_t)arg;
15161 - DPRINTK("blktap: pid received %d\n",
15162 - info->pid);
15163 + info->pid_ns = current->nsproxy->pid_ns;
15164 + DPRINTK("blktap: pid received %p:%d\n",
15165 + info->pid_ns, info->pid);
15166 }
15167 return 0;
15168 }
15169 @@ -1684,9 +1683,7 @@ static int __init blkif_init(void)
15170 * We only create the device when a request of a new device is
15171 * made.
15172 */
15173 - class_device_create(class, NULL,
15174 - MKDEV(blktap_major, 0), NULL,
15175 - "blktap0");
15176 + device_create(class, NULL, MKDEV(blktap_major, 0), "blktap0");
15177 } else {
15178 /* this is bad, but not fatal */
15179 WPRINTK("blktap: sysfs xen_class not created\n");
15180 --- sle11-2009-10-16.orig/drivers/xen/char/mem.c 2008-12-15 11:27:22.000000000 +0100
15181 +++ sle11-2009-10-16/drivers/xen/char/mem.c 2009-03-16 16:38:05.000000000 +0100
15182 @@ -33,6 +33,27 @@ static inline int uncached_access(struct
15183 return 0;
15184 }
15185
15186 +static inline int range_is_allowed(unsigned long pfn, unsigned long size)
15187 +{
15188 +#ifdef CONFIG_NONPROMISC_DEVMEM
15189 + u64 from = ((u64)pfn) << PAGE_SHIFT;
15190 + u64 to = from + size;
15191 + u64 cursor = from;
15192 +
15193 + while (cursor < to) {
15194 + if (!devmem_is_allowed(pfn)) {
15195 + printk(KERN_INFO
15196 + "Program %s tried to access /dev/mem between %Lx->%Lx.\n",
15197 + current->comm, from, to);
15198 + return 0;
15199 + }
15200 + cursor += PAGE_SIZE;
15201 + pfn++;
15202 + }
15203 +#endif
15204 + return 1;
15205 +}
15206 +
15207 /*
15208 * This funcion reads the *physical* memory. The f_pos points directly to the
15209 * memory location.
15210 @@ -55,6 +76,9 @@ static ssize_t read_mem(struct file * fi
15211
15212 sz = min_t(unsigned long, sz, count);
15213
15214 + if (!range_is_allowed(p >> PAGE_SHIFT, count))
15215 + return -EPERM;
15216 +
15217 v = ioremap(p, sz);
15218 if (IS_ERR(v) || v == NULL) {
15219 /*
15220 @@ -103,6 +127,9 @@ static ssize_t write_mem(struct file * f
15221
15222 sz = min_t(unsigned long, sz, count);
15223
15224 + if (!range_is_allowed(p >> PAGE_SHIFT, sz))
15225 + return -EPERM;
15226 +
15227 v = ioremap(p, sz);
15228 if (v == NULL)
15229 break;
15230 @@ -131,6 +158,23 @@ static ssize_t write_mem(struct file * f
15231 }
15232
15233 #ifndef ARCH_HAS_DEV_MEM_MMAP_MEM
15234 +static void mmap_mem_open(struct vm_area_struct *vma)
15235 +{
15236 + map_devmem(vma->vm_pgoff, vma->vm_end - vma->vm_start,
15237 + vma->vm_page_prot);
15238 +}
15239 +
15240 +static void mmap_mem_close(struct vm_area_struct *vma)
15241 +{
15242 + unmap_devmem(vma->vm_pgoff, vma->vm_end - vma->vm_start,
15243 + vma->vm_page_prot);
15244 +}
15245 +
15246 +static struct vm_operations_struct mmap_mem_ops = {
15247 + .open = mmap_mem_open,
15248 + .close = mmap_mem_close
15249 +};
15250 +
15251 static int xen_mmap_mem(struct file * file, struct vm_area_struct * vma)
15252 {
15253 size_t size = vma->vm_end - vma->vm_start;
15254 @@ -138,6 +182,15 @@ static int xen_mmap_mem(struct file * fi
15255 if (uncached_access(file))
15256 vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
15257
15258 + if (!range_is_allowed(vma->vm_pgoff, size))
15259 + return -EPERM;
15260 +
15261 + if (!phys_mem_access_prot_allowed(file, vma->vm_pgoff, size,
15262 + &vma->vm_page_prot))
15263 + return -EINVAL;
15264 +
15265 + vma->vm_ops = &mmap_mem_ops;
15266 +
15267 /* We want to return the real error code, not EAGAIN. */
15268 return direct_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
15269 size, vma->vm_page_prot, DOMID_IO);
15270 --- sle11-2009-10-16.orig/drivers/xen/console/console.c 2008-12-15 11:26:44.000000000 +0100
15271 +++ sle11-2009-10-16/drivers/xen/console/console.c 2009-03-16 16:38:05.000000000 +0100
15272 @@ -552,16 +552,18 @@ static int xencons_write(
15273 return i;
15274 }
15275
15276 -static void xencons_put_char(struct tty_struct *tty, u_char ch)
15277 +static int xencons_put_char(struct tty_struct *tty, u_char ch)
15278 {
15279 unsigned long flags;
15280 + int ret;
15281
15282 if (DUMMY_TTY(tty))
15283 - return;
15284 + return 0;
15285
15286 spin_lock_irqsave(&xencons_lock, flags);
15287 - (void)__xencons_put_char(ch);
15288 + ret = __xencons_put_char(ch);
15289 spin_unlock_irqrestore(&xencons_lock, flags);
15290 + return ret;
15291 }
15292
15293 static void xencons_flush_chars(struct tty_struct *tty)
15294 @@ -583,7 +585,7 @@ static void xencons_wait_until_sent(stru
15295 if (DUMMY_TTY(tty))
15296 return;
15297
15298 - while (DRV(tty->driver)->chars_in_buffer(tty)) {
15299 + while (tty_chars_in_buffer(tty)) {
15300 set_current_state(TASK_INTERRUPTIBLE);
15301 schedule_timeout(1);
15302 if (signal_pending(current))
15303 @@ -632,8 +634,7 @@ static void xencons_close(struct tty_str
15304
15305 tty->closing = 1;
15306 tty_wait_until_sent(tty, 0);
15307 - if (DRV(tty->driver)->flush_buffer != NULL)
15308 - DRV(tty->driver)->flush_buffer(tty);
15309 + tty_driver_flush_buffer(tty);
15310 if (tty->ldisc.flush_buffer != NULL)
15311 tty->ldisc.flush_buffer(tty);
15312 tty->closing = 0;
15313 --- sle11-2009-10-16.orig/drivers/xen/core/machine_kexec.c 2009-02-17 11:46:41.000000000 +0100
15314 +++ sle11-2009-10-16/drivers/xen/core/machine_kexec.c 2009-03-16 16:38:05.000000000 +0100
15315 @@ -5,6 +5,7 @@
15316
15317 #include <linux/kexec.h>
15318 #include <xen/interface/kexec.h>
15319 +#include <linux/reboot.h>
15320 #include <linux/mm.h>
15321 #include <linux/bootmem.h>
15322
15323 @@ -90,6 +91,9 @@ void __init xen_machine_kexec_setup_reso
15324 xen_hypervisor_res.start = range.start;
15325 xen_hypervisor_res.end = range.start + range.size - 1;
15326 xen_hypervisor_res.flags = IORESOURCE_BUSY | IORESOURCE_MEM;
15327 +#ifdef CONFIG_X86_64
15328 + insert_resource(&iomem_resource, &xen_hypervisor_res);
15329 +#endif
15330
15331 /* fill in crashk_res if range is reserved by hypervisor */
15332
15333 @@ -102,6 +106,9 @@ void __init xen_machine_kexec_setup_reso
15334 if (range.size) {
15335 crashk_res.start = range.start;
15336 crashk_res.end = range.start + range.size - 1;
15337 +#ifdef CONFIG_X86_64
15338 + insert_resource(&iomem_resource, &crashk_res);
15339 +#endif
15340 }
15341
15342 /* get physical address of vmcoreinfo */
15343 @@ -153,11 +160,13 @@ void __init xen_machine_kexec_setup_reso
15344 return;
15345 }
15346
15347 +#ifndef CONFIG_X86_64
15348 void __init xen_machine_kexec_register_resources(struct resource *res)
15349 {
15350 request_resource(res, &xen_hypervisor_res);
15351 machine_kexec_register_resources(res);
15352 }
15353 +#endif
15354
15355 static void setup_load_arg(xen_kexec_image_t *xki, struct kimage *image)
15356 {
15357 @@ -228,6 +237,11 @@ void machine_shutdown(void)
15358 /* do nothing */
15359 }
15360
15361 +void machine_crash_shutdown(struct pt_regs *regs)
15362 +{
15363 + /* The kernel is broken so disable interrupts */
15364 + local_irq_disable();
15365 +}
15366
15367 /*
15368 * Local variables:
15369 --- sle11-2009-10-16.orig/drivers/xen/core/smpboot.c 2009-03-16 16:33:40.000000000 +0100
15370 +++ sle11-2009-10-16/drivers/xen/core/smpboot.c 2009-03-16 16:38:05.000000000 +0100
15371 @@ -53,17 +53,16 @@ static DEFINE_PER_CPU(int, callfunc_irq)
15372 static char resched_name[NR_CPUS][15];
15373 static char callfunc_name[NR_CPUS][15];
15374
15375 -u8 cpu_2_logical_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
15376 +#ifdef CONFIG_X86_LOCAL_APIC
15377 +#define set_cpu_to_apicid(cpu, apicid) (per_cpu(x86_cpu_to_apicid, cpu) = (apicid))
15378 +#else
15379 +#define set_cpu_to_apicid(cpu, apicid)
15380 +#endif
15381
15382 DEFINE_PER_CPU(cpumask_t, cpu_sibling_map);
15383 DEFINE_PER_CPU(cpumask_t, cpu_core_map);
15384 EXPORT_PER_CPU_SYMBOL(cpu_core_map);
15385
15386 -#if defined(__i386__)
15387 -DEFINE_PER_CPU(u8, x86_cpu_to_apicid) = BAD_APICID;
15388 -EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid);
15389 -#endif
15390 -
15391 void __init prefill_possible_map(void)
15392 {
15393 int i, rc;
15394 @@ -154,7 +153,7 @@ static int __cpuinit xen_smp_intr_init(u
15395 }
15396
15397 #ifdef CONFIG_HOTPLUG_CPU
15398 -static void xen_smp_intr_exit(unsigned int cpu)
15399 +static void __cpuexit xen_smp_intr_exit(unsigned int cpu)
15400 {
15401 if (cpu != 0)
15402 local_teardown_timer(cpu);
15403 @@ -263,8 +262,7 @@ void __init smp_prepare_cpus(unsigned in
15404 boot_cpu_data.apicid = apicid;
15405 cpu_data(0) = boot_cpu_data;
15406
15407 - cpu_2_logical_apicid[0] = apicid;
15408 - per_cpu(x86_cpu_to_apicid, 0) = apicid;
15409 + set_cpu_to_apicid(0, apicid);
15410
15411 current_thread_info()->cpu = 0;
15412
15413 @@ -319,8 +317,7 @@ void __init smp_prepare_cpus(unsigned in
15414 cpu_data(cpu).cpu_index = cpu;
15415 cpu_data(cpu).apicid = apicid;
15416
15417 - cpu_2_logical_apicid[cpu] = apicid;
15418 - per_cpu(x86_cpu_to_apicid, cpu) = apicid;
15419 + set_cpu_to_apicid(cpu, apicid);
15420
15421 #ifdef __x86_64__
15422 cpu_pda(cpu)->pcurrent = idle;
15423 @@ -375,7 +372,7 @@ static int __init initialize_cpu_present
15424 }
15425 core_initcall(initialize_cpu_present_map);
15426
15427 -int __cpu_disable(void)
15428 +int __cpuexit __cpu_disable(void)
15429 {
15430 cpumask_t map = cpu_online_map;
15431 unsigned int cpu = smp_processor_id();
15432 @@ -392,7 +389,7 @@ int __cpu_disable(void)
15433 return 0;
15434 }
15435
15436 -void __cpu_die(unsigned int cpu)
15437 +void __cpuexit __cpu_die(unsigned int cpu)
15438 {
15439 while (HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL)) {
15440 current->state = TASK_UNINTERRUPTIBLE;
15441 --- sle11-2009-10-16.orig/drivers/xen/core/xen_proc.c 2009-10-28 14:55:03.000000000 +0100
15442 +++ sle11-2009-10-16/drivers/xen/core/xen_proc.c 2009-03-16 16:38:05.000000000 +0100
15443 @@ -8,7 +8,7 @@ static struct proc_dir_entry *xen_base;
15444 struct proc_dir_entry *create_xen_proc_entry(const char *name, mode_t mode)
15445 {
15446 if ( xen_base == NULL )
15447 - if ( (xen_base = proc_mkdir("xen", &proc_root)) == NULL )
15448 + if ( (xen_base = proc_mkdir("xen", NULL)) == NULL )
15449 panic("Couldn't create /proc/xen");
15450 return create_proc_entry(name, mode, xen_base);
15451 }
15452 --- sle11-2009-10-16.orig/drivers/xen/fbfront/xenfb.c 2009-03-04 11:25:55.000000000 +0100
15453 +++ sle11-2009-10-16/drivers/xen/fbfront/xenfb.c 2009-03-16 16:38:05.000000000 +0100
15454 @@ -93,7 +93,7 @@ struct xenfb_info
15455 * only mappings. The former creates unfaulted pages. Preserves
15456 * invariant. The latter removes pages. Preserves invariant.
15457 *
15458 - * 3. Holding both locks: xenfb_vm_nopage(). Extends the dirty
15459 + * 3. Holding both locks: xenfb_vm_fault(). Extends the dirty
15460 * rectangle and updates mappings consistently. Preserves
15461 * invariant.
15462 *
15463 @@ -112,13 +112,13 @@ struct xenfb_info
15464 *
15465 * But FIXME: the invariant is too weak. It misses that the fault
15466 * record in mappings must be consistent with the mapping of pages in
15467 - * the associated address space! do_no_page() updates the PTE after
15468 - * xenfb_vm_nopage() returns, i.e. outside the critical region. This
15469 + * the associated address space! __do_fault() updates the PTE after
15470 + * xenfb_vm_fault() returns, i.e. outside the critical region. This
15471 * allows the following race:
15472 *
15473 * X writes to some address in the Xen frame buffer
15474 - * Fault - call do_no_page()
15475 - * call xenfb_vm_nopage()
15476 + * Fault - call __do_fault()
15477 + * call xenfb_vm_fault()
15478 * grab mm_lock
15479 * map->faults++;
15480 * release mm_lock
15481 @@ -387,18 +387,17 @@ static void xenfb_vm_close(struct vm_are
15482 mutex_unlock(&info->mm_lock);
15483 }
15484
15485 -static struct page *xenfb_vm_nopage(struct vm_area_struct *vma,
15486 - unsigned long vaddr, int *type)
15487 +static int xenfb_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
15488 {
15489 struct xenfb_mapping *map = vma->vm_private_data;
15490 struct xenfb_info *info = map->info;
15491 - int pgnr = (vaddr - vma->vm_start) >> PAGE_SHIFT;
15492 + int pgnr = ((long)vmf->virtual_address - vma->vm_start) >> PAGE_SHIFT;
15493 unsigned long flags;
15494 struct page *page;
15495 int y1, y2;
15496
15497 if (pgnr >= info->nr_pages)
15498 - return NOPAGE_SIGBUS;
15499 + return VM_FAULT_SIGBUS;
15500
15501 mutex_lock(&info->mm_lock);
15502 spin_lock_irqsave(&info->dirty_lock, flags);
15503 @@ -414,16 +413,15 @@ static struct page *xenfb_vm_nopage(stru
15504 spin_unlock_irqrestore(&info->dirty_lock, flags);
15505 mutex_unlock(&info->mm_lock);
15506
15507 - if (type)
15508 - *type = VM_FAULT_MINOR;
15509 + vmf->page = page;
15510
15511 - return page;
15512 + return VM_FAULT_MINOR;
15513 }
15514
15515 static struct vm_operations_struct xenfb_vm_ops = {
15516 .open = xenfb_vm_open,
15517 .close = xenfb_vm_close,
15518 - .nopage = xenfb_vm_nopage,
15519 + .fault = xenfb_vm_fault,
15520 };
15521
15522 static int xenfb_mmap(struct fb_info *fb_info, struct vm_area_struct *vma)
15523 --- sle11-2009-10-16.orig/drivers/xen/gntdev/gntdev.c 2009-03-16 16:33:40.000000000 +0100
15524 +++ sle11-2009-10-16/drivers/xen/gntdev/gntdev.c 2009-03-16 16:38:05.000000000 +0100
15525 @@ -392,7 +392,7 @@ nomem_out:
15526 static int __init gntdev_init(void)
15527 {
15528 struct class *class;
15529 - struct class_device *device;
15530 + struct device *device;
15531
15532 if (!is_running_on_xen()) {
15533 printk(KERN_ERR "You must be running Xen to use gntdev\n");
15534 @@ -417,8 +417,8 @@ static int __init gntdev_init(void)
15535 return 0;
15536 }
15537
15538 - device = class_device_create(class, NULL, MKDEV(gntdev_major, 0),
15539 - NULL, GNTDEV_NAME);
15540 + device = device_create(class, NULL, MKDEV(gntdev_major, 0),
15541 + GNTDEV_NAME);
15542 if (IS_ERR(device)) {
15543 printk(KERN_ERR "Error creating gntdev device in xen_class\n");
15544 printk(KERN_ERR "gntdev created with major number = %d\n",
15545 @@ -435,7 +435,7 @@ static void __exit gntdev_exit(void)
15546 {
15547 struct class *class;
15548 if ((class = get_xen_class()) != NULL)
15549 - class_device_destroy(class, MKDEV(gntdev_major, 0));
15550 + device_destroy(class, MKDEV(gntdev_major, 0));
15551 unregister_chrdev(gntdev_major, GNTDEV_NAME);
15552 }
15553
15554 --- sle11-2009-10-16.orig/drivers/xen/netfront/netfront.c 2009-03-30 16:39:44.000000000 +0200
15555 +++ sle11-2009-10-16/drivers/xen/netfront/netfront.c 2009-03-30 16:40:17.000000000 +0200
15556 @@ -1464,8 +1464,7 @@ err:
15557 }
15558 }
15559
15560 - while ((skb = __skb_dequeue(&errq)))
15561 - kfree_skb(skb);
15562 + __skb_queue_purge(&errq);
15563
15564 while ((skb = __skb_dequeue(&rxq)) != NULL) {
15565 struct page *page = NETFRONT_SKB_CB(skb)->page;
15566 @@ -1630,8 +1629,7 @@ static void netif_release_rx_bufs_flip(s
15567 }
15568 }
15569
15570 - while ((skb = __skb_dequeue(&free_list)) != NULL)
15571 - dev_kfree_skb(skb);
15572 + __skb_queue_purge(&free_list);
15573
15574 spin_unlock_bh(&np->rx_lock);
15575 }
15576 --- sle11-2009-10-16.orig/drivers/xen/privcmd/privcmd.c 2009-03-04 11:28:34.000000000 +0100
15577 +++ sle11-2009-10-16/drivers/xen/privcmd/privcmd.c 2009-03-16 16:38:05.000000000 +0100
15578 @@ -261,15 +261,13 @@ static long privcmd_ioctl(struct file *f
15579 }
15580
15581 #ifndef HAVE_ARCH_PRIVCMD_MMAP
15582 -static struct page *privcmd_nopage(struct vm_area_struct *vma,
15583 - unsigned long address,
15584 - int *type)
15585 +static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
15586 {
15587 - return NOPAGE_SIGBUS;
15588 + return VM_FAULT_SIGBUS;
15589 }
15590
15591 static struct vm_operations_struct privcmd_vm_ops = {
15592 - .nopage = privcmd_nopage
15593 + .fault = privcmd_fault
15594 };
15595
15596 static int privcmd_mmap(struct file * file, struct vm_area_struct * vma)
15597 --- sle11-2009-10-16.orig/drivers/xen/xenbus/xenbus_client.c 2009-03-24 10:12:22.000000000 +0100
15598 +++ sle11-2009-10-16/drivers/xen/xenbus/xenbus_client.c 2009-03-24 10:13:17.000000000 +0100
15599 @@ -442,7 +442,7 @@ int xenbus_map_ring_valloc(struct xenbus
15600
15601 *vaddr = NULL;
15602
15603 - area = alloc_vm_area(PAGE_SIZE);
15604 + area = xen_alloc_vm_area(PAGE_SIZE);
15605 if (!area)
15606 return -ENOMEM;
15607
15608 @@ -452,7 +452,7 @@ int xenbus_map_ring_valloc(struct xenbus
15609 BUG();
15610
15611 if (op.status != GNTST_okay) {
15612 - free_vm_area(area);
15613 + xen_free_vm_area(area);
15614 xenbus_dev_fatal(dev, op.status,
15615 "mapping in shared page %d from domain %d",
15616 gnt_ref, dev->otherend_id);
15617 @@ -551,7 +551,7 @@ int xenbus_unmap_ring_vfree(struct xenbu
15618 BUG();
15619
15620 if (op.status == GNTST_okay)
15621 - free_vm_area(area);
15622 + xen_free_vm_area(area);
15623 else
15624 xenbus_dev_error(dev, op.status,
15625 "unmapping page at handle %d error %d",
15626 --- sle11-2009-10-16.orig/drivers/xen/xenbus/xenbus_probe.c 2009-02-16 16:18:36.000000000 +0100
15627 +++ sle11-2009-10-16/drivers/xen/xenbus/xenbus_probe.c 2009-03-16 16:38:05.000000000 +0100
15628 @@ -173,7 +173,7 @@ static int read_backend_details(struct x
15629 return read_otherend_details(xendev, "backend-id", "backend");
15630 }
15631
15632 -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16) && (defined(CONFIG_XEN) || defined(MODULE))
15633 +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16)
15634 static int xenbus_uevent_frontend(struct device *dev, struct kobj_uevent_env *env)
15635 {
15636 struct xenbus_device *xdev;
15637 @@ -185,8 +185,10 @@ static int xenbus_uevent_frontend(struct
15638 return -ENODEV;
15639
15640 /* stuff we want to pass to /sbin/hotplug */
15641 +#if defined(CONFIG_XEN) || defined(MODULE)
15642 add_uevent_var(env, "XENBUS_TYPE=%s", xdev->devicetype);
15643 add_uevent_var(env, "XENBUS_PATH=%s", xdev->nodename);
15644 +#endif
15645 add_uevent_var(env, "MODALIAS=xen:%s", xdev->devicetype);
15646
15647 return 0;
15648 @@ -207,10 +209,8 @@ static struct xen_bus_type xenbus_fronte
15649 .probe = xenbus_dev_probe,
15650 .remove = xenbus_dev_remove,
15651 .shutdown = xenbus_dev_shutdown,
15652 -#if defined(CONFIG_XEN) || defined(MODULE)
15653 .uevent = xenbus_uevent_frontend,
15654 #endif
15655 -#endif
15656 },
15657 #if defined(CONFIG_XEN) || defined(MODULE)
15658 .dev = {
15659 @@ -519,6 +519,15 @@ static ssize_t xendev_show_devtype(struc
15660 }
15661 DEVICE_ATTR(devtype, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_devtype, NULL);
15662
15663 +static ssize_t xendev_show_modalias(struct device *dev,
15664 +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,13)
15665 + struct device_attribute *attr,
15666 +#endif
15667 + char *buf)
15668 +{
15669 + return sprintf(buf, "xen:%s\n", to_xenbus_device(dev)->devicetype);
15670 +}
15671 +DEVICE_ATTR(modalias, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_modalias, NULL);
15672
15673 int xenbus_probe_node(struct xen_bus_type *bus,
15674 const char *type,
15675 @@ -579,10 +588,16 @@ int xenbus_probe_node(struct xen_bus_typ
15676
15677 err = device_create_file(&xendev->dev, &dev_attr_devtype);
15678 if (err)
15679 - goto fail_remove_file;
15680 + goto fail_remove_nodename;
15681 +
15682 + err = device_create_file(&xendev->dev, &dev_attr_modalias);
15683 + if (err)
15684 + goto fail_remove_devtype;
15685
15686 return 0;
15687 -fail_remove_file:
15688 +fail_remove_devtype:
15689 + device_remove_file(&xendev->dev, &dev_attr_devtype);
15690 +fail_remove_nodename:
15691 device_remove_file(&xendev->dev, &dev_attr_nodename);
15692 fail_unregister:
15693 device_unregister(&xendev->dev);
15694 --- sle11-2009-10-16.orig/fs/aio.c 2009-03-24 10:11:37.000000000 +0100
15695 +++ sle11-2009-10-16/fs/aio.c 2009-03-24 10:13:25.000000000 +0100
15696 @@ -1271,6 +1271,7 @@ static void io_destroy(struct kioctx *io
15697 #ifdef CONFIG_EPOLL
15698 /* forget the poll file, but it's up to the user to close it */
15699 if (ioctx->file) {
15700 + fput(ioctx->file);
15701 ioctx->file->private_data = 0;
15702 ioctx->file = 0;
15703 }
15704 @@ -1295,6 +1296,7 @@ static int aio_queue_fd_close(struct ino
15705 spin_lock_irq(&ioctx->ctx_lock);
15706 ioctx->file = 0;
15707 spin_unlock_irq(&ioctx->ctx_lock);
15708 + fput(file);
15709 }
15710 return 0;
15711 }
15712 @@ -1330,16 +1332,17 @@ static const struct file_operations aioq
15713
15714 static int make_aio_fd(struct kioctx *ioctx)
15715 {
15716 - int error, fd;
15717 - struct inode *inode;
15718 + int fd;
15719 struct file *file;
15720
15721 - error = anon_inode_getfd(&fd, &inode, &file, "[aioq]",
15722 - &aioq_fops, ioctx);
15723 - if (error)
15724 - return error;
15725 + fd = anon_inode_getfd("[aioq]", &aioq_fops, ioctx);
15726 + if (fd < 0)
15727 + return fd;
15728
15729 /* associate the file with the IO context */
15730 + file = fget(fd);
15731 + if (!file)
15732 + return -EBADF;
15733 file->private_data = ioctx;
15734 ioctx->file = file;
15735 init_waitqueue_head(&ioctx->poll_wait);
15736 --- sle11-2009-10-16.orig/include/asm-x86/dma-mapping.h 2009-10-28 14:55:03.000000000 +0100
15737 +++ sle11-2009-10-16/include/asm-x86/dma-mapping.h 2009-03-16 16:38:05.000000000 +0100
15738 @@ -223,8 +223,13 @@ static inline dma_addr_t dma_map_page(st
15739 struct dma_mapping_ops *ops = get_dma_ops(dev);
15740
15741 BUG_ON(!valid_dma_direction(direction));
15742 +#ifndef CONFIG_XEN
15743 return ops->map_single(dev, page_to_phys(page) + offset,
15744 size, direction);
15745 +#else
15746 + return ops->map_single(dev, page_to_pseudophys(page) + offset,
15747 + size, direction);
15748 +#endif
15749 }
15750
15751 static inline void dma_unmap_page(struct device *dev, dma_addr_t addr,
15752 --- sle11-2009-10-16.orig/include/asm-x86/genapic_64.h 2009-10-28 14:55:03.000000000 +0100
15753 +++ sle11-2009-10-16/include/asm-x86/genapic_64.h 2009-03-16 16:38:05.000000000 +0100
15754 @@ -46,6 +46,7 @@ extern struct genapic apic_x2apic_phys;
15755 extern int acpi_madt_oem_check(char *, char *);
15756
15757 extern void apic_send_IPI_self(int vector);
15758 +#ifndef CONFIG_XEN
15759 enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC};
15760 extern enum uv_system_type get_uv_system_type(void);
15761 extern int is_uv_system(void);
15762 @@ -55,6 +56,10 @@ DECLARE_PER_CPU(int, x2apic_extra_bits);
15763 extern void uv_cpu_init(void);
15764 extern void uv_system_init(void);
15765 extern int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip);
15766 +#else
15767 +#define is_uv_system() 0
15768 +#define uv_cpu_init() ((void)0)
15769 +#endif
15770
15771 extern void setup_apic_routing(void);
15772
15773 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/desc.h 2009-03-16 16:33:40.000000000 +0100
15774 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/desc.h 2009-03-16 16:38:05.000000000 +0100
15775 @@ -64,8 +64,8 @@ static inline struct desc_struct *get_cp
15776 }
15777
15778 static inline void pack_gate(gate_desc *gate, unsigned char type,
15779 - unsigned long base, unsigned dpl, unsigned flags, unsigned short seg)
15780 -
15781 + unsigned long base, unsigned dpl, unsigned flags,
15782 + unsigned short seg)
15783 {
15784 gate->a = (seg << 16) | (base & 0xffff);
15785 gate->b = (base & 0xffff0000) |
15786 @@ -84,22 +84,23 @@ static inline int desc_empty(const void
15787 #define load_TR_desc() native_load_tr_desc()
15788 #define load_gdt(dtr) native_load_gdt(dtr)
15789 #define load_idt(dtr) native_load_idt(dtr)
15790 -#define load_tr(tr) __asm__ __volatile("ltr %0"::"m" (tr))
15791 -#define load_ldt(ldt) __asm__ __volatile("lldt %0"::"m" (ldt))
15792 +#define load_tr(tr) asm volatile("ltr %0"::"m" (tr))
15793 +#define load_ldt(ldt) asm volatile("lldt %0"::"m" (ldt))
15794
15795 #define store_gdt(dtr) native_store_gdt(dtr)
15796 #define store_idt(dtr) native_store_idt(dtr)
15797 #define store_tr(tr) (tr = native_store_tr())
15798 -#define store_ldt(ldt) __asm__ ("sldt %0":"=m" (ldt))
15799 +#define store_ldt(ldt) asm("sldt %0":"=m" (ldt))
15800
15801 #define load_TLS(t, cpu) native_load_tls(t, cpu)
15802 #define set_ldt native_set_ldt
15803
15804 -#define write_ldt_entry(dt, entry, desc) \
15805 - native_write_ldt_entry(dt, entry, desc)
15806 -#define write_gdt_entry(dt, entry, desc, type) \
15807 - native_write_gdt_entry(dt, entry, desc, type)
15808 -#define write_idt_entry(dt, entry, g) native_write_idt_entry(dt, entry, g)
15809 +#define write_ldt_entry(dt, entry, desc) \
15810 + native_write_ldt_entry(dt, entry, desc)
15811 +#define write_gdt_entry(dt, entry, desc, type) \
15812 + native_write_gdt_entry(dt, entry, desc, type)
15813 +#define write_idt_entry(dt, entry, g) \
15814 + native_write_idt_entry(dt, entry, g)
15815
15816 static inline void native_write_idt_entry(gate_desc *idt, int entry,
15817 const gate_desc *gate)
15818 @@ -138,8 +139,8 @@ static inline void pack_descriptor(struc
15819 {
15820 desc->a = ((base & 0xffff) << 16) | (limit & 0xffff);
15821 desc->b = (base & 0xff000000) | ((base & 0xff0000) >> 16) |
15822 - (limit & 0x000f0000) | ((type & 0xff) << 8) |
15823 - ((flags & 0xf) << 20);
15824 + (limit & 0x000f0000) | ((type & 0xff) << 8) |
15825 + ((flags & 0xf) << 20);
15826 desc->p = 1;
15827 }
15828
15829 @@ -160,7 +161,6 @@ static inline void set_tssldt_descriptor
15830 desc->base2 = (PTR_MIDDLE(addr) >> 8) & 0xFF;
15831 desc->base3 = PTR_HIGH(addr);
15832 #else
15833 -
15834 pack_descriptor((struct desc_struct *)d, addr, size, 0x80 | type, 0);
15835 #endif
15836 }
15837 @@ -178,7 +178,8 @@ static inline void __set_tss_desc(unsign
15838 * last valid byte
15839 */
15840 set_tssldt_descriptor(&tss, (unsigned long)addr, DESC_TSS,
15841 - IO_BITMAP_OFFSET + IO_BITMAP_BYTES + sizeof(unsigned long) - 1);
15842 + IO_BITMAP_OFFSET + IO_BITMAP_BYTES +
15843 + sizeof(unsigned long) - 1);
15844 write_gdt_entry(d, entry, &tss, DESC_TSS);
15845 }
15846
15847 @@ -187,16 +188,16 @@ static inline void __set_tss_desc(unsign
15848 static inline void native_set_ldt(const void *addr, unsigned int entries)
15849 {
15850 if (likely(entries == 0))
15851 - __asm__ __volatile__("lldt %w0"::"q" (0));
15852 + asm volatile("lldt %w0"::"q" (0));
15853 else {
15854 unsigned cpu = smp_processor_id();
15855 ldt_desc ldt;
15856
15857 - set_tssldt_descriptor(&ldt, (unsigned long)addr,
15858 - DESC_LDT, entries * sizeof(ldt) - 1);
15859 + set_tssldt_descriptor(&ldt, (unsigned long)addr, DESC_LDT,
15860 + entries * LDT_ENTRY_SIZE - 1);
15861 write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT,
15862 &ldt, DESC_LDT);
15863 - __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
15864 + asm volatile("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
15865 }
15866 }
15867
15868 @@ -261,15 +262,15 @@ static inline void xen_load_tls(struct t
15869 }
15870 #endif
15871
15872 -#define _LDT_empty(info) (\
15873 - (info)->base_addr == 0 && \
15874 - (info)->limit == 0 && \
15875 - (info)->contents == 0 && \
15876 - (info)->read_exec_only == 1 && \
15877 - (info)->seg_32bit == 0 && \
15878 - (info)->limit_in_pages == 0 && \
15879 - (info)->seg_not_present == 1 && \
15880 - (info)->useable == 0)
15881 +#define _LDT_empty(info) \
15882 + ((info)->base_addr == 0 && \
15883 + (info)->limit == 0 && \
15884 + (info)->contents == 0 && \
15885 + (info)->read_exec_only == 1 && \
15886 + (info)->seg_32bit == 0 && \
15887 + (info)->limit_in_pages == 0 && \
15888 + (info)->seg_not_present == 1 && \
15889 + (info)->useable == 0)
15890
15891 #ifdef CONFIG_X86_64
15892 #define LDT_empty(info) (_LDT_empty(info) && ((info)->lm == 0))
15893 @@ -309,7 +310,7 @@ static inline unsigned long get_desc_lim
15894
15895 #ifndef CONFIG_X86_NO_IDT
15896 static inline void _set_gate(int gate, unsigned type, void *addr,
15897 - unsigned dpl, unsigned ist, unsigned seg)
15898 + unsigned dpl, unsigned ist, unsigned seg)
15899 {
15900 gate_desc s;
15901 pack_gate(&s, type, (unsigned long)addr, dpl, ist, seg);
15902 @@ -393,10 +394,10 @@ static inline void set_system_gate_ist(i
15903 * Will read the base address of GDT_ENTRY_ESPFIX_SS and put it into %eax.
15904 */
15905 #define GET_DESC_BASE(idx, gdt, base, lo_w, lo_b, hi_b) \
15906 - movb idx*8+4(gdt), lo_b; \
15907 - movb idx*8+7(gdt), hi_b; \
15908 - shll $16, base; \
15909 - movw idx*8+2(gdt), lo_w;
15910 + movb idx * 8 + 4(gdt), lo_b; \
15911 + movb idx * 8 + 7(gdt), hi_b; \
15912 + shll $16, base; \
15913 + movw idx * 8 + 2(gdt), lo_w;
15914
15915
15916 #endif /* __ASSEMBLY__ */
15917 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/dma-mapping.h 2009-02-16 16:18:36.000000000 +0100
15918 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/dma-mapping.h 2009-03-16 16:38:05.000000000 +0100
15919 @@ -1,5 +1,17 @@
15920 -#ifdef CONFIG_X86_32
15921 -# include "dma-mapping_32.h"
15922 -#else
15923 -# include "dma-mapping_64.h"
15924 -#endif
15925 +#ifndef _ASM_DMA_MAPPING_H_
15926 +
15927 +#include "../../dma-mapping.h"
15928 +
15929 +static inline int
15930 +address_needs_mapping(struct device *hwdev, dma_addr_t addr)
15931 +{
15932 + dma_addr_t mask = 0xffffffff;
15933 + /* If the device has a mask, use it, otherwise default to 32 bits */
15934 + if (hwdev && hwdev->dma_mask)
15935 + mask = *hwdev->dma_mask;
15936 + return (addr & ~mask) != 0;
15937 +}
15938 +
15939 +extern int range_straddles_page_boundary(paddr_t p, size_t size);
15940 +
15941 +#endif /* _ASM_DMA_MAPPING_H_ */
15942 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/dma-mapping_32.h 2009-03-16 16:33:40.000000000 +0100
15943 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
15944 @@ -1,141 +0,0 @@
15945 -#ifndef _ASM_I386_DMA_MAPPING_H
15946 -#define _ASM_I386_DMA_MAPPING_H
15947 -
15948 -/*
15949 - * IOMMU interface. See Documentation/DMA-mapping.txt and DMA-API.txt for
15950 - * documentation.
15951 - */
15952 -
15953 -#include <linux/mm.h>
15954 -#include <linux/scatterlist.h>
15955 -#include <asm/cache.h>
15956 -#include <asm/io.h>
15957 -#include <asm/swiotlb.h>
15958 -
15959 -static inline int
15960 -address_needs_mapping(struct device *hwdev, dma_addr_t addr)
15961 -{
15962 - dma_addr_t mask = 0xffffffff;
15963 - /* If the device has a mask, use it, otherwise default to 32 bits */
15964 - if (hwdev && hwdev->dma_mask)
15965 - mask = *hwdev->dma_mask;
15966 - return (addr & ~mask) != 0;
15967 -}
15968 -
15969 -extern int range_straddles_page_boundary(paddr_t p, size_t size);
15970 -
15971 -#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
15972 -#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
15973 -
15974 -void *dma_alloc_coherent(struct device *dev, size_t size,
15975 - dma_addr_t *dma_handle, gfp_t flag);
15976 -
15977 -void dma_free_coherent(struct device *dev, size_t size,
15978 - void *vaddr, dma_addr_t dma_handle);
15979 -
15980 -extern dma_addr_t
15981 -dma_map_single(struct device *dev, void *ptr, size_t size,
15982 - enum dma_data_direction direction);
15983 -
15984 -extern void
15985 -dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
15986 - enum dma_data_direction direction);
15987 -
15988 -extern int dma_map_sg(struct device *hwdev, struct scatterlist *sg,
15989 - int nents, enum dma_data_direction direction);
15990 -extern void dma_unmap_sg(struct device *hwdev, struct scatterlist *sg,
15991 - int nents, enum dma_data_direction direction);
15992 -
15993 -#ifdef CONFIG_HIGHMEM
15994 -extern dma_addr_t
15995 -dma_map_page(struct device *dev, struct page *page, unsigned long offset,
15996 - size_t size, enum dma_data_direction direction);
15997 -
15998 -extern void
15999 -dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size,
16000 - enum dma_data_direction direction);
16001 -#else
16002 -#define dma_map_page(dev, page, offset, size, dir) \
16003 - dma_map_single(dev, page_address(page) + (offset), (size), (dir))
16004 -#define dma_unmap_page dma_unmap_single
16005 -#endif
16006 -
16007 -extern void
16008 -dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size,
16009 - enum dma_data_direction direction);
16010 -
16011 -extern void
16012 -dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size,
16013 - enum dma_data_direction direction);
16014 -
16015 -static inline void
16016 -dma_sync_single_range_for_cpu(struct device *dev, dma_addr_t dma_handle,
16017 - unsigned long offset, size_t size,
16018 - enum dma_data_direction direction)
16019 -{
16020 - dma_sync_single_for_cpu(dev, dma_handle+offset, size, direction);
16021 -}
16022 -
16023 -static inline void
16024 -dma_sync_single_range_for_device(struct device *dev, dma_addr_t dma_handle,
16025 - unsigned long offset, size_t size,
16026 - enum dma_data_direction direction)
16027 -{
16028 - dma_sync_single_for_device(dev, dma_handle+offset, size, direction);
16029 -}
16030 -
16031 -extern void
16032 -dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems,
16033 - enum dma_data_direction direction);
16034 -
16035 -extern void
16036 -dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems,
16037 - enum dma_data_direction direction);
16038 -
16039 -extern int
16040 -dma_mapping_error(dma_addr_t dma_addr);
16041 -
16042 -extern int
16043 -dma_supported(struct device *dev, u64 mask);
16044 -
16045 -static inline int
16046 -dma_set_mask(struct device *dev, u64 mask)
16047 -{
16048 - if(!dev->dma_mask || !dma_supported(dev, mask))
16049 - return -EIO;
16050 -
16051 - *dev->dma_mask = mask;
16052 -
16053 - return 0;
16054 -}
16055 -
16056 -static inline int
16057 -dma_get_cache_alignment(void)
16058 -{
16059 - /* no easy way to get cache size on all x86, so return the
16060 - * maximum possible, to be safe */
16061 - return (1 << INTERNODE_CACHE_SHIFT);
16062 -}
16063 -
16064 -#define dma_is_consistent(d, h) (1)
16065 -
16066 -static inline void
16067 -dma_cache_sync(struct device *dev, void *vaddr, size_t size,
16068 - enum dma_data_direction direction)
16069 -{
16070 - flush_write_buffers();
16071 -}
16072 -
16073 -#define ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY
16074 -extern int
16075 -dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
16076 - dma_addr_t device_addr, size_t size, int flags);
16077 -
16078 -extern void
16079 -dma_release_declared_memory(struct device *dev);
16080 -
16081 -extern void *
16082 -dma_mark_declared_memory_occupied(struct device *dev,
16083 - dma_addr_t device_addr, size_t size);
16084 -
16085 -#endif
16086 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/dma-mapping_64.h 2009-02-16 16:18:36.000000000 +0100
16087 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
16088 @@ -1,205 +0,0 @@
16089 -#ifndef _X8664_DMA_MAPPING_H
16090 -#define _X8664_DMA_MAPPING_H 1
16091 -
16092 -/*
16093 - * IOMMU interface. See Documentation/DMA-mapping.txt and DMA-API.txt for
16094 - * documentation.
16095 - */
16096 -
16097 -#include <linux/scatterlist.h>
16098 -#include <asm/io.h>
16099 -
16100 -struct dma_mapping_ops {
16101 - int (*mapping_error)(dma_addr_t dma_addr);
16102 - void* (*alloc_coherent)(struct device *dev, size_t size,
16103 - dma_addr_t *dma_handle, gfp_t gfp);
16104 - void (*free_coherent)(struct device *dev, size_t size,
16105 - void *vaddr, dma_addr_t dma_handle);
16106 - dma_addr_t (*map_single)(struct device *hwdev, void *ptr,
16107 - size_t size, int direction);
16108 - /* like map_single, but doesn't check the device mask */
16109 - dma_addr_t (*map_simple)(struct device *hwdev, char *ptr,
16110 - size_t size, int direction);
16111 - void (*unmap_single)(struct device *dev, dma_addr_t addr,
16112 - size_t size, int direction);
16113 - void (*sync_single_for_cpu)(struct device *hwdev,
16114 - dma_addr_t dma_handle, size_t size,
16115 - int direction);
16116 - void (*sync_single_for_device)(struct device *hwdev,
16117 - dma_addr_t dma_handle, size_t size,
16118 - int direction);
16119 - void (*sync_single_range_for_cpu)(struct device *hwdev,
16120 - dma_addr_t dma_handle, unsigned long offset,
16121 - size_t size, int direction);
16122 - void (*sync_single_range_for_device)(struct device *hwdev,
16123 - dma_addr_t dma_handle, unsigned long offset,
16124 - size_t size, int direction);
16125 - void (*sync_sg_for_cpu)(struct device *hwdev,
16126 - struct scatterlist *sg, int nelems,
16127 - int direction);
16128 - void (*sync_sg_for_device)(struct device *hwdev,
16129 - struct scatterlist *sg, int nelems,
16130 - int direction);
16131 - int (*map_sg)(struct device *hwdev, struct scatterlist *sg,
16132 - int nents, int direction);
16133 - void (*unmap_sg)(struct device *hwdev,
16134 - struct scatterlist *sg, int nents,
16135 - int direction);
16136 - int (*dma_supported)(struct device *hwdev, u64 mask);
16137 - int is_phys;
16138 -};
16139 -
16140 -extern dma_addr_t bad_dma_address;
16141 -extern const struct dma_mapping_ops* dma_ops;
16142 -extern int iommu_merge;
16143 -
16144 -#if 0
16145 -static inline int dma_mapping_error(dma_addr_t dma_addr)
16146 -{
16147 - if (dma_ops->mapping_error)
16148 - return dma_ops->mapping_error(dma_addr);
16149 -
16150 - return (dma_addr == bad_dma_address);
16151 -}
16152 -
16153 -#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
16154 -#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
16155 -
16156 -#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
16157 -#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
16158 -
16159 -extern void *dma_alloc_coherent(struct device *dev, size_t size,
16160 - dma_addr_t *dma_handle, gfp_t gfp);
16161 -extern void dma_free_coherent(struct device *dev, size_t size, void *vaddr,
16162 - dma_addr_t dma_handle);
16163 -
16164 -static inline dma_addr_t
16165 -dma_map_single(struct device *hwdev, void *ptr, size_t size,
16166 - int direction)
16167 -{
16168 - BUG_ON(!valid_dma_direction(direction));
16169 - return dma_ops->map_single(hwdev, ptr, size, direction);
16170 -}
16171 -
16172 -static inline void
16173 -dma_unmap_single(struct device *dev, dma_addr_t addr,size_t size,
16174 - int direction)
16175 -{
16176 - BUG_ON(!valid_dma_direction(direction));
16177 - dma_ops->unmap_single(dev, addr, size, direction);
16178 -}
16179 -
16180 -#define dma_map_page(dev,page,offset,size,dir) \
16181 - dma_map_single((dev), page_address(page)+(offset), (size), (dir))
16182 -
16183 -#define dma_unmap_page dma_unmap_single
16184 -
16185 -static inline void
16186 -dma_sync_single_for_cpu(struct device *hwdev, dma_addr_t dma_handle,
16187 - size_t size, int direction)
16188 -{
16189 - BUG_ON(!valid_dma_direction(direction));
16190 - if (dma_ops->sync_single_for_cpu)
16191 - dma_ops->sync_single_for_cpu(hwdev, dma_handle, size,
16192 - direction);
16193 - flush_write_buffers();
16194 -}
16195 -
16196 -static inline void
16197 -dma_sync_single_for_device(struct device *hwdev, dma_addr_t dma_handle,
16198 - size_t size, int direction)
16199 -{
16200 - BUG_ON(!valid_dma_direction(direction));
16201 - if (dma_ops->sync_single_for_device)
16202 - dma_ops->sync_single_for_device(hwdev, dma_handle, size,
16203 - direction);
16204 - flush_write_buffers();
16205 -}
16206 -
16207 -static inline void
16208 -dma_sync_single_range_for_cpu(struct device *hwdev, dma_addr_t dma_handle,
16209 - unsigned long offset, size_t size, int direction)
16210 -{
16211 - BUG_ON(!valid_dma_direction(direction));
16212 - if (dma_ops->sync_single_range_for_cpu) {
16213 - dma_ops->sync_single_range_for_cpu(hwdev, dma_handle, offset, size, direction);
16214 - }
16215 -
16216 - flush_write_buffers();
16217 -}
16218 -
16219 -static inline void
16220 -dma_sync_single_range_for_device(struct device *hwdev, dma_addr_t dma_handle,
16221 - unsigned long offset, size_t size, int direction)
16222 -{
16223 - BUG_ON(!valid_dma_direction(direction));
16224 - if (dma_ops->sync_single_range_for_device)
16225 - dma_ops->sync_single_range_for_device(hwdev, dma_handle,
16226 - offset, size, direction);
16227 -
16228 - flush_write_buffers();
16229 -}
16230 -
16231 -static inline void
16232 -dma_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg,
16233 - int nelems, int direction)
16234 -{
16235 - BUG_ON(!valid_dma_direction(direction));
16236 - if (dma_ops->sync_sg_for_cpu)
16237 - dma_ops->sync_sg_for_cpu(hwdev, sg, nelems, direction);
16238 - flush_write_buffers();
16239 -}
16240 -
16241 -static inline void
16242 -dma_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
16243 - int nelems, int direction)
16244 -{
16245 - BUG_ON(!valid_dma_direction(direction));
16246 - if (dma_ops->sync_sg_for_device) {
16247 - dma_ops->sync_sg_for_device(hwdev, sg, nelems, direction);
16248 - }
16249 -
16250 - flush_write_buffers();
16251 -}
16252 -
16253 -static inline int
16254 -dma_map_sg(struct device *hwdev, struct scatterlist *sg, int nents, int direction)
16255 -{
16256 - BUG_ON(!valid_dma_direction(direction));
16257 - return dma_ops->map_sg(hwdev, sg, nents, direction);
16258 -}
16259 -
16260 -static inline void
16261 -dma_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents,
16262 - int direction)
16263 -{
16264 - BUG_ON(!valid_dma_direction(direction));
16265 - dma_ops->unmap_sg(hwdev, sg, nents, direction);
16266 -}
16267 -
16268 -extern int dma_supported(struct device *hwdev, u64 mask);
16269 -
16270 -/* same for gart, swiotlb, and nommu */
16271 -static inline int dma_get_cache_alignment(void)
16272 -{
16273 - return boot_cpu_data.x86_clflush_size;
16274 -}
16275 -
16276 -#define dma_is_consistent(d, h) 1
16277 -
16278 -extern int dma_set_mask(struct device *dev, u64 mask);
16279 -
16280 -static inline void
16281 -dma_cache_sync(struct device *dev, void *vaddr, size_t size,
16282 - enum dma_data_direction dir)
16283 -{
16284 - flush_write_buffers();
16285 -}
16286 -
16287 -extern struct device fallback_dev;
16288 -extern int panic_on_overflow;
16289 -#endif
16290 -
16291 -#endif /* _X8664_DMA_MAPPING_H */
16292 -
16293 -#include "dma-mapping_32.h"
16294 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/fixmap.h 2009-02-16 16:18:36.000000000 +0100
16295 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/fixmap.h 2009-03-16 16:38:05.000000000 +0100
16296 @@ -1,5 +1,13 @@
16297 +#ifndef _ASM_FIXMAP_H
16298 +#define _ASM_FIXMAP_H
16299 +
16300 #ifdef CONFIG_X86_32
16301 # include "fixmap_32.h"
16302 #else
16303 # include "fixmap_64.h"
16304 #endif
16305 +
16306 +#define clear_fixmap(idx) \
16307 + __set_fixmap(idx, 0, __pgprot(0))
16308 +
16309 +#endif
16310 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/fixmap_32.h 2009-03-16 16:33:40.000000000 +0100
16311 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/fixmap_32.h 2009-03-16 16:38:05.000000000 +0100
16312 @@ -10,8 +10,8 @@
16313 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
16314 */
16315
16316 -#ifndef _ASM_FIXMAP_H
16317 -#define _ASM_FIXMAP_H
16318 +#ifndef _ASM_FIXMAP_32_H
16319 +#define _ASM_FIXMAP_32_H
16320
16321 /* used by vmalloc.c, vsyscall.lds.S.
16322 *
16323 @@ -102,8 +102,7 @@ enum fixed_addresses {
16324 */
16325 #define NR_FIX_BTMAPS 64
16326 #define FIX_BTMAPS_NESTING 4
16327 - FIX_BTMAP_END =
16328 - __end_of_permanent_fixed_addresses + 512 -
16329 + FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 512 -
16330 (__end_of_permanent_fixed_addresses & 511),
16331 FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_NESTING - 1,
16332 FIX_WP_TEST,
16333 @@ -114,19 +113,16 @@ enum fixed_addresses {
16334 };
16335
16336 extern void __set_fixmap(enum fixed_addresses idx,
16337 - maddr_t phys, pgprot_t flags);
16338 + maddr_t phys, pgprot_t flags);
16339 extern void reserve_top_address(unsigned long reserve);
16340
16341 -#define set_fixmap(idx, phys) \
16342 - __set_fixmap(idx, phys, PAGE_KERNEL)
16343 +#define set_fixmap(idx, phys) \
16344 + __set_fixmap(idx, phys, PAGE_KERNEL)
16345 /*
16346 * Some hardware wants to get fixmapped without caching.
16347 */
16348 -#define set_fixmap_nocache(idx, phys) \
16349 - __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
16350 -
16351 -#define clear_fixmap(idx) \
16352 - __set_fixmap(idx, 0, __pgprot(0))
16353 +#define set_fixmap_nocache(idx, phys) \
16354 + __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
16355
16356 #define FIXADDR_TOP ((unsigned long)__FIXADDR_TOP)
16357
16358 @@ -159,7 +155,7 @@ static __always_inline unsigned long fix
16359 if (idx >= __end_of_fixed_addresses)
16360 __this_fixmap_does_not_exist();
16361
16362 - return __fix_to_virt(idx);
16363 + return __fix_to_virt(idx);
16364 }
16365
16366 static inline unsigned long virt_to_fix(const unsigned long vaddr)
16367 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/fixmap_64.h 2009-03-16 16:33:40.000000000 +0100
16368 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/fixmap_64.h 2009-03-16 16:38:05.000000000 +0100
16369 @@ -8,8 +8,8 @@
16370 * Copyright (C) 1998 Ingo Molnar
16371 */
16372
16373 -#ifndef _ASM_FIXMAP_H
16374 -#define _ASM_FIXMAP_H
16375 +#ifndef _ASM_FIXMAP_64_H
16376 +#define _ASM_FIXMAP_64_H
16377
16378 #include <linux/kernel.h>
16379 #include <asm/apicdef.h>
16380 @@ -35,7 +35,8 @@
16381
16382 enum fixed_addresses {
16383 VSYSCALL_LAST_PAGE,
16384 - VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1,
16385 + VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE
16386 + + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1,
16387 VSYSCALL_HPET,
16388 FIX_DBGP_BASE,
16389 FIX_EARLYCON_MEM_BASE,
16390 @@ -45,11 +46,12 @@ enum fixed_addresses {
16391 #endif
16392 #ifndef CONFIG_XEN
16393 FIX_IO_APIC_BASE_0,
16394 - FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1,
16395 + FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1,
16396 #endif
16397 #ifdef CONFIG_EFI
16398 FIX_EFI_IO_MAP_LAST_PAGE,
16399 - FIX_EFI_IO_MAP_FIRST_PAGE = FIX_EFI_IO_MAP_LAST_PAGE+MAX_EFI_IO_PAGES-1,
16400 + FIX_EFI_IO_MAP_FIRST_PAGE = FIX_EFI_IO_MAP_LAST_PAGE
16401 + + MAX_EFI_IO_PAGES - 1,
16402 #endif
16403 #ifdef CONFIG_ACPI
16404 FIX_ACPI_BEGIN,
16405 @@ -79,19 +81,16 @@ enum fixed_addresses {
16406 __end_of_fixed_addresses
16407 };
16408
16409 -extern void __set_fixmap (enum fixed_addresses idx,
16410 - unsigned long phys, pgprot_t flags);
16411 +extern void __set_fixmap(enum fixed_addresses idx,
16412 + unsigned long phys, pgprot_t flags);
16413
16414 -#define set_fixmap(idx, phys) \
16415 - __set_fixmap(idx, phys, PAGE_KERNEL)
16416 +#define set_fixmap(idx, phys) \
16417 + __set_fixmap(idx, phys, PAGE_KERNEL)
16418 /*
16419 * Some hardware wants to get fixmapped without caching.
16420 */
16421 -#define set_fixmap_nocache(idx, phys) \
16422 - __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
16423 -
16424 -#define clear_fixmap(idx) \
16425 - __set_fixmap(idx, 0, __pgprot(0))
16426 +#define set_fixmap_nocache(idx, phys) \
16427 + __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
16428
16429 #define FIXADDR_TOP (VSYSCALL_END-PAGE_SIZE)
16430 #define FIXADDR_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
16431 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/highmem.h 2009-03-16 16:33:40.000000000 +0100
16432 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/highmem.h 2009-03-16 16:38:05.000000000 +0100
16433 @@ -8,7 +8,7 @@
16434 * Gerhard.Wichert@pdb.siemens.de
16435 *
16436 *
16437 - * Redesigned the x86 32-bit VM architecture to deal with
16438 + * Redesigned the x86 32-bit VM architecture to deal with
16439 * up to 16 Terabyte physical memory. With current x86 CPUs
16440 * we now support up to 64 Gigabytes physical RAM.
16441 *
16442 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/io.h 2009-02-16 16:18:36.000000000 +0100
16443 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/io.h 2009-03-16 16:38:05.000000000 +0100
16444 @@ -1,5 +1,22 @@
16445 +#ifndef _ASM_X86_IO_H
16446 +#define _ASM_X86_IO_H
16447 +
16448 +#define ARCH_HAS_IOREMAP_WC
16449 +
16450 #ifdef CONFIG_X86_32
16451 # include "io_32.h"
16452 #else
16453 # include "io_64.h"
16454 #endif
16455 +
16456 +extern void *xlate_dev_mem_ptr(unsigned long phys);
16457 +extern void unxlate_dev_mem_ptr(unsigned long phys, void *addr);
16458 +
16459 +extern void map_devmem(unsigned long pfn, unsigned long len, pgprot_t);
16460 +extern void unmap_devmem(unsigned long pfn, unsigned long len, pgprot_t);
16461 +
16462 +extern int ioremap_check_change_attr(unsigned long mfn, unsigned long size,
16463 + unsigned long prot_val);
16464 +extern void __iomem *ioremap_wc(unsigned long offset, unsigned long size);
16465 +
16466 +#endif /* _ASM_X86_IO_H */
16467 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/irqflags.h 2009-03-16 16:33:40.000000000 +0100
16468 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/irqflags.h 2009-03-16 16:38:05.000000000 +0100
16469 @@ -137,11 +137,11 @@ sysexit_ecrit: /**** END OF SYSEXIT CRIT
16470 #endif /* __ASSEMBLY__ */
16471
16472 #ifndef __ASSEMBLY__
16473 -#define raw_local_save_flags(flags) \
16474 - do { (flags) = __raw_local_save_flags(); } while (0)
16475 +#define raw_local_save_flags(flags) \
16476 + do { (flags) = __raw_local_save_flags(); } while (0)
16477
16478 -#define raw_local_irq_save(flags) \
16479 - do { (flags) = __raw_local_irq_save(); } while (0)
16480 +#define raw_local_irq_save(flags) \
16481 + do { (flags) = __raw_local_irq_save(); } while (0)
16482
16483 static inline int raw_irqs_disabled_flags(unsigned long flags)
16484 {
16485 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/mmu_context_32.h 2009-03-16 16:33:40.000000000 +0100
16486 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/mmu_context_32.h 2009-03-16 16:38:05.000000000 +0100
16487 @@ -94,7 +94,7 @@ static inline void switch_mm(struct mm_s
16488 BUG_ON(per_cpu(cpu_tlbstate, cpu).active_mm != next);
16489
16490 if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
16491 - /* We were in lazy tlb mode and leave_mm disabled
16492 + /* We were in lazy tlb mode and leave_mm disabled
16493 * tlb flush IPI delivery. We must reload %cr3.
16494 */
16495 load_cr3(next->pgd);
16496 @@ -107,10 +107,10 @@ static inline void switch_mm(struct mm_s
16497 #define deactivate_mm(tsk, mm) \
16498 asm("movl %0,%%gs": :"r" (0));
16499
16500 -#define activate_mm(prev, next) \
16501 - do { \
16502 - xen_activate_mm(prev, next); \
16503 - switch_mm((prev),(next),NULL); \
16504 - } while(0)
16505 +#define activate_mm(prev, next) \
16506 +do { \
16507 + xen_activate_mm(prev, next); \
16508 + switch_mm((prev), (next), NULL); \
16509 +} while (0)
16510
16511 #endif
16512 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/mmu_context_64.h 2009-03-16 16:33:40.000000000 +0100
16513 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/mmu_context_64.h 2009-03-16 16:38:05.000000000 +0100
16514 @@ -21,7 +21,7 @@ void destroy_context(struct mm_struct *m
16515 static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
16516 {
16517 #if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
16518 - if (read_pda(mmu_state) == TLBSTATE_OK)
16519 + if (read_pda(mmu_state) == TLBSTATE_OK)
16520 write_pda(mmu_state, TLBSTATE_LAZY);
16521 #endif
16522 }
16523 @@ -62,7 +62,7 @@ extern void mm_pin(struct mm_struct *mm)
16524 extern void mm_unpin(struct mm_struct *mm);
16525 void mm_pin_all(void);
16526
16527 -static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
16528 +static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
16529 struct task_struct *tsk)
16530 {
16531 unsigned cpu = smp_processor_id();
16532 @@ -106,7 +106,7 @@ static inline void switch_mm(struct mm_s
16533 if (read_pda(active_mm) != next)
16534 BUG();
16535 if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
16536 - /* We were in lazy tlb mode and leave_mm disabled
16537 + /* We were in lazy tlb mode and leave_mm disabled
16538 * tlb flush IPI delivery. We must reload CR3
16539 * to make sure to use no freed page tables.
16540 */
16541 @@ -118,10 +118,11 @@ static inline void switch_mm(struct mm_s
16542 #endif
16543 }
16544
16545 -#define deactivate_mm(tsk,mm) do { \
16546 - load_gs_index(0); \
16547 - asm volatile("movl %0,%%fs"::"r"(0)); \
16548 -} while(0)
16549 +#define deactivate_mm(tsk, mm) \
16550 +do { \
16551 + load_gs_index(0); \
16552 + asm volatile("movl %0,%%fs"::"r"(0)); \
16553 +} while (0)
16554
16555 static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next)
16556 {
16557 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/page.h 2009-03-16 16:33:40.000000000 +0100
16558 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/page.h 2009-03-16 16:38:05.000000000 +0100
16559 @@ -20,8 +20,16 @@
16560 #define _PAGE_BIT_IO 9
16561 #define _PAGE_IO (_AC(1, L)<<_PAGE_BIT_IO)
16562
16563 -#define PHYSICAL_PAGE_MASK (~(_AT(phys_addr_t, PAGE_SIZE) - 1) & __PHYSICAL_MASK)
16564 -#define PTE_MASK _AT(pteval_t, PHYSICAL_PAGE_MASK)
16565 +#define __PHYSICAL_MASK ((phys_addr_t)(1ULL << __PHYSICAL_MASK_SHIFT) - 1)
16566 +#define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1)
16567 +
16568 +/* Cast PAGE_MASK to a signed type so that it is sign-extended if
16569 + virtual addresses are 32-bits but physical addresses are larger
16570 + (ie, 32-bit PAE). */
16571 +#define PHYSICAL_PAGE_MASK (((signed long)PAGE_MASK) & __PHYSICAL_MASK)
16572 +
16573 +/* PTE_MASK extracts the PFN from a (pte|pmd|pud|pgd)val_t */
16574 +#define PTE_MASK ((pteval_t)PHYSICAL_PAGE_MASK)
16575
16576 #define PMD_PAGE_SIZE (_AC(1, UL) << PMD_SHIFT)
16577 #define PMD_PAGE_MASK (~(PMD_PAGE_SIZE-1))
16578 @@ -34,19 +42,14 @@
16579 /* to align the pointer to the (next) page boundary */
16580 #define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK)
16581
16582 -#define __PHYSICAL_MASK _AT(phys_addr_t, (_AC(1,ULL) << __PHYSICAL_MASK_SHIFT) - 1)
16583 -#define __VIRTUAL_MASK ((_AC(1,UL) << __VIRTUAL_MASK_SHIFT) - 1)
16584 -
16585 #ifndef __ASSEMBLY__
16586 #include <linux/types.h>
16587 #endif
16588
16589 #ifdef CONFIG_X86_64
16590 #include <asm/page_64.h>
16591 -#define max_pfn_mapped end_pfn_map
16592 #else
16593 #include <asm/page_32.h>
16594 -#define max_pfn_mapped max_low_pfn
16595 #endif /* CONFIG_X86_64 */
16596
16597 #define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET)
16598 @@ -59,6 +62,9 @@
16599 #ifndef __ASSEMBLY__
16600
16601 extern int page_is_ram(unsigned long pagenr);
16602 +extern int devmem_is_allowed(unsigned long pagenr);
16603 +
16604 +extern unsigned long max_pfn_mapped;
16605
16606 struct page;
16607
16608 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/page_64.h 2009-03-16 16:33:40.000000000 +0100
16609 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/page_64.h 2009-03-16 16:38:05.000000000 +0100
16610 @@ -5,7 +5,7 @@
16611
16612 #define THREAD_ORDER 1
16613 #define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER)
16614 -#define CURRENT_MASK (~(THREAD_SIZE-1))
16615 +#define CURRENT_MASK (~(THREAD_SIZE - 1))
16616
16617 #define EXCEPTION_STACK_ORDER 0
16618 #define EXCEPTION_STKSZ (PAGE_SIZE << EXCEPTION_STACK_ORDER)
16619 @@ -53,10 +53,10 @@
16620 #define __VIRTUAL_MASK_SHIFT 48
16621
16622 /*
16623 - * Kernel image size is limited to 128 MB (see level2_kernel_pgt in
16624 + * Kernel image size is limited to 512 MB (see level2_kernel_pgt in
16625 * arch/x86/kernel/head_64.S), and it is mapped here:
16626 */
16627 -#define KERNEL_IMAGE_SIZE (128*1024*1024)
16628 +#define KERNEL_IMAGE_SIZE (512 * 1024 * 1024)
16629 #define KERNEL_IMAGE_START _AC(0xffffffff80000000, UL)
16630
16631 #ifndef __ASSEMBLY__
16632 @@ -64,7 +64,6 @@ void clear_page(void *page);
16633 void copy_page(void *to, void *from);
16634
16635 extern unsigned long end_pfn;
16636 -extern unsigned long end_pfn_map;
16637
16638 static inline unsigned long __phys_addr(unsigned long x)
16639 {
16640 @@ -89,6 +88,9 @@ typedef union { pteval_t pte; unsigned i
16641
16642 #define vmemmap ((struct page *)VMEMMAP_START)
16643
16644 +extern unsigned long init_memory_mapping(unsigned long start,
16645 + unsigned long end);
16646 +
16647 #endif /* !__ASSEMBLY__ */
16648
16649 #ifdef CONFIG_FLATMEM
16650 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/pci.h 2009-03-16 16:33:40.000000000 +0100
16651 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/pci.h 2009-03-16 16:38:05.000000000 +0100
16652 @@ -8,14 +8,13 @@
16653 #include <asm/scatterlist.h>
16654 #include <asm/io.h>
16655
16656 -
16657 #ifdef __KERNEL__
16658
16659 struct pci_sysdata {
16660 int domain; /* PCI domain */
16661 int node; /* NUMA node */
16662 #ifdef CONFIG_X86_64
16663 - void* iommu; /* IOMMU private data */
16664 + void *iommu; /* IOMMU private data */
16665 #endif
16666 #ifdef CONFIG_XEN_PCIDEV_FRONTEND
16667 struct pcifront_device *pdev;
16668 @@ -23,6 +22,8 @@ struct pci_sysdata {
16669 };
16670
16671 /* scan a bus after allocating a pci_sysdata for it */
16672 +extern struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops,
16673 + int node);
16674 extern struct pci_bus *pci_scan_bus_with_sysdata(int busno);
16675
16676 static inline int pci_domain_nr(struct pci_bus *bus)
16677 @@ -36,6 +37,7 @@ static inline int pci_proc_domain(struct
16678 return pci_domain_nr(bus);
16679 }
16680
16681 +extern void pci_iommu_alloc(void);
16682
16683 /* Can be used to override the logic in pci_scan_bus for skipping
16684 already-configured bus numbers - to be used for buggy BIOSes
16685 @@ -57,7 +59,7 @@ extern unsigned long pci_mem_start;
16686 #define PCIBIOS_MIN_CARDBUS_IO 0x4000
16687
16688 void pcibios_config_init(void);
16689 -struct pci_bus * pcibios_scan_root(int bus);
16690 +struct pci_bus *pcibios_scan_root(int bus);
16691
16692 void pcibios_set_master(struct pci_dev *dev);
16693 void pcibios_penalize_isa_irq(int irq, int active);
16694 @@ -67,7 +69,8 @@ int pcibios_set_irq_routing(struct pci_d
16695
16696 #define HAVE_PCI_MMAP
16697 extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
16698 - enum pci_mmap_state mmap_state, int write_combine);
16699 + enum pci_mmap_state mmap_state,
16700 + int write_combine);
16701
16702
16703 #ifdef CONFIG_PCI
16704 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/pgalloc.h 2009-02-16 16:18:36.000000000 +0100
16705 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/pgalloc.h 2009-03-16 16:38:05.000000000 +0100
16706 @@ -1,5 +1,149 @@
16707 -#ifdef CONFIG_X86_32
16708 -# include "pgalloc_32.h"
16709 -#else
16710 -# include "pgalloc_64.h"
16711 +#ifndef _ASM_X86_PGALLOC_H
16712 +#define _ASM_X86_PGALLOC_H
16713 +
16714 +#include <linux/threads.h>
16715 +#include <linux/mm.h> /* for struct page */
16716 +#include <linux/pagemap.h>
16717 +
16718 +#include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */
16719 +
16720 +static inline void paravirt_alloc_pte(struct mm_struct *mm, unsigned long pfn) {}
16721 +static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn) {}
16722 +static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long clonepfn,
16723 + unsigned long start, unsigned long count) {}
16724 +static inline void paravirt_alloc_pud(struct mm_struct *mm, unsigned long pfn) {}
16725 +static inline void paravirt_release_pte(unsigned long pfn) {}
16726 +static inline void paravirt_release_pmd(unsigned long pfn) {}
16727 +static inline void paravirt_release_pud(unsigned long pfn) {}
16728 +
16729 +#ifdef CONFIG_X86_64
16730 +void early_make_page_readonly(void *va, unsigned int feature);
16731 +pmd_t *early_get_pmd(unsigned long va);
16732 +#define make_lowmem_page_readonly make_page_readonly
16733 +#define make_lowmem_page_writable make_page_writable
16734 #endif
16735 +
16736 +/*
16737 + * Allocate and free page tables.
16738 + */
16739 +extern pgd_t *pgd_alloc(struct mm_struct *);
16740 +extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
16741 +
16742 +extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long);
16743 +extern pgtable_t pte_alloc_one(struct mm_struct *, unsigned long);
16744 +
16745 +/* Should really implement gc for free page table pages. This could be
16746 + done with a reference count in struct page. */
16747 +
16748 +static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
16749 +{
16750 + BUG_ON((unsigned long)pte & (PAGE_SIZE-1));
16751 + make_lowmem_page_writable(pte, XENFEAT_writable_page_tables);
16752 + free_page((unsigned long)pte);
16753 +}
16754 +
16755 +extern void __pte_free(pgtable_t);
16756 +static inline void pte_free(struct mm_struct *mm, struct page *pte)
16757 +{
16758 + __pte_free(pte);
16759 +}
16760 +
16761 +extern void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte);
16762 +
16763 +static inline void pmd_populate_kernel(struct mm_struct *mm,
16764 + pmd_t *pmd, pte_t *pte)
16765 +{
16766 + paravirt_alloc_pte(mm, __pa(pte) >> PAGE_SHIFT);
16767 + set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE));
16768 +}
16769 +
16770 +static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
16771 + struct page *pte)
16772 +{
16773 + unsigned long pfn = page_to_pfn(pte);
16774 +
16775 + paravirt_alloc_pte(mm, pfn);
16776 + if (PagePinned(virt_to_page(mm->pgd))) {
16777 + if (!PageHighMem(pte))
16778 + BUG_ON(HYPERVISOR_update_va_mapping(
16779 + (unsigned long)__va(pfn << PAGE_SHIFT),
16780 + pfn_pte(pfn, PAGE_KERNEL_RO), 0));
16781 +#ifndef CONFIG_X86_64
16782 + else if (!TestSetPagePinned(pte))
16783 + kmap_flush_unused();
16784 +#endif
16785 + set_pmd(pmd, __pmd(((pmdval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE));
16786 + } else
16787 + *pmd = __pmd(((pmdval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE);
16788 +}
16789 +
16790 +#define pmd_pgtable(pmd) pmd_page(pmd)
16791 +
16792 +#if PAGETABLE_LEVELS > 2
16793 +extern pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr);
16794 +extern void __pmd_free(pgtable_t);
16795 +
16796 +static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
16797 +{
16798 + BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
16799 + __pmd_free(virt_to_page(pmd));
16800 +}
16801 +
16802 +extern void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd);
16803 +
16804 +#ifdef CONFIG_X86_PAE
16805 +extern void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd);
16806 +#else /* !CONFIG_X86_PAE */
16807 +static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
16808 +{
16809 + paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
16810 + if (unlikely(PagePinned(virt_to_page((mm)->pgd)))) {
16811 + BUG_ON(HYPERVISOR_update_va_mapping(
16812 + (unsigned long)pmd,
16813 + pfn_pte(virt_to_phys(pmd)>>PAGE_SHIFT,
16814 + PAGE_KERNEL_RO), 0));
16815 + set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd)));
16816 + } else
16817 + *pud = __pud(_PAGE_TABLE | __pa(pmd));
16818 +}
16819 +#endif /* CONFIG_X86_PAE */
16820 +
16821 +#if PAGETABLE_LEVELS > 3
16822 +#define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD)
16823 +
16824 +/*
16825 + * We need to use the batch mode here, but pgd_pupulate() won't be
16826 + * be called frequently.
16827 + */
16828 +static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
16829 +{
16830 + paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT);
16831 + if (unlikely(PagePinned(virt_to_page((mm)->pgd)))) {
16832 + BUG_ON(HYPERVISOR_update_va_mapping(
16833 + (unsigned long)pud,
16834 + pfn_pte(virt_to_phys(pud)>>PAGE_SHIFT,
16835 + PAGE_KERNEL_RO), 0));
16836 + set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud)));
16837 + set_pgd(__user_pgd(pgd), __pgd(_PAGE_TABLE | __pa(pud)));
16838 + } else {
16839 + *(pgd) = __pgd(_PAGE_TABLE | __pa(pud));
16840 + *__user_pgd(pgd) = *(pgd);
16841 + }
16842 +}
16843 +
16844 +static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
16845 +{
16846 + return (pud_t *)pmd_alloc_one(mm, addr);
16847 +}
16848 +
16849 +static inline void pud_free(struct mm_struct *mm, pud_t *pud)
16850 +{
16851 + BUG_ON((unsigned long)pud & (PAGE_SIZE-1));
16852 + __pmd_free(virt_to_page(pud));
16853 +}
16854 +
16855 +extern void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud);
16856 +#endif /* PAGETABLE_LEVELS > 3 */
16857 +#endif /* PAGETABLE_LEVELS > 2 */
16858 +
16859 +#endif /* _ASM_X86_PGALLOC_H */
16860 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/pgalloc_32.h 2009-03-16 16:33:40.000000000 +0100
16861 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
16862 @@ -1,111 +0,0 @@
16863 -#ifndef _I386_PGALLOC_H
16864 -#define _I386_PGALLOC_H
16865 -
16866 -#include <linux/threads.h>
16867 -#include <linux/mm.h> /* for struct page */
16868 -#include <linux/pagemap.h>
16869 -#include <asm/tlb.h>
16870 -#include <asm-generic/tlb.h>
16871 -#include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */
16872 -
16873 -#define paravirt_alloc_pt(mm, pfn) do { } while (0)
16874 -#define paravirt_alloc_pd(mm, pfn) do { } while (0)
16875 -#define paravirt_alloc_pd_clone(pfn, clonepfn, start, count) do { } while (0)
16876 -#define paravirt_release_pt(pfn) do { } while (0)
16877 -#define paravirt_release_pd(pfn) do { } while (0)
16878 -
16879 -static inline void pmd_populate_kernel(struct mm_struct *mm,
16880 - pmd_t *pmd, pte_t *pte)
16881 -{
16882 - paravirt_alloc_pt(mm, __pa(pte) >> PAGE_SHIFT);
16883 - set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE));
16884 -}
16885 -
16886 -static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte)
16887 -{
16888 - unsigned long pfn = page_to_pfn(pte);
16889 -
16890 - paravirt_alloc_pt(mm, pfn);
16891 - if (PagePinned(virt_to_page(mm->pgd))) {
16892 - if (!PageHighMem(pte))
16893 - BUG_ON(HYPERVISOR_update_va_mapping(
16894 - (unsigned long)__va(pfn << PAGE_SHIFT),
16895 - pfn_pte(pfn, PAGE_KERNEL_RO), 0));
16896 - else if (!test_and_set_bit(PG_pinned, &pte->flags))
16897 - kmap_flush_unused();
16898 - set_pmd(pmd, __pmd(((pmdval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE));
16899 - } else
16900 - *pmd = __pmd(((pmdval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE);
16901 -}
16902 -#define pmd_pgtable(pmd) pmd_page(pmd)
16903 -
16904 -/*
16905 - * Allocate and free page tables.
16906 - */
16907 -extern void pgd_test_and_unpin(pgd_t *);
16908 -extern pgd_t *pgd_alloc(struct mm_struct *);
16909 -extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
16910 -
16911 -extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long);
16912 -extern pgtable_t pte_alloc_one(struct mm_struct *, unsigned long);
16913 -
16914 -static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
16915 -{
16916 - make_lowmem_page_writable(pte, XENFEAT_writable_page_tables);
16917 - free_page((unsigned long)pte);
16918 -}
16919 -
16920 -extern void __pte_free(pgtable_t);
16921 -static inline void pte_free(struct mm_struct *mm, pgtable_t pte)
16922 -{
16923 - __pte_free(pte);
16924 -}
16925 -
16926 -
16927 -extern void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte);
16928 -
16929 -#ifdef CONFIG_X86_PAE
16930 -/*
16931 - * In the PAE case we free the pmds as part of the pgd.
16932 - */
16933 -extern pmd_t *pmd_alloc_one(struct mm_struct *, unsigned long);
16934 -
16935 -extern void __pmd_free(pgtable_t);
16936 -static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
16937 -{
16938 - BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
16939 - __pmd_free(virt_to_page(pmd));
16940 -}
16941 -
16942 -extern void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd);
16943 -
16944 -static inline void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
16945 -{
16946 - struct page *page = virt_to_page(pmd);
16947 - unsigned long pfn = page_to_pfn(page);
16948 -
16949 - paravirt_alloc_pd(mm, pfn);
16950 -
16951 - /* Note: almost everything apart from _PAGE_PRESENT is
16952 - reserved at the pmd (PDPT) level. */
16953 - if (PagePinned(virt_to_page(mm->pgd))) {
16954 - BUG_ON(PageHighMem(page));
16955 - BUG_ON(HYPERVISOR_update_va_mapping(
16956 - (unsigned long)__va(pfn << PAGE_SHIFT),
16957 - pfn_pte(pfn, PAGE_KERNEL_RO), 0));
16958 - set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
16959 - } else
16960 - *pudp = __pud(__pa(pmd) | _PAGE_PRESENT);
16961 -
16962 - /*
16963 - * According to Intel App note "TLBs, Paging-Structure Caches,
16964 - * and Their Invalidation", April 2007, document 317080-001,
16965 - * section 8.1: in PAE mode we explicitly have to flush the
16966 - * TLB via cr3 if the top-level pgd is changed...
16967 - */
16968 - if (mm == current->active_mm)
16969 - xen_tlb_flush();
16970 -}
16971 -#endif /* CONFIG_X86_PAE */
16972 -
16973 -#endif /* _I386_PGALLOC_H */
16974 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/pgalloc_64.h 2009-03-16 16:33:40.000000000 +0100
16975 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
16976 @@ -1,179 +0,0 @@
16977 -#ifndef _X86_64_PGALLOC_H
16978 -#define _X86_64_PGALLOC_H
16979 -
16980 -#include <asm/pda.h>
16981 -#include <linux/threads.h>
16982 -#include <linux/mm.h>
16983 -#include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */
16984 -
16985 -pmd_t *early_get_pmd(unsigned long va);
16986 -void early_make_page_readonly(void *va, unsigned int feature);
16987 -
16988 -#define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD)
16989 -
16990 -#define pmd_populate_kernel(mm, pmd, pte) \
16991 - set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte)))
16992 -
16993 -static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
16994 -{
16995 - if (unlikely(PagePinned(virt_to_page((mm)->pgd)))) {
16996 - BUG_ON(HYPERVISOR_update_va_mapping(
16997 - (unsigned long)pmd,
16998 - pfn_pte(virt_to_phys(pmd)>>PAGE_SHIFT,
16999 - PAGE_KERNEL_RO), 0));
17000 - set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd)));
17001 - } else {
17002 - *(pud) = __pud(_PAGE_TABLE | __pa(pmd));
17003 - }
17004 -}
17005 -
17006 -/*
17007 - * We need to use the batch mode here, but pgd_pupulate() won't be
17008 - * be called frequently.
17009 - */
17010 -static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
17011 -{
17012 - if (unlikely(PagePinned(virt_to_page((mm)->pgd)))) {
17013 - BUG_ON(HYPERVISOR_update_va_mapping(
17014 - (unsigned long)pud,
17015 - pfn_pte(virt_to_phys(pud)>>PAGE_SHIFT,
17016 - PAGE_KERNEL_RO), 0));
17017 - set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud)));
17018 - set_pgd(__user_pgd(pgd), __pgd(_PAGE_TABLE | __pa(pud)));
17019 - } else {
17020 - *(pgd) = __pgd(_PAGE_TABLE | __pa(pud));
17021 - *(__user_pgd(pgd)) = *(pgd);
17022 - }
17023 -}
17024 -
17025 -#define pmd_pgtable(pmd) pmd_page(pmd)
17026 -
17027 -static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte)
17028 -{
17029 - if (unlikely(PagePinned(virt_to_page((mm)->pgd)))) {
17030 - BUG_ON(HYPERVISOR_update_va_mapping(
17031 - (unsigned long)__va(page_to_pfn(pte) << PAGE_SHIFT),
17032 - pfn_pte(page_to_pfn(pte), PAGE_KERNEL_RO), 0));
17033 - set_pmd(pmd, __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT)));
17034 - } else {
17035 - *(pmd) = __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT));
17036 - }
17037 -}
17038 -
17039 -extern void __pmd_free(pgtable_t);
17040 -static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
17041 -{
17042 - BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
17043 - __pmd_free(virt_to_page(pmd));
17044 -}
17045 -
17046 -extern pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr);
17047 -
17048 -static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
17049 -{
17050 - return (pud_t *)pmd_alloc_one(mm, addr);
17051 -}
17052 -
17053 -static inline void pud_free(struct mm_struct *mm, pud_t *pud)
17054 -{
17055 - BUG_ON((unsigned long)pud & (PAGE_SIZE-1));
17056 - __pmd_free(virt_to_page(pud));
17057 -}
17058 -
17059 -static inline void pgd_list_add(pgd_t *pgd)
17060 -{
17061 - struct page *page = virt_to_page(pgd);
17062 - unsigned long flags;
17063 -
17064 - spin_lock_irqsave(&pgd_lock, flags);
17065 - list_add(&page->lru, &pgd_list);
17066 - spin_unlock_irqrestore(&pgd_lock, flags);
17067 -}
17068 -
17069 -static inline void pgd_list_del(pgd_t *pgd)
17070 -{
17071 - struct page *page = virt_to_page(pgd);
17072 - unsigned long flags;
17073 -
17074 - spin_lock_irqsave(&pgd_lock, flags);
17075 - list_del(&page->lru);
17076 - spin_unlock_irqrestore(&pgd_lock, flags);
17077 -}
17078 -
17079 -extern void pgd_test_and_unpin(pgd_t *);
17080 -
17081 -static inline pgd_t *pgd_alloc(struct mm_struct *mm)
17082 -{
17083 - /*
17084 - * We allocate two contiguous pages for kernel and user.
17085 - */
17086 - unsigned boundary;
17087 - pgd_t *pgd = (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_REPEAT, 1);
17088 - if (!pgd)
17089 - return NULL;
17090 - pgd_list_add(pgd);
17091 - pgd_test_and_unpin(pgd);
17092 - /*
17093 - * Copy kernel pointers in from init.
17094 - * Could keep a freelist or slab cache of those because the kernel
17095 - * part never changes.
17096 - */
17097 - boundary = pgd_index(__PAGE_OFFSET);
17098 - memset(pgd, 0, boundary * sizeof(pgd_t));
17099 - memcpy(pgd + boundary,
17100 - init_level4_pgt + boundary,
17101 - (PTRS_PER_PGD - boundary) * sizeof(pgd_t));
17102 -
17103 - memset(__user_pgd(pgd), 0, PAGE_SIZE); /* clean up user pgd */
17104 - /*
17105 - * Set level3_user_pgt for vsyscall area
17106 - */
17107 - __user_pgd(pgd)[pgd_index(VSYSCALL_START)] =
17108 - __pgd(__pa_symbol(level3_user_pgt) | _PAGE_TABLE);
17109 - return pgd;
17110 -}
17111 -
17112 -static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
17113 -{
17114 - pgd_test_and_unpin(pgd);
17115 - pgd_list_del(pgd);
17116 - free_pages((unsigned long)pgd, 1);
17117 -}
17118 -
17119 -static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
17120 -{
17121 - pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
17122 - if (pte)
17123 - make_page_readonly(pte, XENFEAT_writable_page_tables);
17124 -
17125 - return pte;
17126 -}
17127 -
17128 -extern pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long addr);
17129 -
17130 -/* Should really implement gc for free page table pages. This could be
17131 - done with a reference count in struct page. */
17132 -
17133 -static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
17134 -{
17135 - BUG_ON((unsigned long)pte & (PAGE_SIZE-1));
17136 - make_page_writable(pte, XENFEAT_writable_page_tables);
17137 - free_page((unsigned long)pte);
17138 -}
17139 -
17140 -extern void __pte_free(pgtable_t);
17141 -static inline void pte_free(struct mm_struct *mm, pgtable_t pte)
17142 -{
17143 - __pte_free(pte);
17144 -}
17145 -
17146 -#define __pte_free_tlb(tlb,pte) \
17147 -do { \
17148 - pgtable_page_dtor((pte)); \
17149 - tlb_remove_page((tlb), (pte)); \
17150 -} while (0)
17151 -
17152 -#define __pmd_free_tlb(tlb,x) tlb_remove_page((tlb),virt_to_page(x))
17153 -#define __pud_free_tlb(tlb,x) tlb_remove_page((tlb),virt_to_page(x))
17154 -
17155 -#endif /* _X86_64_PGALLOC_H */
17156 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/pgtable.h 2009-03-16 16:33:40.000000000 +0100
17157 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/pgtable.h 2009-03-16 16:38:05.000000000 +0100
17158 @@ -1,17 +1,15 @@
17159 #ifndef _ASM_X86_PGTABLE_H
17160 #define _ASM_X86_PGTABLE_H
17161
17162 -#define USER_PTRS_PER_PGD ((TASK_SIZE-1)/PGDIR_SIZE+1)
17163 #define FIRST_USER_ADDRESS 0
17164
17165 -#define _PAGE_BIT_PRESENT 0
17166 -#define _PAGE_BIT_RW 1
17167 -#define _PAGE_BIT_USER 2
17168 -#define _PAGE_BIT_PWT 3
17169 -#define _PAGE_BIT_PCD 4
17170 -#define _PAGE_BIT_ACCESSED 5
17171 -#define _PAGE_BIT_DIRTY 6
17172 -#define _PAGE_BIT_FILE 6
17173 +#define _PAGE_BIT_PRESENT 0 /* is present */
17174 +#define _PAGE_BIT_RW 1 /* writeable */
17175 +#define _PAGE_BIT_USER 2 /* userspace addressable */
17176 +#define _PAGE_BIT_PWT 3 /* page write through */
17177 +#define _PAGE_BIT_PCD 4 /* page cache disabled */
17178 +#define _PAGE_BIT_ACCESSED 5 /* was accessed (raised by CPU) */
17179 +#define _PAGE_BIT_DIRTY 6 /* was written to (raised by CPU) */
17180 #define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */
17181 #define _PAGE_BIT_PAT 7 /* on 4KB pages */
17182 #define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
17183 @@ -22,6 +20,14 @@
17184 #define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
17185 #define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
17186
17187 +/* If _PAGE_BIT_PRESENT is clear, we use these: */
17188 +
17189 +/* set: nonlinear file mapping, saved PTE; unset:swap */
17190 +#define _PAGE_BIT_FILE _PAGE_BIT_DIRTY
17191 +
17192 +/* if the user mapped it with PROT_NONE; pte_present gives true */
17193 +#define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL
17194 +
17195 /*
17196 * Note: we use _AC(1, L) instead of _AC(1, UL) so that we get a
17197 * sign-extended value on 32-bit with all 1's in the upper word,
17198 @@ -48,10 +54,8 @@
17199 #define _PAGE_NX 0
17200 #endif
17201
17202 -/* If _PAGE_PRESENT is clear, we use these: */
17203 -#define _PAGE_FILE _PAGE_DIRTY /* nonlinear file mapping, saved PTE; unset:swap */
17204 -#define _PAGE_PROTNONE _PAGE_PSE /* if the user mapped it with PROT_NONE;
17205 - pte_present gives true */
17206 +#define _PAGE_FILE (_AC(1, L)<<_PAGE_BIT_FILE)
17207 +#define _PAGE_PROTNONE (_AC(1, L)<<_PAGE_BIT_PROTNONE)
17208
17209 #ifndef __ASSEMBLY__
17210 #if defined(CONFIG_X86_64) && CONFIG_XEN_COMPAT <= 0x030002
17211 @@ -61,20 +65,42 @@ extern unsigned int __kernel_page_user;
17212 #endif
17213 #endif
17214
17215 -#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
17216 -#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY | __kernel_page_user)
17217 +#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
17218 + _PAGE_ACCESSED | _PAGE_DIRTY)
17219 +#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \
17220 + _PAGE_DIRTY | __kernel_page_user)
17221 +
17222 +/* Set of bits not changed in pte_modify */
17223 +#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_CACHE_MASK | _PAGE_IO | \
17224 + _PAGE_ACCESSED | _PAGE_DIRTY)
17225
17226 -#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_IO)
17227 +/*
17228 + * PAT settings are part of the hypervisor interface, which sets the
17229 + * MSR to 0x050100070406 (i.e. WB, WT, UC-, UC, WC, WP [, UC, UC]).
17230 + */
17231 +#define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT | _PAGE_PAT)
17232 +#define _PAGE_CACHE_WB (0)
17233 +#define _PAGE_CACHE_WT (_PAGE_PWT)
17234 +#define _PAGE_CACHE_WC (_PAGE_PAT)
17235 +#define _PAGE_CACHE_WP (_PAGE_PAT | _PAGE_PWT)
17236 +#define _PAGE_CACHE_UC_MINUS (_PAGE_PCD)
17237 +#define _PAGE_CACHE_UC (_PAGE_PCD | _PAGE_PWT)
17238
17239 #define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
17240 -#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
17241 +#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
17242 + _PAGE_ACCESSED | _PAGE_NX)
17243
17244 -#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
17245 -#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
17246 -#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
17247 +#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | \
17248 + _PAGE_USER | _PAGE_ACCESSED)
17249 +#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
17250 + _PAGE_ACCESSED | _PAGE_NX)
17251 +#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
17252 + _PAGE_ACCESSED)
17253 #define PAGE_COPY PAGE_COPY_NOEXEC
17254 -#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
17255 -#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
17256 +#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | \
17257 + _PAGE_ACCESSED | _PAGE_NX)
17258 +#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
17259 + _PAGE_ACCESSED)
17260
17261 #ifdef CONFIG_X86_32
17262 #define _PAGE_KERNEL_EXEC \
17263 @@ -93,6 +119,7 @@ extern pteval_t __PAGE_KERNEL, __PAGE_KE
17264 #define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW)
17265 #define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW)
17266 #define __PAGE_KERNEL_EXEC_NOCACHE (__PAGE_KERNEL_EXEC | _PAGE_PCD | _PAGE_PWT)
17267 +#define __PAGE_KERNEL_WC (__PAGE_KERNEL | _PAGE_CACHE_WC)
17268 #define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD | _PAGE_PWT)
17269 #define __PAGE_KERNEL_UC_MINUS (__PAGE_KERNEL | _PAGE_PCD)
17270 #define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER)
17271 @@ -109,6 +136,7 @@ extern pteval_t __PAGE_KERNEL, __PAGE_KE
17272 #define PAGE_KERNEL_RO MAKE_GLOBAL(__PAGE_KERNEL_RO)
17273 #define PAGE_KERNEL_EXEC MAKE_GLOBAL(__PAGE_KERNEL_EXEC)
17274 #define PAGE_KERNEL_RX MAKE_GLOBAL(__PAGE_KERNEL_RX)
17275 +#define PAGE_KERNEL_WC MAKE_GLOBAL(__PAGE_KERNEL_WC)
17276 #define PAGE_KERNEL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_NOCACHE)
17277 #define PAGE_KERNEL_UC_MINUS MAKE_GLOBAL(__PAGE_KERNEL_UC_MINUS)
17278 #define PAGE_KERNEL_EXEC_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_EXEC_NOCACHE)
17279 @@ -142,7 +170,7 @@ extern pteval_t __PAGE_KERNEL, __PAGE_KE
17280 * ZERO_PAGE is a global shared page that is always zero: used
17281 * for zero-mapped memory areas etc..
17282 */
17283 -extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)];
17284 +extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)];
17285 #define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
17286
17287 extern spinlock_t pgd_lock;
17288 @@ -152,30 +180,111 @@ extern struct list_head pgd_list;
17289 * The following only work if pte_present() is true.
17290 * Undefined behaviour if not..
17291 */
17292 -static inline int pte_dirty(pte_t pte) { return __pte_val(pte) & _PAGE_DIRTY; }
17293 -static inline int pte_young(pte_t pte) { return __pte_val(pte) & _PAGE_ACCESSED; }
17294 -static inline int pte_write(pte_t pte) { return __pte_val(pte) & _PAGE_RW; }
17295 -static inline int pte_file(pte_t pte) { return __pte_val(pte) & _PAGE_FILE; }
17296 -static inline int pte_huge(pte_t pte) { return __pte_val(pte) & _PAGE_PSE; }
17297 -static inline int pte_global(pte_t pte) { return 0; }
17298 -static inline int pte_exec(pte_t pte) { return !(__pte_val(pte) & _PAGE_NX); }
17299 -
17300 -static inline int pmd_large(pmd_t pte) {
17301 - return (__pmd_val(pte) & (_PAGE_PSE|_PAGE_PRESENT)) ==
17302 - (_PAGE_PSE|_PAGE_PRESENT);
17303 -}
17304 -
17305 -static inline pte_t pte_mkclean(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_DIRTY); }
17306 -static inline pte_t pte_mkold(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_ACCESSED); }
17307 -static inline pte_t pte_wrprotect(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_RW); }
17308 -static inline pte_t pte_mkexec(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_NX); }
17309 -static inline pte_t pte_mkdirty(pte_t pte) { return __pte_ma(__pte_val(pte) | _PAGE_DIRTY); }
17310 -static inline pte_t pte_mkyoung(pte_t pte) { return __pte_ma(__pte_val(pte) | _PAGE_ACCESSED); }
17311 -static inline pte_t pte_mkwrite(pte_t pte) { return __pte_ma(__pte_val(pte) | _PAGE_RW); }
17312 -static inline pte_t pte_mkhuge(pte_t pte) { return __pte_ma(__pte_val(pte) | _PAGE_PSE); }
17313 -static inline pte_t pte_clrhuge(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_PSE); }
17314 -static inline pte_t pte_mkglobal(pte_t pte) { return pte; }
17315 -static inline pte_t pte_clrglobal(pte_t pte) { return pte; }
17316 +static inline int pte_dirty(pte_t pte)
17317 +{
17318 + return __pte_val(pte) & _PAGE_DIRTY;
17319 +}
17320 +
17321 +static inline int pte_young(pte_t pte)
17322 +{
17323 + return __pte_val(pte) & _PAGE_ACCESSED;
17324 +}
17325 +
17326 +static inline int pte_write(pte_t pte)
17327 +{
17328 + return __pte_val(pte) & _PAGE_RW;
17329 +}
17330 +
17331 +static inline int pte_file(pte_t pte)
17332 +{
17333 + return __pte_val(pte) & _PAGE_FILE;
17334 +}
17335 +
17336 +static inline int pte_huge(pte_t pte)
17337 +{
17338 + return __pte_val(pte) & _PAGE_PSE;
17339 +}
17340 +
17341 +static inline int pte_global(pte_t pte)
17342 +{
17343 + return 0;
17344 +}
17345 +
17346 +static inline int pte_exec(pte_t pte)
17347 +{
17348 + return !(__pte_val(pte) & _PAGE_NX);
17349 +}
17350 +
17351 +static inline int pte_special(pte_t pte)
17352 +{
17353 + return 0;
17354 +}
17355 +
17356 +static inline int pmd_large(pmd_t pte)
17357 +{
17358 + return (__pmd_val(pte) & (_PAGE_PSE | _PAGE_PRESENT)) ==
17359 + (_PAGE_PSE | _PAGE_PRESENT);
17360 +}
17361 +
17362 +static inline pte_t pte_mkclean(pte_t pte)
17363 +{
17364 + return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_DIRTY);
17365 +}
17366 +
17367 +static inline pte_t pte_mkold(pte_t pte)
17368 +{
17369 + return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_ACCESSED);
17370 +}
17371 +
17372 +static inline pte_t pte_wrprotect(pte_t pte)
17373 +{
17374 + return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_RW);
17375 +}
17376 +
17377 +static inline pte_t pte_mkexec(pte_t pte)
17378 +{
17379 + return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_NX);
17380 +}
17381 +
17382 +static inline pte_t pte_mkdirty(pte_t pte)
17383 +{
17384 + return __pte_ma(__pte_val(pte) | _PAGE_DIRTY);
17385 +}
17386 +
17387 +static inline pte_t pte_mkyoung(pte_t pte)
17388 +{
17389 + return __pte_ma(__pte_val(pte) | _PAGE_ACCESSED);
17390 +}
17391 +
17392 +static inline pte_t pte_mkwrite(pte_t pte)
17393 +{
17394 + return __pte_ma(__pte_val(pte) | _PAGE_RW);
17395 +}
17396 +
17397 +static inline pte_t pte_mkhuge(pte_t pte)
17398 +{
17399 + return __pte_ma(__pte_val(pte) | _PAGE_PSE);
17400 +}
17401 +
17402 +static inline pte_t pte_clrhuge(pte_t pte)
17403 +{
17404 + return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_PSE);
17405 +}
17406 +
17407 +static inline pte_t pte_mkglobal(pte_t pte)
17408 +{
17409 + return pte;
17410 +}
17411 +
17412 +static inline pte_t pte_clrglobal(pte_t pte)
17413 +{
17414 + return pte;
17415 +}
17416 +
17417 +static inline pte_t pte_mkspecial(pte_t pte)
17418 +{
17419 + return pte;
17420 +}
17421
17422 extern pteval_t __supported_pte_mask;
17423
17424 @@ -202,15 +311,33 @@ static inline pte_t pte_modify(pte_t pte
17425 pteval_t val = pte_val(pte);
17426
17427 val &= _PAGE_CHG_MASK;
17428 - val |= pgprot_val(newprot) & __supported_pte_mask;
17429 + val |= pgprot_val(newprot) & (~_PAGE_CHG_MASK) & __supported_pte_mask;
17430
17431 return __pte(val);
17432 }
17433
17434 -#define pte_pgprot(x) __pgprot(pte_val(x) & (0xfff | _PAGE_NX))
17435 +/* mprotect needs to preserve PAT bits when updating vm_page_prot */
17436 +#define pgprot_modify pgprot_modify
17437 +static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
17438 +{
17439 + pgprotval_t preservebits = pgprot_val(oldprot) & _PAGE_CHG_MASK;
17440 + pgprotval_t addbits = pgprot_val(newprot);
17441 + return __pgprot(preservebits | addbits);
17442 +}
17443 +
17444 +#define pte_pgprot(x) __pgprot(__pte_val(x) & ~PTE_MASK)
17445
17446 #define canon_pgprot(p) __pgprot(pgprot_val(p) & __supported_pte_mask)
17447
17448 +#ifndef __ASSEMBLY__
17449 +#define __HAVE_PHYS_MEM_ACCESS_PROT
17450 +struct file;
17451 +pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
17452 + unsigned long size, pgprot_t vma_prot);
17453 +int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
17454 + unsigned long size, pgprot_t *vma_prot);
17455 +#endif
17456 +
17457 #define set_pte(ptep, pte) xen_set_pte(ptep, pte)
17458 #define set_pte_at(mm, addr, ptep, pte) xen_set_pte_at(mm, addr, ptep, pte)
17459
17460 @@ -246,6 +373,9 @@ static inline pte_t pte_modify(pte_t pte
17461 # include "pgtable_64.h"
17462 #endif
17463
17464 +#define KERNEL_PGD_BOUNDARY pgd_index(PAGE_OFFSET)
17465 +#define KERNEL_PGD_PTRS (PTRS_PER_PGD - KERNEL_PGD_BOUNDARY)
17466 +
17467 #ifndef __ASSEMBLY__
17468
17469 enum {
17470 @@ -312,46 +442,17 @@ static inline void xen_pte_clear(struct
17471 * bit at the same time.
17472 */
17473 #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
17474 -#define ptep_set_access_flags(vma, address, ptep, entry, dirty) \
17475 -({ \
17476 - int __changed = !pte_same(*(ptep), entry); \
17477 - if (__changed && (dirty)) { \
17478 - if ( likely((vma)->vm_mm == current->mm) ) { \
17479 - BUG_ON(HYPERVISOR_update_va_mapping(address, \
17480 - entry, \
17481 - (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
17482 - UVMF_INVLPG|UVMF_MULTI)); \
17483 - } else { \
17484 - xen_l1_entry_update(ptep, entry); \
17485 - flush_tlb_page(vma, address); \
17486 - } \
17487 - } \
17488 - __changed; \
17489 -})
17490 +extern int ptep_set_access_flags(struct vm_area_struct *vma,
17491 + unsigned long address, pte_t *ptep,
17492 + pte_t entry, int dirty);
17493
17494 #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
17495 -#define ptep_test_and_clear_young(vma, addr, ptep) ({ \
17496 - int __ret = 0; \
17497 - if (pte_young(*(ptep))) \
17498 - __ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, \
17499 - &(ptep)->pte); \
17500 - if (__ret) \
17501 - pte_update((vma)->vm_mm, addr, ptep); \
17502 - __ret; \
17503 -})
17504 +extern int ptep_test_and_clear_young(struct vm_area_struct *vma,
17505 + unsigned long addr, pte_t *ptep);
17506
17507 #define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
17508 -#define ptep_clear_flush_young(vma, address, ptep) \
17509 -({ \
17510 - pte_t __pte = *(ptep); \
17511 - int __young = pte_young(__pte); \
17512 - __pte = pte_mkold(__pte); \
17513 - if (PagePinned(virt_to_page((vma)->vm_mm->pgd))) \
17514 - (void)ptep_set_access_flags(vma, address, ptep, __pte, __young); \
17515 - else if (__young) \
17516 - (ptep)->pte_low = __pte.pte_low; \
17517 - __young; \
17518 -})
17519 +extern int ptep_clear_flush_young(struct vm_area_struct *vma,
17520 + unsigned long address, pte_t *ptep);
17521
17522 #define __HAVE_ARCH_PTEP_CLEAR_FLUSH
17523 #define ptep_clear_flush(vma, addr, ptep) \
17524 @@ -370,7 +471,8 @@ static inline void xen_pte_clear(struct
17525 })
17526
17527 #define __HAVE_ARCH_PTEP_GET_AND_CLEAR
17528 -static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
17529 +static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
17530 + pte_t *ptep)
17531 {
17532 pte_t pte = *ptep;
17533 if (!pte_none(pte)
17534 @@ -398,13 +500,29 @@ static inline pte_t ptep_get_and_clear(s
17535 pte_t xen_ptep_get_and_clear_full(struct vm_area_struct *, unsigned long, pte_t *, int);
17536
17537 #define __HAVE_ARCH_PTEP_SET_WRPROTECT
17538 -static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
17539 +static inline void ptep_set_wrprotect(struct mm_struct *mm,
17540 + unsigned long addr, pte_t *ptep)
17541 {
17542 pte_t pte = *ptep;
17543 if (pte_write(pte))
17544 set_pte_at(mm, addr, ptep, pte_wrprotect(pte));
17545 }
17546
17547 +/*
17548 + * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
17549 + *
17550 + * dst - pointer to pgd range anywhere on a pgd page
17551 + * src - ""
17552 + * count - the number of pgds to copy.
17553 + *
17554 + * dst and src can be on the same page, but the range must not overlap,
17555 + * and must not cross a page boundary.
17556 + */
17557 +static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
17558 +{
17559 + memcpy(dst, src, count * sizeof(pgd_t));
17560 +}
17561 +
17562 #define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) \
17563 xen_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable)
17564
17565 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/pgtable-3level.h 2009-03-16 16:33:40.000000000 +0100
17566 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/pgtable-3level.h 2009-03-16 16:38:05.000000000 +0100
17567 @@ -8,25 +8,28 @@
17568 * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
17569 */
17570
17571 -#define pte_ERROR(e) \
17572 - printk("%s:%d: bad pte %p(%016Lx pfn %08lx).\n", __FILE__, __LINE__, \
17573 - &(e), __pte_val(e), pte_pfn(e))
17574 -#define pmd_ERROR(e) \
17575 - printk("%s:%d: bad pmd %p(%016Lx pfn %08Lx).\n", __FILE__, __LINE__, \
17576 - &(e), __pmd_val(e), (pmd_val(e) & PTE_MASK) >> PAGE_SHIFT)
17577 -#define pgd_ERROR(e) \
17578 - printk("%s:%d: bad pgd %p(%016Lx pfn %08Lx).\n", __FILE__, __LINE__, \
17579 - &(e), __pgd_val(e), (pgd_val(e) & PTE_MASK) >> PAGE_SHIFT)
17580 -
17581 +#define pte_ERROR(e) \
17582 + printk("%s:%d: bad pte %p(%016Lx pfn %08lx).\n", \
17583 + __FILE__, __LINE__, &(e), __pte_val(e), pte_pfn(e))
17584 +#define pmd_ERROR(e) \
17585 + printk("%s:%d: bad pmd %p(%016Lx pfn %08Lx).\n", \
17586 + __FILE__, __LINE__, &(e), __pmd_val(e), \
17587 + (pmd_val(e) & PTE_MASK) >> PAGE_SHIFT)
17588 +#define pgd_ERROR(e) \
17589 + printk("%s:%d: bad pgd %p(%016Lx pfn %08Lx).\n", \
17590 + __FILE__, __LINE__, &(e), __pgd_val(e), \
17591 + (pgd_val(e) & PTE_MASK) >> PAGE_SHIFT)
17592
17593 static inline int pud_none(pud_t pud)
17594 {
17595 return __pud_val(pud) == 0;
17596 +
17597 }
17598 static inline int pud_bad(pud_t pud)
17599 {
17600 return (__pud_val(pud) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER)) != 0;
17601 }
17602 +
17603 static inline int pud_present(pud_t pud)
17604 {
17605 return __pud_val(pud) & _PAGE_PRESENT;
17606 @@ -48,12 +51,14 @@ static inline void xen_set_pte(pte_t *pt
17607
17608 static inline void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
17609 {
17610 - set_64bit((unsigned long long *)(ptep),__pte_val(pte));
17611 + set_64bit((unsigned long long *)(ptep), __pte_val(pte));
17612 }
17613 +
17614 static inline void xen_set_pmd(pmd_t *pmdp, pmd_t pmd)
17615 {
17616 xen_l2_entry_update(pmdp, pmd);
17617 }
17618 +
17619 static inline void xen_set_pud(pud_t *pudp, pud_t pud)
17620 {
17621 xen_l3_entry_update(pudp, pud);
17622 @@ -92,20 +97,19 @@ static inline void pud_clear(pud_t *pudp
17623 * current pgd to avoid unnecessary TLB flushes.
17624 */
17625 pgd = read_cr3();
17626 - if (__pa(pudp) >= pgd && __pa(pudp) < (pgd + sizeof(pgd_t)*PTRS_PER_PGD))
17627 + if (__pa(pudp) >= pgd && __pa(pudp) <
17628 + (pgd + sizeof(pgd_t)*PTRS_PER_PGD))
17629 xen_tlb_flush();
17630 }
17631
17632 -#define pud_page(pud) \
17633 -((struct page *) __va(pud_val(pud) & PAGE_MASK))
17634 +#define pud_page(pud) ((struct page *) __va(pud_val(pud) & PTE_MASK))
17635
17636 -#define pud_page_vaddr(pud) \
17637 -((unsigned long) __va(pud_val(pud) & PAGE_MASK))
17638 +#define pud_page_vaddr(pud) ((unsigned long) __va(pud_val(pud) & PTE_MASK))
17639
17640
17641 /* Find an entry in the second-level page table.. */
17642 -#define pmd_offset(pud, address) ((pmd_t *) pud_page(*(pud)) + \
17643 - pmd_index(address))
17644 +#define pmd_offset(pud, address) ((pmd_t *)pud_page(*(pud)) + \
17645 + pmd_index(address))
17646
17647 #ifdef CONFIG_SMP
17648 static inline pte_t xen_ptep_get_and_clear(pte_t *ptep, pte_t res)
17649 @@ -150,7 +154,8 @@ static inline int pte_none(pte_t pte)
17650 * put the 32 bits of offset into the high part.
17651 */
17652 #define pte_to_pgoff(pte) ((pte).pte_high)
17653 -#define pgoff_to_pte(off) ((pte_t) { { .pte_low = _PAGE_FILE, .pte_high = (off) } })
17654 +#define pgoff_to_pte(off) \
17655 + ((pte_t) { { .pte_low = _PAGE_FILE, .pte_high = (off) } })
17656 #define PTE_FILE_MAX_BITS 32
17657
17658 /* Encode and de-code a swap entry */
17659 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/pgtable_32.h 2009-03-16 16:33:40.000000000 +0100
17660 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/pgtable_32.h 2009-03-16 16:38:05.000000000 +0100
17661 @@ -38,16 +38,13 @@ void paging_init(void);
17662 #ifdef CONFIG_X86_PAE
17663 # include <asm/pgtable-3level-defs.h>
17664 # define PMD_SIZE (1UL << PMD_SHIFT)
17665 -# define PMD_MASK (~(PMD_SIZE-1))
17666 +# define PMD_MASK (~(PMD_SIZE - 1))
17667 #else
17668 # include <asm/pgtable-2level-defs.h>
17669 #endif
17670
17671 #define PGDIR_SIZE (1UL << PGDIR_SHIFT)
17672 -#define PGDIR_MASK (~(PGDIR_SIZE-1))
17673 -
17674 -#define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT)
17675 -#define KERNEL_PGD_PTRS (PTRS_PER_PGD-USER_PGD_PTRS)
17676 +#define PGDIR_MASK (~(PGDIR_SIZE - 1))
17677
17678 /* Just any arbitrary offset to the start of the vmalloc VM area: the
17679 * current 8MB value just means that there will be a 8MB "hole" after the
17680 @@ -56,21 +53,22 @@ void paging_init(void);
17681 * The vmalloc() routines leaves a hole of 4kB between each vmalloced
17682 * area for the same reason. ;)
17683 */
17684 -#define VMALLOC_OFFSET (8*1024*1024)
17685 -#define VMALLOC_START (((unsigned long) high_memory + \
17686 - 2*VMALLOC_OFFSET-1) & ~(VMALLOC_OFFSET-1))
17687 +#define VMALLOC_OFFSET (8 * 1024 * 1024)
17688 +#define VMALLOC_START (((unsigned long)high_memory + 2 * VMALLOC_OFFSET - 1) \
17689 + & ~(VMALLOC_OFFSET - 1))
17690 #ifdef CONFIG_X86_PAE
17691 #define LAST_PKMAP 512
17692 #else
17693 #define LAST_PKMAP 1024
17694 #endif
17695
17696 -#define PKMAP_BASE ((FIXADDR_BOOT_START - PAGE_SIZE*(LAST_PKMAP + 1)) & PMD_MASK)
17697 +#define PKMAP_BASE ((FIXADDR_BOOT_START - PAGE_SIZE * (LAST_PKMAP + 1)) \
17698 + & PMD_MASK)
17699
17700 #ifdef CONFIG_HIGHMEM
17701 -# define VMALLOC_END (PKMAP_BASE-2*PAGE_SIZE)
17702 +# define VMALLOC_END (PKMAP_BASE - 2 * PAGE_SIZE)
17703 #else
17704 -# define VMALLOC_END (FIXADDR_START-2*PAGE_SIZE)
17705 +# define VMALLOC_END (FIXADDR_START - 2 * PAGE_SIZE)
17706 #endif
17707
17708 /*
17709 @@ -91,10 +89,10 @@ extern unsigned long pg0[];
17710 /* pmd_present doesn't just test the _PAGE_PRESENT bit since wr.p.t.
17711 can temporarily clear it. */
17712 #define pmd_present(x) (__pmd_val(x))
17713 -#define pmd_bad(x) ((__pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT))
17714 +#define pmd_bad(x) ((__pmd_val(x) & (~PTE_MASK & ~_PAGE_USER & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT))
17715 #else
17716 #define pmd_present(x) (__pmd_val(x) & _PAGE_PRESENT)
17717 -#define pmd_bad(x) ((__pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
17718 +#define pmd_bad(x) ((__pmd_val(x) & (~PTE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
17719 #endif
17720
17721
17722 @@ -107,32 +105,18 @@ extern unsigned long pg0[];
17723 #endif
17724
17725 /*
17726 - * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
17727 - *
17728 - * dst - pointer to pgd range anwhere on a pgd page
17729 - * src - ""
17730 - * count - the number of pgds to copy.
17731 - *
17732 - * dst and src can be on the same page, but the range must not overlap,
17733 - * and must not cross a page boundary.
17734 + * Macro to mark a page protection value as "uncacheable".
17735 + * On processors which do not support it, this is a no-op.
17736 */
17737 -static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
17738 -{
17739 - memcpy(dst, src, count * sizeof(pgd_t));
17740 -}
17741 -
17742 -/*
17743 - * Macro to mark a page protection value as "uncacheable". On processors which do not support
17744 - * it, this is a no-op.
17745 - */
17746 -#define pgprot_noncached(prot) ((boot_cpu_data.x86 > 3) \
17747 - ? (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT)) : (prot))
17748 +#define pgprot_noncached(prot) \
17749 + ((boot_cpu_data.x86 > 3) \
17750 + ? (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT)) \
17751 + : (prot))
17752
17753 /*
17754 * Conversion functions: convert a page and protection to a page entry,
17755 * and a page entry and page directory to the page they refer to.
17756 */
17757 -
17758 #define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
17759
17760 /*
17761 @@ -141,20 +125,20 @@ static inline void clone_pgd_range(pgd_t
17762 * this macro returns the index of the entry in the pgd page which would
17763 * control the given virtual address
17764 */
17765 -#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1))
17766 -#define pgd_index_k(addr) pgd_index(addr)
17767 +#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
17768 +#define pgd_index_k(addr) pgd_index((addr))
17769
17770 /*
17771 * pgd_offset() returns a (pgd_t *)
17772 * pgd_index() is used get the offset into the pgd page's array of pgd_t's;
17773 */
17774 -#define pgd_offset(mm, address) ((mm)->pgd+pgd_index(address))
17775 +#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address)))
17776
17777 /*
17778 * a shortcut which implies the use of the kernel's pgd, instead
17779 * of a process's
17780 */
17781 -#define pgd_offset_k(address) pgd_offset(&init_mm, address)
17782 +#define pgd_offset_k(address) pgd_offset(&init_mm, (address))
17783
17784 static inline int pud_large(pud_t pud) { return 0; }
17785
17786 @@ -164,8 +148,8 @@ static inline int pud_large(pud_t pud) {
17787 * this macro returns the index of the entry in the pmd page which would
17788 * control the given virtual address
17789 */
17790 -#define pmd_index(address) \
17791 - (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1))
17792 +#define pmd_index(address) \
17793 + (((address) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))
17794
17795 /*
17796 * the pte page can be thought of an array like this: pte_t[PTRS_PER_PTE]
17797 @@ -173,33 +157,36 @@ static inline int pud_large(pud_t pud) {
17798 * this macro returns the index of the entry in the pte page which would
17799 * control the given virtual address
17800 */
17801 -#define pte_index(address) \
17802 - (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
17803 -#define pte_offset_kernel(dir, address) \
17804 - ((pte_t *) pmd_page_vaddr(*(dir)) + pte_index(address))
17805 +#define pte_index(address) \
17806 + (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
17807 +#define pte_offset_kernel(dir, address) \
17808 + ((pte_t *)pmd_page_vaddr(*(dir)) + pte_index((address)))
17809
17810 -#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
17811 +#define pmd_page(pmd) (pfn_to_page(pmd_val((pmd)) >> PAGE_SHIFT))
17812
17813 -#define pmd_page_vaddr(pmd) \
17814 - ((unsigned long) __va(pmd_val(pmd) & PAGE_MASK))
17815 +#define pmd_page_vaddr(pmd) \
17816 + ((unsigned long)__va(pmd_val((pmd)) & PTE_MASK))
17817
17818 #if defined(CONFIG_HIGHPTE)
17819 -#define pte_offset_map(dir, address) \
17820 - ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE0) + pte_index(address))
17821 -#define pte_offset_map_nested(dir, address) \
17822 - ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE1) + pte_index(address))
17823 -#define pte_unmap(pte) kunmap_atomic(pte, KM_PTE0)
17824 -#define pte_unmap_nested(pte) kunmap_atomic(pte, KM_PTE1)
17825 -#else
17826 -#define pte_offset_map(dir, address) \
17827 - ((pte_t *)page_address(pmd_page(*(dir))) + pte_index(address))
17828 -#define pte_offset_map_nested(dir, address) pte_offset_map(dir, address)
17829 +#define pte_offset_map(dir, address) \
17830 + ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE0) + \
17831 + pte_index((address)))
17832 +#define pte_offset_map_nested(dir, address) \
17833 + ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE1) + \
17834 + pte_index((address)))
17835 +#define pte_unmap(pte) kunmap_atomic((pte), KM_PTE0)
17836 +#define pte_unmap_nested(pte) kunmap_atomic((pte), KM_PTE1)
17837 +#else
17838 +#define pte_offset_map(dir, address) \
17839 + ((pte_t *)page_address(pmd_page(*(dir))) + pte_index((address)))
17840 +#define pte_offset_map_nested(dir, address) pte_offset_map((dir), (address))
17841 #define pte_unmap(pte) do { } while (0)
17842 #define pte_unmap_nested(pte) do { } while (0)
17843 #endif
17844
17845 /* Clear a kernel PTE and flush it from the TLB */
17846 -#define kpte_clear_flush(ptep, vaddr) do { \
17847 +#define kpte_clear_flush(ptep, vaddr) \
17848 +do { \
17849 if (HYPERVISOR_update_va_mapping(vaddr, __pte(0), UVMF_INVLPG)) \
17850 BUG(); \
17851 } while (0)
17852 @@ -208,7 +195,7 @@ static inline int pud_large(pud_t pud) {
17853 * The i386 doesn't have any external MMU info: the kernel page
17854 * tables contain all the necessary information.
17855 */
17856 -#define update_mmu_cache(vma,address,pte) do { } while (0)
17857 +#define update_mmu_cache(vma, address, pte) do { } while (0)
17858
17859 void make_lowmem_page_readonly(void *va, unsigned int feature);
17860 void make_lowmem_page_writable(void *va, unsigned int feature);
17861 @@ -225,7 +212,7 @@ void make_lowmem_page_writable(void *va,
17862 #define kern_addr_valid(kaddr) (0)
17863 #endif
17864
17865 -#define io_remap_pfn_range(vma,from,pfn,size,prot) \
17866 -direct_remap_pfn_range(vma,from,pfn,size,prot,DOMID_IO)
17867 +#define io_remap_pfn_range(vma, from, pfn, size, prot) \
17868 + direct_remap_pfn_range(vma, from, pfn, size, prot, DOMID_IO)
17869
17870 #endif /* _I386_PGTABLE_H */
17871 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/pgtable_64.h 2009-03-16 16:33:40.000000000 +0100
17872 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/pgtable_64.h 2009-03-16 16:38:05.000000000 +0100
17873 @@ -31,7 +31,7 @@ extern void paging_init(void);
17874
17875 #endif /* !__ASSEMBLY__ */
17876
17877 -#define SHARED_KERNEL_PMD 1
17878 +#define SHARED_KERNEL_PMD 0
17879
17880 /*
17881 * PGDIR_SHIFT determines what a top-level page table entry can map
17882 @@ -59,18 +59,20 @@ extern void paging_init(void);
17883
17884 #ifndef __ASSEMBLY__
17885
17886 -#define pte_ERROR(e) \
17887 - printk("%s:%d: bad pte %p(%016lx pfn %010lx).\n", __FILE__, __LINE__, \
17888 - &(e), __pte_val(e), pte_pfn(e))
17889 -#define pmd_ERROR(e) \
17890 - printk("%s:%d: bad pmd %p(%016lx pfn %010lx).\n", __FILE__, __LINE__, \
17891 - &(e), __pmd_val(e), pmd_pfn(e))
17892 -#define pud_ERROR(e) \
17893 - printk("%s:%d: bad pud %p(%016lx pfn %010lx).\n", __FILE__, __LINE__, \
17894 - &(e), __pud_val(e), (pud_val(e) & __PHYSICAL_MASK) >> PAGE_SHIFT)
17895 -#define pgd_ERROR(e) \
17896 - printk("%s:%d: bad pgd %p(%016lx pfn %010lx).\n", __FILE__, __LINE__, \
17897 - &(e), __pgd_val(e), (pgd_val(e) & __PHYSICAL_MASK) >> PAGE_SHIFT)
17898 +#define pte_ERROR(e) \
17899 + printk("%s:%d: bad pte %p(%016lx pfn %010lx).\n", \
17900 + __FILE__, __LINE__, &(e), __pte_val(e), pte_pfn(e))
17901 +#define pmd_ERROR(e) \
17902 + printk("%s:%d: bad pmd %p(%016lx pfn %010lx).\n", \
17903 + __FILE__, __LINE__, &(e), __pmd_val(e), pmd_pfn(e))
17904 +#define pud_ERROR(e) \
17905 + printk("%s:%d: bad pud %p(%016lx pfn %010lx).\n", \
17906 + __FILE__, __LINE__, &(e), __pud_val(e), \
17907 + (pud_val(e) & __PHYSICAL_MASK) >> PAGE_SHIFT)
17908 +#define pgd_ERROR(e) \
17909 + printk("%s:%d: bad pgd %p(%016lx pfn %010lx).\n", \
17910 + __FILE__, __LINE__, &(e), __pgd_val(e), \
17911 + (pgd_val(e) & __PHYSICAL_MASK) >> PAGE_SHIFT)
17912
17913 #define pgd_none(x) (!__pgd_val(x))
17914 #define pud_none(x) (!__pud_val(x))
17915 @@ -125,7 +127,7 @@ static inline void xen_set_pgd(pgd_t *pg
17916 xen_l4_entry_update(pgdp, pgd);
17917 }
17918
17919 -static inline void xen_pgd_clear(pgd_t * pgd)
17920 +static inline void xen_pgd_clear(pgd_t *pgd)
17921 {
17922 xen_set_pgd(pgd, xen_make_pgd(0));
17923 xen_set_pgd(__user_pgd(pgd), xen_make_pgd(0));
17924 @@ -135,43 +137,43 @@ static inline void xen_pgd_clear(pgd_t *
17925
17926 #endif /* !__ASSEMBLY__ */
17927
17928 -#define PMD_SIZE (_AC(1,UL) << PMD_SHIFT)
17929 -#define PMD_MASK (~(PMD_SIZE-1))
17930 -#define PUD_SIZE (_AC(1,UL) << PUD_SHIFT)
17931 -#define PUD_MASK (~(PUD_SIZE-1))
17932 -#define PGDIR_SIZE (_AC(1,UL) << PGDIR_SHIFT)
17933 -#define PGDIR_MASK (~(PGDIR_SIZE-1))
17934 +#define PMD_SIZE (_AC(1, UL) << PMD_SHIFT)
17935 +#define PMD_MASK (~(PMD_SIZE - 1))
17936 +#define PUD_SIZE (_AC(1, UL) << PUD_SHIFT)
17937 +#define PUD_MASK (~(PUD_SIZE - 1))
17938 +#define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT)
17939 +#define PGDIR_MASK (~(PGDIR_SIZE - 1))
17940
17941
17942 -#define MAXMEM _AC(0x3fffffffffff, UL)
17943 +#define MAXMEM _AC(0x00003fffffffffff, UL)
17944 #define VMALLOC_START _AC(0xffffc20000000000, UL)
17945 #define VMALLOC_END _AC(0xffffe1ffffffffff, UL)
17946 #define VMEMMAP_START _AC(0xffffe20000000000, UL)
17947 -#define MODULES_VADDR _AC(0xffffffff88000000, UL)
17948 +#define MODULES_VADDR _AC(0xffffffffa0000000, UL)
17949 #define MODULES_END _AC(0xfffffffffff00000, UL)
17950 #define MODULES_LEN (MODULES_END - MODULES_VADDR)
17951
17952 #ifndef __ASSEMBLY__
17953
17954 -static inline unsigned long pgd_bad(pgd_t pgd)
17955 +static inline int pgd_bad(pgd_t pgd)
17956 {
17957 - return __pgd_val(pgd) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER);
17958 + return (__pgd_val(pgd) & ~(PTE_MASK | _PAGE_USER)) != _KERNPG_TABLE;
17959 }
17960
17961 -static inline unsigned long pud_bad(pud_t pud)
17962 +static inline int pud_bad(pud_t pud)
17963 {
17964 - return __pud_val(pud) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER);
17965 + return (__pud_val(pud) & ~(PTE_MASK | _PAGE_USER)) != _KERNPG_TABLE;
17966 }
17967
17968 -static inline unsigned long pmd_bad(pmd_t pmd)
17969 +static inline int pmd_bad(pmd_t pmd)
17970 {
17971 - return __pmd_val(pmd) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER);
17972 + return (__pmd_val(pmd) & ~(PTE_MASK | _PAGE_USER)) != _KERNPG_TABLE;
17973 }
17974
17975 #define pte_none(x) (!(x).pte)
17976 #define pte_present(x) ((x).pte & (_PAGE_PRESENT | _PAGE_PROTNONE))
17977
17978 -#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT)) /* FIXME: is this right? */
17979 +#define pages_to_mb(x) ((x) >> (20 - PAGE_SHIFT)) /* FIXME: is this right? */
17980
17981 #define __pte_mfn(_pte) (((_pte).pte & PTE_MASK) >> PAGE_SHIFT)
17982 #define pte_mfn(_pte) ((_pte).pte & _PAGE_PRESENT ? \
17983 @@ -181,13 +183,13 @@ static inline unsigned long pmd_bad(pmd_
17984 mfn_to_local_pfn(__pte_mfn(_pte)) : \
17985 __pte_mfn(_pte))
17986
17987 -#define pte_page(x) pfn_to_page(pte_pfn(x))
17988 +#define pte_page(x) pfn_to_page(pte_pfn((x)))
17989
17990 /*
17991 * Macro to mark a page protection value as "uncacheable".
17992 */
17993 -#define pgprot_noncached(prot) (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT))
17994 -
17995 +#define pgprot_noncached(prot) \
17996 + (__pgprot(pgprot_val((prot)) | _PAGE_PCD | _PAGE_PWT))
17997
17998 /*
17999 * Conversion functions: convert a page and protection to a page entry,
18000 @@ -197,36 +199,39 @@ static inline unsigned long pmd_bad(pmd_
18001 /*
18002 * Level 4 access.
18003 */
18004 -#define pgd_page_vaddr(pgd) ((unsigned long) __va(pgd_val(pgd) & PTE_MASK))
18005 -#define pgd_page(pgd) (pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT))
18006 -#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1))
18007 -#define pgd_offset(mm, addr) ((mm)->pgd + pgd_index(addr))
18008 -#define pgd_offset_k(address) (init_level4_pgt + pgd_index(address))
18009 +#define pgd_page_vaddr(pgd) \
18010 + ((unsigned long)__va((unsigned long)pgd_val((pgd)) & PTE_MASK))
18011 +#define pgd_page(pgd) (pfn_to_page(pgd_val((pgd)) >> PAGE_SHIFT))
18012 +#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
18013 +#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address)))
18014 +#define pgd_offset_k(address) (init_level4_pgt + pgd_index((address)))
18015 #define pgd_present(pgd) (__pgd_val(pgd) & _PAGE_PRESENT)
18016 static inline int pgd_large(pgd_t pgd) { return 0; }
18017 #define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE)
18018
18019 /* PUD - Level3 access */
18020 /* to find an entry in a page-table-directory. */
18021 -#define pud_page_vaddr(pud) ((unsigned long) __va(pud_val(pud) & PHYSICAL_PAGE_MASK))
18022 -#define pud_page(pud) (pfn_to_page(pud_val(pud) >> PAGE_SHIFT))
18023 -#define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
18024 -#define pud_offset(pgd, address) ((pud_t *) pgd_page_vaddr(*(pgd)) + pud_index(address))
18025 +#define pud_page_vaddr(pud) \
18026 + ((unsigned long)__va(pud_val((pud)) & PHYSICAL_PAGE_MASK))
18027 +#define pud_page(pud) (pfn_to_page(pud_val((pud)) >> PAGE_SHIFT))
18028 +#define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD - 1))
18029 +#define pud_offset(pgd, address) \
18030 + ((pud_t *)pgd_page_vaddr(*(pgd)) + pud_index((address)))
18031 #define pud_present(pud) (__pud_val(pud) & _PAGE_PRESENT)
18032
18033 static inline int pud_large(pud_t pte)
18034 {
18035 - return (__pud_val(pte) & (_PAGE_PSE|_PAGE_PRESENT)) ==
18036 - (_PAGE_PSE|_PAGE_PRESENT);
18037 + return (__pud_val(pte) & (_PAGE_PSE | _PAGE_PRESENT)) ==
18038 + (_PAGE_PSE | _PAGE_PRESENT);
18039 }
18040
18041 /* PMD - Level 2 access */
18042 -#define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val(pmd) & PTE_MASK))
18043 -#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
18044 +#define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val((pmd)) & PTE_MASK))
18045 +#define pmd_page(pmd) (pfn_to_page(pmd_val((pmd)) >> PAGE_SHIFT))
18046
18047 -#define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1))
18048 -#define pmd_offset(dir, address) ((pmd_t *) pud_page_vaddr(*(dir)) + \
18049 - pmd_index(address))
18050 +#define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))
18051 +#define pmd_offset(dir, address) ((pmd_t *)pud_page_vaddr(*(dir)) + \
18052 + pmd_index(address))
18053 #define pmd_none(x) (!__pmd_val(x))
18054 #if CONFIG_XEN_COMPAT <= 0x030002
18055 /* pmd_present doesn't just test the _PAGE_PRESENT bit since wr.p.t.
18056 @@ -235,43 +240,56 @@ static inline int pud_large(pud_t pte)
18057 #else
18058 #define pmd_present(x) (__pmd_val(x) & _PAGE_PRESENT)
18059 #endif
18060 -#define pfn_pmd(nr,prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val(prot)))
18061 -#define pmd_pfn(x) ((pmd_val(x) & __PHYSICAL_MASK) >> PAGE_SHIFT)
18062 +#define pfn_pmd(nr, prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val((prot))))
18063 +#define pmd_pfn(x) ((pmd_val((x)) & __PHYSICAL_MASK) >> PAGE_SHIFT)
18064
18065 #define pte_to_pgoff(pte) ((__pte_val(pte) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT)
18066 -#define pgoff_to_pte(off) ((pte_t) { .pte = ((off) << PAGE_SHIFT) | _PAGE_FILE })
18067 +#define pgoff_to_pte(off) ((pte_t) { .pte = ((off) << PAGE_SHIFT) | \
18068 + _PAGE_FILE })
18069 #define PTE_FILE_MAX_BITS __PHYSICAL_MASK_SHIFT
18070
18071 /* PTE - Level 1 access. */
18072
18073 /* page, protection -> pte */
18074 -#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
18075 -
18076 -#define pte_index(address) \
18077 - (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
18078 +#define mk_pte(page, pgprot) pfn_pte(page_to_pfn((page)), (pgprot))
18079 +
18080 +#define pte_index(address) (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
18081 #define pte_offset_kernel(dir, address) ((pte_t *) pmd_page_vaddr(*(dir)) + \
18082 - pte_index(address))
18083 + pte_index((address)))
18084
18085 /* x86-64 always has all page tables mapped. */
18086 -#define pte_offset_map(dir,address) pte_offset_kernel(dir,address)
18087 -#define pte_offset_map_nested(dir,address) pte_offset_kernel(dir,address)
18088 +#define pte_offset_map(dir, address) pte_offset_kernel((dir), (address))
18089 +#define pte_offset_map_nested(dir, address) pte_offset_kernel((dir), (address))
18090 #define pte_unmap(pte) /* NOP */
18091 -#define pte_unmap_nested(pte) /* NOP */
18092 +#define pte_unmap_nested(pte) /* NOP */
18093 +
18094 +#define update_mmu_cache(vma, address, pte) do { } while (0)
18095
18096 -#define update_mmu_cache(vma,address,pte) do { } while (0)
18097 +#define direct_gbpages 0
18098
18099 /* Encode and de-code a swap entry */
18100 -#define __swp_type(x) (((x).val >> 1) & 0x3f)
18101 -#define __swp_offset(x) ((x).val >> 8)
18102 -#define __swp_entry(type, offset) ((swp_entry_t) { ((type) << 1) | ((offset) << 8) })
18103 +#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE
18104 +#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1)
18105 +#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1)
18106 +#else
18107 +#define SWP_TYPE_BITS (_PAGE_BIT_PROTNONE - _PAGE_BIT_PRESENT - 1)
18108 +#define SWP_OFFSET_SHIFT (_PAGE_BIT_FILE + 1)
18109 +#endif
18110 +
18111 +#define __swp_type(x) (((x).val >> (_PAGE_BIT_PRESENT + 1)) \
18112 + & ((1U << SWP_TYPE_BITS) - 1))
18113 +#define __swp_offset(x) ((x).val >> SWP_OFFSET_SHIFT)
18114 +#define __swp_entry(type, offset) ((swp_entry_t) { \
18115 + ((type) << (_PAGE_BIT_PRESENT + 1)) \
18116 + | ((offset) << SWP_OFFSET_SHIFT) })
18117 #define __pte_to_swp_entry(pte) ((swp_entry_t) { __pte_val(pte) })
18118 #define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val })
18119
18120 -extern int kern_addr_valid(unsigned long addr);
18121 +extern int kern_addr_valid(unsigned long addr);
18122 extern void cleanup_highmap(void);
18123
18124 -#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
18125 - direct_remap_pfn_range(vma,vaddr,pfn,size,prot,DOMID_IO)
18126 +#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
18127 + direct_remap_pfn_range(vma, vaddr, pfn, size, prot, DOMID_IO)
18128
18129 #define HAVE_ARCH_UNMAPPED_AREA
18130 #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
18131 @@ -284,8 +302,10 @@ extern void cleanup_highmap(void);
18132
18133 /* fs/proc/kcore.c */
18134 #define kc_vaddr_to_offset(v) ((v) & __VIRTUAL_MASK)
18135 -#define kc_offset_to_vaddr(o) \
18136 - (((o) & (1UL << (__VIRTUAL_MASK_SHIFT-1))) ? ((o) | (~__VIRTUAL_MASK)) : (o))
18137 +#define kc_offset_to_vaddr(o) \
18138 + (((o) & (1UL << (__VIRTUAL_MASK_SHIFT - 1))) \
18139 + ? ((o) | ~__VIRTUAL_MASK) \
18140 + : (o))
18141
18142 #define __HAVE_ARCH_PTE_SAME
18143 #endif /* !__ASSEMBLY__ */
18144 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/processor.h 2009-03-16 16:33:40.000000000 +0100
18145 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/processor.h 2009-03-16 16:38:05.000000000 +0100
18146 @@ -3,10 +3,6 @@
18147
18148 #include <asm/processor-flags.h>
18149
18150 -/* migration helpers, for KVM - will be removed in 2.6.25: */
18151 -#include <asm/vm86.h>
18152 -#define Xgt_desc_struct desc_ptr
18153 -
18154 /* Forward declaration, a strange C thing */
18155 struct task_struct;
18156 struct mm_struct;
18157 @@ -24,6 +20,7 @@ struct mm_struct;
18158 #include <asm/msr.h>
18159 #include <asm/desc_defs.h>
18160 #include <asm/nops.h>
18161 +
18162 #include <linux/personality.h>
18163 #include <linux/cpumask.h>
18164 #include <linux/cache.h>
18165 @@ -38,16 +35,18 @@ struct mm_struct;
18166 static inline void *current_text_addr(void)
18167 {
18168 void *pc;
18169 - asm volatile("mov $1f,%0\n1:":"=r" (pc));
18170 +
18171 + asm volatile("mov $1f, %0; 1:":"=r" (pc));
18172 +
18173 return pc;
18174 }
18175
18176 #ifdef CONFIG_X86_VSMP
18177 -#define ARCH_MIN_TASKALIGN (1 << INTERNODE_CACHE_SHIFT)
18178 -#define ARCH_MIN_MMSTRUCT_ALIGN (1 << INTERNODE_CACHE_SHIFT)
18179 +# define ARCH_MIN_TASKALIGN (1 << INTERNODE_CACHE_SHIFT)
18180 +# define ARCH_MIN_MMSTRUCT_ALIGN (1 << INTERNODE_CACHE_SHIFT)
18181 #else
18182 -#define ARCH_MIN_TASKALIGN 16
18183 -#define ARCH_MIN_MMSTRUCT_ALIGN 0
18184 +# define ARCH_MIN_TASKALIGN 16
18185 +# define ARCH_MIN_MMSTRUCT_ALIGN 0
18186 #endif
18187
18188 /*
18189 @@ -57,68 +56,80 @@ static inline void *current_text_addr(vo
18190 */
18191
18192 struct cpuinfo_x86 {
18193 - __u8 x86; /* CPU family */
18194 - __u8 x86_vendor; /* CPU vendor */
18195 - __u8 x86_model;
18196 - __u8 x86_mask;
18197 + __u8 x86; /* CPU family */
18198 + __u8 x86_vendor; /* CPU vendor */
18199 + __u8 x86_model;
18200 + __u8 x86_mask;
18201 #ifdef CONFIG_X86_32
18202 - char wp_works_ok; /* It doesn't on 386's */
18203 - char hlt_works_ok; /* Problems on some 486Dx4's and old 386's */
18204 - char hard_math;
18205 - char rfu;
18206 - char fdiv_bug;
18207 - char f00f_bug;
18208 - char coma_bug;
18209 - char pad0;
18210 + char wp_works_ok; /* It doesn't on 386's */
18211 +
18212 + /* Problems on some 486Dx4's and old 386's: */
18213 + char hlt_works_ok;
18214 + char hard_math;
18215 + char rfu;
18216 + char fdiv_bug;
18217 + char f00f_bug;
18218 + char coma_bug;
18219 + char pad0;
18220 #else
18221 - /* number of 4K pages in DTLB/ITLB combined(in pages)*/
18222 - int x86_tlbsize;
18223 - __u8 x86_virt_bits, x86_phys_bits;
18224 - /* cpuid returned core id bits */
18225 - __u8 x86_coreid_bits;
18226 - /* Max extended CPUID function supported */
18227 - __u32 extended_cpuid_level;
18228 -#endif
18229 - int cpuid_level; /* Maximum supported CPUID level, -1=no CPUID */
18230 - __u32 x86_capability[NCAPINTS];
18231 - char x86_vendor_id[16];
18232 - char x86_model_id[64];
18233 - int x86_cache_size; /* in KB - valid for CPUS which support this
18234 - call */
18235 - int x86_cache_alignment; /* In bytes */
18236 - int x86_power;
18237 - unsigned long loops_per_jiffy;
18238 +	/* Number of 4K pages in DTLB/ITLB combined (in pages): */
18239 + int x86_tlbsize;
18240 + __u8 x86_virt_bits;
18241 + __u8 x86_phys_bits;
18242 + /* CPUID returned core id bits: */
18243 + __u8 x86_coreid_bits;
18244 + /* Max extended CPUID function supported: */
18245 + __u32 extended_cpuid_level;
18246 +#endif
18247 + /* Maximum supported CPUID level, -1=no CPUID: */
18248 + int cpuid_level;
18249 + __u32 x86_capability[NCAPINTS];
18250 + char x86_vendor_id[16];
18251 + char x86_model_id[64];
18252 + /* in KB - valid for CPUS which support this call: */
18253 + int x86_cache_size;
18254 + int x86_cache_alignment; /* In bytes */
18255 + int x86_power;
18256 + unsigned long loops_per_jiffy;
18257 #ifdef CONFIG_SMP
18258 - cpumask_t llc_shared_map; /* cpus sharing the last level cache */
18259 + /* cpus sharing the last level cache: */
18260 + cpumask_t llc_shared_map;
18261 #endif
18262 - u16 x86_max_cores; /* cpuid returned max cores value */
18263 - u16 apicid;
18264 - u16 x86_clflush_size;
18265 + /* cpuid returned max cores value: */
18266 + u16 x86_max_cores;
18267 + u16 apicid;
18268 + u16 initial_apicid;
18269 + u16 x86_clflush_size;
18270 #ifdef CONFIG_SMP
18271 - u16 booted_cores; /* number of cores as seen by OS */
18272 - u16 phys_proc_id; /* Physical processor id. */
18273 - u16 cpu_core_id; /* Core id */
18274 - u16 cpu_index; /* index into per_cpu list */
18275 + /* number of cores as seen by the OS: */
18276 + u16 booted_cores;
18277 + /* Physical processor id: */
18278 + u16 phys_proc_id;
18279 + /* Core id: */
18280 + u16 cpu_core_id;
18281 + /* Index into per_cpu list: */
18282 + u16 cpu_index;
18283 #endif
18284 } __attribute__((__aligned__(SMP_CACHE_BYTES)));
18285
18286 -#define X86_VENDOR_INTEL 0
18287 -#define X86_VENDOR_CYRIX 1
18288 -#define X86_VENDOR_AMD 2
18289 -#define X86_VENDOR_UMC 3
18290 -#define X86_VENDOR_NEXGEN 4
18291 -#define X86_VENDOR_CENTAUR 5
18292 -#define X86_VENDOR_TRANSMETA 7
18293 -#define X86_VENDOR_NSC 8
18294 -#define X86_VENDOR_NUM 9
18295 -#define X86_VENDOR_UNKNOWN 0xff
18296 +#define X86_VENDOR_INTEL 0
18297 +#define X86_VENDOR_CYRIX 1
18298 +#define X86_VENDOR_AMD 2
18299 +#define X86_VENDOR_UMC 3
18300 +#define X86_VENDOR_CENTAUR 5
18301 +#define X86_VENDOR_TRANSMETA 7
18302 +#define X86_VENDOR_NSC 8
18303 +#define X86_VENDOR_NUM 9
18304 +
18305 +#define X86_VENDOR_UNKNOWN 0xff
18306
18307 /*
18308 * capabilities of CPUs
18309 */
18310 -extern struct cpuinfo_x86 boot_cpu_data;
18311 -extern struct cpuinfo_x86 new_cpu_data;
18312 -extern __u32 cleared_cpu_caps[NCAPINTS];
18313 +extern struct cpuinfo_x86 boot_cpu_data;
18314 +extern struct cpuinfo_x86 new_cpu_data;
18315 +
18316 +extern __u32 cleared_cpu_caps[NCAPINTS];
18317
18318 #ifdef CONFIG_SMP
18319 DECLARE_PER_CPU(struct cpuinfo_x86, cpu_info);
18320 @@ -129,7 +140,18 @@ DECLARE_PER_CPU(struct cpuinfo_x86, cpu_
18321 #define current_cpu_data boot_cpu_data
18322 #endif
18323
18324 -void cpu_detect(struct cpuinfo_x86 *c);
18325 +static inline int hlt_works(int cpu)
18326 +{
18327 +#ifdef CONFIG_X86_32
18328 + return cpu_data(cpu).hlt_works_ok;
18329 +#else
18330 + return 1;
18331 +#endif
18332 +}
18333 +
18334 +#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
18335 +
18336 +extern void cpu_detect(struct cpuinfo_x86 *c);
18337
18338 extern void identify_cpu(struct cpuinfo_x86 *);
18339 extern void identify_boot_cpu(void);
18340 @@ -149,12 +171,12 @@ static inline void xen_cpuid(unsigned in
18341 unsigned int *ecx, unsigned int *edx)
18342 {
18343 /* ecx is often an input as well as an output. */
18344 - __asm__(XEN_CPUID
18345 - : "=a" (*eax),
18346 - "=b" (*ebx),
18347 - "=c" (*ecx),
18348 - "=d" (*edx)
18349 - : "0" (*eax), "2" (*ecx));
18350 + asm(XEN_CPUID
18351 + : "=a" (*eax),
18352 + "=b" (*ebx),
18353 + "=c" (*ecx),
18354 + "=d" (*edx)
18355 + : "0" (*eax), "2" (*ecx));
18356 }
18357
18358 static inline void load_cr3(pgd_t *pgdir)
18359 @@ -166,57 +188,70 @@ static inline void load_cr3(pgd_t *pgdir
18360 #ifdef CONFIG_X86_32
18361 /* This is the TSS defined by the hardware. */
18362 struct x86_hw_tss {
18363 - unsigned short back_link, __blh;
18364 - unsigned long sp0;
18365 - unsigned short ss0, __ss0h;
18366 - unsigned long sp1;
18367 - unsigned short ss1, __ss1h; /* ss1 caches MSR_IA32_SYSENTER_CS */
18368 - unsigned long sp2;
18369 - unsigned short ss2, __ss2h;
18370 - unsigned long __cr3;
18371 - unsigned long ip;
18372 - unsigned long flags;
18373 - unsigned long ax, cx, dx, bx;
18374 - unsigned long sp, bp, si, di;
18375 - unsigned short es, __esh;
18376 - unsigned short cs, __csh;
18377 - unsigned short ss, __ssh;
18378 - unsigned short ds, __dsh;
18379 - unsigned short fs, __fsh;
18380 - unsigned short gs, __gsh;
18381 - unsigned short ldt, __ldth;
18382 - unsigned short trace, io_bitmap_base;
18383 + unsigned short back_link, __blh;
18384 + unsigned long sp0;
18385 + unsigned short ss0, __ss0h;
18386 + unsigned long sp1;
18387 + /* ss1 caches MSR_IA32_SYSENTER_CS: */
18388 + unsigned short ss1, __ss1h;
18389 + unsigned long sp2;
18390 + unsigned short ss2, __ss2h;
18391 + unsigned long __cr3;
18392 + unsigned long ip;
18393 + unsigned long flags;
18394 + unsigned long ax;
18395 + unsigned long cx;
18396 + unsigned long dx;
18397 + unsigned long bx;
18398 + unsigned long sp;
18399 + unsigned long bp;
18400 + unsigned long si;
18401 + unsigned long di;
18402 + unsigned short es, __esh;
18403 + unsigned short cs, __csh;
18404 + unsigned short ss, __ssh;
18405 + unsigned short ds, __dsh;
18406 + unsigned short fs, __fsh;
18407 + unsigned short gs, __gsh;
18408 + unsigned short ldt, __ldth;
18409 + unsigned short trace;
18410 + unsigned short io_bitmap_base;
18411 +
18412 } __attribute__((packed));
18413 extern struct tss_struct doublefault_tss;
18414 #else
18415 struct x86_hw_tss {
18416 - u32 reserved1;
18417 - u64 sp0;
18418 - u64 sp1;
18419 - u64 sp2;
18420 - u64 reserved2;
18421 - u64 ist[7];
18422 - u32 reserved3;
18423 - u32 reserved4;
18424 - u16 reserved5;
18425 - u16 io_bitmap_base;
18426 + u32 reserved1;
18427 + u64 sp0;
18428 + u64 sp1;
18429 + u64 sp2;
18430 + u64 reserved2;
18431 + u64 ist[7];
18432 + u32 reserved3;
18433 + u32 reserved4;
18434 + u16 reserved5;
18435 + u16 io_bitmap_base;
18436 +
18437 } __attribute__((packed)) ____cacheline_aligned;
18438 #endif
18439 #endif /* CONFIG_X86_NO_TSS */
18440
18441 /*
18442 - * Size of io_bitmap.
18443 + * IO-bitmap sizes:
18444 */
18445 -#define IO_BITMAP_BITS 65536
18446 -#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
18447 -#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
18448 -#define IO_BITMAP_OFFSET offsetof(struct tss_struct, io_bitmap)
18449 -#define INVALID_IO_BITMAP_OFFSET 0x8000
18450 -#define INVALID_IO_BITMAP_OFFSET_LAZY 0x9000
18451 +#define IO_BITMAP_BITS 65536
18452 +#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
18453 +#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
18454 +#define IO_BITMAP_OFFSET offsetof(struct tss_struct, io_bitmap)
18455 +#define INVALID_IO_BITMAP_OFFSET 0x8000
18456 +#define INVALID_IO_BITMAP_OFFSET_LAZY 0x9000
18457
18458 #ifndef CONFIG_X86_NO_TSS
18459 struct tss_struct {
18460 - struct x86_hw_tss x86_tss;
18461 + /*
18462 + * The hardware state:
18463 + */
18464 + struct x86_hw_tss x86_tss;
18465
18466 /*
18467 * The extra 1 is there because the CPU will access an
18468 @@ -224,136 +259,162 @@ struct tss_struct {
18469 * bitmap. The extra byte must be all 1 bits, and must
18470 * be within the limit.
18471 */
18472 - unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
18473 + unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
18474 /*
18475 * Cache the current maximum and the last task that used the bitmap:
18476 */
18477 - unsigned long io_bitmap_max;
18478 - struct thread_struct *io_bitmap_owner;
18479 + unsigned long io_bitmap_max;
18480 + struct thread_struct *io_bitmap_owner;
18481 +
18482 /*
18483 - * pads the TSS to be cacheline-aligned (size is 0x100)
18484 + * Pad the TSS to be cacheline-aligned (size is 0x100):
18485 */
18486 - unsigned long __cacheline_filler[35];
18487 + unsigned long __cacheline_filler[35];
18488 /*
18489 - * .. and then another 0x100 bytes for emergency kernel stack
18490 + * .. and then another 0x100 bytes for the emergency kernel stack:
18491 */
18492 - unsigned long stack[64];
18493 + unsigned long stack[64];
18494 +
18495 } __attribute__((packed));
18496
18497 DECLARE_PER_CPU(struct tss_struct, init_tss);
18498
18499 -/* Save the original ist values for checking stack pointers during debugging */
18500 +/*
18501 + * Save the original ist values for checking stack pointers during debugging
18502 + */
18503 struct orig_ist {
18504 - unsigned long ist[7];
18505 + unsigned long ist[7];
18506 };
18507 #endif /* CONFIG_X86_NO_TSS */
18508
18509 #define MXCSR_DEFAULT 0x1f80
18510
18511 struct i387_fsave_struct {
18512 - u32 cwd;
18513 - u32 swd;
18514 - u32 twd;
18515 - u32 fip;
18516 - u32 fcs;
18517 - u32 foo;
18518 - u32 fos;
18519 - u32 st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */
18520 - u32 status; /* software status information */
18521 + u32 cwd; /* FPU Control Word */
18522 + u32 swd; /* FPU Status Word */
18523 + u32 twd; /* FPU Tag Word */
18524 + u32 fip; /* FPU IP Offset */
18525 + u32 fcs; /* FPU IP Selector */
18526 + u32 foo; /* FPU Operand Pointer Offset */
18527 + u32 fos; /* FPU Operand Pointer Selector */
18528 +
18529 + /* 8*10 bytes for each FP-reg = 80 bytes: */
18530 + u32 st_space[20];
18531 +
18532 +	/* Software status information [not touched by FSAVE]: */
18533 + u32 status;
18534 };
18535
18536 struct i387_fxsave_struct {
18537 - u16 cwd;
18538 - u16 swd;
18539 - u16 twd;
18540 - u16 fop;
18541 + u16 cwd; /* Control Word */
18542 + u16 swd; /* Status Word */
18543 + u16 twd; /* Tag Word */
18544 + u16 fop; /* Last Instruction Opcode */
18545 union {
18546 struct {
18547 - u64 rip;
18548 - u64 rdp;
18549 + u64 rip; /* Instruction Pointer */
18550 + u64 rdp; /* Data Pointer */
18551 };
18552 struct {
18553 - u32 fip;
18554 - u32 fcs;
18555 - u32 foo;
18556 - u32 fos;
18557 + u32 fip; /* FPU IP Offset */
18558 + u32 fcs; /* FPU IP Selector */
18559 + u32 foo; /* FPU Operand Offset */
18560 + u32 fos; /* FPU Operand Selector */
18561 };
18562 };
18563 - u32 mxcsr;
18564 - u32 mxcsr_mask;
18565 - u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
18566 - u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */
18567 - u32 padding[24];
18568 + u32 mxcsr; /* MXCSR Register State */
18569 + u32 mxcsr_mask; /* MXCSR Mask */
18570 +
18571 + /* 8*16 bytes for each FP-reg = 128 bytes: */
18572 + u32 st_space[32];
18573 +
18574 + /* 16*16 bytes for each XMM-reg = 256 bytes: */
18575 + u32 xmm_space[64];
18576 +
18577 + u32 padding[24];
18578 +
18579 } __attribute__((aligned(16)));
18580
18581 struct i387_soft_struct {
18582 - u32 cwd;
18583 - u32 swd;
18584 - u32 twd;
18585 - u32 fip;
18586 - u32 fcs;
18587 - u32 foo;
18588 - u32 fos;
18589 - u32 st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */
18590 - u8 ftop, changed, lookahead, no_update, rm, alimit;
18591 - struct info *info;
18592 - u32 entry_eip;
18593 + u32 cwd;
18594 + u32 swd;
18595 + u32 twd;
18596 + u32 fip;
18597 + u32 fcs;
18598 + u32 foo;
18599 + u32 fos;
18600 + /* 8*10 bytes for each FP-reg = 80 bytes: */
18601 + u32 st_space[20];
18602 + u8 ftop;
18603 + u8 changed;
18604 + u8 lookahead;
18605 + u8 no_update;
18606 + u8 rm;
18607 + u8 alimit;
18608 + struct info *info;
18609 + u32 entry_eip;
18610 };
18611
18612 -union i387_union {
18613 +union thread_xstate {
18614 struct i387_fsave_struct fsave;
18615 struct i387_fxsave_struct fxsave;
18616 - struct i387_soft_struct soft;
18617 + struct i387_soft_struct soft;
18618 };
18619
18620 -#ifdef CONFIG_X86_32
18621 -DECLARE_PER_CPU(u8, cpu_llc_id);
18622 -#elif !defined(CONFIG_X86_NO_TSS)
18623 +#if defined(CONFIG_X86_64) && !defined(CONFIG_X86_NO_TSS)
18624 DECLARE_PER_CPU(struct orig_ist, orig_ist);
18625 #endif
18626
18627 extern void print_cpu_info(struct cpuinfo_x86 *);
18628 +extern unsigned int xstate_size;
18629 +extern void free_thread_xstate(struct task_struct *);
18630 +extern struct kmem_cache *task_xstate_cachep;
18631 extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
18632 extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
18633 extern unsigned short num_cache_leaves;
18634
18635 struct thread_struct {
18636 -/* cached TLS descriptors. */
18637 - struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
18638 - unsigned long sp0;
18639 - unsigned long sp;
18640 + /* Cached TLS descriptors: */
18641 + struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
18642 + unsigned long sp0;
18643 + unsigned long sp;
18644 #ifdef CONFIG_X86_32
18645 - unsigned long sysenter_cs;
18646 + unsigned long sysenter_cs;
18647 #else
18648 - unsigned long usersp; /* Copy from PDA */
18649 - unsigned short es, ds, fsindex, gsindex;
18650 -#endif
18651 - unsigned long ip;
18652 - unsigned long fs;
18653 - unsigned long gs;
18654 -/* Hardware debugging registers */
18655 - unsigned long debugreg0;
18656 - unsigned long debugreg1;
18657 - unsigned long debugreg2;
18658 - unsigned long debugreg3;
18659 - unsigned long debugreg6;
18660 - unsigned long debugreg7;
18661 -/* fault info */
18662 - unsigned long cr2, trap_no, error_code;
18663 -/* floating point info */
18664 - union i387_union i387 __attribute__((aligned(16)));;
18665 + unsigned long usersp; /* Copy from PDA */
18666 + unsigned short es;
18667 + unsigned short ds;
18668 + unsigned short fsindex;
18669 + unsigned short gsindex;
18670 +#endif
18671 + unsigned long ip;
18672 + unsigned long fs;
18673 + unsigned long gs;
18674 + /* Hardware debugging registers: */
18675 + unsigned long debugreg0;
18676 + unsigned long debugreg1;
18677 + unsigned long debugreg2;
18678 + unsigned long debugreg3;
18679 + unsigned long debugreg6;
18680 + unsigned long debugreg7;
18681 + /* Fault info: */
18682 + unsigned long cr2;
18683 + unsigned long trap_no;
18684 + unsigned long error_code;
18685 + /* floating point and extended processor state */
18686 + union thread_xstate *xstate;
18687 #ifdef CONFIG_X86_32
18688 -/* virtual 86 mode info */
18689 + /* Virtual 86 mode info */
18690 struct vm86_struct __user *vm86_info;
18691 unsigned long screen_bitmap;
18692 unsigned long v86flags, v86mask, saved_sp0;
18693 unsigned int saved_fs, saved_gs;
18694 #endif
18695 -/* IO permissions */
18696 - unsigned long *io_bitmap_ptr;
18697 - unsigned long iopl;
18698 -/* max allowed port in the bitmap, in bytes: */
18699 - unsigned io_bitmap_max;
18700 + /* IO permissions: */
18701 + unsigned long *io_bitmap_ptr;
18702 + unsigned long iopl;
18703 + /* Max allowed port in the bitmap, in bytes: */
18704 + unsigned io_bitmap_max;
18705 /* MSR_IA32_DEBUGCTLMSR value to switch in if TIF_DEBUGCTLMSR is set. */
18706 unsigned long debugctlmsr;
18707 /* Debug Store - if not 0 points to a DS Save Area configuration;
18708 @@ -384,12 +445,12 @@ static inline void xen_set_iopl_mask(uns
18709 }
18710
18711 #ifndef CONFIG_X86_NO_TSS
18712 -static inline void native_load_sp0(struct tss_struct *tss,
18713 - struct thread_struct *thread)
18714 +static inline void
18715 +native_load_sp0(struct tss_struct *tss, struct thread_struct *thread)
18716 {
18717 tss->x86_tss.sp0 = thread->sp0;
18718 #ifdef CONFIG_X86_32
18719 - /* Only happens when SEP is enabled, no need to test "SEP"arately */
18720 + /* Only happens when SEP is enabled, no need to test "SEP"arately: */
18721 if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
18722 tss->x86_tss.ss1 = thread->sysenter_cs;
18723 wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
18724 @@ -403,8 +464,8 @@ static inline void native_load_sp0(struc
18725 } while (0)
18726 #endif
18727
18728 -#define __cpuid xen_cpuid
18729 -#define paravirt_enabled() 0
18730 +#define __cpuid xen_cpuid
18731 +#define paravirt_enabled() 0
18732
18733 /*
18734 * These special macros can be used to get or set a debugging register
18735 @@ -424,11 +485,12 @@ static inline void native_load_sp0(struc
18736 * enable), so that any CPU's that boot up
18737 * after us can get the correct flags.
18738 */
18739 -extern unsigned long mmu_cr4_features;
18740 +extern unsigned long mmu_cr4_features;
18741
18742 static inline void set_in_cr4(unsigned long mask)
18743 {
18744 unsigned cr4;
18745 +
18746 mmu_cr4_features |= mask;
18747 cr4 = read_cr4();
18748 cr4 |= mask;
18749 @@ -438,6 +500,7 @@ static inline void set_in_cr4(unsigned l
18750 static inline void clear_in_cr4(unsigned long mask)
18751 {
18752 unsigned cr4;
18753 +
18754 mmu_cr4_features &= ~mask;
18755 cr4 = read_cr4();
18756 cr4 &= ~mask;
18757 @@ -445,42 +508,42 @@ static inline void clear_in_cr4(unsigned
18758 }
18759
18760 struct microcode_header {
18761 - unsigned int hdrver;
18762 - unsigned int rev;
18763 - unsigned int date;
18764 - unsigned int sig;
18765 - unsigned int cksum;
18766 - unsigned int ldrver;
18767 - unsigned int pf;
18768 - unsigned int datasize;
18769 - unsigned int totalsize;
18770 - unsigned int reserved[3];
18771 + unsigned int hdrver;
18772 + unsigned int rev;
18773 + unsigned int date;
18774 + unsigned int sig;
18775 + unsigned int cksum;
18776 + unsigned int ldrver;
18777 + unsigned int pf;
18778 + unsigned int datasize;
18779 + unsigned int totalsize;
18780 + unsigned int reserved[3];
18781 };
18782
18783 struct microcode {
18784 - struct microcode_header hdr;
18785 - unsigned int bits[0];
18786 + struct microcode_header hdr;
18787 + unsigned int bits[0];
18788 };
18789
18790 -typedef struct microcode microcode_t;
18791 -typedef struct microcode_header microcode_header_t;
18792 +typedef struct microcode microcode_t;
18793 +typedef struct microcode_header microcode_header_t;
18794
18795 /* microcode format is extended from prescott processors */
18796 struct extended_signature {
18797 - unsigned int sig;
18798 - unsigned int pf;
18799 - unsigned int cksum;
18800 + unsigned int sig;
18801 + unsigned int pf;
18802 + unsigned int cksum;
18803 };
18804
18805 struct extended_sigtable {
18806 - unsigned int count;
18807 - unsigned int cksum;
18808 - unsigned int reserved[3];
18809 + unsigned int count;
18810 + unsigned int cksum;
18811 + unsigned int reserved[3];
18812 struct extended_signature sigs[0];
18813 };
18814
18815 typedef struct {
18816 - unsigned long seg;
18817 + unsigned long seg;
18818 } mm_segment_t;
18819
18820
18821 @@ -492,7 +555,7 @@ extern int kernel_thread(int (*fn)(void
18822 /* Free all resources held by a thread. */
18823 extern void release_thread(struct task_struct *);
18824
18825 -/* Prepare to copy thread state - unlazy all lazy status */
18826 +/* Prepare to copy thread state - unlazy all lazy state */
18827 extern void prepare_to_copy(struct task_struct *tsk);
18828
18829 unsigned long get_wchan(struct task_struct *p);
18830 @@ -529,118 +592,138 @@ static inline unsigned int cpuid_eax(uns
18831 unsigned int eax, ebx, ecx, edx;
18832
18833 cpuid(op, &eax, &ebx, &ecx, &edx);
18834 +
18835 return eax;
18836 }
18837 +
18838 static inline unsigned int cpuid_ebx(unsigned int op)
18839 {
18840 unsigned int eax, ebx, ecx, edx;
18841
18842 cpuid(op, &eax, &ebx, &ecx, &edx);
18843 +
18844 return ebx;
18845 }
18846 +
18847 static inline unsigned int cpuid_ecx(unsigned int op)
18848 {
18849 unsigned int eax, ebx, ecx, edx;
18850
18851 cpuid(op, &eax, &ebx, &ecx, &edx);
18852 +
18853 return ecx;
18854 }
18855 +
18856 static inline unsigned int cpuid_edx(unsigned int op)
18857 {
18858 unsigned int eax, ebx, ecx, edx;
18859
18860 cpuid(op, &eax, &ebx, &ecx, &edx);
18861 +
18862 return edx;
18863 }
18864
18865 /* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
18866 static inline void rep_nop(void)
18867 {
18868 - __asm__ __volatile__("rep;nop": : :"memory");
18869 + asm volatile("rep; nop" ::: "memory");
18870 }
18871
18872 -/* Stop speculative execution */
18873 +static inline void cpu_relax(void)
18874 +{
18875 + rep_nop();
18876 +}
18877 +
18878 +/* Stop speculative execution: */
18879 static inline void sync_core(void)
18880 {
18881 int tmp;
18882 +
18883 asm volatile("cpuid" : "=a" (tmp) : "0" (1)
18884 - : "ebx", "ecx", "edx", "memory");
18885 + : "ebx", "ecx", "edx", "memory");
18886 }
18887
18888 -#define cpu_relax() rep_nop()
18889 -
18890 static inline void __monitor(const void *eax, unsigned long ecx,
18891 - unsigned long edx)
18892 + unsigned long edx)
18893 {
18894 - /* "monitor %eax,%ecx,%edx;" */
18895 - asm volatile(
18896 - ".byte 0x0f,0x01,0xc8;"
18897 - : :"a" (eax), "c" (ecx), "d"(edx));
18898 + /* "monitor %eax, %ecx, %edx;" */
18899 + asm volatile(".byte 0x0f, 0x01, 0xc8;"
18900 + :: "a" (eax), "c" (ecx), "d"(edx));
18901 }
18902
18903 static inline void __mwait(unsigned long eax, unsigned long ecx)
18904 {
18905 - /* "mwait %eax,%ecx;" */
18906 - asm volatile(
18907 - ".byte 0x0f,0x01,0xc9;"
18908 - : :"a" (eax), "c" (ecx));
18909 + /* "mwait %eax, %ecx;" */
18910 + asm volatile(".byte 0x0f, 0x01, 0xc9;"
18911 + :: "a" (eax), "c" (ecx));
18912 }
18913
18914 static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
18915 {
18916 - /* "mwait %eax,%ecx;" */
18917 - asm volatile(
18918 - "sti; .byte 0x0f,0x01,0xc9;"
18919 - : :"a" (eax), "c" (ecx));
18920 + trace_hardirqs_on();
18921 + /* "mwait %eax, %ecx;" */
18922 + asm volatile("sti; .byte 0x0f, 0x01, 0xc9;"
18923 + :: "a" (eax), "c" (ecx));
18924 }
18925
18926 extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
18927
18928 -extern int force_mwait;
18929 +extern int force_mwait;
18930
18931 extern void select_idle_routine(const struct cpuinfo_x86 *c);
18932
18933 -extern unsigned long boot_option_idle_override;
18934 +extern unsigned long boot_option_idle_override;
18935
18936 extern void enable_sep_cpu(void);
18937 extern int sysenter_setup(void);
18938
18939 /* Defined in head.S */
18940 -extern struct desc_ptr early_gdt_descr;
18941 +extern struct desc_ptr early_gdt_descr;
18942
18943 extern void cpu_set_gdt(int);
18944 extern void switch_to_new_gdt(void);
18945 extern void cpu_init(void);
18946 extern void init_gdt(int cpu);
18947
18948 -/* from system description table in BIOS. Mostly for MCA use, but
18949 - * others may find it useful. */
18950 -extern unsigned int machine_id;
18951 -extern unsigned int machine_submodel_id;
18952 -extern unsigned int BIOS_revision;
18953 +static inline void update_debugctlmsr(unsigned long debugctlmsr)
18954 +{
18955 +#ifndef CONFIG_X86_DEBUGCTLMSR
18956 + if (boot_cpu_data.x86 < 6)
18957 + return;
18958 +#endif
18959 + wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr);
18960 +}
18961
18962 -/* Boot loader type from the setup header */
18963 -extern int bootloader_type;
18964 +/*
18965 + * from system description table in BIOS. Mostly for MCA use, but
18966 + * others may find it useful:
18967 + */
18968 +extern unsigned int machine_id;
18969 +extern unsigned int machine_submodel_id;
18970 +extern unsigned int BIOS_revision;
18971 +
18972 +/* Boot loader type from the setup header: */
18973 +extern int bootloader_type;
18974
18975 -extern char ignore_fpu_irq;
18976 -#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
18977 +extern char ignore_fpu_irq;
18978
18979 #define HAVE_ARCH_PICK_MMAP_LAYOUT 1
18980 #define ARCH_HAS_PREFETCHW
18981 #define ARCH_HAS_SPINLOCK_PREFETCH
18982
18983 #ifdef CONFIG_X86_32
18984 -#define BASE_PREFETCH ASM_NOP4
18985 -#define ARCH_HAS_PREFETCH
18986 +# define BASE_PREFETCH ASM_NOP4
18987 +# define ARCH_HAS_PREFETCH
18988 #else
18989 -#define BASE_PREFETCH "prefetcht0 (%1)"
18990 +# define BASE_PREFETCH "prefetcht0 (%1)"
18991 #endif
18992
18993 -/* Prefetch instructions for Pentium III and AMD Athlon */
18994 -/* It's not worth to care about 3dnow! prefetches for the K6
18995 - because they are microcoded there and very slow.
18996 - However we don't do prefetches for pre XP Athlons currently
18997 - That should be fixed. */
18998 +/*
18999 + * Prefetch instructions for Pentium III (+) and AMD Athlon (+)
19000 + *
19001 + * It's not worth to care about 3dnow prefetches for the K6
19002 + * because they are microcoded there and very slow.
19003 + */
19004 static inline void prefetch(const void *x)
19005 {
19006 alternative_input(BASE_PREFETCH,
19007 @@ -649,8 +732,11 @@ static inline void prefetch(const void *
19008 "r" (x));
19009 }
19010
19011 -/* 3dnow! prefetch to get an exclusive cache line. Useful for
19012 - spinlocks to avoid one state transition in the cache coherency protocol. */
19013 +/*
19014 + * 3dnow prefetch to get an exclusive cache line.
19015 + * Useful for spinlocks to avoid one state transition in the
19016 + * cache coherency protocol:
19017 + */
19018 static inline void prefetchw(const void *x)
19019 {
19020 alternative_input(BASE_PREFETCH,
19021 @@ -659,21 +745,25 @@ static inline void prefetchw(const void
19022 "r" (x));
19023 }
19024
19025 -#define spin_lock_prefetch(x) prefetchw(x)
19026 +static inline void spin_lock_prefetch(const void *x)
19027 +{
19028 + prefetchw(x);
19029 +}
19030 +
19031 #ifdef CONFIG_X86_32
19032 /*
19033 * User space process size: 3GB (default).
19034 */
19035 -#define TASK_SIZE (PAGE_OFFSET)
19036 -#define STACK_TOP TASK_SIZE
19037 -#define STACK_TOP_MAX STACK_TOP
19038 -
19039 -#define INIT_THREAD { \
19040 - .sp0 = sizeof(init_stack) + (long)&init_stack, \
19041 - .vm86_info = NULL, \
19042 - .sysenter_cs = __KERNEL_CS, \
19043 - .io_bitmap_ptr = NULL, \
19044 - .fs = __KERNEL_PERCPU, \
19045 +#define TASK_SIZE PAGE_OFFSET
19046 +#define STACK_TOP TASK_SIZE
19047 +#define STACK_TOP_MAX STACK_TOP
19048 +
19049 +#define INIT_THREAD { \
19050 + .sp0 = sizeof(init_stack) + (long)&init_stack, \
19051 + .vm86_info = NULL, \
19052 + .sysenter_cs = __KERNEL_CS, \
19053 + .io_bitmap_ptr = NULL, \
19054 + .fs = __KERNEL_PERCPU, \
19055 }
19056
19057 /*
19058 @@ -682,28 +772,15 @@ static inline void prefetchw(const void
19059 * permission bitmap. The extra byte must be all 1 bits, and must
19060 * be within the limit.
19061 */
19062 -#define INIT_TSS { \
19063 - .x86_tss = { \
19064 +#define INIT_TSS { \
19065 + .x86_tss = { \
19066 .sp0 = sizeof(init_stack) + (long)&init_stack, \
19067 - .ss0 = __KERNEL_DS, \
19068 - .ss1 = __KERNEL_CS, \
19069 - .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \
19070 - }, \
19071 - .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, \
19072 -}
19073 -
19074 -#define start_thread(regs, new_eip, new_esp) do { \
19075 - __asm__("movl %0,%%gs": :"r" (0)); \
19076 - regs->fs = 0; \
19077 - set_fs(USER_DS); \
19078 - regs->ds = __USER_DS; \
19079 - regs->es = __USER_DS; \
19080 - regs->ss = __USER_DS; \
19081 - regs->cs = __USER_CS; \
19082 - regs->ip = new_eip; \
19083 - regs->sp = new_esp; \
19084 -} while (0)
19085 -
19086 + .ss0 = __KERNEL_DS, \
19087 + .ss1 = __KERNEL_CS, \
19088 + .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \
19089 + }, \
19090 + .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, \
19091 +}
19092
19093 extern unsigned long thread_saved_pc(struct task_struct *tsk);
19094
19095 @@ -731,24 +808,24 @@ extern unsigned long thread_saved_pc(str
19096 __regs__ - 1; \
19097 })
19098
19099 -#define KSTK_ESP(task) (task_pt_regs(task)->sp)
19100 +#define KSTK_ESP(task) (task_pt_regs(task)->sp)
19101
19102 #else
19103 /*
19104 * User space process size. 47bits minus one guard page.
19105 */
19106 -#define TASK_SIZE64 (0x800000000000UL - 4096)
19107 +#define TASK_SIZE64 ((1UL << 47) - PAGE_SIZE)
19108
19109 /* This decides where the kernel will search for a free chunk of vm
19110 * space during mmap's.
19111 */
19112 -#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? \
19113 - 0xc0000000 : 0xFFFFe000)
19114 +#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? \
19115 + 0xc0000000 : 0xFFFFe000)
19116
19117 -#define TASK_SIZE (test_thread_flag(TIF_IA32) ? \
19118 - IA32_PAGE_OFFSET : TASK_SIZE64)
19119 -#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? \
19120 - IA32_PAGE_OFFSET : TASK_SIZE64)
19121 +#define TASK_SIZE (test_thread_flag(TIF_IA32) ? \
19122 + IA32_PAGE_OFFSET : TASK_SIZE64)
19123 +#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? \
19124 + IA32_PAGE_OFFSET : TASK_SIZE64)
19125
19126 #define STACK_TOP TASK_SIZE
19127 #define STACK_TOP_MAX TASK_SIZE64
19128 @@ -761,33 +838,32 @@ extern unsigned long thread_saved_pc(str
19129 .x86_tss.sp0 = (unsigned long)&init_stack + sizeof(init_stack) \
19130 }
19131
19132 -#define start_thread(regs, new_rip, new_rsp) do { \
19133 - asm volatile("movl %0,%%fs; movl %0,%%es; movl %0,%%ds": :"r" (0)); \
19134 - load_gs_index(0); \
19135 - (regs)->ip = (new_rip); \
19136 - (regs)->sp = (new_rsp); \
19137 - write_pda(oldrsp, (new_rsp)); \
19138 - (regs)->cs = __USER_CS; \
19139 - (regs)->ss = __USER_DS; \
19140 - (regs)->flags = 0x200; \
19141 - set_fs(USER_DS); \
19142 -} while (0)
19143 -
19144 /*
19145 * Return saved PC of a blocked thread.
19146 * What is this good for? it will be always the scheduler or ret_from_fork.
19147 */
19148 -#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.sp - 8))
19149 +#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.sp - 8))
19150
19151 -#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1)
19152 -#define KSTK_ESP(tsk) -1 /* sorry. doesn't work for syscall. */
19153 +#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1)
19154 +#define KSTK_ESP(tsk) -1 /* sorry. doesn't work for syscall. */
19155 #endif /* CONFIG_X86_64 */
19156
19157 -/* This decides where the kernel will search for a free chunk of vm
19158 +extern void start_thread(struct pt_regs *regs, unsigned long new_ip,
19159 + unsigned long new_sp);
19160 +
19161 +/*
19162 + * This decides where the kernel will search for a free chunk of vm
19163 * space during mmap's.
19164 */
19165 #define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3))
19166
19167 -#define KSTK_EIP(task) (task_pt_regs(task)->ip)
19168 +#define KSTK_EIP(task) (task_pt_regs(task)->ip)
19169 +
19170 +/* Get/set a process' ability to use the timestamp counter instruction */
19171 +#define GET_TSC_CTL(adr) get_tsc_mode((adr))
19172 +#define SET_TSC_CTL(val) set_tsc_mode((val))
19173 +
19174 +extern int get_tsc_mode(unsigned long adr);
19175 +extern int set_tsc_mode(unsigned int val);
19176
19177 #endif
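The processor.h changes above are mostly re-indentation, but two substantive bits are easy to miss: TASK_SIZE64 is rewritten from the literal 0x800000000000UL - 4096 to the equivalent ((1UL << 47) - PAGE_SIZE), and start_thread() becomes an out-of-line function shared by the 32-bit and 64-bit paths. A stand-alone user-space sketch (not part of the patch; it assumes 4 KiB pages) checking that the old and new TASK_SIZE64 forms agree:

#include <stdio.h>

int main(void)
{
	unsigned long page_size = 4096UL;                  /* assumption: 4 KiB pages */
	unsigned long old_form  = 0x800000000000UL - 4096;
	unsigned long new_form  = (1UL << 47) - page_size;

	/* Both spell "47 bits of user address space minus one guard page". */
	printf("old=%#lx new=%#lx equal=%d\n",
	       old_form, new_form, old_form == new_form);
	return 0;
}
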
19178 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/segment.h 2009-03-16 16:33:40.000000000 +0100
19179 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/segment.h 2009-03-16 16:38:05.000000000 +0100
19180 @@ -191,13 +191,14 @@
19181 #define SEGMENT_TI_MASK 0x4
19182
19183 #define IDT_ENTRIES 256
19184 +#define NUM_EXCEPTION_VECTORS 32
19185 #define GDT_SIZE (GDT_ENTRIES * 8)
19186 #define GDT_ENTRY_TLS_ENTRIES 3
19187 #define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8)
19188
19189 #ifdef __KERNEL__
19190 #ifndef __ASSEMBLY__
19191 -extern const char early_idt_handlers[IDT_ENTRIES][10];
19192 +extern const char early_idt_handlers[NUM_EXCEPTION_VECTORS][10];
19193 #endif
19194 #endif
19195
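The segment.h hunk shrinks the early IDT handler table from IDT_ENTRIES (256) entries to the new NUM_EXCEPTION_VECTORS (32), since only the exception vectors need early stubs. A quick stand-alone check of the arithmetic (the 10-byte stub size comes from the array declaration above):

#include <stdio.h>

int main(void)
{
	unsigned int stub_size = 10;                  /* from early_idt_handlers[...][10] */
	unsigned int before    = 256 * stub_size;     /* IDT_ENTRIES stubs                */
	unsigned int after     = 32 * stub_size;      /* NUM_EXCEPTION_VECTORS stubs      */

	printf("early_idt_handlers: %u -> %u bytes (saves %u)\n",
	       before, after, before - after);
	return 0;
}
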
19196 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/smp.h 2009-02-16 16:18:36.000000000 +0100
19197 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/smp.h 2009-03-16 16:38:05.000000000 +0100
19198 @@ -1,5 +1,227 @@
19199 -#ifdef CONFIG_X86_32
19200 -# include "smp_32.h"
19201 +#ifndef _ASM_X86_SMP_H_
19202 +#define _ASM_X86_SMP_H_
19203 +#ifndef __ASSEMBLY__
19204 +#include <linux/cpumask.h>
19205 +#include <linux/init.h>
19206 +#include <asm/percpu.h>
19207 +
19208 +/*
19209 + * We need the APIC definitions automatically as part of 'smp.h'
19210 + */
19211 +#ifdef CONFIG_X86_LOCAL_APIC
19212 +# include <asm/mpspec.h>
19213 +# include <asm/apic.h>
19214 +# ifdef CONFIG_X86_IO_APIC
19215 +# include <asm/io_apic.h>
19216 +# endif
19217 +#endif
19218 +#include <asm/pda.h>
19219 +#include <asm/thread_info.h>
19220 +
19221 +#define cpu_callout_map cpu_possible_map
19222 +extern cpumask_t cpu_initialized;
19223 +#define cpu_callin_map cpu_possible_map
19224 +
19225 +extern void (*mtrr_hook)(void);
19226 +extern void zap_low_mappings(void);
19227 +
19228 +extern int smp_num_siblings;
19229 +extern unsigned int num_processors;
19230 +extern cpumask_t cpu_initialized;
19231 +
19232 +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
19233 +extern u16 x86_cpu_to_apicid_init[];
19234 +extern u16 x86_bios_cpu_apicid_init[];
19235 +extern void *x86_cpu_to_apicid_early_ptr;
19236 +extern void *x86_bios_cpu_apicid_early_ptr;
19237 #else
19238 -# include "smp_64.h"
19239 +#define x86_cpu_to_apicid_early_ptr NULL
19240 +#define x86_bios_cpu_apicid_early_ptr NULL
19241 +#endif
19242 +
19243 +DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
19244 +DECLARE_PER_CPU(cpumask_t, cpu_core_map);
19245 +DECLARE_PER_CPU(u16, cpu_llc_id);
19246 +DECLARE_PER_CPU(u16, x86_cpu_to_apicid);
19247 +DECLARE_PER_CPU(u16, x86_bios_cpu_apicid);
19248 +
19249 +#ifdef CONFIG_SMP
19250 +
19251 +#ifndef CONFIG_XEN
19252 +
19253 +/* Static state in head.S used to set up a CPU */
19254 +extern struct {
19255 + void *sp;
19256 + unsigned short ss;
19257 +} stack_start;
19258 +
19259 +struct smp_ops {
19260 + void (*smp_prepare_boot_cpu)(void);
19261 + void (*smp_prepare_cpus)(unsigned max_cpus);
19262 + int (*cpu_up)(unsigned cpu);
19263 + void (*smp_cpus_done)(unsigned max_cpus);
19264 +
19265 + void (*smp_send_stop)(void);
19266 + void (*smp_send_reschedule)(int cpu);
19267 + int (*smp_call_function_mask)(cpumask_t mask,
19268 + void (*func)(void *info), void *info,
19269 + int wait);
19270 +};
19271 +
19272 +/* Globals due to paravirt */
19273 +extern void set_cpu_sibling_map(int cpu);
19274 +
19275 +#ifndef CONFIG_PARAVIRT
19276 +#define startup_ipi_hook(phys_apicid, start_eip, start_esp) do { } while (0)
19277 +#endif
19278 +extern struct smp_ops smp_ops;
19279 +
19280 +static inline void smp_send_stop(void)
19281 +{
19282 + smp_ops.smp_send_stop();
19283 +}
19284 +
19285 +static inline void smp_prepare_boot_cpu(void)
19286 +{
19287 + smp_ops.smp_prepare_boot_cpu();
19288 +}
19289 +
19290 +static inline void smp_prepare_cpus(unsigned int max_cpus)
19291 +{
19292 + smp_ops.smp_prepare_cpus(max_cpus);
19293 +}
19294 +
19295 +static inline void smp_cpus_done(unsigned int max_cpus)
19296 +{
19297 + smp_ops.smp_cpus_done(max_cpus);
19298 +}
19299 +
19300 +static inline int __cpu_up(unsigned int cpu)
19301 +{
19302 + return smp_ops.cpu_up(cpu);
19303 +}
19304 +
19305 +static inline void smp_send_reschedule(int cpu)
19306 +{
19307 + smp_ops.smp_send_reschedule(cpu);
19308 +}
19309 +
19310 +static inline int smp_call_function_mask(cpumask_t mask,
19311 + void (*func) (void *info), void *info,
19312 + int wait)
19313 +{
19314 + return smp_ops.smp_call_function_mask(mask, func, info, wait);
19315 +}
19316 +
19317 +void native_smp_prepare_boot_cpu(void);
19318 +void native_smp_prepare_cpus(unsigned int max_cpus);
19319 +void native_smp_cpus_done(unsigned int max_cpus);
19320 +int native_cpu_up(unsigned int cpunum);
19321 +
19322 +#else /* CONFIG_XEN */
19323 +
19324 +void xen_smp_send_stop(void);
19325 +void xen_smp_send_reschedule(int cpu);
19326 +int xen_smp_call_function_mask(cpumask_t mask,
19327 + void (*func) (void *info), void *info,
19328 + int wait);
19329 +
19330 +#define smp_send_stop xen_smp_send_stop
19331 +#define smp_send_reschedule xen_smp_send_reschedule
19332 +#define smp_call_function_mask xen_smp_call_function_mask
19333 +
19334 +extern void prefill_possible_map(void);
19335 +
19336 +#endif /* CONFIG_XEN */
19337 +
19338 +extern int __cpu_disable(void);
19339 +extern void __cpu_die(unsigned int cpu);
19340 +
19341 +extern void prefill_possible_map(void);
19342 +
19343 +void smp_store_cpu_info(int id);
19344 +#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
19345 +
19346 +/* We don't mark CPUs online until __cpu_up(), so we need another measure */
19347 +static inline int num_booting_cpus(void)
19348 +{
19349 + return cpus_weight(cpu_callout_map);
19350 +}
19351 +#endif /* CONFIG_SMP */
19352 +
19353 +extern unsigned disabled_cpus __cpuinitdata;
19354 +
19355 +#ifdef CONFIG_X86_32_SMP
19356 +/*
19357 + * This function is needed by all SMP systems. It must _always_ be valid
19358 + * from the initial startup. We map APIC_BASE very early in page_setup(),
19359 + * so this is correct in the x86 case.
19360 + */
19361 +DECLARE_PER_CPU(int, cpu_number);
19362 +#define raw_smp_processor_id() (x86_read_percpu(cpu_number))
19363 +#define safe_smp_processor_id() smp_processor_id()
19364 +
19365 +#elif defined(CONFIG_X86_64_SMP)
19366 +#define raw_smp_processor_id() read_pda(cpunumber)
19367 +
19368 +#define stack_smp_processor_id() \
19369 +({ \
19370 + struct thread_info *ti; \
19371 + __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \
19372 + ti->cpu; \
19373 +})
19374 +#define safe_smp_processor_id() smp_processor_id()
19375 +
19376 +#else /* !CONFIG_X86_32_SMP && !CONFIG_X86_64_SMP */
19377 +#define cpu_physical_id(cpu) boot_cpu_physical_apicid
19378 +#define safe_smp_processor_id() 0
19379 +#define stack_smp_processor_id() 0
19380 +#endif
19381 +
19382 +#ifdef CONFIG_X86_LOCAL_APIC
19383 +
19384 +static inline int logical_smp_processor_id(void)
19385 +{
19386 + /* we don't want to mark this access volatile - bad code generation */
19387 + return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR));
19388 +}
19389 +
19390 +#ifndef CONFIG_X86_64
19391 +static inline unsigned int read_apic_id(void)
19392 +{
19393 + return *(u32 *)(APIC_BASE + APIC_ID);
19394 +}
19395 +#else
19396 +extern unsigned int read_apic_id(void);
19397 +#endif
19398 +
19399 +
19400 +# ifdef APIC_DEFINITION
19401 +extern int hard_smp_processor_id(void);
19402 +# else
19403 +# include <mach_apicdef.h>
19404 +static inline int hard_smp_processor_id(void)
19405 +{
19406 + /* we don't want to mark this access volatile - bad code generation */
19407 + return GET_APIC_ID(read_apic_id());
19408 +}
19409 +# endif /* APIC_DEFINITION */
19410 +
19411 +#else /* CONFIG_X86_LOCAL_APIC */
19412 +
19413 +# ifndef CONFIG_SMP
19414 +# define hard_smp_processor_id() 0
19415 +# endif
19416 +
19417 +#endif /* CONFIG_X86_LOCAL_APIC */
19418 +
19419 +#ifdef CONFIG_HOTPLUG_CPU
19420 +extern void cpu_exit_clear(void);
19421 +extern void cpu_uninit(void);
19422 +#endif
19423 +
19424 +extern void smp_alloc_memory(void);
19425 +extern void lock_ipi_call_lock(void);
19426 +extern void unlock_ipi_call_lock(void);
19427 +#endif /* __ASSEMBLY__ */
19428 #endif
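The rewritten smp.h routes the generic SMP entry points (smp_send_stop(), __cpu_up(), smp_call_function_mask(), ...) through the smp_ops table of function pointers on native kernels, while the CONFIG_XEN side binds the xen_* implementations at compile time with #defines. A stand-alone sketch of the function-pointer half of that pattern; the demo_* names are invented for illustration, the real backends being the native_* and xen_* functions declared above:

#include <stdio.h>

/* Toy version of the smp_ops indirection; names are invented for illustration. */
struct demo_smp_ops {
	void (*send_reschedule)(int cpu);
};

static void demo_native_send_reschedule(int cpu)
{
	printf("native: reschedule IPI to CPU %d\n", cpu);
}

static void demo_xen_send_reschedule(int cpu)
{
	printf("xen: event-channel notification for CPU %d\n", cpu);
}

static struct demo_smp_ops demo_ops = {
	.send_reschedule = demo_native_send_reschedule,
};

/* Same shape as the inline wrappers above: callers never see the backend. */
static void demo_smp_send_reschedule(int cpu)
{
	demo_ops.send_reschedule(cpu);
}

int main(void)
{
	demo_smp_send_reschedule(1);
	demo_ops.send_reschedule = demo_xen_send_reschedule;	/* swap backends */
	demo_smp_send_reschedule(1);
	return 0;
}
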
19429 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/smp_32.h 2009-03-16 16:33:40.000000000 +0100
19430 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
19431 @@ -1,178 +0,0 @@
19432 -#ifndef __ASM_SMP_H
19433 -#define __ASM_SMP_H
19434 -
19435 -#ifndef __ASSEMBLY__
19436 -#include <linux/cpumask.h>
19437 -#include <linux/init.h>
19438 -
19439 -/*
19440 - * We need the APIC definitions automatically as part of 'smp.h'
19441 - */
19442 -#ifdef CONFIG_X86_LOCAL_APIC
19443 -# include <asm/mpspec.h>
19444 -# include <asm/apic.h>
19445 -# ifdef CONFIG_X86_IO_APIC
19446 -# include <asm/io_apic.h>
19447 -# endif
19448 -#endif
19449 -
19450 -#define cpu_callout_map cpu_possible_map
19451 -#define cpu_callin_map cpu_possible_map
19452 -
19453 -extern int smp_num_siblings;
19454 -extern unsigned int num_processors;
19455 -
19456 -extern void smp_alloc_memory(void);
19457 -extern void lock_ipi_call_lock(void);
19458 -extern void unlock_ipi_call_lock(void);
19459 -
19460 -extern void (*mtrr_hook) (void);
19461 -extern void zap_low_mappings (void);
19462 -
19463 -DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
19464 -DECLARE_PER_CPU(cpumask_t, cpu_core_map);
19465 -DECLARE_PER_CPU(u8, cpu_llc_id);
19466 -DECLARE_PER_CPU(u8, x86_cpu_to_apicid);
19467 -
19468 -#ifdef CONFIG_HOTPLUG_CPU
19469 -extern void cpu_exit_clear(void);
19470 -extern void cpu_uninit(void);
19471 -#endif
19472 -
19473 -#ifdef CONFIG_SMP
19474 -
19475 -#ifndef CONFIG_XEN
19476 -
19477 -/* Globals due to paravirt */
19478 -extern void set_cpu_sibling_map(int cpu);
19479 -
19480 -struct smp_ops
19481 -{
19482 - void (*smp_prepare_boot_cpu)(void);
19483 - void (*smp_prepare_cpus)(unsigned max_cpus);
19484 - int (*cpu_up)(unsigned cpu);
19485 - void (*smp_cpus_done)(unsigned max_cpus);
19486 -
19487 - void (*smp_send_stop)(void);
19488 - void (*smp_send_reschedule)(int cpu);
19489 - int (*smp_call_function_mask)(cpumask_t mask,
19490 - void (*func)(void *info), void *info,
19491 - int wait);
19492 -};
19493 -
19494 -extern struct smp_ops smp_ops;
19495 -
19496 -static inline void smp_prepare_boot_cpu(void)
19497 -{
19498 - smp_ops.smp_prepare_boot_cpu();
19499 -}
19500 -static inline void smp_prepare_cpus(unsigned int max_cpus)
19501 -{
19502 - smp_ops.smp_prepare_cpus(max_cpus);
19503 -}
19504 -static inline int __cpu_up(unsigned int cpu)
19505 -{
19506 - return smp_ops.cpu_up(cpu);
19507 -}
19508 -static inline void smp_cpus_done(unsigned int max_cpus)
19509 -{
19510 - smp_ops.smp_cpus_done(max_cpus);
19511 -}
19512 -
19513 -static inline void smp_send_stop(void)
19514 -{
19515 - smp_ops.smp_send_stop();
19516 -}
19517 -static inline void smp_send_reschedule(int cpu)
19518 -{
19519 - smp_ops.smp_send_reschedule(cpu);
19520 -}
19521 -static inline int smp_call_function_mask(cpumask_t mask,
19522 - void (*func) (void *info), void *info,
19523 - int wait)
19524 -{
19525 - return smp_ops.smp_call_function_mask(mask, func, info, wait);
19526 -}
19527 -
19528 -void native_smp_prepare_boot_cpu(void);
19529 -void native_smp_prepare_cpus(unsigned int max_cpus);
19530 -int native_cpu_up(unsigned int cpunum);
19531 -void native_smp_cpus_done(unsigned int max_cpus);
19532 -
19533 -#ifndef CONFIG_PARAVIRT
19534 -#define startup_ipi_hook(phys_apicid, start_eip, start_esp) do { } while (0)
19535 -#endif
19536 -
19537 -#else /* CONFIG_XEN */
19538 -
19539 -void xen_smp_send_stop(void);
19540 -void xen_smp_send_reschedule(int cpu);
19541 -int xen_smp_call_function_mask(cpumask_t mask,
19542 - void (*func) (void *info), void *info,
19543 - int wait);
19544 -
19545 -#define smp_send_stop xen_smp_send_stop
19546 -#define smp_send_reschedule xen_smp_send_reschedule
19547 -#define smp_call_function_mask xen_smp_call_function_mask
19548 -
19549 -extern void prefill_possible_map(void);
19550 -
19551 -#endif /* CONFIG_XEN */
19552 -
19553 -extern int __cpu_disable(void);
19554 -extern void __cpu_die(unsigned int cpu);
19555 -
19556 -/*
19557 - * This function is needed by all SMP systems. It must _always_ be valid
19558 - * from the initial startup. We map APIC_BASE very early in page_setup(),
19559 - * so this is correct in the x86 case.
19560 - */
19561 -DECLARE_PER_CPU(int, cpu_number);
19562 -#define raw_smp_processor_id() (x86_read_percpu(cpu_number))
19563 -
19564 -#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
19565 -
19566 -#define safe_smp_processor_id() smp_processor_id()
19567 -
19568 -/* We don't mark CPUs online until __cpu_up(), so we need another measure */
19569 -static inline int num_booting_cpus(void)
19570 -{
19571 - return cpus_weight(cpu_callout_map);
19572 -}
19573 -
19574 -#else /* CONFIG_SMP */
19575 -
19576 -#define safe_smp_processor_id() 0
19577 -#define cpu_physical_id(cpu) boot_cpu_physical_apicid
19578 -
19579 -#endif /* !CONFIG_SMP */
19580 -
19581 -#ifdef CONFIG_X86_LOCAL_APIC
19582 -
19583 -static __inline int logical_smp_processor_id(void)
19584 -{
19585 - /* we don't want to mark this access volatile - bad code generation */
19586 - return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR));
19587 -}
19588 -
19589 -# ifdef APIC_DEFINITION
19590 -extern int hard_smp_processor_id(void);
19591 -# else
19592 -# include <mach_apicdef.h>
19593 -static inline int hard_smp_processor_id(void)
19594 -{
19595 - /* we don't want to mark this access volatile - bad code generation */
19596 - return GET_APIC_ID(*(u32 *)(APIC_BASE + APIC_ID));
19597 -}
19598 -# endif /* APIC_DEFINITION */
19599 -
19600 -#else /* CONFIG_X86_LOCAL_APIC */
19601 -
19602 -# ifndef CONFIG_SMP
19603 -# define hard_smp_processor_id() 0
19604 -# endif
19605 -
19606 -#endif /* CONFIG_X86_LOCAL_APIC */
19607 -
19608 -#endif /* !ASSEMBLY */
19609 -#endif
19610 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/smp_64.h 2009-03-16 16:33:40.000000000 +0100
19611 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
19612 @@ -1,103 +0,0 @@
19613 -#ifndef __ASM_SMP_H
19614 -#define __ASM_SMP_H
19615 -
19616 -#include <linux/cpumask.h>
19617 -#include <linux/init.h>
19618 -
19619 -#ifdef CONFIG_X86_LOCAL_APIC
19620 -/*
19621 - * We need the APIC definitions automatically as part of 'smp.h'
19622 - */
19623 -#include <asm/apic.h>
19624 -#ifdef CONFIG_X86_IO_APIC
19625 -#include <asm/io_apic.h>
19626 -#endif
19627 -#include <asm/mpspec.h>
19628 -#endif
19629 -#include <asm/pda.h>
19630 -#include <asm/thread_info.h>
19631 -
19632 -extern cpumask_t cpu_initialized;
19633 -
19634 -extern int smp_num_siblings;
19635 -extern unsigned int num_processors;
19636 -
19637 -extern void smp_alloc_memory(void);
19638 -extern void lock_ipi_call_lock(void);
19639 -extern void unlock_ipi_call_lock(void);
19640 -
19641 -extern int smp_call_function_mask(cpumask_t mask, void (*func)(void *),
19642 - void *info, int wait);
19643 -
19644 -DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
19645 -DECLARE_PER_CPU(cpumask_t, cpu_core_map);
19646 -DECLARE_PER_CPU(u16, cpu_llc_id);
19647 -DECLARE_PER_CPU(u16, x86_cpu_to_apicid);
19648 -DECLARE_PER_CPU(u16, x86_bios_cpu_apicid);
19649 -
19650 -#ifdef CONFIG_X86_LOCAL_APIC
19651 -static inline int cpu_present_to_apicid(int mps_cpu)
19652 -{
19653 - if (cpu_present(mps_cpu))
19654 - return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu);
19655 - else
19656 - return BAD_APICID;
19657 -}
19658 -#endif
19659 -
19660 -#ifdef CONFIG_SMP
19661 -
19662 -#define SMP_TRAMPOLINE_BASE 0x6000
19663 -
19664 -extern int __cpu_disable(void);
19665 -extern void __cpu_die(unsigned int cpu);
19666 -extern void prefill_possible_map(void);
19667 -extern unsigned __cpuinitdata disabled_cpus;
19668 -
19669 -#define raw_smp_processor_id() read_pda(cpunumber)
19670 -#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
19671 -
19672 -#define stack_smp_processor_id() \
19673 - ({ \
19674 - struct thread_info *ti; \
19675 - __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \
19676 - ti->cpu; \
19677 -})
19678 -
19679 -/*
19680 - * On x86 all CPUs are mapped 1:1 to the APIC space. This simplifies
19681 - * scheduling and IPI sending and compresses data structures.
19682 - */
19683 -static inline int num_booting_cpus(void)
19684 -{
19685 - return cpus_weight(cpu_possible_map);
19686 -}
19687 -
19688 -extern void smp_send_reschedule(int cpu);
19689 -
19690 -#else /* CONFIG_SMP */
19691 -
19692 -extern unsigned int boot_cpu_id;
19693 -#define cpu_physical_id(cpu) boot_cpu_id
19694 -#define stack_smp_processor_id() 0
19695 -
19696 -#endif /* !CONFIG_SMP */
19697 -
19698 -#define safe_smp_processor_id() smp_processor_id()
19699 -
19700 -#ifdef CONFIG_X86_LOCAL_APIC
19701 -static __inline int logical_smp_processor_id(void)
19702 -{
19703 - /* we don't want to mark this access volatile - bad code generation */
19704 - return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR));
19705 -}
19706 -
19707 -static inline int hard_smp_processor_id(void)
19708 -{
19709 - /* we don't want to mark this access volatile - bad code generation */
19710 - return GET_APIC_ID(*(u32 *)(APIC_BASE + APIC_ID));
19711 -}
19712 -#endif
19713 -
19714 -#endif
19715 -
19716 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/spinlock.h 2009-03-16 16:33:40.000000000 +0100
19717 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/spinlock.h 2009-03-16 16:38:05.000000000 +0100
19718 @@ -88,7 +88,7 @@ extern void xen_spin_kick(raw_spinlock_t
19719 : "memory", "cc")
19720
19721
19722 -static inline int __raw_spin_trylock(raw_spinlock_t *lock)
19723 +static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock)
19724 {
19725 int tmp, new;
19726
19727 @@ -107,7 +107,7 @@ static inline int __raw_spin_trylock(raw
19728 return tmp;
19729 }
19730
19731 -static inline void __raw_spin_unlock(raw_spinlock_t *lock)
19732 +static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock)
19733 {
19734 unsigned int token;
19735 unsigned char kick;
19736 @@ -155,7 +155,7 @@ static inline void __raw_spin_unlock(raw
19737 : "memory", "cc"); \
19738 } while (0)
19739
19740 -static inline int __raw_spin_trylock(raw_spinlock_t *lock)
19741 +static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock)
19742 {
19743 int tmp;
19744 int new;
19745 @@ -177,7 +177,7 @@ static inline int __raw_spin_trylock(raw
19746 return tmp;
19747 }
19748
19749 -static inline void __raw_spin_unlock(raw_spinlock_t *lock)
19750 +static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock)
19751 {
19752 unsigned int token, tmp;
19753 bool kick;
19754 @@ -197,19 +197,19 @@ static inline void __raw_spin_unlock(raw
19755
19756 static inline int __raw_spin_is_locked(raw_spinlock_t *lock)
19757 {
19758 - int tmp = *(volatile signed int *)(&(lock)->slock);
19759 + int tmp = ACCESS_ONCE(lock->slock);
19760
19761 return !!(((tmp >> TICKET_SHIFT) ^ tmp) & ((1 << TICKET_SHIFT) - 1));
19762 }
19763
19764 static inline int __raw_spin_is_contended(raw_spinlock_t *lock)
19765 {
19766 - int tmp = *(volatile signed int *)(&(lock)->slock);
19767 + int tmp = ACCESS_ONCE(lock->slock);
19768
19769 return (((tmp >> TICKET_SHIFT) - tmp) & ((1 << TICKET_SHIFT) - 1)) > 1;
19770 }
19771
19772 -static inline void __raw_spin_lock(raw_spinlock_t *lock)
19773 +static __always_inline void __raw_spin_lock(raw_spinlock_t *lock)
19774 {
19775 unsigned int token, count;
19776 bool free;
19777 @@ -223,8 +223,8 @@ static inline void __raw_spin_lock(raw_s
19778 } while (unlikely(!count) && !xen_spin_wait(lock, token));
19779 }
19780
19781 -static inline void __raw_spin_lock_flags(raw_spinlock_t *lock,
19782 - unsigned long flags)
19783 +static __always_inline void __raw_spin_lock_flags(raw_spinlock_t *lock,
19784 + unsigned long flags)
19785 {
19786 unsigned int token, count;
19787 bool free;
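In the spinlock.h hunk, the lock-status helpers now read the ticket word once through ACCESS_ONCE() instead of an open-coded cast to volatile signed int (the generated read is the same, this is just the idiomatic spelling), and the fast-path helpers are forced inline with __always_inline. A stand-alone model of what the two status checks compute, assuming byte-sized tickets (TICKET_SHIFT == 8): the low byte is the ticket currently being served, the byte above it the next ticket to hand out, so the lock is held when they differ and contended when more than one ticket is outstanding:

#include <stdio.h>

#define TICKET_SHIFT 8	/* assumption: byte-sized tickets (small NR_CPUS builds) */

/* Low byte: ticket being served.  Next byte: next ticket to hand out. */
static int demo_is_locked(int slock)
{
	return !!(((slock >> TICKET_SHIFT) ^ slock) & ((1 << TICKET_SHIFT) - 1));
}

static int demo_is_contended(int slock)
{
	return (((slock >> TICKET_SHIFT) - slock) & ((1 << TICKET_SHIFT) - 1)) > 1;
}

int main(void)
{
	int idle = (3 << TICKET_SHIFT) | 3;	/* served == next: lock is free      */
	int held = (4 << TICKET_SHIFT) | 3;	/* one ticket outstanding: held      */
	int busy = (6 << TICKET_SHIFT) | 3;	/* several waiters queued: contended */

	printf("idle: locked=%d contended=%d\n", demo_is_locked(idle), demo_is_contended(idle));
	printf("held: locked=%d contended=%d\n", demo_is_locked(held), demo_is_contended(held));
	printf("busy: locked=%d contended=%d\n", demo_is_locked(busy), demo_is_contended(busy));
	return 0;
}
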
19788 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/swiotlb.h 2009-02-16 16:18:36.000000000 +0100
19789 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/swiotlb.h 2009-03-16 16:38:05.000000000 +0100
19790 @@ -1,5 +1,8 @@
19791 -#ifdef CONFIG_X86_32
19792 -# include "swiotlb_32.h"
19793 -#else
19794 -# include "../../swiotlb.h"
19795 -#endif
19796 +#ifndef _ASM_SWIOTLB_H
19797 +
19798 +#include "../../swiotlb.h"
19799 +
19800 +dma_addr_t swiotlb_map_single_phys(struct device *, phys_addr_t, size_t size,
19801 + int dir);
19802 +
19803 +#endif /* _ASM_SWIOTLB_H */
19804 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/swiotlb_32.h 2009-10-28 14:55:03.000000000 +0100
19805 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
19806 @@ -1,43 +0,0 @@
19807 -#ifndef _ASM_SWIOTLB_H
19808 -#define _ASM_SWIOTLB_H 1
19809 -
19810 -/* SWIOTLB interface */
19811 -
19812 -extern dma_addr_t swiotlb_map_single(struct device *hwdev, void *ptr, size_t size,
19813 - int dir);
19814 -extern void swiotlb_unmap_single(struct device *hwdev, dma_addr_t dev_addr,
19815 - size_t size, int dir);
19816 -extern void swiotlb_sync_single_for_cpu(struct device *hwdev,
19817 - dma_addr_t dev_addr,
19818 - size_t size, int dir);
19819 -extern void swiotlb_sync_single_for_device(struct device *hwdev,
19820 - dma_addr_t dev_addr,
19821 - size_t size, int dir);
19822 -extern void swiotlb_sync_sg_for_cpu(struct device *hwdev,
19823 - struct scatterlist *sg, int nelems,
19824 - int dir);
19825 -extern void swiotlb_sync_sg_for_device(struct device *hwdev,
19826 - struct scatterlist *sg, int nelems,
19827 - int dir);
19828 -extern int swiotlb_map_sg(struct device *hwdev, struct scatterlist *sg,
19829 - int nents, int direction);
19830 -extern void swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sg,
19831 - int nents, int direction);
19832 -extern int swiotlb_dma_mapping_error(dma_addr_t dma_addr);
19833 -#ifdef CONFIG_HIGHMEM
19834 -extern dma_addr_t swiotlb_map_page(struct device *hwdev, struct page *page,
19835 - unsigned long offset, size_t size,
19836 - enum dma_data_direction direction);
19837 -extern void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dma_address,
19838 - size_t size, enum dma_data_direction direction);
19839 -#endif
19840 -extern int swiotlb_dma_supported(struct device *hwdev, u64 mask);
19841 -extern void swiotlb_init(void);
19842 -
19843 -#ifdef CONFIG_SWIOTLB
19844 -extern int swiotlb;
19845 -#else
19846 -#define swiotlb 0
19847 -#endif
19848 -
19849 -#endif
19850 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/system.h 2009-03-16 16:33:40.000000000 +0100
19851 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/system.h 2009-03-16 16:38:05.000000000 +0100
19852 @@ -28,22 +28,44 @@ struct task_struct *__switch_to(struct t
19853 * Saving eflags is important. It switches not only IOPL between tasks,
19854 * it also protects other tasks from NT leaking through sysenter etc.
19855 */
19856 -#define switch_to(prev, next, last) do { \
19857 - unsigned long esi, edi; \
19858 - asm volatile("pushfl\n\t" /* Save flags */ \
19859 - "pushl %%ebp\n\t" \
19860 - "movl %%esp,%0\n\t" /* save ESP */ \
19861 - "movl %5,%%esp\n\t" /* restore ESP */ \
19862 - "movl $1f,%1\n\t" /* save EIP */ \
19863 - "pushl %6\n\t" /* restore EIP */ \
19864 - "jmp __switch_to\n" \
19865 +#define switch_to(prev, next, last) \
19866 +do { \
19867 + /* \
19868 + * Context-switching clobbers all registers, so we clobber \
19869 + * them explicitly, via unused output variables. \
19870 + * (EAX and EBP is not listed because EBP is saved/restored \
19871 + * explicitly for wchan access and EAX is the return value of \
19872 + * __switch_to()) \
19873 + */ \
19874 + unsigned long ebx, ecx, edx, esi, edi; \
19875 + \
19876 + asm volatile("pushfl\n\t" /* save flags */ \
19877 + "pushl %%ebp\n\t" /* save EBP */ \
19878 + "movl %%esp,%[prev_sp]\n\t" /* save ESP */ \
19879 + "movl %[next_sp],%%esp\n\t" /* restore ESP */ \
19880 + "movl $1f,%[prev_ip]\n\t" /* save EIP */ \
19881 + "pushl %[next_ip]\n\t" /* restore EIP */ \
19882 + "jmp __switch_to\n" /* regparm call */ \
19883 "1:\t" \
19884 - "popl %%ebp\n\t" \
19885 - "popfl" \
19886 - :"=m" (prev->thread.sp), "=m" (prev->thread.ip), \
19887 - "=a" (last), "=S" (esi), "=D" (edi) \
19888 - :"m" (next->thread.sp), "m" (next->thread.ip), \
19889 - "2" (prev), "d" (next)); \
19890 + "popl %%ebp\n\t" /* restore EBP */ \
19891 + "popfl\n" /* restore flags */ \
19892 + \
19893 + /* output parameters */ \
19894 + : [prev_sp] "=m" (prev->thread.sp), \
19895 + [prev_ip] "=m" (prev->thread.ip), \
19896 + "=a" (last), \
19897 + \
19898 + /* clobbered output registers: */ \
19899 + "=b" (ebx), "=c" (ecx), "=d" (edx), \
19900 + "=S" (esi), "=D" (edi) \
19901 + \
19902 + /* input parameters: */ \
19903 + : [next_sp] "m" (next->thread.sp), \
19904 + [next_ip] "m" (next->thread.ip), \
19905 + \
19906 + /* regparm parameters for __switch_to(): */ \
19907 + [prev] "a" (prev), \
19908 + [next] "d" (next)); \
19909 } while (0)
19910
19911 /*
19912 @@ -123,30 +145,29 @@ extern void load_gs_index(unsigned);
19913 */
19914 #define loadsegment(seg, value) \
19915 asm volatile("\n" \
19916 - "1:\t" \
19917 - "movl %k0,%%" #seg "\n" \
19918 - "2:\n" \
19919 - ".section .fixup,\"ax\"\n" \
19920 - "3:\t" \
19921 - "movl %k1, %%" #seg "\n\t" \
19922 - "jmp 2b\n" \
19923 - ".previous\n" \
19924 - _ASM_EXTABLE(1b,3b) \
19925 - : :"r" (value), "r" (0))
19926 + "1:\t" \
19927 + "movl %k0,%%" #seg "\n" \
19928 + "2:\n" \
19929 + ".section .fixup,\"ax\"\n" \
19930 + "3:\t" \
19931 + "movl %k1, %%" #seg "\n\t" \
19932 + "jmp 2b\n" \
19933 + ".previous\n" \
19934 + _ASM_EXTABLE(1b,3b) \
19935 + : :"r" (value), "r" (0))
19936
19937
19938 /*
19939 * Save a segment register away
19940 */
19941 -#define savesegment(seg, value) \
19942 +#define savesegment(seg, value) \
19943 asm volatile("mov %%" #seg ",%0":"=rm" (value))
19944
19945 static inline unsigned long get_limit(unsigned long segment)
19946 {
19947 unsigned long __limit;
19948 - __asm__("lsll %1,%0"
19949 - :"=r" (__limit):"r" (segment));
19950 - return __limit+1;
19951 + asm("lsll %1,%0" : "=r" (__limit) : "r" (segment));
19952 + return __limit + 1;
19953 }
19954
19955 static inline void xen_clts(void)
19956 @@ -171,13 +192,13 @@ static unsigned long __force_order;
19957 static inline unsigned long xen_read_cr0(void)
19958 {
19959 unsigned long val;
19960 - asm volatile("mov %%cr0,%0\n\t" :"=r" (val), "=m" (__force_order));
19961 + asm volatile("mov %%cr0,%0\n\t" : "=r" (val), "=m" (__force_order));
19962 return val;
19963 }
19964
19965 static inline void xen_write_cr0(unsigned long val)
19966 {
19967 - asm volatile("mov %0,%%cr0": :"r" (val), "m" (__force_order));
19968 + asm volatile("mov %0,%%cr0": : "r" (val), "m" (__force_order));
19969 }
19970
19971 #define xen_read_cr2() (current_vcpu_info()->arch.cr2)
19972 @@ -186,7 +207,7 @@ static inline void xen_write_cr0(unsigne
19973 static inline unsigned long xen_read_cr3(void)
19974 {
19975 unsigned long val;
19976 - asm volatile("mov %%cr3,%0\n\t" :"=r" (val), "=m" (__force_order));
19977 + asm volatile("mov %%cr3,%0\n\t" : "=r" (val), "=m" (__force_order));
19978 #ifdef CONFIG_X86_32
19979 return mfn_to_pfn(xen_cr3_to_pfn(val)) << PAGE_SHIFT;
19980 #else
19981 @@ -201,13 +222,13 @@ static inline void xen_write_cr3(unsigne
19982 #else
19983 val = phys_to_machine(val);
19984 #endif
19985 - asm volatile("mov %0,%%cr3": :"r" (val), "m" (__force_order));
19986 + asm volatile("mov %0,%%cr3": : "r" (val), "m" (__force_order));
19987 }
19988
19989 static inline unsigned long xen_read_cr4(void)
19990 {
19991 unsigned long val;
19992 - asm volatile("mov %%cr4,%0\n\t" :"=r" (val), "=m" (__force_order));
19993 + asm volatile("mov %%cr4,%0\n\t" : "=r" (val), "=m" (__force_order));
19994 return val;
19995 }
19996
19997 @@ -215,7 +236,7 @@ static inline unsigned long xen_read_cr4
19998
19999 static inline void xen_write_cr4(unsigned long val)
20000 {
20001 - asm volatile("mov %0,%%cr4": :"r" (val), "m" (__force_order));
20002 + asm volatile("mov %0,%%cr4": : "r" (val), "m" (__force_order));
20003 }
20004
20005 #ifdef CONFIG_X86_64
20006 @@ -234,6 +255,7 @@ static inline void xen_wbinvd(void)
20007 {
20008 asm volatile("wbinvd": : :"memory");
20009 }
20010 +
20011 #define read_cr0() (xen_read_cr0())
20012 #define write_cr0(x) (xen_write_cr0(x))
20013 #define read_cr2() (xen_read_cr2())
20014 @@ -260,7 +282,7 @@ static inline void clflush(volatile void
20015 asm volatile("clflush %0" : "+m" (*(volatile char __force *)__p));
20016 }
20017
20018 -#define nop() __asm__ __volatile__ ("nop")
20019 +#define nop() asm volatile ("nop")
20020
20021 void disable_hlt(void);
20022 void enable_hlt(void);
20023 @@ -280,16 +302,7 @@ void default_idle(void);
20024 */
20025 #ifdef CONFIG_X86_32
20026 /*
20027 - * For now, "wmb()" doesn't actually do anything, as all
20028 - * Intel CPU's follow what Intel calls a *Processor Order*,
20029 - * in which all writes are seen in the program order even
20030 - * outside the CPU.
20031 - *
20032 - * I expect future Intel CPU's to have a weaker ordering,
20033 - * but I'd also expect them to finally get their act together
20034 - * and add some real memory barriers if so.
20035 - *
20036 - * Some non intel clones support out of order store. wmb() ceases to be a
20037 + * Some non-Intel clones support out of order store. wmb() ceases to be a
20038 * nop for these.
20039 */
20040 #define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2)
20041 @@ -368,7 +381,7 @@ void default_idle(void);
20042 # define smp_wmb() barrier()
20043 #endif
20044 #define smp_read_barrier_depends() read_barrier_depends()
20045 -#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
20046 +#define set_mb(var, value) do { (void)xchg(&var, value); } while (0)
20047 #else
20048 #define smp_mb() barrier()
20049 #define smp_rmb() barrier()
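The system.h hunk rewrites the 32-bit switch_to() to use symbolic asm operand names ([prev_sp], [next_ip], ...) instead of positional %0/%5, and to report the clobbered general-purpose registers as dummy outputs rather than leaving them implicit. A minimal stand-alone example of the named-operand syntax itself (GCC extended asm, x86/x86-64 only; nothing here is taken from the patch):

#include <stdio.h>

/* Toy use of symbolic operand names; GCC extended asm, x86/x86-64 only. */
static unsigned long add_named(unsigned long a, unsigned long b)
{
	asm("add %[rhs], %[lhs]"
	    : [lhs] "+r" (a)	/* read-write register operand, referenced by name  */
	    : [rhs] "rm" (b)	/* input may live in a register or in memory        */
	    : "cc");
	return a;
}

int main(void)
{
	printf("%lu\n", add_named(40, 2));	/* prints 42 */
	return 0;
}
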
20050 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/tlbflush.h 2009-03-16 16:33:40.000000000 +0100
20051 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/tlbflush.h 2009-03-16 16:38:05.000000000 +0100
20052 @@ -86,8 +86,7 @@ static inline void flush_tlb_range(struc
20053 #define TLBSTATE_LAZY 2
20054
20055 #ifdef CONFIG_X86_32
20056 -struct tlb_state
20057 -{
20058 +struct tlb_state {
20059 struct mm_struct *active_mm;
20060 int state;
20061 char __cacheline_padding[L1_CACHE_BYTES-8];
20062 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/vga.h 2009-10-28 14:55:03.000000000 +0100
20063 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/vga.h 2009-03-16 16:38:05.000000000 +0100
20064 @@ -12,9 +12,9 @@
20065 * access the videoram directly without any black magic.
20066 */
20067
20068 -#define VGA_MAP_MEM(x,s) (unsigned long)isa_bus_to_virt(x)
20069 +#define VGA_MAP_MEM(x, s) (unsigned long)isa_bus_to_virt(x)
20070
20071 #define vga_readb(x) (*(x))
20072 -#define vga_writeb(x,y) (*(y) = (x))
20073 +#define vga_writeb(x, y) (*(y) = (x))
20074
20075 #endif
20076 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/xor_64.h 2009-10-28 14:55:03.000000000 +0100
20077 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/xor_64.h 2009-03-16 16:38:05.000000000 +0100
20078 @@ -1,20 +1,23 @@
20079 /*
20080 - * x86-64 changes / gcc fixes from Andi Kleen.
20081 + * x86-64 changes / gcc fixes from Andi Kleen.
20082 * Copyright 2002 Andi Kleen, SuSE Labs.
20083 *
20084 * This hasn't been optimized for the hammer yet, but there are likely
20085 * no advantages to be gotten from x86-64 here anyways.
20086 */
20087
20088 -typedef struct { unsigned long a,b; } __attribute__((aligned(16))) xmm_store_t;
20089 +typedef struct {
20090 + unsigned long a, b;
20091 +} __attribute__((aligned(16))) xmm_store_t;
20092
20093 -/* Doesn't use gcc to save the XMM registers, because there is no easy way to
20094 +/* Doesn't use gcc to save the XMM registers, because there is no easy way to
20095 tell it to do a clts before the register saving. */
20096 -#define XMMS_SAVE do { \
20097 +#define XMMS_SAVE \
20098 +do { \
20099 preempt_disable(); \
20100 if (!(current_thread_info()->status & TS_USEDFPU)) \
20101 clts(); \
20102 - __asm__ __volatile__ ( \
20103 + asm volatile( \
20104 "movups %%xmm0,(%1) ;\n\t" \
20105 "movups %%xmm1,0x10(%1) ;\n\t" \
20106 "movups %%xmm2,0x20(%1) ;\n\t" \
20107 @@ -22,10 +25,11 @@ typedef struct { unsigned long a,b; } __
20108 : "=&r" (cr0) \
20109 : "r" (xmm_save) \
20110 : "memory"); \
20111 -} while(0)
20112 +} while (0)
20113
20114 -#define XMMS_RESTORE do { \
20115 - asm volatile ( \
20116 +#define XMMS_RESTORE \
20117 +do { \
20118 + asm volatile( \
20119 "sfence ;\n\t" \
20120 "movups (%1),%%xmm0 ;\n\t" \
20121 "movups 0x10(%1),%%xmm1 ;\n\t" \
20122 @@ -37,72 +41,72 @@ typedef struct { unsigned long a,b; } __
20123 if (!(current_thread_info()->status & TS_USEDFPU)) \
20124 stts(); \
20125 preempt_enable(); \
20126 -} while(0)
20127 +} while (0)
20128
20129 #define OFFS(x) "16*("#x")"
20130 #define PF_OFFS(x) "256+16*("#x")"
20131 #define PF0(x) " prefetchnta "PF_OFFS(x)"(%[p1]) ;\n"
20132 -#define LD(x,y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n"
20133 -#define ST(x,y) " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n"
20134 +#define LD(x, y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n"
20135 +#define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n"
20136 #define PF1(x) " prefetchnta "PF_OFFS(x)"(%[p2]) ;\n"
20137 #define PF2(x) " prefetchnta "PF_OFFS(x)"(%[p3]) ;\n"
20138 #define PF3(x) " prefetchnta "PF_OFFS(x)"(%[p4]) ;\n"
20139 #define PF4(x) " prefetchnta "PF_OFFS(x)"(%[p5]) ;\n"
20140 #define PF5(x) " prefetchnta "PF_OFFS(x)"(%[p6]) ;\n"
20141 -#define XO1(x,y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n"
20142 -#define XO2(x,y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n"
20143 -#define XO3(x,y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n"
20144 -#define XO4(x,y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n"
20145 -#define XO5(x,y) " xorps "OFFS(x)"(%[p6]), %%xmm"#y" ;\n"
20146 +#define XO1(x, y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n"
20147 +#define XO2(x, y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n"
20148 +#define XO3(x, y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n"
20149 +#define XO4(x, y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n"
20150 +#define XO5(x, y) " xorps "OFFS(x)"(%[p6]), %%xmm"#y" ;\n"
20151
20152
20153 static void
20154 xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
20155 {
20156 - unsigned int lines = bytes >> 8;
20157 + unsigned int lines = bytes >> 8;
20158 unsigned long cr0;
20159 xmm_store_t xmm_save[4];
20160
20161 XMMS_SAVE;
20162
20163 - asm volatile (
20164 + asm volatile(
20165 #undef BLOCK
20166 #define BLOCK(i) \
20167 - LD(i,0) \
20168 - LD(i+1,1) \
20169 + LD(i, 0) \
20170 + LD(i + 1, 1) \
20171 PF1(i) \
20172 - PF1(i+2) \
20173 - LD(i+2,2) \
20174 - LD(i+3,3) \
20175 - PF0(i+4) \
20176 - PF0(i+6) \
20177 - XO1(i,0) \
20178 - XO1(i+1,1) \
20179 - XO1(i+2,2) \
20180 - XO1(i+3,3) \
20181 - ST(i,0) \
20182 - ST(i+1,1) \
20183 - ST(i+2,2) \
20184 - ST(i+3,3) \
20185 + PF1(i + 2) \
20186 + LD(i + 2, 2) \
20187 + LD(i + 3, 3) \
20188 + PF0(i + 4) \
20189 + PF0(i + 6) \
20190 + XO1(i, 0) \
20191 + XO1(i + 1, 1) \
20192 + XO1(i + 2, 2) \
20193 + XO1(i + 3, 3) \
20194 + ST(i, 0) \
20195 + ST(i + 1, 1) \
20196 + ST(i + 2, 2) \
20197 + ST(i + 3, 3) \
20198
20199
20200 PF0(0)
20201 PF0(2)
20202
20203 " .align 32 ;\n"
20204 - " 1: ;\n"
20205 + " 1: ;\n"
20206
20207 BLOCK(0)
20208 BLOCK(4)
20209 BLOCK(8)
20210 BLOCK(12)
20211
20212 - " addq %[inc], %[p1] ;\n"
20213 - " addq %[inc], %[p2] ;\n"
20214 + " addq %[inc], %[p1] ;\n"
20215 + " addq %[inc], %[p2] ;\n"
20216 " decl %[cnt] ; jnz 1b"
20217 : [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines)
20218 - : [inc] "r" (256UL)
20219 - : "memory");
20220 + : [inc] "r" (256UL)
20221 + : "memory");
20222
20223 XMMS_RESTORE;
20224 }
20225 @@ -117,52 +121,52 @@ xor_sse_3(unsigned long bytes, unsigned
20226
20227 XMMS_SAVE;
20228
20229 - __asm__ __volatile__ (
20230 + asm volatile(
20231 #undef BLOCK
20232 #define BLOCK(i) \
20233 PF1(i) \
20234 - PF1(i+2) \
20235 - LD(i,0) \
20236 - LD(i+1,1) \
20237 - LD(i+2,2) \
20238 - LD(i+3,3) \
20239 + PF1(i + 2) \
20240 + LD(i, 0) \
20241 + LD(i + 1, 1) \
20242 + LD(i + 2, 2) \
20243 + LD(i + 3, 3) \
20244 PF2(i) \
20245 - PF2(i+2) \
20246 - PF0(i+4) \
20247 - PF0(i+6) \
20248 - XO1(i,0) \
20249 - XO1(i+1,1) \
20250 - XO1(i+2,2) \
20251 - XO1(i+3,3) \
20252 - XO2(i,0) \
20253 - XO2(i+1,1) \
20254 - XO2(i+2,2) \
20255 - XO2(i+3,3) \
20256 - ST(i,0) \
20257 - ST(i+1,1) \
20258 - ST(i+2,2) \
20259 - ST(i+3,3) \
20260 + PF2(i + 2) \
20261 + PF0(i + 4) \
20262 + PF0(i + 6) \
20263 + XO1(i, 0) \
20264 + XO1(i + 1, 1) \
20265 + XO1(i + 2, 2) \
20266 + XO1(i + 3, 3) \
20267 + XO2(i, 0) \
20268 + XO2(i + 1, 1) \
20269 + XO2(i + 2, 2) \
20270 + XO2(i + 3, 3) \
20271 + ST(i, 0) \
20272 + ST(i + 1, 1) \
20273 + ST(i + 2, 2) \
20274 + ST(i + 3, 3) \
20275
20276
20277 PF0(0)
20278 PF0(2)
20279
20280 " .align 32 ;\n"
20281 - " 1: ;\n"
20282 + " 1: ;\n"
20283
20284 BLOCK(0)
20285 BLOCK(4)
20286 BLOCK(8)
20287 BLOCK(12)
20288
20289 - " addq %[inc], %[p1] ;\n"
20290 - " addq %[inc], %[p2] ;\n"
20291 - " addq %[inc], %[p3] ;\n"
20292 + " addq %[inc], %[p1] ;\n"
20293 + " addq %[inc], %[p2] ;\n"
20294 + " addq %[inc], %[p3] ;\n"
20295 " decl %[cnt] ; jnz 1b"
20296 : [cnt] "+r" (lines),
20297 [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
20298 : [inc] "r" (256UL)
20299 - : "memory");
20300 + : "memory");
20301 XMMS_RESTORE;
20302 }
20303
20304 @@ -171,64 +175,64 @@ xor_sse_4(unsigned long bytes, unsigned
20305 unsigned long *p3, unsigned long *p4)
20306 {
20307 unsigned int lines = bytes >> 8;
20308 - xmm_store_t xmm_save[4];
20309 + xmm_store_t xmm_save[4];
20310 unsigned long cr0;
20311
20312 XMMS_SAVE;
20313
20314 - __asm__ __volatile__ (
20315 + asm volatile(
20316 #undef BLOCK
20317 #define BLOCK(i) \
20318 PF1(i) \
20319 - PF1(i+2) \
20320 - LD(i,0) \
20321 - LD(i+1,1) \
20322 - LD(i+2,2) \
20323 - LD(i+3,3) \
20324 + PF1(i + 2) \
20325 + LD(i, 0) \
20326 + LD(i + 1, 1) \
20327 + LD(i + 2, 2) \
20328 + LD(i + 3, 3) \
20329 PF2(i) \
20330 - PF2(i+2) \
20331 - XO1(i,0) \
20332 - XO1(i+1,1) \
20333 - XO1(i+2,2) \
20334 - XO1(i+3,3) \
20335 + PF2(i + 2) \
20336 + XO1(i, 0) \
20337 + XO1(i + 1, 1) \
20338 + XO1(i + 2, 2) \
20339 + XO1(i + 3, 3) \
20340 PF3(i) \
20341 - PF3(i+2) \
20342 - PF0(i+4) \
20343 - PF0(i+6) \
20344 - XO2(i,0) \
20345 - XO2(i+1,1) \
20346 - XO2(i+2,2) \
20347 - XO2(i+3,3) \
20348 - XO3(i,0) \
20349 - XO3(i+1,1) \
20350 - XO3(i+2,2) \
20351 - XO3(i+3,3) \
20352 - ST(i,0) \
20353 - ST(i+1,1) \
20354 - ST(i+2,2) \
20355 - ST(i+3,3) \
20356 + PF3(i + 2) \
20357 + PF0(i + 4) \
20358 + PF0(i + 6) \
20359 + XO2(i, 0) \
20360 + XO2(i + 1, 1) \
20361 + XO2(i + 2, 2) \
20362 + XO2(i + 3, 3) \
20363 + XO3(i, 0) \
20364 + XO3(i + 1, 1) \
20365 + XO3(i + 2, 2) \
20366 + XO3(i + 3, 3) \
20367 + ST(i, 0) \
20368 + ST(i + 1, 1) \
20369 + ST(i + 2, 2) \
20370 + ST(i + 3, 3) \
20371
20372
20373 PF0(0)
20374 PF0(2)
20375
20376 " .align 32 ;\n"
20377 - " 1: ;\n"
20378 + " 1: ;\n"
20379
20380 BLOCK(0)
20381 BLOCK(4)
20382 BLOCK(8)
20383 BLOCK(12)
20384
20385 - " addq %[inc], %[p1] ;\n"
20386 - " addq %[inc], %[p2] ;\n"
20387 - " addq %[inc], %[p3] ;\n"
20388 - " addq %[inc], %[p4] ;\n"
20389 + " addq %[inc], %[p1] ;\n"
20390 + " addq %[inc], %[p2] ;\n"
20391 + " addq %[inc], %[p3] ;\n"
20392 + " addq %[inc], %[p4] ;\n"
20393 " decl %[cnt] ; jnz 1b"
20394 : [cnt] "+c" (lines),
20395 [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
20396 : [inc] "r" (256UL)
20397 - : "memory" );
20398 + : "memory" );
20399
20400 XMMS_RESTORE;
20401 }
20402 @@ -237,70 +241,70 @@ static void
20403 xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
20404 unsigned long *p3, unsigned long *p4, unsigned long *p5)
20405 {
20406 - unsigned int lines = bytes >> 8;
20407 + unsigned int lines = bytes >> 8;
20408 xmm_store_t xmm_save[4];
20409 unsigned long cr0;
20410
20411 XMMS_SAVE;
20412
20413 - __asm__ __volatile__ (
20414 + asm volatile(
20415 #undef BLOCK
20416 #define BLOCK(i) \
20417 PF1(i) \
20418 - PF1(i+2) \
20419 - LD(i,0) \
20420 - LD(i+1,1) \
20421 - LD(i+2,2) \
20422 - LD(i+3,3) \
20423 + PF1(i + 2) \
20424 + LD(i, 0) \
20425 + LD(i + 1, 1) \
20426 + LD(i + 2, 2) \
20427 + LD(i + 3, 3) \
20428 PF2(i) \
20429 - PF2(i+2) \
20430 - XO1(i,0) \
20431 - XO1(i+1,1) \
20432 - XO1(i+2,2) \
20433 - XO1(i+3,3) \
20434 + PF2(i + 2) \
20435 + XO1(i, 0) \
20436 + XO1(i + 1, 1) \
20437 + XO1(i + 2, 2) \
20438 + XO1(i + 3, 3) \
20439 PF3(i) \
20440 - PF3(i+2) \
20441 - XO2(i,0) \
20442 - XO2(i+1,1) \
20443 - XO2(i+2,2) \
20444 - XO2(i+3,3) \
20445 + PF3(i + 2) \
20446 + XO2(i, 0) \
20447 + XO2(i + 1, 1) \
20448 + XO2(i + 2, 2) \
20449 + XO2(i + 3, 3) \
20450 PF4(i) \
20451 - PF4(i+2) \
20452 - PF0(i+4) \
20453 - PF0(i+6) \
20454 - XO3(i,0) \
20455 - XO3(i+1,1) \
20456 - XO3(i+2,2) \
20457 - XO3(i+3,3) \
20458 - XO4(i,0) \
20459 - XO4(i+1,1) \
20460 - XO4(i+2,2) \
20461 - XO4(i+3,3) \
20462 - ST(i,0) \
20463 - ST(i+1,1) \
20464 - ST(i+2,2) \
20465 - ST(i+3,3) \
20466 + PF4(i + 2) \
20467 + PF0(i + 4) \
20468 + PF0(i + 6) \
20469 + XO3(i, 0) \
20470 + XO3(i + 1, 1) \
20471 + XO3(i + 2, 2) \
20472 + XO3(i + 3, 3) \
20473 + XO4(i, 0) \
20474 + XO4(i + 1, 1) \
20475 + XO4(i + 2, 2) \
20476 + XO4(i + 3, 3) \
20477 + ST(i, 0) \
20478 + ST(i + 1, 1) \
20479 + ST(i + 2, 2) \
20480 + ST(i + 3, 3) \
20481
20482
20483 PF0(0)
20484 PF0(2)
20485
20486 " .align 32 ;\n"
20487 - " 1: ;\n"
20488 + " 1: ;\n"
20489
20490 BLOCK(0)
20491 BLOCK(4)
20492 BLOCK(8)
20493 BLOCK(12)
20494
20495 - " addq %[inc], %[p1] ;\n"
20496 - " addq %[inc], %[p2] ;\n"
20497 - " addq %[inc], %[p3] ;\n"
20498 - " addq %[inc], %[p4] ;\n"
20499 - " addq %[inc], %[p5] ;\n"
20500 + " addq %[inc], %[p1] ;\n"
20501 + " addq %[inc], %[p2] ;\n"
20502 + " addq %[inc], %[p3] ;\n"
20503 + " addq %[inc], %[p4] ;\n"
20504 + " addq %[inc], %[p5] ;\n"
20505 " decl %[cnt] ; jnz 1b"
20506 : [cnt] "+c" (lines),
20507 - [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4),
20508 + [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4),
20509 [p5] "+r" (p5)
20510 : [inc] "r" (256UL)
20511 : "memory");
20512 @@ -309,18 +313,18 @@ xor_sse_5(unsigned long bytes, unsigned
20513 }
20514
20515 static struct xor_block_template xor_block_sse = {
20516 - .name = "generic_sse",
20517 - .do_2 = xor_sse_2,
20518 - .do_3 = xor_sse_3,
20519 - .do_4 = xor_sse_4,
20520 - .do_5 = xor_sse_5,
20521 + .name = "generic_sse",
20522 + .do_2 = xor_sse_2,
20523 + .do_3 = xor_sse_3,
20524 + .do_4 = xor_sse_4,
20525 + .do_5 = xor_sse_5,
20526 };
20527
20528 #undef XOR_TRY_TEMPLATES
20529 -#define XOR_TRY_TEMPLATES \
20530 - do { \
20531 - xor_speed(&xor_block_sse); \
20532 - } while (0)
20533 +#define XOR_TRY_TEMPLATES \
20534 +do { \
20535 + xor_speed(&xor_block_sse); \
20536 +} while (0)
20537
20538 /* We force the use of the SSE xor block because it can write around L2.
20539 We may also be able to load into the L1 only depending on how the cpu
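The xor_64.h changes are purely cosmetic (asm volatile spelling, macro argument spacing, label alignment); the routines still XOR 256-byte blocks of the source buffers into p1 through the XMM registers. A portable stand-alone model of what xor_sse_2() computes, without the SSE registers or the prefetch scheduling:

#include <stdio.h>
#include <stddef.h>

/* Same result as xor_sse_2(), one unsigned long at a time instead of 16-byte
 * XMM moves; "bytes" is a multiple of 256 in the real routine. */
static void demo_xor_2(unsigned long bytes, unsigned long *p1,
		       const unsigned long *p2)
{
	size_t i, n = bytes / sizeof(unsigned long);

	for (i = 0; i < n; i++)
		p1[i] ^= p2[i];
}

int main(void)
{
	unsigned long a[4] = { 0xff, 0x0f, 0xf0, 0x55 };
	unsigned long b[4] = { 0x0f, 0x0f, 0x0f, 0xaa };

	demo_xor_2(sizeof(a), a, b);
	printf("%lx %lx %lx %lx\n", a[0], a[1], a[2], a[3]);	/* f0 0 ff ff */
	return 0;
}
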
20540 --- sle11-2009-10-16.orig/include/asm-x86/scatterlist.h 2009-10-28 14:55:03.000000000 +0100
20541 +++ sle11-2009-10-16/include/asm-x86/scatterlist.h 2009-03-16 16:38:05.000000000 +0100
20542 @@ -24,7 +24,7 @@ struct scatterlist {
20543 * returns.
20544 */
20545 #define sg_dma_address(sg) ((sg)->dma_address)
20546 -#ifdef CONFIG_X86_32
20547 +#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
20548 # define sg_dma_len(sg) ((sg)->length)
20549 #else
20550 # define sg_dma_len(sg) ((sg)->dma_length)
20551 --- sle11-2009-10-16.orig/include/linux/page-flags.h 2009-03-16 16:33:40.000000000 +0100
20552 +++ sle11-2009-10-16/include/linux/page-flags.h 2009-03-16 16:38:05.000000000 +0100
20553 @@ -278,18 +278,25 @@ static inline void SetPageUptodate(struc
20554
20555 CLEARPAGEFLAG(Uptodate, uptodate)
20556
20557 -#define PageForeign(page) test_bit(PG_foreign, &(page)->flags)
20558 -#define SetPageForeign(_page, dtor) do { \
20559 - set_bit(PG_foreign, &(_page)->flags); \
20560 - BUG_ON((dtor) == (void (*)(struct page *, unsigned int))0); \
20561 - (_page)->index = (long)(dtor); \
20562 -} while (0)
20563 -#define ClearPageForeign(page) do { \
20564 - clear_bit(PG_foreign, &(page)->flags); \
20565 - (page)->index = 0; \
20566 -} while (0)
20567 -#define PageForeignDestructor(_page, order) \
20568 - ((void (*)(struct page *, unsigned int))(_page)->index)(_page, order)
20569 +#ifdef CONFIG_XEN
20570 +TESTPAGEFLAG(Foreign, foreign)
20571 +static inline void SetPageForeign(struct page *page,
20572 + void (*dtor)(struct page *, unsigned int))
20573 +{
20574 + BUG_ON(!dtor);
20575 + set_bit(PG_foreign, &page->flags);
20576 + page->index = (long)dtor;
20577 +}
20578 +static inline void ClearPageForeign(struct page *page)
20579 +{
20580 + clear_bit(PG_foreign, &page->flags);
20581 + page->index = 0;
20582 +}
20583 +static inline void PageForeignDestructor(struct page *page, unsigned int order)
20584 +{
20585 + ((void (*)(struct page *, unsigned int))page->index)(page, order);
20586 +}
20587 +#endif
20588
20589 extern void cancel_dirty_page(struct page *page, unsigned int account_size);
20590
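The page-flags.h hunk turns the Xen PageForeign macros into typed inline functions but keeps the same trick: the page's destructor is parked in page->index (a long) and cast back to a function pointer when the page is released. A stand-alone model of that callback-in-a-long pattern; the struct and function names are invented, and the cast through long mirrors what the hunk does (well-defined on the platforms this patch targets):

#include <assert.h>
#include <stdio.h>

/* Invented stand-in for struct page: only the fields the pattern needs. */
struct demo_page {
	unsigned long flags;
	long index;			/* holds the destructor, as in the hunk */
};

static void demo_set_foreign(struct demo_page *page,
			     void (*dtor)(struct demo_page *, unsigned int))
{
	assert(dtor != NULL);		/* the hunk BUG_ON()s a NULL destructor */
	page->index = (long)dtor;
}

static void demo_foreign_destructor(struct demo_page *page, unsigned int order)
{
	((void (*)(struct demo_page *, unsigned int))page->index)(page, order);
}

static void demo_dtor(struct demo_page *page, unsigned int order)
{
	printf("releasing foreign page %p, order %u\n", (void *)page, order);
}

int main(void)
{
	struct demo_page page = { 0, 0 };

	demo_set_foreign(&page, demo_dtor);
	demo_foreign_destructor(&page, 0);
	return 0;
}
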
20591 --- sle11-2009-10-16.orig/include/xen/balloon.h 2008-11-25 12:35:56.000000000 +0100
20592 +++ sle11-2009-10-16/include/xen/balloon.h 2009-03-16 16:38:05.000000000 +0100
20593 @@ -31,9 +31,12 @@
20594 * IN THE SOFTWARE.
20595 */
20596
20597 -#ifndef __ASM_BALLOON_H__
20598 -#define __ASM_BALLOON_H__
20599 +#ifndef __XEN_BALLOON_H__
20600 +#define __XEN_BALLOON_H__
20601
20602 +#include <linux/spinlock.h>
20603 +
20604 +#if !defined(CONFIG_PARAVIRT_XEN) || defined(HAVE_XEN_PLATFORM_COMPAT_H)
20605 /*
20606 * Inform the balloon driver that it should allow some slop for device-driver
20607 * memory activities.
20608 @@ -53,5 +56,6 @@ void balloon_release_driver_page(struct
20609 extern spinlock_t balloon_lock;
20610 #define balloon_lock(__flags) spin_lock_irqsave(&balloon_lock, __flags)
20611 #define balloon_unlock(__flags) spin_unlock_irqrestore(&balloon_lock, __flags)
20612 +#endif
20613
20614 -#endif /* __ASM_BALLOON_H__ */
20615 +#endif /* __XEN_BALLOON_H__ */
20616 --- sle11-2009-10-16.orig/include/xen/interface/grant_table.h 2008-11-25 12:22:34.000000000 +0100
20617 +++ sle11-2009-10-16/include/xen/interface/grant_table.h 2009-03-16 16:38:05.000000000 +0100
20618 @@ -193,6 +193,7 @@ struct gnttab_map_grant_ref {
20619 grant_handle_t handle;
20620 uint64_t dev_bus_addr;
20621 };
20622 +DEFINE_XEN_GUEST_HANDLE_STRUCT(gnttab_map_grant_ref);
20623 typedef struct gnttab_map_grant_ref gnttab_map_grant_ref_t;
20624 DEFINE_XEN_GUEST_HANDLE(gnttab_map_grant_ref_t);
20625
20626 @@ -216,6 +217,7 @@ struct gnttab_unmap_grant_ref {
20627 /* OUT parameters. */
20628 int16_t status; /* GNTST_* */
20629 };
20630 +DEFINE_XEN_GUEST_HANDLE_STRUCT(gnttab_unmap_grant_ref);
20631 typedef struct gnttab_unmap_grant_ref gnttab_unmap_grant_ref_t;
20632 DEFINE_XEN_GUEST_HANDLE(gnttab_unmap_grant_ref_t);
20633
20634 @@ -237,6 +239,7 @@ struct gnttab_setup_table {
20635 int16_t status; /* GNTST_* */
20636 XEN_GUEST_HANDLE(ulong) frame_list;
20637 };
20638 +DEFINE_XEN_GUEST_HANDLE_STRUCT(gnttab_setup_table);
20639 typedef struct gnttab_setup_table gnttab_setup_table_t;
20640 DEFINE_XEN_GUEST_HANDLE(gnttab_setup_table_t);
20641
20642 @@ -251,6 +254,7 @@ struct gnttab_dump_table {
20643 /* OUT parameters. */
20644 int16_t status; /* GNTST_* */
20645 };
20646 +DEFINE_XEN_GUEST_HANDLE_STRUCT(gnttab_dump_table);
20647 typedef struct gnttab_dump_table gnttab_dump_table_t;
20648 DEFINE_XEN_GUEST_HANDLE(gnttab_dump_table_t);
20649
20650 @@ -271,6 +275,7 @@ struct gnttab_transfer {
20651 /* OUT parameters. */
20652 int16_t status;
20653 };
20654 +DEFINE_XEN_GUEST_HANDLE_STRUCT(gnttab_transfer);
20655 typedef struct gnttab_transfer gnttab_transfer_t;
20656 DEFINE_XEN_GUEST_HANDLE(gnttab_transfer_t);
20657
20658 @@ -314,6 +319,7 @@ typedef struct gnttab_copy {
20659 /* OUT parameters. */
20660 int16_t status;
20661 } gnttab_copy_t;
20662 +DEFINE_XEN_GUEST_HANDLE_STRUCT(gnttab_copy);
20663 DEFINE_XEN_GUEST_HANDLE(gnttab_copy_t);
20664
20665 /*
20666 @@ -332,6 +338,7 @@ struct gnttab_query_size {
20667 uint32_t max_nr_frames;
20668 int16_t status; /* GNTST_* */
20669 };
20670 +DEFINE_XEN_GUEST_HANDLE_STRUCT(gnttab_query_size);
20671 typedef struct gnttab_query_size gnttab_query_size_t;
20672 DEFINE_XEN_GUEST_HANDLE(gnttab_query_size_t);
20673
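Note: each DEFINE_XEN_GUEST_HANDLE_STRUCT() added above gives the grant-table structure a handle type keyed by its struct name, alongside the existing _t typedef handles, so both handle flavours are available to callers. The sketch below only illustrates the general idea of such a wrapper, one distinct pointer-carrying type per payload; it is a made-up demo macro, not the real Xen definition.

/* Demo of a typed "guest handle" wrapper: a distinct struct per payload so
 * arguments are type-checked.  Not the real DEFINE_XEN_GUEST_HANDLE macro. */
#include <stdio.h>

#define DEFINE_DEMO_GUEST_HANDLE(name, type) \
        typedef struct { type *p; } demo_handle_ ## name

struct gnttab_setup_table_demo { unsigned int dom, nr_frames; };

DEFINE_DEMO_GUEST_HANDLE(setup_table, struct gnttab_setup_table_demo);

static void take_handle(demo_handle_setup_table h)
{
        printf("nr_frames=%u\n", h.p->nr_frames);
}

int main(void)
{
        struct gnttab_setup_table_demo t = { .dom = 0, .nr_frames = 4 };
        demo_handle_setup_table h = { &t };

        take_handle(h);
        return 0;
}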
20674 --- sle11-2009-10-16.orig/include/xen/interface/io/fbif.h 2008-11-25 12:35:56.000000000 +0100
20675 +++ sle11-2009-10-16/include/xen/interface/io/fbif.h 2009-03-16 16:38:05.000000000 +0100
20676 @@ -150,7 +150,12 @@ struct xenfb_page
20677 * framebuffer with a max resolution of 12,800x10,240. Should
20678 * be enough for a while with room leftover for expansion.
20679 */
20680 +#ifndef CONFIG_PARAVIRT_XEN
20681 unsigned long pd[256];
20682 +#else
20683 + /* Two directory pages should be enough for a while. */
20684 + unsigned long pd[2];
20685 +#endif
20686 };
20687
20688 /*
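Note: the fbif.h hunk keeps the 256-entry page-directory array for the XenLinux framebuffer, while the CONFIG_PARAVIRT_XEN build stays with upstream's two directory pages. As a sanity check of the "12,800x10,240" figure in the comment, assuming 32 bits per pixel, 4 KiB pages and 8-byte directory entries (a 64-bit guest), the purely illustrative program below works out how many directory pages that resolution needs.

/* Back-of-the-envelope check of the max-resolution comment above.
 * Assumptions: 32 bpp, 4 KiB pages, 8-byte pd entries.  Demo arithmetic only. */
#include <stdio.h>

int main(void)
{
        unsigned long long fb_bytes = 12800ULL * 10240 * 4;     /* 32 bpp      */
        unsigned long long fb_pages = fb_bytes / 4096;          /* 4 KiB pages */
        unsigned long long per_pd   = 4096 / 8;                 /* entries/pd  */
        unsigned long long pd_pages = (fb_pages + per_pd - 1) / per_pd;

        printf("framebuffer: %llu bytes, %llu pages, %llu pd pages (limit 256)\n",
               fb_bytes, fb_pages, pd_pages);
        /* prints: framebuffer: 524288000 bytes, 128000 pages, 250 pd pages (limit 256) */
        return 0;
}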
20689 --- sle11-2009-10-16.orig/include/xen/interface/memory.h 2009-02-16 16:17:21.000000000 +0100
20690 +++ sle11-2009-10-16/include/xen/interface/memory.h 2009-03-16 16:38:05.000000000 +0100
20691 @@ -62,7 +62,7 @@ struct xen_memory_reservation {
20692 * OUT: GMFN bases of extents that were allocated
20693 * (NB. This command also updates the mach_to_phys translation table)
20694 */
20695 - XEN_GUEST_HANDLE(ulong) extent_start;
20696 + XEN_GUEST_HANDLE(xen_pfn_t) extent_start;
20697
20698 /* Number of extents, and size/alignment of each (2^extent_order pages). */
20699 xen_ulong_t nr_extents;
20700 @@ -82,7 +82,6 @@ struct xen_memory_reservation {
20701 domid_t domid;
20702
20703 };
20704 -DEFINE_XEN_GUEST_HANDLE_STRUCT(xen_memory_reservation);
20705 typedef struct xen_memory_reservation xen_memory_reservation_t;
20706 DEFINE_XEN_GUEST_HANDLE(xen_memory_reservation_t);
20707
20708 @@ -168,7 +167,11 @@ struct xen_machphys_mfn_list {
20709 * any large discontiguities in the machine address space, 2MB gaps in
20710 * the machphys table will be represented by an MFN base of zero.
20711 */
20712 +#ifndef CONFIG_PARAVIRT_XEN
20713 XEN_GUEST_HANDLE(xen_pfn_t) extent_start;
20714 +#else
20715 + ulong extent_start;
20716 +#endif
20717
20718 /*
20719 * Number of extents written to the above array. This will be smaller
20720 @@ -176,7 +179,6 @@ struct xen_machphys_mfn_list {
20721 */
20722 unsigned int nr_extents;
20723 };
20724 -DEFINE_XEN_GUEST_HANDLE_STRUCT(xen_machphys_mfn_list);
20725 typedef struct xen_machphys_mfn_list xen_machphys_mfn_list_t;
20726 DEFINE_XEN_GUEST_HANDLE(xen_machphys_mfn_list_t);
20727
20728 @@ -216,7 +218,6 @@ struct xen_add_to_physmap {
20729 /* GPFN where the source mapping page should appear. */
20730 xen_pfn_t gpfn;
20731 };
20732 -DEFINE_XEN_GUEST_HANDLE_STRUCT(xen_add_to_physmap);
20733 typedef struct xen_add_to_physmap xen_add_to_physmap_t;
20734 DEFINE_XEN_GUEST_HANDLE(xen_add_to_physmap_t);
20735
20736 @@ -249,13 +250,21 @@ struct xen_translate_gpfn_list {
20737 xen_ulong_t nr_gpfns;
20738
20739 /* List of GPFNs to translate. */
20740 +#ifndef CONFIG_PARAVIRT_XEN
20741 XEN_GUEST_HANDLE(xen_pfn_t) gpfn_list;
20742 +#else
20743 + ulong gpfn_list;
20744 +#endif
20745
20746 /*
20747 * Output list to contain MFN translations. May be the same as the input
20748 * list (in which case each input GPFN is overwritten with the output MFN).
20749 */
20750 +#ifndef CONFIG_PARAVIRT_XEN
20751 XEN_GUEST_HANDLE(xen_pfn_t) mfn_list;
20752 +#else
20753 + ulong mfn_list;
20754 +#endif
20755 };
20756 DEFINE_XEN_GUEST_HANDLE_STRUCT(xen_translate_gpfn_list);
20757 typedef struct xen_translate_gpfn_list xen_translate_gpfn_list_t;
20758 --- sle11-2009-10-16.orig/include/xen/interface/vcpu.h 2008-11-25 12:35:56.000000000 +0100
20759 +++ sle11-2009-10-16/include/xen/interface/vcpu.h 2009-03-16 16:38:05.000000000 +0100
20760 @@ -85,6 +85,7 @@ struct vcpu_runstate_info {
20761 */
20762 uint64_t time[4];
20763 };
20764 +DEFINE_XEN_GUEST_HANDLE_STRUCT(vcpu_runstate_info);
20765 typedef struct vcpu_runstate_info vcpu_runstate_info_t;
20766 DEFINE_XEN_GUEST_HANDLE(vcpu_runstate_info_t);
20767
20768 @@ -140,6 +141,7 @@ DEFINE_XEN_GUEST_HANDLE(vcpu_register_ru
20769 struct vcpu_set_periodic_timer {
20770 uint64_t period_ns;
20771 };
20772 +DEFINE_XEN_GUEST_HANDLE_STRUCT(vcpu_set_periodic_timer);
20773 typedef struct vcpu_set_periodic_timer vcpu_set_periodic_timer_t;
20774 DEFINE_XEN_GUEST_HANDLE(vcpu_set_periodic_timer_t);
20775
20776 @@ -153,6 +155,7 @@ struct vcpu_set_singleshot_timer {
20777 uint64_t timeout_abs_ns; /* Absolute system time value in nanoseconds. */
20778 uint32_t flags; /* VCPU_SSHOTTMR_??? */
20779 };
20780 +DEFINE_XEN_GUEST_HANDLE_STRUCT(vcpu_set_singleshot_timer);
20781 typedef struct vcpu_set_singleshot_timer vcpu_set_singleshot_timer_t;
20782 DEFINE_XEN_GUEST_HANDLE(vcpu_set_singleshot_timer_t);
20783
20784 @@ -176,6 +179,7 @@ struct vcpu_register_vcpu_info {
20785 uint32_t offset; /* offset within page */
20786 uint32_t rsvd; /* unused */
20787 };
20788 +DEFINE_XEN_GUEST_HANDLE_STRUCT(vcpu_register_vcpu_info);
20789 typedef struct vcpu_register_vcpu_info vcpu_register_vcpu_info_t;
20790 DEFINE_XEN_GUEST_HANDLE(vcpu_register_vcpu_info_t);
20791
20792 --- sle11-2009-10-16.orig/lib/swiotlb-xen.c 2009-03-16 16:33:40.000000000 +0100
20793 +++ sle11-2009-10-16/lib/swiotlb-xen.c 2009-03-16 16:38:05.000000000 +0100
20794 @@ -20,6 +20,7 @@
20795 #include <linux/ctype.h>
20796 #include <linux/init.h>
20797 #include <linux/bootmem.h>
20798 +#include <linux/iommu-helper.h>
20799 #include <linux/highmem.h>
20800 #include <asm/io.h>
20801 #include <asm/pci.h>
20802 @@ -288,15 +289,6 @@ __sync_single(struct phys_addr buffer, c
20803 }
20804 }
20805
20806 -static inline unsigned int is_span_boundary(unsigned int index,
20807 - unsigned int nslots,
20808 - unsigned long offset_slots,
20809 - unsigned long max_slots)
20810 -{
20811 - unsigned long offset = (offset_slots + index) & (max_slots - 1);
20812 - return offset + nslots > max_slots;
20813 -}
20814 -
20815 /*
20816 * Allocates bounce buffer and returns its kernel virtual address.
20817 */
20818 @@ -335,61 +327,53 @@ map_single(struct device *hwdev, struct
20819 * request and allocate a buffer from that IO TLB pool.
20820 */
20821 spin_lock_irqsave(&io_tlb_lock, flags);
20822 - {
20823 - index = ALIGN(io_tlb_index, stride);
20824 - if (index >= iotlb_nslabs)
20825 - index = 0;
20826 - wrap = index;
20827 + index = ALIGN(io_tlb_index, stride);
20828 + if (index >= iotlb_nslabs)
20829 + index = 0;
20830 + wrap = index;
20831
20832 - do {
20833 - while (is_span_boundary(index, nslots, offset_slots,
20834 - max_slots)) {
20835 - index += stride;
20836 - if (index >= iotlb_nslabs)
20837 - index = 0;
20838 - if (index == wrap)
20839 - goto not_found;
20840 - }
20841 + do {
20842 + while (iommu_is_span_boundary(index, nslots, offset_slots,
20843 + max_slots)) {
20844 + index += stride;
20845 + if (index >= iotlb_nslabs)
20846 + index = 0;
20847 + if (index == wrap)
20848 + goto not_found;
20849 + }
20850 +
20851 + /*
20852 + * If we find a slot that indicates we have 'nslots' number of
20853 + * contiguous buffers, we allocate the buffers from that slot
20854 + * and mark the entries as '0' indicating unavailable.
20855 + */
20856 + if (io_tlb_list[index] >= nslots) {
20857 + int count = 0;
20858 +
20859 + for (i = index; i < (int) (index + nslots); i++)
20860 + io_tlb_list[i] = 0;
20861 + for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE - 1) && io_tlb_list[i]; i--)
20862 + io_tlb_list[i] = ++count;
20863 + dma_addr = iotlb_virt_start + (index << IO_TLB_SHIFT);
20864
20865 /*
20866 - * If we find a slot that indicates we have 'nslots'
20867 - * number of contiguous buffers, we allocate the
20868 - * buffers from that slot and mark the entries as '0'
20869 - * indicating unavailable.
20870 + * Update the indices to avoid searching in the next
20871 + * round.
20872 */
20873 - if (io_tlb_list[index] >= nslots) {
20874 - int count = 0;
20875 -
20876 - for (i = index; i < (int)(index + nslots); i++)
20877 - io_tlb_list[i] = 0;
20878 - for (i = index - 1;
20879 - (OFFSET(i, IO_TLB_SEGSIZE) !=
20880 - IO_TLB_SEGSIZE -1) && io_tlb_list[i];
20881 - i--)
20882 - io_tlb_list[i] = ++count;
20883 - dma_addr = iotlb_virt_start +
20884 - (index << IO_TLB_SHIFT);
20885 -
20886 - /*
20887 - * Update the indices to avoid searching in
20888 - * the next round.
20889 - */
20890 - io_tlb_index =
20891 - ((index + nslots) < iotlb_nslabs
20892 - ? (index + nslots) : 0);
20893 + io_tlb_index = ((index + nslots) < iotlb_nslabs
20894 + ? (index + nslots) : 0);
20895
20896 - goto found;
20897 - }
20898 - index += stride;
20899 - if (index >= iotlb_nslabs)
20900 - index = 0;
20901 - } while (index != wrap);
20902 + goto found;
20903 + }
20904 + index += stride;
20905 + if (index >= iotlb_nslabs)
20906 + index = 0;
20907 + } while (index != wrap);
20908
20909 - not_found:
20910 - spin_unlock_irqrestore(&io_tlb_lock, flags);
20911 - return NULL;
20912 - }
20913 - found:
20914 +not_found:
20915 + spin_unlock_irqrestore(&io_tlb_lock, flags);
20916 + return NULL;
20917 +found:
20918 spin_unlock_irqrestore(&io_tlb_lock, flags);
20919
20920 /*
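Note: this hunk drops the file-local is_span_boundary() helper (deleted further up in this file's diff) and calls iommu_is_span_boundary() from <linux/iommu-helper.h>, which an earlier hunk adds to the includes; the slot-search loop itself mainly loses one indentation level. Sharing the helper keeps the DMA-boundary arithmetic in one place rather than in each swiotlb copy. For reference, the check being delegated is restated below as a stand-alone user-space function, copied from the deleted inline; the demo values in main() are made up.

/* Stand-alone restatement of the removed is_span_boundary() check.
 * max_slots is a power of two: the number of slots per boundary segment. */
#include <stdbool.h>
#include <stdio.h>

static bool span_crosses_boundary(unsigned int index, unsigned int nslots,
                                  unsigned long offset_slots,
                                  unsigned long max_slots)
{
        unsigned long offset = (offset_slots + index) & (max_slots - 1);

        return offset + nslots > max_slots;
}

int main(void)
{
        /* e.g. a 4 GiB boundary with 2 KiB slots -> max_slots = 1 << 21 */
        unsigned long max_slots = 1UL << 21;

        printf("%d\n", span_crosses_boundary(0, 8, 0, max_slots));             /* 0 */
        printf("%d\n", span_crosses_boundary(max_slots - 4, 8, 0, max_slots)); /* 1 */
        return 0;
}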
20921 @@ -502,11 +486,13 @@ swiotlb_full(struct device *dev, size_t
20922 * Once the device is given the dma address, the device owns this memory until
20923 * either swiotlb_unmap_single or swiotlb_dma_sync_single is performed.
20924 */
20925 -dma_addr_t
20926 -swiotlb_map_single(struct device *hwdev, void *ptr, size_t size, int dir)
20927 -{
20928 - dma_addr_t dev_addr = gnttab_dma_map_page(virt_to_page(ptr)) +
20929 - offset_in_page(ptr);
20930 +static dma_addr_t
20931 +_swiotlb_map_single(struct device *hwdev, phys_addr_t paddr, size_t size,
20932 + int dir, struct dma_attrs *attrs)
20933 +{
20934 + struct page *page = pfn_to_page(paddr >> PAGE_SHIFT);
20935 + dma_addr_t dev_addr = gnttab_dma_map_page(page) +
20936 + offset_in_page(paddr);
20937 void *map;
20938 struct phys_addr buffer;
20939
20940 @@ -517,7 +503,7 @@ swiotlb_map_single(struct device *hwdev,
20941 * we can safely return the device addr and not worry about bounce
20942 * buffering it.
20943 */
20944 - if (!range_straddles_page_boundary(__pa(ptr), size) &&
20945 + if (!range_straddles_page_boundary(paddr, size) &&
20946 !address_needs_mapping(hwdev, dev_addr))
20947 return dev_addr;
20948
20949 @@ -525,8 +511,8 @@ swiotlb_map_single(struct device *hwdev,
20950 * Oh well, have to allocate and map a bounce buffer.
20951 */
20952 gnttab_dma_unmap_page(dev_addr);
20953 - buffer.page = virt_to_page(ptr);
20954 - buffer.offset = (unsigned long)ptr & ~PAGE_MASK;
20955 + buffer.page = page;
20956 + buffer.offset = offset_in_page(paddr);
20957 map = map_single(hwdev, buffer, size, dir);
20958 if (!map) {
20959 swiotlb_full(hwdev, size, dir, 1);
20960 @@ -537,6 +523,26 @@ swiotlb_map_single(struct device *hwdev,
20961 return dev_addr;
20962 }
20963
20964 +dma_addr_t
20965 +swiotlb_map_single_attrs(struct device *hwdev, void *ptr, size_t size,
20966 + int dir, struct dma_attrs *attrs)
20967 +{
20968 + return _swiotlb_map_single(hwdev, virt_to_phys(ptr), size, dir, attrs);
20969 +}
20970 +EXPORT_SYMBOL(swiotlb_map_single_attrs);
20971 +
20972 +dma_addr_t
20973 +swiotlb_map_single(struct device *hwdev, void *ptr, size_t size, int dir)
20974 +{
20975 + return _swiotlb_map_single(hwdev, virt_to_phys(ptr), size, dir, NULL);
20976 +}
20977 +
20978 +dma_addr_t
20979 +swiotlb_map_single_phys(struct device *hwdev, phys_addr_t paddr, size_t size, int dir)
20980 +{
20981 + return _swiotlb_map_single(hwdev, paddr, size, dir, NULL);
20982 +}
20983 +
20984 /*
20985 * Unmap a single streaming mode DMA translation. The dma_addr and size must
20986 * match what was provided for in a previous swiotlb_map_single call. All
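Note: the map-path refactor above funnels the legacy swiotlb_map_single() entry point and the new swiotlb_map_single_attrs()/swiotlb_map_single_phys() variants through one internal _swiotlb_map_single() that takes a physical address plus an optional struct dma_attrs pointer; callers that do not care pass NULL. The sketch below shows that "new *_attrs entry point, old name kept as a thin shim" shape with hypothetical names, not the real swiotlb API.

/* Demo of the wrapper pattern: the *_attrs variant carries the extra
 * argument, the historical name stays as a shim passing NULL. */
#include <stdio.h>

struct dma_attrs;                               /* opaque here, as in the patch */

static unsigned long map_single_common(void *ptr, size_t size,
                                       int dir, struct dma_attrs *attrs)
{
        (void)dir; (void)attrs;
        printf("mapping %zu bytes\n", size);
        return (unsigned long)ptr;              /* pretend bus addr == pointer */
}

static unsigned long map_single_attrs(void *ptr, size_t size,
                                      int dir, struct dma_attrs *attrs)
{
        return map_single_common(ptr, size, dir, attrs);
}

static unsigned long map_single(void *ptr, size_t size, int dir)
{
        return map_single_common(ptr, size, dir, NULL);  /* legacy shim */
}

int main(void)
{
        char buf[64];

        (void)map_single(buf, sizeof(buf), 1);
        (void)map_single_attrs(buf, sizeof(buf), 1, NULL);
        return 0;
}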
20987 @@ -546,8 +552,8 @@ swiotlb_map_single(struct device *hwdev,
20988 * whatever the device wrote there.
20989 */
20990 void
20991 -swiotlb_unmap_single(struct device *hwdev, dma_addr_t dev_addr, size_t size,
20992 - int dir)
20993 +swiotlb_unmap_single_attrs(struct device *hwdev, dma_addr_t dev_addr,
20994 + size_t size, int dir, struct dma_attrs *attrs)
20995 {
20996 BUG_ON(dir == DMA_NONE);
20997 if (in_swiotlb_aperture(dev_addr))
20998 @@ -555,7 +561,14 @@ swiotlb_unmap_single(struct device *hwde
20999 else
21000 gnttab_dma_unmap_page(dev_addr);
21001 }
21002 +EXPORT_SYMBOL(swiotlb_unmap_single_attrs);
21003
21004 +void
21005 +swiotlb_unmap_single(struct device *hwdev, dma_addr_t dev_addr, size_t size,
21006 + int dir)
21007 +{
21008 + return swiotlb_unmap_single_attrs(hwdev, dev_addr, size, dir, NULL);
21009 +}
21010 /*
21011 * Make physical memory consistent for a single streaming mode DMA translation
21012 * after a transfer.
21013 @@ -584,6 +597,26 @@ swiotlb_sync_single_for_device(struct de
21014 sync_single(hwdev, bus_to_virt(dev_addr), size, dir);
21015 }
21016
21017 +void
21018 +swiotlb_sync_single_range_for_cpu(struct device *hwdev, dma_addr_t dev_addr,
21019 + unsigned long offset, size_t size, int dir)
21020 +{
21021 + BUG_ON(dir == DMA_NONE);
21022 + if (in_swiotlb_aperture(dev_addr))
21023 + sync_single(hwdev, bus_to_virt(dev_addr + offset), size, dir);
21024 +}
21025 +
21026 +void
21027 +swiotlb_sync_single_range_for_device(struct device *hwdev, dma_addr_t dev_addr,
21028 + unsigned long offset, size_t size, int dir)
21029 +{
21030 + BUG_ON(dir == DMA_NONE);
21031 + if (in_swiotlb_aperture(dev_addr))
21032 + sync_single(hwdev, bus_to_virt(dev_addr + offset), size, dir);
21033 +}
21034 +
21035 +void swiotlb_unmap_sg_attrs(struct device *, struct scatterlist *, int, int,
21036 + struct dma_attrs *);
21037 /*
21038 * Map a set of buffers described by scatterlist in streaming mode for DMA.
21039 * This is the scatter-gather version of the above swiotlb_map_single
21040 @@ -601,8 +634,8 @@ swiotlb_sync_single_for_device(struct de
21041 * same here.
21042 */
21043 int
21044 -swiotlb_map_sg(struct device *hwdev, struct scatterlist *sgl, int nelems,
21045 - int dir)
21046 +swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems,
21047 + int dir, struct dma_attrs *attrs)
21048 {
21049 struct scatterlist *sg;
21050 struct phys_addr buffer;
21051 @@ -626,7 +659,8 @@ swiotlb_map_sg(struct device *hwdev, str
21052 /* Don't panic here, we expect map_sg users
21053 to do proper error handling. */
21054 swiotlb_full(hwdev, sg->length, dir, 0);
21055 - swiotlb_unmap_sg(hwdev, sgl, i, dir);
21056 + swiotlb_unmap_sg_attrs(hwdev, sgl, i, dir,
21057 + attrs);
21058 sgl[0].dma_length = 0;
21059 return 0;
21060 }
21061 @@ -637,14 +671,22 @@ swiotlb_map_sg(struct device *hwdev, str
21062 }
21063 return nelems;
21064 }
21065 +EXPORT_SYMBOL(swiotlb_map_sg_attrs);
21066 +
21067 +int
21068 +swiotlb_map_sg(struct device *hwdev, struct scatterlist *sgl, int nelems,
21069 + int dir)
21070 +{
21071 + return swiotlb_map_sg_attrs(hwdev, sgl, nelems, dir, NULL);
21072 +}
21073
21074 /*
21075 * Unmap a set of streaming mode DMA translations. Again, cpu read rules
21076 * concerning calls here are the same as for swiotlb_unmap_single() above.
21077 */
21078 void
21079 -swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nelems,
21080 - int dir)
21081 +swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl,
21082 + int nelems, int dir, struct dma_attrs *attrs)
21083 {
21084 struct scatterlist *sg;
21085 int i;
21086 @@ -659,6 +701,14 @@ swiotlb_unmap_sg(struct device *hwdev, s
21087 gnttab_dma_unmap_page(sg->dma_address);
21088 }
21089 }
21090 +EXPORT_SYMBOL(swiotlb_unmap_sg_attrs);
21091 +
21092 +void
21093 +swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nelems,
21094 + int dir)
21095 +{
21096 + return swiotlb_unmap_sg_attrs(hwdev, sgl, nelems, dir, NULL);
21097 +}
21098
21099 /*
21100 * Make physical memory consistent for a set of streaming mode DMA translations
21101 @@ -699,46 +749,6 @@ swiotlb_sync_sg_for_device(struct device
21102 }
21103 }
21104
21105 -#ifdef CONFIG_HIGHMEM
21106 -
21107 -dma_addr_t
21108 -swiotlb_map_page(struct device *hwdev, struct page *page,
21109 - unsigned long offset, size_t size,
21110 - enum dma_data_direction direction)
21111 -{
21112 - struct phys_addr buffer;
21113 - dma_addr_t dev_addr;
21114 - char *map;
21115 -
21116 - dev_addr = gnttab_dma_map_page(page) + offset;
21117 - if (address_needs_mapping(hwdev, dev_addr)) {
21118 - gnttab_dma_unmap_page(dev_addr);
21119 - buffer.page = page;
21120 - buffer.offset = offset;
21121 - map = map_single(hwdev, buffer, size, direction);
21122 - if (!map) {
21123 - swiotlb_full(hwdev, size, direction, 1);
21124 - map = io_tlb_overflow_buffer;
21125 - }
21126 - dev_addr = (dma_addr_t)virt_to_bus(map);
21127 - }
21128 -
21129 - return dev_addr;
21130 -}
21131 -
21132 -void
21133 -swiotlb_unmap_page(struct device *hwdev, dma_addr_t dma_address,
21134 - size_t size, enum dma_data_direction direction)
21135 -{
21136 - BUG_ON(direction == DMA_NONE);
21137 - if (in_swiotlb_aperture(dma_address))
21138 - unmap_single(hwdev, bus_to_virt(dma_address), size, direction);
21139 - else
21140 - gnttab_dma_unmap_page(dma_address);
21141 -}
21142 -
21143 -#endif
21144 -
21145 int
21146 swiotlb_dma_mapping_error(dma_addr_t dma_addr)
21147 {