1 From: kernel.org
2 Subject: 2.6.26
3 Patch-mainline: 2.6.26
4
5 Acked-by: Jeff Mahoney <jeffm@suse.com>
6 Automatically created from "patches.kernel.org/patch-2.6.26" by xen-port-patches.py
7
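Among other changes, this patch adds an mtrr_type_lookup() helper to arch/x86/kernel/cpu/mtrr/main-xen.c that resolves the effective memory type of a range by querying the hypervisor's variable-range MTRRs and applying the usual precedence rules (UC always wins, WB combined with WT degrades to WT, any other conflict is treated as UC). Below is a minimal standalone C sketch of just that precedence step, for illustration; it is not part of the patch, and the MTRR_TYPE_* values simply mirror the architectural encodings from asm/mtrr.h.

/* Standalone sketch of the variable-MTRR type-precedence rule. */
#include <stdio.h>

#define MTRR_TYPE_UNCACHABLE 0
#define MTRR_TYPE_WRCOMB     1
#define MTRR_TYPE_WRTHROUGH  4
#define MTRR_TYPE_WRPROT     5
#define MTRR_TYPE_WRBACK     6

/* Combine the type seen so far with the type of one more matching range. */
static unsigned char combine_mtrr_types(unsigned char prev, unsigned char curr)
{
	if (prev == 0xFF)			/* first matching range */
		return curr;
	if (prev == MTRR_TYPE_UNCACHABLE || curr == MTRR_TYPE_UNCACHABLE)
		return MTRR_TYPE_UNCACHABLE;	/* UC has highest precedence */
	if ((prev == MTRR_TYPE_WRBACK && curr == MTRR_TYPE_WRTHROUGH) ||
	    (prev == MTRR_TYPE_WRTHROUGH && curr == MTRR_TYPE_WRBACK))
		return MTRR_TYPE_WRTHROUGH;	/* WB + WT -> WT */
	if (prev != curr)
		return MTRR_TYPE_UNCACHABLE;	/* conflicting types: fall back to UC */
	return prev;
}

int main(void)
{
	unsigned char type = 0xFF;		/* 0xFF means "no range matched yet" */
	unsigned char ranges[] = { MTRR_TYPE_WRBACK, MTRR_TYPE_WRTHROUGH };

	for (unsigned int i = 0; i < sizeof(ranges); i++)
		type = combine_mtrr_types(type, ranges[i]);
	printf("effective type: %u\n", type);	/* prints 4 (write-through) */
	return 0;
}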
8 ---
9 arch/x86/Kconfig | 10
10 arch/x86/ia32/ia32entry-xen.S | 14
11 arch/x86/kernel/Makefile | 5
12 arch/x86/kernel/acpi/Makefile | 2
13 arch/x86/kernel/acpi/boot.c | 8
14 arch/x86/kernel/acpi/sleep-xen.c | 87 +
15 arch/x86/kernel/cpu/common-xen.c | 158 +--
16 arch/x86/kernel/cpu/mtrr/main-xen.c | 138 +++
17 arch/x86/kernel/e820_32-xen.c | 32
18 arch/x86/kernel/e820_64-xen.c | 197 +++-
19 arch/x86/kernel/early_printk-xen.c | 24
20 arch/x86/kernel/entry_32-xen.S | 44
21 arch/x86/kernel/entry_64-xen.S | 8
22 arch/x86/kernel/genapic_64-xen.c | 55 +
23 arch/x86/kernel/genapic_xen_64.c | 4
24 arch/x86/kernel/head64-xen.c | 101 +-
25 arch/x86/kernel/head_32-xen.S | 2
26 arch/x86/kernel/init_task-xen.c | 1
27 arch/x86/kernel/io_apic_32-xen.c | 155 +--
28 arch/x86/kernel/io_apic_64-xen.c | 67 -
29 arch/x86/kernel/ipi-xen.c | 232 +++++
30 arch/x86/kernel/irq_32-xen.c | 6
31 arch/x86/kernel/machine_kexec_64.c | 2
32 arch/x86/kernel/microcode-xen.c | 2
33 arch/x86/kernel/mmconf-fam10h_64.c | 10
34 arch/x86/kernel/mpparse-xen.c | 1104 ++++++++++++++++++++++++
35 arch/x86/kernel/mpparse_32-xen.c | 1161 --------------------------
36 arch/x86/kernel/mpparse_64-xen.c | 879 -------------------
37 arch/x86/kernel/pci-dma-xen.c | 735 +++++++++-------
38 arch/x86/kernel/pci-nommu-xen.c | 103 ++
39 arch/x86/kernel/process-xen.c | 188 ++++
40 arch/x86/kernel/process_32-xen.c | 146 +--
41 arch/x86/kernel/process_64-xen.c | 165 ++-
42 arch/x86/kernel/setup-xen.c | 141 +++
43 arch/x86/kernel/setup64-xen.c | 103 --
44 arch/x86/kernel/setup_32-xen.c | 127 ++
45 arch/x86/kernel/setup_64-xen.c | 303 +++---
46 arch/x86/kernel/smp-xen.c | 329 +++++++
47 arch/x86/kernel/smp_32-xen.c | 647 --------------
48 arch/x86/kernel/smp_64-xen.c | 554 ------------
49 arch/x86/kernel/time_32-xen.c | 2
50 arch/x86/kernel/traps_32-xen.c | 592 +++++++------
51 arch/x86/kernel/traps_64-xen.c | 46 -
52 arch/x86/kernel/vsyscall_64-xen.c | 2
53 arch/x86/mm/fault-xen.c | 11
54 arch/x86/mm/highmem_32-xen.c | 1
55 arch/x86/mm/init_32-xen.c | 122 +-
56 arch/x86/mm/init_64-xen.c | 292 +++++-
57 arch/x86/mm/ioremap-xen.c | 269 ++++--
58 arch/x86/mm/pageattr-xen.c | 481 ++--------
59 arch/x86/mm/pat-xen.c | 602 +++++++++++++
60 arch/x86/mm/pgtable-xen.c | 709 +++++++++++++++
61 arch/x86/mm/pgtable_32-xen.c | 242 -----
62 arch/x86/pci/i386.c | 4
63 arch/x86/pci/irq-xen.c | 23
64 arch/x86/vdso/vdso32-setup-xen.c | 15
65 drivers/acpi/processor_core.c | 2
66 drivers/input/xen-kbdfront.c | 1
67 drivers/oprofile/cpu_buffer.c | 2
68 drivers/pci/msi-xen.c | 12
69 drivers/video/Kconfig | 2
70 drivers/video/xen-fbfront.c | 1
71 drivers/xen/Kconfig | 2
72 drivers/xen/Makefile | 8
73 drivers/xen/blkfront/blkfront.c | 4
74 drivers/xen/blkfront/block.h | 1
75 drivers/xen/blkfront/vbd.c | 58 -
76 drivers/xen/blktap/blktap.c | 27
77 drivers/xen/char/mem.c | 53 +
78 drivers/xen/console/console.c | 13
79 drivers/xen/core/machine_kexec.c | 8
80 drivers/xen/core/machine_reboot.c | 8
81 drivers/xen/core/smpboot.c | 23
82 drivers/xen/core/xen_proc.c | 2
83 drivers/xen/fbfront/xenfb.c | 24
84 drivers/xen/gntdev/gntdev.c | 8
85 drivers/xen/netfront/netfront.c | 6
86 drivers/xen/privcmd/privcmd.c | 8
87 drivers/xen/xenbus/xenbus_client.c | 6
88 drivers/xen/xenbus/xenbus_probe.c | 25
89 fs/aio.c | 15
90 include/asm-x86/dma-mapping.h | 5
91 include/asm-x86/genapic_64.h | 5
92 include/asm-x86/mach-xen/asm/desc.h | 65 -
93 include/asm-x86/mach-xen/asm/dma-mapping.h | 22
94 include/asm-x86/mach-xen/asm/dma-mapping_32.h | 141 ---
95 include/asm-x86/mach-xen/asm/dma-mapping_64.h | 205 ----
96 include/asm-x86/mach-xen/asm/fixmap.h | 8
97 include/asm-x86/mach-xen/asm/fixmap_32.h | 22
98 include/asm-x86/mach-xen/asm/fixmap_64.h | 27
99 include/asm-x86/mach-xen/asm/highmem.h | 2
100 include/asm-x86/mach-xen/asm/io.h | 17
101 include/asm-x86/mach-xen/asm/io_32.h | 156 +--
102 include/asm-x86/mach-xen/asm/io_64.h | 124 +-
103 include/asm-x86/mach-xen/asm/irqflags.h | 8
104 include/asm-x86/mach-xen/asm/mmu_context_32.h | 12
105 include/asm-x86/mach-xen/asm/mmu_context_64.h | 15
106 include/asm-x86/mach-xen/asm/page.h | 20
107 include/asm-x86/mach-xen/asm/page_64.h | 10
108 include/asm-x86/mach-xen/asm/pci.h | 11
109 include/asm-x86/mach-xen/asm/pci_64.h | 16
110 include/asm-x86/mach-xen/asm/pgalloc.h | 152 +++
111 include/asm-x86/mach-xen/asm/pgalloc_32.h | 111 --
112 include/asm-x86/mach-xen/asm/pgalloc_64.h | 179 ----
113 include/asm-x86/mach-xen/asm/pgtable-3level.h | 43
114 include/asm-x86/mach-xen/asm/pgtable.h | 292 ++++--
115 include/asm-x86/mach-xen/asm/pgtable_32.h | 107 +-
116 include/asm-x86/mach-xen/asm/pgtable_64.h | 156 +--
117 include/asm-x86/mach-xen/asm/processor.h | 688 ++++++++-------
118 include/asm-x86/mach-xen/asm/segment.h | 3
119 include/asm-x86/mach-xen/asm/smp.h | 228 +++++
120 include/asm-x86/mach-xen/asm/smp_32.h | 178 ---
121 include/asm-x86/mach-xen/asm/smp_64.h | 103 --
122 include/asm-x86/mach-xen/asm/spinlock.h | 18
123 include/asm-x86/mach-xen/asm/swiotlb.h | 13
124 include/asm-x86/mach-xen/asm/swiotlb_32.h | 43
125 include/asm-x86/mach-xen/asm/system.h | 107 +-
126 include/asm-x86/mach-xen/asm/tlbflush.h | 3
127 include/asm-x86/mach-xen/asm/vga.h | 4
128 include/asm-x86/mach-xen/asm/xor_64.h | 294 +++---
129 include/asm-x86/scatterlist.h | 2
130 include/linux/page-flags.h | 31
131 include/xen/balloon.h | 10
132 include/xen/interface/grant_table.h | 7
133 include/xen/interface/io/fbif.h | 5
134 include/xen/interface/memory.h | 17
135 include/xen/interface/vcpu.h | 4
136 lib/swiotlb-xen.c | 236 ++---
137 128 files changed, 8046 insertions(+), 7660 deletions(-)
138
139 --- a/arch/x86/ia32/ia32entry-xen.S
140 +++ b/arch/x86/ia32/ia32entry-xen.S
141 @@ -129,12 +129,14 @@ sysenter_tracesys:
142 SAVE_REST
143 CLEAR_RREGS
144 movq %r9,R9(%rsp)
145 - movq $-ENOSYS,RAX(%rsp) /* really needed? */
146 + movq $-ENOSYS,RAX(%rsp)/* ptrace can change this for a bad syscall */
147 movq %rsp,%rdi /* &pt_regs -> arg1 */
148 call syscall_trace_enter
149 LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */
150 RESTORE_REST
151 xchgl %ebp,%r9d
152 + cmpl $(IA32_NR_syscalls-1),%eax
153 + ja int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */
154 jmp sysenter_do_call
155 CFI_ENDPROC
156 ENDPROC(ia32_sysenter_target)
157 @@ -200,13 +202,15 @@ cstar_tracesys:
158 SAVE_REST
159 CLEAR_RREGS
160 movq %r9,R9(%rsp)
161 - movq $-ENOSYS,RAX(%rsp) /* really needed? */
162 + movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
163 movq %rsp,%rdi /* &pt_regs -> arg1 */
164 call syscall_trace_enter
165 LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */
166 RESTORE_REST
167 xchgl %ebp,%r9d
168 movl RSP-ARGOFFSET(%rsp), %r8d
169 + cmpl $(IA32_NR_syscalls-1),%eax
170 + ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */
171 jmp cstar_do_call
172 END(ia32_cstar_target)
173
174 @@ -264,7 +268,7 @@ ENTRY(ia32_syscall)
175 jnz ia32_tracesys
176 ia32_do_syscall:
177 cmpl $(IA32_NR_syscalls-1),%eax
178 - ja ia32_badsys
179 + ja int_ret_from_sys_call /* ia32_tracesys has set RAX(%rsp) */
180 IA32_ARG_FIXUP
181 call *ia32_sys_call_table(,%rax,8) # xxx: rip relative
182 ia32_sysret:
183 @@ -274,7 +278,7 @@ ia32_sysret:
184 ia32_tracesys:
185 SAVE_REST
186 CLEAR_RREGS
187 - movq $-ENOSYS,RAX(%rsp) /* really needed? */
188 + movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
189 movq %rsp,%rdi /* &pt_regs -> arg1 */
190 call syscall_trace_enter
191 LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */
192 @@ -365,7 +369,7 @@ ia32_sys_call_table:
193 .quad sys_setuid16
194 .quad sys_getuid16
195 .quad compat_sys_stime /* stime */ /* 25 */
196 - .quad sys32_ptrace /* ptrace */
197 + .quad compat_sys_ptrace /* ptrace */
198 .quad sys_alarm
199 .quad sys_fstat /* (old)fstat */
200 .quad sys_pause
201 --- a/arch/x86/Kconfig
202 +++ b/arch/x86/Kconfig
203 @@ -28,6 +28,6 @@ config X86
204 select HAVE_DYNAMIC_FTRACE
205 select HAVE_FTRACE
206 select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) && !XEN
207 - select HAVE_ARCH_KGDB if !X86_VOYAGER
208 + select HAVE_ARCH_KGDB if !X86_VOYAGER && !XEN
209 select HAVE_GENERIC_DMA_COHERENT if X86_32
210 select HAVE_EFFICIENT_UNALIGNED_ACCESS
211 @@ -482,6 +482,7 @@ config PARAVIRT_DEBUG
212
213 config MEMTEST
214 bool "Memtest"
215 + depends on !XEN
216 help
217 This option adds a kernel parameter 'memtest', which allows memtest
218 to be set.
219 @@ -1345,8 +1346,7 @@ source kernel/Kconfig.hz
220
221 config KEXEC
222 bool "kexec system call"
223 - depends on X86_BIOS_REBOOT
224 - depends on !XEN_UNPRIVILEGED_GUEST
225 + depends on X86_BIOS_REBOOT || (XEN && !XEN_UNPRIVILEGED_GUEST)
226 help
227 kexec is a system call that implements the ability to shutdown your
228 current kernel, and to start another kernel. It is like a reboot
229 @@ -1944,6 +1944,4 @@ source "crypto/Kconfig"
230
231 source "arch/x86/kvm/Kconfig"
232
233 -source "drivers/xen/Kconfig"
234 -
235 source "lib/Kconfig"
236 --- a/arch/x86/kernel/acpi/boot.c
237 +++ b/arch/x86/kernel/acpi/boot.c
238 @@ -251,19 +251,23 @@ static int __init acpi_parse_madt(struct
239
240 static void __cpuinit acpi_register_lapic(int id, u8 enabled)
241 {
242 +#ifndef CONFIG_XEN
243 unsigned int ver = 0;
244 +#endif
245
246 if (!enabled) {
247 ++disabled_cpus;
248 return;
249 }
250
251 +#ifndef CONFIG_XEN
252 #ifdef CONFIG_X86_32
253 if (boot_cpu_physical_apicid != -1U)
254 ver = apic_version[boot_cpu_physical_apicid];
255 #endif
256
257 generic_processor_info(id, ver);
258 +#endif
259 }
260
261 static int __init
262 @@ -774,6 +778,7 @@ static int __init acpi_parse_fadt(struct
263 * returns 0 on success, < 0 on error
264 */
265
266 +#ifndef CONFIG_XEN
267 static void __init acpi_register_lapic_address(unsigned long address)
268 {
269 mp_lapic_addr = address;
270 @@ -787,6 +792,9 @@ static void __init acpi_register_lapic_a
271 #endif
272 }
273 }
274 +#else
275 +#define acpi_register_lapic_address(address)
276 +#endif
277
278 static int __init early_acpi_parse_madt_lapic_addr_ovr(void)
279 {
280 --- a/arch/x86/kernel/acpi/Makefile
281 +++ b/arch/x86/kernel/acpi/Makefile
282 @@ -15,4 +15,4 @@ $(obj)/wakeup_rm.o: $(obj)/realmode/w
283 $(obj)/realmode/wakeup.bin: FORCE
284 $(Q)$(MAKE) $(build)=$(obj)/realmode
285
286 -disabled-obj-$(CONFIG_XEN) := cstate.o wakeup_$(BITS).o
287 +disabled-obj-$(CONFIG_XEN) := cstate.o wakeup_%.o
288 --- a/arch/x86/kernel/acpi/sleep-xen.c
289 +++ b/arch/x86/kernel/acpi/sleep-xen.c
290 @@ -10,15 +10,19 @@
291 #include <linux/dmi.h>
292 #include <linux/cpumask.h>
293
294 -#include <asm/smp.h>
295 +#include "realmode/wakeup.h"
296 +#include "sleep.h"
297
298 #ifndef CONFIG_ACPI_PV_SLEEP
299 -/* address in low memory of the wakeup routine. */
300 -unsigned long acpi_wakeup_address = 0;
301 +unsigned long acpi_wakeup_address;
302 unsigned long acpi_realmode_flags;
303 -extern char wakeup_start, wakeup_end;
304
305 -extern unsigned long acpi_copy_wakeup_routine(unsigned long);
306 +/* address in low memory of the wakeup routine. */
307 +static unsigned long acpi_realmode;
308 +
309 +#ifdef CONFIG_64BIT
310 +static char temp_stack[10240];
311 +#endif
312 #endif
313
314 /**
315 @@ -26,17 +30,69 @@ extern unsigned long acpi_copy_wakeup_ro
316 *
317 * Create an identity mapped page table and copy the wakeup routine to
318 * low memory.
319 + *
320 + * Note that this is too late to change acpi_wakeup_address.
321 */
322 int acpi_save_state_mem(void)
323 {
324 #ifndef CONFIG_ACPI_PV_SLEEP
325 - if (!acpi_wakeup_address) {
326 - printk(KERN_ERR "Could not allocate memory during boot, S3 disabled\n");
327 + struct wakeup_header *header;
328 +
329 + if (!acpi_realmode) {
330 + printk(KERN_ERR "Could not allocate memory during boot, "
331 + "S3 disabled\n");
332 return -ENOMEM;
333 }
334 - memcpy((void *)acpi_wakeup_address, &wakeup_start,
335 - &wakeup_end - &wakeup_start);
336 - acpi_copy_wakeup_routine(acpi_wakeup_address);
337 + memcpy((void *)acpi_realmode, &wakeup_code_start, WAKEUP_SIZE);
338 +
339 + header = (struct wakeup_header *)(acpi_realmode + HEADER_OFFSET);
340 + if (header->signature != 0x51ee1111) {
341 + printk(KERN_ERR "wakeup header does not match\n");
342 + return -EINVAL;
343 + }
344 +
345 + header->video_mode = saved_video_mode;
346 +
347 + header->wakeup_jmp_seg = acpi_wakeup_address >> 4;
348 + /* GDT[0]: GDT self-pointer */
349 + header->wakeup_gdt[0] =
350 + (u64)(sizeof(header->wakeup_gdt) - 1) +
351 + ((u64)(acpi_wakeup_address +
352 + ((char *)&header->wakeup_gdt - (char *)acpi_realmode))
353 + << 16);
354 + /* GDT[1]: real-mode-like code segment */
355 + header->wakeup_gdt[1] = (0x009bULL << 40) +
356 + ((u64)acpi_wakeup_address << 16) + 0xffff;
357 + /* GDT[2]: real-mode-like data segment */
358 + header->wakeup_gdt[2] = (0x0093ULL << 40) +
359 + ((u64)acpi_wakeup_address << 16) + 0xffff;
360 +
361 +#ifndef CONFIG_64BIT
362 + store_gdt((struct desc_ptr *)&header->pmode_gdt);
363 +
364 + header->pmode_efer_low = nx_enabled;
365 + if (header->pmode_efer_low & 1) {
366 + /* This is strange, why not save efer, always? */
367 + rdmsr(MSR_EFER, header->pmode_efer_low,
368 + header->pmode_efer_high);
369 + }
370 +#endif /* !CONFIG_64BIT */
371 +
372 + header->pmode_cr0 = read_cr0();
373 + header->pmode_cr4 = read_cr4();
374 + header->realmode_flags = acpi_realmode_flags;
375 + header->real_magic = 0x12345678;
376 +
377 +#ifndef CONFIG_64BIT
378 + header->pmode_entry = (u32)&wakeup_pmode_return;
379 + header->pmode_cr3 = (u32)(swsusp_pg_dir - __PAGE_OFFSET);
380 + saved_magic = 0x12345678;
381 +#else /* CONFIG_64BIT */
382 + header->trampoline_segment = setup_trampoline() >> 4;
383 + init_rsp = (unsigned long)temp_stack + 4096;
384 + initial_code = (unsigned long)wakeup_long64;
385 + saved_magic = 0x123456789abcdef0;
386 +#endif /* CONFIG_64BIT */
387 #endif
388
389 return 0;
390 @@ -61,15 +117,20 @@ void acpi_restore_state_mem(void)
391 void __init acpi_reserve_bootmem(void)
392 {
393 #ifndef CONFIG_ACPI_PV_SLEEP
394 - if ((&wakeup_end - &wakeup_start) > PAGE_SIZE*2) {
395 + if ((&wakeup_code_end - &wakeup_code_start) > WAKEUP_SIZE) {
396 printk(KERN_ERR
397 "ACPI: Wakeup code way too big, S3 disabled.\n");
398 return;
399 }
400
401 - acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE*2);
402 - if (!acpi_wakeup_address)
403 + acpi_realmode = (unsigned long)alloc_bootmem_low(WAKEUP_SIZE);
404 +
405 + if (!acpi_realmode) {
406 printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n");
407 + return;
408 + }
409 +
410 + acpi_wakeup_address = virt_to_phys((void *)acpi_realmode);
411 #endif
412 }
413
414 --- a/arch/x86/kernel/cpu/common-xen.c
415 +++ b/arch/x86/kernel/cpu/common-xen.c
416 @@ -5,7 +5,6 @@
417 #include <linux/module.h>
418 #include <linux/percpu.h>
419 #include <linux/bootmem.h>
420 -#include <asm/semaphore.h>
421 #include <asm/processor.h>
422 #include <asm/i387.h>
423 #include <asm/msr.h>
424 @@ -13,6 +12,7 @@
425 #include <asm/mmu_context.h>
426 #include <asm/mtrr.h>
427 #include <asm/mce.h>
428 +#include <asm/pat.h>
429 #ifdef CONFIG_X86_LOCAL_APIC
430 #include <asm/mpspec.h>
431 #include <asm/apic.h>
432 @@ -69,9 +69,9 @@ __u32 cleared_cpu_caps[NCAPINTS] __cpuin
433 static int cachesize_override __cpuinitdata = -1;
434 static int disable_x86_serial_nr __cpuinitdata = 1;
435
436 -struct cpu_dev * cpu_devs[X86_VENDOR_NUM] = {};
437 +struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
438
439 -static void __cpuinit default_init(struct cpuinfo_x86 * c)
440 +static void __cpuinit default_init(struct cpuinfo_x86 *c)
441 {
442 /* Not much we can do here... */
443 /* Check if at least it has cpuid */
444 @@ -88,11 +88,11 @@ static struct cpu_dev __cpuinitdata defa
445 .c_init = default_init,
446 .c_vendor = "Unknown",
447 };
448 -static struct cpu_dev * this_cpu __cpuinitdata = &default_cpu;
449 +static struct cpu_dev *this_cpu __cpuinitdata = &default_cpu;
450
451 static int __init cachesize_setup(char *str)
452 {
453 - get_option (&str, &cachesize_override);
454 + get_option(&str, &cachesize_override);
455 return 1;
456 }
457 __setup("cachesize=", cachesize_setup);
458 @@ -114,12 +114,12 @@ int __cpuinit get_model_name(struct cpui
459 /* Intel chips right-justify this string for some dumb reason;
460 undo that brain damage */
461 p = q = &c->x86_model_id[0];
462 - while ( *p == ' ' )
463 + while (*p == ' ')
464 p++;
465 - if ( p != q ) {
466 - while ( *p )
467 + if (p != q) {
468 + while (*p)
469 *q++ = *p++;
470 - while ( q <= &c->x86_model_id[48] )
471 + while (q <= &c->x86_model_id[48])
472 *q++ = '\0'; /* Zero-pad the rest */
473 }
474
475 @@ -137,7 +137,7 @@ void __cpuinit display_cacheinfo(struct
476 cpuid(0x80000005, &dummy, &dummy, &ecx, &edx);
477 printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
478 edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
479 - c->x86_cache_size=(ecx>>24)+(edx>>24);
480 + c->x86_cache_size = (ecx>>24)+(edx>>24);
481 }
482
483 if (n < 0x80000006) /* Some chips just has a large L1. */
484 @@ -145,16 +145,16 @@ void __cpuinit display_cacheinfo(struct
485
486 ecx = cpuid_ecx(0x80000006);
487 l2size = ecx >> 16;
488 -
489 +
490 /* do processor-specific cache resizing */
491 if (this_cpu->c_size_cache)
492 - l2size = this_cpu->c_size_cache(c,l2size);
493 + l2size = this_cpu->c_size_cache(c, l2size);
494
495 /* Allow user to override all this if necessary. */
496 if (cachesize_override != -1)
497 l2size = cachesize_override;
498
499 - if ( l2size == 0 )
500 + if (l2size == 0)
501 return; /* Again, no L2 cache is possible */
502
503 c->x86_cache_size = l2size;
504 @@ -163,16 +163,19 @@ void __cpuinit display_cacheinfo(struct
505 l2size, ecx & 0xFF);
506 }
507
508 -/* Naming convention should be: <Name> [(<Codename>)] */
509 -/* This table only is used unless init_<vendor>() below doesn't set it; */
510 -/* in particular, if CPUID levels 0x80000002..4 are supported, this isn't used */
511 +/*
512 + * Naming convention should be: <Name> [(<Codename>)]
513 + * This table only is used unless init_<vendor>() below doesn't set it;
514 + * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used
515 + *
516 + */
517
518 /* Look up CPU names by table lookup. */
519 static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c)
520 {
521 struct cpu_model_info *info;
522
523 - if ( c->x86_model >= 16 )
524 + if (c->x86_model >= 16)
525 return NULL; /* Range check */
526
527 if (!this_cpu)
528 @@ -197,9 +200,9 @@ static void __cpuinit get_cpu_vendor(str
529
530 for (i = 0; i < X86_VENDOR_NUM; i++) {
531 if (cpu_devs[i]) {
532 - if (!strcmp(v,cpu_devs[i]->c_ident[0]) ||
533 - (cpu_devs[i]->c_ident[1] &&
534 - !strcmp(v,cpu_devs[i]->c_ident[1]))) {
535 + if (!strcmp(v, cpu_devs[i]->c_ident[0]) ||
536 + (cpu_devs[i]->c_ident[1] &&
537 + !strcmp(v, cpu_devs[i]->c_ident[1]))) {
538 c->x86_vendor = i;
539 if (!early)
540 this_cpu = cpu_devs[i];
541 @@ -217,7 +220,7 @@ static void __cpuinit get_cpu_vendor(str
542 }
543
544
545 -static int __init x86_fxsr_setup(char * s)
546 +static int __init x86_fxsr_setup(char *s)
547 {
548 setup_clear_cpu_cap(X86_FEATURE_FXSR);
549 setup_clear_cpu_cap(X86_FEATURE_XMM);
550 @@ -226,7 +229,7 @@ static int __init x86_fxsr_setup(char *
551 __setup("nofxsr", x86_fxsr_setup);
552
553
554 -static int __init x86_sep_setup(char * s)
555 +static int __init x86_sep_setup(char *s)
556 {
557 setup_clear_cpu_cap(X86_FEATURE_SEP);
558 return 1;
559 @@ -315,12 +318,15 @@ static void __cpuinit early_get_cap(stru
560
561 }
562
563 -/* Do minimum CPU detection early.
564 - Fields really needed: vendor, cpuid_level, family, model, mask, cache alignment.
565 - The others are not touched to avoid unwanted side effects.
566 -
567 - WARNING: this function is only called on the BP. Don't add code here
568 - that is supposed to run on all CPUs. */
569 +/*
570 + * Do minimum CPU detection early.
571 + * Fields really needed: vendor, cpuid_level, family, model, mask,
572 + * cache alignment.
573 + * The others are not touched to avoid unwanted side effects.
574 + *
575 + * WARNING: this function is only called on the BP. Don't add code here
576 + * that is supposed to run on all CPUs.
577 + */
578 static void __init early_cpu_detect(void)
579 {
580 struct cpuinfo_x86 *c = &boot_cpu_data;
581 @@ -335,19 +341,14 @@ static void __init early_cpu_detect(void
582
583 get_cpu_vendor(c, 1);
584
585 - switch (c->x86_vendor) {
586 - case X86_VENDOR_AMD:
587 - early_init_amd(c);
588 - break;
589 - case X86_VENDOR_INTEL:
590 - early_init_intel(c);
591 - break;
592 - }
593 + if (c->x86_vendor != X86_VENDOR_UNKNOWN &&
594 + cpu_devs[c->x86_vendor]->c_early_init)
595 + cpu_devs[c->x86_vendor]->c_early_init(c);
596
597 early_get_cap(c);
598 }
599
600 -static void __cpuinit generic_identify(struct cpuinfo_x86 * c)
601 +static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
602 {
603 u32 tfms, xlvl;
604 unsigned int ebx;
605 @@ -358,13 +359,12 @@ static void __cpuinit generic_identify(s
606 (unsigned int *)&c->x86_vendor_id[0],
607 (unsigned int *)&c->x86_vendor_id[8],
608 (unsigned int *)&c->x86_vendor_id[4]);
609 -
610 +
611 get_cpu_vendor(c, 0);
612 /* Initialize the standard set of capabilities */
613 /* Note that the vendor-specific code below might override */
614 -
615 /* Intel-defined flags: level 0x00000001 */
616 - if ( c->cpuid_level >= 0x00000001 ) {
617 + if (c->cpuid_level >= 0x00000001) {
618 u32 capability, excap;
619 cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
620 c->x86_capability[0] = capability;
621 @@ -376,12 +376,14 @@ static void __cpuinit generic_identify(s
622 if (c->x86 >= 0x6)
623 c->x86_model += ((tfms >> 16) & 0xF) << 4;
624 c->x86_mask = tfms & 15;
625 + c->initial_apicid = (ebx >> 24) & 0xFF;
626 #ifdef CONFIG_X86_HT
627 - c->apicid = phys_pkg_id((ebx >> 24) & 0xFF, 0);
628 + c->apicid = phys_pkg_id(c->initial_apicid, 0);
629 + c->phys_proc_id = c->initial_apicid;
630 #else
631 - c->apicid = (ebx >> 24) & 0xFF;
632 + c->apicid = c->initial_apicid;
633 #endif
634 - if (c->x86_capability[0] & (1<<19))
635 + if (test_cpu_cap(c, X86_FEATURE_CLFLSH))
636 c->x86_clflush_size = ((ebx >> 8) & 0xff) * 8;
637 } else {
638 /* Have CPUID level 0 only - unheard of */
639 @@ -390,33 +392,30 @@ static void __cpuinit generic_identify(s
640
641 /* AMD-defined flags: level 0x80000001 */
642 xlvl = cpuid_eax(0x80000000);
643 - if ( (xlvl & 0xffff0000) == 0x80000000 ) {
644 - if ( xlvl >= 0x80000001 ) {
645 + if ((xlvl & 0xffff0000) == 0x80000000) {
646 + if (xlvl >= 0x80000001) {
647 c->x86_capability[1] = cpuid_edx(0x80000001);
648 c->x86_capability[6] = cpuid_ecx(0x80000001);
649 }
650 - if ( xlvl >= 0x80000004 )
651 + if (xlvl >= 0x80000004)
652 get_model_name(c); /* Default name */
653 }
654
655 init_scattered_cpuid_features(c);
656 }
657
658 -#ifdef CONFIG_X86_HT
659 - c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff;
660 -#endif
661 }
662
663 static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
664 {
665 - if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr ) {
666 + if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr) {
667 /* Disable processor serial number */
668 - unsigned long lo,hi;
669 - rdmsr(MSR_IA32_BBL_CR_CTL,lo,hi);
670 + unsigned long lo, hi;
671 + rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
672 lo |= 0x200000;
673 - wrmsr(MSR_IA32_BBL_CR_CTL,lo,hi);
674 + wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
675 printk(KERN_NOTICE "CPU serial number disabled.\n");
676 - clear_bit(X86_FEATURE_PN, c->x86_capability);
677 + clear_cpu_cap(c, X86_FEATURE_PN);
678
679 /* Disabling the serial number may affect the cpuid level */
680 c->cpuid_level = cpuid_eax(0);
681 @@ -451,9 +450,11 @@ void __cpuinit identify_cpu(struct cpuin
682 memset(&c->x86_capability, 0, sizeof c->x86_capability);
683
684 if (!have_cpuid_p()) {
685 - /* First of all, decide if this is a 486 or higher */
686 - /* It's a 486 if we can modify the AC flag */
687 - if ( flag_is_changeable_p(X86_EFLAGS_AC) )
688 + /*
689 + * First of all, decide if this is a 486 or higher
690 + * It's a 486 if we can modify the AC flag
691 + */
692 + if (flag_is_changeable_p(X86_EFLAGS_AC))
693 c->x86 = 4;
694 else
695 c->x86 = 3;
696 @@ -486,10 +487,10 @@ void __cpuinit identify_cpu(struct cpuin
697 */
698
699 /* If the model name is still unset, do table lookup. */
700 - if ( !c->x86_model_id[0] ) {
701 + if (!c->x86_model_id[0]) {
702 char *p;
703 p = table_lookup_model(c);
704 - if ( p )
705 + if (p)
706 strcpy(c->x86_model_id, p);
707 else
708 /* Last resort... */
709 @@ -503,9 +504,9 @@ void __cpuinit identify_cpu(struct cpuin
710 * common between the CPUs. The first time this routine gets
711 * executed, c == &boot_cpu_data.
712 */
713 - if ( c != &boot_cpu_data ) {
714 + if (c != &boot_cpu_data) {
715 /* AND the already accumulated flags with these */
716 - for ( i = 0 ; i < NCAPINTS ; i++ )
717 + for (i = 0 ; i < NCAPINTS ; i++)
718 boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
719 }
720
721 @@ -549,7 +550,7 @@ void __cpuinit detect_ht(struct cpuinfo_
722
723 if (smp_num_siblings == 1) {
724 printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
725 - } else if (smp_num_siblings > 1 ) {
726 + } else if (smp_num_siblings > 1) {
727
728 if (smp_num_siblings > NR_CPUS) {
729 printk(KERN_WARNING "CPU: Unsupported number of the "
730 @@ -559,7 +560,7 @@ void __cpuinit detect_ht(struct cpuinfo_
731 }
732
733 index_msb = get_count_order(smp_num_siblings);
734 - c->phys_proc_id = phys_pkg_id((ebx >> 24) & 0xFF, index_msb);
735 + c->phys_proc_id = phys_pkg_id(c->initial_apicid, index_msb);
736
737 printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
738 c->phys_proc_id);
739 @@ -570,7 +571,7 @@ void __cpuinit detect_ht(struct cpuinfo_
740
741 core_bits = get_count_order(c->x86_max_cores);
742
743 - c->cpu_core_id = phys_pkg_id((ebx >> 24) & 0xFF, index_msb) &
744 + c->cpu_core_id = phys_pkg_id(c->initial_apicid, index_msb) &
745 ((1 << core_bits) - 1);
746
747 if (c->x86_max_cores > 1)
748 @@ -604,7 +605,7 @@ void __cpuinit print_cpu_info(struct cpu
749 else
750 printk("%s", c->x86_model_id);
751
752 - if (c->x86_mask || c->cpuid_level >= 0)
753 + if (c->x86_mask || c->cpuid_level >= 0)
754 printk(" stepping %02x\n", c->x86_mask);
755 else
756 printk("\n");
757 @@ -623,24 +624,17 @@ __setup("clearcpuid=", setup_disablecpui
758
759 cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
760
761 -/* This is hacky. :)
762 - * We're emulating future behavior.
763 - * In the future, the cpu-specific init functions will be called implicitly
764 - * via the magic of initcalls.
765 - * They will insert themselves into the cpu_devs structure.
766 - * Then, when cpu_init() is called, we can just iterate over that array.
767 - */
768 void __init early_cpu_init(void)
769 {
770 - intel_cpu_init();
771 - cyrix_init_cpu();
772 - nsc_init_cpu();
773 - amd_init_cpu();
774 - centaur_init_cpu();
775 - transmeta_init_cpu();
776 - nexgen_init_cpu();
777 - umc_init_cpu();
778 + struct cpu_vendor_dev *cvdev;
779 +
780 + for (cvdev = __x86cpuvendor_start ;
781 + cvdev < __x86cpuvendor_end ;
782 + cvdev++)
783 + cpu_devs[cvdev->vendor] = cvdev->cpu_dev;
784 +
785 early_cpu_detect();
786 + validate_pat_support(&boot_cpu_data);
787 }
788
789 /* Make sure %fs is initialized properly in idle threads */
790 @@ -685,7 +679,7 @@ void __cpuinit cpu_init(void)
791 int cpu = smp_processor_id();
792 struct task_struct *curr = current;
793 #ifndef CONFIG_X86_NO_TSS
794 - struct tss_struct * t = &per_cpu(init_tss, cpu);
795 + struct tss_struct *t = &per_cpu(init_tss, cpu);
796 #endif
797 struct thread_struct *thread = &curr->thread;
798
799 @@ -738,7 +732,7 @@ void __cpuinit cpu_init(void)
800 mxcsr_feature_mask_init();
801 }
802
803 -#ifdef CONFIG_HOTPLUG_CPU
804 +#if defined(CONFIG_HOTPLUG_CPU) && !defined(CONFIG_XEN)
805 void __cpuinit cpu_uninit(void)
806 {
807 int cpu = raw_smp_processor_id();
808 --- a/arch/x86/kernel/cpu/mtrr/main-xen.c
809 +++ b/arch/x86/kernel/cpu/mtrr/main-xen.c
810 @@ -35,6 +35,8 @@ struct mtrr_ops *mtrr_if = &generic_mtrr
811 unsigned int num_var_ranges;
812 unsigned int mtrr_usage_table[MAX_VAR_RANGES];
813
814 +static u64 tom2;
815 +
816 static void __init set_num_var_ranges(void)
817 {
818 struct xen_platform_op op;
819 @@ -162,8 +164,144 @@ mtrr_del(int reg, unsigned long base, un
820 EXPORT_SYMBOL(mtrr_add);
821 EXPORT_SYMBOL(mtrr_del);
822
823 +/*
824 + * Returns the effective MTRR type for the region
825 + * Error returns:
826 + * - 0xFE - when the range is "not entirely covered" by _any_ var range MTRR
827 + * - 0xFF - when MTRR is not enabled
828 + */
829 +u8 mtrr_type_lookup(u64 start, u64 end)
830 +{
831 + int i, error;
832 + u64 start_mfn, end_mfn, base_mfn, top_mfn;
833 + u8 prev_match, curr_match;
834 + struct xen_platform_op op;
835 +
836 + if (!is_initial_xendomain())
837 + return MTRR_TYPE_WRBACK;
838 +
839 + if (!num_var_ranges)
840 + return 0xFF;
841 +
842 + start_mfn = start >> PAGE_SHIFT;
843 + /* Make end inclusive end, instead of exclusive */
844 + end_mfn = --end >> PAGE_SHIFT;
845 +
846 + /* Look in fixed ranges. Just return the type as per start */
847 + if (start_mfn < 0x100) {
848 +#if 0//todo
849 + op.cmd = XENPF_read_memtype;
850 + op.u.read_memtype.reg = ???;
851 + error = HYPERVISOR_platform_op(&op);
852 + if (!error)
853 + return op.u.read_memtype.type;
854 +#endif
855 + return MTRR_TYPE_UNCACHABLE;
856 + }
857 +
858 + /*
859 + * Look in variable ranges
860 + * Look of multiple ranges matching this address and pick type
861 + * as per MTRR precedence
862 + */
863 + prev_match = 0xFF;
864 + for (i = 0; i < num_var_ranges; ++i) {
865 + op.cmd = XENPF_read_memtype;
866 + op.u.read_memtype.reg = i;
867 + error = HYPERVISOR_platform_op(&op);
868 +
869 + if (error || !op.u.read_memtype.nr_mfns)
870 + continue;
871 +
872 + base_mfn = op.u.read_memtype.mfn;
873 + top_mfn = base_mfn + op.u.read_memtype.nr_mfns - 1;
874 +
875 + if (base_mfn > end_mfn || start_mfn > top_mfn) {
876 + continue;
877 + }
878 +
879 + if (base_mfn > start_mfn || end_mfn > top_mfn) {
880 + return 0xFE;
881 + }
882 +
883 + curr_match = op.u.read_memtype.type;
884 + if (prev_match == 0xFF) {
885 + prev_match = curr_match;
886 + continue;
887 + }
888 +
889 + if (prev_match == MTRR_TYPE_UNCACHABLE ||
890 + curr_match == MTRR_TYPE_UNCACHABLE) {
891 + return MTRR_TYPE_UNCACHABLE;
892 + }
893 +
894 + if ((prev_match == MTRR_TYPE_WRBACK &&
895 + curr_match == MTRR_TYPE_WRTHROUGH) ||
896 + (prev_match == MTRR_TYPE_WRTHROUGH &&
897 + curr_match == MTRR_TYPE_WRBACK)) {
898 + prev_match = MTRR_TYPE_WRTHROUGH;
899 + curr_match = MTRR_TYPE_WRTHROUGH;
900 + }
901 +
902 + if (prev_match != curr_match) {
903 + return MTRR_TYPE_UNCACHABLE;
904 + }
905 + }
906 +
907 + if (tom2) {
908 + if (start >= (1ULL<<32) && (end < tom2))
909 + return MTRR_TYPE_WRBACK;
910 + }
911 +
912 + if (prev_match != 0xFF)
913 + return prev_match;
914 +
915 +#if 0//todo
916 + op.cmd = XENPF_read_def_memtype;
917 + error = HYPERVISOR_platform_op(&op);
918 + if (!error)
919 + return op.u.read_def_memtype.type;
920 +#endif
921 + return MTRR_TYPE_UNCACHABLE;
922 +}
923 +
924 +/*
925 + * Newer AMD K8s and later CPUs have a special magic MSR way to force WB
926 + * for memory >4GB. Check for that here.
927 + * Note this won't check if the MTRRs < 4GB where the magic bit doesn't
928 + * apply to are wrong, but so far we don't know of any such case in the wild.
929 + */
930 +#define Tom2Enabled (1U << 21)
931 +#define Tom2ForceMemTypeWB (1U << 22)
932 +
933 +int __init amd_special_default_mtrr(void)
934 +{
935 + u32 l, h;
936 +
937 + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
938 + return 0;
939 + if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11)
940 + return 0;
941 + /* In case some hypervisor doesn't pass SYSCFG through */
942 + if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0)
943 + return 0;
944 + /*
945 + * Memory between 4GB and top of mem is forced WB by this magic bit.
946 + * Reserved before K8RevF, but should be zero there.
947 + */
948 + if ((l & (Tom2Enabled | Tom2ForceMemTypeWB)) ==
949 + (Tom2Enabled | Tom2ForceMemTypeWB))
950 + return 1;
951 + return 0;
952 +}
953 +
954 void __init mtrr_bp_init(void)
955 {
956 + if (amd_special_default_mtrr()) {
957 + /* TOP_MEM2 */
958 + rdmsrl(MSR_K8_TOP_MEM2, tom2);
959 + tom2 &= 0xffffff8000000ULL;
960 + }
961 }
962
963 void mtrr_ap_init(void)
964 --- a/arch/x86/kernel/e820_32-xen.c
965 +++ b/arch/x86/kernel/e820_32-xen.c
966 @@ -469,7 +469,7 @@ int __init sanitize_e820_map(struct e820
967 * thinkpad 560x, for example, does not cooperate with the memory
968 * detection code.)
969 */
970 -int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
971 +int __init copy_e820_map(struct e820entry *biosmap, int nr_map)
972 {
973 #ifndef CONFIG_XEN
974 /* Only one memory region (or negative)? Ignore it */
975 @@ -480,33 +480,17 @@ int __init copy_e820_map(struct e820entr
976 #endif
977
978 do {
979 - unsigned long long start = biosmap->addr;
980 - unsigned long long size = biosmap->size;
981 - unsigned long long end = start + size;
982 - unsigned long type = biosmap->type;
983 + u64 start = biosmap->addr;
984 + u64 size = biosmap->size;
985 + u64 end = start + size;
986 + u32 type = biosmap->type;
987
988 /* Overflow in 64 bits? Ignore the memory map. */
989 if (start > end)
990 return -1;
991
992 -#ifndef CONFIG_XEN
993 - /*
994 - * Some BIOSes claim RAM in the 640k - 1M region.
995 - * Not right. Fix it up.
996 - */
997 - if (type == E820_RAM) {
998 - if (start < 0x100000ULL && end > 0xA0000ULL) {
999 - if (start < 0xA0000ULL)
1000 - add_memory_region(start, 0xA0000ULL-start, type);
1001 - if (end <= 0x100000ULL)
1002 - continue;
1003 - start = 0x100000ULL;
1004 - size = end - start;
1005 - }
1006 - }
1007 -#endif
1008 add_memory_region(start, size, type);
1009 - } while (biosmap++,--nr_map);
1010 + } while (biosmap++, --nr_map);
1011
1012 #ifdef CONFIG_XEN
1013 if (is_initial_xendomain()) {
1014 @@ -528,7 +512,7 @@ int __init copy_e820_map(struct e820entr
1015 /*
1016 * Find the highest page frame number we have available
1017 */
1018 -void __init find_max_pfn(void)
1019 +void __init propagate_e820_map(void)
1020 {
1021 int i;
1022
1023 @@ -814,7 +798,7 @@ static int __init parse_memmap(char *arg
1024 * size before original memory map is
1025 * reset.
1026 */
1027 - find_max_pfn();
1028 + propagate_e820_map();
1029 saved_max_pfn = max_pfn;
1030 #endif
1031 e820.nr_map = 0;
1032 --- a/arch/x86/kernel/e820_64-xen.c
1033 +++ b/arch/x86/kernel/e820_64-xen.c
1034 @@ -40,11 +40,11 @@ struct e820map machine_e820;
1035 unsigned long end_pfn;
1036
1037 /*
1038 - * end_pfn only includes RAM, while end_pfn_map includes all e820 entries.
1039 - * The direct mapping extends to end_pfn_map, so that we can directly access
1040 + * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
1041 + * The direct mapping extends to max_pfn_mapped, so that we can directly access
1042 * apertures, ACPI and other tables without having to play with fixmaps.
1043 */
1044 -unsigned long end_pfn_map;
1045 +unsigned long max_pfn_mapped;
1046
1047 /*
1048 * Last pfn which the user wants to use.
1049 @@ -63,8 +63,8 @@ struct early_res {
1050 static struct early_res early_res[MAX_EARLY_RES] __initdata = {
1051 #ifndef CONFIG_XEN
1052 { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */
1053 -#ifdef CONFIG_SMP
1054 - { SMP_TRAMPOLINE_BASE, SMP_TRAMPOLINE_BASE + 2*PAGE_SIZE, "SMP_TRAMPOLINE" },
1055 +#ifdef CONFIG_X86_TRAMPOLINE
1056 + { TRAMPOLINE_BASE, TRAMPOLINE_BASE + 2 * PAGE_SIZE, "TRAMPOLINE" },
1057 #endif
1058 #endif
1059 {}
1060 @@ -89,19 +89,47 @@ void __init reserve_early(unsigned long
1061 strncpy(r->name, name, sizeof(r->name) - 1);
1062 }
1063
1064 -void __init early_res_to_bootmem(void)
1065 +void __init free_early(unsigned long start, unsigned long end)
1066 +{
1067 + struct early_res *r;
1068 + int i, j;
1069 +
1070 + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
1071 + r = &early_res[i];
1072 + if (start == r->start && end == r->end)
1073 + break;
1074 + }
1075 + if (i >= MAX_EARLY_RES || !early_res[i].end)
1076 + panic("free_early on not reserved area: %lx-%lx!", start, end);
1077 +
1078 + for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++)
1079 + ;
1080 +
1081 + memmove(&early_res[i], &early_res[i + 1],
1082 + (j - 1 - i) * sizeof(struct early_res));
1083 +
1084 + early_res[j - 1].end = 0;
1085 +}
1086 +
1087 +void __init early_res_to_bootmem(unsigned long start, unsigned long end)
1088 {
1089 int i;
1090 + unsigned long final_start, final_end;
1091 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
1092 struct early_res *r = &early_res[i];
1093 - printk(KERN_INFO "early res: %d [%lx-%lx] %s\n", i,
1094 - r->start, r->end - 1, r->name);
1095 - reserve_bootmem_generic(r->start, r->end - r->start);
1096 + final_start = max(start, r->start);
1097 + final_end = min(end, r->end);
1098 + if (final_start >= final_end)
1099 + continue;
1100 + printk(KERN_INFO " early res: %d [%lx-%lx] %s\n", i,
1101 + final_start, final_end - 1, r->name);
1102 + reserve_bootmem_generic(final_start, final_end - final_start);
1103 }
1104 }
1105
1106 /* Check for already reserved areas */
1107 -static inline int bad_addr(unsigned long *addrp, unsigned long size)
1108 +static inline int __init
1109 +bad_addr(unsigned long *addrp, unsigned long size, unsigned long align)
1110 {
1111 int i;
1112 unsigned long addr = *addrp, last;
1113 @@ -111,7 +139,7 @@ again:
1114 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
1115 struct early_res *r = &early_res[i];
1116 if (last >= r->start && addr < r->end) {
1117 - *addrp = addr = r->end;
1118 + *addrp = addr = round_up(r->end, align);
1119 changed = 1;
1120 goto again;
1121 }
1122 @@ -119,6 +147,40 @@ again:
1123 return changed;
1124 }
1125
1126 +/* Check for already reserved areas */
1127 +static inline int __init
1128 +bad_addr_size(unsigned long *addrp, unsigned long *sizep, unsigned long align)
1129 +{
1130 + int i;
1131 + unsigned long addr = *addrp, last;
1132 + unsigned long size = *sizep;
1133 + int changed = 0;
1134 +again:
1135 + last = addr + size;
1136 + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
1137 + struct early_res *r = &early_res[i];
1138 + if (last > r->start && addr < r->start) {
1139 + size = r->start - addr;
1140 + changed = 1;
1141 + goto again;
1142 + }
1143 + if (last > r->end && addr < r->end) {
1144 + addr = round_up(r->end, align);
1145 + size = last - addr;
1146 + changed = 1;
1147 + goto again;
1148 + }
1149 + if (last <= r->end && addr >= r->start) {
1150 + (*sizep)++;
1151 + return 0;
1152 + }
1153 + }
1154 + if (changed) {
1155 + *addrp = addr;
1156 + *sizep = size;
1157 + }
1158 + return changed;
1159 +}
1160 /*
1161 * This function checks if any part of the range <start,end> is mapped
1162 * with type.
1163 @@ -194,26 +256,27 @@ int __init e820_all_mapped(unsigned long
1164 * Find a free area with specified alignment in a specific range.
1165 */
1166 unsigned long __init find_e820_area(unsigned long start, unsigned long end,
1167 - unsigned size, unsigned long align)
1168 + unsigned long size, unsigned long align)
1169 {
1170 int i;
1171 - unsigned long mask = ~(align - 1);
1172
1173 for (i = 0; i < e820.nr_map; i++) {
1174 struct e820entry *ei = &e820.map[i];
1175 - unsigned long addr = ei->addr, last;
1176 + unsigned long addr, last;
1177 + unsigned long ei_last;
1178
1179 if (ei->type != E820_RAM)
1180 continue;
1181 + addr = round_up(ei->addr, align);
1182 + ei_last = ei->addr + ei->size;
1183 if (addr < start)
1184 - addr = start;
1185 - if (addr > ei->addr + ei->size)
1186 + addr = round_up(start, align);
1187 + if (addr >= ei_last)
1188 continue;
1189 - while (bad_addr(&addr, size) && addr+size <= ei->addr+ei->size)
1190 + while (bad_addr(&addr, size, align) && addr+size <= ei_last)
1191 ;
1192 - addr = (addr + align - 1) & mask;
1193 last = addr + size;
1194 - if (last > ei->addr + ei->size)
1195 + if (last > ei_last)
1196 continue;
1197 if (last > end)
1198 continue;
1199 @@ -223,6 +286,40 @@ unsigned long __init find_e820_area(unsi
1200 }
1201
1202 /*
1203 + * Find next free range after *start
1204 + */
1205 +unsigned long __init find_e820_area_size(unsigned long start,
1206 + unsigned long *sizep,
1207 + unsigned long align)
1208 +{
1209 + int i;
1210 +
1211 + for (i = 0; i < e820.nr_map; i++) {
1212 + struct e820entry *ei = &e820.map[i];
1213 + unsigned long addr, last;
1214 + unsigned long ei_last;
1215 +
1216 + if (ei->type != E820_RAM)
1217 + continue;
1218 + addr = round_up(ei->addr, align);
1219 + ei_last = ei->addr + ei->size;
1220 + if (addr < start)
1221 + addr = round_up(start, align);
1222 + if (addr >= ei_last)
1223 + continue;
1224 + *sizep = ei_last - addr;
1225 + while (bad_addr_size(&addr, sizep, align) &&
1226 + addr + *sizep <= ei_last)
1227 + ;
1228 + last = addr + *sizep;
1229 + if (last > ei_last)
1230 + continue;
1231 + return addr;
1232 + }
1233 + return -1UL;
1234 +
1235 +}
1236 +/*
1237 * Find the highest page frame number we have available
1238 */
1239 unsigned long __init e820_end_of_ram(void)
1240 @@ -231,31 +328,29 @@ unsigned long __init e820_end_of_ram(voi
1241
1242 end_pfn = find_max_pfn_with_active_regions();
1243
1244 - if (end_pfn > end_pfn_map)
1245 - end_pfn_map = end_pfn;
1246 - if (end_pfn_map > MAXMEM>>PAGE_SHIFT)
1247 - end_pfn_map = MAXMEM>>PAGE_SHIFT;
1248 + if (end_pfn > max_pfn_mapped)
1249 + max_pfn_mapped = end_pfn;
1250 + if (max_pfn_mapped > MAXMEM>>PAGE_SHIFT)
1251 + max_pfn_mapped = MAXMEM>>PAGE_SHIFT;
1252 if (end_pfn > end_user_pfn)
1253 end_pfn = end_user_pfn;
1254 - if (end_pfn > end_pfn_map)
1255 - end_pfn = end_pfn_map;
1256 + if (end_pfn > max_pfn_mapped)
1257 + end_pfn = max_pfn_mapped;
1258
1259 - printk(KERN_INFO "end_pfn_map = %lu\n", end_pfn_map);
1260 + printk(KERN_INFO "max_pfn_mapped = %lu\n", max_pfn_mapped);
1261 return end_pfn;
1262 }
1263
1264 /*
1265 * Mark e820 reserved areas as busy for the resource manager.
1266 */
1267 -void __init e820_reserve_resources(struct e820entry *e820, int nr_map,
1268 - struct resource *code_resource,
1269 - struct resource *data_resource,
1270 - struct resource *bss_resource)
1271 +void __init e820_reserve_resources(struct e820entry *e820, int nr_map)
1272 {
1273 int i;
1274 + struct resource *res;
1275 +
1276 + res = alloc_bootmem_low(sizeof(struct resource) * nr_map);
1277 for (i = 0; i < nr_map; i++) {
1278 - struct resource *res;
1279 - res = alloc_bootmem_low(sizeof(struct resource));
1280 switch (e820[i].type) {
1281 case E820_RAM: res->name = "System RAM"; break;
1282 case E820_ACPI: res->name = "ACPI Tables"; break;
1283 @@ -265,26 +360,8 @@ void __init e820_reserve_resources(struc
1284 res->start = e820[i].addr;
1285 res->end = res->start + e820[i].size - 1;
1286 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
1287 - request_resource(&iomem_resource, res);
1288 - if (e820[i].type == E820_RAM) {
1289 - /*
1290 - * We don't know which RAM region contains kernel data,
1291 - * so we try it repeatedly and let the resource manager
1292 - * test it.
1293 - */
1294 -#ifndef CONFIG_XEN
1295 - request_resource(res, code_resource);
1296 - request_resource(res, data_resource);
1297 - request_resource(res, bss_resource);
1298 -#endif
1299 -#ifdef CONFIG_KEXEC
1300 - if (crashk_res.start != crashk_res.end)
1301 - request_resource(res, &crashk_res);
1302 -#ifdef CONFIG_XEN
1303 - xen_machine_kexec_register_resources(res);
1304 -#endif
1305 -#endif
1306 - }
1307 + insert_resource(&iomem_resource, res);
1308 + res++;
1309 }
1310 }
1311
1312 @@ -338,9 +415,9 @@ static int __init e820_find_active_regio
1313 if (*ei_startpfn >= *ei_endpfn)
1314 return 0;
1315
1316 - /* Check if end_pfn_map should be updated */
1317 - if (ei->type != E820_RAM && *ei_endpfn > end_pfn_map)
1318 - end_pfn_map = *ei_endpfn;
1319 + /* Check if max_pfn_mapped should be updated */
1320 + if (ei->type != E820_RAM && *ei_endpfn > max_pfn_mapped)
1321 + max_pfn_mapped = *ei_endpfn;
1322
1323 /* Skip if map is outside the node */
1324 if (ei->type != E820_RAM || *ei_endpfn <= start_pfn ||
1325 @@ -667,10 +744,10 @@ static int __init copy_e820_map(struct e
1326 #endif
1327
1328 do {
1329 - unsigned long start = biosmap->addr;
1330 - unsigned long size = biosmap->size;
1331 - unsigned long end = start + size;
1332 - unsigned long type = biosmap->type;
1333 + u64 start = biosmap->addr;
1334 + u64 size = biosmap->size;
1335 + u64 end = start + size;
1336 + u32 type = biosmap->type;
1337
1338 /* Overflow in 64 bits? Ignore the memory map. */
1339 if (start > end)
1340 @@ -801,7 +878,7 @@ static int __init parse_memmap_opt(char
1341 saved_max_pfn = e820_end_of_ram();
1342 remove_all_active_ranges();
1343 #endif
1344 - end_pfn_map = 0;
1345 + max_pfn_mapped = 0;
1346 e820.nr_map = 0;
1347 userdef = 1;
1348 return 0;
1349 --- a/arch/x86/kernel/early_printk-xen.c
1350 +++ b/arch/x86/kernel/early_printk-xen.c
1351 @@ -13,7 +13,7 @@
1352
1353 #ifndef CONFIG_XEN
1354 static int max_ypos = 25, max_xpos = 80;
1355 -static int current_ypos = 25, current_xpos = 0;
1356 +static int current_ypos = 25, current_xpos;
1357
1358 static void early_vga_write(struct console *con, const char *str, unsigned n)
1359 {
1360 @@ -108,12 +108,12 @@ static __init void early_serial_init(cha
1361
1362 if (*s) {
1363 unsigned port;
1364 - if (!strncmp(s,"0x",2)) {
1365 + if (!strncmp(s, "0x", 2)) {
1366 early_serial_base = simple_strtoul(s, &e, 16);
1367 } else {
1368 static int bases[] = { 0x3f8, 0x2f8 };
1369
1370 - if (!strncmp(s,"ttyS",4))
1371 + if (!strncmp(s, "ttyS", 4))
1372 s += 4;
1373 port = simple_strtoul(s, &e, 10);
1374 if (port > 1 || s == e)
1375 @@ -223,7 +223,7 @@ static struct console simnow_console = {
1376
1377 /* Direct interface for emergencies */
1378 static struct console *early_console = &early_vga_console;
1379 -static int early_console_initialized = 0;
1380 +static int early_console_initialized;
1381
1382 void early_printk(const char *fmt, ...)
1383 {
1384 @@ -231,9 +231,9 @@ void early_printk(const char *fmt, ...)
1385 int n;
1386 va_list ap;
1387
1388 - va_start(ap,fmt);
1389 - n = vscnprintf(buf,512,fmt,ap);
1390 - early_console->write(early_console,buf,n);
1391 + va_start(ap, fmt);
1392 + n = vscnprintf(buf, 512, fmt, ap);
1393 + early_console->write(early_console, buf, n);
1394 va_end(ap);
1395 }
1396
1397 @@ -259,16 +259,16 @@ static int __init setup_early_printk(cha
1398 early_console = &early_serial_console;
1399 } else if (!strncmp(buf, "vga", 3)) {
1400 #ifndef CONFIG_XEN
1401 - && boot_params.screen_info.orig_video_isVGA == 1) {
1402 + && boot_params.screen_info.orig_video_isVGA == 1) {
1403 max_xpos = boot_params.screen_info.orig_video_cols;
1404 max_ypos = boot_params.screen_info.orig_video_lines;
1405 current_ypos = boot_params.screen_info.orig_y;
1406 #endif
1407 early_console = &early_vga_console;
1408 - } else if (!strncmp(buf, "simnow", 6)) {
1409 - simnow_init(buf + 6);
1410 - early_console = &simnow_console;
1411 - keep_early = 1;
1412 + } else if (!strncmp(buf, "simnow", 6)) {
1413 + simnow_init(buf + 6);
1414 + early_console = &simnow_console;
1415 + keep_early = 1;
1416 #ifdef CONFIG_XEN
1417 } else if (!strncmp(buf, "xen", 3)) {
1418 early_console = &xenboot_console;
1419 --- a/arch/x86/kernel/entry_32-xen.S
1420 +++ b/arch/x86/kernel/entry_32-xen.S
1421 @@ -1,5 +1,4 @@
1422 /*
1423 - * linux/arch/i386/entry.S
1424 *
1425 * Copyright (C) 1991, 1992 Linus Torvalds
1426 */
1427 @@ -51,6 +50,7 @@
1428 #include <asm/desc.h>
1429 #include <asm/percpu.h>
1430 #include <asm/dwarf2.h>
1431 +#include <asm/processor-flags.h>
1432 #include "irq_vectors.h"
1433 #include <xen/interface/xen.h>
1434
1435 @@ -69,12 +69,6 @@
1436
1437 #define nr_syscalls ((syscall_table_size)/4)
1438
1439 -CF_MASK = 0x00000001
1440 -TF_MASK = 0x00000100
1441 -IF_MASK = 0x00000200
1442 -DF_MASK = 0x00000400
1443 -NT_MASK = 0x00004000
1444 -VM_MASK = 0x00020000
1445 /* Pseudo-eflags. */
1446 NMI_MASK = 0x80000000
1447
1448 @@ -87,7 +81,7 @@ NMI_MASK = 0x80000000
1449
1450 .macro TRACE_IRQS_IRET
1451 #ifdef CONFIG_TRACE_IRQFLAGS
1452 - testl $IF_MASK,PT_EFLAGS(%esp) # interrupts off?
1453 + testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off?
1454 jz 1f
1455 TRACE_IRQS_ON
1456 1:
1457 @@ -249,7 +243,7 @@ ret_from_intr:
1458 check_userspace:
1459 movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS
1460 movb PT_CS(%esp), %al
1461 - andl $(VM_MASK | SEGMENT_RPL_MASK), %eax
1462 + andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax
1463 cmpl $USER_RPL, %eax
1464 jb resume_kernel # not returning to v8086 or userspace
1465
1466 @@ -258,6 +252,7 @@ ENTRY(resume_userspace)
1467 DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
1468 # setting need_resched or sigpending
1469 # between sampling and the iret
1470 + TRACE_IRQS_OFF
1471 movl TI_flags(%ebp), %ecx
1472 andl $_TIF_WORK_MASK, %ecx # is there any work to be done on
1473 # int/exception return?
1474 @@ -274,7 +269,7 @@ need_resched:
1475 movl TI_flags(%ebp), %ecx # need_resched set ?
1476 testb $_TIF_NEED_RESCHED, %cl
1477 jz restore_all
1478 - testl $IF_MASK,PT_EFLAGS(%esp) # interrupts off (exception path) ?
1479 + testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off (exception path) ?
1480 jz restore_all
1481 call preempt_schedule_irq
1482 jmp need_resched
1483 @@ -299,10 +294,10 @@ ENTRY(ia32_sysenter_target)
1484 movl SYSENTER_stack_sp0(%esp),%esp
1485 sysenter_past_esp:
1486 /*
1487 - * No need to follow this irqs on/off section: the syscall
1488 - * disabled irqs and here we enable it straight after entry:
1489 + * Interrupts are disabled here, but we can't trace it until
1490 + * enough kernel state to call TRACE_IRQS_OFF can be called - but
1491 + * we immediately enable interrupts at that point anyway.
1492 */
1493 - ENABLE_INTERRUPTS(CLBR_NONE)
1494 pushl $(__USER_DS)
1495 CFI_ADJUST_CFA_OFFSET 4
1496 /*CFI_REL_OFFSET ss, 0*/
1497 @@ -310,6 +305,7 @@ sysenter_past_esp:
1498 CFI_ADJUST_CFA_OFFSET 4
1499 CFI_REL_OFFSET esp, 0
1500 pushfl
1501 + orl $X86_EFLAGS_IF, (%esp)
1502 CFI_ADJUST_CFA_OFFSET 4
1503 pushl $(__USER_CS)
1504 CFI_ADJUST_CFA_OFFSET 4
1505 @@ -323,6 +319,11 @@ sysenter_past_esp:
1506 CFI_ADJUST_CFA_OFFSET 4
1507 CFI_REL_OFFSET eip, 0
1508
1509 + pushl %eax
1510 + CFI_ADJUST_CFA_OFFSET 4
1511 + SAVE_ALL
1512 + ENABLE_INTERRUPTS(CLBR_NONE)
1513 +
1514 /*
1515 * Load the potential sixth argument from user stack.
1516 * Careful about security.
1517 @@ -330,14 +331,12 @@ sysenter_past_esp:
1518 cmpl $__PAGE_OFFSET-3,%ebp
1519 jae syscall_fault
1520 1: movl (%ebp),%ebp
1521 + movl %ebp,PT_EBP(%esp)
1522 .section __ex_table,"a"
1523 .align 4
1524 .long 1b,syscall_fault
1525 .previous
1526
1527 - pushl %eax
1528 - CFI_ADJUST_CFA_OFFSET 4
1529 - SAVE_ALL
1530 GET_THREAD_INFO(%ebp)
1531 test_tif %ebp
1532 jnz syscall_trace_entry
1533 @@ -414,7 +413,7 @@ syscall_exit:
1534 # setting need_resched or sigpending
1535 # between sampling and the iret
1536 TRACE_IRQS_OFF
1537 - testl $TF_MASK,PT_EFLAGS(%esp) # If tracing set singlestep flag on exit
1538 + testl $X86_EFLAGS_TF,PT_EFLAGS(%esp) # If tracing set singlestep flag on exit
1539 jz no_singlestep
1540 orl $_TIF_SINGLESTEP,TI_flags(%ebp)
1541 no_singlestep:
1542 @@ -430,7 +429,7 @@ restore_all:
1543 # See comments in process.c:copy_thread() for details.
1544 movb PT_OLDSS(%esp), %ah
1545 movb PT_CS(%esp), %al
1546 - andl $(VM_MASK | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
1547 + andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
1548 cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
1549 CFI_REMEMBER_STATE
1550 je ldt_ss # returning to user-space with LDT SS
1551 @@ -438,7 +437,7 @@ restore_nocheck:
1552 #else
1553 restore_nocheck:
1554 movl PT_EFLAGS(%esp), %eax
1555 - testl $(VM_MASK|NMI_MASK), %eax
1556 + testl $(X86_EFLAGS_VM|NMI_MASK), %eax
1557 CFI_REMEMBER_STATE
1558 jnz hypervisor_iret
1559 shr $9, %eax # EAX[0] == IRET_EFLAGS.IF
1560 @@ -456,7 +455,7 @@ restore_nocheck_notrace:
1561 irq_return:
1562 INTERRUPT_RETURN
1563 .section .fixup,"ax"
1564 -iret_exc:
1565 +ENTRY(iret_exc)
1566 pushl $0 # no error code
1567 pushl $do_iret_error
1568 jmp error_code
1569 @@ -560,7 +559,7 @@ work_resched:
1570 work_notifysig: # deal with pending signals and
1571 # notify-resume requests
1572 #ifdef CONFIG_VM86
1573 - testl $VM_MASK, PT_EFLAGS(%esp)
1574 + testl $X86_EFLAGS_VM, PT_EFLAGS(%esp)
1575 movl %esp, %eax
1576 jne work_notifysig_v86 # returning to kernel-space or
1577 # vm86-space
1578 @@ -617,9 +616,6 @@ END(syscall_exit_work)
1579
1580 RING0_INT_FRAME # can't unwind into user space anyway
1581 syscall_fault:
1582 - pushl %eax # save orig_eax
1583 - CFI_ADJUST_CFA_OFFSET 4
1584 - SAVE_ALL
1585 GET_THREAD_INFO(%ebp)
1586 movl $-EFAULT,PT_EAX(%esp)
1587 jmp resume_userspace
1588 --- a/arch/x86/kernel/entry_64-xen.S
1589 +++ b/arch/x86/kernel/entry_64-xen.S
1590 @@ -338,19 +338,17 @@ badsys:
1591 /* Do syscall tracing */
1592 tracesys:
1593 SAVE_REST
1594 - movq $-ENOSYS,RAX(%rsp)
1595 + movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
1596 FIXUP_TOP_OF_STACK %rdi
1597 movq %rsp,%rdi
1598 call syscall_trace_enter
1599 LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */
1600 RESTORE_REST
1601 cmpq $__NR_syscall_max,%rax
1602 - movq $-ENOSYS,%rcx
1603 - cmova %rcx,%rax
1604 - ja 1f
1605 + ja int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */
1606 movq %r10,%rcx /* fixup for C */
1607 call *sys_call_table(,%rax,8)
1608 -1: movq %rax,RAX-ARGOFFSET(%rsp)
1609 + movq %rax,RAX-ARGOFFSET(%rsp)
1610 /* Use IRET because user could have changed frame */
1611
1612 /*
1613 --- a/arch/x86/kernel/genapic_64-xen.c
1614 +++ b/arch/x86/kernel/genapic_64-xen.c
1615 @@ -15,6 +15,7 @@
1616 #include <linux/kernel.h>
1617 #include <linux/ctype.h>
1618 #include <linux/init.h>
1619 +#include <linux/hardirq.h>
1620
1621 #include <asm/smp.h>
1622 #include <asm/ipi.h>
1623 @@ -24,17 +25,12 @@
1624 #include <acpi/acpi_bus.h>
1625 #endif
1626
1627 -/* which logical CPU number maps to which CPU (physical APIC ID) */
1628 #ifndef CONFIG_XEN
1629 -u16 x86_cpu_to_apicid_init[NR_CPUS] __initdata
1630 - = { [0 ... NR_CPUS-1] = BAD_APICID };
1631 -void *x86_cpu_to_apicid_early_ptr;
1632 -#endif
1633 -DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID;
1634 -EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid);
1635 +DEFINE_PER_CPU(int, x2apic_extra_bits);
1636
1637 -#ifndef CONFIG_XEN
1638 struct genapic __read_mostly *genapic = &apic_flat;
1639 +
1640 +static enum uv_system_type uv_system_type;
1641 #else
1642 extern struct genapic apic_xen;
1643 struct genapic __read_mostly *genapic = &apic_xen;
1644 @@ -47,6 +43,9 @@ struct genapic __read_mostly *genapic =
1645 void __init setup_apic_routing(void)
1646 {
1647 #ifndef CONFIG_XEN
1648 + if (uv_system_type == UV_NON_UNIQUE_APIC)
1649 + genapic = &apic_x2apic_uv_x;
1650 + else
1651 #ifdef CONFIG_ACPI
1652 /*
1653 * Quirk: some x86_64 machines can only use physical APIC mode
1654 @@ -59,7 +58,7 @@ void __init setup_apic_routing(void)
1655 else
1656 #endif
1657
1658 - if (cpus_weight(cpu_possible_map) <= 8)
1659 + if (num_possible_cpus() <= 8)
1660 genapic = &apic_flat;
1661 else
1662 genapic = &apic_physflat;
1663 @@ -85,3 +84,41 @@ void send_IPI_self(int vector)
1664 xen_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
1665 #endif
1666 }
1667 +
1668 +int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
1669 +{
1670 +#ifndef CONFIG_XEN
1671 + if (!strcmp(oem_id, "SGI")) {
1672 + if (!strcmp(oem_table_id, "UVL"))
1673 + uv_system_type = UV_LEGACY_APIC;
1674 + else if (!strcmp(oem_table_id, "UVX"))
1675 + uv_system_type = UV_X2APIC;
1676 + else if (!strcmp(oem_table_id, "UVH"))
1677 + uv_system_type = UV_NON_UNIQUE_APIC;
1678 + }
1679 +#endif
1680 + return 0;
1681 +}
1682 +
1683 +#ifndef CONFIG_XEN
1684 +unsigned int read_apic_id(void)
1685 +{
1686 + unsigned int id;
1687 +
1688 + WARN_ON(preemptible() && num_online_cpus() > 1);
1689 + id = apic_read(APIC_ID);
1690 + if (uv_system_type >= UV_X2APIC)
1691 + id |= __get_cpu_var(x2apic_extra_bits);
1692 + return id;
1693 +}
1694 +
1695 +enum uv_system_type get_uv_system_type(void)
1696 +{
1697 + return uv_system_type;
1698 +}
1699 +
1700 +int is_uv_system(void)
1701 +{
1702 + return uv_system_type != UV_NONE;
1703 +}
1704 +#endif
1705 --- a/arch/x86/kernel/genapic_xen_64.c
1706 +++ b/arch/x86/kernel/genapic_xen_64.c
1707 @@ -72,9 +72,7 @@ static cpumask_t xen_target_cpus(void)
1708
1709 static cpumask_t xen_vector_allocation_domain(int cpu)
1710 {
1711 - cpumask_t domain = CPU_MASK_NONE;
1712 - cpu_set(cpu, domain);
1713 - return domain;
1714 + return cpumask_of_cpu(cpu);
1715 }
1716
1717 /*
1718 --- a/arch/x86/kernel/head_32-xen.S
1719 +++ b/arch/x86/kernel/head_32-xen.S
1720 @@ -69,7 +69,7 @@ ENTRY(startup_32)
1721 cld # gcc2 wants the direction flag cleared at all times
1722
1723 pushl $0 # fake return address for unwinder
1724 - jmp start_kernel
1725 + jmp i386_start_kernel
1726
1727 #define HYPERCALL_PAGE_OFFSET 0x1000
1728 .org HYPERCALL_PAGE_OFFSET
1729 --- a/arch/x86/kernel/head64-xen.c
1730 +++ b/arch/x86/kernel/head64-xen.c
1731 @@ -17,6 +17,7 @@
1732 #include <linux/string.h>
1733 #include <linux/percpu.h>
1734 #include <linux/start_kernel.h>
1735 +#include <linux/io.h>
1736 #include <linux/module.h>
1737
1738 #include <asm/processor.h>
1739 @@ -29,6 +30,7 @@
1740 #include <asm/sections.h>
1741 #include <asm/kdebug.h>
1742 #include <asm/e820.h>
1743 +#include <asm/bios_ebda.h>
1744
1745 unsigned long start_pfn;
1746
1747 @@ -75,34 +77,75 @@ EXPORT_SYMBOL(machine_to_phys_mapping);
1748 unsigned int machine_to_phys_order;
1749 EXPORT_SYMBOL(machine_to_phys_order);
1750
1751 -#define EBDA_ADDR_POINTER 0x40E
1752 +#define BIOS_LOWMEM_KILOBYTES 0x413
1753
1754 -static __init void reserve_ebda(void)
1755 +/*
1756 + * The BIOS places the EBDA/XBDA at the top of conventional
1757 + * memory, and usually decreases the reported amount of
1758 + * conventional memory (int 0x12) too. This also contains a
1759 + * workaround for Dell systems that neglect to reserve EBDA.
1760 + * The same workaround also avoids a problem with the AMD768MPX
1761 + * chipset: reserve a page before VGA to prevent PCI prefetch
1762 + * into it (errata #56). Usually the page is reserved anyways,
1763 + * unless you have no PS/2 mouse plugged in.
1764 + */
1765 +static void __init reserve_ebda_region(void)
1766 {
1767 #ifndef CONFIG_XEN
1768 - unsigned ebda_addr, ebda_size;
1769 + unsigned int lowmem, ebda_addr;
1770
1771 - /*
1772 - * there is a real-mode segmented pointer pointing to the
1773 - * 4K EBDA area at 0x40E
1774 - */
1775 - ebda_addr = *(unsigned short *)__va(EBDA_ADDR_POINTER);
1776 - ebda_addr <<= 4;
1777 -
1778 - if (!ebda_addr)
1779 + /* To determine the position of the EBDA and the */
1780 + /* end of conventional memory, we need to look at */
1781 + /* the BIOS data area. In a paravirtual environment */
1782 + /* that area is absent. We'll just have to assume */
1783 + /* that the paravirt case can handle memory setup */
1784 + /* correctly, without our help. */
1785 + if (paravirt_enabled())
1786 return;
1787
1788 - ebda_size = *(unsigned short *)__va(ebda_addr);
1789 + /* end of low (conventional) memory */
1790 + lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
1791 + lowmem <<= 10;
1792 +
1793 + /* start of EBDA area */
1794 + ebda_addr = get_bios_ebda();
1795 +
1796 + /* Fixup: bios puts an EBDA in the top 64K segment */
1797 + /* of conventional memory, but does not adjust lowmem. */
1798 + if ((lowmem - ebda_addr) <= 0x10000)
1799 + lowmem = ebda_addr;
1800 +
1801 + /* Fixup: bios does not report an EBDA at all. */
1802 + /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
1803 + if ((ebda_addr == 0) && (lowmem >= 0x9f000))
1804 + lowmem = 0x9f000;
1805 +
1806 + /* Paranoia: should never happen, but... */
1807 + if ((lowmem == 0) || (lowmem >= 0x100000))
1808 + lowmem = 0x9f000;
1809
1810 - /* Round EBDA up to pages */
1811 - if (ebda_size == 0)
1812 - ebda_size = 1;
1813 - ebda_size <<= 10;
1814 - ebda_size = round_up(ebda_size + (ebda_addr & ~PAGE_MASK), PAGE_SIZE);
1815 - if (ebda_size > 64*1024)
1816 - ebda_size = 64*1024;
1817 + /* reserve all memory between lowmem and the 1MB mark */
1818 + reserve_early(lowmem, 0x100000, "BIOS reserved");
1819 +#endif
1820 +}
1821
1822 - reserve_early(ebda_addr, ebda_addr + ebda_size, "EBDA");
1823 +static void __init reserve_setup_data(void)
1824 +{
1825 +#ifndef CONFIG_XEN
1826 + struct setup_data *data;
1827 + unsigned long pa_data;
1828 + char buf[32];
1829 +
1830 + if (boot_params.hdr.version < 0x0209)
1831 + return;
1832 + pa_data = boot_params.hdr.setup_data;
1833 + while (pa_data) {
1834 + data = early_ioremap(pa_data, sizeof(*data));
1835 + sprintf(buf, "setup data %x", data->type);
1836 + reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf);
1837 + pa_data = data->next;
1838 + early_iounmap(data, sizeof(*data));
1839 + }
1840 #endif
1841 }
1842
1843 @@ -112,6 +155,19 @@ void __init x86_64_start_kernel(char * r
1844 unsigned long machine_to_phys_nr_ents;
1845 int i;
1846
1847 + /*
1848 + * Build-time sanity checks on the kernel image and module
1849 + * area mappings. (these are purely build-time and produce no code)
1850 + */
1851 + BUILD_BUG_ON(MODULES_VADDR < KERNEL_IMAGE_START);
1852 + BUILD_BUG_ON(MODULES_VADDR-KERNEL_IMAGE_START < KERNEL_IMAGE_SIZE);
1853 + BUILD_BUG_ON(MODULES_LEN + KERNEL_IMAGE_SIZE > 2*PUD_SIZE);
1854 + BUILD_BUG_ON((KERNEL_IMAGE_START & ~PMD_MASK) != 0);
1855 + BUILD_BUG_ON((MODULES_VADDR & ~PMD_MASK) != 0);
1856 + BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
1857 + BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
1858 + (__START_KERNEL & PGDIR_MASK)));
1859 +
1860 xen_setup_features();
1861
1862 xen_start_info = (struct start_info *)real_mode_data;
1863 @@ -140,7 +196,7 @@ void __init x86_64_start_kernel(char * r
1864 /* Cleanup the over mapped high alias */
1865 cleanup_highmap();
1866
1867 - for (i = 0; i < IDT_ENTRIES; i++) {
1868 + for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) {
1869 #ifdef CONFIG_EARLY_PRINTK
1870 set_intr_gate(i, &early_idt_handlers[i]);
1871 #else
1872 @@ -163,7 +219,8 @@ void __init x86_64_start_kernel(char * r
1873 reserve_early(round_up(__pa_symbol(&_end), PAGE_SIZE),
1874 start_pfn << PAGE_SHIFT, "Xen provided");
1875
1876 - reserve_ebda();
1877 + reserve_ebda_region();
1878 + reserve_setup_data();
1879
1880 /*
1881 * At this point everything still needed from the boot loader
1882 --- a/arch/x86/kernel/init_task-xen.c
1883 +++ b/arch/x86/kernel/init_task-xen.c
1884 @@ -11,7 +11,6 @@
1885 #include <asm/desc.h>
1886
1887 static struct fs_struct init_fs = INIT_FS;
1888 -static struct files_struct init_files = INIT_FILES;
1889 static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
1890 static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
1891 #ifdef CONFIG_X86_XEN
1892 --- a/arch/x86/kernel/io_apic_32-xen.c
1893 +++ b/arch/x86/kernel/io_apic_32-xen.c
1894 @@ -88,6 +88,16 @@ int sis_apic_bug = -1;
1895 */
1896 int nr_ioapic_registers[MAX_IO_APICS];
1897
1898 +/* I/O APIC entries */
1899 +struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
1900 +int nr_ioapics;
1901 +
1902 +/* MP IRQ source entries */
1903 +struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
1904 +
1905 +/* # of MP IRQ source entries */
1906 +int mp_irq_entries;
1907 +
1908 static int disable_timer_pin_1 __initdata;
1909
1910 /*
1911 @@ -863,10 +873,7 @@ static int __init find_isa_irq_pin(int i
1912 for (i = 0; i < mp_irq_entries; i++) {
1913 int lbus = mp_irqs[i].mpc_srcbus;
1914
1915 - if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
1916 - mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
1917 - mp_bus_id_to_type[lbus] == MP_BUS_MCA
1918 - ) &&
1919 + if (test_bit(lbus, mp_bus_not_pci) &&
1920 (mp_irqs[i].mpc_irqtype == type) &&
1921 (mp_irqs[i].mpc_srcbusirq == irq))
1922
1923 @@ -882,10 +889,7 @@ static int __init find_isa_irq_apic(int
1924 for (i = 0; i < mp_irq_entries; i++) {
1925 int lbus = mp_irqs[i].mpc_srcbus;
1926
1927 - if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
1928 - mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
1929 - mp_bus_id_to_type[lbus] == MP_BUS_MCA
1930 - ) &&
1931 + if (test_bit(lbus, mp_bus_not_pci) &&
1932 (mp_irqs[i].mpc_irqtype == type) &&
1933 (mp_irqs[i].mpc_srcbusirq == irq))
1934 break;
1935 @@ -926,7 +930,7 @@ int IO_APIC_get_PCI_irq_vector(int bus,
1936 mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
1937 break;
1938
1939 - if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) &&
1940 + if (!test_bit(lbus, mp_bus_not_pci) &&
1941 !mp_irqs[i].mpc_irqtype &&
1942 (bus == lbus) &&
1943 (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
1944 @@ -977,6 +981,7 @@ void __init setup_ioapic_dest(void)
1945 #endif /* !CONFIG_XEN */
1946 #endif
1947
1948 +#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
1949 /*
1950 * EISA Edge/Level control register, ELCR
1951 */
1952 @@ -990,6 +995,13 @@ static int EISA_ELCR(unsigned int irq)
1953 "Broken MPtable reports ISA irq %d\n", irq);
1954 return 0;
1955 }
1956 +#endif
1957 +
1958 +/* ISA interrupts are always polarity zero edge triggered,
1959 + * when listed as conforming in the MP table. */
1960 +
1961 +#define default_ISA_trigger(idx) (0)
1962 +#define default_ISA_polarity(idx) (0)
1963
1964 /* EISA interrupts are always polarity zero and can be edge or level
1965 * trigger depending on the ELCR value. If an interrupt is listed as
1966 @@ -997,13 +1009,7 @@ static int EISA_ELCR(unsigned int irq)
1967 * be read in from the ELCR */
1968
1969 #define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq))
1970 -#define default_EISA_polarity(idx) (0)
1971 -
1972 -/* ISA interrupts are always polarity zero edge triggered,
1973 - * when listed as conforming in the MP table. */
1974 -
1975 -#define default_ISA_trigger(idx) (0)
1976 -#define default_ISA_polarity(idx) (0)
1977 +#define default_EISA_polarity(idx) default_ISA_polarity(idx)
1978
1979 /* PCI interrupts are always polarity one level triggered,
1980 * when listed as conforming in the MP table. */
1981 @@ -1015,7 +1021,7 @@ static int EISA_ELCR(unsigned int irq)
1982 * when listed as conforming in the MP table. */
1983
1984 #define default_MCA_trigger(idx) (1)
1985 -#define default_MCA_polarity(idx) (0)
1986 +#define default_MCA_polarity(idx) default_ISA_polarity(idx)
1987
1988 static int MPBIOS_polarity(int idx)
1989 {
1990 @@ -1029,35 +1035,9 @@ static int MPBIOS_polarity(int idx)
1991 {
1992 case 0: /* conforms, ie. bus-type dependent polarity */
1993 {
1994 - switch (mp_bus_id_to_type[bus])
1995 - {
1996 - case MP_BUS_ISA: /* ISA pin */
1997 - {
1998 - polarity = default_ISA_polarity(idx);
1999 - break;
2000 - }
2001 - case MP_BUS_EISA: /* EISA pin */
2002 - {
2003 - polarity = default_EISA_polarity(idx);
2004 - break;
2005 - }
2006 - case MP_BUS_PCI: /* PCI pin */
2007 - {
2008 - polarity = default_PCI_polarity(idx);
2009 - break;
2010 - }
2011 - case MP_BUS_MCA: /* MCA pin */
2012 - {
2013 - polarity = default_MCA_polarity(idx);
2014 - break;
2015 - }
2016 - default:
2017 - {
2018 - printk(KERN_WARNING "broken BIOS!!\n");
2019 - polarity = 1;
2020 - break;
2021 - }
2022 - }
2023 + polarity = test_bit(bus, mp_bus_not_pci)?
2024 + default_ISA_polarity(idx):
2025 + default_PCI_polarity(idx);
2026 break;
2027 }
2028 case 1: /* high active */
2029 @@ -1098,11 +1078,15 @@ static int MPBIOS_trigger(int idx)
2030 {
2031 case 0: /* conforms, ie. bus-type dependent */
2032 {
2033 + trigger = test_bit(bus, mp_bus_not_pci)?
2034 + default_ISA_trigger(idx):
2035 + default_PCI_trigger(idx);
2036 +#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
2037 switch (mp_bus_id_to_type[bus])
2038 {
2039 case MP_BUS_ISA: /* ISA pin */
2040 {
2041 - trigger = default_ISA_trigger(idx);
2042 + /* set before the switch */
2043 break;
2044 }
2045 case MP_BUS_EISA: /* EISA pin */
2046 @@ -1112,7 +1096,7 @@ static int MPBIOS_trigger(int idx)
2047 }
2048 case MP_BUS_PCI: /* PCI pin */
2049 {
2050 - trigger = default_PCI_trigger(idx);
2051 + /* set before the switch */
2052 break;
2053 }
2054 case MP_BUS_MCA: /* MCA pin */
2055 @@ -1127,6 +1111,7 @@ static int MPBIOS_trigger(int idx)
2056 break;
2057 }
2058 }
2059 +#endif
2060 break;
2061 }
2062 case 1: /* edge */
2063 @@ -1176,39 +1161,22 @@ static int pin_2_irq(int idx, int apic,
2064 if (mp_irqs[idx].mpc_dstirq != pin)
2065 printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
2066
2067 - switch (mp_bus_id_to_type[bus])
2068 - {
2069 - case MP_BUS_ISA: /* ISA pin */
2070 - case MP_BUS_EISA:
2071 - case MP_BUS_MCA:
2072 - {
2073 - irq = mp_irqs[idx].mpc_srcbusirq;
2074 - break;
2075 - }
2076 - case MP_BUS_PCI: /* PCI pin */
2077 - {
2078 - /*
2079 - * PCI IRQs are mapped in order
2080 - */
2081 - i = irq = 0;
2082 - while (i < apic)
2083 - irq += nr_ioapic_registers[i++];
2084 - irq += pin;
2085 -
2086 - /*
2087 - * For MPS mode, so far only needed by ES7000 platform
2088 - */
2089 - if (ioapic_renumber_irq)
2090 - irq = ioapic_renumber_irq(apic, irq);
2091 + if (test_bit(bus, mp_bus_not_pci))
2092 + irq = mp_irqs[idx].mpc_srcbusirq;
2093 + else {
2094 + /*
2095 + * PCI IRQs are mapped in order
2096 + */
2097 + i = irq = 0;
2098 + while (i < apic)
2099 + irq += nr_ioapic_registers[i++];
2100 + irq += pin;
2101
2102 - break;
2103 - }
2104 - default:
2105 - {
2106 - printk(KERN_ERR "unknown bus type %d.\n",bus);
2107 - irq = 0;
2108 - break;
2109 - }
2110 + /*
2111 + * For MPS mode, so far only needed by ES7000 platform
2112 + */
2113 + if (ioapic_renumber_irq)
2114 + irq = ioapic_renumber_irq(apic, irq);
2115 }
2116
2117 /*
2118 @@ -1314,7 +1282,6 @@ static void __init setup_IO_APIC_irqs(vo
2119 {
2120 struct IO_APIC_route_entry entry;
2121 int apic, pin, idx, irq, first_notcon = 1, vector;
2122 - unsigned long flags;
2123
2124 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
2125
2126 @@ -1380,9 +1347,7 @@ static void __init setup_IO_APIC_irqs(vo
2127 if (!apic && (irq < 16))
2128 disable_8259A_irq(irq);
2129 }
2130 - spin_lock_irqsave(&ioapic_lock, flags);
2131 - __ioapic_write_entry(apic, pin, entry);
2132 - spin_unlock_irqrestore(&ioapic_lock, flags);
2133 + ioapic_write_entry(apic, pin, entry);
2134 }
2135 }
2136
2137 @@ -1577,8 +1542,8 @@ void /*__init*/ print_local_APIC(void *
2138
2139 printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
2140 smp_processor_id(), hard_smp_processor_id());
2141 - v = apic_read(APIC_ID);
2142 - printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(v));
2143 + printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v,
2144 + GET_APIC_ID(read_apic_id()));
2145 v = apic_read(APIC_LVR);
2146 printk(KERN_INFO "... APIC VERSION: %08x\n", v);
2147 ver = GET_APIC_VERSION(v);
2148 @@ -1791,7 +1756,7 @@ void disable_IO_APIC(void)
2149 entry.delivery_mode = dest_ExtINT; /* ExtInt */
2150 entry.vector = 0;
2151 entry.dest.physical.physical_dest =
2152 - GET_APIC_ID(apic_read(APIC_ID));
2153 + GET_APIC_ID(read_apic_id());
2154
2155 /*
2156 * Add it to the IO-APIC irq-routing table:
2157 @@ -2090,8 +2055,7 @@ static inline void init_IO_APIC_traps(vo
2158 * 0x80, because int 0x80 is hm, kind of importantish. ;)
2159 */
2160 for (irq = 0; irq < NR_IRQS ; irq++) {
2161 - int tmp = irq;
2162 - if (IO_APIC_IRQ(tmp) && !irq_vector[tmp]) {
2163 + if (IO_APIC_IRQ(irq) && !irq_vector[irq]) {
2164 /*
2165 * Hmm.. We don't have an entry for this,
2166 * so default to an old-fashioned 8259
2167 @@ -2166,7 +2130,7 @@ static void __init setup_nmi(void)
2168 * cycles as some i82489DX-based boards have glue logic that keeps the
2169 * 8259A interrupt line asserted until INTA. --macro
2170 */
2171 -static inline void unlock_ExtINT_logic(void)
2172 +static inline void __init unlock_ExtINT_logic(void)
2173 {
2174 int apic, pin, i;
2175 struct IO_APIC_route_entry entry0, entry1;
2176 @@ -2218,8 +2182,6 @@ static inline void unlock_ExtINT_logic(v
2177 ioapic_write_entry(apic, pin, entry0);
2178 }
2179
2180 -int timer_uses_ioapic_pin_0;
2181 -
2182 /*
2183 * This code may look a bit paranoid, but it's supposed to cooperate with
2184 * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ
2185 @@ -2259,9 +2221,6 @@ static inline void __init check_timer(vo
2186 pin2 = ioapic_i8259.pin;
2187 apic2 = ioapic_i8259.apic;
2188
2189 - if (pin1 == 0)
2190 - timer_uses_ioapic_pin_0 = 1;
2191 -
2192 printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
2193 vector, apic1, pin1, apic2, pin2);
2194
2195 @@ -2555,6 +2514,7 @@ void destroy_irq(unsigned int irq)
2196 dynamic_irq_cleanup(irq);
2197
2198 spin_lock_irqsave(&vector_lock, flags);
2199 + clear_bit(irq_vector[irq], used_vectors);
2200 irq_vector[irq] = 0;
2201 spin_unlock_irqrestore(&vector_lock, flags);
2202 }
2203 @@ -2871,7 +2831,6 @@ int __init io_apic_get_redir_entries (in
2204 int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low)
2205 {
2206 struct IO_APIC_route_entry entry;
2207 - unsigned long flags;
2208
2209 if (!IO_APIC_IRQ(irq)) {
2210 printk(KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
2211 @@ -2912,9 +2871,7 @@ int io_apic_set_pci_routing (int ioapic,
2212 if (!ioapic && (irq < 16))
2213 disable_8259A_irq(irq);
2214
2215 - spin_lock_irqsave(&ioapic_lock, flags);
2216 - __ioapic_write_entry(ioapic, pin, entry);
2217 - spin_unlock_irqrestore(&ioapic_lock, flags);
2218 + ioapic_write_entry(ioapic, pin, entry);
2219
2220 return 0;
2221 }
2222 --- a/arch/x86/kernel/io_apic_64-xen.c
2223 +++ b/arch/x86/kernel/io_apic_64-xen.c
2224 @@ -43,13 +43,15 @@
2225 #include <asm/smp.h>
2226 #include <asm/desc.h>
2227 #include <asm/proto.h>
2228 -#include <asm/mach_apic.h>
2229 #include <asm/acpi.h>
2230 #include <asm/dma.h>
2231 #include <asm/nmi.h>
2232 #include <asm/msidef.h>
2233 #include <asm/hypertransport.h>
2234
2235 +#include <mach_ipi.h>
2236 +#include <mach_apic.h>
2237 +
2238 struct irq_cfg {
2239 #ifndef CONFIG_XEN
2240 cpumask_t domain;
2241 @@ -101,6 +103,16 @@ DEFINE_SPINLOCK(vector_lock);
2242 */
2243 int nr_ioapic_registers[MAX_IO_APICS];
2244
2245 +/* I/O APIC entries */
2246 +struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
2247 +int nr_ioapics;
2248 +
2249 +/* MP IRQ source entries */
2250 +struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
2251 +
2252 +/* # of MP IRQ source entries */
2253 +int mp_irq_entries;
2254 +
2255 /*
2256 * Rough estimation of how many shared IRQs there are, can
2257 * be changed anytime.
2258 @@ -181,11 +193,10 @@ static inline void io_apic_modify(unsign
2259 writel(value, &io_apic->data);
2260 }
2261
2262 -static int io_apic_level_ack_pending(unsigned int irq)
2263 +static bool io_apic_level_ack_pending(unsigned int irq)
2264 {
2265 struct irq_pin_list *entry;
2266 unsigned long flags;
2267 - int pending = 0;
2268
2269 spin_lock_irqsave(&ioapic_lock, flags);
2270 entry = irq_2_pin + irq;
2271 @@ -198,13 +209,17 @@ static int io_apic_level_ack_pending(uns
2272 break;
2273 reg = io_apic_read(entry->apic, 0x10 + pin*2);
2274 /* Is the remote IRR bit set? */
2275 - pending |= (reg >> 14) & 1;
2276 + if ((reg >> 14) & 1) {
2277 + spin_unlock_irqrestore(&ioapic_lock, flags);
2278 + return true;
2279 + }
2280 if (!entry->next)
2281 break;
2282 entry = irq_2_pin + entry->next;
2283 }
2284 spin_unlock_irqrestore(&ioapic_lock, flags);
2285 - return pending;
2286 +
2287 + return false;
2288 }
2289 #endif
2290
2291 @@ -762,7 +777,7 @@ static void __clear_irq_vector(int irq)
2292 per_cpu(vector_irq, cpu)[vector] = -1;
2293
2294 cfg->vector = 0;
2295 - cfg->domain = CPU_MASK_NONE;
2296 + cpus_clear(cfg->domain);
2297 }
2298
2299 void __setup_vector_irq(int cpu)
2300 @@ -899,9 +914,8 @@ static void __init setup_IO_APIC_irqs(vo
2301 static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
2302 {
2303 struct IO_APIC_route_entry entry;
2304 - unsigned long flags;
2305
2306 - memset(&entry,0,sizeof(entry));
2307 + memset(&entry, 0, sizeof(entry));
2308
2309 disable_8259A_irq(0);
2310
2311 @@ -929,10 +943,7 @@ static void __init setup_ExtINT_IRQ0_pin
2312 /*
2313 * Add it to the IO-APIC irq-routing table:
2314 */
2315 - spin_lock_irqsave(&ioapic_lock, flags);
2316 - io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
2317 - io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
2318 - spin_unlock_irqrestore(&ioapic_lock, flags);
2319 + ioapic_write_entry(apic, pin, entry);
2320
2321 enable_8259A_irq(0);
2322 }
2323 @@ -1061,8 +1072,7 @@ void __apicdebuginit print_local_APIC(vo
2324
2325 printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
2326 smp_processor_id(), hard_smp_processor_id());
2327 - v = apic_read(APIC_ID);
2328 - printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(v));
2329 + printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(read_apic_id()));
2330 v = apic_read(APIC_LVR);
2331 printk(KERN_INFO "... APIC VERSION: %08x\n", v);
2332 ver = GET_APIC_VERSION(v);
2333 @@ -1260,7 +1270,7 @@ void disable_IO_APIC(void)
2334 entry.dest_mode = 0; /* Physical */
2335 entry.delivery_mode = dest_ExtINT; /* ExtInt */
2336 entry.vector = 0;
2337 - entry.dest = GET_APIC_ID(apic_read(APIC_ID));
2338 + entry.dest = GET_APIC_ID(read_apic_id());
2339
2340 /*
2341 * Add it to the IO-APIC irq-routing table:
2342 @@ -1353,9 +1363,7 @@ static int ioapic_retrigger_irq(unsigned
2343 unsigned long flags;
2344
2345 spin_lock_irqsave(&vector_lock, flags);
2346 - cpus_clear(mask);
2347 - cpu_set(first_cpu(cfg->domain), mask);
2348 -
2349 + mask = cpumask_of_cpu(first_cpu(cfg->domain));
2350 send_IPI_mask(mask, cfg->vector);
2351 spin_unlock_irqrestore(&vector_lock, flags);
2352
2353 @@ -1519,8 +1527,7 @@ static inline void init_IO_APIC_traps(vo
2354 * 0x80, because int 0x80 is hm, kind of importantish. ;)
2355 */
2356 for (irq = 0; irq < NR_IRQS ; irq++) {
2357 - int tmp = irq;
2358 - if (IO_APIC_IRQ(tmp) && !irq_cfg[tmp].vector) {
2359 + if (IO_APIC_IRQ(irq) && !irq_cfg[irq].vector) {
2360 /*
2361 * Hmm.. We don't have an entry for this,
2362 * so default to an old-fashioned 8259
2363 @@ -1597,22 +1604,19 @@ static void __init setup_nmi(void)
2364 * cycles as some i82489DX-based boards have glue logic that keeps the
2365 * 8259A interrupt line asserted until INTA. --macro
2366 */
2367 -static inline void unlock_ExtINT_logic(void)
2368 +static inline void __init unlock_ExtINT_logic(void)
2369 {
2370 int apic, pin, i;
2371 struct IO_APIC_route_entry entry0, entry1;
2372 unsigned char save_control, save_freq_select;
2373 - unsigned long flags;
2374
2375 pin = find_isa_irq_pin(8, mp_INT);
2376 apic = find_isa_irq_apic(8, mp_INT);
2377 if (pin == -1)
2378 return;
2379
2380 - spin_lock_irqsave(&ioapic_lock, flags);
2381 - *(((int *)&entry0) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
2382 - *(((int *)&entry0) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
2383 - spin_unlock_irqrestore(&ioapic_lock, flags);
2384 + entry0 = ioapic_read_entry(apic, pin);
2385 +
2386 clear_IO_APIC_pin(apic, pin);
2387
2388 memset(&entry1, 0, sizeof(entry1));
2389 @@ -1625,10 +1629,7 @@ static inline void unlock_ExtINT_logic(v
2390 entry1.trigger = 0;
2391 entry1.vector = 0;
2392
2393 - spin_lock_irqsave(&ioapic_lock, flags);
2394 - io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry1) + 1));
2395 - io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry1) + 0));
2396 - spin_unlock_irqrestore(&ioapic_lock, flags);
2397 + ioapic_write_entry(apic, pin, entry1);
2398
2399 save_control = CMOS_READ(RTC_CONTROL);
2400 save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
2401 @@ -1647,10 +1648,7 @@ static inline void unlock_ExtINT_logic(v
2402 CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
2403 clear_IO_APIC_pin(apic, pin);
2404
2405 - spin_lock_irqsave(&ioapic_lock, flags);
2406 - io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry0) + 1));
2407 - io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry0) + 0));
2408 - spin_unlock_irqrestore(&ioapic_lock, flags);
2409 + ioapic_write_entry(apic, pin, entry0);
2410 }
2411
2412 /*
2413 @@ -2327,7 +2325,6 @@ static struct resource * __init ioapic_s
2414 res = (void *)mem;
2415
2416 if (mem != NULL) {
2417 - memset(mem, 0, n);
2418 mem += sizeof(struct resource) * nr_ioapics;
2419
2420 for (i = 0; i < nr_ioapics; i++) {
2421 --- /dev/null
2422 +++ b/arch/x86/kernel/ipi-xen.c
2423 @@ -0,0 +1,232 @@
2424 +#include <linux/cpumask.h>
2425 +#include <linux/interrupt.h>
2426 +#include <linux/init.h>
2427 +
2428 +#include <linux/mm.h>
2429 +#include <linux/delay.h>
2430 +#include <linux/spinlock.h>
2431 +#include <linux/kernel_stat.h>
2432 +#include <linux/mc146818rtc.h>
2433 +#include <linux/cache.h>
2434 +#include <linux/interrupt.h>
2435 +#include <linux/cpu.h>
2436 +#include <linux/module.h>
2437 +
2438 +#include <asm/smp.h>
2439 +#include <asm/mtrr.h>
2440 +#include <asm/tlbflush.h>
2441 +#include <asm/mmu_context.h>
2442 +#include <asm/apic.h>
2443 +#include <asm/proto.h>
2444 +
2445 +#ifdef CONFIG_X86_32
2446 +#ifndef CONFIG_XEN
2447 +#include <mach_apic.h>
2448 +/*
2449 + * the following functions deal with sending IPIs between CPUs.
2450 + *
2451 + * We use 'broadcast', CPU->CPU IPIs and self-IPIs too.
2452 + */
2453 +
2454 +static inline int __prepare_ICR(unsigned int shortcut, int vector)
2455 +{
2456 + unsigned int icr = shortcut | APIC_DEST_LOGICAL;
2457 +
2458 + switch (vector) {
2459 + default:
2460 + icr |= APIC_DM_FIXED | vector;
2461 + break;
2462 + case NMI_VECTOR:
2463 + icr |= APIC_DM_NMI;
2464 + break;
2465 + }
2466 + return icr;
2467 +}
2468 +
2469 +static inline int __prepare_ICR2(unsigned int mask)
2470 +{
2471 + return SET_APIC_DEST_FIELD(mask);
2472 +}
2473 +#else
2474 +#include <xen/evtchn.h>
2475 +
2476 +DECLARE_PER_CPU(int, ipi_to_irq[NR_IPIS]);
2477 +
2478 +static inline void __send_IPI_one(unsigned int cpu, int vector)
2479 +{
2480 + int irq = per_cpu(ipi_to_irq, cpu)[vector];
2481 + BUG_ON(irq < 0);
2482 + notify_remote_via_irq(irq);
2483 +}
2484 +#endif
2485 +
2486 +void __send_IPI_shortcut(unsigned int shortcut, int vector)
2487 +{
2488 +#ifndef CONFIG_XEN
2489 + /*
2490 + * Subtle. In the case of the 'never do double writes' workaround
2491 + * we have to lock out interrupts to be safe. As we don't care
2492 + * of the value read we use an atomic rmw access to avoid costly
2493 + * cli/sti. Otherwise we use an even cheaper single atomic write
2494 + * to the APIC.
2495 + */
2496 + unsigned int cfg;
2497 +
2498 + /*
2499 + * Wait for idle.
2500 + */
2501 + apic_wait_icr_idle();
2502 +
2503 + /*
2504 + * No need to touch the target chip field
2505 + */
2506 + cfg = __prepare_ICR(shortcut, vector);
2507 +
2508 + /*
2509 + * Send the IPI. The write to APIC_ICR fires this off.
2510 + */
2511 + apic_write_around(APIC_ICR, cfg);
2512 +#else
2513 + int cpu;
2514 +
2515 + switch (shortcut) {
2516 + case APIC_DEST_SELF:
2517 + __send_IPI_one(smp_processor_id(), vector);
2518 + break;
2519 + case APIC_DEST_ALLBUT:
2520 + for_each_online_cpu(cpu)
2521 + if (cpu != smp_processor_id())
2522 + __send_IPI_one(cpu, vector);
2523 + break;
2524 + default:
2525 + printk("XXXXXX __send_IPI_shortcut %08x vector %d\n", shortcut,
2526 + vector);
2527 + break;
2528 + }
2529 +#endif
2530 +}
2531 +
2532 +void send_IPI_self(int vector)
2533 +{
2534 + __send_IPI_shortcut(APIC_DEST_SELF, vector);
2535 +}
2536 +
2537 +#ifndef CONFIG_XEN
2538 +/*
2539 + * This is used to send an IPI with no shorthand notation (the destination is
2540 + * specified in bits 56 to 63 of the ICR).
2541 + */
2542 +static inline void __send_IPI_dest_field(unsigned long mask, int vector)
2543 +{
2544 + unsigned long cfg;
2545 +
2546 + /*
2547 + * Wait for idle.
2548 + */
2549 + if (unlikely(vector == NMI_VECTOR))
2550 + safe_apic_wait_icr_idle();
2551 + else
2552 + apic_wait_icr_idle();
2553 +
2554 + /*
2555 + * prepare target chip field
2556 + */
2557 + cfg = __prepare_ICR2(mask);
2558 + apic_write_around(APIC_ICR2, cfg);
2559 +
2560 + /*
2561 + * program the ICR
2562 + */
2563 + cfg = __prepare_ICR(0, vector);
2564 +
2565 + /*
2566 + * Send the IPI. The write to APIC_ICR fires this off.
2567 + */
2568 + apic_write_around(APIC_ICR, cfg);
2569 +}
2570 +#endif
2571 +
2572 +/*
2573 + * This is only used on smaller machines.
2574 + */
2575 +void send_IPI_mask_bitmask(cpumask_t cpumask, int vector)
2576 +{
2577 +#ifndef CONFIG_XEN
2578 + unsigned long mask = cpus_addr(cpumask)[0];
2579 +#else
2580 + cpumask_t mask;
2581 + unsigned int cpu;
2582 +#endif
2583 + unsigned long flags;
2584 +
2585 + local_irq_save(flags);
2586 +#ifndef CONFIG_XEN
2587 + WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]);
2588 + __send_IPI_dest_field(mask, vector);
2589 +#else
2590 + cpus_andnot(mask, cpumask, cpu_online_map);
2591 + WARN_ON(!cpus_empty(mask));
2592 + for_each_online_cpu(cpu)
2593 + if (cpu_isset(cpu, cpumask))
2594 + __send_IPI_one(cpu, vector);
2595 +#endif
2596 + local_irq_restore(flags);
2597 +}
2598 +
2599 +void send_IPI_mask_sequence(cpumask_t mask, int vector)
2600 +{
2601 +#ifndef CONFIG_XEN
2602 + unsigned long flags;
2603 + unsigned int query_cpu;
2604 +
2605 + /*
2606 + * Hack. The clustered APIC addressing mode doesn't allow us to send
2607 + * to an arbitrary mask, so I do a unicasts to each CPU instead. This
2608 + * should be modified to do 1 message per cluster ID - mbligh
2609 + */
2610 +
2611 + local_irq_save(flags);
2612 + for_each_possible_cpu(query_cpu) {
2613 + if (cpu_isset(query_cpu, mask)) {
2614 + __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu),
2615 + vector);
2616 + }
2617 + }
2618 + local_irq_restore(flags);
2619 +#else
2620 + send_IPI_mask_bitmask(mask, vector);
2621 +#endif
2622 +}
2623 +
2624 +/* must come after the send_IPI functions above for inlining */
2625 +#include <mach_ipi.h>
2626 +
2627 +#ifndef CONFIG_XEN
2628 +static int convert_apicid_to_cpu(int apic_id)
2629 +{
2630 + int i;
2631 +
2632 + for_each_possible_cpu(i) {
2633 + if (per_cpu(x86_cpu_to_apicid, i) == apic_id)
2634 + return i;
2635 + }
2636 + return -1;
2637 +}
2638 +
2639 +int safe_smp_processor_id(void)
2640 +{
2641 + int apicid, cpuid;
2642 +
2643 + if (!boot_cpu_has(X86_FEATURE_APIC))
2644 + return 0;
2645 +
2646 + apicid = hard_smp_processor_id();
2647 + if (apicid == BAD_APICID)
2648 + return 0;
2649 +
2650 + cpuid = convert_apicid_to_cpu(apicid);
2651 +
2652 + return cpuid >= 0 ? cpuid : 0;
2653 +}
2654 +#endif
2655 +#endif
2656 --- a/arch/x86/kernel/irq_32-xen.c
2657 +++ b/arch/x86/kernel/irq_32-xen.c
2658 @@ -79,7 +79,7 @@ unsigned int do_IRQ(struct pt_regs *regs
2659
2660 if (unlikely((unsigned)irq >= NR_IRQS)) {
2661 printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
2662 - __FUNCTION__, irq);
2663 + __func__, irq);
2664 BUG();
2665 }
2666
2667 @@ -134,7 +134,7 @@ unsigned int do_IRQ(struct pt_regs *regs
2668 : "=a" (arg1), "=d" (arg2), "=b" (bx)
2669 : "0" (irq), "1" (desc), "2" (isp),
2670 "D" (desc->handle_irq)
2671 - : "memory", "cc"
2672 + : "memory", "cc", "ecx"
2673 );
2674 } else
2675 #endif
2676 @@ -190,8 +190,6 @@ void irq_ctx_exit(int cpu)
2677 hardirq_ctx[cpu] = NULL;
2678 }
2679
2680 -extern asmlinkage void __do_softirq(void);
2681 -
2682 asmlinkage void do_softirq(void)
2683 {
2684 unsigned long flags;
2685 --- a/arch/x86/kernel/machine_kexec_64.c
2686 +++ b/arch/x86/kernel/machine_kexec_64.c
2687 @@ -120,8 +120,6 @@ int __init machine_kexec_setup_resources
2688 return 0;
2689 }
2690
2691 -void machine_kexec_register_resources(struct resource *res) { ; }
2692 -
2693 #else /* CONFIG_XEN */
2694
2695 #define x__pmd(x) __pmd(x)
2696 --- a/arch/x86/kernel/Makefile
2697 +++ b/arch/x86/kernel/Makefile
2698 @@ -122,8 +122,7 @@ ifeq ($(CONFIG_X86_64),y)
2699
2700 obj-$(CONFIG_XEN) += nmi_64.o
2701 time_64-$(CONFIG_XEN) += time_32.o
2702 - pci-dma_64-$(CONFIG_XEN) += pci-dma_32.o
2703 endif
2704
2705 -disabled-obj-$(CONFIG_XEN) := early-quirks.o hpet.o i8253.o i8259_$(BITS).o reboot.o \
2706 - smpboot_$(BITS).o tsc_$(BITS).o tsc_sync.o
2707 +disabled-obj-$(CONFIG_XEN) := early-quirks.o hpet.o i8253.o i8259_$(BITS).o \
2708 + pci-swiotlb_64.o reboot.o smpboot.o tlb_$(BITS).o tsc_$(BITS).o tsc_sync.o vsmp_64.o
2709 --- a/arch/x86/kernel/microcode-xen.c
2710 +++ b/arch/x86/kernel/microcode-xen.c
2711 @@ -162,7 +162,7 @@ static int request_microcode(void)
2712 c->x86, c->x86_model, c->x86_mask);
2713 error = request_firmware(&firmware, name, &microcode_pdev->dev);
2714 if (error) {
2715 - pr_debug("ucode data file %s load failed\n", name);
2716 + pr_debug("microcode: ucode data file %s load failed\n", name);
2717 return error;
2718 }
2719
2720 --- a/arch/x86/kernel/mmconf-fam10h_64.c
2721 +++ b/arch/x86/kernel/mmconf-fam10h_64.c
2722 @@ -219,6 +219,16 @@ void __cpuinit fam10h_check_enable_mmcfg
2723 val |= fam10h_pci_mmconf_base | (8 << FAM10H_MMIO_CONF_BUSRANGE_SHIFT) |
2724 FAM10H_MMIO_CONF_ENABLE;
2725 wrmsrl(address, val);
2726 +
2727 +#ifdef CONFIG_XEN
2728 + {
2729 + u64 val2;
2730 +
2731 + rdmsrl(address, val2);
2732 + if (val2 != val)
2733 + pci_probe &= ~PCI_CHECK_ENABLE_AMD_MMCONF;
2734 + }
2735 +#endif
2736 }
2737
2738 static int __devinit set_check_enable_amd_mmconf(const struct dmi_system_id *d)
2739 --- a/arch/x86/kernel/mpparse_32-xen.c
2740 +++ /dev/null
2741 @@ -1,1161 +0,0 @@
2742 -/*
2743 - * Intel Multiprocessor Specification 1.1 and 1.4
2744 - * compliant MP-table parsing routines.
2745 - *
2746 - * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
2747 - * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
2748 - *
2749 - * Fixes
2750 - * Erich Boleyn : MP v1.4 and additional changes.
2751 - * Alan Cox : Added EBDA scanning
2752 - * Ingo Molnar : various cleanups and rewrites
2753 - * Maciej W. Rozycki: Bits for default MP configurations
2754 - * Paul Diefenbaugh: Added full ACPI support
2755 - */
2756 -
2757 -#include <linux/mm.h>
2758 -#include <linux/init.h>
2759 -#include <linux/acpi.h>
2760 -#include <linux/delay.h>
2761 -#include <linux/bootmem.h>
2762 -#include <linux/kernel_stat.h>
2763 -#include <linux/mc146818rtc.h>
2764 -#include <linux/bitops.h>
2765 -
2766 -#include <asm/smp.h>
2767 -#include <asm/acpi.h>
2768 -#include <asm/mtrr.h>
2769 -#include <asm/mpspec.h>
2770 -#include <asm/io_apic.h>
2771 -
2772 -#include <mach_apic.h>
2773 -#include <mach_apicdef.h>
2774 -#include <mach_mpparse.h>
2775 -#include <bios_ebda.h>
2776 -
2777 -/* Have we found an MP table */
2778 -int smp_found_config;
2779 -unsigned int __cpuinitdata maxcpus = NR_CPUS;
2780 -
2781 -/*
2782 - * Various Linux-internal data structures created from the
2783 - * MP-table.
2784 - */
2785 -int apic_version [MAX_APICS];
2786 -int mp_bus_id_to_type [MAX_MP_BUSSES];
2787 -int mp_bus_id_to_node [MAX_MP_BUSSES];
2788 -int mp_bus_id_to_local [MAX_MP_BUSSES];
2789 -int quad_local_to_mp_bus_id [NR_CPUS/4][4];
2790 -int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
2791 -static int mp_current_pci_id;
2792 -
2793 -/* I/O APIC entries */
2794 -struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
2795 -
2796 -/* # of MP IRQ source entries */
2797 -struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
2798 -
2799 -/* MP IRQ source entries */
2800 -int mp_irq_entries;
2801 -
2802 -int nr_ioapics;
2803 -
2804 -int pic_mode;
2805 -unsigned long mp_lapic_addr;
2806 -
2807 -unsigned int def_to_bigsmp = 0;
2808 -
2809 -/* Processor that is doing the boot up */
2810 -unsigned int boot_cpu_physical_apicid = -1U;
2811 -/* Internal processor count */
2812 -unsigned int num_processors;
2813 -
2814 -/* Bitmask of physically existing CPUs */
2815 -physid_mask_t phys_cpu_present_map;
2816 -
2817 -u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
2818 -
2819 -/*
2820 - * Intel MP BIOS table parsing routines:
2821 - */
2822 -
2823 -
2824 -/*
2825 - * Checksum an MP configuration block.
2826 - */
2827 -
2828 -static int __init mpf_checksum(unsigned char *mp, int len)
2829 -{
2830 - int sum = 0;
2831 -
2832 - while (len--)
2833 - sum += *mp++;
2834 -
2835 - return sum & 0xFF;
2836 -}
2837 -
2838 -/*
2839 - * Have to match translation table entries to main table entries by counter
2840 - * hence the mpc_record variable .... can't see a less disgusting way of
2841 - * doing this ....
2842 - */
2843 -
2844 -static int mpc_record;
2845 -static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY] __cpuinitdata;
2846 -
2847 -#ifndef CONFIG_XEN
2848 -static void __cpuinit MP_processor_info (struct mpc_config_processor *m)
2849 -{
2850 - int ver, apicid;
2851 - physid_mask_t phys_cpu;
2852 -
2853 - if (!(m->mpc_cpuflag & CPU_ENABLED))
2854 - return;
2855 -
2856 - apicid = mpc_apic_id(m, translation_table[mpc_record]);
2857 -
2858 - if (m->mpc_featureflag&(1<<0))
2859 - Dprintk(" Floating point unit present.\n");
2860 - if (m->mpc_featureflag&(1<<7))
2861 - Dprintk(" Machine Exception supported.\n");
2862 - if (m->mpc_featureflag&(1<<8))
2863 - Dprintk(" 64 bit compare & exchange supported.\n");
2864 - if (m->mpc_featureflag&(1<<9))
2865 - Dprintk(" Internal APIC present.\n");
2866 - if (m->mpc_featureflag&(1<<11))
2867 - Dprintk(" SEP present.\n");
2868 - if (m->mpc_featureflag&(1<<12))
2869 - Dprintk(" MTRR present.\n");
2870 - if (m->mpc_featureflag&(1<<13))
2871 - Dprintk(" PGE present.\n");
2872 - if (m->mpc_featureflag&(1<<14))
2873 - Dprintk(" MCA present.\n");
2874 - if (m->mpc_featureflag&(1<<15))
2875 - Dprintk(" CMOV present.\n");
2876 - if (m->mpc_featureflag&(1<<16))
2877 - Dprintk(" PAT present.\n");
2878 - if (m->mpc_featureflag&(1<<17))
2879 - Dprintk(" PSE present.\n");
2880 - if (m->mpc_featureflag&(1<<18))
2881 - Dprintk(" PSN present.\n");
2882 - if (m->mpc_featureflag&(1<<19))
2883 - Dprintk(" Cache Line Flush Instruction present.\n");
2884 - /* 20 Reserved */
2885 - if (m->mpc_featureflag&(1<<21))
2886 - Dprintk(" Debug Trace and EMON Store present.\n");
2887 - if (m->mpc_featureflag&(1<<22))
2888 - Dprintk(" ACPI Thermal Throttle Registers present.\n");
2889 - if (m->mpc_featureflag&(1<<23))
2890 - Dprintk(" MMX present.\n");
2891 - if (m->mpc_featureflag&(1<<24))
2892 - Dprintk(" FXSR present.\n");
2893 - if (m->mpc_featureflag&(1<<25))
2894 - Dprintk(" XMM present.\n");
2895 - if (m->mpc_featureflag&(1<<26))
2896 - Dprintk(" Willamette New Instructions present.\n");
2897 - if (m->mpc_featureflag&(1<<27))
2898 - Dprintk(" Self Snoop present.\n");
2899 - if (m->mpc_featureflag&(1<<28))
2900 - Dprintk(" HT present.\n");
2901 - if (m->mpc_featureflag&(1<<29))
2902 - Dprintk(" Thermal Monitor present.\n");
2903 - /* 30, 31 Reserved */
2904 -
2905 -
2906 - if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
2907 - Dprintk(" Bootup CPU\n");
2908 - boot_cpu_physical_apicid = m->mpc_apicid;
2909 - }
2910 -
2911 - ver = m->mpc_apicver;
2912 -
2913 - /*
2914 - * Validate version
2915 - */
2916 - if (ver == 0x0) {
2917 - printk(KERN_WARNING "BIOS bug, APIC version is 0 for CPU#%d! "
2918 - "fixing up to 0x10. (tell your hw vendor)\n",
2919 - m->mpc_apicid);
2920 - ver = 0x10;
2921 - }
2922 - apic_version[m->mpc_apicid] = ver;
2923 -
2924 - phys_cpu = apicid_to_cpu_present(apicid);
2925 - physids_or(phys_cpu_present_map, phys_cpu_present_map, phys_cpu);
2926 -
2927 - if (num_processors >= NR_CPUS) {
2928 - printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
2929 - " Processor ignored.\n", NR_CPUS);
2930 - return;
2931 - }
2932 -
2933 - if (num_processors >= maxcpus) {
2934 - printk(KERN_WARNING "WARNING: maxcpus limit of %i reached."
2935 - " Processor ignored.\n", maxcpus);
2936 - return;
2937 - }
2938 -
2939 - cpu_set(num_processors, cpu_possible_map);
2940 - num_processors++;
2941 -
2942 - /*
2943 - * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y
2944 - * but we need to work other dependencies like SMP_SUSPEND etc
2945 - * before this can be done without some confusion.
2946 - * if (CPU_HOTPLUG_ENABLED || num_processors > 8)
2947 - * - Ashok Raj <ashok.raj@intel.com>
2948 - */
2949 - if (num_processors > 8) {
2950 - switch (boot_cpu_data.x86_vendor) {
2951 - case X86_VENDOR_INTEL:
2952 - if (!APIC_XAPIC(ver)) {
2953 - def_to_bigsmp = 0;
2954 - break;
2955 - }
2956 - /* If P4 and above fall through */
2957 - case X86_VENDOR_AMD:
2958 - def_to_bigsmp = 1;
2959 - }
2960 - }
2961 - bios_cpu_apicid[num_processors - 1] = m->mpc_apicid;
2962 -}
2963 -#else
2964 -static void __cpuinit MP_processor_info (struct mpc_config_processor *m)
2965 -{
2966 - num_processors++;
2967 -}
2968 -#endif /* CONFIG_XEN */
2969 -
2970 -static void __init MP_bus_info (struct mpc_config_bus *m)
2971 -{
2972 - char str[7];
2973 -
2974 - memcpy(str, m->mpc_bustype, 6);
2975 - str[6] = 0;
2976 -
2977 - mpc_oem_bus_info(m, str, translation_table[mpc_record]);
2978 -
2979 -#if MAX_MP_BUSSES < 256
2980 - if (m->mpc_busid >= MAX_MP_BUSSES) {
2981 - printk(KERN_WARNING "MP table busid value (%d) for bustype %s "
2982 - " is too large, max. supported is %d\n",
2983 - m->mpc_busid, str, MAX_MP_BUSSES - 1);
2984 - return;
2985 - }
2986 -#endif
2987 -
2988 - if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA)-1) == 0) {
2989 - mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
2990 - } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA)-1) == 0) {
2991 - mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA;
2992 - } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI)-1) == 0) {
2993 - mpc_oem_pci_bus(m, translation_table[mpc_record]);
2994 - mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
2995 - mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
2996 - mp_current_pci_id++;
2997 - } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA)-1) == 0) {
2998 - mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA;
2999 - } else {
3000 - printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
3001 - }
3002 -}
3003 -
3004 -static void __init MP_ioapic_info (struct mpc_config_ioapic *m)
3005 -{
3006 - if (!(m->mpc_flags & MPC_APIC_USABLE))
3007 - return;
3008 -
3009 - printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n",
3010 - m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
3011 - if (nr_ioapics >= MAX_IO_APICS) {
3012 - printk(KERN_CRIT "Max # of I/O APICs (%d) exceeded (found %d).\n",
3013 - MAX_IO_APICS, nr_ioapics);
3014 - panic("Recompile kernel with bigger MAX_IO_APICS!.\n");
3015 - }
3016 - if (!m->mpc_apicaddr) {
3017 - printk(KERN_ERR "WARNING: bogus zero I/O APIC address"
3018 - " found in MP table, skipping!\n");
3019 - return;
3020 - }
3021 - mp_ioapics[nr_ioapics] = *m;
3022 - nr_ioapics++;
3023 -}
3024 -
3025 -static void __init MP_intsrc_info (struct mpc_config_intsrc *m)
3026 -{
3027 - mp_irqs [mp_irq_entries] = *m;
3028 - Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
3029 - " IRQ %02x, APIC ID %x, APIC INT %02x\n",
3030 - m->mpc_irqtype, m->mpc_irqflag & 3,
3031 - (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
3032 - m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
3033 - if (++mp_irq_entries == MAX_IRQ_SOURCES)
3034 - panic("Max # of irq sources exceeded!!\n");
3035 -}
3036 -
3037 -static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m)
3038 -{
3039 - Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
3040 - " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
3041 - m->mpc_irqtype, m->mpc_irqflag & 3,
3042 - (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid,
3043 - m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
3044 -}
3045 -
3046 -#ifdef CONFIG_X86_NUMAQ
3047 -static void __init MP_translation_info (struct mpc_config_translation *m)
3048 -{
3049 - printk(KERN_INFO "Translation: record %d, type %d, quad %d, global %d, local %d\n", mpc_record, m->trans_type, m->trans_quad, m->trans_global, m->trans_local);
3050 -
3051 - if (mpc_record >= MAX_MPC_ENTRY)
3052 - printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
3053 - else
3054 - translation_table[mpc_record] = m; /* stash this for later */
3055 - if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
3056 - node_set_online(m->trans_quad);
3057 -}
3058 -
3059 -/*
3060 - * Read/parse the MPC oem tables
3061 - */
3062 -
3063 -static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable, \
3064 - unsigned short oemsize)
3065 -{
3066 - int count = sizeof (*oemtable); /* the header size */
3067 - unsigned char *oemptr = ((unsigned char *)oemtable)+count;
3068 -
3069 - mpc_record = 0;
3070 - printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n", oemtable);
3071 - if (memcmp(oemtable->oem_signature,MPC_OEM_SIGNATURE,4))
3072 - {
3073 - printk(KERN_WARNING "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
3074 - oemtable->oem_signature[0],
3075 - oemtable->oem_signature[1],
3076 - oemtable->oem_signature[2],
3077 - oemtable->oem_signature[3]);
3078 - return;
3079 - }
3080 - if (mpf_checksum((unsigned char *)oemtable,oemtable->oem_length))
3081 - {
3082 - printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
3083 - return;
3084 - }
3085 - while (count < oemtable->oem_length) {
3086 - switch (*oemptr) {
3087 - case MP_TRANSLATION:
3088 - {
3089 - struct mpc_config_translation *m=
3090 - (struct mpc_config_translation *)oemptr;
3091 - MP_translation_info(m);
3092 - oemptr += sizeof(*m);
3093 - count += sizeof(*m);
3094 - ++mpc_record;
3095 - break;
3096 - }
3097 - default:
3098 - {
3099 - printk(KERN_WARNING "Unrecognised OEM table entry type! - %d\n", (int) *oemptr);
3100 - return;
3101 - }
3102 - }
3103 - }
3104 -}
3105 -
3106 -static inline void mps_oem_check(struct mp_config_table *mpc, char *oem,
3107 - char *productid)
3108 -{
3109 - if (strncmp(oem, "IBM NUMA", 8))
3110 - printk("Warning! May not be a NUMA-Q system!\n");
3111 - if (mpc->mpc_oemptr)
3112 - smp_read_mpc_oem((struct mp_config_oemtable *) mpc->mpc_oemptr,
3113 - mpc->mpc_oemsize);
3114 -}
3115 -#endif /* CONFIG_X86_NUMAQ */
3116 -
3117 -/*
3118 - * Read/parse the MPC
3119 - */
3120 -
3121 -static int __init smp_read_mpc(struct mp_config_table *mpc)
3122 -{
3123 - char str[16];
3124 - char oem[10];
3125 - int count=sizeof(*mpc);
3126 - unsigned char *mpt=((unsigned char *)mpc)+count;
3127 -
3128 - if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) {
3129 - printk(KERN_ERR "SMP mptable: bad signature [0x%x]!\n",
3130 - *(u32 *)mpc->mpc_signature);
3131 - return 0;
3132 - }
3133 - if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) {
3134 - printk(KERN_ERR "SMP mptable: checksum error!\n");
3135 - return 0;
3136 - }
3137 - if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) {
3138 - printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n",
3139 - mpc->mpc_spec);
3140 - return 0;
3141 - }
3142 - if (!mpc->mpc_lapic) {
3143 - printk(KERN_ERR "SMP mptable: null local APIC address!\n");
3144 - return 0;
3145 - }
3146 - memcpy(oem,mpc->mpc_oem,8);
3147 - oem[8]=0;
3148 - printk(KERN_INFO "OEM ID: %s ",oem);
3149 -
3150 - memcpy(str,mpc->mpc_productid,12);
3151 - str[12]=0;
3152 - printk("Product ID: %s ",str);
3153 -
3154 - mps_oem_check(mpc, oem, str);
3155 -
3156 - printk("APIC at: 0x%X\n", mpc->mpc_lapic);
3157 -
3158 - /*
3159 - * Save the local APIC address (it might be non-default) -- but only
3160 - * if we're not using ACPI.
3161 - */
3162 - if (!acpi_lapic)
3163 - mp_lapic_addr = mpc->mpc_lapic;
3164 -
3165 - /*
3166 - * Now process the configuration blocks.
3167 - */
3168 - mpc_record = 0;
3169 - while (count < mpc->mpc_length) {
3170 - switch(*mpt) {
3171 - case MP_PROCESSOR:
3172 - {
3173 - struct mpc_config_processor *m=
3174 - (struct mpc_config_processor *)mpt;
3175 - /* ACPI may have already provided this data */
3176 - if (!acpi_lapic)
3177 - MP_processor_info(m);
3178 - mpt += sizeof(*m);
3179 - count += sizeof(*m);
3180 - break;
3181 - }
3182 - case MP_BUS:
3183 - {
3184 - struct mpc_config_bus *m=
3185 - (struct mpc_config_bus *)mpt;
3186 - MP_bus_info(m);
3187 - mpt += sizeof(*m);
3188 - count += sizeof(*m);
3189 - break;
3190 - }
3191 - case MP_IOAPIC:
3192 - {
3193 - struct mpc_config_ioapic *m=
3194 - (struct mpc_config_ioapic *)mpt;
3195 - MP_ioapic_info(m);
3196 - mpt+=sizeof(*m);
3197 - count+=sizeof(*m);
3198 - break;
3199 - }
3200 - case MP_INTSRC:
3201 - {
3202 - struct mpc_config_intsrc *m=
3203 - (struct mpc_config_intsrc *)mpt;
3204 -
3205 - MP_intsrc_info(m);
3206 - mpt+=sizeof(*m);
3207 - count+=sizeof(*m);
3208 - break;
3209 - }
3210 - case MP_LINTSRC:
3211 - {
3212 - struct mpc_config_lintsrc *m=
3213 - (struct mpc_config_lintsrc *)mpt;
3214 - MP_lintsrc_info(m);
3215 - mpt+=sizeof(*m);
3216 - count+=sizeof(*m);
3217 - break;
3218 - }
3219 - default:
3220 - {
3221 - count = mpc->mpc_length;
3222 - break;
3223 - }
3224 - }
3225 - ++mpc_record;
3226 - }
3227 - setup_apic_routing();
3228 - if (!num_processors)
3229 - printk(KERN_ERR "SMP mptable: no processors registered!\n");
3230 - return num_processors;
3231 -}
3232 -
3233 -static int __init ELCR_trigger(unsigned int irq)
3234 -{
3235 - unsigned int port;
3236 -
3237 - port = 0x4d0 + (irq >> 3);
3238 - return (inb(port) >> (irq & 7)) & 1;
3239 -}
3240 -
3241 -static void __init construct_default_ioirq_mptable(int mpc_default_type)
3242 -{
3243 - struct mpc_config_intsrc intsrc;
3244 - int i;
3245 - int ELCR_fallback = 0;
3246 -
3247 - intsrc.mpc_type = MP_INTSRC;
3248 - intsrc.mpc_irqflag = 0; /* conforming */
3249 - intsrc.mpc_srcbus = 0;
3250 - intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
3251 -
3252 - intsrc.mpc_irqtype = mp_INT;
3253 -
3254 - /*
3255 - * If true, we have an ISA/PCI system with no IRQ entries
3256 - * in the MP table. To prevent the PCI interrupts from being set up
3257 - * incorrectly, we try to use the ELCR. The sanity check to see if
3258 - * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can
3259 - * never be level sensitive, so we simply see if the ELCR agrees.
3260 - * If it does, we assume it's valid.
3261 - */
3262 - if (mpc_default_type == 5) {
3263 - printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n");
3264 -
3265 - if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13))
3266 - printk(KERN_WARNING "ELCR contains invalid data... not using ELCR\n");
3267 - else {
3268 - printk(KERN_INFO "Using ELCR to identify PCI interrupts\n");
3269 - ELCR_fallback = 1;
3270 - }
3271 - }
3272 -
3273 - for (i = 0; i < 16; i++) {
3274 - switch (mpc_default_type) {
3275 - case 2:
3276 - if (i == 0 || i == 13)
3277 - continue; /* IRQ0 & IRQ13 not connected */
3278 - /* fall through */
3279 - default:
3280 - if (i == 2)
3281 - continue; /* IRQ2 is never connected */
3282 - }
3283 -
3284 - if (ELCR_fallback) {
3285 - /*
3286 - * If the ELCR indicates a level-sensitive interrupt, we
3287 - * copy that information over to the MP table in the
3288 - * irqflag field (level sensitive, active high polarity).
3289 - */
3290 - if (ELCR_trigger(i))
3291 - intsrc.mpc_irqflag = 13;
3292 - else
3293 - intsrc.mpc_irqflag = 0;
3294 - }
3295 -
3296 - intsrc.mpc_srcbusirq = i;
3297 - intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */
3298 - MP_intsrc_info(&intsrc);
3299 - }
3300 -
3301 - intsrc.mpc_irqtype = mp_ExtINT;
3302 - intsrc.mpc_srcbusirq = 0;
3303 - intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */
3304 - MP_intsrc_info(&intsrc);
3305 -}
3306 -
3307 -static inline void __init construct_default_ISA_mptable(int mpc_default_type)
3308 -{
3309 - struct mpc_config_processor processor;
3310 - struct mpc_config_bus bus;
3311 - struct mpc_config_ioapic ioapic;
3312 - struct mpc_config_lintsrc lintsrc;
3313 - int linttypes[2] = { mp_ExtINT, mp_NMI };
3314 - int i;
3315 -
3316 - /*
3317 - * local APIC has default address
3318 - */
3319 - mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
3320 -
3321 - /*
3322 - * 2 CPUs, numbered 0 & 1.
3323 - */
3324 - processor.mpc_type = MP_PROCESSOR;
3325 - /* Either an integrated APIC or a discrete 82489DX. */
3326 - processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
3327 - processor.mpc_cpuflag = CPU_ENABLED;
3328 - processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
3329 - (boot_cpu_data.x86_model << 4) |
3330 - boot_cpu_data.x86_mask;
3331 - processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
3332 - processor.mpc_reserved[0] = 0;
3333 - processor.mpc_reserved[1] = 0;
3334 - for (i = 0; i < 2; i++) {
3335 - processor.mpc_apicid = i;
3336 - MP_processor_info(&processor);
3337 - }
3338 -
3339 - bus.mpc_type = MP_BUS;
3340 - bus.mpc_busid = 0;
3341 - switch (mpc_default_type) {
3342 - default:
3343 - printk("???\n");
3344 - printk(KERN_ERR "Unknown standard configuration %d\n",
3345 - mpc_default_type);
3346 - /* fall through */
3347 - case 1:
3348 - case 5:
3349 - memcpy(bus.mpc_bustype, "ISA ", 6);
3350 - break;
3351 - case 2:
3352 - case 6:
3353 - case 3:
3354 - memcpy(bus.mpc_bustype, "EISA ", 6);
3355 - break;
3356 - case 4:
3357 - case 7:
3358 - memcpy(bus.mpc_bustype, "MCA ", 6);
3359 - }
3360 - MP_bus_info(&bus);
3361 - if (mpc_default_type > 4) {
3362 - bus.mpc_busid = 1;
3363 - memcpy(bus.mpc_bustype, "PCI ", 6);
3364 - MP_bus_info(&bus);
3365 - }
3366 -
3367 - ioapic.mpc_type = MP_IOAPIC;
3368 - ioapic.mpc_apicid = 2;
3369 - ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
3370 - ioapic.mpc_flags = MPC_APIC_USABLE;
3371 - ioapic.mpc_apicaddr = 0xFEC00000;
3372 - MP_ioapic_info(&ioapic);
3373 -
3374 - /*
3375 - * We set up most of the low 16 IO-APIC pins according to MPS rules.
3376 - */
3377 - construct_default_ioirq_mptable(mpc_default_type);
3378 -
3379 - lintsrc.mpc_type = MP_LINTSRC;
3380 - lintsrc.mpc_irqflag = 0; /* conforming */
3381 - lintsrc.mpc_srcbusid = 0;
3382 - lintsrc.mpc_srcbusirq = 0;
3383 - lintsrc.mpc_destapic = MP_APIC_ALL;
3384 - for (i = 0; i < 2; i++) {
3385 - lintsrc.mpc_irqtype = linttypes[i];
3386 - lintsrc.mpc_destapiclint = i;
3387 - MP_lintsrc_info(&lintsrc);
3388 - }
3389 -}
3390 -
3391 -static struct intel_mp_floating *mpf_found;
3392 -
3393 -/*
3394 - * Scan the memory blocks for an SMP configuration block.
3395 - */
3396 -void __init get_smp_config (void)
3397 -{
3398 - struct intel_mp_floating *mpf = mpf_found;
3399 -
3400 - /*
3401 - * ACPI supports both logical (e.g. Hyper-Threading) and physical
3402 - * processors, where MPS only supports physical.
3403 - */
3404 - if (acpi_lapic && acpi_ioapic) {
3405 - printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n");
3406 - return;
3407 - }
3408 - else if (acpi_lapic)
3409 - printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n");
3410 -
3411 - printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification);
3412 - if (mpf->mpf_feature2 & (1<<7)) {
3413 - printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
3414 - pic_mode = 1;
3415 - } else {
3416 - printk(KERN_INFO " Virtual Wire compatibility mode.\n");
3417 - pic_mode = 0;
3418 - }
3419 -
3420 - /*
3421 - * Now see if we need to read further.
3422 - */
3423 - if (mpf->mpf_feature1 != 0) {
3424 -
3425 - printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1);
3426 - construct_default_ISA_mptable(mpf->mpf_feature1);
3427 -
3428 - } else if (mpf->mpf_physptr) {
3429 -
3430 - /*
3431 - * Read the physical hardware table. Anything here will
3432 - * override the defaults.
3433 - */
3434 - if (!smp_read_mpc(isa_bus_to_virt(mpf->mpf_physptr))) {
3435 - smp_found_config = 0;
3436 - printk(KERN_ERR "BIOS bug, MP table errors detected!...\n");
3437 - printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n");
3438 - return;
3439 - }
3440 - /*
3441 - * If there are no explicit MP IRQ entries, then we are
3442 - * broken. We set up most of the low 16 IO-APIC pins to
3443 - * ISA defaults and hope it will work.
3444 - */
3445 - if (!mp_irq_entries) {
3446 - struct mpc_config_bus bus;
3447 -
3448 - printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n");
3449 -
3450 - bus.mpc_type = MP_BUS;
3451 - bus.mpc_busid = 0;
3452 - memcpy(bus.mpc_bustype, "ISA ", 6);
3453 - MP_bus_info(&bus);
3454 -
3455 - construct_default_ioirq_mptable(0);
3456 - }
3457 -
3458 - } else
3459 - BUG();
3460 -
3461 - printk(KERN_INFO "Processors: %d\n", num_processors);
3462 - /*
3463 - * Only use the first configuration found.
3464 - */
3465 -}
3466 -
3467 -static int __init smp_scan_config (unsigned long base, unsigned long length)
3468 -{
3469 - unsigned long *bp = isa_bus_to_virt(base);
3470 - struct intel_mp_floating *mpf;
3471 -
3472 - printk(KERN_INFO "Scan SMP from %p for %ld bytes.\n", bp,length);
3473 - if (sizeof(*mpf) != 16)
3474 - printk("Error: MPF size\n");
3475 -
3476 - while (length > 0) {
3477 - mpf = (struct intel_mp_floating *)bp;
3478 - if ((*bp == SMP_MAGIC_IDENT) &&
3479 - (mpf->mpf_length == 1) &&
3480 - !mpf_checksum((unsigned char *)bp, 16) &&
3481 - ((mpf->mpf_specification == 1)
3482 - || (mpf->mpf_specification == 4)) ) {
3483 -
3484 - smp_found_config = 1;
3485 -#ifndef CONFIG_XEN
3486 - printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
3487 - mpf, virt_to_phys(mpf));
3488 - reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE,
3489 - BOOTMEM_DEFAULT);
3490 - if (mpf->mpf_physptr) {
3491 - /*
3492 - * We cannot access to MPC table to compute
3493 - * table size yet, as only few megabytes from
3494 - * the bottom is mapped now.
3495 - * PC-9800's MPC table places on the very last
3496 - * of physical memory; so that simply reserving
3497 - * PAGE_SIZE from mpg->mpf_physptr yields BUG()
3498 - * in reserve_bootmem.
3499 - */
3500 - unsigned long size = PAGE_SIZE;
3501 - unsigned long end = max_low_pfn * PAGE_SIZE;
3502 - if (mpf->mpf_physptr + size > end)
3503 - size = end - mpf->mpf_physptr;
3504 - reserve_bootmem(mpf->mpf_physptr, size,
3505 - BOOTMEM_DEFAULT);
3506 - }
3507 -#else
3508 - printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
3509 - mpf, ((void *)bp - isa_bus_to_virt(base)) + base);
3510 -#endif
3511 -
3512 - mpf_found = mpf;
3513 - return 1;
3514 - }
3515 - bp += 4;
3516 - length -= 16;
3517 - }
3518 - return 0;
3519 -}
3520 -
3521 -void __init find_smp_config (void)
3522 -{
3523 -#ifndef CONFIG_XEN
3524 - unsigned int address;
3525 -#endif
3526 -
3527 - /*
3528 - * FIXME: Linux assumes you have 640K of base ram..
3529 - * this continues the error...
3530 - *
3531 - * 1) Scan the bottom 1K for a signature
3532 - * 2) Scan the top 1K of base RAM
3533 - * 3) Scan the 64K of bios
3534 - */
3535 - if (smp_scan_config(0x0,0x400) ||
3536 - smp_scan_config(639*0x400,0x400) ||
3537 - smp_scan_config(0xF0000,0x10000))
3538 - return;
3539 - /*
3540 - * If it is an SMP machine we should know now, unless the
3541 - * configuration is in an EISA/MCA bus machine with an
3542 - * extended bios data area.
3543 - *
3544 - * there is a real-mode segmented pointer pointing to the
3545 - * 4K EBDA area at 0x40E, calculate and scan it here.
3546 - *
3547 - * NOTE! There are Linux loaders that will corrupt the EBDA
3548 - * area, and as such this kind of SMP config may be less
3549 - * trustworthy, simply because the SMP table may have been
3550 - * stomped on during early boot. These loaders are buggy and
3551 - * should be fixed.
3552 - *
3553 - * MP1.4 SPEC states to only scan first 1K of 4K EBDA.
3554 - */
3555 -
3556 -#ifndef CONFIG_XEN
3557 - address = get_bios_ebda();
3558 - if (address)
3559 - smp_scan_config(address, 0x400);
3560 -#endif
3561 -}
3562 -
3563 -int es7000_plat;
3564 -
3565 -/* --------------------------------------------------------------------------
3566 - ACPI-based MP Configuration
3567 - -------------------------------------------------------------------------- */
3568 -
3569 -#ifdef CONFIG_ACPI
3570 -
3571 -void __init mp_register_lapic_address(u64 address)
3572 -{
3573 -#ifndef CONFIG_XEN
3574 - mp_lapic_addr = (unsigned long) address;
3575 -
3576 - set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
3577 -
3578 - if (boot_cpu_physical_apicid == -1U)
3579 - boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
3580 -
3581 - Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid);
3582 -#endif
3583 -}
3584 -
3585 -void __cpuinit mp_register_lapic (u8 id, u8 enabled)
3586 -{
3587 - struct mpc_config_processor processor;
3588 - int boot_cpu = 0;
3589 -
3590 - if (MAX_APICS - id <= 0) {
3591 - printk(KERN_WARNING "Processor #%d invalid (max %d)\n",
3592 - id, MAX_APICS);
3593 - return;
3594 - }
3595 -
3596 - if (id == boot_cpu_physical_apicid)
3597 - boot_cpu = 1;
3598 -
3599 -#ifndef CONFIG_XEN
3600 - processor.mpc_type = MP_PROCESSOR;
3601 - processor.mpc_apicid = id;
3602 - processor.mpc_apicver = GET_APIC_VERSION(apic_read(APIC_LVR));
3603 - processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0);
3604 - processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0);
3605 - processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
3606 - (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
3607 - processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
3608 - processor.mpc_reserved[0] = 0;
3609 - processor.mpc_reserved[1] = 0;
3610 -#endif
3611 -
3612 - MP_processor_info(&processor);
3613 -}
3614 -
3615 -#ifdef CONFIG_X86_IO_APIC
3616 -
3617 -#define MP_ISA_BUS 0
3618 -#define MP_MAX_IOAPIC_PIN 127
3619 -
3620 -static struct mp_ioapic_routing {
3621 - int apic_id;
3622 - int gsi_base;
3623 - int gsi_end;
3624 - u32 pin_programmed[4];
3625 -} mp_ioapic_routing[MAX_IO_APICS];
3626 -
3627 -static int mp_find_ioapic (int gsi)
3628 -{
3629 - int i = 0;
3630 -
3631 - /* Find the IOAPIC that manages this GSI. */
3632 - for (i = 0; i < nr_ioapics; i++) {
3633 - if ((gsi >= mp_ioapic_routing[i].gsi_base)
3634 - && (gsi <= mp_ioapic_routing[i].gsi_end))
3635 - return i;
3636 - }
3637 -
3638 - printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
3639 -
3640 - return -1;
3641 -}
3642 -
3643 -void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base)
3644 -{
3645 - int idx = 0;
3646 - int tmpid;
3647 -
3648 - if (nr_ioapics >= MAX_IO_APICS) {
3649 - printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
3650 - "(found %d)\n", MAX_IO_APICS, nr_ioapics);
3651 - panic("Recompile kernel with bigger MAX_IO_APICS!\n");
3652 - }
3653 - if (!address) {
3654 - printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
3655 - " found in MADT table, skipping!\n");
3656 - return;
3657 - }
3658 -
3659 - idx = nr_ioapics++;
3660 -
3661 - mp_ioapics[idx].mpc_type = MP_IOAPIC;
3662 - mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
3663 - mp_ioapics[idx].mpc_apicaddr = address;
3664 -
3665 -#ifndef CONFIG_XEN
3666 - set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
3667 -#endif
3668 - if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
3669 - && !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
3670 - tmpid = io_apic_get_unique_id(idx, id);
3671 - else
3672 - tmpid = id;
3673 - if (tmpid == -1) {
3674 - nr_ioapics--;
3675 - return;
3676 - }
3677 - mp_ioapics[idx].mpc_apicid = tmpid;
3678 - mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
3679 -
3680 - /*
3681 - * Build basic GSI lookup table to facilitate gsi->io_apic lookups
3682 - * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
3683 - */
3684 - mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
3685 - mp_ioapic_routing[idx].gsi_base = gsi_base;
3686 - mp_ioapic_routing[idx].gsi_end = gsi_base +
3687 - io_apic_get_redir_entries(idx);
3688 -
3689 - printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
3690 - "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
3691 - mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
3692 - mp_ioapic_routing[idx].gsi_base,
3693 - mp_ioapic_routing[idx].gsi_end);
3694 -}
3695 -
3696 -void __init
3697 -mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
3698 -{
3699 - struct mpc_config_intsrc intsrc;
3700 - int ioapic = -1;
3701 - int pin = -1;
3702 -
3703 - /*
3704 - * Convert 'gsi' to 'ioapic.pin'.
3705 - */
3706 - ioapic = mp_find_ioapic(gsi);
3707 - if (ioapic < 0)
3708 - return;
3709 - pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
3710 -
3711 - /*
3712 - * TBD: This check is for faulty timer entries, where the override
3713 - * erroneously sets the trigger to level, resulting in a HUGE
3714 - * increase of timer interrupts!
3715 - */
3716 - if ((bus_irq == 0) && (trigger == 3))
3717 - trigger = 1;
3718 -
3719 - intsrc.mpc_type = MP_INTSRC;
3720 - intsrc.mpc_irqtype = mp_INT;
3721 - intsrc.mpc_irqflag = (trigger << 2) | polarity;
3722 - intsrc.mpc_srcbus = MP_ISA_BUS;
3723 - intsrc.mpc_srcbusirq = bus_irq; /* IRQ */
3724 - intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */
3725 - intsrc.mpc_dstirq = pin; /* INTIN# */
3726 -
3727 - Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n",
3728 - intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
3729 - (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
3730 - intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq);
3731 -
3732 - mp_irqs[mp_irq_entries] = intsrc;
3733 - if (++mp_irq_entries == MAX_IRQ_SOURCES)
3734 - panic("Max # of irq sources exceeded!\n");
3735 -}
3736 -
3737 -void __init mp_config_acpi_legacy_irqs (void)
3738 -{
3739 - struct mpc_config_intsrc intsrc;
3740 - int i = 0;
3741 - int ioapic = -1;
3742 -
3743 - /*
3744 - * Fabricate the legacy ISA bus (bus #31).
3745 - */
3746 - mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
3747 - Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
3748 -
3749 - /*
3750 - * Older generations of ES7000 have no legacy identity mappings
3751 - */
3752 - if (es7000_plat == 1)
3753 - return;
3754 -
3755 - /*
3756 - * Locate the IOAPIC that manages the ISA IRQs (0-15).
3757 - */
3758 - ioapic = mp_find_ioapic(0);
3759 - if (ioapic < 0)
3760 - return;
3761 -
3762 - intsrc.mpc_type = MP_INTSRC;
3763 - intsrc.mpc_irqflag = 0; /* Conforming */
3764 - intsrc.mpc_srcbus = MP_ISA_BUS;
3765 - intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
3766 -
3767 - /*
3768 - * Use the default configuration for the IRQs 0-15. Unless
3769 - * overridden by (MADT) interrupt source override entries.
3770 - */
3771 - for (i = 0; i < 16; i++) {
3772 - int idx;
3773 -
3774 - for (idx = 0; idx < mp_irq_entries; idx++) {
3775 - struct mpc_config_intsrc *irq = mp_irqs + idx;
3776 -
3777 - /* Do we already have a mapping for this ISA IRQ? */
3778 - if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i)
3779 - break;
3780 -
3781 - /* Do we already have a mapping for this IOAPIC pin */
3782 - if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
3783 - (irq->mpc_dstirq == i))
3784 - break;
3785 - }
3786 -
3787 - if (idx != mp_irq_entries) {
3788 - printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
3789 - continue; /* IRQ already used */
3790 - }
3791 -
3792 - intsrc.mpc_irqtype = mp_INT;
3793 - intsrc.mpc_srcbusirq = i; /* Identity mapped */
3794 - intsrc.mpc_dstirq = i;
3795 -
3796 - Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, "
3797 - "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
3798 - (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
3799 - intsrc.mpc_srcbusirq, intsrc.mpc_dstapic,
3800 - intsrc.mpc_dstirq);
3801 -
3802 - mp_irqs[mp_irq_entries] = intsrc;
3803 - if (++mp_irq_entries == MAX_IRQ_SOURCES)
3804 - panic("Max # of irq sources exceeded!\n");
3805 - }
3806 -}
3807 -
3808 -#define MAX_GSI_NUM 4096
3809 -#define IRQ_COMPRESSION_START 64
3810 -
3811 -int mp_register_gsi(u32 gsi, int triggering, int polarity)
3812 -{
3813 - int ioapic = -1;
3814 - int ioapic_pin = 0;
3815 - int idx, bit = 0;
3816 - static int pci_irq = IRQ_COMPRESSION_START;
3817 - /*
3818 - * Mapping between Global System Interrupts, which
3819 - * represent all possible interrupts, and IRQs
3820 - * assigned to actual devices.
3821 - */
3822 - static int gsi_to_irq[MAX_GSI_NUM];
3823 -
3824 - /* Don't set up the ACPI SCI because it's already set up */
3825 - if (acpi_gbl_FADT.sci_interrupt == gsi)
3826 - return gsi;
3827 -
3828 - ioapic = mp_find_ioapic(gsi);
3829 - if (ioapic < 0) {
3830 - printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
3831 - return gsi;
3832 - }
3833 -
3834 - ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
3835 -
3836 - if (ioapic_renumber_irq)
3837 - gsi = ioapic_renumber_irq(ioapic, gsi);
3838 -
3839 - /*
3840 - * Avoid pin reprogramming. PRTs typically include entries
3841 - * with redundant pin->gsi mappings (but unique PCI devices);
3842 - * we only program the IOAPIC on the first.
3843 - */
3844 - bit = ioapic_pin % 32;
3845 - idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32);
3846 - if (idx > 3) {
3847 - printk(KERN_ERR "Invalid reference to IOAPIC pin "
3848 - "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
3849 - ioapic_pin);
3850 - return gsi;
3851 - }
3852 - if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
3853 - Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
3854 - mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
3855 - return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
3856 - }
3857 -
3858 - mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
3859 -
3860 - /*
3861 - * For GSI >= 64, use IRQ compression
3862 - */
3863 - if ((gsi >= IRQ_COMPRESSION_START)
3864 - && (triggering == ACPI_LEVEL_SENSITIVE)) {
3865 - /*
3866 - * For PCI devices assign IRQs in order, avoiding gaps
3867 - * due to unused I/O APIC pins.
3868 - */
3869 - int irq = gsi;
3870 - if (gsi < MAX_GSI_NUM) {
3871 - /*
3872 - * Retain the VIA chipset work-around (gsi > 15), but
3873 - * avoid a problem where the 8254 timer (IRQ0) is setup
3874 - * via an override (so it's not on pin 0 of the ioapic),
3875 - * and at the same time, the pin 0 interrupt is a PCI
3876 - * type. The gsi > 15 test could cause these two pins
3877 - * to be shared as IRQ0, and they are not shareable.
3878 - * So test for this condition, and if necessary, avoid
3879 - * the pin collision.
3880 - */
3881 - if (gsi > 15 || (gsi == 0 && !timer_uses_ioapic_pin_0))
3882 - gsi = pci_irq++;
3883 - /*
3884 - * Don't assign IRQ used by ACPI SCI
3885 - */
3886 - if (gsi == acpi_gbl_FADT.sci_interrupt)
3887 - gsi = pci_irq++;
3888 - gsi_to_irq[irq] = gsi;
3889 - } else {
3890 - printk(KERN_ERR "GSI %u is too high\n", gsi);
3891 - return gsi;
3892 - }
3893 - }
3894 -
3895 - io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
3896 - triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
3897 - polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
3898 - return gsi;
3899 -}
3900 -
3901 -#endif /* CONFIG_X86_IO_APIC */
3902 -#endif /* CONFIG_ACPI */
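For reference while reading the scanner removed above: smp_scan_config() walks memory in 16-byte steps looking for the MP floating pointer, accepting a candidate only if the "_MP_" signature matches, mpf_length is 1, the specification level is 1 or 4, and all 16 bytes sum to zero. A minimal standalone sketch of that acceptance test, assuming an illustrative struct layout and helper names that are not the kernel's:

#include <stdint.h>
#include <stddef.h>
#include <string.h>

/* 16-byte MP floating pointer, per MP spec 1.1/1.4 (illustrative layout). */
struct mpf_intel {
	char     signature[4];      /* "_MP_" */
	uint32_t physptr;           /* physical address of the MP config table */
	uint8_t  length;            /* size in 16-byte paragraphs, must be 1 */
	uint8_t  specification;     /* 1 or 4 */
	uint8_t  checksum;          /* chosen so all 16 bytes sum to 0 mod 256 */
	uint8_t  feature1;          /* default configuration number, 0 = table present */
	uint8_t  feature2;          /* bit 7 = IMCR/PIC compatibility mode */
	uint8_t  reserved[3];
};

static unsigned int byte_sum(const uint8_t *p, size_t len)
{
	unsigned int sum = 0;

	while (len--)
		sum += *p++;
	return sum & 0xFF;
}

/* Scan [base, base + len) in 16-byte steps; return the first valid pointer or NULL. */
static const struct mpf_intel *scan_for_mpf(const uint8_t *base, size_t len)
{
	for (; len >= 16; base += 16, len -= 16) {
		const struct mpf_intel *mpf = (const struct mpf_intel *)base;

		if (memcmp(mpf->signature, "_MP_", 4) == 0 &&
		    mpf->length == 1 &&
		    byte_sum(base, 16) == 0 &&
		    (mpf->specification == 1 || mpf->specification == 4))
			return mpf;
	}
	return NULL;
}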
3903 --- a/arch/x86/kernel/mpparse_64-xen.c
3904 +++ /dev/null
3905 @@ -1,879 +0,0 @@
3906 -/*
3907 - * Intel Multiprocessor Specification 1.1 and 1.4
3908 - * compliant MP-table parsing routines.
3909 - *
3910 - * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
3911 - * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
3912 - *
3913 - * Fixes
3914 - * Erich Boleyn : MP v1.4 and additional changes.
3915 - * Alan Cox : Added EBDA scanning
3916 - * Ingo Molnar : various cleanups and rewrites
3917 - * Maciej W. Rozycki: Bits for default MP configurations
3918 - * Paul Diefenbaugh: Added full ACPI support
3919 - */
3920 -
3921 -#include <linux/mm.h>
3922 -#include <linux/init.h>
3923 -#include <linux/delay.h>
3924 -#include <linux/bootmem.h>
3925 -#include <linux/kernel_stat.h>
3926 -#include <linux/mc146818rtc.h>
3927 -#include <linux/acpi.h>
3928 -#include <linux/module.h>
3929 -
3930 -#include <asm/smp.h>
3931 -#include <asm/mtrr.h>
3932 -#include <asm/mpspec.h>
3933 -#include <asm/pgalloc.h>
3934 -#include <asm/io_apic.h>
3935 -#include <asm/proto.h>
3936 -#include <asm/acpi.h>
3937 -
3938 -/* Have we found an MP table */
3939 -int smp_found_config;
3940 -
3941 -/*
3942 - * Various Linux-internal data structures created from the
3943 - * MP-table.
3944 - */
3945 -DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
3946 -int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
3947 -
3948 -static int mp_current_pci_id = 0;
3949 -/* I/O APIC entries */
3950 -struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
3951 -
3952 -/* # of MP IRQ source entries */
3953 -struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
3954 -
3955 -/* MP IRQ source entries */
3956 -int mp_irq_entries;
3957 -
3958 -int nr_ioapics;
3959 -unsigned long mp_lapic_addr = 0;
3960 -
3961 -
3962 -
3963 -/* Processor that is doing the boot up */
3964 -unsigned int boot_cpu_id = -1U;
3965 -EXPORT_SYMBOL(boot_cpu_id);
3966 -
3967 -/* Internal processor count */
3968 -unsigned int num_processors;
3969 -
3970 -unsigned disabled_cpus __cpuinitdata;
3971 -
3972 -/* Bitmask of physically existing CPUs */
3973 -physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE;
3974 -
3975 -#ifndef CONFIG_XEN
3976 -u16 x86_bios_cpu_apicid_init[NR_CPUS] __initdata
3977 - = { [0 ... NR_CPUS-1] = BAD_APICID };
3978 -void *x86_bios_cpu_apicid_early_ptr;
3979 -#endif
3980 -DEFINE_PER_CPU(u16, x86_bios_cpu_apicid) = BAD_APICID;
3981 -EXPORT_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
3982 -
3983 -
3984 -/*
3985 - * Intel MP BIOS table parsing routines:
3986 - */
3987 -
3988 -/*
3989 - * Checksum an MP configuration block.
3990 - */
3991 -
3992 -static int __init mpf_checksum(unsigned char *mp, int len)
3993 -{
3994 - int sum = 0;
3995 -
3996 - while (len--)
3997 - sum += *mp++;
3998 -
3999 - return sum & 0xFF;
4000 -}
4001 -
4002 -#ifndef CONFIG_XEN
4003 -static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
4004 -{
4005 - int cpu;
4006 - cpumask_t tmp_map;
4007 - char *bootup_cpu = "";
4008 -
4009 - if (!(m->mpc_cpuflag & CPU_ENABLED)) {
4010 - disabled_cpus++;
4011 - return;
4012 - }
4013 - if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
4014 - bootup_cpu = " (Bootup-CPU)";
4015 - boot_cpu_id = m->mpc_apicid;
4016 - }
4017 -
4018 - printk(KERN_INFO "Processor #%d%s\n", m->mpc_apicid, bootup_cpu);
4019 -
4020 - if (num_processors >= NR_CPUS) {
4021 - printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
4022 - " Processor ignored.\n", NR_CPUS);
4023 - return;
4024 - }
4025 -
4026 - num_processors++;
4027 - cpus_complement(tmp_map, cpu_present_map);
4028 - cpu = first_cpu(tmp_map);
4029 -
4030 - physid_set(m->mpc_apicid, phys_cpu_present_map);
4031 - if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
4032 - /*
4033 - * x86_bios_cpu_apicid is required to have processors listed
4034 - * in same order as logical cpu numbers. Hence the first
4035 - * entry is BSP, and so on.
4036 - */
4037 - cpu = 0;
4038 - }
4039 - /* are we being called early in kernel startup? */
4040 - if (x86_cpu_to_apicid_early_ptr) {
4041 - u16 *cpu_to_apicid = x86_cpu_to_apicid_early_ptr;
4042 - u16 *bios_cpu_apicid = x86_bios_cpu_apicid_early_ptr;
4043 -
4044 - cpu_to_apicid[cpu] = m->mpc_apicid;
4045 - bios_cpu_apicid[cpu] = m->mpc_apicid;
4046 - } else {
4047 - per_cpu(x86_cpu_to_apicid, cpu) = m->mpc_apicid;
4048 - per_cpu(x86_bios_cpu_apicid, cpu) = m->mpc_apicid;
4049 - }
4050 -
4051 - cpu_set(cpu, cpu_possible_map);
4052 - cpu_set(cpu, cpu_present_map);
4053 -}
4054 -#else
4055 -static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
4056 -{
4057 - num_processors++;
4058 -}
4059 -#endif /* CONFIG_XEN */
4060 -
4061 -static void __init MP_bus_info (struct mpc_config_bus *m)
4062 -{
4063 - char str[7];
4064 -
4065 - memcpy(str, m->mpc_bustype, 6);
4066 - str[6] = 0;
4067 - Dprintk("Bus #%d is %s\n", m->mpc_busid, str);
4068 -
4069 - if (strncmp(str, "ISA", 3) == 0) {
4070 - set_bit(m->mpc_busid, mp_bus_not_pci);
4071 - } else if (strncmp(str, "PCI", 3) == 0) {
4072 - clear_bit(m->mpc_busid, mp_bus_not_pci);
4073 - mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
4074 - mp_current_pci_id++;
4075 - } else {
4076 - printk(KERN_ERR "Unknown bustype %s\n", str);
4077 - }
4078 -}
4079 -
4080 -static int bad_ioapic(unsigned long address)
4081 -{
4082 - if (nr_ioapics >= MAX_IO_APICS) {
4083 - printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
4084 - "(found %d)\n", MAX_IO_APICS, nr_ioapics);
4085 - panic("Recompile kernel with bigger MAX_IO_APICS!\n");
4086 - }
4087 - if (!address) {
4088 - printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
4089 - " found in table, skipping!\n");
4090 - return 1;
4091 - }
4092 - return 0;
4093 -}
4094 -
4095 -static void __init MP_ioapic_info (struct mpc_config_ioapic *m)
4096 -{
4097 - if (!(m->mpc_flags & MPC_APIC_USABLE))
4098 - return;
4099 -
4100 - printk("I/O APIC #%d at 0x%X.\n",
4101 - m->mpc_apicid, m->mpc_apicaddr);
4102 -
4103 - if (bad_ioapic(m->mpc_apicaddr))
4104 - return;
4105 -
4106 - mp_ioapics[nr_ioapics] = *m;
4107 - nr_ioapics++;
4108 -}
4109 -
4110 -static void __init MP_intsrc_info (struct mpc_config_intsrc *m)
4111 -{
4112 - mp_irqs [mp_irq_entries] = *m;
4113 - Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
4114 - " IRQ %02x, APIC ID %x, APIC INT %02x\n",
4115 - m->mpc_irqtype, m->mpc_irqflag & 3,
4116 - (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
4117 - m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
4118 - if (++mp_irq_entries >= MAX_IRQ_SOURCES)
4119 - panic("Max # of irq sources exceeded!!\n");
4120 -}
4121 -
4122 -static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m)
4123 -{
4124 - Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
4125 - " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
4126 - m->mpc_irqtype, m->mpc_irqflag & 3,
4127 - (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid,
4128 - m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
4129 -}
4130 -
4131 -/*
4132 - * Read/parse the MPC
4133 - */
4134 -
4135 -static int __init smp_read_mpc(struct mp_config_table *mpc)
4136 -{
4137 - char str[16];
4138 - int count=sizeof(*mpc);
4139 - unsigned char *mpt=((unsigned char *)mpc)+count;
4140 -
4141 - if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) {
4142 - printk("MPTABLE: bad signature [%c%c%c%c]!\n",
4143 - mpc->mpc_signature[0],
4144 - mpc->mpc_signature[1],
4145 - mpc->mpc_signature[2],
4146 - mpc->mpc_signature[3]);
4147 - return 0;
4148 - }
4149 - if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) {
4150 - printk("MPTABLE: checksum error!\n");
4151 - return 0;
4152 - }
4153 - if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) {
4154 - printk(KERN_ERR "MPTABLE: bad table version (%d)!!\n",
4155 - mpc->mpc_spec);
4156 - return 0;
4157 - }
4158 - if (!mpc->mpc_lapic) {
4159 - printk(KERN_ERR "MPTABLE: null local APIC address!\n");
4160 - return 0;
4161 - }
4162 - memcpy(str,mpc->mpc_oem,8);
4163 - str[8] = 0;
4164 - printk(KERN_INFO "MPTABLE: OEM ID: %s ",str);
4165 -
4166 - memcpy(str,mpc->mpc_productid,12);
4167 - str[12] = 0;
4168 - printk("MPTABLE: Product ID: %s ",str);
4169 -
4170 - printk("MPTABLE: APIC at: 0x%X\n",mpc->mpc_lapic);
4171 -
4172 - /* save the local APIC address, it might be non-default */
4173 - if (!acpi_lapic)
4174 - mp_lapic_addr = mpc->mpc_lapic;
4175 -
4176 - /*
4177 - * Now process the configuration blocks.
4178 - */
4179 - while (count < mpc->mpc_length) {
4180 - switch(*mpt) {
4181 - case MP_PROCESSOR:
4182 - {
4183 - struct mpc_config_processor *m=
4184 - (struct mpc_config_processor *)mpt;
4185 - if (!acpi_lapic)
4186 - MP_processor_info(m);
4187 - mpt += sizeof(*m);
4188 - count += sizeof(*m);
4189 - break;
4190 - }
4191 - case MP_BUS:
4192 - {
4193 - struct mpc_config_bus *m=
4194 - (struct mpc_config_bus *)mpt;
4195 - MP_bus_info(m);
4196 - mpt += sizeof(*m);
4197 - count += sizeof(*m);
4198 - break;
4199 - }
4200 - case MP_IOAPIC:
4201 - {
4202 - struct mpc_config_ioapic *m=
4203 - (struct mpc_config_ioapic *)mpt;
4204 - MP_ioapic_info(m);
4205 - mpt += sizeof(*m);
4206 - count += sizeof(*m);
4207 - break;
4208 - }
4209 - case MP_INTSRC:
4210 - {
4211 - struct mpc_config_intsrc *m=
4212 - (struct mpc_config_intsrc *)mpt;
4213 -
4214 - MP_intsrc_info(m);
4215 - mpt += sizeof(*m);
4216 - count += sizeof(*m);
4217 - break;
4218 - }
4219 - case MP_LINTSRC:
4220 - {
4221 - struct mpc_config_lintsrc *m=
4222 - (struct mpc_config_lintsrc *)mpt;
4223 - MP_lintsrc_info(m);
4224 - mpt += sizeof(*m);
4225 - count += sizeof(*m);
4226 - break;
4227 - }
4228 - }
4229 - }
4230 - setup_apic_routing();
4231 - if (!num_processors)
4232 - printk(KERN_ERR "MPTABLE: no processors registered!\n");
4233 - return num_processors;
4234 -}
4235 -
4236 -static int __init ELCR_trigger(unsigned int irq)
4237 -{
4238 - unsigned int port;
4239 -
4240 - port = 0x4d0 + (irq >> 3);
4241 - return (inb(port) >> (irq & 7)) & 1;
4242 -}
4243 -
4244 -static void __init construct_default_ioirq_mptable(int mpc_default_type)
4245 -{
4246 - struct mpc_config_intsrc intsrc;
4247 - int i;
4248 - int ELCR_fallback = 0;
4249 -
4250 - intsrc.mpc_type = MP_INTSRC;
4251 - intsrc.mpc_irqflag = 0; /* conforming */
4252 - intsrc.mpc_srcbus = 0;
4253 - intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
4254 -
4255 - intsrc.mpc_irqtype = mp_INT;
4256 -
4257 - /*
4258 - * If true, we have an ISA/PCI system with no IRQ entries
4259 - * in the MP table. To prevent the PCI interrupts from being set up
4260 - * incorrectly, we try to use the ELCR. The sanity check to see if
4261 - * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can
4262 - * never be level sensitive, so we simply see if the ELCR agrees.
4263 - * If it does, we assume it's valid.
4264 - */
4265 - if (mpc_default_type == 5) {
4266 - printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n");
4267 -
4268 - if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13))
4269 - printk(KERN_ERR "ELCR contains invalid data... not using ELCR\n");
4270 - else {
4271 - printk(KERN_INFO "Using ELCR to identify PCI interrupts\n");
4272 - ELCR_fallback = 1;
4273 - }
4274 - }
4275 -
4276 - for (i = 0; i < 16; i++) {
4277 - switch (mpc_default_type) {
4278 - case 2:
4279 - if (i == 0 || i == 13)
4280 - continue; /* IRQ0 & IRQ13 not connected */
4281 - /* fall through */
4282 - default:
4283 - if (i == 2)
4284 - continue; /* IRQ2 is never connected */
4285 - }
4286 -
4287 - if (ELCR_fallback) {
4288 - /*
4289 - * If the ELCR indicates a level-sensitive interrupt, we
4290 - * copy that information over to the MP table in the
4291 - * irqflag field (level sensitive, active high polarity).
4292 - */
4293 - if (ELCR_trigger(i))
4294 - intsrc.mpc_irqflag = 13;
4295 - else
4296 - intsrc.mpc_irqflag = 0;
4297 - }
4298 -
4299 - intsrc.mpc_srcbusirq = i;
4300 - intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */
4301 - MP_intsrc_info(&intsrc);
4302 - }
4303 -
4304 - intsrc.mpc_irqtype = mp_ExtINT;
4305 - intsrc.mpc_srcbusirq = 0;
4306 - intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */
4307 - MP_intsrc_info(&intsrc);
4308 -}
4309 -
4310 -static inline void __init construct_default_ISA_mptable(int mpc_default_type)
4311 -{
4312 - struct mpc_config_processor processor;
4313 - struct mpc_config_bus bus;
4314 - struct mpc_config_ioapic ioapic;
4315 - struct mpc_config_lintsrc lintsrc;
4316 - int linttypes[2] = { mp_ExtINT, mp_NMI };
4317 - int i;
4318 -
4319 - /*
4320 - * local APIC has default address
4321 - */
4322 - mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
4323 -
4324 - /*
4325 - * 2 CPUs, numbered 0 & 1.
4326 - */
4327 - processor.mpc_type = MP_PROCESSOR;
4328 - processor.mpc_apicver = 0;
4329 - processor.mpc_cpuflag = CPU_ENABLED;
4330 - processor.mpc_cpufeature = 0;
4331 - processor.mpc_featureflag = 0;
4332 - processor.mpc_reserved[0] = 0;
4333 - processor.mpc_reserved[1] = 0;
4334 - for (i = 0; i < 2; i++) {
4335 - processor.mpc_apicid = i;
4336 - MP_processor_info(&processor);
4337 - }
4338 -
4339 - bus.mpc_type = MP_BUS;
4340 - bus.mpc_busid = 0;
4341 - switch (mpc_default_type) {
4342 - default:
4343 - printk(KERN_ERR "???\nUnknown standard configuration %d\n",
4344 - mpc_default_type);
4345 - /* fall through */
4346 - case 1:
4347 - case 5:
4348 - memcpy(bus.mpc_bustype, "ISA ", 6);
4349 - break;
4350 - }
4351 - MP_bus_info(&bus);
4352 - if (mpc_default_type > 4) {
4353 - bus.mpc_busid = 1;
4354 - memcpy(bus.mpc_bustype, "PCI ", 6);
4355 - MP_bus_info(&bus);
4356 - }
4357 -
4358 - ioapic.mpc_type = MP_IOAPIC;
4359 - ioapic.mpc_apicid = 2;
4360 - ioapic.mpc_apicver = 0;
4361 - ioapic.mpc_flags = MPC_APIC_USABLE;
4362 - ioapic.mpc_apicaddr = 0xFEC00000;
4363 - MP_ioapic_info(&ioapic);
4364 -
4365 - /*
4366 - * We set up most of the low 16 IO-APIC pins according to MPS rules.
4367 - */
4368 - construct_default_ioirq_mptable(mpc_default_type);
4369 -
4370 - lintsrc.mpc_type = MP_LINTSRC;
4371 - lintsrc.mpc_irqflag = 0; /* conforming */
4372 - lintsrc.mpc_srcbusid = 0;
4373 - lintsrc.mpc_srcbusirq = 0;
4374 - lintsrc.mpc_destapic = MP_APIC_ALL;
4375 - for (i = 0; i < 2; i++) {
4376 - lintsrc.mpc_irqtype = linttypes[i];
4377 - lintsrc.mpc_destapiclint = i;
4378 - MP_lintsrc_info(&lintsrc);
4379 - }
4380 -}
4381 -
4382 -static struct intel_mp_floating *mpf_found;
4383 -
4384 -/*
4385 - * Scan the memory blocks for an SMP configuration block.
4386 - */
4387 -void __init get_smp_config (void)
4388 -{
4389 - struct intel_mp_floating *mpf = mpf_found;
4390 -
4391 - /*
4392 - * ACPI supports both logical (e.g. Hyper-Threading) and physical
4393 - * processors, where MPS only supports physical.
4394 - */
4395 - if (acpi_lapic && acpi_ioapic) {
4396 - printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n");
4397 - return;
4398 - }
4399 - else if (acpi_lapic)
4400 - printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n");
4401 -
4402 - printk("Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification);
4403 -
4404 - /*
4405 - * Now see if we need to read further.
4406 - */
4407 - if (mpf->mpf_feature1 != 0) {
4408 -
4409 - printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1);
4410 - construct_default_ISA_mptable(mpf->mpf_feature1);
4411 -
4412 - } else if (mpf->mpf_physptr) {
4413 -
4414 - /*
4415 - * Read the physical hardware table. Anything here will
4416 - * override the defaults.
4417 - */
4418 - if (!smp_read_mpc(isa_bus_to_virt(mpf->mpf_physptr))) {
4419 - smp_found_config = 0;
4420 - printk(KERN_ERR "BIOS bug, MP table errors detected!...\n");
4421 - printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n");
4422 - return;
4423 - }
4424 - /*
4425 - * If there are no explicit MP IRQ entries, then we are
4426 - * broken. We set up most of the low 16 IO-APIC pins to
4427 - * ISA defaults and hope it will work.
4428 - */
4429 - if (!mp_irq_entries) {
4430 - struct mpc_config_bus bus;
4431 -
4432 - printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n");
4433 -
4434 - bus.mpc_type = MP_BUS;
4435 - bus.mpc_busid = 0;
4436 - memcpy(bus.mpc_bustype, "ISA ", 6);
4437 - MP_bus_info(&bus);
4438 -
4439 - construct_default_ioirq_mptable(0);
4440 - }
4441 -
4442 - } else
4443 - BUG();
4444 -
4445 - printk(KERN_INFO "Processors: %d\n", num_processors);
4446 - /*
4447 - * Only use the first configuration found.
4448 - */
4449 -}
4450 -
4451 -static int __init smp_scan_config (unsigned long base, unsigned long length)
4452 -{
4453 - extern void __bad_mpf_size(void);
4454 - unsigned int *bp = isa_bus_to_virt(base);
4455 - struct intel_mp_floating *mpf;
4456 -
4457 - Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length);
4458 - if (sizeof(*mpf) != 16)
4459 - __bad_mpf_size();
4460 -
4461 - while (length > 0) {
4462 - mpf = (struct intel_mp_floating *)bp;
4463 - if ((*bp == SMP_MAGIC_IDENT) &&
4464 - (mpf->mpf_length == 1) &&
4465 - !mpf_checksum((unsigned char *)bp, 16) &&
4466 - ((mpf->mpf_specification == 1)
4467 - || (mpf->mpf_specification == 4)) ) {
4468 -
4469 - smp_found_config = 1;
4470 - mpf_found = mpf;
4471 - return 1;
4472 - }
4473 - bp += 4;
4474 - length -= 16;
4475 - }
4476 - return 0;
4477 -}
4478 -
4479 -void __init find_smp_config(void)
4480 -{
4481 - unsigned int address;
4482 -
4483 - /*
4484 - * FIXME: Linux assumes you have 640K of base ram..
4485 - * this continues the error...
4486 - *
4487 - * 1) Scan the bottom 1K for a signature
4488 - * 2) Scan the top 1K of base RAM
4489 - * 3) Scan the 64K of bios
4490 - */
4491 - if (smp_scan_config(0x0,0x400) ||
4492 - smp_scan_config(639*0x400,0x400) ||
4493 - smp_scan_config(0xF0000,0x10000))
4494 - return;
4495 - /*
4496 - * If it is an SMP machine we should know now.
4497 - *
4498 - * there is a real-mode segmented pointer pointing to the
4499 - * 4K EBDA area at 0x40E, calculate and scan it here.
4500 - *
4501 - * NOTE! There are Linux loaders that will corrupt the EBDA
4502 - * area, and as such this kind of SMP config may be less
4503 - * trustworthy, simply because the SMP table may have been
4504 - * stomped on during early boot. These loaders are buggy and
4505 - * should be fixed.
4506 - */
4507 -
4508 - address = *(unsigned short *)phys_to_virt(0x40E);
4509 - address <<= 4;
4510 - if (smp_scan_config(address, 0x1000))
4511 - return;
4512 -
4513 - /* If we have come this far, we did not find an MP table */
4514 - printk(KERN_INFO "No mptable found.\n");
4515 -}
4516 -
4517 -/* --------------------------------------------------------------------------
4518 - ACPI-based MP Configuration
4519 - -------------------------------------------------------------------------- */
4520 -
4521 -#ifdef CONFIG_ACPI
4522 -
4523 -void __init mp_register_lapic_address(u64 address)
4524 -{
4525 -#ifndef CONFIG_XEN
4526 - mp_lapic_addr = (unsigned long) address;
4527 - set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
4528 - if (boot_cpu_id == -1U)
4529 - boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID));
4530 -#endif
4531 -}
4532 -
4533 -void __cpuinit mp_register_lapic (u8 id, u8 enabled)
4534 -{
4535 - struct mpc_config_processor processor;
4536 - int boot_cpu = 0;
4537 -
4538 - if (id == boot_cpu_id)
4539 - boot_cpu = 1;
4540 -
4541 -#ifndef CONFIG_XEN
4542 - processor.mpc_type = MP_PROCESSOR;
4543 - processor.mpc_apicid = id;
4544 - processor.mpc_apicver = 0;
4545 - processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0);
4546 - processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0);
4547 - processor.mpc_cpufeature = 0;
4548 - processor.mpc_featureflag = 0;
4549 - processor.mpc_reserved[0] = 0;
4550 - processor.mpc_reserved[1] = 0;
4551 -#endif
4552 -
4553 - MP_processor_info(&processor);
4554 -}
4555 -
4556 -#define MP_ISA_BUS 0
4557 -#define MP_MAX_IOAPIC_PIN 127
4558 -
4559 -static struct mp_ioapic_routing {
4560 - int apic_id;
4561 - int gsi_start;
4562 - int gsi_end;
4563 - u32 pin_programmed[4];
4564 -} mp_ioapic_routing[MAX_IO_APICS];
4565 -
4566 -static int mp_find_ioapic(int gsi)
4567 -{
4568 - int i = 0;
4569 -
4570 - /* Find the IOAPIC that manages this GSI. */
4571 - for (i = 0; i < nr_ioapics; i++) {
4572 - if ((gsi >= mp_ioapic_routing[i].gsi_start)
4573 - && (gsi <= mp_ioapic_routing[i].gsi_end))
4574 - return i;
4575 - }
4576 -
4577 - printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
4578 - return -1;
4579 -}
4580 -
4581 -static u8 uniq_ioapic_id(u8 id)
4582 -{
4583 - int i;
4584 - DECLARE_BITMAP(used, 256);
4585 - bitmap_zero(used, 256);
4586 - for (i = 0; i < nr_ioapics; i++) {
4587 - struct mpc_config_ioapic *ia = &mp_ioapics[i];
4588 - __set_bit(ia->mpc_apicid, used);
4589 - }
4590 - if (!test_bit(id, used))
4591 - return id;
4592 - return find_first_zero_bit(used, 256);
4593 -}
4594 -
4595 -void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base)
4596 -{
4597 - int idx = 0;
4598 -
4599 - if (bad_ioapic(address))
4600 - return;
4601 -
4602 - idx = nr_ioapics;
4603 -
4604 - mp_ioapics[idx].mpc_type = MP_IOAPIC;
4605 - mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
4606 - mp_ioapics[idx].mpc_apicaddr = address;
4607 -
4608 -#ifndef CONFIG_XEN
4609 - set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
4610 -#endif
4611 - mp_ioapics[idx].mpc_apicid = uniq_ioapic_id(id);
4612 - mp_ioapics[idx].mpc_apicver = 0;
4613 -
4614 - /*
4615 - * Build basic IRQ lookup table to facilitate gsi->io_apic lookups
4616 - * and to prevent reprogramming of IOAPIC pins (PCI IRQs).
4617 - */
4618 - mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
4619 - mp_ioapic_routing[idx].gsi_start = gsi_base;
4620 - mp_ioapic_routing[idx].gsi_end = gsi_base +
4621 - io_apic_get_redir_entries(idx);
4622 -
4623 - printk(KERN_INFO "IOAPIC[%d]: apic_id %d, address 0x%x, "
4624 - "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
4625 - mp_ioapics[idx].mpc_apicaddr,
4626 - mp_ioapic_routing[idx].gsi_start,
4627 - mp_ioapic_routing[idx].gsi_end);
4628 -
4629 - nr_ioapics++;
4630 -}
4631 -
4632 -void __init
4633 -mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
4634 -{
4635 - struct mpc_config_intsrc intsrc;
4636 - int ioapic = -1;
4637 - int pin = -1;
4638 -
4639 - /*
4640 - * Convert 'gsi' to 'ioapic.pin'.
4641 - */
4642 - ioapic = mp_find_ioapic(gsi);
4643 - if (ioapic < 0)
4644 - return;
4645 - pin = gsi - mp_ioapic_routing[ioapic].gsi_start;
4646 -
4647 - /*
4648 - * TBD: This check is for faulty timer entries, where the override
4649 - * erroneously sets the trigger to level, resulting in a HUGE
4650 - * increase of timer interrupts!
4651 - */
4652 - if ((bus_irq == 0) && (trigger == 3))
4653 - trigger = 1;
4654 -
4655 - intsrc.mpc_type = MP_INTSRC;
4656 - intsrc.mpc_irqtype = mp_INT;
4657 - intsrc.mpc_irqflag = (trigger << 2) | polarity;
4658 - intsrc.mpc_srcbus = MP_ISA_BUS;
4659 - intsrc.mpc_srcbusirq = bus_irq; /* IRQ */
4660 - intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */
4661 - intsrc.mpc_dstirq = pin; /* INTIN# */
4662 -
4663 - Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n",
4664 - intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
4665 - (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
4666 - intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq);
4667 -
4668 - mp_irqs[mp_irq_entries] = intsrc;
4669 - if (++mp_irq_entries == MAX_IRQ_SOURCES)
4670 - panic("Max # of irq sources exceeded!\n");
4671 -}
4672 -
4673 -void __init mp_config_acpi_legacy_irqs(void)
4674 -{
4675 - struct mpc_config_intsrc intsrc;
4676 - int i = 0;
4677 - int ioapic = -1;
4678 -
4679 - /*
4680 - * Fabricate the legacy ISA bus (bus #31).
4681 - */
4682 - set_bit(MP_ISA_BUS, mp_bus_not_pci);
4683 -
4684 - /*
4685 - * Locate the IOAPIC that manages the ISA IRQs (0-15).
4686 - */
4687 - ioapic = mp_find_ioapic(0);
4688 - if (ioapic < 0)
4689 - return;
4690 -
4691 - intsrc.mpc_type = MP_INTSRC;
4692 - intsrc.mpc_irqflag = 0; /* Conforming */
4693 - intsrc.mpc_srcbus = MP_ISA_BUS;
4694 - intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
4695 -
4696 - /*
4697 - * Use the default configuration for the IRQs 0-15. Unless
4698 - * overridden by (MADT) interrupt source override entries.
4699 - */
4700 - for (i = 0; i < 16; i++) {
4701 - int idx;
4702 -
4703 - for (idx = 0; idx < mp_irq_entries; idx++) {
4704 - struct mpc_config_intsrc *irq = mp_irqs + idx;
4705 -
4706 - /* Do we already have a mapping for this ISA IRQ? */
4707 - if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i)
4708 - break;
4709 -
4710 - /* Do we already have a mapping for this IOAPIC pin */
4711 - if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
4712 - (irq->mpc_dstirq == i))
4713 - break;
4714 - }
4715 -
4716 - if (idx != mp_irq_entries) {
4717 - printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
4718 - continue; /* IRQ already used */
4719 - }
4720 -
4721 - intsrc.mpc_irqtype = mp_INT;
4722 - intsrc.mpc_srcbusirq = i; /* Identity mapped */
4723 - intsrc.mpc_dstirq = i;
4724 -
4725 - Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, "
4726 - "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
4727 - (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
4728 - intsrc.mpc_srcbusirq, intsrc.mpc_dstapic,
4729 - intsrc.mpc_dstirq);
4730 -
4731 - mp_irqs[mp_irq_entries] = intsrc;
4732 - if (++mp_irq_entries == MAX_IRQ_SOURCES)
4733 - panic("Max # of irq sources exceeded!\n");
4734 - }
4735 -}
4736 -
4737 -int mp_register_gsi(u32 gsi, int triggering, int polarity)
4738 -{
4739 - int ioapic = -1;
4740 - int ioapic_pin = 0;
4741 - int idx, bit = 0;
4742 -
4743 - if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
4744 - return gsi;
4745 -
4746 - /* Don't set up the ACPI SCI because it's already set up */
4747 - if (acpi_gbl_FADT.sci_interrupt == gsi)
4748 - return gsi;
4749 -
4750 - ioapic = mp_find_ioapic(gsi);
4751 - if (ioapic < 0) {
4752 - printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
4753 - return gsi;
4754 - }
4755 -
4756 - ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_start;
4757 -
4758 - /*
4759 - * Avoid pin reprogramming. PRTs typically include entries
4760 - * with redundant pin->gsi mappings (but unique PCI devices);
4761 - * we only program the IOAPIC on the first.
4762 - */
4763 - bit = ioapic_pin % 32;
4764 - idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32);
4765 - if (idx > 3) {
4766 - printk(KERN_ERR "Invalid reference to IOAPIC pin "
4767 - "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
4768 - ioapic_pin);
4769 - return gsi;
4770 - }
4771 - if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
4772 - Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
4773 - mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
4774 - return gsi;
4775 - }
4776 -
4777 - mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
4778 -
4779 - io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
4780 - triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
4781 - polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
4782 - return gsi;
4783 -}
4784 -#endif /*CONFIG_ACPI*/
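Both mp_register_gsi() variants removed above guard against reprogramming an I/O APIC pin that a redundant PRT entry already described, using a 4 x 32-bit pin_programmed map indexed by pin/32 and bit pin%32. A user-space sketch of just that bookkeeping, with hypothetical names:

#include <stdint.h>
#include <stdio.h>

#define MAX_PINS 128                           /* 4 words of 32 bits, as in pin_programmed[4] */

struct pin_map {
	uint32_t programmed[MAX_PINS / 32];
};

/* Returns 1 if the pin was already programmed, 0 if this call claimed it. */
static int claim_pin(struct pin_map *map, int pin)
{
	int idx = pin / 32;                    /* same result as (pin < 32) ? 0 : pin / 32 */
	uint32_t bit = 1u << (pin % 32);

	if (pin < 0 || pin >= MAX_PINS)
		return 1;                      /* out of range: treat as already handled */
	if (map->programmed[idx] & bit)
		return 1;                      /* redundant PRT entry, skip reprogramming */
	map->programmed[idx] |= bit;
	return 0;
}

int main(void)
{
	struct pin_map map = { {0} };

	printf("%d\n", claim_pin(&map, 19));   /* 0: first reference programs the pin */
	printf("%d\n", claim_pin(&map, 19));   /* 1: second reference is skipped */
	return 0;
}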
4785 --- /dev/null
4786 +++ b/arch/x86/kernel/mpparse-xen.c
4787 @@ -0,0 +1,1104 @@
4788 +/*
4789 + * Intel Multiprocessor Specification 1.1 and 1.4
4790 + * compliant MP-table parsing routines.
4791 + *
4792 + * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
4793 + * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
4794 + * (c) 2008 Alexey Starikovskiy <astarikovskiy@suse.de>
4795 + */
4796 +
4797 +#include <linux/mm.h>
4798 +#include <linux/init.h>
4799 +#include <linux/delay.h>
4800 +#include <linux/bootmem.h>
4801 +#include <linux/kernel_stat.h>
4802 +#include <linux/mc146818rtc.h>
4803 +#include <linux/bitops.h>
4804 +#include <linux/acpi.h>
4805 +#include <linux/module.h>
4806 +
4807 +#include <asm/smp.h>
4808 +#include <asm/mtrr.h>
4809 +#include <asm/mpspec.h>
4810 +#include <asm/pgalloc.h>
4811 +#include <asm/io_apic.h>
4812 +#include <asm/proto.h>
4813 +#include <asm/acpi.h>
4814 +#include <asm/bios_ebda.h>
4815 +
4816 +#include <mach_apic.h>
4817 +#ifdef CONFIG_X86_32
4818 +#include <mach_apicdef.h>
4819 +#include <mach_mpparse.h>
4820 +#endif
4821 +
4822 +/* Have we found an MP table */
4823 +int smp_found_config;
4824 +
4825 +/*
4826 + * Various Linux-internal data structures created from the
4827 + * MP-table.
4828 + */
4829 +#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
4830 +int mp_bus_id_to_type[MAX_MP_BUSSES];
4831 +#endif
4832 +
4833 +DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
4834 +int mp_bus_id_to_pci_bus[MAX_MP_BUSSES] = {[0 ... MAX_MP_BUSSES - 1] = -1 };
4835 +
4836 +static int mp_current_pci_id;
4837 +
4838 +int pic_mode;
4839 +
4840 +/*
4841 + * Intel MP BIOS table parsing routines:
4842 + */
4843 +
4844 +/*
4845 + * Checksum an MP configuration block.
4846 + */
4847 +
4848 +static int __init mpf_checksum(unsigned char *mp, int len)
4849 +{
4850 + int sum = 0;
4851 +
4852 + while (len--)
4853 + sum += *mp++;
4854 +
4855 + return sum & 0xFF;
4856 +}
4857 +
4858 +#ifdef CONFIG_X86_NUMAQ
4859 +/*
4860 + * Have to match translation table entries to main table entries by counter
4861 + * hence the mpc_record variable .... can't see a less disgusting way of
4862 + * doing this ....
4863 + */
4864 +
4865 +static int mpc_record;
4866 +static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY]
4867 + __cpuinitdata;
4868 +#endif
4869 +
4870 +#ifndef CONFIG_XEN
4871 +static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
4872 +{
4873 + int apicid;
4874 + char *bootup_cpu = "";
4875 +
4876 + if (!(m->mpc_cpuflag & CPU_ENABLED)) {
4877 + disabled_cpus++;
4878 + return;
4879 + }
4880 +#ifdef CONFIG_X86_NUMAQ
4881 + apicid = mpc_apic_id(m, translation_table[mpc_record]);
4882 +#else
4883 + apicid = m->mpc_apicid;
4884 +#endif
4885 + if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
4886 + bootup_cpu = " (Bootup-CPU)";
4887 + boot_cpu_physical_apicid = m->mpc_apicid;
4888 + }
4889 +
4890 + printk(KERN_INFO "Processor #%d%s\n", m->mpc_apicid, bootup_cpu);
4891 + generic_processor_info(apicid, m->mpc_apicver);
4892 +}
4893 +#else
4894 +static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
4895 +{
4896 + num_processors++;
4897 +}
4898 +#endif /* CONFIG_XEN */
4899 +
4900 +static void __init MP_bus_info(struct mpc_config_bus *m)
4901 +{
4902 + char str[7];
4903 +
4904 + memcpy(str, m->mpc_bustype, 6);
4905 + str[6] = 0;
4906 +
4907 +#ifdef CONFIG_X86_NUMAQ
4908 + mpc_oem_bus_info(m, str, translation_table[mpc_record]);
4909 +#else
4910 + Dprintk("Bus #%d is %s\n", m->mpc_busid, str);
4911 +#endif
4912 +
4913 +#if MAX_MP_BUSSES < 256
4914 + if (m->mpc_busid >= MAX_MP_BUSSES) {
4915 + printk(KERN_WARNING "MP table busid value (%d) for bustype %s "
4916 + " is too large, max. supported is %d\n",
4917 + m->mpc_busid, str, MAX_MP_BUSSES - 1);
4918 + return;
4919 + }
4920 +#endif
4921 +
4922 + if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA) - 1) == 0) {
4923 + set_bit(m->mpc_busid, mp_bus_not_pci);
4924 +#if defined(CONFIG_EISA) || defined (CONFIG_MCA)
4925 + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
4926 +#endif
4927 + } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) {
4928 +#ifdef CONFIG_X86_NUMAQ
4929 + mpc_oem_pci_bus(m, translation_table[mpc_record]);
4930 +#endif
4931 + clear_bit(m->mpc_busid, mp_bus_not_pci);
4932 + mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
4933 + mp_current_pci_id++;
4934 +#if defined(CONFIG_EISA) || defined (CONFIG_MCA)
4935 + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
4936 + } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) {
4937 + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA;
4938 + } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA) - 1) == 0) {
4939 + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA;
4940 +#endif
4941 + } else
4942 + printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
4943 +}
4944 +
4945 +#ifdef CONFIG_X86_IO_APIC
4946 +
4947 +static int bad_ioapic(unsigned long address)
4948 +{
4949 + if (nr_ioapics >= MAX_IO_APICS) {
4950 + printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
4951 + "(found %d)\n", MAX_IO_APICS, nr_ioapics);
4952 + panic("Recompile kernel with bigger MAX_IO_APICS!\n");
4953 + }
4954 + if (!address) {
4955 + printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
4956 + " found in table, skipping!\n");
4957 + return 1;
4958 + }
4959 + return 0;
4960 +}
4961 +
4962 +static void __init MP_ioapic_info(struct mpc_config_ioapic *m)
4963 +{
4964 + if (!(m->mpc_flags & MPC_APIC_USABLE))
4965 + return;
4966 +
4967 + printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n",
4968 + m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
4969 +
4970 + if (bad_ioapic(m->mpc_apicaddr))
4971 + return;
4972 +
4973 + mp_ioapics[nr_ioapics] = *m;
4974 + nr_ioapics++;
4975 +}
4976 +
4977 +static void __init MP_intsrc_info(struct mpc_config_intsrc *m)
4978 +{
4979 + mp_irqs[mp_irq_entries] = *m;
4980 + Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
4981 + " IRQ %02x, APIC ID %x, APIC INT %02x\n",
4982 + m->mpc_irqtype, m->mpc_irqflag & 3,
4983 + (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
4984 + m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
4985 + if (++mp_irq_entries == MAX_IRQ_SOURCES)
4986 + panic("Max # of irq sources exceeded!!\n");
4987 +}
4988 +
4989 +#endif
4990 +
4991 +static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m)
4992 +{
4993 + Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
4994 + " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
4995 + m->mpc_irqtype, m->mpc_irqflag & 3,
4996 + (m->mpc_irqflag >> 2) & 3, m->mpc_srcbusid,
4997 + m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
4998 +}
4999 +
5000 +#ifdef CONFIG_X86_NUMAQ
5001 +static void __init MP_translation_info(struct mpc_config_translation *m)
5002 +{
5003 + printk(KERN_INFO
5004 + "Translation: record %d, type %d, quad %d, global %d, local %d\n",
5005 + mpc_record, m->trans_type, m->trans_quad, m->trans_global,
5006 + m->trans_local);
5007 +
5008 + if (mpc_record >= MAX_MPC_ENTRY)
5009 + printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
5010 + else
5011 + translation_table[mpc_record] = m; /* stash this for later */
5012 + if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
5013 + node_set_online(m->trans_quad);
5014 +}
5015 +
5016 +/*
5017 + * Read/parse the MPC oem tables
5018 + */
5019 +
5020 +static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable,
5021 + unsigned short oemsize)
5022 +{
5023 + int count = sizeof(*oemtable); /* the header size */
5024 + unsigned char *oemptr = ((unsigned char *)oemtable) + count;
5025 +
5026 + mpc_record = 0;
5027 + printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n",
5028 + oemtable);
5029 + if (memcmp(oemtable->oem_signature, MPC_OEM_SIGNATURE, 4)) {
5030 + printk(KERN_WARNING
5031 + "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
5032 + oemtable->oem_signature[0], oemtable->oem_signature[1],
5033 + oemtable->oem_signature[2], oemtable->oem_signature[3]);
5034 + return;
5035 + }
5036 + if (mpf_checksum((unsigned char *)oemtable, oemtable->oem_length)) {
5037 + printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
5038 + return;
5039 + }
5040 + while (count < oemtable->oem_length) {
5041 + switch (*oemptr) {
5042 + case MP_TRANSLATION:
5043 + {
5044 + struct mpc_config_translation *m =
5045 + (struct mpc_config_translation *)oemptr;
5046 + MP_translation_info(m);
5047 + oemptr += sizeof(*m);
5048 + count += sizeof(*m);
5049 + ++mpc_record;
5050 + break;
5051 + }
5052 + default:
5053 + {
5054 + printk(KERN_WARNING
5055 + "Unrecognised OEM table entry type! - %d\n",
5056 + (int)*oemptr);
5057 + return;
5058 + }
5059 + }
5060 + }
5061 +}
5062 +
5063 +static inline void mps_oem_check(struct mp_config_table *mpc, char *oem,
5064 + char *productid)
5065 +{
5066 + if (strncmp(oem, "IBM NUMA", 8))
5067 + printk("Warning! May not be a NUMA-Q system!\n");
5068 + if (mpc->mpc_oemptr)
5069 + smp_read_mpc_oem((struct mp_config_oemtable *)mpc->mpc_oemptr,
5070 + mpc->mpc_oemsize);
5071 +}
5072 +#endif /* CONFIG_X86_NUMAQ */
5073 +
5074 +/*
5075 + * Read/parse the MPC
5076 + */
5077 +
5078 +static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
5079 +{
5080 + char str[16];
5081 + char oem[10];
5082 + int count = sizeof(*mpc);
5083 + unsigned char *mpt = ((unsigned char *)mpc) + count;
5084 +
5085 + if (memcmp(mpc->mpc_signature, MPC_SIGNATURE, 4)) {
5086 + printk(KERN_ERR "MPTABLE: bad signature [%c%c%c%c]!\n",
5087 + mpc->mpc_signature[0], mpc->mpc_signature[1],
5088 + mpc->mpc_signature[2], mpc->mpc_signature[3]);
5089 + return 0;
5090 + }
5091 + if (mpf_checksum((unsigned char *)mpc, mpc->mpc_length)) {
5092 + printk(KERN_ERR "MPTABLE: checksum error!\n");
5093 + return 0;
5094 + }
5095 + if (mpc->mpc_spec != 0x01 && mpc->mpc_spec != 0x04) {
5096 + printk(KERN_ERR "MPTABLE: bad table version (%d)!!\n",
5097 + mpc->mpc_spec);
5098 + return 0;
5099 + }
5100 + if (!mpc->mpc_lapic) {
5101 + printk(KERN_ERR "MPTABLE: null local APIC address!\n");
5102 + return 0;
5103 + }
5104 + memcpy(oem, mpc->mpc_oem, 8);
5105 + oem[8] = 0;
5106 + printk(KERN_INFO "MPTABLE: OEM ID: %s ", oem);
5107 +
5108 + memcpy(str, mpc->mpc_productid, 12);
5109 + str[12] = 0;
5110 + printk("Product ID: %s ", str);
5111 +
5112 +#ifdef CONFIG_X86_32
5113 + mps_oem_check(mpc, oem, str);
5114 +#endif
5115 + printk(KERN_INFO "MPTABLE: Product ID: %s ", str);
5116 +
5117 + printk(KERN_INFO "MPTABLE: APIC at: 0x%X\n", mpc->mpc_lapic);
5118 +
5119 + /* save the local APIC address, it might be non-default */
5120 + if (!acpi_lapic)
5121 + mp_lapic_addr = mpc->mpc_lapic;
5122 +
5123 + if (early)
5124 + return 1;
5125 +
5126 + /*
5127 + * Now process the configuration blocks.
5128 + */
5129 +#ifdef CONFIG_X86_NUMAQ
5130 + mpc_record = 0;
5131 +#endif
5132 + while (count < mpc->mpc_length) {
5133 + switch (*mpt) {
5134 + case MP_PROCESSOR:
5135 + {
5136 + struct mpc_config_processor *m =
5137 + (struct mpc_config_processor *)mpt;
5138 + /* ACPI may have already provided this data */
5139 + if (!acpi_lapic)
5140 + MP_processor_info(m);
5141 + mpt += sizeof(*m);
5142 + count += sizeof(*m);
5143 + break;
5144 + }
5145 + case MP_BUS:
5146 + {
5147 + struct mpc_config_bus *m =
5148 + (struct mpc_config_bus *)mpt;
5149 + MP_bus_info(m);
5150 + mpt += sizeof(*m);
5151 + count += sizeof(*m);
5152 + break;
5153 + }
5154 + case MP_IOAPIC:
5155 + {
5156 +#ifdef CONFIG_X86_IO_APIC
5157 + struct mpc_config_ioapic *m =
5158 + (struct mpc_config_ioapic *)mpt;
5159 + MP_ioapic_info(m);
5160 +#endif
5161 + mpt += sizeof(struct mpc_config_ioapic);
5162 + count += sizeof(struct mpc_config_ioapic);
5163 + break;
5164 + }
5165 + case MP_INTSRC:
5166 + {
5167 +#ifdef CONFIG_X86_IO_APIC
5168 + struct mpc_config_intsrc *m =
5169 + (struct mpc_config_intsrc *)mpt;
5170 +
5171 + MP_intsrc_info(m);
5172 +#endif
5173 + mpt += sizeof(struct mpc_config_intsrc);
5174 + count += sizeof(struct mpc_config_intsrc);
5175 + break;
5176 + }
5177 + case MP_LINTSRC:
5178 + {
5179 + struct mpc_config_lintsrc *m =
5180 + (struct mpc_config_lintsrc *)mpt;
5181 + MP_lintsrc_info(m);
5182 + mpt += sizeof(*m);
5183 + count += sizeof(*m);
5184 + break;
5185 + }
5186 + default:
5187 + /* wrong mptable */
5188 + printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n");
5189 + printk(KERN_ERR "type %x\n", *mpt);
5190 + print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16,
5191 + 1, mpc, mpc->mpc_length, 1);
5192 + count = mpc->mpc_length;
5193 + break;
5194 + }
5195 +#ifdef CONFIG_X86_NUMAQ
5196 + ++mpc_record;
5197 +#endif
5198 + }
5199 + setup_apic_routing();
5200 + if (!num_processors)
5201 + printk(KERN_ERR "MPTABLE: no processors registered!\n");
5202 + return num_processors;
5203 +}
5204 +
5205 +#ifdef CONFIG_X86_IO_APIC
5206 +
5207 +static int __init ELCR_trigger(unsigned int irq)
5208 +{
5209 + unsigned int port;
5210 +
5211 + port = 0x4d0 + (irq >> 3);
5212 + return (inb(port) >> (irq & 7)) & 1;
5213 +}
5214 +
5215 +static void __init construct_default_ioirq_mptable(int mpc_default_type)
5216 +{
5217 + struct mpc_config_intsrc intsrc;
5218 + int i;
5219 + int ELCR_fallback = 0;
5220 +
5221 + intsrc.mpc_type = MP_INTSRC;
5222 + intsrc.mpc_irqflag = 0; /* conforming */
5223 + intsrc.mpc_srcbus = 0;
5224 + intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
5225 +
5226 + intsrc.mpc_irqtype = mp_INT;
5227 +
5228 + /*
5229 + * If true, we have an ISA/PCI system with no IRQ entries
5230 + * in the MP table. To prevent the PCI interrupts from being set up
5231 + * incorrectly, we try to use the ELCR. The sanity check to see if
5232 + * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can
5233 + * never be level sensitive, so we simply see if the ELCR agrees.
5234 + * If it does, we assume it's valid.
5235 + */
5236 + if (mpc_default_type == 5) {
5237 + printk(KERN_INFO "ISA/PCI bus type with no IRQ information... "
5238 + "falling back to ELCR\n");
5239 +
5240 + if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) ||
5241 + ELCR_trigger(13))
5242 + printk(KERN_ERR "ELCR contains invalid data... "
5243 + "not using ELCR\n");
5244 + else {
5245 + printk(KERN_INFO
5246 + "Using ELCR to identify PCI interrupts\n");
5247 + ELCR_fallback = 1;
5248 + }
5249 + }
5250 +
5251 + for (i = 0; i < 16; i++) {
5252 + switch (mpc_default_type) {
5253 + case 2:
5254 + if (i == 0 || i == 13)
5255 + continue; /* IRQ0 & IRQ13 not connected */
5256 + /* fall through */
5257 + default:
5258 + if (i == 2)
5259 + continue; /* IRQ2 is never connected */
5260 + }
5261 +
5262 + if (ELCR_fallback) {
5263 + /*
5264 + * If the ELCR indicates a level-sensitive interrupt, we
5265 + * copy that information over to the MP table in the
5266 + * irqflag field (level sensitive, active high polarity).
5267 + */
5268 + if (ELCR_trigger(i))
5269 + intsrc.mpc_irqflag = 13;
5270 + else
5271 + intsrc.mpc_irqflag = 0;
5272 + }
5273 +
5274 + intsrc.mpc_srcbusirq = i;
5275 + intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */
5276 + MP_intsrc_info(&intsrc);
5277 + }
5278 +
5279 + intsrc.mpc_irqtype = mp_ExtINT;
5280 + intsrc.mpc_srcbusirq = 0;
5281 + intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */
5282 + MP_intsrc_info(&intsrc);
5283 +}
5284 +
5285 +#endif
5286 +
5287 +static inline void __init construct_default_ISA_mptable(int mpc_default_type)
5288 +{
5289 + struct mpc_config_processor processor;
5290 + struct mpc_config_bus bus;
5291 +#ifdef CONFIG_X86_IO_APIC
5292 + struct mpc_config_ioapic ioapic;
5293 +#endif
5294 + struct mpc_config_lintsrc lintsrc;
5295 + int linttypes[2] = { mp_ExtINT, mp_NMI };
5296 + int i;
5297 +
5298 + /*
5299 + * local APIC has default address
5300 + */
5301 + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
5302 +
5303 + /*
5304 + * 2 CPUs, numbered 0 & 1.
5305 + */
5306 + processor.mpc_type = MP_PROCESSOR;
5307 + /* Either an integrated APIC or a discrete 82489DX. */
5308 + processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
5309 + processor.mpc_cpuflag = CPU_ENABLED;
5310 + processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
5311 + (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
5312 + processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
5313 + processor.mpc_reserved[0] = 0;
5314 + processor.mpc_reserved[1] = 0;
5315 + for (i = 0; i < 2; i++) {
5316 + processor.mpc_apicid = i;
5317 + MP_processor_info(&processor);
5318 + }
5319 +
5320 + bus.mpc_type = MP_BUS;
5321 + bus.mpc_busid = 0;
5322 + switch (mpc_default_type) {
5323 + default:
5324 + printk(KERN_ERR "???\nUnknown standard configuration %d\n",
5325 + mpc_default_type);
5326 + /* fall through */
5327 + case 1:
5328 + case 5:
5329 + memcpy(bus.mpc_bustype, "ISA ", 6);
5330 + break;
5331 + case 2:
5332 + case 6:
5333 + case 3:
5334 + memcpy(bus.mpc_bustype, "EISA ", 6);
5335 + break;
5336 + case 4:
5337 + case 7:
5338 + memcpy(bus.mpc_bustype, "MCA ", 6);
5339 + }
5340 + MP_bus_info(&bus);
5341 + if (mpc_default_type > 4) {
5342 + bus.mpc_busid = 1;
5343 + memcpy(bus.mpc_bustype, "PCI ", 6);
5344 + MP_bus_info(&bus);
5345 + }
5346 +
5347 +#ifdef CONFIG_X86_IO_APIC
5348 + ioapic.mpc_type = MP_IOAPIC;
5349 + ioapic.mpc_apicid = 2;
5350 + ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
5351 + ioapic.mpc_flags = MPC_APIC_USABLE;
5352 + ioapic.mpc_apicaddr = 0xFEC00000;
5353 + MP_ioapic_info(&ioapic);
5354 +
5355 + /*
5356 + * We set up most of the low 16 IO-APIC pins according to MPS rules.
5357 + */
5358 + construct_default_ioirq_mptable(mpc_default_type);
5359 +#endif
5360 + lintsrc.mpc_type = MP_LINTSRC;
5361 + lintsrc.mpc_irqflag = 0; /* conforming */
5362 + lintsrc.mpc_srcbusid = 0;
5363 + lintsrc.mpc_srcbusirq = 0;
5364 + lintsrc.mpc_destapic = MP_APIC_ALL;
5365 + for (i = 0; i < 2; i++) {
5366 + lintsrc.mpc_irqtype = linttypes[i];
5367 + lintsrc.mpc_destapiclint = i;
5368 + MP_lintsrc_info(&lintsrc);
5369 + }
5370 +}
5371 +
5372 +static struct intel_mp_floating *mpf_found;
5373 +
5374 +/*
5375 + * Scan the memory blocks for an SMP configuration block.
5376 + */
5377 +static void __init __get_smp_config(unsigned early)
5378 +{
5379 + struct intel_mp_floating *mpf = mpf_found;
5380 +
5381 + if (acpi_lapic && early)
5382 + return;
5383 + /*
5384 + * ACPI supports both logical (e.g. Hyper-Threading) and physical
5385 + * processors, where MPS only supports physical.
5386 + */
5387 + if (acpi_lapic && acpi_ioapic) {
5388 + printk(KERN_INFO "Using ACPI (MADT) for SMP configuration "
5389 + "information\n");
5390 + return;
5391 + } else if (acpi_lapic)
5392 + printk(KERN_INFO "Using ACPI for processor (LAPIC) "
5393 + "configuration information\n");
5394 +
5395 + printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n",
5396 + mpf->mpf_specification);
5397 +#ifdef CONFIG_X86_32
5398 + if (mpf->mpf_feature2 & (1 << 7)) {
5399 + printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
5400 + pic_mode = 1;
5401 + } else {
5402 + printk(KERN_INFO " Virtual Wire compatibility mode.\n");
5403 + pic_mode = 0;
5404 + }
5405 +#endif
5406 + /*
5407 + * Now see if we need to read further.
5408 + */
5409 + if (mpf->mpf_feature1 != 0) {
5410 + if (early) {
5411 + /*
5412 + * local APIC has default address
5413 + */
5414 + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
5415 + return;
5416 + }
5417 +
5418 + printk(KERN_INFO "Default MP configuration #%d\n",
5419 + mpf->mpf_feature1);
5420 + construct_default_ISA_mptable(mpf->mpf_feature1);
5421 +
5422 + } else if (mpf->mpf_physptr) {
5423 +
5424 + /*
5425 + * Read the physical hardware table. Anything here will
5426 + * override the defaults.
5427 + */
5428 + if (!smp_read_mpc(isa_bus_to_virt(mpf->mpf_physptr), early)) {
5429 + smp_found_config = 0;
5430 + printk(KERN_ERR
5431 + "BIOS bug, MP table errors detected!...\n");
5432 + printk(KERN_ERR "... disabling SMP support. "
5433 + "(tell your hw vendor)\n");
5434 + return;
5435 + }
5436 +
5437 + if (early)
5438 + return;
5439 +#ifdef CONFIG_X86_IO_APIC
5440 + /*
5441 + * If there are no explicit MP IRQ entries, then we are
5442 + * broken. We set up most of the low 16 IO-APIC pins to
5443 + * ISA defaults and hope it will work.
5444 + */
5445 + if (!mp_irq_entries) {
5446 + struct mpc_config_bus bus;
5447 +
5448 + printk(KERN_ERR "BIOS bug, no explicit IRQ entries, "
5449 + "using default mptable. "
5450 + "(tell your hw vendor)\n");
5451 +
5452 + bus.mpc_type = MP_BUS;
5453 + bus.mpc_busid = 0;
5454 + memcpy(bus.mpc_bustype, "ISA ", 6);
5455 + MP_bus_info(&bus);
5456 +
5457 + construct_default_ioirq_mptable(0);
5458 + }
5459 +#endif
5460 + } else
5461 + BUG();
5462 +
5463 + if (!early)
5464 + printk(KERN_INFO "Processors: %d\n", num_processors);
5465 + /*
5466 + * Only use the first configuration found.
5467 + */
5468 +}
5469 +
5470 +void __init early_get_smp_config(void)
5471 +{
5472 + __get_smp_config(1);
5473 +}
5474 +
5475 +void __init get_smp_config(void)
5476 +{
5477 + __get_smp_config(0);
5478 +}
5479 +
5480 +static int __init smp_scan_config(unsigned long base, unsigned long length,
5481 + unsigned reserve)
5482 +{
5483 + unsigned int *bp = isa_bus_to_virt(base);
5484 + struct intel_mp_floating *mpf;
5485 +
5486 + Dprintk("Scan SMP from %p for %ld bytes.\n", bp, length);
5487 + BUILD_BUG_ON(sizeof(*mpf) != 16);
5488 +
5489 + while (length > 0) {
5490 + mpf = (struct intel_mp_floating *)bp;
5491 + if ((*bp == SMP_MAGIC_IDENT) &&
5492 + (mpf->mpf_length == 1) &&
5493 + !mpf_checksum((unsigned char *)bp, 16) &&
5494 + ((mpf->mpf_specification == 1)
5495 + || (mpf->mpf_specification == 4))) {
5496 +
5497 + smp_found_config = 1;
5498 + mpf_found = mpf;
5499 +#ifdef CONFIG_X86_32
5500 +#ifndef CONFIG_XEN
5501 + printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
5502 + mpf, virt_to_phys(mpf));
5503 + reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE,
5504 + BOOTMEM_DEFAULT);
5505 + if (mpf->mpf_physptr) {
5506 + /*
5507 +			 * We cannot access the MPC table to compute
5508 +			 * its size yet, as only a few megabytes from
5509 +			 * the bottom are mapped at this point.
5510 +			 * The PC-9800 places its MPC table at the very
5511 +			 * end of physical memory, so simply reserving
5512 +			 * PAGE_SIZE from mpf->mpf_physptr would BUG()
5513 +			 * in reserve_bootmem.
5514 + */
5515 + unsigned long size = PAGE_SIZE;
5516 + unsigned long end = max_low_pfn * PAGE_SIZE;
5517 + if (mpf->mpf_physptr + size > end)
5518 + size = end - mpf->mpf_physptr;
5519 + reserve_bootmem(mpf->mpf_physptr, size,
5520 + BOOTMEM_DEFAULT);
5521 + }
5522 +#else
5523 + printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
5524 + mpf, ((void *)bp - isa_bus_to_virt(base)) + base);
5525 +#endif
5526 +#elif !defined(CONFIG_XEN)
5527 + if (!reserve)
5528 + return 1;
5529 +
5530 + reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE);
5531 + if (mpf->mpf_physptr)
5532 + reserve_bootmem_generic(mpf->mpf_physptr,
5533 + PAGE_SIZE);
5534 +#endif
5535 + return 1;
5536 + }
5537 + bp += 4;
5538 + length -= 16;
5539 + }
5540 + return 0;
5541 +}
5542 +
5543 +static void __init __find_smp_config(unsigned reserve)
5544 +{
5545 +#ifndef CONFIG_XEN
5546 + unsigned int address;
5547 +#endif
5548 +
5549 + /*
5550 + * FIXME: Linux assumes you have 640K of base ram..
5551 + * this continues the error...
5552 + *
5553 + * 1) Scan the bottom 1K for a signature
5554 + * 2) Scan the top 1K of base RAM
5555 + * 3) Scan the 64K of bios
5556 + */
5557 + if (smp_scan_config(0x0, 0x400, reserve) ||
5558 + smp_scan_config(639 * 0x400, 0x400, reserve) ||
5559 + smp_scan_config(0xF0000, 0x10000, reserve))
5560 + return;
5561 + /*
5562 + * If it is an SMP machine we should know now, unless the
5563 + * configuration is in an EISA/MCA bus machine with an
5564 + * extended bios data area.
5565 + *
5566 + * there is a real-mode segmented pointer pointing to the
5567 + * 4K EBDA area at 0x40E, calculate and scan it here.
5568 + *
5569 + * NOTE! There are Linux loaders that will corrupt the EBDA
5570 + * area, and as such this kind of SMP config may be less
5571 + * trustworthy, simply because the SMP table may have been
5572 + * stomped on during early boot. These loaders are buggy and
5573 + * should be fixed.
5574 + *
5575 + * MP1.4 SPEC states to only scan first 1K of 4K EBDA.
5576 + */
5577 +
5578 +#ifndef CONFIG_XEN
5579 + address = get_bios_ebda();
5580 + if (address)
5581 + smp_scan_config(address, 0x400, reserve);
5582 +#endif
5583 +}
5584 +
5585 +void __init early_find_smp_config(void)
5586 +{
5587 + __find_smp_config(0);
5588 +}
5589 +
5590 +void __init find_smp_config(void)
5591 +{
5592 + __find_smp_config(1);
5593 +}
5594 +
5595 +/* --------------------------------------------------------------------------
5596 + ACPI-based MP Configuration
5597 + -------------------------------------------------------------------------- */
5598 +
5599 +/*
5600 + * Keep this outside and initialized to 0, for !CONFIG_ACPI builds:
5601 + */
5602 +int es7000_plat;
5603 +
5604 +#ifdef CONFIG_ACPI
5605 +
5606 +#ifdef CONFIG_X86_IO_APIC
5607 +
5608 +#define MP_ISA_BUS 0
5609 +
5610 +extern struct mp_ioapic_routing mp_ioapic_routing[MAX_IO_APICS];
5611 +
5612 +static int mp_find_ioapic(int gsi)
5613 +{
5614 + int i = 0;
5615 +
5616 + /* Find the IOAPIC that manages this GSI. */
5617 + for (i = 0; i < nr_ioapics; i++) {
5618 + if ((gsi >= mp_ioapic_routing[i].gsi_base)
5619 + && (gsi <= mp_ioapic_routing[i].gsi_end))
5620 + return i;
5621 + }
5622 +
5623 + printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
5624 + return -1;
5625 +}
5626 +
5627 +static u8 __init uniq_ioapic_id(u8 id)
5628 +{
5629 +#ifdef CONFIG_X86_32
5630 + if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
5631 + !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
5632 + return io_apic_get_unique_id(nr_ioapics, id);
5633 + else
5634 + return id;
5635 +#else
5636 + int i;
5637 + DECLARE_BITMAP(used, 256);
5638 + bitmap_zero(used, 256);
5639 + for (i = 0; i < nr_ioapics; i++) {
5640 + struct mpc_config_ioapic *ia = &mp_ioapics[i];
5641 + __set_bit(ia->mpc_apicid, used);
5642 + }
5643 + if (!test_bit(id, used))
5644 + return id;
5645 + return find_first_zero_bit(used, 256);
5646 +#endif
5647 +}
5648 +
5649 +void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
5650 +{
5651 + int idx = 0;
5652 +
5653 + if (bad_ioapic(address))
5654 + return;
5655 +
5656 + idx = nr_ioapics;
5657 +
5658 + mp_ioapics[idx].mpc_type = MP_IOAPIC;
5659 + mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
5660 + mp_ioapics[idx].mpc_apicaddr = address;
5661 +
5662 +#ifndef CONFIG_XEN
5663 + set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
5664 +#endif
5665 + mp_ioapics[idx].mpc_apicid = uniq_ioapic_id(id);
5666 +#ifdef CONFIG_X86_32
5667 + mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
5668 +#else
5669 + mp_ioapics[idx].mpc_apicver = 0;
5670 +#endif
5671 + /*
5672 + * Build basic GSI lookup table to facilitate gsi->io_apic lookups
5673 + * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
5674 + */
5675 + mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
5676 + mp_ioapic_routing[idx].gsi_base = gsi_base;
5677 + mp_ioapic_routing[idx].gsi_end = gsi_base +
5678 + io_apic_get_redir_entries(idx);
5679 +
5680 + printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
5681 + "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
5682 + mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
5683 + mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end);
5684 +
5685 + nr_ioapics++;
5686 +}
5687 +
5688 +void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
5689 +{
5690 + struct mpc_config_intsrc intsrc;
5691 + int ioapic = -1;
5692 + int pin = -1;
5693 +
5694 + /*
5695 + * Convert 'gsi' to 'ioapic.pin'.
5696 + */
5697 + ioapic = mp_find_ioapic(gsi);
5698 + if (ioapic < 0)
5699 + return;
5700 + pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
5701 +
5702 + /*
5703 + * TBD: This check is for faulty timer entries, where the override
5704 + * erroneously sets the trigger to level, resulting in a HUGE
5705 + * increase of timer interrupts!
5706 + */
5707 + if ((bus_irq == 0) && (trigger == 3))
5708 + trigger = 1;
5709 +
5710 + intsrc.mpc_type = MP_INTSRC;
5711 + intsrc.mpc_irqtype = mp_INT;
5712 + intsrc.mpc_irqflag = (trigger << 2) | polarity;
5713 + intsrc.mpc_srcbus = MP_ISA_BUS;
5714 + intsrc.mpc_srcbusirq = bus_irq; /* IRQ */
5715 + intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */
5716 + intsrc.mpc_dstirq = pin; /* INTIN# */
5717 +
5718 + MP_intsrc_info(&intsrc);
5719 +}
5720 +
5721 +void __init mp_config_acpi_legacy_irqs(void)
5722 +{
5723 + struct mpc_config_intsrc intsrc;
5724 + int i = 0;
5725 + int ioapic = -1;
5726 +
5727 +#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
5728 + /*
5729 + * Fabricate the legacy ISA bus (bus #31).
5730 + */
5731 + mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
5732 +#endif
5733 + set_bit(MP_ISA_BUS, mp_bus_not_pci);
5734 + Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
5735 +
5736 + /*
5737 + * Older generations of ES7000 have no legacy identity mappings
5738 + */
5739 + if (es7000_plat == 1)
5740 + return;
5741 +
5742 + /*
5743 + * Locate the IOAPIC that manages the ISA IRQs (0-15).
5744 + */
5745 + ioapic = mp_find_ioapic(0);
5746 + if (ioapic < 0)
5747 + return;
5748 +
5749 + intsrc.mpc_type = MP_INTSRC;
5750 + intsrc.mpc_irqflag = 0; /* Conforming */
5751 + intsrc.mpc_srcbus = MP_ISA_BUS;
5752 +#ifdef CONFIG_X86_IO_APIC
5753 + intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
5754 +#endif
5755 + /*
5756 +	 * Use the default configuration for IRQs 0-15, unless
5757 +	 * overridden by (MADT) interrupt source override entries.
5758 + */
5759 + for (i = 0; i < 16; i++) {
5760 + int idx;
5761 +
5762 + for (idx = 0; idx < mp_irq_entries; idx++) {
5763 + struct mpc_config_intsrc *irq = mp_irqs + idx;
5764 +
5765 + /* Do we already have a mapping for this ISA IRQ? */
5766 + if (irq->mpc_srcbus == MP_ISA_BUS
5767 + && irq->mpc_srcbusirq == i)
5768 + break;
5769 +
5770 + /* Do we already have a mapping for this IOAPIC pin */
5771 + if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
5772 + (irq->mpc_dstirq == i))
5773 + break;
5774 + }
5775 +
5776 + if (idx != mp_irq_entries) {
5777 + printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
5778 + continue; /* IRQ already used */
5779 + }
5780 +
5781 + intsrc.mpc_irqtype = mp_INT;
5782 + intsrc.mpc_srcbusirq = i; /* Identity mapped */
5783 + intsrc.mpc_dstirq = i;
5784 +
5785 + MP_intsrc_info(&intsrc);
5786 + }
5787 +}
5788 +
5789 +int mp_register_gsi(u32 gsi, int triggering, int polarity)
5790 +{
5791 + int ioapic;
5792 + int ioapic_pin;
5793 +#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
5794 +#define MAX_GSI_NUM 4096
5795 +#define IRQ_COMPRESSION_START 64
5796 +
5797 + static int pci_irq = IRQ_COMPRESSION_START;
5798 + /*
5799 + * Mapping between Global System Interrupts, which
5800 + * represent all possible interrupts, and IRQs
5801 + * assigned to actual devices.
5802 + */
5803 + static int gsi_to_irq[MAX_GSI_NUM];
5804 +#else
5805 +
5806 + if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
5807 + return gsi;
5808 +#endif
5809 +
5810 + /* Don't set up the ACPI SCI because it's already set up */
5811 + if (acpi_gbl_FADT.sci_interrupt == gsi)
5812 + return gsi;
5813 +
5814 + ioapic = mp_find_ioapic(gsi);
5815 + if (ioapic < 0) {
5816 + printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
5817 + return gsi;
5818 + }
5819 +
5820 + ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
5821 +
5822 +#ifndef CONFIG_X86_32
5823 + if (ioapic_renumber_irq)
5824 + gsi = ioapic_renumber_irq(ioapic, gsi);
5825 +#endif
5826 +
5827 + /*
5828 + * Avoid pin reprogramming. PRTs typically include entries
5829 + * with redundant pin->gsi mappings (but unique PCI devices);
5830 + * we only program the IOAPIC on the first.
5831 + */
5832 + if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
5833 + printk(KERN_ERR "Invalid reference to IOAPIC pin "
5834 + "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
5835 + ioapic_pin);
5836 + return gsi;
5837 + }
5838 + if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) {
5839 + Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
5840 + mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
5841 +#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
5842 + return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
5843 +#else
5844 + return gsi;
5845 +#endif
5846 + }
5847 +
5848 + set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed);
5849 +#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
5850 + /*
5851 + * For GSI >= 64, use IRQ compression
5852 + */
5853 + if ((gsi >= IRQ_COMPRESSION_START)
5854 + && (triggering == ACPI_LEVEL_SENSITIVE)) {
5855 + /*
5856 + * For PCI devices assign IRQs in order, avoiding gaps
5857 + * due to unused I/O APIC pins.
5858 + */
5859 + int irq = gsi;
5860 + if (gsi < MAX_GSI_NUM) {
5861 + /*
5862 + * Retain the VIA chipset work-around (gsi > 15), but
5863 + * avoid a problem where the 8254 timer (IRQ0) is setup
5864 + * via an override (so it's not on pin 0 of the ioapic),
5865 + * and at the same time, the pin 0 interrupt is a PCI
5866 + * type. The gsi > 15 test could cause these two pins
5867 + * to be shared as IRQ0, and they are not shareable.
5868 + * So test for this condition, and if necessary, avoid
5869 + * the pin collision.
5870 + */
5871 + gsi = pci_irq++;
5872 + /*
5873 + * Don't assign IRQ used by ACPI SCI
5874 + */
5875 + if (gsi == acpi_gbl_FADT.sci_interrupt)
5876 + gsi = pci_irq++;
5877 + gsi_to_irq[irq] = gsi;
5878 + } else {
5879 + printk(KERN_ERR "GSI %u is too high\n", gsi);
5880 + return gsi;
5881 + }
5882 + }
5883 +#endif
5884 + io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
5885 + triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
5886 + polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
5887 + return gsi;
5888 +}
5889 +
5890 +#endif /* CONFIG_X86_IO_APIC */
5891 +#endif /* CONFIG_ACPI */
5892 --- a/arch/x86/kernel/pci-dma-xen.c
5893 +++ b/arch/x86/kernel/pci-dma-xen.c
5894 @@ -1,283 +1,251 @@
5895 -/*
5896 - * Dynamic DMA mapping support.
5897 - *
5898 - * On i386 there is no hardware dynamic DMA address translation,
5899 - * so consistent alloc/free are merely page allocation/freeing.
5900 - * The rest of the dynamic DMA mapping interface is implemented
5901 - * in asm/pci.h.
5902 - */
5903 -
5904 -#include <linux/types.h>
5905 -#include <linux/mm.h>
5906 -#include <linux/string.h>
5907 +#include <linux/dma-mapping.h>
5908 +#include <linux/dmar.h>
5909 +#include <linux/bootmem.h>
5910 #include <linux/pci.h>
5911 -#include <linux/module.h>
5912 -#include <linux/version.h>
5913 -#include <asm/io.h>
5914 -#include <xen/balloon.h>
5915 -#include <xen/gnttab.h>
5916 -#include <asm/swiotlb.h>
5917 -#include <asm/tlbflush.h>
5918 -#include <asm/swiotlb_32.h>
5919 -#include <asm/gnttab_dma.h>
5920 -#include <asm/bug.h>
5921
5922 -#ifdef __x86_64__
5923 -#include <asm/iommu.h>
5924 +#include <asm/proto.h>
5925 +#include <asm/dma.h>
5926 +#include <asm/gart.h>
5927 +#include <asm/calgary.h>
5928 +
5929 +int forbid_dac __read_mostly;
5930 +EXPORT_SYMBOL(forbid_dac);
5931 +
5932 +const struct dma_mapping_ops *dma_ops;
5933 +EXPORT_SYMBOL(dma_ops);
5934 +
5935 +static int iommu_sac_force __read_mostly;
5936 +
5937 +#ifdef CONFIG_IOMMU_DEBUG
5938 +int panic_on_overflow __read_mostly = 1;
5939 +int force_iommu __read_mostly = 1;
5940 +#else
5941 +int panic_on_overflow __read_mostly = 0;
5942 +int force_iommu __read_mostly = 0;
5943 +#endif
5944
5945 int iommu_merge __read_mostly = 0;
5946 -EXPORT_SYMBOL(iommu_merge);
5947
5948 -dma_addr_t bad_dma_address __read_mostly;
5949 -EXPORT_SYMBOL(bad_dma_address);
5950 +int no_iommu __read_mostly;
5951 +/* Set this to 1 if there is a HW IOMMU in the system */
5952 +int iommu_detected __read_mostly = 0;
5953
5954 /* This tells the BIO block layer to assume merging. Default to off
5955 because we cannot guarantee merging later. */
5956 int iommu_bio_merge __read_mostly = 0;
5957 EXPORT_SYMBOL(iommu_bio_merge);
5958
5959 -int force_iommu __read_mostly= 0;
5960 +dma_addr_t bad_dma_address __read_mostly = 0;
5961 +EXPORT_SYMBOL(bad_dma_address);
5962
5963 -__init int iommu_setup(char *p)
5964 -{
5965 - return 1;
5966 -}
5967 +/* Dummy device used for NULL arguments (normally ISA). A smaller
5968 +   DMA mask would probably be better, but this is bug-for-bug
5969 +   compatible with older i386. */
5970 +struct device fallback_dev = {
5971 + .bus_id = "fallback device",
5972 + .coherent_dma_mask = DMA_32BIT_MASK,
5973 + .dma_mask = &fallback_dev.coherent_dma_mask,
5974 +};
5975
5976 -void __init pci_iommu_alloc(void)
5977 +int dma_set_mask(struct device *dev, u64 mask)
5978 {
5979 -#ifdef CONFIG_SWIOTLB
5980 - pci_swiotlb_init();
5981 -#endif
5982 -}
5983 + if (!dev->dma_mask || !dma_supported(dev, mask))
5984 + return -EIO;
5985 +
5986 + *dev->dma_mask = mask;
5987
5988 -static int __init pci_iommu_init(void)
5989 -{
5990 - no_iommu_init();
5991 return 0;
5992 }
5993 +EXPORT_SYMBOL(dma_set_mask);
5994
5995 -/* Must execute after PCI subsystem */
5996 -fs_initcall(pci_iommu_init);
5997 -#endif
5998 -
5999 -struct dma_coherent_mem {
6000 - void *virt_base;
6001 - u32 device_base;
6002 - int size;
6003 - int flags;
6004 - unsigned long *bitmap;
6005 -};
6006 -
6007 -#define IOMMU_BUG_ON(test) \
6008 -do { \
6009 - if (unlikely(test)) { \
6010 - printk(KERN_ALERT "Fatal DMA error! " \
6011 - "Please use 'swiotlb=force'\n"); \
6012 - BUG(); \
6013 - } \
6014 -} while (0)
6015 +#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
6016 +static __initdata void *dma32_bootmem_ptr;
6017 +static unsigned long dma32_bootmem_size __initdata = (128ULL<<20);
6018
6019 -static int check_pages_physically_contiguous(unsigned long pfn,
6020 - unsigned int offset,
6021 - size_t length)
6022 +static int __init parse_dma32_size_opt(char *p)
6023 {
6024 - unsigned long next_mfn;
6025 - int i;
6026 - int nr_pages;
6027 -
6028 - next_mfn = pfn_to_mfn(pfn);
6029 - nr_pages = (offset + length + PAGE_SIZE-1) >> PAGE_SHIFT;
6030 -
6031 - for (i = 1; i < nr_pages; i++) {
6032 - if (pfn_to_mfn(++pfn) != ++next_mfn)
6033 - return 0;
6034 - }
6035 - return 1;
6036 + if (!p)
6037 + return -EINVAL;
6038 + dma32_bootmem_size = memparse(p, &p);
6039 + return 0;
6040 }
6041 +early_param("dma32_size", parse_dma32_size_opt);
6042
6043 -int range_straddles_page_boundary(paddr_t p, size_t size)
6044 +void __init dma32_reserve_bootmem(void)
6045 {
6046 - unsigned long pfn = p >> PAGE_SHIFT;
6047 - unsigned int offset = p & ~PAGE_MASK;
6048 + unsigned long size, align;
6049 + if (end_pfn <= MAX_DMA32_PFN)
6050 + return;
6051
6052 - return ((offset + size > PAGE_SIZE) &&
6053 - !check_pages_physically_contiguous(pfn, offset, size));
6054 + align = 64ULL<<20;
6055 + size = round_up(dma32_bootmem_size, align);
6056 + dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align,
6057 + __pa(MAX_DMA_ADDRESS));
6058 + if (dma32_bootmem_ptr)
6059 + dma32_bootmem_size = size;
6060 + else
6061 + dma32_bootmem_size = 0;
6062 }
6063 -
6064 -int
6065 -dma_map_sg(struct device *hwdev, struct scatterlist *sgl, int nents,
6066 - enum dma_data_direction direction)
6067 +static void __init dma32_free_bootmem(void)
6068 {
6069 - int i, rc;
6070 + int node;
6071 +
6072 + if (end_pfn <= MAX_DMA32_PFN)
6073 + return;
6074
6075 - BUG_ON(!valid_dma_direction(direction));
6076 - WARN_ON(nents == 0 || sgl->length == 0);
6077 + if (!dma32_bootmem_ptr)
6078 + return;
6079
6080 - if (swiotlb) {
6081 - rc = swiotlb_map_sg(hwdev, sgl, nents, direction);
6082 - } else {
6083 - struct scatterlist *sg;
6084 -
6085 - for_each_sg(sgl, sg, nents, i) {
6086 - BUG_ON(!sg_page(sg));
6087 - sg->dma_address =
6088 - gnttab_dma_map_page(sg_page(sg)) + sg->offset;
6089 - sg->dma_length = sg->length;
6090 - IOMMU_BUG_ON(address_needs_mapping(
6091 - hwdev, sg->dma_address));
6092 - IOMMU_BUG_ON(range_straddles_page_boundary(
6093 - page_to_pseudophys(sg_page(sg)) + sg->offset,
6094 - sg->length));
6095 - }
6096 - rc = nents;
6097 - }
6098 + for_each_online_node(node)
6099 + free_bootmem_node(NODE_DATA(node), __pa(dma32_bootmem_ptr),
6100 + dma32_bootmem_size);
6101
6102 - flush_write_buffers();
6103 - return rc;
6104 + dma32_bootmem_ptr = NULL;
6105 + dma32_bootmem_size = 0;
6106 }
6107 -EXPORT_SYMBOL(dma_map_sg);
6108 +#else
6109 +#define dma32_free_bootmem() ((void)0)
6110 +#endif
6111
6112 -void
6113 -dma_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nents,
6114 - enum dma_data_direction direction)
6115 -{
6116 - int i;
6117 +static const struct dma_mapping_ops swiotlb_dma_ops = {
6118 + .mapping_error = swiotlb_dma_mapping_error,
6119 + .map_single = swiotlb_map_single_phys,
6120 + .unmap_single = swiotlb_unmap_single,
6121 + .sync_single_for_cpu = swiotlb_sync_single_for_cpu,
6122 + .sync_single_for_device = swiotlb_sync_single_for_device,
6123 + .sync_single_range_for_cpu = swiotlb_sync_single_range_for_cpu,
6124 + .sync_single_range_for_device = swiotlb_sync_single_range_for_device,
6125 + .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu,
6126 + .sync_sg_for_device = swiotlb_sync_sg_for_device,
6127 + .map_sg = swiotlb_map_sg,
6128 + .unmap_sg = swiotlb_unmap_sg,
6129 + .dma_supported = swiotlb_dma_supported
6130 +};
6131
6132 - BUG_ON(!valid_dma_direction(direction));
6133 - if (swiotlb)
6134 - swiotlb_unmap_sg(hwdev, sgl, nents, direction);
6135 - else {
6136 - struct scatterlist *sg;
6137 +void __init pci_iommu_alloc(void)
6138 +{
6139 +	/* free the range so the IOMMU can get some memory below 4G */
6140 + dma32_free_bootmem();
6141 + /*
6142 + * The order of these functions is important for
6143 + * fall-back/fail-over reasons
6144 + */
6145 +#ifdef CONFIG_GART_IOMMU
6146 + gart_iommu_hole_init();
6147 +#endif
6148
6149 - for_each_sg(sgl, sg, nents, i)
6150 - gnttab_dma_unmap_page(sg->dma_address);
6151 - }
6152 -}
6153 -EXPORT_SYMBOL(dma_unmap_sg);
6154 +#ifdef CONFIG_CALGARY_IOMMU
6155 + detect_calgary();
6156 +#endif
6157
6158 -#ifdef CONFIG_HIGHMEM
6159 -dma_addr_t
6160 -dma_map_page(struct device *dev, struct page *page, unsigned long offset,
6161 - size_t size, enum dma_data_direction direction)
6162 -{
6163 - dma_addr_t dma_addr;
6164 + detect_intel_iommu();
6165
6166 - BUG_ON(!valid_dma_direction(direction));
6167 +#ifdef CONFIG_SWIOTLB
6168 + swiotlb_init();
6169 if (swiotlb) {
6170 - dma_addr = swiotlb_map_page(
6171 - dev, page, offset, size, direction);
6172 - } else {
6173 - dma_addr = gnttab_dma_map_page(page) + offset;
6174 - IOMMU_BUG_ON(address_needs_mapping(dev, dma_addr));
6175 + printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n");
6176 + dma_ops = &swiotlb_dma_ops;
6177 }
6178 -
6179 - return dma_addr;
6180 +#endif
6181 }
6182 -EXPORT_SYMBOL(dma_map_page);
6183
6184 -void
6185 -dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size,
6186 - enum dma_data_direction direction)
6187 +/*
6188 + * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter
6189 + * documentation.
6190 + */
6191 +static __init int iommu_setup(char *p)
6192 {
6193 - BUG_ON(!valid_dma_direction(direction));
6194 - if (swiotlb)
6195 - swiotlb_unmap_page(dev, dma_address, size, direction);
6196 - else
6197 - gnttab_dma_unmap_page(dma_address);
6198 -}
6199 -EXPORT_SYMBOL(dma_unmap_page);
6200 -#endif /* CONFIG_HIGHMEM */
6201 + iommu_merge = 1;
6202
6203 -int
6204 -dma_mapping_error(dma_addr_t dma_addr)
6205 -{
6206 - if (swiotlb)
6207 - return swiotlb_dma_mapping_error(dma_addr);
6208 - return 0;
6209 -}
6210 -EXPORT_SYMBOL(dma_mapping_error);
6211 + if (!p)
6212 + return -EINVAL;
6213
6214 -int
6215 -dma_supported(struct device *dev, u64 mask)
6216 -{
6217 - if (swiotlb)
6218 - return swiotlb_dma_supported(dev, mask);
6219 - /*
6220 - * By default we'll BUG when an infeasible DMA is requested, and
6221 - * request swiotlb=force (see IOMMU_BUG_ON).
6222 - */
6223 - return 1;
6224 -}
6225 -EXPORT_SYMBOL(dma_supported);
6226 + while (*p) {
6227 + if (!strncmp(p, "off", 3))
6228 + no_iommu = 1;
6229 + /* gart_parse_options has more force support */
6230 + if (!strncmp(p, "force", 5))
6231 + force_iommu = 1;
6232 + if (!strncmp(p, "noforce", 7)) {
6233 + iommu_merge = 0;
6234 + force_iommu = 0;
6235 + }
6236
6237 -void *dma_alloc_coherent(struct device *dev, size_t size,
6238 - dma_addr_t *dma_handle, gfp_t gfp)
6239 -{
6240 - void *ret;
6241 - struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
6242 - unsigned int order = get_order(size);
6243 - unsigned long vstart;
6244 - u64 mask;
6245 + if (!strncmp(p, "biomerge", 8)) {
6246 + iommu_bio_merge = 4096;
6247 + iommu_merge = 1;
6248 + force_iommu = 1;
6249 + }
6250 + if (!strncmp(p, "panic", 5))
6251 + panic_on_overflow = 1;
6252 + if (!strncmp(p, "nopanic", 7))
6253 + panic_on_overflow = 0;
6254 + if (!strncmp(p, "merge", 5)) {
6255 + iommu_merge = 1;
6256 + force_iommu = 1;
6257 + }
6258 + if (!strncmp(p, "nomerge", 7))
6259 + iommu_merge = 0;
6260 + if (!strncmp(p, "forcesac", 8))
6261 + iommu_sac_force = 1;
6262 + if (!strncmp(p, "allowdac", 8))
6263 + forbid_dac = 0;
6264 + if (!strncmp(p, "nodac", 5))
6265 + forbid_dac = -1;
6266 + if (!strncmp(p, "usedac", 6)) {
6267 + forbid_dac = -1;
6268 + return 1;
6269 + }
6270 +#ifdef CONFIG_SWIOTLB
6271 + if (!strncmp(p, "soft", 4))
6272 + swiotlb = 1;
6273 +#endif
6274
6275 - /* ignore region specifiers */
6276 - gfp &= ~(__GFP_DMA | __GFP_HIGHMEM);
6277 +#ifdef CONFIG_GART_IOMMU
6278 + gart_parse_options(p);
6279 +#endif
6280
6281 - if (mem) {
6282 - int page = bitmap_find_free_region(mem->bitmap, mem->size,
6283 - order);
6284 - if (page >= 0) {
6285 - *dma_handle = mem->device_base + (page << PAGE_SHIFT);
6286 - ret = mem->virt_base + (page << PAGE_SHIFT);
6287 - memset(ret, 0, size);
6288 - return ret;
6289 - }
6290 - if (mem->flags & DMA_MEMORY_EXCLUSIVE)
6291 - return NULL;
6292 +#ifdef CONFIG_CALGARY_IOMMU
6293 + if (!strncmp(p, "calgary", 7))
6294 + use_calgary = 1;
6295 +#endif /* CONFIG_CALGARY_IOMMU */
6296 +
6297 + p += strcspn(p, ",");
6298 + if (*p == ',')
6299 + ++p;
6300 }
6301 + return 0;
6302 +}
6303 +early_param("iommu", iommu_setup);
6304
6305 - if (dev == NULL || (dev->coherent_dma_mask < 0xffffffff))
6306 - gfp |= GFP_DMA;
6307 -
6308 - vstart = __get_free_pages(gfp, order);
6309 - ret = (void *)vstart;
6310 +static int check_pages_physically_contiguous(unsigned long pfn,
6311 + unsigned int offset,
6312 + size_t length)
6313 +{
6314 + unsigned long next_mfn;
6315 + int i;
6316 + int nr_pages;
6317
6318 - if (dev != NULL && dev->coherent_dma_mask)
6319 - mask = dev->coherent_dma_mask;
6320 - else
6321 - mask = 0xffffffff;
6322 + next_mfn = pfn_to_mfn(pfn);
6323 + nr_pages = (offset + length + PAGE_SIZE-1) >> PAGE_SHIFT;
6324
6325 - if (ret != NULL) {
6326 - if (xen_create_contiguous_region(vstart, order,
6327 - fls64(mask)) != 0) {
6328 - free_pages(vstart, order);
6329 - return NULL;
6330 - }
6331 - memset(ret, 0, size);
6332 - *dma_handle = virt_to_bus(ret);
6333 + for (i = 1; i < nr_pages; i++) {
6334 + if (pfn_to_mfn(++pfn) != ++next_mfn)
6335 + return 0;
6336 }
6337 - return ret;
6338 + return 1;
6339 }
6340 -EXPORT_SYMBOL(dma_alloc_coherent);
6341
6342 -void dma_free_coherent(struct device *dev, size_t size,
6343 - void *vaddr, dma_addr_t dma_handle)
6344 +int range_straddles_page_boundary(paddr_t p, size_t size)
6345 {
6346 - struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
6347 - int order = get_order(size);
6348 -
6349 - WARN_ON(irqs_disabled()); /* for portability */
6350 - if (mem && vaddr >= mem->virt_base && vaddr < (mem->virt_base + (mem->size << PAGE_SHIFT))) {
6351 - int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
6352 + unsigned long pfn = p >> PAGE_SHIFT;
6353 + unsigned int offset = p & ~PAGE_MASK;
6354
6355 - bitmap_release_region(mem->bitmap, page, order);
6356 - } else {
6357 - xen_destroy_contiguous_region((unsigned long)vaddr, order);
6358 - free_pages((unsigned long)vaddr, order);
6359 - }
6360 + return ((offset + size > PAGE_SIZE) &&
6361 + !check_pages_physically_contiguous(pfn, offset, size));
6362 }
6363 -EXPORT_SYMBOL(dma_free_coherent);
6364
6365 -#ifdef ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY
6366 +#ifdef CONFIG_X86_32
6367 int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
6368 dma_addr_t device_addr, size_t size, int flags)
6369 {
6370 @@ -327,8 +295,8 @@ EXPORT_SYMBOL(dma_declare_coherent_memor
6371 void dma_release_declared_memory(struct device *dev)
6372 {
6373 struct dma_coherent_mem *mem = dev->dma_mem;
6374 -
6375 - if(!mem)
6376 +
6377 + if (!mem)
6378 return;
6379 dev->dma_mem = NULL;
6380 iounmap(mem->virt_base);
6381 @@ -341,8 +309,10 @@ void *dma_mark_declared_memory_occupied(
6382 dma_addr_t device_addr, size_t size)
6383 {
6384 struct dma_coherent_mem *mem = dev->dma_mem;
6385 - int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1) >> PAGE_SHIFT;
6386 int pos, err;
6387 + int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1);
6388 +
6389 + pages >>= PAGE_SHIFT;
6390
6391 if (!mem)
6392 return ERR_PTR(-EINVAL);
6393 @@ -354,103 +324,270 @@ void *dma_mark_declared_memory_occupied(
6394 return mem->virt_base + (pos << PAGE_SHIFT);
6395 }
6396 EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
6397 -#endif /* ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY */
6398 -
6399 -#if defined(CONFIG_PCI) && !defined(CONFIG_XEN)
6400 -/* Many VIA bridges seem to corrupt data for DAC. Disable it here */
6401
6402 -int forbid_dac;
6403 -EXPORT_SYMBOL(forbid_dac);
6404 -
6405 -static __devinit void via_no_dac(struct pci_dev *dev)
6406 +static int dma_alloc_from_coherent_mem(struct device *dev, ssize_t size,
6407 + dma_addr_t *dma_handle, void **ret)
6408 {
6409 - if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) {
6410 - printk(KERN_INFO "PCI: VIA PCI bridge detected. Disabling DAC.\n");
6411 - forbid_dac = 1;
6412 + struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
6413 + int order = get_order(size);
6414 +
6415 + if (mem) {
6416 + int page = bitmap_find_free_region(mem->bitmap, mem->size,
6417 + order);
6418 + if (page >= 0) {
6419 + *dma_handle = mem->device_base + (page << PAGE_SHIFT);
6420 + *ret = mem->virt_base + (page << PAGE_SHIFT);
6421 + memset(*ret, 0, size);
6422 + }
6423 + if (mem->flags & DMA_MEMORY_EXCLUSIVE)
6424 + *ret = NULL;
6425 }
6426 + return (mem != NULL);
6427 }
6428 -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac);
6429
6430 -static int check_iommu(char *s)
6431 +static int dma_release_coherent(struct device *dev, int order, void *vaddr)
6432 {
6433 - if (!strcmp(s, "usedac")) {
6434 - forbid_dac = -1;
6435 + struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
6436 +
6437 + if (mem && vaddr >= mem->virt_base && vaddr <
6438 + (mem->virt_base + (mem->size << PAGE_SHIFT))) {
6439 + int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
6440 +
6441 + bitmap_release_region(mem->bitmap, page, order);
6442 return 1;
6443 }
6444 return 0;
6445 }
6446 -__setup("iommu=", check_iommu);
6447 +#else
6448 +#define dma_alloc_from_coherent_mem(dev, size, handle, ret) (0)
6449 +#define dma_release_coherent(dev, order, vaddr) (0)
6450 +#endif /* CONFIG_X86_32 */
6451 +
6452 +int dma_supported(struct device *dev, u64 mask)
6453 +{
6454 +#ifdef CONFIG_PCI
6455 + if (mask > 0xffffffff && forbid_dac > 0) {
6456 + printk(KERN_INFO "PCI: Disallowing DAC for device %s\n",
6457 + dev->bus_id);
6458 + return 0;
6459 + }
6460 #endif
6461
6462 -dma_addr_t
6463 -dma_map_single(struct device *dev, void *ptr, size_t size,
6464 - enum dma_data_direction direction)
6465 + if (dma_ops->dma_supported)
6466 + return dma_ops->dma_supported(dev, mask);
6467 +
6468 + /* Copied from i386. Doesn't make much sense, because it will
6469 + only work for pci_alloc_coherent.
6470 + The caller just has to use GFP_DMA in this case. */
6471 + if (mask < DMA_24BIT_MASK)
6472 + return 0;
6473 +
6474 + /* Tell the device to use SAC when IOMMU force is on. This
6475 + allows the driver to use cheaper accesses in some cases.
6476 +
6477 + Problem with this is that if we overflow the IOMMU area and
6478 + return DAC as fallback address the device may not handle it
6479 + correctly.
6480 +
6481 + As a special case some controllers have a 39bit address
6482 + mode that is as efficient as 32bit (aic79xx). Don't force
6483 + SAC for these. Assume all masks <= 40 bits are of this
6484 + type. Normally this doesn't make any difference, but gives
6485 + more gentle handling of IOMMU overflow. */
6486 + if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) {
6487 + printk(KERN_INFO "%s: Force SAC with mask %Lx\n",
6488 + dev->bus_id, mask);
6489 + return 0;
6490 + }
6491 +
6492 + return 1;
6493 +}
6494 +EXPORT_SYMBOL(dma_supported);
6495 +
6496 +/* Allocate DMA memory on node near device */
6497 +static struct page *
6498 +dma_alloc_pages(struct device *dev, gfp_t gfp, unsigned order)
6499 {
6500 - dma_addr_t dma;
6501 + int node;
6502
6503 - BUG_ON(!valid_dma_direction(direction));
6504 - WARN_ON(size == 0);
6505 + node = dev_to_node(dev);
6506
6507 - if (swiotlb) {
6508 - dma = swiotlb_map_single(dev, ptr, size, direction);
6509 - } else {
6510 - dma = gnttab_dma_map_page(virt_to_page(ptr)) +
6511 - offset_in_page(ptr);
6512 - IOMMU_BUG_ON(range_straddles_page_boundary(__pa(ptr), size));
6513 - IOMMU_BUG_ON(address_needs_mapping(dev, dma));
6514 - }
6515 -
6516 - flush_write_buffers();
6517 - return dma;
6518 -}
6519 -EXPORT_SYMBOL(dma_map_single);
6520 -
6521 -void
6522 -dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
6523 - enum dma_data_direction direction)
6524 -{
6525 - BUG_ON(!valid_dma_direction(direction));
6526 - if (swiotlb)
6527 - swiotlb_unmap_single(dev, dma_addr, size, direction);
6528 - else
6529 - gnttab_dma_unmap_page(dma_addr);
6530 + return alloc_pages_node(node, gfp, order);
6531 +}
6532 +
6533 +/*
6534 + * Allocate memory for a coherent mapping.
6535 + */
6536 +void *
6537 +dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
6538 + gfp_t gfp)
6539 +{
6540 + void *memory = NULL;
6541 + struct page *page;
6542 + unsigned long dma_mask = 0;
6543 + int noretry = 0;
6544 + unsigned int order = get_order(size);
6545 +
6546 + /* ignore region specifiers */
6547 + gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
6548 +
6549 + if (dma_alloc_from_coherent_mem(dev, size, dma_handle, &memory))
6550 + return memory;
6551 +
6552 + if (!dev) {
6553 + dev = &fallback_dev;
6554 + gfp |= GFP_DMA;
6555 + }
6556 + dma_mask = dev->coherent_dma_mask;
6557 + if (dma_mask == 0)
6558 + dma_mask = (gfp & GFP_DMA) ? DMA_24BIT_MASK : DMA_32BIT_MASK;
6559 +
6560 + /* Device not DMA able */
6561 + if (dev->dma_mask == NULL)
6562 + return NULL;
6563 +
6564 + /* Don't invoke OOM killer or retry in lower 16MB DMA zone */
6565 + if (gfp & __GFP_DMA)
6566 + noretry = 1;
6567 +
6568 +#ifdef CONFIG_XEN
6569 + gfp &= ~(__GFP_DMA | __GFP_DMA32);
6570 +#else
6571 +#ifdef CONFIG_X86_64
6572 + /* Why <=? Even when the mask is smaller than 4GB it is often
6573 + larger than 16MB and in this case we have a chance of
6574 + finding fitting memory in the next higher zone first. If
6575 + not retry with true GFP_DMA. -AK */
6576 + if (dma_mask <= DMA_32BIT_MASK && !(gfp & GFP_DMA)) {
6577 + gfp |= GFP_DMA32;
6578 +#endif
6579 +
6580 + again:
6581 +#endif
6582 + page = dma_alloc_pages(dev,
6583 + noretry ? gfp | __GFP_NORETRY : gfp, order);
6584 + if (page == NULL)
6585 + return NULL;
6586 +
6587 +#ifndef CONFIG_XEN
6588 + {
6589 + int high, mmu;
6590 + dma_addr_t bus = page_to_phys(page);
6591 + memory = page_address(page);
6592 + high = (bus + size) >= dma_mask;
6593 + mmu = high;
6594 + if (force_iommu && !(gfp & GFP_DMA))
6595 + mmu = 1;
6596 + else if (high) {
6597 + free_pages((unsigned long)memory, order);
6598 +
6599 + /* Don't use the 16MB ZONE_DMA unless absolutely
6600 + needed. It's better to use remapping first. */
6601 + if (dma_mask < DMA_32BIT_MASK && !(gfp & GFP_DMA)) {
6602 + gfp = (gfp & ~GFP_DMA32) | GFP_DMA;
6603 + goto again;
6604 + }
6605 +
6606 + /* Let low level make its own zone decisions */
6607 + gfp &= ~(GFP_DMA32|GFP_DMA);
6608 +
6609 + if (dma_ops->alloc_coherent)
6610 + return dma_ops->alloc_coherent(dev, size,
6611 + dma_handle, gfp);
6612 + return NULL;
6613 + }
6614 +
6615 + memset(memory, 0, size);
6616 + if (!mmu) {
6617 + *dma_handle = bus;
6618 + return memory;
6619 + }
6620 + }
6621 +
6622 + if (dma_ops->alloc_coherent) {
6623 + free_pages((unsigned long)memory, order);
6624 + gfp &= ~(GFP_DMA|GFP_DMA32);
6625 + return dma_ops->alloc_coherent(dev, size, dma_handle, gfp);
6626 + }
6627 +
6628 + if (dma_ops->map_simple) {
6629 + *dma_handle = dma_ops->map_simple(dev, virt_to_bus(memory),
6630 + size,
6631 + PCI_DMA_BIDIRECTIONAL);
6632 + if (*dma_handle != bad_dma_address)
6633 + return memory;
6634 + }
6635 +#else
6636 + memory = page_address(page);
6637 + if (xen_create_contiguous_region((unsigned long)memory, order,
6638 + fls64(dma_mask)) == 0) {
6639 + memset(memory, 0, size);
6640 + *dma_handle = virt_to_bus(memory);
6641 + return memory;
6642 + }
6643 +#endif
6644 +
6645 + if (panic_on_overflow)
6646 + panic("dma_alloc_coherent: IOMMU overflow by %lu bytes\n",
6647 + (unsigned long)size);
6648 + free_pages((unsigned long)memory, order);
6649 + return NULL;
6650 }
6651 -EXPORT_SYMBOL(dma_unmap_single);
6652 +EXPORT_SYMBOL(dma_alloc_coherent);
6653
6654 -void
6655 -dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size,
6656 - enum dma_data_direction direction)
6657 +/*
6658 + * Unmap coherent memory.
6659 + * The caller must ensure that the device has finished accessing the mapping.
6660 + */
6661 +void dma_free_coherent(struct device *dev, size_t size,
6662 + void *vaddr, dma_addr_t bus)
6663 {
6664 - if (swiotlb)
6665 - swiotlb_sync_single_for_cpu(dev, dma_handle, size, direction);
6666 + int order = get_order(size);
6667 + WARN_ON(irqs_disabled()); /* for portability */
6668 + if (dma_release_coherent(dev, order, vaddr))
6669 + return;
6670 +#ifndef CONFIG_XEN
6671 + if (dma_ops->unmap_single)
6672 + dma_ops->unmap_single(dev, bus, size, 0);
6673 +#endif
6674 + xen_destroy_contiguous_region((unsigned long)vaddr, order);
6675 + free_pages((unsigned long)vaddr, order);
6676 }
6677 -EXPORT_SYMBOL(dma_sync_single_for_cpu);
6678 +EXPORT_SYMBOL(dma_free_coherent);
6679
6680 -void
6681 -dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size,
6682 - enum dma_data_direction direction)
6683 +static int __init pci_iommu_init(void)
6684 {
6685 - if (swiotlb)
6686 - swiotlb_sync_single_for_device(dev, dma_handle, size, direction);
6687 +#ifdef CONFIG_CALGARY_IOMMU
6688 + calgary_iommu_init();
6689 +#endif
6690 +
6691 + intel_iommu_init();
6692 +
6693 +#ifdef CONFIG_GART_IOMMU
6694 + gart_iommu_init();
6695 +#endif
6696 +
6697 + no_iommu_init();
6698 + return 0;
6699 }
6700 -EXPORT_SYMBOL(dma_sync_single_for_device);
6701
6702 -void
6703 -dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems,
6704 - enum dma_data_direction direction)
6705 +void pci_iommu_shutdown(void)
6706 {
6707 - if (swiotlb)
6708 - swiotlb_sync_sg_for_cpu(dev,sg,nelems,direction);
6709 - flush_write_buffers();
6710 + gart_iommu_shutdown();
6711 }
6712 -EXPORT_SYMBOL(dma_sync_sg_for_cpu);
6713 +/* Must execute after PCI subsystem */
6714 +fs_initcall(pci_iommu_init);
6715 +
6716 +#ifdef CONFIG_PCI
6717 +/* Many VIA bridges seem to corrupt data for DAC. Disable it here */
6718
6719 -void
6720 -dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems,
6721 - enum dma_data_direction direction)
6722 +static __devinit void via_no_dac(struct pci_dev *dev)
6723 {
6724 - if (swiotlb)
6725 - swiotlb_sync_sg_for_device(dev,sg,nelems,direction);
6726 - flush_write_buffers();
6727 + if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) {
6728 +		printk(KERN_INFO "PCI: VIA PCI bridge detected. "
6729 + "Disabling DAC.\n");
6730 + forbid_dac = 1;
6731 + }
6732 }
6733 -EXPORT_SYMBOL(dma_sync_sg_for_device);
6734 +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac);
6735 +#endif
6736 --- /dev/null
6737 +++ b/arch/x86/kernel/pci-nommu-xen.c
6738 @@ -0,0 +1,103 @@
6739 +#include <linux/dma-mapping.h>
6740 +#include <linux/dmar.h>
6741 +#include <linux/bootmem.h>
6742 +#include <linux/pci.h>
6743 +
6744 +#include <xen/gnttab.h>
6745 +
6746 +#include <asm/proto.h>
6747 +#include <asm/dma.h>
6748 +#include <asm/swiotlb.h>
6749 +#include <asm/tlbflush.h>
6750 +#include <asm/gnttab_dma.h>
6751 +#include <asm/bug.h>
6752 +
6753 +#define IOMMU_BUG_ON(test) \
6754 +do { \
6755 + if (unlikely(test)) { \
6756 + printk(KERN_ALERT "Fatal DMA error! " \
6757 + "Please use 'swiotlb=force'\n"); \
6758 + BUG(); \
6759 + } \
6760 +} while (0)
6761 +
6762 +static int
6763 +gnttab_map_sg(struct device *hwdev, struct scatterlist *sgl, int nents,
6764 + int direction)
6765 +{
6766 + unsigned int i;
6767 + struct scatterlist *sg;
6768 +
6769 + WARN_ON(nents == 0 || sgl->length == 0);
6770 +
6771 + for_each_sg(sgl, sg, nents, i) {
6772 + BUG_ON(!sg_page(sg));
6773 + sg->dma_address =
6774 + gnttab_dma_map_page(sg_page(sg)) + sg->offset;
6775 + sg->dma_length = sg->length;
6776 + IOMMU_BUG_ON(address_needs_mapping(
6777 + hwdev, sg->dma_address));
6778 + IOMMU_BUG_ON(range_straddles_page_boundary(
6779 + page_to_pseudophys(sg_page(sg)) + sg->offset,
6780 + sg->length));
6781 + }
6782 +
6783 + return nents;
6784 +}
6785 +
6786 +static void
6787 +gnttab_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nents,
6788 + int direction)
6789 +{
6790 + unsigned int i;
6791 + struct scatterlist *sg;
6792 +
6793 + for_each_sg(sgl, sg, nents, i)
6794 + gnttab_dma_unmap_page(sg->dma_address);
6795 +}
6796 +
6797 +static dma_addr_t
6798 +gnttab_map_single(struct device *dev, phys_addr_t paddr, size_t size,
6799 + int direction)
6800 +{
6801 + dma_addr_t dma;
6802 +
6803 + WARN_ON(size == 0);
6804 +
6805 + dma = gnttab_dma_map_page(pfn_to_page(paddr >> PAGE_SHIFT)) +
6806 + offset_in_page(paddr);
6807 + IOMMU_BUG_ON(range_straddles_page_boundary(paddr, size));
6808 + IOMMU_BUG_ON(address_needs_mapping(dev, dma));
6809 +
6810 + return dma;
6811 +}
6812 +
6813 +static void
6814 +gnttab_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
6815 + int direction)
6816 +{
6817 + gnttab_dma_unmap_page(dma_addr);
6818 +}
6819 +
6820 +static int nommu_mapping_error(dma_addr_t dma_addr)
6821 +{
6822 + return (dma_addr == bad_dma_address);
6823 +}
6824 +
6825 +static const struct dma_mapping_ops nommu_dma_ops = {
6826 + .map_single = gnttab_map_single,
6827 + .unmap_single = gnttab_unmap_single,
6828 + .map_sg = gnttab_map_sg,
6829 + .unmap_sg = gnttab_unmap_sg,
6830 + .dma_supported = swiotlb_dma_supported,
6831 + .mapping_error = nommu_mapping_error
6832 +};
6833 +
6834 +void __init no_iommu_init(void)
6835 +{
6836 + if (dma_ops)
6837 + return;
6838 +
6839 + force_iommu = 0; /* no HW IOMMU */
6840 + dma_ops = &nommu_dma_ops;
6841 +}
6842 --- a/arch/x86/kernel/process_32-xen.c
6843 +++ b/arch/x86/kernel/process_32-xen.c
6844 @@ -36,6 +36,7 @@
6845 #include <linux/personality.h>
6846 #include <linux/tick.h>
6847 #include <linux/percpu.h>
6848 +#include <linux/prctl.h>
6849
6850 #include <asm/uaccess.h>
6851 #include <asm/pgtable.h>
6852 @@ -45,7 +46,6 @@
6853 #include <asm/processor.h>
6854 #include <asm/i387.h>
6855 #include <asm/desc.h>
6856 -#include <asm/vm86.h>
6857 #ifdef CONFIG_MATH_EMULATION
6858 #include <asm/math_emu.h>
6859 #endif
6860 @@ -102,16 +102,6 @@ void enable_hlt(void)
6861
6862 EXPORT_SYMBOL(enable_hlt);
6863
6864 -/*
6865 - * On SMP it's slightly faster (but much more power-consuming!)
6866 - * to poll the ->work.need_resched flag instead of waiting for the
6867 - * cross-CPU IPI to arrive. Use this option with caution.
6868 - */
6869 -static void poll_idle(void)
6870 -{
6871 - cpu_relax();
6872 -}
6873 -
6874 static void xen_idle(void)
6875 {
6876 current_thread_info()->status &= ~TS_POLLING;
6877 @@ -121,20 +111,10 @@ static void xen_idle(void)
6878 */
6879 smp_mb();
6880
6881 - local_irq_disable();
6882 - if (!need_resched()) {
6883 - ktime_t t0, t1;
6884 - u64 t0n, t1n;
6885 -
6886 - t0 = ktime_get();
6887 - t0n = ktime_to_ns(t0);
6888 + if (!need_resched())
6889 safe_halt(); /* enables interrupts racelessly */
6890 - local_irq_disable();
6891 - t1 = ktime_get();
6892 - t1n = ktime_to_ns(t1);
6893 - sched_clock_idle_wakeup_event(t1n - t0n);
6894 - }
6895 - local_irq_enable();
6896 + else
6897 + local_irq_enable();
6898 current_thread_info()->status |= TS_POLLING;
6899 }
6900 #ifdef CONFIG_APM_MODULE
6901 @@ -142,7 +122,6 @@ EXPORT_SYMBOL(default_idle);
6902 #endif
6903
6904 #ifdef CONFIG_HOTPLUG_CPU
6905 -extern cpumask_t cpu_initialized;
6906 static inline void play_dead(void)
6907 {
6908 idle_task_exit();
6909 @@ -187,6 +166,7 @@ void cpu_idle(void)
6910 if (cpu_is_offline(cpu))
6911 play_dead();
6912
6913 + local_irq_disable();
6914 __get_cpu_var(irq_stat).idle_timestamp = jiffies;
6915 idle();
6916 }
6917 @@ -197,44 +177,6 @@ void cpu_idle(void)
6918 }
6919 }
6920
6921 -static void do_nothing(void *unused)
6922 -{
6923 -}
6924 -
6925 -/*
6926 - * cpu_idle_wait - Used to ensure that all the CPUs discard old value of
6927 - * pm_idle and update to new pm_idle value. Required while changing pm_idle
6928 - * handler on SMP systems.
6929 - *
6930 - * Caller must have changed pm_idle to the new value before the call. Old
6931 - * pm_idle value will not be used by any CPU after the return of this function.
6932 - */
6933 -void cpu_idle_wait(void)
6934 -{
6935 - smp_mb();
6936 - /* kick all the CPUs so that they exit out of pm_idle */
6937 - smp_call_function(do_nothing, NULL, 0, 1);
6938 -}
6939 -EXPORT_SYMBOL_GPL(cpu_idle_wait);
6940 -
6941 -void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
6942 -{
6943 -}
6944 -
6945 -static int __init idle_setup(char *str)
6946 -{
6947 - if (!strcmp(str, "poll")) {
6948 - printk("using polling idle threads.\n");
6949 - pm_idle = poll_idle;
6950 - }
6951 - else
6952 - return -1;
6953 -
6954 - boot_option_idle_override = 1;
6955 - return 0;
6956 -}
6957 -early_param("idle", idle_setup);
6958 -
6959 void __show_registers(struct pt_regs *regs, int all)
6960 {
6961 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
6962 @@ -260,7 +202,7 @@ void __show_registers(struct pt_regs *re
6963 init_utsname()->version);
6964
6965 printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n",
6966 - 0xffff & regs->cs, regs->ip, regs->flags,
6967 + (u16)regs->cs, regs->ip, regs->flags,
6968 smp_processor_id());
6969 print_symbol("EIP is at %s\n", regs->ip);
6970
6971 @@ -269,8 +211,7 @@ void __show_registers(struct pt_regs *re
6972 printk("ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n",
6973 regs->si, regs->di, regs->bp, sp);
6974 printk(" DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n",
6975 - regs->ds & 0xffff, regs->es & 0xffff,
6976 - regs->fs & 0xffff, gs, ss);
6977 + (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss);
6978
6979 if (!all)
6980 return;
6981 @@ -367,6 +308,7 @@ void flush_thread(void)
6982 /*
6983 * Forget coprocessor state..
6984 */
6985 + tsk->fpu_counter = 0;
6986 clear_fpu(tsk);
6987 clear_used_math();
6988 }
6989 @@ -437,11 +379,30 @@ int copy_thread(int nr, unsigned long cl
6990 return err;
6991 }
6992
6993 -#ifdef CONFIG_SECCOMP
6994 +void
6995 +start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
6996 +{
6997 + __asm__("movl %0, %%gs" :: "r"(0));
6998 + regs->fs = 0;
6999 + set_fs(USER_DS);
7000 + regs->ds = __USER_DS;
7001 + regs->es = __USER_DS;
7002 + regs->ss = __USER_DS;
7003 + regs->cs = __USER_CS;
7004 + regs->ip = new_ip;
7005 + regs->sp = new_sp;
7006 + /*
7007 + * Free the old FP and other extended state
7008 + */
7009 + free_thread_xstate(current);
7010 +}
7011 +EXPORT_SYMBOL_GPL(start_thread);
7012 +
7013 static void hard_disable_TSC(void)
7014 {
7015 write_cr4(read_cr4() | X86_CR4_TSD);
7016 }
7017 +
7018 void disable_TSC(void)
7019 {
7020 preempt_disable();
7021 @@ -453,11 +414,47 @@ void disable_TSC(void)
7022 hard_disable_TSC();
7023 preempt_enable();
7024 }
7025 +
7026 static void hard_enable_TSC(void)
7027 {
7028 write_cr4(read_cr4() & ~X86_CR4_TSD);
7029 }
7030 -#endif /* CONFIG_SECCOMP */
7031 +
7032 +static void enable_TSC(void)
7033 +{
7034 + preempt_disable();
7035 + if (test_and_clear_thread_flag(TIF_NOTSC))
7036 + /*
7037 + * Must flip the CPU state synchronously with
7038 + * TIF_NOTSC in the current running context.
7039 + */
7040 + hard_enable_TSC();
7041 + preempt_enable();
7042 +}
7043 +
7044 +int get_tsc_mode(unsigned long adr)
7045 +{
7046 + unsigned int val;
7047 +
7048 + if (test_thread_flag(TIF_NOTSC))
7049 + val = PR_TSC_SIGSEGV;
7050 + else
7051 + val = PR_TSC_ENABLE;
7052 +
7053 + return put_user(val, (unsigned int __user *)adr);
7054 +}
7055 +
7056 +int set_tsc_mode(unsigned int val)
7057 +{
7058 + if (val == PR_TSC_SIGSEGV)
7059 + disable_TSC();
7060 + else if (val == PR_TSC_ENABLE)
7061 + enable_TSC();
7062 + else
7063 + return -EINVAL;
7064 +
7065 + return 0;
7066 +}
7067
7068 static noinline void
7069 __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p)
7070 @@ -473,12 +470,12 @@ __switch_to_xtra(struct task_struct *pre
7071 /* we clear debugctl to make sure DS
7072 * is not in use when we change it */
7073 debugctl = 0;
7074 - wrmsrl(MSR_IA32_DEBUGCTLMSR, 0);
7075 + update_debugctlmsr(0);
7076 wrmsr(MSR_IA32_DS_AREA, next->ds_area_msr, 0);
7077 }
7078
7079 if (next->debugctlmsr != debugctl)
7080 - wrmsr(MSR_IA32_DEBUGCTLMSR, next->debugctlmsr, 0);
7081 + update_debugctlmsr(next->debugctlmsr);
7082
7083 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
7084 set_debugreg(next->debugreg0, 0);
7085 @@ -490,7 +487,6 @@ __switch_to_xtra(struct task_struct *pre
7086 set_debugreg(next->debugreg7, 7);
7087 }
7088
7089 -#ifdef CONFIG_SECCOMP
7090 if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
7091 test_tsk_thread_flag(next_p, TIF_NOTSC)) {
7092 /* prev and next are different */
7093 @@ -499,7 +495,6 @@ __switch_to_xtra(struct task_struct *pre
7094 else
7095 hard_enable_TSC();
7096 }
7097 -#endif
7098
7099 #ifdef X86_BTS
7100 if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
7101 @@ -637,7 +632,7 @@ struct task_struct * __switch_to(struct
7102
7103 /* we're going to use this soon, after a few expensive things */
7104 if (next_p->fpu_counter > 5)
7105 - prefetch(&next->i387.fxsave);
7106 + prefetch(next->xstate);
7107
7108 /*
7109 * Now maybe handle debug registers
7110 @@ -658,8 +653,11 @@ struct task_struct * __switch_to(struct
7111 /* If the task has used fpu the last 5 timeslices, just do a full
7112 * restore of the math state immediately to avoid the trap; the
7113 * chances of needing FPU soon are obviously high now
7114 + *
7115 + * tsk_used_math() checks prevent calling math_state_restore(),
7116 + * which can sleep in the case of !tsk_used_math()
7117 */
7118 - if (next_p->fpu_counter > 5)
7119 + if (tsk_used_math(next_p) && next_p->fpu_counter > 5)
7120 math_state_restore();
7121
7122 /*
7123 --- a/arch/x86/kernel/process_64-xen.c
7124 +++ b/arch/x86/kernel/process_64-xen.c
7125 @@ -39,6 +39,7 @@
7126 #include <linux/kprobes.h>
7127 #include <linux/kdebug.h>
7128 #include <linux/tick.h>
7129 +#include <linux/prctl.h>
7130
7131 #include <asm/uaccess.h>
7132 #include <asm/pgtable.h>
7133 @@ -102,17 +103,6 @@ void exit_idle(void)
7134 __exit_idle();
7135 }
7136
7137 -/*
7138 - * On SMP it's slightly faster (but much more power-consuming!)
7139 - * to poll the ->need_resched flag instead of waiting for the
7140 - * cross-CPU IPI to arrive. Use this option with caution.
7141 - */
7142 -static void poll_idle(void)
7143 -{
7144 - local_irq_enable();
7145 - cpu_relax();
7146 -}
7147 -
7148 static void xen_idle(void)
7149 {
7150 current_thread_info()->status &= ~TS_POLLING;
7151 @@ -121,20 +111,10 @@ static void xen_idle(void)
7152 * test NEED_RESCHED:
7153 */
7154 smp_mb();
7155 - local_irq_disable();
7156 - if (!need_resched()) {
7157 - ktime_t t0, t1;
7158 - u64 t0n, t1n;
7159 -
7160 - t0 = ktime_get();
7161 - t0n = ktime_to_ns(t0);
7162 + if (!need_resched())
7163 safe_halt(); /* enables interrupts racelessly */
7164 - local_irq_disable();
7165 - t1 = ktime_get();
7166 - t1n = ktime_to_ns(t1);
7167 - sched_clock_idle_wakeup_event(t1n - t0n);
7168 - }
7169 - local_irq_enable();
7170 + else
7171 + local_irq_enable();
7172 current_thread_info()->status |= TS_POLLING;
7173 }
7174
7175 @@ -195,45 +175,6 @@ void cpu_idle(void)
7176 }
7177 }
7178
7179 -static void do_nothing(void *unused)
7180 -{
7181 -}
7182 -
7183 -/*
7184 - * cpu_idle_wait - Used to ensure that all the CPUs discard old value of
7185 - * pm_idle and update to new pm_idle value. Required while changing pm_idle
7186 - * handler on SMP systems.
7187 - *
7188 - * Caller must have changed pm_idle to the new value before the call. Old
7189 - * pm_idle value will not be used by any CPU after the return of this function.
7190 - */
7191 -void cpu_idle_wait(void)
7192 -{
7193 - smp_mb();
7194 - /* kick all the CPUs so that they exit out of pm_idle */
7195 - smp_call_function(do_nothing, NULL, 0, 1);
7196 -}
7197 -EXPORT_SYMBOL_GPL(cpu_idle_wait);
7198 -
7199 -void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
7200 -{
7201 -}
7202 -
7203 -static int __init idle_setup(char *str)
7204 -{
7205 - if (!strcmp(str, "poll")) {
7206 - printk("using polling idle threads.\n");
7207 - pm_idle = poll_idle;
7208 - } else if (!strcmp(str, "mwait"))
7209 - force_mwait = 1;
7210 - else
7211 - return -1;
7212 -
7213 - boot_option_idle_override = 1;
7214 - return 0;
7215 -}
7216 -early_param("idle", idle_setup);
7217 -
7218 /* Prints also some state that isn't saved in the pt_regs */
7219 void __show_regs(struct pt_regs * regs)
7220 {
7221 @@ -360,6 +301,7 @@ void flush_thread(void)
7222 /*
7223 * Forget coprocessor state..
7224 */
7225 + tsk->fpu_counter = 0;
7226 clear_fpu(tsk);
7227 clear_used_math();
7228 }
7229 @@ -472,6 +414,83 @@ out:
7230 return err;
7231 }
7232
7233 +void
7234 +start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
7235 +{
7236 + asm volatile("movl %0, %%fs; movl %0, %%es; movl %0, %%ds" :: "r"(0));
7237 + load_gs_index(0);
7238 + regs->ip = new_ip;
7239 + regs->sp = new_sp;
7240 + write_pda(oldrsp, new_sp);
7241 + regs->cs = __USER_CS;
7242 + regs->ss = __USER_DS;
7243 + regs->flags = 0x200;
7244 + set_fs(USER_DS);
7245 + /*
7246 + * Free the old FP and other extended state
7247 + */
7248 + free_thread_xstate(current);
7249 +}
7250 +EXPORT_SYMBOL_GPL(start_thread);
7251 +
7252 +static void hard_disable_TSC(void)
7253 +{
7254 + write_cr4(read_cr4() | X86_CR4_TSD);
7255 +}
7256 +
7257 +void disable_TSC(void)
7258 +{
7259 + preempt_disable();
7260 + if (!test_and_set_thread_flag(TIF_NOTSC))
7261 + /*
7262 + * Must flip the CPU state synchronously with
7263 + * TIF_NOTSC in the current running context.
7264 + */
7265 + hard_disable_TSC();
7266 + preempt_enable();
7267 +}
7268 +
7269 +static void hard_enable_TSC(void)
7270 +{
7271 + write_cr4(read_cr4() & ~X86_CR4_TSD);
7272 +}
7273 +
7274 +static void enable_TSC(void)
7275 +{
7276 + preempt_disable();
7277 + if (test_and_clear_thread_flag(TIF_NOTSC))
7278 + /*
7279 + * Must flip the CPU state synchronously with
7280 + * TIF_NOTSC in the current running context.
7281 + */
7282 + hard_enable_TSC();
7283 + preempt_enable();
7284 +}
7285 +
7286 +int get_tsc_mode(unsigned long adr)
7287 +{
7288 + unsigned int val;
7289 +
7290 + if (test_thread_flag(TIF_NOTSC))
7291 + val = PR_TSC_SIGSEGV;
7292 + else
7293 + val = PR_TSC_ENABLE;
7294 +
7295 + return put_user(val, (unsigned int __user *)adr);
7296 +}
7297 +
7298 +int set_tsc_mode(unsigned int val)
7299 +{
7300 + if (val == PR_TSC_SIGSEGV)
7301 + disable_TSC();
7302 + else if (val == PR_TSC_ENABLE)
7303 + enable_TSC();
7304 + else
7305 + return -EINVAL;
7306 +
7307 + return 0;
7308 +}
7309 +
7310 /*
7311 * This special macro can be used to load a debugging register
7312 */
7313 @@ -491,12 +510,12 @@ static inline void __switch_to_xtra(stru
7314 /* we clear debugctl to make sure DS
7315 * is not in use when we change it */
7316 debugctl = 0;
7317 - wrmsrl(MSR_IA32_DEBUGCTLMSR, 0);
7318 + update_debugctlmsr(0);
7319 wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr);
7320 }
7321
7322 if (next->debugctlmsr != debugctl)
7323 - wrmsrl(MSR_IA32_DEBUGCTLMSR, next->debugctlmsr);
7324 + update_debugctlmsr(next->debugctlmsr);
7325
7326 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
7327 loaddebug(next, 0);
7328 @@ -508,6 +527,15 @@ static inline void __switch_to_xtra(stru
7329 loaddebug(next, 7);
7330 }
7331
7332 + if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
7333 + test_tsk_thread_flag(next_p, TIF_NOTSC)) {
7334 + /* prev and next are different */
7335 + if (test_tsk_thread_flag(next_p, TIF_NOTSC))
7336 + hard_disable_TSC();
7337 + else
7338 + hard_enable_TSC();
7339 + }
7340 +
7341 #ifdef X86_BTS
7342 if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
7343 ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
7344 @@ -547,7 +575,7 @@ __switch_to(struct task_struct *prev_p,
7345
7346 /* we're going to use this soon, after a few expensive things */
7347 if (next_p->fpu_counter>5)
7348 - prefetch(&next->i387.fxsave);
7349 + prefetch(next->xstate);
7350
7351 /*
7352 * This is basically '__unlazy_fpu', except that we queue a
7353 @@ -680,8 +708,11 @@ __switch_to(struct task_struct *prev_p,
7354 /* If the task has used fpu the last 5 timeslices, just do a full
7355 * restore of the math state immediately to avoid the trap; the
7356 * chances of needing FPU soon are obviously high now
7357 + *
7358 + * tsk_used_math() checks prevent calling math_state_restore(),
7359 + * which can sleep in the case of !tsk_used_math()
7360 */
7361 - if (next_p->fpu_counter>5)
7362 + if (tsk_used_math(next_p) && next_p->fpu_counter > 5)
7363 math_state_restore();
7364 return prev_p;
7365 }
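
The process_64-xen.c hunks above add hard_enable_TSC()/hard_disable_TSC(), get_tsc_mode() and set_tsc_mode() behind the new <linux/prctl.h> include; in mainline 2.6.26 these back the PR_GET_TSC/PR_SET_TSC prctl() options, with PR_TSC_SIGSEGV setting CR4.TSD so a later user-mode RDTSC faults. A minimal user-space sketch of that interface follows. It is not part of the patch; it assumes a kernel with this code and headers that expose PR_GET_TSC/PR_SET_TSC/PR_TSC_ENABLE/PR_TSC_SIGSEGV.

/* tsc_mode.c - query and toggle the per-task TSC mode via prctl().
 * Illustrative only; assumes PR_GET_TSC/PR_SET_TSC are available. */
#include <stdio.h>
#include <sys/prctl.h>
#include <linux/prctl.h>

int main(void)
{
        int mode = 0;

        if (prctl(PR_GET_TSC, &mode) != 0) {    /* get_tsc_mode() above */
                perror("PR_GET_TSC");
                return 1;
        }
        printf("current TSC mode: %s\n",
               mode == PR_TSC_ENABLE ? "PR_TSC_ENABLE" : "PR_TSC_SIGSEGV");

        /* Forbid RDTSC for this task: disable_TSC() above sets CR4.TSD
         * immediately, so a later RDTSC from user mode raises SIGSEGV. */
        if (prctl(PR_SET_TSC, PR_TSC_SIGSEGV) != 0)
                perror("PR_SET_TSC");

        /* Re-enable before exiting so the demo stays harmless. */
        prctl(PR_SET_TSC, PR_TSC_ENABLE);
        return 0;
}

Built with e.g. "gcc tsc_mode.c -o tsc_mode", this only exercises the prctl plumbing; it deliberately never executes RDTSC while the SIGSEGV mode is active.
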
7366 --- /dev/null
7367 +++ b/arch/x86/kernel/process-xen.c
7368 @@ -0,0 +1,188 @@
7369 +#include <linux/errno.h>
7370 +#include <linux/kernel.h>
7371 +#include <linux/mm.h>
7372 +#include <linux/smp.h>
7373 +#include <linux/slab.h>
7374 +#include <linux/sched.h>
7375 +#include <linux/module.h>
7376 +#include <linux/pm.h>
7377 +
7378 +struct kmem_cache *task_xstate_cachep;
7379 +
7380 +int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
7381 +{
7382 + *dst = *src;
7383 + if (src->thread.xstate) {
7384 + dst->thread.xstate = kmem_cache_alloc(task_xstate_cachep,
7385 + GFP_KERNEL);
7386 + if (!dst->thread.xstate)
7387 + return -ENOMEM;
7388 + WARN_ON((unsigned long)dst->thread.xstate & 15);
7389 + memcpy(dst->thread.xstate, src->thread.xstate, xstate_size);
7390 + }
7391 + return 0;
7392 +}
7393 +
7394 +void free_thread_xstate(struct task_struct *tsk)
7395 +{
7396 + if (tsk->thread.xstate) {
7397 + kmem_cache_free(task_xstate_cachep, tsk->thread.xstate);
7398 + tsk->thread.xstate = NULL;
7399 + }
7400 +}
7401 +
7402 +void free_thread_info(struct thread_info *ti)
7403 +{
7404 + free_thread_xstate(ti->task);
7405 + free_pages((unsigned long)ti, get_order(THREAD_SIZE));
7406 +}
7407 +
7408 +void arch_task_cache_init(void)
7409 +{
7410 + task_xstate_cachep =
7411 + kmem_cache_create("task_xstate", xstate_size,
7412 + __alignof__(union thread_xstate),
7413 + SLAB_PANIC, NULL);
7414 +}
7415 +
7416 +static void do_nothing(void *unused)
7417 +{
7418 +}
7419 +
7420 +/*
7421 + * cpu_idle_wait - Used to ensure that all the CPUs discard old value of
7422 + * pm_idle and update to new pm_idle value. Required while changing pm_idle
7423 + * handler on SMP systems.
7424 + *
7425 + * Caller must have changed pm_idle to the new value before the call. Old
7426 + * pm_idle value will not be used by any CPU after the return of this function.
7427 + */
7428 +void cpu_idle_wait(void)
7429 +{
7430 + smp_mb();
7431 + /* kick all the CPUs so that they exit out of pm_idle */
7432 + smp_call_function(do_nothing, NULL, 0, 1);
7433 +}
7434 +EXPORT_SYMBOL_GPL(cpu_idle_wait);
7435 +
7436 +#ifndef CONFIG_XEN
7437 +/*
7438 + * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
7439 + * which can obviate IPI to trigger checking of need_resched.
7440 + * We execute MONITOR against need_resched and enter optimized wait state
7441 + * through MWAIT. Whenever someone changes need_resched, we would be woken
7442 + * up from MWAIT (without an IPI).
7443 + *
7444 + * New with Core Duo processors, MWAIT can take some hints based on CPU
7445 + * capability.
7446 + */
7447 +void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
7448 +{
7449 + if (!need_resched()) {
7450 + __monitor((void *)&current_thread_info()->flags, 0, 0);
7451 + smp_mb();
7452 + if (!need_resched())
7453 + __mwait(ax, cx);
7454 + }
7455 +}
7456 +
7457 +/* Default MONITOR/MWAIT with no hints, used for default C1 state */
7458 +static void mwait_idle(void)
7459 +{
7460 + if (!need_resched()) {
7461 + __monitor((void *)&current_thread_info()->flags, 0, 0);
7462 + smp_mb();
7463 + if (!need_resched())
7464 + __sti_mwait(0, 0);
7465 + else
7466 + local_irq_enable();
7467 + } else
7468 + local_irq_enable();
7469 +}
7470 +#endif
7471 +
7472 +/*
7473 + * On SMP it's slightly faster (but much more power-consuming!)
7474 + * to poll the ->work.need_resched flag instead of waiting for the
7475 + * cross-CPU IPI to arrive. Use this option with caution.
7476 + */
7477 +static void poll_idle(void)
7478 +{
7479 + local_irq_enable();
7480 + cpu_relax();
7481 +}
7482 +
7483 +#ifndef CONFIG_XEN
7484 +/*
7485 + * mwait selection logic:
7486 + *
7487 + * It depends on the CPU. For AMD CPUs that support MWAIT this is
7488 + * wrong. Family 0x10 and 0x11 CPUs will enter C1 on HLT. Powersavings
7489 + * then depend on a clock divisor and current Pstate of the core. If
7490 + * all cores of a processor are in halt state (C1) the processor can
7491 + * enter the C1E (C1 enhanced) state. If mwait is used this will never
7492 + * happen.
7493 + *
7494 + * idle=mwait overrides this decision and forces the usage of mwait.
7495 + */
7496 +static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
7497 +{
7498 + if (force_mwait)
7499 + return 1;
7500 +
7501 + if (c->x86_vendor == X86_VENDOR_AMD) {
7502 + switch(c->x86) {
7503 + case 0x10:
7504 + case 0x11:
7505 + return 0;
7506 + }
7507 + }
7508 + return 1;
7509 +}
7510 +#endif
7511 +
7512 +void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
7513 +{
7514 +#ifndef CONFIG_XEN
7515 + static int selected;
7516 +
7517 + if (selected)
7518 + return;
7519 +#ifdef CONFIG_X86_SMP
7520 + if (pm_idle == poll_idle && smp_num_siblings > 1) {
7521 + printk(KERN_WARNING "WARNING: polling idle and HT enabled,"
7522 + " performance may degrade.\n");
7523 + }
7524 +#endif
7525 + if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) {
7526 + /*
7527 + * Skip, if setup has overridden idle.
7528 + * One CPU supports mwait => All CPUs supports mwait
7529 + */
7530 + if (!pm_idle) {
7531 + printk(KERN_INFO "using mwait in idle threads.\n");
7532 + pm_idle = mwait_idle;
7533 + }
7534 + }
7535 + selected = 1;
7536 +#endif
7537 +}
7538 +
7539 +static int __init idle_setup(char *str)
7540 +{
7541 + if (!strcmp(str, "poll")) {
7542 + printk("using polling idle threads.\n");
7543 + pm_idle = poll_idle;
7544 + }
7545 +#ifndef CONFIG_XEN
7546 + else if (!strcmp(str, "mwait"))
7547 + force_mwait = 1;
7548 +#endif
7549 + else
7550 + return -1;
7551 +
7552 + boot_option_idle_override = 1;
7553 + return 0;
7554 +}
7555 +early_param("idle", idle_setup);
7556 +
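
The new process-xen.c above consolidates the idle-routine plumbing (cpu_idle_wait(), poll_idle(), idle_setup(), and the non-Xen MWAIT path) that process_64-xen.c carried before. The selection order is: "idle=poll" wins outright, otherwise MWAIT is installed only when the CPU has it, mwait_usable() allows it (AMD families 0x10/0x11 are excluded unless "idle=mwait" forces it), and no earlier setup has overridden pm_idle. A standalone restatement of that decision with a few sample inputs follows; pick_idle() and its parameters are hypothetical names for illustration, not kernel code.

/* idle_pick.c - the selection order of idle_setup()/select_idle_routine()
 * above, restated for illustration (hypothetical helper, non-Xen path). */
#include <stdio.h>
#include <string.h>

static const char *pick_idle(const char *cmdline_idle, int cpu_has_mwait,
                             int amd_family /* 0 when vendor is not AMD */)
{
        int force_mwait = cmdline_idle && !strcmp(cmdline_idle, "mwait");

        if (cmdline_idle && !strcmp(cmdline_idle, "poll"))
                return "poll_idle";             /* boot option wins outright */

        /* mwait_usable(): AMD family 0x10/0x11 prefer HLT (C1E) unless forced */
        if (cpu_has_mwait &&
            (force_mwait || (amd_family != 0x10 && amd_family != 0x11)))
                return "mwait_idle";

        return "default halt-based idle";
}

int main(void)
{
        printf("%s\n", pick_idle(NULL, 1, 0x10));    /* default halt-based idle */
        printf("%s\n", pick_idle("mwait", 1, 0x10)); /* mwait_idle (forced) */
        printf("%s\n", pick_idle("poll", 1, 0x0f));  /* poll_idle */
        return 0;
}
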
7557 --- a/arch/x86/kernel/setup_32-xen.c
7558 +++ b/arch/x86/kernel/setup_32-xen.c
7559 @@ -39,6 +39,7 @@
7560 #include <linux/efi.h>
7561 #include <linux/init.h>
7562 #include <linux/edd.h>
7563 +#include <linux/iscsi_ibft.h>
7564 #include <linux/nodemask.h>
7565 #include <linux/kernel.h>
7566 #include <linux/percpu.h>
7567 @@ -49,6 +50,7 @@
7568 #include <linux/pfn.h>
7569 #include <linux/pci.h>
7570 #include <linux/init_ohci1394_dma.h>
7571 +#include <linux/kvm_para.h>
7572
7573 #include <video/edid.h>
7574
7575 @@ -70,8 +72,9 @@
7576 #include <xen/firmware.h>
7577 #include <xen/xencons.h>
7578 #include <setup_arch.h>
7579 -#include <bios_ebda.h>
7580 +#include <asm/bios_ebda.h>
7581 #include <asm/cacheflush.h>
7582 +#include <asm/processor.h>
7583
7584 #ifdef CONFIG_XEN
7585 #include <xen/interface/kexec.h>
7586 @@ -136,7 +139,12 @@ static struct resource standard_io_resou
7587 }, {
7588 .name = "keyboard",
7589 .start = 0x0060,
7590 - .end = 0x006f,
7591 + .end = 0x0060,
7592 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
7593 +}, {
7594 + .name = "keyboard",
7595 + .start = 0x0064,
7596 + .end = 0x0064,
7597 .flags = IORESOURCE_BUSY | IORESOURCE_IO
7598 }, {
7599 .name = "dma page reg",
7600 @@ -166,6 +174,8 @@ struct cpuinfo_x86 new_cpu_data __cpuini
7601 struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
7602 EXPORT_SYMBOL(boot_cpu_data);
7603
7604 +unsigned int def_to_bigsmp;
7605 +
7606 #ifndef CONFIG_X86_PAE
7607 unsigned long mmu_cr4_features;
7608 #else
7609 @@ -204,7 +214,7 @@ EXPORT_SYMBOL(ist_info);
7610 extern void early_cpu_init(void);
7611 extern int root_mountflags;
7612
7613 -unsigned long saved_videomode;
7614 +unsigned long saved_video_mode;
7615
7616 #define RAMDISK_IMAGE_START_MASK 0x07FF
7617 #define RAMDISK_PROMPT_FLAG 0x8000
7618 @@ -259,7 +269,7 @@ static inline void copy_edd(void)
7619 }
7620 #endif
7621
7622 -int __initdata user_defined_memmap = 0;
7623 +int __initdata user_defined_memmap;
7624
7625 /*
7626 * "mem=nopentium" disables the 4MB page tables.
7627 @@ -420,20 +430,59 @@ unsigned long __init find_max_low_pfn(vo
7628 }
7629
7630 #ifndef CONFIG_XEN
7631 +#define BIOS_LOWMEM_KILOBYTES 0x413
7632 +
7633 /*
7634 - * workaround for Dell systems that neglect to reserve EBDA
7635 + * The BIOS places the EBDA/XBDA at the top of conventional
7636 + * memory, and usually decreases the reported amount of
7637 + * conventional memory (int 0x12) too. This also contains a
7638 + * workaround for Dell systems that neglect to reserve EBDA.
7639 + * The same workaround also avoids a problem with the AMD768MPX
7640 + * chipset: reserve a page before VGA to prevent PCI prefetch
7641 + * into it (errata #56). Usually the page is reserved anyways,
7642 + * unless you have no PS/2 mouse plugged in.
7643 */
7644 static void __init reserve_ebda_region(void)
7645 {
7646 - unsigned int addr;
7647 - addr = get_bios_ebda();
7648 - if (addr)
7649 - reserve_bootmem(addr, PAGE_SIZE, BOOTMEM_DEFAULT);
7650 + unsigned int lowmem, ebda_addr;
7651 +
7652 + /* To determine the position of the EBDA and the */
7653 + /* end of conventional memory, we need to look at */
7654 + /* the BIOS data area. In a paravirtual environment */
7655 + /* that area is absent. We'll just have to assume */
7656 + /* that the paravirt case can handle memory setup */
7657 + /* correctly, without our help. */
7658 + if (paravirt_enabled())
7659 + return;
7660 +
7661 + /* end of low (conventional) memory */
7662 + lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
7663 + lowmem <<= 10;
7664 +
7665 + /* start of EBDA area */
7666 + ebda_addr = get_bios_ebda();
7667 +
7668 + /* Fixup: bios puts an EBDA in the top 64K segment */
7669 + /* of conventional memory, but does not adjust lowmem. */
7670 + if ((lowmem - ebda_addr) <= 0x10000)
7671 + lowmem = ebda_addr;
7672 +
7673 + /* Fixup: bios does not report an EBDA at all. */
7674 + /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
7675 + if ((ebda_addr == 0) && (lowmem >= 0x9f000))
7676 + lowmem = 0x9f000;
7677 +
7678 + /* Paranoia: should never happen, but... */
7679 + if ((lowmem == 0) || (lowmem >= 0x100000))
7680 + lowmem = 0x9f000;
7681 +
7682 + /* reserve all memory between lowmem and the 1MB mark */
7683 + reserve_bootmem(lowmem, 0x100000 - lowmem, BOOTMEM_DEFAULT);
7684 }
7685 #endif
7686
7687 #ifndef CONFIG_NEED_MULTIPLE_NODES
7688 -void __init setup_bootmem_allocator(void);
7689 +static void __init setup_bootmem_allocator(void);
7690 static unsigned long __init setup_memory(void)
7691 {
7692 /*
7693 @@ -469,7 +518,7 @@ static unsigned long __init setup_memory
7694 return max_low_pfn;
7695 }
7696
7697 -void __init zone_sizes_init(void)
7698 +static void __init zone_sizes_init(void)
7699 {
7700 unsigned long max_zone_pfns[MAX_NR_ZONES];
7701 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
7702 @@ -521,10 +570,16 @@ static void __init reserve_crashkernel(v
7703 (unsigned long)(crash_size >> 20),
7704 (unsigned long)(crash_base >> 20),
7705 (unsigned long)(total_mem >> 20));
7706 +
7707 + if (reserve_bootmem(crash_base, crash_size,
7708 + BOOTMEM_EXCLUSIVE) < 0) {
7709 + printk(KERN_INFO "crashkernel reservation "
7710 + "failed - memory is in use\n");
7711 + return;
7712 + }
7713 +
7714 crashk_res.start = crash_base;
7715 crashk_res.end = crash_base + crash_size - 1;
7716 - reserve_bootmem(crash_base, crash_size,
7717 - BOOTMEM_DEFAULT);
7718 } else
7719 printk(KERN_INFO "crashkernel reservation failed - "
7720 "you have to specify a base address\n");
7721 @@ -658,16 +713,9 @@ void __init setup_bootmem_allocator(void
7722 */
7723 reserve_bootmem(0, PAGE_SIZE, BOOTMEM_DEFAULT);
7724
7725 - /* reserve EBDA region, it's a 4K region */
7726 + /* reserve EBDA region */
7727 reserve_ebda_region();
7728
7729 - /* could be an AMD 768MPX chipset. Reserve a page before VGA to prevent
7730 - PCI prefetch into it (errata #56). Usually the page is reserved anyways,
7731 - unless you have no PS/2 mouse plugged in. */
7732 - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
7733 - boot_cpu_data.x86 == 6)
7734 - reserve_bootmem(0xa0000 - 4096, 4096, BOOTMEM_DEFAULT);
7735 -
7736 #ifdef CONFIG_SMP
7737 /*
7738 * But first pinch a few for the stack/trampoline stuff
7739 @@ -689,6 +737,8 @@ void __init setup_bootmem_allocator(void
7740 #endif
7741 numa_kva_reserve();
7742 reserve_crashkernel();
7743 +
7744 + reserve_ibft_region();
7745 }
7746
7747 /*
7748 @@ -724,6 +774,18 @@ char * __init __attribute__((weak)) memo
7749 return machine_specific_memory_setup();
7750 }
7751
7752 +#ifdef CONFIG_NUMA
7753 +/*
7754 + * In the golden day, when everything among i386 and x86_64 will be
7755 + * integrated, this will not live here
7756 + */
7757 +void *x86_cpu_to_node_map_early_ptr;
7758 +int x86_cpu_to_node_map_init[NR_CPUS] = {
7759 + [0 ... NR_CPUS-1] = NUMA_NO_NODE
7760 +};
7761 +DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE;
7762 +#endif
7763 +
7764 /*
7765 * Determine if we were loaded by an EFI loader. If so, then we have also been
7766 * passed the efi memmap, systab, etc., so we should use these data structures
7767 @@ -773,7 +835,7 @@ void __init setup_arch(char **cmdline_p)
7768 copy_edid();
7769 apm_info.bios = boot_params.apm_bios_info;
7770 ist_info = boot_params.ist_info;
7771 - saved_videomode = boot_params.hdr.vid_mode;
7772 + saved_video_mode = boot_params.hdr.vid_mode;
7773 if( boot_params.sys_desc_table.length != 0 ) {
7774 set_mca_bus(boot_params.sys_desc_table.table[3] & 0x2);
7775 machine_id = boot_params.sys_desc_table.table[0];
7776 @@ -840,15 +902,19 @@ void __init setup_arch(char **cmdline_p)
7777 efi_init();
7778
7779 /* update e820 for memory not covered by WB MTRRs */
7780 - find_max_pfn();
7781 + propagate_e820_map();
7782 mtrr_bp_init();
7783 #ifndef CONFIG_XEN
7784 if (mtrr_trim_uncached_memory(max_pfn))
7785 - find_max_pfn();
7786 + propagate_e820_map();
7787 #endif
7788
7789 max_low_pfn = setup_memory();
7790
7791 +#ifdef CONFIG_KVM_CLOCK
7792 + kvmclock_init();
7793 +#endif
7794 +
7795 #ifdef CONFIG_VMI
7796 /*
7797 * Must be after max_low_pfn is determined, and before kernel
7798 @@ -856,6 +922,7 @@ void __init setup_arch(char **cmdline_p)
7799 */
7800 vmi_init();
7801 #endif
7802 + kvm_guest_init();
7803
7804 /*
7805 * NOTE: before this point _nobody_ is allowed to allocate
7806 @@ -977,6 +1044,18 @@ void __init setup_arch(char **cmdline_p)
7807
7808 io_delay_init();
7809
7810 +#if defined(CONFIG_X86_SMP) && !defined(CONFIG_XEN)
7811 + /*
7812 + * setup to use the early static init tables during kernel startup
7813 + * X86_SMP will exclude sub-arches that don't deal well with it.
7814 + */
7815 + x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init;
7816 + x86_bios_cpu_apicid_early_ptr = (void *)x86_bios_cpu_apicid_init;
7817 +#ifdef CONFIG_NUMA
7818 + x86_cpu_to_node_map_early_ptr = (void *)x86_cpu_to_node_map_init;
7819 +#endif
7820 +#endif
7821 +
7822 #ifdef CONFIG_X86_GENERICARCH
7823 generic_apic_probe();
7824 #endif
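
The rewritten reserve_ebda_region() in the setup_32-xen.c hunks above derives the reservation range from two BIOS values: the conventional-memory size in KiB at BIOS_LOWMEM_KILOBYTES (0x413) and the EBDA pointer from get_bios_ebda(), then clamps both before reserving everything up to the 1MB mark. The clamping rules restated as a pure function, with a worked example using hypothetical firmware values (not taken from the patch):

/* ebda_clamp.c - the lowmem/EBDA clamping from reserve_ebda_region() above,
 * restated standalone for illustration. */
#include <stdio.h>

/* Returns the start of the range reserved up to the 1MB mark. */
static unsigned int ebda_reserve_start(unsigned int lowmem_kb,
                                       unsigned int ebda_addr)
{
        unsigned int lowmem = lowmem_kb << 10;  /* BIOS reports KiB */

        /* EBDA sits in the top 64K of conventional memory but the BIOS
         * did not shrink the reported lowmem accordingly. */
        if (lowmem - ebda_addr <= 0x10000)
                lowmem = ebda_addr;

        /* No EBDA reported at all: some old Dells still want 4K reserved. */
        if (ebda_addr == 0 && lowmem >= 0x9f000)
                lowmem = 0x9f000;

        /* Paranoia: nonsense values fall back to 636K. */
        if (lowmem == 0 || lowmem >= 0x100000)
                lowmem = 0x9f000;

        return lowmem;
}

int main(void)
{
        /* e.g. 639 KiB reported, EBDA at 0x9fc00 -> reserve 0x9fc00..0xfffff */
        printf("reserve from 0x%x to 0xfffff\n",
               ebda_reserve_start(639, 0x9fc00));
        return 0;
}
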
7825 --- a/arch/x86/kernel/setup_64-xen.c
7826 +++ b/arch/x86/kernel/setup_64-xen.c
7827 @@ -29,18 +29,22 @@
7828 #include <linux/crash_dump.h>
7829 #include <linux/root_dev.h>
7830 #include <linux/pci.h>
7831 +#include <asm/pci-direct.h>
7832 #include <linux/efi.h>
7833 #include <linux/acpi.h>
7834 #include <linux/kallsyms.h>
7835 #include <linux/edd.h>
7836 +#include <linux/iscsi_ibft.h>
7837 #include <linux/mmzone.h>
7838 #include <linux/kexec.h>
7839 #include <linux/cpufreq.h>
7840 #include <linux/dmi.h>
7841 #include <linux/dma-mapping.h>
7842 #include <linux/ctype.h>
7843 +#include <linux/sort.h>
7844 #include <linux/uaccess.h>
7845 #include <linux/init_ohci1394_dma.h>
7846 +#include <linux/kvm_para.h>
7847
7848 #include <asm/mtrr.h>
7849 #include <asm/uaccess.h>
7850 @@ -58,7 +62,6 @@
7851 #include <asm/mmu_context.h>
7852 #include <asm/proto.h>
7853 #include <asm/setup.h>
7854 -#include <asm/mach_apic.h>
7855 #include <asm/numa.h>
7856 #include <asm/sections.h>
7857 #include <asm/dmi.h>
7858 @@ -66,6 +69,9 @@
7859 #include <asm/mce.h>
7860 #include <asm/ds.h>
7861 #include <asm/topology.h>
7862 +#include <asm/pat.h>
7863 +
7864 +#include <mach_apic.h>
7865 #ifdef CONFIG_XEN
7866 #include <linux/percpu.h>
7867 #include <xen/interface/physdev.h>
7868 @@ -149,7 +155,7 @@ extern int root_mountflags;
7869
7870 char __initdata command_line[COMMAND_LINE_SIZE];
7871
7872 -struct resource standard_io_resources[] = {
7873 +static struct resource standard_io_resources[] = {
7874 { .name = "dma1", .start = 0x00, .end = 0x1f,
7875 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
7876 { .name = "pic1", .start = 0x20, .end = 0x21,
7877 @@ -158,7 +164,9 @@ struct resource standard_io_resources[]
7878 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
7879 { .name = "timer1", .start = 0x50, .end = 0x53,
7880 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
7881 - { .name = "keyboard", .start = 0x60, .end = 0x6f,
7882 + { .name = "keyboard", .start = 0x60, .end = 0x60,
7883 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
7884 + { .name = "keyboard", .start = 0x64, .end = 0x64,
7885 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
7886 { .name = "dma page reg", .start = 0x80, .end = 0x8f,
7887 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
7888 @@ -224,8 +232,10 @@ contig_initmem_init(unsigned long start_
7889 e820_register_active_regions(0, start_pfn, end_pfn);
7890 #ifdef CONFIG_XEN
7891 free_bootmem_with_active_regions(0, xen_start_info->nr_pages);
7892 + early_res_to_bootmem(0, xen_start_info->nr_pages<<PAGE_SHIFT);
7893 #else
7894 free_bootmem_with_active_regions(0, end_pfn);
7895 + early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
7896 #endif
7897 reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
7898 }
7899 @@ -290,6 +300,7 @@ static void __init reserve_crashkernel(v
7900 (unsigned long)(total_mem >> 20));
7901 crashk_res.start = crash_base;
7902 crashk_res.end = crash_base + crash_size - 1;
7903 + insert_resource(&iomem_resource, &crashk_res);
7904 }
7905 }
7906 #else
7907 @@ -306,6 +317,40 @@ void __attribute__((weak)) __init memory
7908 machine_specific_memory_setup();
7909 }
7910
7911 +static void __init parse_setup_data(void)
7912 +{
7913 + struct setup_data *data;
7914 + unsigned long pa_data;
7915 +
7916 + if (boot_params.hdr.version < 0x0209)
7917 + return;
7918 + pa_data = boot_params.hdr.setup_data;
7919 + while (pa_data) {
7920 + data = early_ioremap(pa_data, PAGE_SIZE);
7921 + switch (data->type) {
7922 + default:
7923 + break;
7924 + }
7925 +#ifndef CONFIG_DEBUG_BOOT_PARAMS
7926 + free_early(pa_data, pa_data+sizeof(*data)+data->len);
7927 +#endif
7928 + pa_data = data->next;
7929 + early_iounmap(data, PAGE_SIZE);
7930 + }
7931 +}
7932 +
7933 +#ifdef CONFIG_PCI_MMCONFIG
7934 +extern void __cpuinit fam10h_check_enable_mmcfg(void);
7935 +extern void __init check_enable_amd_mmconf_dmi(void);
7936 +#else
7937 +void __cpuinit fam10h_check_enable_mmcfg(void)
7938 +{
7939 +}
7940 +void __init check_enable_amd_mmconf_dmi(void)
7941 +{
7942 +}
7943 +#endif
7944 +
7945 /*
7946 * setup_arch - architecture-specific boot-time initializations
7947 *
7948 @@ -389,6 +434,8 @@ void __init setup_arch(char **cmdline_p)
7949 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
7950 *cmdline_p = command_line;
7951
7952 + parse_setup_data();
7953 +
7954 parse_early_param();
7955
7956 #ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
7957 @@ -398,6 +445,13 @@ void __init setup_arch(char **cmdline_p)
7958
7959 finish_e820_parsing();
7960
7961 +#ifndef CONFIG_XEN
7962 + /* after parse_early_param, so could debug it */
7963 + insert_resource(&iomem_resource, &code_resource);
7964 + insert_resource(&iomem_resource, &data_resource);
7965 + insert_resource(&iomem_resource, &bss_resource);
7966 +#endif
7967 +
7968 early_gart_iommu_check();
7969
7970 e820_register_active_regions(0, 0, -1UL);
7971 @@ -420,15 +474,23 @@ void __init setup_arch(char **cmdline_p)
7972
7973 check_efer();
7974
7975 - init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT));
7976 + max_pfn_mapped = init_memory_mapping(0, (max_pfn_mapped << PAGE_SHIFT));
7977 if (efi_enabled)
7978 efi_init();
7979
7980 +#ifndef CONFIG_XEN
7981 + vsmp_init();
7982 +#endif
7983 +
7984 if (is_initial_xendomain())
7985 dmi_scan_machine();
7986
7987 io_delay_init();
7988
7989 +#ifdef CONFIG_KVM_CLOCK
7990 + kvmclock_init();
7991 +#endif
7992 +
7993 #if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
7994 /* setup to use the early static init tables during kernel startup */
7995 x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init;
7996 @@ -459,9 +521,9 @@ void __init setup_arch(char **cmdline_p)
7997 contig_initmem_init(0, end_pfn);
7998 #endif
7999
8000 - early_res_to_bootmem();
8001 -
8002 #ifndef CONFIG_XEN
8003 + dma32_reserve_bootmem();
8004 +
8005 #ifdef CONFIG_ACPI_SLEEP
8006 /*
8007 * Reserve low memory region for sleep support.
8008 @@ -487,16 +549,17 @@ void __init setup_arch(char **cmdline_p)
8009 unsigned long end_of_mem = end_pfn << PAGE_SHIFT;
8010
8011 if (ramdisk_end <= end_of_mem) {
8012 -#ifndef CONFIG_XEN
8013 - reserve_bootmem_generic(ramdisk_image, ramdisk_size);
8014 -#endif
8015 + /*
8016 + * don't need to reserve again, already reserved early
8017 + * in x86_64_start_kernel, and early_res_to_bootmem
8018 + * convert that to reserved in bootmem
8019 + */
8020 initrd_start = ramdisk_image + PAGE_OFFSET;
8021 initrd_end = initrd_start+ramdisk_size;
8022 #ifdef CONFIG_XEN
8023 initrd_below_start_ok = 1;
8024 #endif
8025 } else {
8026 - /* Assumes everything on node 0 */
8027 free_bootmem(ramdisk_image, ramdisk_size);
8028 printk(KERN_ERR "initrd extends beyond end of memory "
8029 "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
8030 @@ -506,6 +569,9 @@ void __init setup_arch(char **cmdline_p)
8031 }
8032 #endif
8033 reserve_crashkernel();
8034 +
8035 + reserve_ibft_region();
8036 +
8037 paging_init();
8038 map_vsyscall();
8039 #ifdef CONFIG_X86_LOCAL_APIC
8040 @@ -633,16 +699,16 @@ void __init setup_arch(char **cmdline_p)
8041 prefill_possible_map();
8042 #endif
8043
8044 + kvm_guest_init();
8045 +
8046 /*
8047 * We trust e820 completely. No explicit ROM probing in memory.
8048 */
8049 #ifdef CONFIG_XEN
8050 if (is_initial_xendomain())
8051 - e820_reserve_resources(machine_e820.map, machine_e820.nr_map,
8052 - &code_resource, &data_resource, &bss_resource);
8053 + e820_reserve_resources(machine_e820.map, machine_e820.nr_map);
8054 #else
8055 - e820_reserve_resources(e820.map, e820.nr_map,
8056 - &code_resource, &data_resource, &bss_resource);
8057 + e820_reserve_resources(e820.map, e820.nr_map);
8058 e820_mark_nosave_regions();
8059 #endif
8060
8061 @@ -690,6 +756,9 @@ void __init setup_arch(char **cmdline_p)
8062 #endif
8063
8064 #endif /* !CONFIG_XEN */
8065 +
8066 + /* do this before identify_cpu for boot cpu */
8067 + check_enable_amd_mmconf_dmi();
8068 }
8069
8070 #ifdef CONFIG_XEN
8071 @@ -786,9 +855,9 @@ static void __cpuinit amd_detect_cmp(str
8072 bits = c->x86_coreid_bits;
8073
8074 /* Low order bits define the core id (index of core in socket) */
8075 - c->cpu_core_id = c->phys_proc_id & ((1 << bits)-1);
8076 - /* Convert the APIC ID into the socket ID */
8077 - c->phys_proc_id = phys_pkg_id(bits);
8078 + c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
8079 + /* Convert the initial APIC ID into the socket ID */
8080 + c->phys_proc_id = c->initial_apicid >> bits;
8081
8082 #ifdef CONFIG_NUMA
8083 node = c->phys_proc_id;
8084 @@ -805,7 +874,7 @@ static void __cpuinit amd_detect_cmp(str
8085 If that doesn't result in a usable node fall back to the
8086 path for the previous case. */
8087
8088 - int ht_nodeid = apicid - (cpu_data(0).phys_proc_id << bits);
8089 + int ht_nodeid = c->initial_apicid;
8090
8091 if (ht_nodeid >= 0 &&
8092 apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
8093 @@ -913,7 +982,7 @@ static void __cpuinit init_amd(struct cp
8094
8095 /* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
8096 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
8097 - clear_bit(0*32+31, (unsigned long *)&c->x86_capability);
8098 + clear_cpu_cap(c, 0*32+31);
8099
8100 /* On C+ stepping K8 rep microcode works well for copy/memset */
8101 level = cpuid_eax(1);
8102 @@ -955,9 +1024,25 @@ static void __cpuinit init_amd(struct cp
8103 /* MFENCE stops RDTSC speculation */
8104 set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
8105
8106 + if (c->x86 == 0x10)
8107 + fam10h_check_enable_mmcfg();
8108 +
8109 #ifndef CONFIG_XEN
8110 if (amd_apic_timer_broken())
8111 disable_apic_timer = 1;
8112 +
8113 + if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
8114 + unsigned long long tseg;
8115 +
8116 + /*
8117 + * Split up direct mapping around the TSEG SMM area.
8118 + * Don't do it for gbpages because there seems very little
8119 + * benefit in doing so.
8120 + */
8121 + if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg) &&
8122 + (tseg >> PMD_SHIFT) < (max_pfn_mapped >> (PMD_SHIFT-PAGE_SHIFT)))
8123 + set_memory_4k((unsigned long)__va(tseg), 1);
8124 + }
8125 #endif
8126 }
8127
8128 @@ -1051,7 +1136,7 @@ static void __cpuinit early_init_intel(s
8129 {
8130 if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
8131 (c->x86 == 0x6 && c->x86_model >= 0x0e))
8132 - set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
8133 + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
8134 }
8135
8136 static void __cpuinit init_intel(struct cpuinfo_x86 *c)
8137 @@ -1094,9 +1179,6 @@ static void __cpuinit init_intel(struct
8138
8139 if (c->x86 == 15)
8140 c->x86_cache_alignment = c->x86_clflush_size * 2;
8141 - if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
8142 - (c->x86 == 0x6 && c->x86_model >= 0x0e))
8143 - set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
8144 if (c->x86 == 6)
8145 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
8146 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
8147 @@ -1105,6 +1187,32 @@ static void __cpuinit init_intel(struct
8148 srat_detect_node();
8149 }
8150
8151 +static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
8152 +{
8153 + if (c->x86 == 0x6 && c->x86_model >= 0xf)
8154 + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
8155 +}
8156 +
8157 +static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
8158 +{
8159 + /* Cache sizes */
8160 + unsigned n;
8161 +
8162 + n = c->extended_cpuid_level;
8163 + if (n >= 0x80000008) {
8164 + unsigned eax = cpuid_eax(0x80000008);
8165 + c->x86_virt_bits = (eax >> 8) & 0xff;
8166 + c->x86_phys_bits = eax & 0xff;
8167 + }
8168 +
8169 + if (c->x86 == 0x6 && c->x86_model >= 0xf) {
8170 + c->x86_cache_alignment = c->x86_clflush_size * 2;
8171 + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
8172 + set_cpu_cap(c, X86_FEATURE_REP_GOOD);
8173 + }
8174 + set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
8175 +}
8176 +
8177 static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
8178 {
8179 char *v = c->x86_vendor_id;
8180 @@ -1113,6 +1221,8 @@ static void __cpuinit get_cpu_vendor(str
8181 c->x86_vendor = X86_VENDOR_AMD;
8182 else if (!strcmp(v, "GenuineIntel"))
8183 c->x86_vendor = X86_VENDOR_INTEL;
8184 + else if (!strcmp(v, "CentaurHauls"))
8185 + c->x86_vendor = X86_VENDOR_CENTAUR;
8186 else
8187 c->x86_vendor = X86_VENDOR_UNKNOWN;
8188 }
8189 @@ -1160,15 +1270,16 @@ static void __cpuinit early_identify_cpu
8190 c->x86 += (tfms >> 20) & 0xff;
8191 if (c->x86 >= 0x6)
8192 c->x86_model += ((tfms >> 16) & 0xF) << 4;
8193 - if (c->x86_capability[0] & (1<<19))
8194 + if (test_cpu_cap(c, X86_FEATURE_CLFLSH))
8195 c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
8196 } else {
8197 /* Have CPUID level 0 only - unheard of */
8198 c->x86 = 4;
8199 }
8200
8201 + c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xff;
8202 #ifdef CONFIG_SMP
8203 - c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff;
8204 + c->phys_proc_id = c->initial_apicid;
8205 #endif
8206 /* AMD-defined flags: level 0x80000001 */
8207 xlvl = cpuid_eax(0x80000000);
8208 @@ -1201,8 +1312,12 @@ static void __cpuinit early_identify_cpu
8209 case X86_VENDOR_INTEL:
8210 early_init_intel(c);
8211 break;
8212 + case X86_VENDOR_CENTAUR:
8213 + early_init_centaur(c);
8214 + break;
8215 }
8216
8217 + validate_pat_support(c);
8218 }
8219
8220 /*
8221 @@ -1237,6 +1352,10 @@ void __cpuinit identify_cpu(struct cpuin
8222 init_intel(c);
8223 break;
8224
8225 + case X86_VENDOR_CENTAUR:
8226 + init_centaur(c);
8227 + break;
8228 +
8229 case X86_VENDOR_UNKNOWN:
8230 default:
8231 display_cacheinfo(c);
8232 @@ -1266,14 +1385,24 @@ void __cpuinit identify_cpu(struct cpuin
8233 #endif
8234 select_idle_routine(c);
8235
8236 - if (c != &boot_cpu_data)
8237 - mtrr_ap_init();
8238 #ifdef CONFIG_NUMA
8239 numa_add_cpu(smp_processor_id());
8240 #endif
8241
8242 }
8243
8244 +void __cpuinit identify_boot_cpu(void)
8245 +{
8246 + identify_cpu(&boot_cpu_data);
8247 +}
8248 +
8249 +void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
8250 +{
8251 + BUG_ON(c == &boot_cpu_data);
8252 + identify_cpu(c);
8253 + mtrr_ap_init();
8254 +}
8255 +
8256 static __init int setup_noclflush(char *arg)
8257 {
8258 setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
8259 @@ -1302,123 +1431,3 @@ static __init int setup_disablecpuid(cha
8260 return 1;
8261 }
8262 __setup("clearcpuid=", setup_disablecpuid);
8263 -
8264 -/*
8265 - * Get CPU information for use by the procfs.
8266 - */
8267 -
8268 -static int show_cpuinfo(struct seq_file *m, void *v)
8269 -{
8270 - struct cpuinfo_x86 *c = v;
8271 - int cpu = 0, i;
8272 -
8273 -#ifdef CONFIG_SMP
8274 - cpu = c->cpu_index;
8275 -#endif
8276 -
8277 - seq_printf(m, "processor\t: %u\n"
8278 - "vendor_id\t: %s\n"
8279 - "cpu family\t: %d\n"
8280 - "model\t\t: %d\n"
8281 - "model name\t: %s\n",
8282 - (unsigned)cpu,
8283 - c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown",
8284 - c->x86,
8285 - (int)c->x86_model,
8286 - c->x86_model_id[0] ? c->x86_model_id : "unknown");
8287 -
8288 - if (c->x86_mask || c->cpuid_level >= 0)
8289 - seq_printf(m, "stepping\t: %d\n", c->x86_mask);
8290 - else
8291 - seq_printf(m, "stepping\t: unknown\n");
8292 -
8293 - if (cpu_has(c, X86_FEATURE_TSC)) {
8294 - unsigned int freq = cpufreq_quick_get((unsigned)cpu);
8295 -
8296 - if (!freq)
8297 - freq = cpu_khz;
8298 - seq_printf(m, "cpu MHz\t\t: %u.%03u\n",
8299 - freq / 1000, (freq % 1000));
8300 - }
8301 -
8302 - /* Cache size */
8303 - if (c->x86_cache_size >= 0)
8304 - seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size);
8305 -
8306 -#ifdef CONFIG_SMP
8307 - if (smp_num_siblings * c->x86_max_cores > 1) {
8308 - seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
8309 - seq_printf(m, "siblings\t: %d\n",
8310 - cpus_weight(per_cpu(cpu_core_map, cpu)));
8311 - seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
8312 - seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
8313 - }
8314 -#endif
8315 -
8316 - seq_printf(m,
8317 - "fpu\t\t: yes\n"
8318 - "fpu_exception\t: yes\n"
8319 - "cpuid level\t: %d\n"
8320 - "wp\t\t: yes\n"
8321 - "flags\t\t:",
8322 - c->cpuid_level);
8323 -
8324 - for (i = 0; i < 32*NCAPINTS; i++)
8325 - if (cpu_has(c, i) && x86_cap_flags[i] != NULL)
8326 - seq_printf(m, " %s", x86_cap_flags[i]);
8327 -
8328 - seq_printf(m, "\nbogomips\t: %lu.%02lu\n",
8329 - c->loops_per_jiffy/(500000/HZ),
8330 - (c->loops_per_jiffy/(5000/HZ)) % 100);
8331 -
8332 - if (c->x86_tlbsize > 0)
8333 - seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize);
8334 - seq_printf(m, "clflush size\t: %d\n", c->x86_clflush_size);
8335 - seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment);
8336 -
8337 - seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n",
8338 - c->x86_phys_bits, c->x86_virt_bits);
8339 -
8340 - seq_printf(m, "power management:");
8341 - for (i = 0; i < 32; i++) {
8342 - if (c->x86_power & (1 << i)) {
8343 - if (i < ARRAY_SIZE(x86_power_flags) &&
8344 - x86_power_flags[i])
8345 - seq_printf(m, "%s%s",
8346 - x86_power_flags[i][0]?" ":"",
8347 - x86_power_flags[i]);
8348 - else
8349 - seq_printf(m, " [%d]", i);
8350 - }
8351 - }
8352 -
8353 - seq_printf(m, "\n\n");
8354 -
8355 - return 0;
8356 -}
8357 -
8358 -static void *c_start(struct seq_file *m, loff_t *pos)
8359 -{
8360 - if (*pos == 0) /* just in case, cpu 0 is not the first */
8361 - *pos = first_cpu(cpu_online_map);
8362 - if ((*pos) < NR_CPUS && cpu_online(*pos))
8363 - return &cpu_data(*pos);
8364 - return NULL;
8365 -}
8366 -
8367 -static void *c_next(struct seq_file *m, void *v, loff_t *pos)
8368 -{
8369 - *pos = next_cpu(*pos, cpu_online_map);
8370 - return c_start(m, pos);
8371 -}
8372 -
8373 -static void c_stop(struct seq_file *m, void *v)
8374 -{
8375 -}
8376 -
8377 -const struct seq_operations cpuinfo_op = {
8378 - .start = c_start,
8379 - .next = c_next,
8380 - .stop = c_stop,
8381 - .show = show_cpuinfo,
8382 -};
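
Among the setup_64-xen.c changes above, amd_detect_cmp() now derives topology directly from the initial APIC ID: the low x86_coreid_bits select the core within a socket and the remaining high bits give the socket (phys_proc_id). A worked standalone example of that split, using hypothetical APIC IDs rather than values from the patch:

/* apicid_split.c - the core/socket derivation used by amd_detect_cmp() above,
 * restated with example inputs (illustrative only). */
#include <stdio.h>

int main(void)
{
        unsigned int coreid_bits = 2;           /* e.g. up to 4 cores per socket */
        unsigned int apicids[] = { 0x0, 0x1, 0x4, 0x6 };
        unsigned int i;

        for (i = 0; i < sizeof(apicids) / sizeof(apicids[0]); i++) {
                unsigned int apicid = apicids[i];
                unsigned int core = apicid & ((1u << coreid_bits) - 1);
                unsigned int socket = apicid >> coreid_bits;

                printf("APIC ID 0x%x -> core %u, socket %u\n",
                       apicid, core, socket);
        }
        return 0;       /* 0x6 -> core 2, socket 1; 0x4 -> core 0, socket 1 */
}
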
8383 --- a/arch/x86/kernel/setup64-xen.c
8384 +++ b/arch/x86/kernel/setup64-xen.c
8385 @@ -15,6 +15,7 @@
8386 #include <linux/bootmem.h>
8387 #include <linux/bitops.h>
8388 #include <linux/module.h>
8389 +#include <linux/kgdb.h>
8390 #include <asm/pda.h>
8391 #include <asm/pgtable.h>
8392 #include <asm/processor.h>
8393 @@ -27,6 +28,7 @@
8394 #include <asm/proto.h>
8395 #include <asm/sections.h>
8396 #include <asm/setup.h>
8397 +#include <asm/genapic.h>
8398 #ifdef CONFIG_XEN
8399 #include <asm/hypervisor.h>
8400 #endif
8401 @@ -81,8 +83,8 @@ int force_personality32 = 0;
8402 Control non executable heap for 32bit processes.
8403 To control the stack too use noexec=off
8404
8405 -on PROT_READ does not imply PROT_EXEC for 32bit processes
8406 -off PROT_READ implies PROT_EXEC (default)
8407 +on PROT_READ does not imply PROT_EXEC for 32bit processes (default)
8408 +off PROT_READ implies PROT_EXEC
8409 */
8410 static int __init nonx32_setup(char *str)
8411 {
8412 @@ -94,85 +96,6 @@ static int __init nonx32_setup(char *str
8413 }
8414 __setup("noexec32=", nonx32_setup);
8415
8416 -/*
8417 - * Copy data used in early init routines from the initial arrays to the
8418 - * per cpu data areas. These arrays then become expendable and the
8419 - * *_early_ptr's are zeroed indicating that the static arrays are gone.
8420 - */
8421 -static void __init setup_per_cpu_maps(void)
8422 -{
8423 -#ifndef CONFIG_XEN
8424 - int cpu;
8425 -
8426 - for_each_possible_cpu(cpu) {
8427 -#ifdef CONFIG_SMP
8428 - if (per_cpu_offset(cpu)) {
8429 -#endif
8430 - per_cpu(x86_cpu_to_apicid, cpu) =
8431 - x86_cpu_to_apicid_init[cpu];
8432 - per_cpu(x86_bios_cpu_apicid, cpu) =
8433 - x86_bios_cpu_apicid_init[cpu];
8434 -#ifdef CONFIG_NUMA
8435 - per_cpu(x86_cpu_to_node_map, cpu) =
8436 - x86_cpu_to_node_map_init[cpu];
8437 -#endif
8438 -#ifdef CONFIG_SMP
8439 - }
8440 - else
8441 - printk(KERN_NOTICE "per_cpu_offset zero for cpu %d\n",
8442 - cpu);
8443 -#endif
8444 - }
8445 -
8446 - /* indicate the early static arrays will soon be gone */
8447 - x86_cpu_to_apicid_early_ptr = NULL;
8448 - x86_bios_cpu_apicid_early_ptr = NULL;
8449 -#ifdef CONFIG_NUMA
8450 - x86_cpu_to_node_map_early_ptr = NULL;
8451 -#endif
8452 -#endif
8453 -}
8454 -
8455 -/*
8456 - * Great future plan:
8457 - * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
8458 - * Always point %gs to its beginning
8459 - */
8460 -void __init setup_per_cpu_areas(void)
8461 -{
8462 - int i;
8463 - unsigned long size;
8464 -
8465 -#ifdef CONFIG_HOTPLUG_CPU
8466 - prefill_possible_map();
8467 -#endif
8468 -
8469 - /* Copy section for each CPU (we discard the original) */
8470 - size = PERCPU_ENOUGH_ROOM;
8471 -
8472 - printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n", size);
8473 - for_each_cpu_mask (i, cpu_possible_map) {
8474 - char *ptr;
8475 -#ifndef CONFIG_NEED_MULTIPLE_NODES
8476 - ptr = alloc_bootmem_pages(size);
8477 -#else
8478 - int node = early_cpu_to_node(i);
8479 -
8480 - if (!node_online(node) || !NODE_DATA(node))
8481 - ptr = alloc_bootmem_pages(size);
8482 - else
8483 - ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
8484 -#endif
8485 - if (!ptr)
8486 - panic("Cannot allocate cpu data for CPU %d\n", i);
8487 - cpu_pda(i)->data_offset = ptr - __per_cpu_start;
8488 - memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
8489 - }
8490 -
8491 - /* setup percpu data maps early */
8492 - setup_per_cpu_maps();
8493 -}
8494 -
8495 #ifdef CONFIG_XEN
8496 static void __init_refok switch_pt(int cpu)
8497 {
8498 @@ -410,6 +333,17 @@ void __cpuinit cpu_init (void)
8499 #endif
8500 load_LDT(&init_mm.context);
8501
8502 +#ifdef CONFIG_KGDB
8503 + /*
8504 + * If the kgdb is connected no debug regs should be altered. This
8505 + * is only applicable when KGDB and a KGDB I/O module are built
8506 + * into the kernel and you are using early debugging with
8507 + * kgdbwait. KGDB will control the kernel HW breakpoint registers.
8508 + */
8509 + if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
8510 + arch_kgdb_ops.correct_hw_break();
8511 + else {
8512 +#endif
8513 /*
8514 * Clear all 6 debug registers:
8515 */
8516 @@ -420,10 +354,17 @@ void __cpuinit cpu_init (void)
8517 set_debugreg(0UL, 3);
8518 set_debugreg(0UL, 6);
8519 set_debugreg(0UL, 7);
8520 +#ifdef CONFIG_KGDB
8521 + /* If the kgdb is connected no debug regs should be altered. */
8522 + }
8523 +#endif
8524
8525 fpu_init();
8526
8527 asm ("pushfq; popq %0" : "=rm" (kernel_eflags));
8528 if (raw_irqs_disabled())
8529 kernel_eflags &= ~X86_EFLAGS_IF;
8530 +
8531 + if (is_uv_system())
8532 + uv_cpu_init();
8533 }
8534 --- /dev/null
8535 +++ b/arch/x86/kernel/setup-xen.c
8536 @@ -0,0 +1,141 @@
8537 +#include <linux/kernel.h>
8538 +#include <linux/module.h>
8539 +#include <linux/init.h>
8540 +#include <linux/bootmem.h>
8541 +#include <linux/percpu.h>
8542 +#include <asm/smp.h>
8543 +#include <asm/percpu.h>
8544 +#include <asm/sections.h>
8545 +#include <asm/processor.h>
8546 +#include <asm/setup.h>
8547 +#include <asm/topology.h>
8548 +#include <asm/mpspec.h>
8549 +#include <asm/apicdef.h>
8550 +
8551 +#ifdef CONFIG_X86_LOCAL_APIC
8552 +unsigned int num_processors;
8553 +unsigned disabled_cpus __cpuinitdata;
8554 +/* Processor that is doing the boot up */
8555 +unsigned int boot_cpu_physical_apicid = -1U;
8556 +EXPORT_SYMBOL(boot_cpu_physical_apicid);
8557 +
8558 +DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID;
8559 +EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid);
8560 +
8561 +/* Bitmask of physically existing CPUs */
8562 +physid_mask_t phys_cpu_present_map;
8563 +#endif
8564 +
8565 +#if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_X86_SMP)
8566 +/*
8567 + * Copy data used in early init routines from the initial arrays to the
8568 + * per cpu data areas. These arrays then become expendable and the
8569 + * *_early_ptr's are zeroed indicating that the static arrays are gone.
8570 + */
8571 +static void __init setup_per_cpu_maps(void)
8572 +{
8573 +#ifndef CONFIG_XEN
8574 + int cpu;
8575 +
8576 + for_each_possible_cpu(cpu) {
8577 + per_cpu(x86_cpu_to_apicid, cpu) = x86_cpu_to_apicid_init[cpu];
8578 + per_cpu(x86_bios_cpu_apicid, cpu) =
8579 + x86_bios_cpu_apicid_init[cpu];
8580 +#ifdef CONFIG_NUMA
8581 + per_cpu(x86_cpu_to_node_map, cpu) =
8582 + x86_cpu_to_node_map_init[cpu];
8583 +#endif
8584 + }
8585 +
8586 + /* indicate the early static arrays will soon be gone */
8587 + x86_cpu_to_apicid_early_ptr = NULL;
8588 + x86_bios_cpu_apicid_early_ptr = NULL;
8589 +#ifdef CONFIG_NUMA
8590 + x86_cpu_to_node_map_early_ptr = NULL;
8591 +#endif
8592 +#endif
8593 +}
8594 +
8595 +#ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP
8596 +cpumask_t *cpumask_of_cpu_map __read_mostly;
8597 +EXPORT_SYMBOL(cpumask_of_cpu_map);
8598 +
8599 +/* requires nr_cpu_ids to be initialized */
8600 +static void __init setup_cpumask_of_cpu(void)
8601 +{
8602 + int i;
8603 +
8604 + /* alloc_bootmem zeroes memory */
8605 + cpumask_of_cpu_map = alloc_bootmem_low(sizeof(cpumask_t) * nr_cpu_ids);
8606 + for (i = 0; i < nr_cpu_ids; i++)
8607 + cpu_set(i, cpumask_of_cpu_map[i]);
8608 +}
8609 +#else
8610 +static inline void setup_cpumask_of_cpu(void) { }
8611 +#endif
8612 +
8613 +#ifdef CONFIG_X86_32
8614 +/*
8615 + * Great future not-so-futuristic plan: make i386 and x86_64 do it
8616 + * the same way
8617 + */
8618 +unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
8619 +EXPORT_SYMBOL(__per_cpu_offset);
8620 +#endif
8621 +
8622 +/*
8623 + * Great future plan:
8624 + * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
8625 + * Always point %gs to its beginning
8626 + */
8627 +void __init setup_per_cpu_areas(void)
8628 +{
8629 + int i, highest_cpu = 0;
8630 + unsigned long size;
8631 +
8632 +#ifdef CONFIG_HOTPLUG_CPU
8633 + prefill_possible_map();
8634 +#endif
8635 +
8636 + /* Copy section for each CPU (we discard the original) */
8637 + size = PERCPU_ENOUGH_ROOM;
8638 + printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n",
8639 + size);
8640 +
8641 + for_each_possible_cpu(i) {
8642 + char *ptr;
8643 +#ifndef CONFIG_NEED_MULTIPLE_NODES
8644 + ptr = alloc_bootmem_pages(size);
8645 +#else
8646 + int node = early_cpu_to_node(i);
8647 + if (!node_online(node) || !NODE_DATA(node)) {
8648 + ptr = alloc_bootmem_pages(size);
8649 + printk(KERN_INFO
8650 + "cpu %d has no node or node-local memory\n", i);
8651 + }
8652 + else
8653 + ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
8654 +#endif
8655 + if (!ptr)
8656 + panic("Cannot allocate cpu data for CPU %d\n", i);
8657 +#ifdef CONFIG_X86_64
8658 + cpu_pda(i)->data_offset = ptr - __per_cpu_start;
8659 +#else
8660 + __per_cpu_offset[i] = ptr - __per_cpu_start;
8661 +#endif
8662 + memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
8663 +
8664 + highest_cpu = i;
8665 + }
8666 +
8667 + nr_cpu_ids = highest_cpu + 1;
8668 + printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d\n", NR_CPUS, nr_cpu_ids);
8669 +
8670 + /* Setup percpu data maps */
8671 + setup_per_cpu_maps();
8672 +
8673 + /* Setup cpumask_of_cpu map */
8674 + setup_cpumask_of_cpu();
8675 +}
8676 +
8677 +#endif
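
The new setup-xen.c above copies the per-CPU template section once per possible CPU and records each copy's distance from __per_cpu_start (data_offset on x86_64, __per_cpu_offset[] on 32-bit); a per-CPU access then resolves to the template address of the variable plus that CPU's offset. A miniature standalone sketch of the same offset trick follows; counter_template, per_cpu_counter() and NCPUS are hypothetical names for illustration, and the pointer arithmetic mirrors the kernel's mechanism rather than strictly portable C.

/* percpu_offset.c - the "copy the template, remember the offset" idea behind
 * setup_per_cpu_areas() above, in miniature (illustrative only). */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define NCPUS 4

/* Template instance of a per-CPU variable (stands in for .data.percpu). */
static long counter_template;

static long per_cpu_offset[NCPUS];

/* per_cpu(counter, cpu): template address plus that CPU's recorded offset. */
static long *per_cpu_counter(int cpu)
{
        return (long *)((char *)&counter_template + per_cpu_offset[cpu]);
}

int main(void)
{
        int cpu;

        /* "setup_per_cpu_areas": copy the template once per CPU and record
         * how far each copy sits from the original. */
        for (cpu = 0; cpu < NCPUS; cpu++) {
                char *copy = malloc(sizeof(counter_template));

                memcpy(copy, &counter_template, sizeof(counter_template));
                per_cpu_offset[cpu] = copy - (char *)&counter_template;
        }

        for (cpu = 0; cpu < NCPUS; cpu++)
                *per_cpu_counter(cpu) = cpu * 10;

        for (cpu = 0; cpu < NCPUS; cpu++)
                printf("cpu %d counter = %ld\n", cpu, *per_cpu_counter(cpu));
        return 0;
}
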
8678 --- a/arch/x86/kernel/smp_32-xen.c
8679 +++ /dev/null
8680 @@ -1,647 +0,0 @@
8681 -/*
8682 - * Intel SMP support routines.
8683 - *
8684 - * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
8685 - * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
8686 - *
8687 - * This code is released under the GNU General Public License version 2 or
8688 - * later.
8689 - */
8690 -
8691 -#include <linux/init.h>
8692 -
8693 -#include <linux/mm.h>
8694 -#include <linux/delay.h>
8695 -#include <linux/spinlock.h>
8696 -#include <linux/kernel_stat.h>
8697 -#include <linux/mc146818rtc.h>
8698 -#include <linux/cache.h>
8699 -#include <linux/interrupt.h>
8700 -#include <linux/cpu.h>
8701 -#include <linux/module.h>
8702 -
8703 -#include <asm/mtrr.h>
8704 -#include <asm/tlbflush.h>
8705 -#include <asm/mmu_context.h>
8706 -#if 0
8707 -#include <mach_apic.h>
8708 -#endif
8709 -#include <xen/evtchn.h>
8710 -
8711 -/*
8712 - * Some notes on x86 processor bugs affecting SMP operation:
8713 - *
8714 - * Pentium, Pentium Pro, II, III (and all CPUs) have bugs.
8715 - * The Linux implications for SMP are handled as follows:
8716 - *
8717 - * Pentium III / [Xeon]
8718 - * None of the E1AP-E3AP errata are visible to the user.
8719 - *
8720 - * E1AP. see PII A1AP
8721 - * E2AP. see PII A2AP
8722 - * E3AP. see PII A3AP
8723 - *
8724 - * Pentium II / [Xeon]
8725 - * None of the A1AP-A3AP errata are visible to the user.
8726 - *
8727 - * A1AP. see PPro 1AP
8728 - * A2AP. see PPro 2AP
8729 - * A3AP. see PPro 7AP
8730 - *
8731 - * Pentium Pro
8732 - * None of 1AP-9AP errata are visible to the normal user,
8733 - * except occasional delivery of 'spurious interrupt' as trap #15.
8734 - * This is very rare and a non-problem.
8735 - *
8736 - * 1AP. Linux maps APIC as non-cacheable
8737 - * 2AP. worked around in hardware
8738 - * 3AP. fixed in C0 and above steppings microcode update.
8739 - * Linux does not use excessive STARTUP_IPIs.
8740 - * 4AP. worked around in hardware
8741 - * 5AP. symmetric IO mode (normal Linux operation) not affected.
8742 - * 'noapic' mode has vector 0xf filled out properly.
8743 - * 6AP. 'noapic' mode might be affected - fixed in later steppings
8744 - * 7AP. We do not assume writes to the LVT deassering IRQs
8745 - * 8AP. We do not enable low power mode (deep sleep) during MP bootup
8746 - * 9AP. We do not use mixed mode
8747 - *
8748 - * Pentium
8749 - * There is a marginal case where REP MOVS on 100MHz SMP
8750 - * machines with B stepping processors can fail. XXX should provide
8751 - * an L1cache=Writethrough or L1cache=off option.
8752 - *
8753 - * B stepping CPUs may hang. There are hardware work arounds
8754 - * for this. We warn about it in case your board doesn't have the work
8755 - * arounds. Basically that's so I can tell anyone with a B stepping
8756 - * CPU and SMP problems "tough".
8757 - *
8758 - * Specific items [From Pentium Processor Specification Update]
8759 - *
8760 - * 1AP. Linux doesn't use remote read
8761 - * 2AP. Linux doesn't trust APIC errors
8762 - * 3AP. We work around this
8763 - * 4AP. Linux never generated 3 interrupts of the same priority
8764 - * to cause a lost local interrupt.
8765 - * 5AP. Remote read is never used
8766 - * 6AP. not affected - worked around in hardware
8767 - * 7AP. not affected - worked around in hardware
8768 - * 8AP. worked around in hardware - we get explicit CS errors if not
8769 - * 9AP. only 'noapic' mode affected. Might generate spurious
8770 - * interrupts, we log only the first one and count the
8771 - * rest silently.
8772 - * 10AP. not affected - worked around in hardware
8773 - * 11AP. Linux reads the APIC between writes to avoid this, as per
8774 - * the documentation. Make sure you preserve this as it affects
8775 - * the C stepping chips too.
8776 - * 12AP. not affected - worked around in hardware
8777 - * 13AP. not affected - worked around in hardware
8778 - * 14AP. we always deassert INIT during bootup
8779 - * 15AP. not affected - worked around in hardware
8780 - * 16AP. not affected - worked around in hardware
8781 - * 17AP. not affected - worked around in hardware
8782 - * 18AP. not affected - worked around in hardware
8783 - * 19AP. not affected - worked around in BIOS
8784 - *
8785 - * If this sounds worrying believe me these bugs are either ___RARE___,
8786 - * or are signal timing bugs worked around in hardware and there's
8787 - * about nothing of note with C stepping upwards.
8788 - */
8789 -
8790 -DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate) ____cacheline_aligned = { &init_mm, 0, };
8791 -
8792 -/*
8793 - * the following functions deal with sending IPIs between CPUs.
8794 - *
8795 - * We use 'broadcast', CPU->CPU IPIs and self-IPIs too.
8796 - */
8797 -
8798 -static inline int __prepare_ICR (unsigned int shortcut, int vector)
8799 -{
8800 - unsigned int icr = shortcut | APIC_DEST_LOGICAL;
8801 -
8802 - switch (vector) {
8803 - default:
8804 - icr |= APIC_DM_FIXED | vector;
8805 - break;
8806 - case NMI_VECTOR:
8807 - icr |= APIC_DM_NMI;
8808 - break;
8809 - }
8810 - return icr;
8811 -}
8812 -
8813 -static inline int __prepare_ICR2 (unsigned int mask)
8814 -{
8815 - return SET_APIC_DEST_FIELD(mask);
8816 -}
8817 -
8818 -DECLARE_PER_CPU(int, ipi_to_irq[NR_IPIS]);
8819 -
8820 -static inline void __send_IPI_one(unsigned int cpu, int vector)
8821 -{
8822 - int irq = per_cpu(ipi_to_irq, cpu)[vector];
8823 - BUG_ON(irq < 0);
8824 - notify_remote_via_irq(irq);
8825 -}
8826 -
8827 -void __send_IPI_shortcut(unsigned int shortcut, int vector)
8828 -{
8829 - int cpu;
8830 -
8831 - switch (shortcut) {
8832 - case APIC_DEST_SELF:
8833 - __send_IPI_one(smp_processor_id(), vector);
8834 - break;
8835 - case APIC_DEST_ALLBUT:
8836 - for (cpu = 0; cpu < NR_CPUS; ++cpu) {
8837 - if (cpu == smp_processor_id())
8838 - continue;
8839 - if (cpu_isset(cpu, cpu_online_map)) {
8840 - __send_IPI_one(cpu, vector);
8841 - }
8842 - }
8843 - break;
8844 - default:
8845 - printk("XXXXXX __send_IPI_shortcut %08x vector %d\n", shortcut,
8846 - vector);
8847 - break;
8848 - }
8849 -}
8850 -
8851 -void send_IPI_self(int vector)
8852 -{
8853 - __send_IPI_shortcut(APIC_DEST_SELF, vector);
8854 -}
8855 -
8856 -/*
8857 - * This is only used on smaller machines.
8858 - */
8859 -void send_IPI_mask_bitmask(cpumask_t mask, int vector)
8860 -{
8861 - unsigned long flags;
8862 - unsigned int cpu;
8863 -
8864 - local_irq_save(flags);
8865 - WARN_ON(cpus_addr(mask)[0] & ~cpus_addr(cpu_online_map)[0]);
8866 -
8867 - for (cpu = 0; cpu < NR_CPUS; ++cpu) {
8868 - if (cpu_isset(cpu, mask)) {
8869 - __send_IPI_one(cpu, vector);
8870 - }
8871 - }
8872 -
8873 - local_irq_restore(flags);
8874 -}
8875 -
8876 -void send_IPI_mask_sequence(cpumask_t mask, int vector)
8877 -{
8878 -
8879 - send_IPI_mask_bitmask(mask, vector);
8880 -}
8881 -
8882 -#include <mach_ipi.h> /* must come after the send_IPI functions above for inlining */
8883 -
8884 -#if 0 /* XEN */
8885 -/*
8886 - * Smarter SMP flushing macros.
8887 - * c/o Linus Torvalds.
8888 - *
8889 - * These mean you can really definitely utterly forget about
8890 - * writing to user space from interrupts. (Its not allowed anyway).
8891 - *
8892 - * Optimizations Manfred Spraul <manfred@colorfullife.com>
8893 - */
8894 -
8895 -static cpumask_t flush_cpumask;
8896 -static struct mm_struct * flush_mm;
8897 -static unsigned long flush_va;
8898 -static DEFINE_SPINLOCK(tlbstate_lock);
8899 -
8900 -/*
8901 - * We cannot call mmdrop() because we are in interrupt context,
8902 - * instead update mm->cpu_vm_mask.
8903 - *
8904 - * We need to reload %cr3 since the page tables may be going
8905 - * away from under us..
8906 - */
8907 -void leave_mm(int cpu)
8908 -{
8909 - if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
8910 - BUG();
8911 - cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask);
8912 - load_cr3(swapper_pg_dir);
8913 -}
8914 -EXPORT_SYMBOL_GPL(leave_mm);
8915 -
8916 -/*
8917 - *
8918 - * The flush IPI assumes that a thread switch happens in this order:
8919 - * [cpu0: the cpu that switches]
8920 - * 1) switch_mm() either 1a) or 1b)
8921 - * 1a) thread switch to a different mm
8922 - * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
8923 - * Stop ipi delivery for the old mm. This is not synchronized with
8924 - * the other cpus, but smp_invalidate_interrupt ignore flush ipis
8925 - * for the wrong mm, and in the worst case we perform a superfluous
8926 - * tlb flush.
8927 - * 1a2) set cpu_tlbstate to TLBSTATE_OK
8928 - * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
8929 - * was in lazy tlb mode.
8930 - * 1a3) update cpu_tlbstate[].active_mm
8931 - * Now cpu0 accepts tlb flushes for the new mm.
8932 - * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
8933 - * Now the other cpus will send tlb flush ipis.
8934 - * 1a4) change cr3.
8935 - * 1b) thread switch without mm change
8936 - * cpu_tlbstate[].active_mm is correct, cpu0 already handles
8937 - * flush ipis.
8938 - * 1b1) set cpu_tlbstate to TLBSTATE_OK
8939 - * 1b2) test_and_set the cpu bit in cpu_vm_mask.
8940 - * Atomically set the bit [other cpus will start sending flush ipis],
8941 - * and test the bit.
8942 - * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
8943 - * 2) switch %%esp, ie current
8944 - *
8945 - * The interrupt must handle 2 special cases:
8946 - * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
8947 - * - the cpu performs speculative tlb reads, i.e. even if the cpu only
8948 - * runs in kernel space, the cpu could load tlb entries for user space
8949 - * pages.
8950 - *
8951 - * The good news is that cpu_tlbstate is local to each cpu, no
8952 - * write/read ordering problems.
8953 - */
8954 -
8955 -/*
8956 - * TLB flush IPI:
8957 - *
8958 - * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
8959 - * 2) Leave the mm if we are in the lazy tlb mode.
8960 - */
8961 -
8962 -irqreturn_t smp_invalidate_interrupt(int irq, void *dev_id)
8963 -{
8964 - unsigned long cpu;
8965 -
8966 - cpu = get_cpu();
8967 -
8968 - if (!cpu_isset(cpu, flush_cpumask))
8969 - goto out;
8970 - /*
8971 - * This was a BUG() but until someone can quote me the
8972 - * line from the intel manual that guarantees an IPI to
8973 - * multiple CPUs is retried _only_ on the erroring CPUs
8974 - * its staying as a return
8975 - *
8976 - * BUG();
8977 - */
8978 -
8979 - if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) {
8980 - if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) {
8981 - if (flush_va == TLB_FLUSH_ALL)
8982 - local_flush_tlb();
8983 - else
8984 - __flush_tlb_one(flush_va);
8985 - } else
8986 - leave_mm(cpu);
8987 - }
8988 - smp_mb__before_clear_bit();
8989 - cpu_clear(cpu, flush_cpumask);
8990 - smp_mb__after_clear_bit();
8991 -out:
8992 - put_cpu_no_resched();
8993 - __get_cpu_var(irq_stat).irq_tlb_count++;
8994 -
8995 - return IRQ_HANDLED;
8996 -}
8997 -
8998 -void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
8999 - unsigned long va)
9000 -{
9001 - cpumask_t cpumask = *cpumaskp;
9002 -
9003 - /*
9004 - * A couple of (to be removed) sanity checks:
9005 - *
9006 - * - current CPU must not be in mask
9007 - * - mask must exist :)
9008 - */
9009 - BUG_ON(cpus_empty(cpumask));
9010 - BUG_ON(cpu_isset(smp_processor_id(), cpumask));
9011 - BUG_ON(!mm);
9012 -
9013 -#ifdef CONFIG_HOTPLUG_CPU
9014 - /* If a CPU which we ran on has gone down, OK. */
9015 - cpus_and(cpumask, cpumask, cpu_online_map);
9016 - if (unlikely(cpus_empty(cpumask)))
9017 - return;
9018 -#endif
9019 -
9020 - /*
9021 - * i'm not happy about this global shared spinlock in the
9022 - * MM hot path, but we'll see how contended it is.
9023 - * AK: x86-64 has a faster method that could be ported.
9024 - */
9025 - spin_lock(&tlbstate_lock);
9026 -
9027 - flush_mm = mm;
9028 - flush_va = va;
9029 - cpus_or(flush_cpumask, cpumask, flush_cpumask);
9030 - /*
9031 - * We have to send the IPI only to
9032 - * CPUs affected.
9033 - */
9034 - send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR);
9035 -
9036 - while (!cpus_empty(flush_cpumask))
9037 - /* nothing. lockup detection does not belong here */
9038 - cpu_relax();
9039 -
9040 - flush_mm = NULL;
9041 - flush_va = 0;
9042 - spin_unlock(&tlbstate_lock);
9043 -}
9044 -
9045 -void flush_tlb_current_task(void)
9046 -{
9047 - struct mm_struct *mm = current->mm;
9048 - cpumask_t cpu_mask;
9049 -
9050 - preempt_disable();
9051 - cpu_mask = mm->cpu_vm_mask;
9052 - cpu_clear(smp_processor_id(), cpu_mask);
9053 -
9054 - local_flush_tlb();
9055 - if (!cpus_empty(cpu_mask))
9056 - flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
9057 - preempt_enable();
9058 -}
9059 -
9060 -void flush_tlb_mm (struct mm_struct * mm)
9061 -{
9062 - cpumask_t cpu_mask;
9063 -
9064 - preempt_disable();
9065 - cpu_mask = mm->cpu_vm_mask;
9066 - cpu_clear(smp_processor_id(), cpu_mask);
9067 -
9068 - if (current->active_mm == mm) {
9069 - if (current->mm)
9070 - local_flush_tlb();
9071 - else
9072 - leave_mm(smp_processor_id());
9073 - }
9074 - if (!cpus_empty(cpu_mask))
9075 - flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
9076 -
9077 - preempt_enable();
9078 -}
9079 -
9080 -void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
9081 -{
9082 - struct mm_struct *mm = vma->vm_mm;
9083 - cpumask_t cpu_mask;
9084 -
9085 - preempt_disable();
9086 - cpu_mask = mm->cpu_vm_mask;
9087 - cpu_clear(smp_processor_id(), cpu_mask);
9088 -
9089 - if (current->active_mm == mm) {
9090 - if(current->mm)
9091 - __flush_tlb_one(va);
9092 - else
9093 - leave_mm(smp_processor_id());
9094 - }
9095 -
9096 - if (!cpus_empty(cpu_mask))
9097 - flush_tlb_others(cpu_mask, mm, va);
9098 -
9099 - preempt_enable();
9100 -}
9101 -EXPORT_SYMBOL(flush_tlb_page);
9102 -
9103 -static void do_flush_tlb_all(void* info)
9104 -{
9105 - unsigned long cpu = smp_processor_id();
9106 -
9107 - __flush_tlb_all();
9108 - if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_LAZY)
9109 - leave_mm(cpu);
9110 -}
9111 -
9112 -void flush_tlb_all(void)
9113 -{
9114 - on_each_cpu(do_flush_tlb_all, NULL, 1, 1);
9115 -}
9116 -
9117 -#endif /* XEN */
9118 -
9119 -/*
9120 - * this function sends a 'reschedule' IPI to another CPU.
9121 - * it goes straight through and wastes no time serializing
9122 - * anything. Worst case is that we lose a reschedule ...
9123 - */
9124 -void xen_smp_send_reschedule(int cpu)
9125 -{
9126 - WARN_ON(cpu_is_offline(cpu));
9127 - send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
9128 -}
9129 -
9130 -/*
9131 - * Structure and data for smp_call_function(). This is designed to minimise
9132 - * static memory requirements. It also looks cleaner.
9133 - */
9134 -static DEFINE_SPINLOCK(call_lock);
9135 -
9136 -struct call_data_struct {
9137 - void (*func) (void *info);
9138 - void *info;
9139 - atomic_t started;
9140 - atomic_t finished;
9141 - int wait;
9142 -};
9143 -
9144 -void lock_ipi_call_lock(void)
9145 -{
9146 - spin_lock_irq(&call_lock);
9147 -}
9148 -
9149 -void unlock_ipi_call_lock(void)
9150 -{
9151 - spin_unlock_irq(&call_lock);
9152 -}
9153 -
9154 -static struct call_data_struct *call_data;
9155 -
9156 -static void __smp_call_function(void (*func) (void *info), void *info,
9157 - int nonatomic, int wait)
9158 -{
9159 - struct call_data_struct data;
9160 - int cpus = num_online_cpus() - 1;
9161 -
9162 - if (!cpus)
9163 - return;
9164 -
9165 - data.func = func;
9166 - data.info = info;
9167 - atomic_set(&data.started, 0);
9168 - data.wait = wait;
9169 - if (wait)
9170 - atomic_set(&data.finished, 0);
9171 -
9172 - call_data = &data;
9173 - mb();
9174 -
9175 - /* Send a message to all other CPUs and wait for them to respond */
9176 - send_IPI_allbutself(CALL_FUNCTION_VECTOR);
9177 -
9178 - /* Wait for response */
9179 - while (atomic_read(&data.started) != cpus)
9180 - cpu_relax();
9181 -
9182 - if (wait)
9183 - while (atomic_read(&data.finished) != cpus)
9184 - cpu_relax();
9185 -}
9186 -
9187 -
9188 -/**
9189 - * smp_call_function_mask(): Run a function on a set of other CPUs.
9190 - * @mask: The set of cpus to run on. Must not include the current cpu.
9191 - * @func: The function to run. This must be fast and non-blocking.
9192 - * @info: An arbitrary pointer to pass to the function.
9193 - * @wait: If true, wait (atomically) until function has completed on other CPUs.
9194 - *
9195 - * Returns 0 on success, else a negative status code.
9196 - *
9197 - * If @wait is true, then returns once @func has returned; otherwise
9198 - * it returns just before the target cpu calls @func.
9199 - *
9200 - * You must not call this function with disabled interrupts or from a
9201 - * hardware interrupt handler or from a bottom half handler.
9202 - */
9203 -int
9204 -xen_smp_call_function_mask(cpumask_t mask,
9205 - void (*func)(void *), void *info,
9206 - int wait)
9207 -{
9208 - struct call_data_struct data;
9209 - cpumask_t allbutself;
9210 - int cpus;
9211 -
9212 - /* Can deadlock when called with interrupts disabled */
9213 - WARN_ON(irqs_disabled());
9214 -
9215 - /* Holding any lock stops cpus from going down. */
9216 - spin_lock(&call_lock);
9217 -
9218 - allbutself = cpu_online_map;
9219 - cpu_clear(smp_processor_id(), allbutself);
9220 -
9221 - cpus_and(mask, mask, allbutself);
9222 - cpus = cpus_weight(mask);
9223 -
9224 - if (!cpus) {
9225 - spin_unlock(&call_lock);
9226 - return 0;
9227 - }
9228 -
9229 - data.func = func;
9230 - data.info = info;
9231 - atomic_set(&data.started, 0);
9232 - data.wait = wait;
9233 - if (wait)
9234 - atomic_set(&data.finished, 0);
9235 -
9236 - call_data = &data;
9237 - mb();
9238 -
9239 - /* Send a message to other CPUs */
9240 - if (cpus_equal(mask, allbutself))
9241 - send_IPI_allbutself(CALL_FUNCTION_VECTOR);
9242 - else
9243 - send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
9244 -
9245 - /* Wait for response */
9246 - while (atomic_read(&data.started) != cpus)
9247 - cpu_relax();
9248 -
9249 - if (wait)
9250 - while (atomic_read(&data.finished) != cpus)
9251 - cpu_relax();
9252 - spin_unlock(&call_lock);
9253 -
9254 - return 0;
9255 -}
9256 -
9257 -static void stop_this_cpu (void * dummy)
9258 -{
9259 - local_irq_disable();
9260 - /*
9261 - * Remove this CPU:
9262 - */
9263 - cpu_clear(smp_processor_id(), cpu_online_map);
9264 - disable_all_local_evtchn();
9265 - if (cpu_data(smp_processor_id()).hlt_works_ok)
9266 - for(;;) halt();
9267 - for (;;);
9268 -}
9269 -
9270 -/*
9271 - * this function calls the 'stop' function on all other CPUs in the system.
9272 - */
9273 -
9274 -void xen_smp_send_stop(void)
9275 -{
9276 - /* Don't deadlock on the call lock in panic */
9277 - int nolock = !spin_trylock(&call_lock);
9278 - unsigned long flags;
9279 -
9280 - local_irq_save(flags);
9281 - __smp_call_function(stop_this_cpu, NULL, 0, 0);
9282 - if (!nolock)
9283 - spin_unlock(&call_lock);
9284 - disable_all_local_evtchn();
9285 - local_irq_restore(flags);
9286 -}
9287 -
9288 -/*
9289 - * Reschedule call back. Nothing to do,
9290 - * all the work is done automatically when
9291 - * we return from the interrupt.
9292 - */
9293 -irqreturn_t smp_reschedule_interrupt(int irq, void *dev_id)
9294 -{
9295 - __get_cpu_var(irq_stat).irq_resched_count++;
9296 -
9297 - return IRQ_HANDLED;
9298 -}
9299 -
9300 -#include <linux/kallsyms.h>
9301 -irqreturn_t smp_call_function_interrupt(int irq, void *dev_id)
9302 -{
9303 - void (*func) (void *info) = call_data->func;
9304 - void *info = call_data->info;
9305 - int wait = call_data->wait;
9306 -
9307 - /*
9308 - * Notify initiating CPU that I've grabbed the data and am
9309 - * about to execute the function
9310 - */
9311 - mb();
9312 - atomic_inc(&call_data->started);
9313 - /*
9314 - * At this point the info structure may be out of scope unless wait==1
9315 - */
9316 - irq_enter();
9317 - (*func)(info);
9318 - __get_cpu_var(irq_stat).irq_call_count++;
9319 - irq_exit();
9320 -
9321 - if (wait) {
9322 - mb();
9323 - atomic_inc(&call_data->finished);
9324 - }
9325 -
9326 - return IRQ_HANDLED;
9327 -}
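
The comment blocks in the 32-bit file removed above describe the shootdown handshake it implemented: the initiator takes the single global tlbstate_lock, ORs the targets into flush_cpumask, sends INVALIDATE_TLB_VECTOR, and then spins until every target has flushed and cleared its own bit again. Below is a minimal userspace analogue of that bitmask rendezvous, a sketch assuming C11 atomics and POSIX threads in place of IPIs; the four-"CPU" setup and all names are illustrative, not kernel code.

/*
 * Userspace analogue of the flush_cpumask handshake described above
 * (illustrative sketch only -- not kernel code).
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define NCPUS 4UL

static atomic_ulong flush_mask;         /* plays the role of flush_cpumask */

static void *target_cpu(void *arg)
{
        unsigned long bit = 1UL << (unsigned long)(uintptr_t)arg;

        /* wait until the initiator names us in the mask (the "IPI") */
        while (!(atomic_load(&flush_mask) & bit))
                ;                               /* cpu_relax() in the kernel */

        /* ... the local TLB flush would happen here ... */

        atomic_fetch_and(&flush_mask, ~bit);    /* acknowledge by clearing our bit */
        return NULL;
}

int main(void)
{
        pthread_t tid[NCPUS];
        unsigned long i;

        for (i = 0; i < NCPUS; i++)
                pthread_create(&tid[i], NULL, target_cpu, (void *)(uintptr_t)i);

        /* initiator: publish the set of targets, then spin until all have acked */
        atomic_store(&flush_mask, (1UL << NCPUS) - 1);
        while (atomic_load(&flush_mask))
                ;

        for (i = 0; i < NCPUS; i++)
                pthread_join(tid[i], NULL);
        puts("every target acknowledged the flush");
        return 0;
}

The property mirrored here is the one the deleted code relied on: the initiator only proceeds once the mask is empty, i.e. after every addressed CPU has performed its local flush.
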
9328 --- a/arch/x86/kernel/smp_64-xen.c
9329 +++ /dev/null
9330 @@ -1,554 +0,0 @@
9331 -/*
9332 - * Intel SMP support routines.
9333 - *
9334 - * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
9335 - * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
9336 - * (c) 2002,2003 Andi Kleen, SuSE Labs.
9337 - *
9338 - * This code is released under the GNU General Public License version 2 or
9339 - * later.
9340 - */
9341 -
9342 -#include <linux/init.h>
9343 -
9344 -#include <linux/mm.h>
9345 -#include <linux/delay.h>
9346 -#include <linux/spinlock.h>
9347 -#include <linux/smp.h>
9348 -#include <linux/kernel_stat.h>
9349 -#include <linux/mc146818rtc.h>
9350 -#include <linux/interrupt.h>
9351 -
9352 -#include <asm/mtrr.h>
9353 -#include <asm/pgalloc.h>
9354 -#include <asm/tlbflush.h>
9355 -#include <asm/mach_apic.h>
9356 -#include <asm/mmu_context.h>
9357 -#include <asm/proto.h>
9358 -#include <asm/apicdef.h>
9359 -#include <asm/idle.h>
9360 -#ifdef CONFIG_XEN
9361 -#include <xen/evtchn.h>
9362 -#endif
9363 -
9364 -#ifndef CONFIG_XEN
9365 -/*
9366 - * Smarter SMP flushing macros.
9367 - * c/o Linus Torvalds.
9368 - *
9369 - * These mean you can really definitely utterly forget about
9370 - * writing to user space from interrupts. (Its not allowed anyway).
9371 - *
9372 - * Optimizations Manfred Spraul <manfred@colorfullife.com>
9373 - *
9374 - * More scalable flush, from Andi Kleen
9375 - *
9376 - * To avoid global state use 8 different call vectors.
9377 - * Each CPU uses a specific vector to trigger flushes on other
9378 - * CPUs. Depending on the received vector the target CPUs look into
9379 - * the right per cpu variable for the flush data.
9380 - *
9381 - * With more than 8 CPUs they are hashed to the 8 available
9382 - * vectors. The limited global vector space forces us to this right now.
9383 - * In future when interrupts are split into per CPU domains this could be
9384 - * fixed, at the cost of triggering multiple IPIs in some cases.
9385 - */
9386 -
9387 -union smp_flush_state {
9388 - struct {
9389 - cpumask_t flush_cpumask;
9390 - struct mm_struct *flush_mm;
9391 - unsigned long flush_va;
9392 - spinlock_t tlbstate_lock;
9393 - };
9394 - char pad[SMP_CACHE_BYTES];
9395 -} ____cacheline_aligned;
9396 -
9397 -/* State is put into the per CPU data section, but padded
9398 - to a full cache line because other CPUs can access it and we don't
9399 - want false sharing in the per cpu data segment. */
9400 -static DEFINE_PER_CPU(union smp_flush_state, flush_state);
9401 -
9402 -/*
9403 - * We cannot call mmdrop() because we are in interrupt context,
9404 - * instead update mm->cpu_vm_mask.
9405 - */
9406 -void leave_mm(int cpu)
9407 -{
9408 - if (read_pda(mmu_state) == TLBSTATE_OK)
9409 - BUG();
9410 - cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask);
9411 - load_cr3(swapper_pg_dir);
9412 -}
9413 -EXPORT_SYMBOL_GPL(leave_mm);
9414 -
9415 -/*
9416 - *
9417 - * The flush IPI assumes that a thread switch happens in this order:
9418 - * [cpu0: the cpu that switches]
9419 - * 1) switch_mm() either 1a) or 1b)
9420 - * 1a) thread switch to a different mm
9421 - * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
9422 - * Stop ipi delivery for the old mm. This is not synchronized with
9423 - * the other cpus, but smp_invalidate_interrupt ignore flush ipis
9424 - * for the wrong mm, and in the worst case we perform a superfluous
9425 - * tlb flush.
9426 - * 1a2) set cpu mmu_state to TLBSTATE_OK
9427 - * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
9428 - * was in lazy tlb mode.
9429 - * 1a3) update cpu active_mm
9430 - * Now cpu0 accepts tlb flushes for the new mm.
9431 - * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
9432 - * Now the other cpus will send tlb flush ipis.
9433 - * 1a4) change cr3.
9434 - * 1b) thread switch without mm change
9435 - * cpu active_mm is correct, cpu0 already handles
9436 - * flush ipis.
9437 - * 1b1) set cpu mmu_state to TLBSTATE_OK
9438 - * 1b2) test_and_set the cpu bit in cpu_vm_mask.
9439 - * Atomically set the bit [other cpus will start sending flush ipis],
9440 - * and test the bit.
9441 - * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
9442 - * 2) switch %%esp, ie current
9443 - *
9444 - * The interrupt must handle 2 special cases:
9445 - * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
9446 - * - the cpu performs speculative tlb reads, i.e. even if the cpu only
9447 - * runs in kernel space, the cpu could load tlb entries for user space
9448 - * pages.
9449 - *
9450 - * The good news is that cpu mmu_state is local to each cpu, no
9451 - * write/read ordering problems.
9452 - */
9453 -
9454 -/*
9455 - * TLB flush IPI:
9456 - *
9457 - * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
9458 - * 2) Leave the mm if we are in the lazy tlb mode.
9459 - *
9460 - * Interrupts are disabled.
9461 - */
9462 -
9463 -asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs)
9464 -{
9465 - int cpu;
9466 - int sender;
9467 - union smp_flush_state *f;
9468 -
9469 - cpu = smp_processor_id();
9470 - /*
9471 - * orig_rax contains the negated interrupt vector.
9472 - * Use that to determine where the sender put the data.
9473 - */
9474 - sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START;
9475 - f = &per_cpu(flush_state, sender);
9476 -
9477 - if (!cpu_isset(cpu, f->flush_cpumask))
9478 - goto out;
9479 - /*
9480 - * This was a BUG() but until someone can quote me the
9481 - * line from the intel manual that guarantees an IPI to
9482 - * multiple CPUs is retried _only_ on the erroring CPUs
9483 - * its staying as a return
9484 - *
9485 - * BUG();
9486 - */
9487 -
9488 - if (f->flush_mm == read_pda(active_mm)) {
9489 - if (read_pda(mmu_state) == TLBSTATE_OK) {
9490 - if (f->flush_va == TLB_FLUSH_ALL)
9491 - local_flush_tlb();
9492 - else
9493 - __flush_tlb_one(f->flush_va);
9494 - } else
9495 - leave_mm(cpu);
9496 - }
9497 -out:
9498 - ack_APIC_irq();
9499 - cpu_clear(cpu, f->flush_cpumask);
9500 - add_pda(irq_tlb_count, 1);
9501 -}
9502 -
9503 -void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
9504 - unsigned long va)
9505 -{
9506 - int sender;
9507 - union smp_flush_state *f;
9508 - cpumask_t cpumask = *cpumaskp;
9509 -
9510 - /* Caller has disabled preemption */
9511 - sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
9512 - f = &per_cpu(flush_state, sender);
9513 -
9514 - /*
9515 - * Could avoid this lock when
9516 - * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
9517 - * probably not worth checking this for a cache-hot lock.
9518 - */
9519 - spin_lock(&f->tlbstate_lock);
9520 -
9521 - f->flush_mm = mm;
9522 - f->flush_va = va;
9523 - cpus_or(f->flush_cpumask, cpumask, f->flush_cpumask);
9524 -
9525 - /*
9526 - * We have to send the IPI only to
9527 - * CPUs affected.
9528 - */
9529 - send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR_START + sender);
9530 -
9531 - while (!cpus_empty(f->flush_cpumask))
9532 - cpu_relax();
9533 -
9534 - f->flush_mm = NULL;
9535 - f->flush_va = 0;
9536 - spin_unlock(&f->tlbstate_lock);
9537 -}
9538 -
9539 -int __cpuinit init_smp_flush(void)
9540 -{
9541 - int i;
9542 -
9543 - for_each_cpu_mask(i, cpu_possible_map) {
9544 - spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock);
9545 - }
9546 - return 0;
9547 -}
9548 -core_initcall(init_smp_flush);
9549 -
9550 -void flush_tlb_current_task(void)
9551 -{
9552 - struct mm_struct *mm = current->mm;
9553 - cpumask_t cpu_mask;
9554 -
9555 - preempt_disable();
9556 - cpu_mask = mm->cpu_vm_mask;
9557 - cpu_clear(smp_processor_id(), cpu_mask);
9558 -
9559 - local_flush_tlb();
9560 - if (!cpus_empty(cpu_mask))
9561 - flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
9562 - preempt_enable();
9563 -}
9564 -
9565 -void flush_tlb_mm (struct mm_struct * mm)
9566 -{
9567 - cpumask_t cpu_mask;
9568 -
9569 - preempt_disable();
9570 - cpu_mask = mm->cpu_vm_mask;
9571 - cpu_clear(smp_processor_id(), cpu_mask);
9572 -
9573 - if (current->active_mm == mm) {
9574 - if (current->mm)
9575 - local_flush_tlb();
9576 - else
9577 - leave_mm(smp_processor_id());
9578 - }
9579 - if (!cpus_empty(cpu_mask))
9580 - flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
9581 -
9582 - preempt_enable();
9583 -}
9584 -
9585 -void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
9586 -{
9587 - struct mm_struct *mm = vma->vm_mm;
9588 - cpumask_t cpu_mask;
9589 -
9590 - preempt_disable();
9591 - cpu_mask = mm->cpu_vm_mask;
9592 - cpu_clear(smp_processor_id(), cpu_mask);
9593 -
9594 - if (current->active_mm == mm) {
9595 - if(current->mm)
9596 - __flush_tlb_one(va);
9597 - else
9598 - leave_mm(smp_processor_id());
9599 - }
9600 -
9601 - if (!cpus_empty(cpu_mask))
9602 - flush_tlb_others(cpu_mask, mm, va);
9603 -
9604 - preempt_enable();
9605 -}
9606 -
9607 -static void do_flush_tlb_all(void* info)
9608 -{
9609 - unsigned long cpu = smp_processor_id();
9610 -
9611 - __flush_tlb_all();
9612 - if (read_pda(mmu_state) == TLBSTATE_LAZY)
9613 - leave_mm(cpu);
9614 -}
9615 -
9616 -void flush_tlb_all(void)
9617 -{
9618 - on_each_cpu(do_flush_tlb_all, NULL, 1, 1);
9619 -}
9620 -#endif /* Xen */
9621 -
9622 -/*
9623 - * this function sends a 'reschedule' IPI to another CPU.
9624 - * it goes straight through and wastes no time serializing
9625 - * anything. Worst case is that we lose a reschedule ...
9626 - */
9627 -
9628 -void smp_send_reschedule(int cpu)
9629 -{
9630 - send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
9631 -}
9632 -
9633 -/*
9634 - * Structure and data for smp_call_function(). This is designed to minimise
9635 - * static memory requirements. It also looks cleaner.
9636 - */
9637 -static DEFINE_SPINLOCK(call_lock);
9638 -
9639 -struct call_data_struct {
9640 - void (*func) (void *info);
9641 - void *info;
9642 - atomic_t started;
9643 - atomic_t finished;
9644 - int wait;
9645 -};
9646 -
9647 -static struct call_data_struct * call_data;
9648 -
9649 -void lock_ipi_call_lock(void)
9650 -{
9651 - spin_lock_irq(&call_lock);
9652 -}
9653 -
9654 -void unlock_ipi_call_lock(void)
9655 -{
9656 - spin_unlock_irq(&call_lock);
9657 -}
9658 -
9659 -/*
9660 - * this function sends a 'generic call function' IPI to all other CPU
9661 - * of the system defined in the mask.
9662 - */
9663 -static int __smp_call_function_mask(cpumask_t mask,
9664 - void (*func)(void *), void *info,
9665 - int wait)
9666 -{
9667 - struct call_data_struct data;
9668 - cpumask_t allbutself;
9669 - int cpus;
9670 -
9671 - allbutself = cpu_online_map;
9672 - cpu_clear(smp_processor_id(), allbutself);
9673 -
9674 - cpus_and(mask, mask, allbutself);
9675 - cpus = cpus_weight(mask);
9676 -
9677 - if (!cpus)
9678 - return 0;
9679 -
9680 - data.func = func;
9681 - data.info = info;
9682 - atomic_set(&data.started, 0);
9683 - data.wait = wait;
9684 - if (wait)
9685 - atomic_set(&data.finished, 0);
9686 -
9687 - call_data = &data;
9688 - wmb();
9689 -
9690 - /* Send a message to other CPUs */
9691 - if (cpus_equal(mask, allbutself))
9692 - send_IPI_allbutself(CALL_FUNCTION_VECTOR);
9693 - else
9694 - send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
9695 -
9696 - /* Wait for response */
9697 - while (atomic_read(&data.started) != cpus)
9698 - cpu_relax();
9699 -
9700 - if (!wait)
9701 - return 0;
9702 -
9703 - while (atomic_read(&data.finished) != cpus)
9704 - cpu_relax();
9705 -
9706 - return 0;
9707 -}
9708 -/**
9709 - * smp_call_function_mask(): Run a function on a set of other CPUs.
9710 - * @mask: The set of cpus to run on. Must not include the current cpu.
9711 - * @func: The function to run. This must be fast and non-blocking.
9712 - * @info: An arbitrary pointer to pass to the function.
9713 - * @wait: If true, wait (atomically) until function has completed on other CPUs.
9714 - *
9715 - * Returns 0 on success, else a negative status code.
9716 - *
9717 - * If @wait is true, then returns once @func has returned; otherwise
9718 - * it returns just before the target cpu calls @func.
9719 - *
9720 - * You must not call this function with disabled interrupts or from a
9721 - * hardware interrupt handler or from a bottom half handler.
9722 - */
9723 -int smp_call_function_mask(cpumask_t mask,
9724 - void (*func)(void *), void *info,
9725 - int wait)
9726 -{
9727 - int ret;
9728 -
9729 - /* Can deadlock when called with interrupts disabled */
9730 - WARN_ON(irqs_disabled());
9731 -
9732 - spin_lock(&call_lock);
9733 - ret = __smp_call_function_mask(mask, func, info, wait);
9734 - spin_unlock(&call_lock);
9735 - return ret;
9736 -}
9737 -EXPORT_SYMBOL(smp_call_function_mask);
9738 -
9739 -/*
9740 - * smp_call_function_single - Run a function on a specific CPU
9741 - * @func: The function to run. This must be fast and non-blocking.
9742 - * @info: An arbitrary pointer to pass to the function.
9743 - * @nonatomic: Currently unused.
9744 - * @wait: If true, wait until function has completed on other CPUs.
9745 - *
9746 - * Retrurns 0 on success, else a negative status code.
9747 - *
9748 - * Does not return until the remote CPU is nearly ready to execute <func>
9749 - * or is or has executed.
9750 - */
9751 -
9752 -int smp_call_function_single (int cpu, void (*func) (void *info), void *info,
9753 - int nonatomic, int wait)
9754 -{
9755 - /* prevent preemption and reschedule on another processor */
9756 - int ret, me = get_cpu();
9757 -
9758 - /* Can deadlock when called with interrupts disabled */
9759 - WARN_ON(irqs_disabled());
9760 -
9761 - if (cpu == me) {
9762 - local_irq_disable();
9763 - func(info);
9764 - local_irq_enable();
9765 - put_cpu();
9766 - return 0;
9767 - }
9768 -
9769 - ret = smp_call_function_mask(cpumask_of_cpu(cpu), func, info, wait);
9770 -
9771 - put_cpu();
9772 - return ret;
9773 -}
9774 -EXPORT_SYMBOL(smp_call_function_single);
9775 -
9776 -/*
9777 - * smp_call_function - run a function on all other CPUs.
9778 - * @func: The function to run. This must be fast and non-blocking.
9779 - * @info: An arbitrary pointer to pass to the function.
9780 - * @nonatomic: currently unused.
9781 - * @wait: If true, wait (atomically) until function has completed on other
9782 - * CPUs.
9783 - *
9784 - * Returns 0 on success, else a negative status code. Does not return until
9785 - * remote CPUs are nearly ready to execute func or are or have executed.
9786 - *
9787 - * You must not call this function with disabled interrupts or from a
9788 - * hardware interrupt handler or from a bottom half handler.
9789 - * Actually there are a few legal cases, like panic.
9790 - */
9791 -int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
9792 - int wait)
9793 -{
9794 - return smp_call_function_mask(cpu_online_map, func, info, wait);
9795 -}
9796 -EXPORT_SYMBOL(smp_call_function);
9797 -
9798 -static void stop_this_cpu(void *dummy)
9799 -{
9800 - local_irq_disable();
9801 - /*
9802 - * Remove this CPU:
9803 - */
9804 - cpu_clear(smp_processor_id(), cpu_online_map);
9805 - disable_all_local_evtchn();
9806 - for (;;)
9807 - halt();
9808 -}
9809 -
9810 -void smp_send_stop(void)
9811 -{
9812 - int nolock;
9813 - unsigned long flags;
9814 -
9815 -#ifndef CONFIG_XEN
9816 - if (reboot_force)
9817 - return;
9818 -#endif
9819 -
9820 - /* Don't deadlock on the call lock in panic */
9821 - nolock = !spin_trylock(&call_lock);
9822 - local_irq_save(flags);
9823 - __smp_call_function_mask(cpu_online_map, stop_this_cpu, NULL, 0);
9824 - if (!nolock)
9825 - spin_unlock(&call_lock);
9826 - disable_all_local_evtchn();
9827 - local_irq_restore(flags);
9828 -}
9829 -
9830 -/*
9831 - * Reschedule call back. Nothing to do,
9832 - * all the work is done automatically when
9833 - * we return from the interrupt.
9834 - */
9835 -#ifndef CONFIG_XEN
9836 -asmlinkage void smp_reschedule_interrupt(void)
9837 -#else
9838 -asmlinkage irqreturn_t smp_reschedule_interrupt(int irq, void *ctx)
9839 -#endif
9840 -{
9841 -#ifndef CONFIG_XEN
9842 - ack_APIC_irq();
9843 -#endif
9844 - add_pda(irq_resched_count, 1);
9845 -#ifdef CONFIG_XEN
9846 - return IRQ_HANDLED;
9847 -#endif
9848 -}
9849 -
9850 -#ifndef CONFIG_XEN
9851 -asmlinkage void smp_call_function_interrupt(void)
9852 -#else
9853 -asmlinkage irqreturn_t smp_call_function_interrupt(int irq, void *ctx)
9854 -#endif
9855 -{
9856 - void (*func) (void *info) = call_data->func;
9857 - void *info = call_data->info;
9858 - int wait = call_data->wait;
9859 -
9860 -#ifndef CONFIG_XEN
9861 - ack_APIC_irq();
9862 -#endif
9863 - /*
9864 - * Notify initiating CPU that I've grabbed the data and am
9865 - * about to execute the function
9866 - */
9867 - mb();
9868 - atomic_inc(&call_data->started);
9869 - /*
9870 - * At this point the info structure may be out of scope unless wait==1
9871 - */
9872 - exit_idle();
9873 - irq_enter();
9874 - (*func)(info);
9875 - add_pda(irq_call_count, 1);
9876 - irq_exit();
9877 - if (wait) {
9878 - mb();
9879 - atomic_inc(&call_data->finished);
9880 - }
9881 -#ifdef CONFIG_XEN
9882 - return IRQ_HANDLED;
9883 -#endif
9884 -}
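
The 64-bit file removed above already avoided that single global lock: each sender hashes onto one of NUM_INVALIDATE_TLB_VECTORS (8) per-vector flush_state slots, and each slot is padded out to a full cache line so concurrent senders do not false-share. A rough standalone sketch of the layout and the modulo hashing follows; the field names and the 64-byte line size are stand-ins, not the kernel's definitions.

/*
 * Sketch of per-sender flush-state hashing with cache-line padding
 * (invented names/sizes; not the kernel's definitions).
 */
#include <stdio.h>

#define NUM_FLUSH_SLOTS 8       /* NUM_INVALIDATE_TLB_VECTORS in the kernel */
#define CACHE_LINE      64

/* one slot per vector, padded so concurrent senders don't false-share */
union flush_slot {
        struct {
                unsigned long cpumask;
                void         *mm;
                unsigned long va;
                int           lock;
        };
        char pad[CACHE_LINE];
} __attribute__((aligned(CACHE_LINE)));

static union flush_slot flush_state[NUM_FLUSH_SLOTS];

/* a sender with CPU id 'cpu' always uses the same slot (and IPI vector) */
static union flush_slot *slot_for_sender(int cpu)
{
        return &flush_state[cpu % NUM_FLUSH_SLOTS];
}

int main(void)
{
        for (int cpu = 0; cpu < 20; cpu += 5)
                printf("cpu %2d -> slot %ld\n", cpu,
                       (long)(slot_for_sender(cpu) - flush_state));
        return 0;
}

With this scheme up to eight shootdowns can be in flight at once, and, as the comment in the deleted code notes, the per-slot lock could even be skipped when there are no more online CPUs than vectors.
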
9885 --- /dev/null
9886 +++ b/arch/x86/kernel/smp-xen.c
9887 @@ -0,0 +1,329 @@
9888 +/*
9889 + * Intel SMP support routines.
9890 + *
9891 + * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
9892 + * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
9893 + * (c) 2002,2003 Andi Kleen, SuSE Labs.
9894 + *
9895 + * i386 and x86_64 integration by Glauber Costa <gcosta@redhat.com>
9896 + *
9897 + * This code is released under the GNU General Public License version 2 or
9898 + * later.
9899 + */
9900 +
9901 +#include <linux/init.h>
9902 +
9903 +#include <linux/mm.h>
9904 +#include <linux/delay.h>
9905 +#include <linux/spinlock.h>
9906 +#include <linux/kernel_stat.h>
9907 +#include <linux/mc146818rtc.h>
9908 +#include <linux/cache.h>
9909 +#include <linux/interrupt.h>
9910 +#include <linux/cpu.h>
9911 +
9912 +#include <asm/mtrr.h>
9913 +#include <asm/tlbflush.h>
9914 +#include <asm/mmu_context.h>
9915 +#include <asm/proto.h>
9916 +#include <mach_ipi.h>
9917 +#include <xen/evtchn.h>
9918 +/*
9919 + * Some notes on x86 processor bugs affecting SMP operation:
9920 + *
9921 + * Pentium, Pentium Pro, II, III (and all CPUs) have bugs.
9922 + * The Linux implications for SMP are handled as follows:
9923 + *
9924 + * Pentium III / [Xeon]
9925 + * None of the E1AP-E3AP errata are visible to the user.
9926 + *
9927 + * E1AP. see PII A1AP
9928 + * E2AP. see PII A2AP
9929 + * E3AP. see PII A3AP
9930 + *
9931 + * Pentium II / [Xeon]
9932 + * None of the A1AP-A3AP errata are visible to the user.
9933 + *
9934 + * A1AP. see PPro 1AP
9935 + * A2AP. see PPro 2AP
9936 + * A3AP. see PPro 7AP
9937 + *
9938 + * Pentium Pro
9939 + * None of 1AP-9AP errata are visible to the normal user,
9940 + * except occasional delivery of 'spurious interrupt' as trap #15.
9941 + * This is very rare and a non-problem.
9942 + *
9943 + * 1AP. Linux maps APIC as non-cacheable
9944 + * 2AP. worked around in hardware
9945 + * 3AP. fixed in C0 and above steppings microcode update.
9946 + * Linux does not use excessive STARTUP_IPIs.
9947 + * 4AP. worked around in hardware
9948 + * 5AP. symmetric IO mode (normal Linux operation) not affected.
9949 + * 'noapic' mode has vector 0xf filled out properly.
9950 + * 6AP. 'noapic' mode might be affected - fixed in later steppings
9951 + * 7AP. We do not assume writes to the LVT deasserting IRQs
9952 + * 8AP. We do not enable low power mode (deep sleep) during MP bootup
9953 + * 9AP. We do not use mixed mode
9954 + *
9955 + * Pentium
9956 + * There is a marginal case where REP MOVS on 100MHz SMP
9957 + * machines with B stepping processors can fail. XXX should provide
9958 + * an L1cache=Writethrough or L1cache=off option.
9959 + *
9960 + * B stepping CPUs may hang. There are hardware work arounds
9961 + * for this. We warn about it in case your board doesn't have the work
9962 + * arounds. Basically that's so I can tell anyone with a B stepping
9963 + * CPU and SMP problems "tough".
9964 + *
9965 + * Specific items [From Pentium Processor Specification Update]
9966 + *
9967 + * 1AP. Linux doesn't use remote read
9968 + * 2AP. Linux doesn't trust APIC errors
9969 + * 3AP. We work around this
9970 + * 4AP. Linux never generated 3 interrupts of the same priority
9971 + * to cause a lost local interrupt.
9972 + * 5AP. Remote read is never used
9973 + * 6AP. not affected - worked around in hardware
9974 + * 7AP. not affected - worked around in hardware
9975 + * 8AP. worked around in hardware - we get explicit CS errors if not
9976 + * 9AP. only 'noapic' mode affected. Might generate spurious
9977 + * interrupts, we log only the first one and count the
9978 + * rest silently.
9979 + * 10AP. not affected - worked around in hardware
9980 + * 11AP. Linux reads the APIC between writes to avoid this, as per
9981 + * the documentation. Make sure you preserve this as it affects
9982 + * the C stepping chips too.
9983 + * 12AP. not affected - worked around in hardware
9984 + * 13AP. not affected - worked around in hardware
9985 + * 14AP. we always deassert INIT during bootup
9986 + * 15AP. not affected - worked around in hardware
9987 + * 16AP. not affected - worked around in hardware
9988 + * 17AP. not affected - worked around in hardware
9989 + * 18AP. not affected - worked around in hardware
9990 + * 19AP. not affected - worked around in BIOS
9991 + *
9992 + * If this sounds worrying believe me these bugs are either ___RARE___,
9993 + * or are signal timing bugs worked around in hardware and there's
9994 + * about nothing of note with C stepping upwards.
9995 + */
9996 +
9997 +/*
9998 + * this function sends a 'reschedule' IPI to another CPU.
9999 + * it goes straight through and wastes no time serializing
10000 + * anything. Worst case is that we lose a reschedule ...
10001 + */
10002 +void xen_smp_send_reschedule(int cpu)
10003 +{
10004 + if (unlikely(cpu_is_offline(cpu))) {
10005 + WARN_ON(1);
10006 + return;
10007 + }
10008 + send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
10009 +}
10010 +
10011 +/*
10012 + * Structure and data for smp_call_function(). This is designed to minimise
10013 + * static memory requirements. It also looks cleaner.
10014 + */
10015 +static DEFINE_SPINLOCK(call_lock);
10016 +
10017 +struct call_data_struct {
10018 + void (*func) (void *info);
10019 + void *info;
10020 + atomic_t started;
10021 + atomic_t finished;
10022 + int wait;
10023 +};
10024 +
10025 +void lock_ipi_call_lock(void)
10026 +{
10027 + spin_lock_irq(&call_lock);
10028 +}
10029 +
10030 +void unlock_ipi_call_lock(void)
10031 +{
10032 + spin_unlock_irq(&call_lock);
10033 +}
10034 +
10035 +static struct call_data_struct *call_data;
10036 +
10037 +static void __smp_call_function(void (*func) (void *info), void *info,
10038 + int nonatomic, int wait)
10039 +{
10040 + struct call_data_struct data;
10041 + int cpus = num_online_cpus() - 1;
10042 +
10043 + if (!cpus)
10044 + return;
10045 +
10046 + data.func = func;
10047 + data.info = info;
10048 + atomic_set(&data.started, 0);
10049 + data.wait = wait;
10050 + if (wait)
10051 + atomic_set(&data.finished, 0);
10052 +
10053 + call_data = &data;
10054 + mb();
10055 +
10056 + /* Send a message to all other CPUs and wait for them to respond */
10057 + send_IPI_allbutself(CALL_FUNCTION_VECTOR);
10058 +
10059 + /* Wait for response */
10060 + while (atomic_read(&data.started) != cpus)
10061 + cpu_relax();
10062 +
10063 + if (wait)
10064 + while (atomic_read(&data.finished) != cpus)
10065 + cpu_relax();
10066 +}
10067 +
10068 +
10069 +/**
10070 + * smp_call_function_mask(): Run a function on a set of other CPUs.
10071 + * @mask: The set of cpus to run on. Must not include the current cpu.
10072 + * @func: The function to run. This must be fast and non-blocking.
10073 + * @info: An arbitrary pointer to pass to the function.
10074 + * @wait: If true, wait (atomically) until function has completed on other CPUs.
10075 + *
10076 + * Returns 0 on success, else a negative status code.
10077 + *
10078 + * If @wait is true, then returns once @func has returned; otherwise
10079 + * it returns just before the target cpu calls @func.
10080 + *
10081 + * You must not call this function with disabled interrupts or from a
10082 + * hardware interrupt handler or from a bottom half handler.
10083 + */
10084 +int
10085 +xen_smp_call_function_mask(cpumask_t mask,
10086 + void (*func)(void *), void *info,
10087 + int wait)
10088 +{
10089 + struct call_data_struct data;
10090 + cpumask_t allbutself;
10091 + int cpus;
10092 +
10093 + /* Can deadlock when called with interrupts disabled */
10094 + WARN_ON(irqs_disabled());
10095 +
10096 + /* Holding any lock stops cpus from going down. */
10097 + spin_lock(&call_lock);
10098 +
10099 + allbutself = cpu_online_map;
10100 + cpu_clear(smp_processor_id(), allbutself);
10101 +
10102 + cpus_and(mask, mask, allbutself);
10103 + cpus = cpus_weight(mask);
10104 +
10105 + if (!cpus) {
10106 + spin_unlock(&call_lock);
10107 + return 0;
10108 + }
10109 +
10110 + data.func = func;
10111 + data.info = info;
10112 + atomic_set(&data.started, 0);
10113 + data.wait = wait;
10114 + if (wait)
10115 + atomic_set(&data.finished, 0);
10116 +
10117 + call_data = &data;
10118 + wmb();
10119 +
10120 + /* Send a message to other CPUs */
10121 + if (cpus_equal(mask, allbutself) &&
10122 + cpus_equal(cpu_online_map, cpu_callout_map))
10123 + send_IPI_allbutself(CALL_FUNCTION_VECTOR);
10124 + else
10125 + send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
10126 +
10127 + /* Wait for response */
10128 + while (atomic_read(&data.started) != cpus)
10129 + cpu_relax();
10130 +
10131 + if (wait)
10132 + while (atomic_read(&data.finished) != cpus)
10133 + cpu_relax();
10134 + spin_unlock(&call_lock);
10135 +
10136 + return 0;
10137 +}
10138 +
10139 +static void stop_this_cpu(void *dummy)
10140 +{
10141 + local_irq_disable();
10142 + /*
10143 + * Remove this CPU:
10144 + */
10145 + cpu_clear(smp_processor_id(), cpu_online_map);
10146 + disable_all_local_evtchn();
10147 + if (hlt_works(smp_processor_id()))
10148 + for (;;) halt();
10149 + for (;;);
10150 +}
10151 +
10152 +/*
10153 + * this function calls the 'stop' function on all other CPUs in the system.
10154 + */
10155 +
10156 +void xen_smp_send_stop(void)
10157 +{
10158 + int nolock;
10159 + unsigned long flags;
10160 +
10161 + /* Don't deadlock on the call lock in panic */
10162 + nolock = !spin_trylock(&call_lock);
10163 + local_irq_save(flags);
10164 + __smp_call_function(stop_this_cpu, NULL, 0, 0);
10165 + if (!nolock)
10166 + spin_unlock(&call_lock);
10167 + disable_all_local_evtchn();
10168 + local_irq_restore(flags);
10169 +}
10170 +
10171 +/*
10172 + * Reschedule call back. Nothing to do,
10173 + * all the work is done automatically when
10174 + * we return from the interrupt.
10175 + */
10176 +irqreturn_t smp_reschedule_interrupt(int irq, void *dev_id)
10177 +{
10178 +#ifdef CONFIG_X86_32
10179 + __get_cpu_var(irq_stat).irq_resched_count++;
10180 +#else
10181 + add_pda(irq_resched_count, 1);
10182 +#endif
10183 + return IRQ_HANDLED;
10184 +}
10185 +
10186 +irqreturn_t smp_call_function_interrupt(int irq, void *dev_id)
10187 +{
10188 + void (*func) (void *info) = call_data->func;
10189 + void *info = call_data->info;
10190 + int wait = call_data->wait;
10191 +
10192 + /*
10193 + * Notify initiating CPU that I've grabbed the data and am
10194 + * about to execute the function
10195 + */
10196 + mb();
10197 + atomic_inc(&call_data->started);
10198 + /*
10199 + * At this point the info structure may be out of scope unless wait==1
10200 + */
10201 + irq_enter();
10202 + (*func)(info);
10203 +#ifdef CONFIG_X86_32
10204 + __get_cpu_var(irq_stat).irq_call_count++;
10205 +#else
10206 + add_pda(irq_call_count, 1);
10207 +#endif
10208 + irq_exit();
10209 +
10210 + if (wait) {
10211 + mb();
10212 + atomic_inc(&call_data->finished);
10213 + }
10214 +
10215 + return IRQ_HANDLED;
10216 +}
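
xen_smp_call_function_mask() above parks the request in an on-stack call_data_struct, publishes the pointer with a write barrier, sends the IPI, and then spins on the started counter (and on finished as well when wait is set), which is what keeps the stack slot alive for as long as a target may still touch it. The sketch below replays that handshake in userspace, assuming C11 atomics and a pthread standing in for the remote CPU; it is an illustrative analogue, not the kernel implementation.

/*
 * Userspace analogue of the call_data started/finished handshake
 * (illustrative sketch only -- not kernel code).
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct call_data {
        void (*func)(void *);
        void *info;
        atomic_int started;     /* target bumps this before calling func   */
        atomic_int finished;    /* ... and this afterwards, if wait != 0   */
        int wait;
};

static struct call_data *_Atomic call_data;     /* published pointer ("IPI") */

static void say_hello(void *info)
{
        printf("hello from the remote side, info=%s\n", (const char *)info);
}

static void *target_cpu(void *unused)
{
        struct call_data *d;

        while (!(d = atomic_load(&call_data)))  /* wait for the "IPI" */
                ;
        atomic_fetch_add(&d->started, 1);       /* a non-waiting initiator may
                                                   reuse its stack from here on */
        d->func(d->info);
        if (d->wait)
                atomic_fetch_add(&d->finished, 1);
        return NULL;
}

int main(void)
{
        struct call_data data = {
                .func = say_hello, .info = "on-stack data", .wait = 1,
        };
        pthread_t tid;

        pthread_create(&tid, NULL, target_cpu, NULL);

        atomic_store(&call_data, &data);        /* publish, like sending the IPI */
        while (atomic_load(&data.started) != 1)
                ;
        if (data.wait)                          /* keeps 'data' live until done */
                while (atomic_load(&data.finished) != 1)
                        ;

        pthread_join(tid, NULL);
        return 0;
}

Note how the target bumps started before calling the function: from that point a non-waiting initiator is free to return and reuse its stack, which is why the handler comment warns that the info structure may be out of scope unless wait==1.
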
10217 --- a/arch/x86/kernel/time_32-xen.c
10218 +++ b/arch/x86/kernel/time_32-xen.c
10219 @@ -701,8 +701,6 @@ int xen_update_persistent_clock(void)
10220 return 0;
10221 }
10222
10223 -extern void (*late_time_init)(void);
10224 -
10225 /* Dynamically-mapped IRQ. */
10226 DEFINE_PER_CPU(int, timer_irq);
10227
10228 --- a/arch/x86/kernel/traps_32-xen.c
10229 +++ b/arch/x86/kernel/traps_32-xen.c
10230 @@ -9,26 +9,28 @@
10231 * 'Traps.c' handles hardware traps and faults after we have saved some
10232 * state in 'asm.s'.
10233 */
10234 -#include <linux/sched.h>
10235 +#include <linux/interrupt.h>
10236 +#include <linux/kallsyms.h>
10237 +#include <linux/spinlock.h>
10238 +#include <linux/highmem.h>
10239 +#include <linux/kprobes.h>
10240 +#include <linux/uaccess.h>
10241 +#include <linux/utsname.h>
10242 +#include <linux/kdebug.h>
10243 #include <linux/kernel.h>
10244 +#include <linux/module.h>
10245 +#include <linux/ptrace.h>
10246 #include <linux/string.h>
10247 +#include <linux/unwind.h>
10248 +#include <linux/delay.h>
10249 #include <linux/errno.h>
10250 +#include <linux/kexec.h>
10251 +#include <linux/sched.h>
10252 #include <linux/timer.h>
10253 -#include <linux/mm.h>
10254 #include <linux/init.h>
10255 -#include <linux/delay.h>
10256 -#include <linux/spinlock.h>
10257 -#include <linux/interrupt.h>
10258 -#include <linux/highmem.h>
10259 -#include <linux/kallsyms.h>
10260 -#include <linux/ptrace.h>
10261 -#include <linux/utsname.h>
10262 -#include <linux/kprobes.h>
10263 -#include <linux/kexec.h>
10264 -#include <linux/unwind.h>
10265 -#include <linux/uaccess.h>
10266 -#include <linux/nmi.h>
10267 #include <linux/bug.h>
10268 +#include <linux/nmi.h>
10269 +#include <linux/mm.h>
10270
10271 #ifdef CONFIG_EISA
10272 #include <linux/ioport.h>
10273 @@ -43,21 +45,18 @@
10274 #include <linux/edac.h>
10275 #endif
10276
10277 +#include <asm/arch_hooks.h>
10278 +#include <asm/stacktrace.h>
10279 #include <asm/processor.h>
10280 -#include <asm/system.h>
10281 -#include <asm/io.h>
10282 -#include <asm/atomic.h>
10283 #include <asm/debugreg.h>
10284 +#include <asm/atomic.h>
10285 +#include <asm/system.h>
10286 +#include <asm/unwind.h>
10287 #include <asm/desc.h>
10288 #include <asm/i387.h>
10289 #include <asm/nmi.h>
10290 -#include <asm/unwind.h>
10291 #include <asm/smp.h>
10292 -#include <asm/arch_hooks.h>
10293 -#include <linux/kdebug.h>
10294 -#include <asm/stacktrace.h>
10295 -
10296 -#include <linux/module.h>
10297 +#include <asm/io.h>
10298
10299 #include "mach_traps.h"
10300
10301 @@ -71,7 +70,7 @@ EXPORT_SYMBOL_GPL(used_vectors);
10302 asmlinkage int system_call(void);
10303
10304 /* Do we ignore FPU interrupts ? */
10305 -char ignore_fpu_irq = 0;
10306 +char ignore_fpu_irq;
10307
10308 #ifndef CONFIG_X86_NO_IDT
10309 /*
10310 @@ -113,12 +112,13 @@ static unsigned int code_bytes = 64;
10311 void printk_address(unsigned long address, int reliable)
10312 {
10313 #ifdef CONFIG_KALLSYMS
10314 - unsigned long offset = 0, symsize;
10315 + char namebuf[KSYM_NAME_LEN];
10316 + unsigned long offset = 0;
10317 + unsigned long symsize;
10318 const char *symname;
10319 - char *modname;
10320 - char *delim = ":";
10321 - char namebuf[128];
10322 char reliab[4] = "";
10323 + char *delim = ":";
10324 + char *modname;
10325
10326 symname = kallsyms_lookup(address, &symsize, &offset,
10327 &modname, namebuf);
10328 @@ -146,13 +146,14 @@ static inline int valid_stack_ptr(struct
10329
10330 /* The form of the top of the frame on the stack */
10331 struct stack_frame {
10332 - struct stack_frame *next_frame;
10333 - unsigned long return_address;
10334 + struct stack_frame *next_frame;
10335 + unsigned long return_address;
10336 };
10337
10338 -static inline unsigned long print_context_stack(struct thread_info *tinfo,
10339 - unsigned long *stack, unsigned long bp,
10340 - const struct stacktrace_ops *ops, void *data)
10341 +static inline unsigned long
10342 +print_context_stack(struct thread_info *tinfo,
10343 + unsigned long *stack, unsigned long bp,
10344 + const struct stacktrace_ops *ops, void *data)
10345 {
10346 struct stack_frame *frame = (struct stack_frame *)bp;
10347
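
print_context_stack() above walks the i386 frame chain: every frame begins with the saved frame pointer (next_frame) followed by the return address, so following next_frame yields the call trace. The sketch below performs the same walk in userspace via __builtin_frame_address(); it assumes an x86-style frame layout and a build that keeps frame pointers (no optimisation, or -fno-omit-frame-pointer), and the helper names are invented for the example.

/* Frame-pointer walk in userspace (sketch; assumes frame pointers are kept). */
#include <stdio.h>

struct stack_frame {
        struct stack_frame *next_frame;   /* saved frame pointer of the caller */
        unsigned long return_address;
};

static __attribute__((noinline)) void backtrace_here(void)
{
        struct stack_frame *frame = __builtin_frame_address(0);
        int depth;

        /* walk a few saved frame pointers; each points at the caller's frame */
        for (depth = 0; frame && depth < 3; depth++) {
                printf("#%d  return address: %#lx\n",
                       depth, frame->return_address);
                if (frame->next_frame <= frame)   /* the stack grows downwards */
                        break;
                frame = frame->next_frame;
        }
}

static __attribute__((noinline)) void level2(void) { backtrace_here(); }
static __attribute__((noinline)) void level1(void) { level2(); }

int main(void)
{
        level1();
        return 0;
}
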
10348 @@ -174,7 +175,7 @@ static inline unsigned long print_contex
10349 return bp;
10350 }
10351
10352 -#define MSG(msg) ops->warning(data, msg)
10353 +#define MSG(msg) ops->warning(data, msg)
10354
10355 void dump_trace(struct task_struct *task, struct pt_regs *regs,
10356 unsigned long *stack, unsigned long bp,
10357 @@ -185,6 +186,7 @@ void dump_trace(struct task_struct *task
10358
10359 if (!stack) {
10360 unsigned long dummy;
10361 +
10362 stack = &dummy;
10363 if (task != current)
10364 stack = (unsigned long *)task->thread.sp;
10365 @@ -194,7 +196,7 @@ void dump_trace(struct task_struct *task
10366 if (!bp) {
10367 if (task == current) {
10368 /* Grab bp right from our regs */
10369 - asm ("movl %%ebp, %0" : "=r" (bp) : );
10370 + asm("movl %%ebp, %0" : "=r" (bp) :);
10371 } else {
10372 /* bp is the last reg pushed by switch_to */
10373 bp = *(unsigned long *) task->thread.sp;
10374 @@ -204,15 +206,18 @@ void dump_trace(struct task_struct *task
10375
10376 while (1) {
10377 struct thread_info *context;
10378 +
10379 context = (struct thread_info *)
10380 ((unsigned long)stack & (~(THREAD_SIZE - 1)));
10381 bp = print_context_stack(context, stack, bp, ops, data);
10382 - /* Should be after the line below, but somewhere
10383 - in early boot context comes out corrupted and we
10384 - can't reference it -AK */
10385 + /*
10386 + * Should be after the line below, but somewhere
10387 + * in early boot context comes out corrupted and we
10388 + * can't reference it:
10389 + */
10390 if (ops->stack(data, "IRQ") < 0)
10391 break;
10392 - stack = (unsigned long*)context->previous_esp;
10393 + stack = (unsigned long *)context->previous_esp;
10394 if (!stack)
10395 break;
10396 touch_nmi_watchdog();
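
dump_trace() above finds the thread_info for whichever stack it is currently walking by masking the stack address down to a THREAD_SIZE-aligned base, then hops to the previous stack via previous_esp when it leaves an IRQ stack. The power-of-two masking step on its own, as a standalone sketch (the THREAD_SIZE value and the userspace setting are assumptions; outside the kernel the result is merely the aligned block base, not a thread_info):

/* Power-of-two base masking, as used to locate thread_info (sketch). */
#include <stdio.h>
#include <stdint.h>

#define THREAD_SIZE 8192UL      /* must be a power of two */

static uintptr_t stack_base(uintptr_t sp)
{
        /* clear the low bits: any address inside the block maps to its base */
        return sp & ~(THREAD_SIZE - 1);
}

int main(void)
{
        int local;
        uintptr_t sp = (uintptr_t)&local;

        printf("sp         = %#lx\n", (unsigned long)sp);
        printf("block base = %#lx\n", (unsigned long)stack_base(sp));
        return 0;
}
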
10397 @@ -251,15 +256,15 @@ static void print_trace_address(void *da
10398 }
10399
10400 static const struct stacktrace_ops print_trace_ops = {
10401 - .warning = print_trace_warning,
10402 - .warning_symbol = print_trace_warning_symbol,
10403 - .stack = print_trace_stack,
10404 - .address = print_trace_address,
10405 + .warning = print_trace_warning,
10406 + .warning_symbol = print_trace_warning_symbol,
10407 + .stack = print_trace_stack,
10408 + .address = print_trace_address,
10409 };
10410
10411 static void
10412 show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
10413 - unsigned long *stack, unsigned long bp, char *log_lvl)
10414 + unsigned long *stack, unsigned long bp, char *log_lvl)
10415 {
10416 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
10417 printk("%s =======================\n", log_lvl);
10418 @@ -271,21 +276,22 @@ void show_trace(struct task_struct *task
10419 show_trace_log_lvl(task, regs, stack, bp, "");
10420 }
10421
10422 -static void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
10423 - unsigned long *sp, unsigned long bp, char *log_lvl)
10424 +static void
10425 +show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
10426 + unsigned long *sp, unsigned long bp, char *log_lvl)
10427 {
10428 unsigned long *stack;
10429 int i;
10430
10431 if (sp == NULL) {
10432 if (task)
10433 - sp = (unsigned long*)task->thread.sp;
10434 + sp = (unsigned long *)task->thread.sp;
10435 else
10436 sp = (unsigned long *)&sp;
10437 }
10438
10439 stack = sp;
10440 - for(i = 0; i < kstack_depth_to_print; i++) {
10441 + for (i = 0; i < kstack_depth_to_print; i++) {
10442 if (kstack_end(stack))
10443 break;
10444 if (i && ((i % 8) == 0))
10445 @@ -293,6 +299,7 @@ static void show_stack_log_lvl(struct ta
10446 printk("%08lx ", *stack++);
10447 }
10448 printk("\n%sCall Trace:\n", log_lvl);
10449 +
10450 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
10451 }
10452
10453 @@ -307,8 +314,8 @@ void show_stack(struct task_struct *task
10454 */
10455 void dump_stack(void)
10456 {
10457 - unsigned long stack;
10458 unsigned long bp = 0;
10459 + unsigned long stack;
10460
10461 #ifdef CONFIG_FRAME_POINTER
10462 if (!bp)
10463 @@ -320,6 +327,7 @@ void dump_stack(void)
10464 init_utsname()->release,
10465 (int)strcspn(init_utsname()->version, " "),
10466 init_utsname()->version);
10467 +
10468 show_trace(current, NULL, &stack, bp);
10469 }
10470
10471 @@ -331,6 +339,7 @@ void show_registers(struct pt_regs *regs
10472
10473 print_modules();
10474 __show_registers(regs, 0);
10475 +
10476 printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)",
10477 TASK_COMM_LEN, current->comm, task_pid_nr(current),
10478 current_thread_info(), current, task_thread_info(current));
10479 @@ -339,10 +348,10 @@ void show_registers(struct pt_regs *regs
10480 * time of the fault..
10481 */
10482 if (!user_mode_vm(regs)) {
10483 - u8 *ip;
10484 unsigned int code_prologue = code_bytes * 43 / 64;
10485 unsigned int code_len = code_bytes;
10486 unsigned char c;
10487 + u8 *ip;
10488
10489 printk("\n" KERN_EMERG "Stack: ");
10490 show_stack_log_lvl(NULL, regs, &regs->sp, 0, KERN_EMERG);
10491 @@ -369,7 +378,7 @@ void show_registers(struct pt_regs *regs
10492 }
10493 }
10494 printk("\n");
10495 -}
10496 +}
10497
10498 int is_valid_bugaddr(unsigned long ip)
10499 {
10500 @@ -385,10 +394,10 @@ int is_valid_bugaddr(unsigned long ip)
10501
10502 static int die_counter;
10503
10504 -int __kprobes __die(const char * str, struct pt_regs * regs, long err)
10505 +int __kprobes __die(const char *str, struct pt_regs *regs, long err)
10506 {
10507 - unsigned long sp;
10508 unsigned short ss;
10509 + unsigned long sp;
10510
10511 printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
10512 #ifdef CONFIG_PREEMPT
10513 @@ -403,8 +412,8 @@ int __kprobes __die(const char * str, st
10514 printk("\n");
10515
10516 if (notify_die(DIE_OOPS, str, regs, err,
10517 - current->thread.trap_no, SIGSEGV) !=
10518 - NOTIFY_STOP) {
10519 + current->thread.trap_no, SIGSEGV) != NOTIFY_STOP) {
10520 +
10521 show_registers(regs);
10522 /* Executive summary in case the oops scrolled away */
10523 sp = (unsigned long) (&regs->sp);
10524 @@ -416,17 +425,18 @@ int __kprobes __die(const char * str, st
10525 printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
10526 print_symbol("%s", regs->ip);
10527 printk(" SS:ESP %04x:%08lx\n", ss, sp);
10528 +
10529 return 0;
10530 - } else {
10531 - return 1;
10532 }
10533 +
10534 + return 1;
10535 }
10536
10537 /*
10538 - * This is gone through when something in the kernel has done something bad and
10539 - * is about to be terminated.
10540 + * This is gone through when something in the kernel has done something bad
10541 + * and is about to be terminated:
10542 */
10543 -void die(const char * str, struct pt_regs * regs, long err)
10544 +void die(const char *str, struct pt_regs *regs, long err)
10545 {
10546 static struct {
10547 raw_spinlock_t lock;
10548 @@ -448,8 +458,9 @@ void die(const char * str, struct pt_reg
10549 die.lock_owner = smp_processor_id();
10550 die.lock_owner_depth = 0;
10551 bust_spinlocks(1);
10552 - } else
10553 + } else {
10554 raw_local_irq_save(flags);
10555 + }
10556
10557 if (++die.lock_owner_depth < 3) {
10558 report_bug(regs->ip, regs);
10559 @@ -482,19 +493,20 @@ void die(const char * str, struct pt_reg
10560 do_exit(SIGSEGV);
10561 }
10562
10563 -static inline void die_if_kernel(const char * str, struct pt_regs * regs, long err)
10564 +static inline void
10565 +die_if_kernel(const char *str, struct pt_regs *regs, long err)
10566 {
10567 if (!user_mode_vm(regs))
10568 die(str, regs, err);
10569 }
10570
10571 -static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86,
10572 - struct pt_regs * regs, long error_code,
10573 - siginfo_t *info)
10574 +static void __kprobes
10575 +do_trap(int trapnr, int signr, char *str, int vm86, struct pt_regs *regs,
10576 + long error_code, siginfo_t *info)
10577 {
10578 struct task_struct *tsk = current;
10579
10580 - if (regs->flags & VM_MASK) {
10581 + if (regs->flags & X86_VM_MASK) {
10582 if (vm86)
10583 goto vm86_trap;
10584 goto trap_signal;
10585 @@ -503,109 +515,112 @@ static void __kprobes do_trap(int trapnr
10586 if (!user_mode(regs))
10587 goto kernel_trap;
10588
10589 - trap_signal: {
10590 - /*
10591 - * We want error_code and trap_no set for userspace faults and
10592 - * kernelspace faults which result in die(), but not
10593 - * kernelspace faults which are fixed up. die() gives the
10594 - * process no chance to handle the signal and notice the
10595 - * kernel fault information, so that won't result in polluting
10596 - * the information about previously queued, but not yet
10597 - * delivered, faults. See also do_general_protection below.
10598 - */
10599 - tsk->thread.error_code = error_code;
10600 - tsk->thread.trap_no = trapnr;
10601 +trap_signal:
10602 + /*
10603 + * We want error_code and trap_no set for userspace faults and
10604 + * kernelspace faults which result in die(), but not
10605 + * kernelspace faults which are fixed up. die() gives the
10606 + * process no chance to handle the signal and notice the
10607 + * kernel fault information, so that won't result in polluting
10608 + * the information about previously queued, but not yet
10609 + * delivered, faults. See also do_general_protection below.
10610 + */
10611 + tsk->thread.error_code = error_code;
10612 + tsk->thread.trap_no = trapnr;
10613
10614 - if (info)
10615 - force_sig_info(signr, info, tsk);
10616 - else
10617 - force_sig(signr, tsk);
10618 - return;
10619 - }
10620 + if (info)
10621 + force_sig_info(signr, info, tsk);
10622 + else
10623 + force_sig(signr, tsk);
10624 + return;
10625
10626 - kernel_trap: {
10627 - if (!fixup_exception(regs)) {
10628 - tsk->thread.error_code = error_code;
10629 - tsk->thread.trap_no = trapnr;
10630 - die(str, regs, error_code);
10631 - }
10632 - return;
10633 +kernel_trap:
10634 + if (!fixup_exception(regs)) {
10635 + tsk->thread.error_code = error_code;
10636 + tsk->thread.trap_no = trapnr;
10637 + die(str, regs, error_code);
10638 }
10639 + return;
10640
10641 - vm86_trap: {
10642 - int ret = handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, trapnr);
10643 - if (ret) goto trap_signal;
10644 - return;
10645 - }
10646 +vm86_trap:
10647 + if (handle_vm86_trap((struct kernel_vm86_regs *) regs,
10648 + error_code, trapnr))
10649 + goto trap_signal;
10650 + return;
10651 }
10652
10653 -#define DO_ERROR(trapnr, signr, str, name) \
10654 -void do_##name(struct pt_regs * regs, long error_code) \
10655 -{ \
10656 - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
10657 - == NOTIFY_STOP) \
10658 - return; \
10659 - do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \
10660 -}
10661 -
10662 -#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr, irq) \
10663 -void do_##name(struct pt_regs * regs, long error_code) \
10664 -{ \
10665 - siginfo_t info; \
10666 - if (irq) \
10667 - local_irq_enable(); \
10668 - info.si_signo = signr; \
10669 - info.si_errno = 0; \
10670 - info.si_code = sicode; \
10671 - info.si_addr = (void __user *)siaddr; \
10672 - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
10673 - == NOTIFY_STOP) \
10674 - return; \
10675 - do_trap(trapnr, signr, str, 0, regs, error_code, &info); \
10676 -}
10677 -
10678 -#define DO_VM86_ERROR(trapnr, signr, str, name) \
10679 -void do_##name(struct pt_regs * regs, long error_code) \
10680 -{ \
10681 - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
10682 - == NOTIFY_STOP) \
10683 - return; \
10684 - do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \
10685 -}
10686 -
10687 -#define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
10688 -void do_##name(struct pt_regs * regs, long error_code) \
10689 -{ \
10690 - siginfo_t info; \
10691 - info.si_signo = signr; \
10692 - info.si_errno = 0; \
10693 - info.si_code = sicode; \
10694 - info.si_addr = (void __user *)siaddr; \
10695 - trace_hardirqs_fixup(); \
10696 - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
10697 - == NOTIFY_STOP) \
10698 - return; \
10699 - do_trap(trapnr, signr, str, 1, regs, error_code, &info); \
10700 +#define DO_ERROR(trapnr, signr, str, name) \
10701 +void do_##name(struct pt_regs *regs, long error_code) \
10702 +{ \
10703 + trace_hardirqs_fixup(); \
10704 + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
10705 + == NOTIFY_STOP) \
10706 + return; \
10707 + do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \
10708 +}
10709 +
10710 +#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr, irq) \
10711 +void do_##name(struct pt_regs *regs, long error_code) \
10712 +{ \
10713 + siginfo_t info; \
10714 + if (irq) \
10715 + local_irq_enable(); \
10716 + info.si_signo = signr; \
10717 + info.si_errno = 0; \
10718 + info.si_code = sicode; \
10719 + info.si_addr = (void __user *)siaddr; \
10720 + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
10721 + == NOTIFY_STOP) \
10722 + return; \
10723 + do_trap(trapnr, signr, str, 0, regs, error_code, &info); \
10724 +}
10725 +
10726 +#define DO_VM86_ERROR(trapnr, signr, str, name) \
10727 +void do_##name(struct pt_regs *regs, long error_code) \
10728 +{ \
10729 + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
10730 + == NOTIFY_STOP) \
10731 + return; \
10732 + do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \
10733 +}
10734 +
10735 +#define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
10736 +void do_##name(struct pt_regs *regs, long error_code) \
10737 +{ \
10738 + siginfo_t info; \
10739 + info.si_signo = signr; \
10740 + info.si_errno = 0; \
10741 + info.si_code = sicode; \
10742 + info.si_addr = (void __user *)siaddr; \
10743 + trace_hardirqs_fixup(); \
10744 + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
10745 + == NOTIFY_STOP) \
10746 + return; \
10747 + do_trap(trapnr, signr, str, 1, regs, error_code, &info); \
10748 }
10749
10750 -DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
10751 +DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
10752 #ifndef CONFIG_KPROBES
10753 -DO_VM86_ERROR( 3, SIGTRAP, "int3", int3)
10754 +DO_VM86_ERROR(3, SIGTRAP, "int3", int3)
10755 #endif
10756 -DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow)
10757 -DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds)
10758 -DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0)
10759 -DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
10760 +DO_VM86_ERROR(4, SIGSEGV, "overflow", overflow)
10761 +DO_VM86_ERROR(5, SIGSEGV, "bounds", bounds)
10762 +DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0)
10763 +DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
10764 DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
10765 DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
10766 DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
10767 DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0)
10768 -DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0, 1)
10769 +DO_ERROR_INFO(32, SIGILL, "iret exception", iret_error, ILL_BADSTK, 0, 1)
10770
10771 void __kprobes do_general_protection(struct pt_regs * regs,
10772 long error_code)
10773 {
10774 - if (regs->flags & VM_MASK)
10775 + struct thread_struct *thread;
10776 +
10777 + thread = &current->thread;
10778 +
10779 + if (regs->flags & X86_VM_MASK)
10780 goto gp_in_vm86;
10781
10782 if (!user_mode(regs))
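
The DO_ERROR*() macros reindented above stamp out one do_<name>() handler per exception, baking the trap number, signal and message string into each generated function before handing off to do_trap(). A stripped-down illustration of the same token-pasting idiom, with placeholder printf bodies instead of the kernel's do_trap() path:

/* Generator-macro idiom used by DO_ERROR*() (placeholder bodies only). */
#include <signal.h>
#include <stdio.h>

#define DEFINE_TRAP(trapnr, signr, str, name)                            \
static void do_##name(long error_code)                                   \
{                                                                        \
        printf("trap %2d (%s): would raise signal %d, error_code=%ld\n", \
               (trapnr), (str), (signr), error_code);                    \
}

DEFINE_TRAP( 0, SIGFPE,  "divide error", divide_error)
DEFINE_TRAP( 4, SIGSEGV, "overflow",     overflow)
DEFINE_TRAP(10, SIGSEGV, "invalid TSS",  invalid_TSS)

int main(void)
{
        do_divide_error(0);
        do_overflow(0);
        do_invalid_TSS(0xa);
        return 0;
}
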
10783 @@ -613,6 +628,7 @@ void __kprobes do_general_protection(str
10784
10785 current->thread.error_code = error_code;
10786 current->thread.trap_no = 13;
10787 +
10788 if (show_unhandled_signals && unhandled_signal(current, SIGSEGV) &&
10789 printk_ratelimit()) {
10790 printk(KERN_INFO
10791 @@ -642,22 +658,25 @@ gp_in_kernel:
10792 }
10793 }
10794
10795 -static __kprobes void
10796 -mem_parity_error(unsigned char reason, struct pt_regs * regs)
10797 +static notrace __kprobes void
10798 +mem_parity_error(unsigned char reason, struct pt_regs *regs)
10799 {
10800 - printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on "
10801 - "CPU %d.\n", reason, smp_processor_id());
10802 - printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
10803 + printk(KERN_EMERG
10804 + "Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
10805 + reason, smp_processor_id());
10806 +
10807 + printk(KERN_EMERG
10808 + "You have some hardware problem, likely on the PCI bus.\n");
10809
10810 #if defined(CONFIG_EDAC)
10811 - if(edac_handler_set()) {
10812 + if (edac_handler_set()) {
10813 edac_atomic_assert_error();
10814 return;
10815 }
10816 #endif
10817
10818 if (panic_on_unrecovered_nmi)
10819 - panic("NMI: Not continuing");
10820 + panic("NMI: Not continuing");
10821
10822 printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
10823
10824 @@ -665,8 +684,8 @@ mem_parity_error(unsigned char reason, s
10825 clear_mem_error(reason);
10826 }
10827
10828 -static __kprobes void
10829 -io_check_error(unsigned char reason, struct pt_regs * regs)
10830 +static notrace __kprobes void
10831 +io_check_error(unsigned char reason, struct pt_regs *regs)
10832 {
10833 printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n");
10834 show_registers(regs);
10835 @@ -675,38 +694,43 @@ io_check_error(unsigned char reason, str
10836 clear_io_check_error(reason);
10837 }
10838
10839 -static __kprobes void
10840 -unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
10841 +static notrace __kprobes void
10842 +unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
10843 {
10844 + if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
10845 + return;
10846 #ifdef CONFIG_MCA
10847 - /* Might actually be able to figure out what the guilty party
10848 - * is. */
10849 - if( MCA_bus ) {
10850 + /*
10851 + * Might actually be able to figure out what the guilty party
10852 + * is:
10853 + */
10854 + if (MCA_bus) {
10855 mca_handle_nmi();
10856 return;
10857 }
10858 #endif
10859 - printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on "
10860 - "CPU %d.\n", reason, smp_processor_id());
10861 + printk(KERN_EMERG
10862 + "Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
10863 + reason, smp_processor_id());
10864 +
10865 printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n");
10866 if (panic_on_unrecovered_nmi)
10867 - panic("NMI: Not continuing");
10868 + panic("NMI: Not continuing");
10869
10870 printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
10871 }
10872
10873 static DEFINE_SPINLOCK(nmi_print_lock);
10874
10875 -void __kprobes die_nmi(struct pt_regs *regs, const char *msg)
10876 +void notrace __kprobes die_nmi(struct pt_regs *regs, const char *msg)
10877 {
10878 - if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 2, SIGINT) ==
10879 - NOTIFY_STOP)
10880 + if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 2, SIGINT) == NOTIFY_STOP)
10881 return;
10882
10883 spin_lock(&nmi_print_lock);
10884 /*
10885 * We are in trouble anyway, lets at least try
10886 - * to get a message out.
10887 + * to get a message out:
10888 */
10889 bust_spinlocks(1);
10890 printk(KERN_EMERG "%s", msg);
10891 @@ -717,9 +741,10 @@ void __kprobes die_nmi(struct pt_regs *r
10892 spin_unlock(&nmi_print_lock);
10893 bust_spinlocks(0);
10894
10895 - /* If we are in kernel we are probably nested up pretty bad
10896 - * and might aswell get out now while we still can.
10897 - */
10898 + /*
10899 + * If we are in kernel we are probably nested up pretty bad
10900 + * and might aswell get out now while we still can:
10901 + */
10902 if (!user_mode_vm(regs)) {
10903 current->thread.trap_no = 2;
10904 crash_kexec(regs);
10905 @@ -728,14 +753,14 @@ void __kprobes die_nmi(struct pt_regs *r
10906 do_exit(SIGSEGV);
10907 }
10908
10909 -static __kprobes void default_do_nmi(struct pt_regs * regs)
10910 +static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
10911 {
10912 unsigned char reason = 0;
10913
10914 - /* Only the BSP gets external NMIs from the system. */
10915 + /* Only the BSP gets external NMIs from the system: */
10916 if (!smp_processor_id())
10917 reason = get_nmi_reason();
10918 -
10919 +
10920 if (!(reason & 0xc0)) {
10921 if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
10922 == NOTIFY_STOP)
10923 @@ -748,8 +773,10 @@ static __kprobes void default_do_nmi(str
10924 if (nmi_watchdog_tick(regs, reason))
10925 return;
10926 if (!do_nmi_callback(regs, smp_processor_id()))
10927 -#endif
10928 unknown_nmi_error(reason, regs);
10929 +#else
10930 + unknown_nmi_error(reason, regs);
10931 +#endif
10932
10933 return;
10934 }
10935 @@ -761,14 +788,14 @@ static __kprobes void default_do_nmi(str
10936 io_check_error(reason, regs);
10937 /*
10938 * Reassert NMI in case it became active meanwhile
10939 - * as it's edge-triggered.
10940 + * as it's edge-triggered:
10941 */
10942 reassert_nmi();
10943 }
10944
10945 static int ignore_nmis;
10946
10947 -__kprobes void do_nmi(struct pt_regs * regs, long error_code)
10948 +notrace __kprobes void do_nmi(struct pt_regs *regs, long error_code)
10949 {
10950 int cpu;
10951
10952 @@ -804,9 +831,12 @@ void __kprobes do_int3(struct pt_regs *r
10953 if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
10954 == NOTIFY_STOP)
10955 return;
10956 - /* This is an interrupt gate, because kprobes wants interrupts
10957 - disabled. Normal trap handlers don't. */
10958 + /*
10959 + * This is an interrupt gate, because kprobes wants interrupts
10960 + * disabled. Normal trap handlers don't.
10961 + */
10962 restore_interrupts(regs);
10963 +
10964 do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL);
10965 }
10966 #endif
10967 @@ -821,7 +851,7 @@ void __kprobes do_int3(struct pt_regs *r
10968 * from user space. Such code must not hold kernel locks (since it
10969 * can equally take a page fault), therefore it is safe to call
10970 * force_sig_info even though that claims and releases locks.
10971 - *
10972 + *
10973 * Code in ./signal.c ensures that the debug control register
10974 * is restored before we deliver any signal, and therefore that
10975 * user code runs with the correct debug control register even though
10976 @@ -833,10 +863,10 @@ void __kprobes do_int3(struct pt_regs *r
10977 * find every occurrence of the TF bit that could be saved away even
10978 * by user code)
10979 */
10980 -void __kprobes do_debug(struct pt_regs * regs, long error_code)
10981 +void __kprobes do_debug(struct pt_regs *regs, long error_code)
10982 {
10983 - unsigned int condition;
10984 struct task_struct *tsk = current;
10985 + unsigned int condition;
10986
10987 trace_hardirqs_fixup();
10988
10989 @@ -861,7 +891,7 @@ void __kprobes do_debug(struct pt_regs *
10990 goto clear_dr7;
10991 }
10992
10993 - if (regs->flags & VM_MASK)
10994 + if (regs->flags & X86_VM_MASK)
10995 goto debug_vm86;
10996
10997 /* Save debug status register where ptrace can see it */
10998 @@ -884,7 +914,8 @@ void __kprobes do_debug(struct pt_regs *
10999 /* Ok, finally something we can handle */
11000 send_sigtrap(tsk, regs, error_code);
11001
11002 - /* Disable additional traps. They'll be re-enabled when
11003 + /*
11004 + * Disable additional traps. They'll be re-enabled when
11005 * the signal is delivered.
11006 */
11007 clear_dr7:
11008 @@ -897,7 +928,7 @@ debug_vm86:
11009
11010 clear_TF_reenable:
11011 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
11012 - regs->flags &= ~TF_MASK;
11013 + regs->flags &= ~X86_EFLAGS_TF;
11014 return;
11015 }
11016
11017 @@ -908,9 +939,10 @@ clear_TF_reenable:
11018 */
11019 void math_error(void __user *ip)
11020 {
11021 - struct task_struct * task;
11022 + struct task_struct *task;
11023 + unsigned short cwd;
11024 + unsigned short swd;
11025 siginfo_t info;
11026 - unsigned short cwd, swd;
11027
11028 /*
11029 * Save the info for the exception handler and clear the error.
11030 @@ -936,36 +968,36 @@ void math_error(void __user *ip)
11031 cwd = get_fpu_cwd(task);
11032 swd = get_fpu_swd(task);
11033 switch (swd & ~cwd & 0x3f) {
11034 - case 0x000: /* No unmasked exception */
11035 - return;
11036 - default: /* Multiple exceptions */
11037 - break;
11038 - case 0x001: /* Invalid Op */
11039 - /*
11040 - * swd & 0x240 == 0x040: Stack Underflow
11041 - * swd & 0x240 == 0x240: Stack Overflow
11042 - * User must clear the SF bit (0x40) if set
11043 - */
11044 - info.si_code = FPE_FLTINV;
11045 - break;
11046 - case 0x002: /* Denormalize */
11047 - case 0x010: /* Underflow */
11048 - info.si_code = FPE_FLTUND;
11049 - break;
11050 - case 0x004: /* Zero Divide */
11051 - info.si_code = FPE_FLTDIV;
11052 - break;
11053 - case 0x008: /* Overflow */
11054 - info.si_code = FPE_FLTOVF;
11055 - break;
11056 - case 0x020: /* Precision */
11057 - info.si_code = FPE_FLTRES;
11058 - break;
11059 + case 0x000: /* No unmasked exception */
11060 + return;
11061 + default: /* Multiple exceptions */
11062 + break;
11063 + case 0x001: /* Invalid Op */
11064 + /*
11065 + * swd & 0x240 == 0x040: Stack Underflow
11066 + * swd & 0x240 == 0x240: Stack Overflow
11067 + * User must clear the SF bit (0x40) if set
11068 + */
11069 + info.si_code = FPE_FLTINV;
11070 + break;
11071 + case 0x002: /* Denormalize */
11072 + case 0x010: /* Underflow */
11073 + info.si_code = FPE_FLTUND;
11074 + break;
11075 + case 0x004: /* Zero Divide */
11076 + info.si_code = FPE_FLTDIV;
11077 + break;
11078 + case 0x008: /* Overflow */
11079 + info.si_code = FPE_FLTOVF;
11080 + break;
11081 + case 0x020: /* Precision */
11082 + info.si_code = FPE_FLTRES;
11083 + break;
11084 }
11085 force_sig_info(SIGFPE, &info, task);
11086 }
11087
11088 -void do_coprocessor_error(struct pt_regs * regs, long error_code)
11089 +void do_coprocessor_error(struct pt_regs *regs, long error_code)
11090 {
11091 ignore_fpu_irq = 1;
11092 math_error((void __user *)regs->ip);
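The math_error() hunk above maps the unmasked x87 exception bits (swd & ~cwd & 0x3f) onto SIGFPE si_code values. Below is a minimal stand-alone sketch of that decode, compilable in user space; the FPE_* constants come from <signal.h>, and the cwd/swd inputs are made-up test values rather than real FPU state. The choice of FPE_FLTINV for the "multiple exceptions" case is a pick for the sketch only (the kernel code simply leaves the earlier default code in place).

#define _XOPEN_SOURCE 700
#include <signal.h>
#include <stdio.h>

/* Decode unmasked x87 exceptions into a SIGFPE si_code, mirroring the
 * switch in math_error(): invalid op, denormal/underflow, divide by
 * zero, overflow and precision each get their own code; 0 means no
 * unmasked exception is pending. */
static int fpu_swd_to_sicode(unsigned short cwd, unsigned short swd)
{
    switch (swd & ~cwd & 0x3f) {
    case 0x000:                 /* no unmasked exception */
        return 0;
    case 0x001:                 /* invalid operation */
        return FPE_FLTINV;
    case 0x002:                 /* denormalized operand */
    case 0x010:                 /* underflow */
        return FPE_FLTUND;
    case 0x004:                 /* divide by zero */
        return FPE_FLTDIV;
    case 0x008:                 /* overflow */
        return FPE_FLTOVF;
    case 0x020:                 /* precision (inexact) */
        return FPE_FLTRES;
    default:                    /* several exceptions at once */
        return FPE_FLTINV;      /* arbitrary pick for this sketch */
    }
}

int main(void)
{
    /* cwd = 0x003f masks all exceptions; cwd = 0 unmasks them all. */
    printf("div-by-zero, unmasked: si_code=%d\n", fpu_swd_to_sicode(0x0000, 0x0004));
    printf("div-by-zero, masked:   si_code=%d\n", fpu_swd_to_sicode(0x003f, 0x0004));
    return 0;
}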
11093 @@ -973,9 +1005,9 @@ void do_coprocessor_error(struct pt_regs
11094
11095 static void simd_math_error(void __user *ip)
11096 {
11097 - struct task_struct * task;
11098 - siginfo_t info;
11099 + struct task_struct *task;
11100 unsigned short mxcsr;
11101 + siginfo_t info;
11102
11103 /*
11104 * Save the info for the exception handler and clear the error.
11105 @@ -996,84 +1028,82 @@ static void simd_math_error(void __user
11106 */
11107 mxcsr = get_fpu_mxcsr(task);
11108 switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
11109 - case 0x000:
11110 - default:
11111 - break;
11112 - case 0x001: /* Invalid Op */
11113 - info.si_code = FPE_FLTINV;
11114 - break;
11115 - case 0x002: /* Denormalize */
11116 - case 0x010: /* Underflow */
11117 - info.si_code = FPE_FLTUND;
11118 - break;
11119 - case 0x004: /* Zero Divide */
11120 - info.si_code = FPE_FLTDIV;
11121 - break;
11122 - case 0x008: /* Overflow */
11123 - info.si_code = FPE_FLTOVF;
11124 - break;
11125 - case 0x020: /* Precision */
11126 - info.si_code = FPE_FLTRES;
11127 - break;
11128 + case 0x000:
11129 + default:
11130 + break;
11131 + case 0x001: /* Invalid Op */
11132 + info.si_code = FPE_FLTINV;
11133 + break;
11134 + case 0x002: /* Denormalize */
11135 + case 0x010: /* Underflow */
11136 + info.si_code = FPE_FLTUND;
11137 + break;
11138 + case 0x004: /* Zero Divide */
11139 + info.si_code = FPE_FLTDIV;
11140 + break;
11141 + case 0x008: /* Overflow */
11142 + info.si_code = FPE_FLTOVF;
11143 + break;
11144 + case 0x020: /* Precision */
11145 + info.si_code = FPE_FLTRES;
11146 + break;
11147 }
11148 force_sig_info(SIGFPE, &info, task);
11149 }
11150
11151 -void do_simd_coprocessor_error(struct pt_regs * regs,
11152 - long error_code)
11153 +void do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
11154 {
11155 if (cpu_has_xmm) {
11156 /* Handle SIMD FPU exceptions on PIII+ processors. */
11157 ignore_fpu_irq = 1;
11158 simd_math_error((void __user *)regs->ip);
11159 - } else {
11160 - /*
11161 - * Handle strange cache flush from user space exception
11162 - * in all other cases. This is undocumented behaviour.
11163 - */
11164 - if (regs->flags & VM_MASK) {
11165 - handle_vm86_fault((struct kernel_vm86_regs *)regs,
11166 - error_code);
11167 - return;
11168 - }
11169 - current->thread.trap_no = 19;
11170 - current->thread.error_code = error_code;
11171 - die_if_kernel("cache flush denied", regs, error_code);
11172 - force_sig(SIGSEGV, current);
11173 + return;
11174 + }
11175 + /*
11176 + * Handle strange cache flush from user space exception
11177 + * in all other cases. This is undocumented behaviour.
11178 + */
11179 + if (regs->flags & X86_VM_MASK) {
11180 + handle_vm86_fault((struct kernel_vm86_regs *)regs, error_code);
11181 + return;
11182 }
11183 + current->thread.trap_no = 19;
11184 + current->thread.error_code = error_code;
11185 + die_if_kernel("cache flush denied", regs, error_code);
11186 + force_sig(SIGSEGV, current);
11187 }
11188
11189 #ifndef CONFIG_XEN
11190 -void do_spurious_interrupt_bug(struct pt_regs * regs,
11191 - long error_code)
11192 +void do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
11193 {
11194 #if 0
11195 /* No need to warn about this any longer. */
11196 - printk("Ignoring P6 Local APIC Spurious Interrupt Bug...\n");
11197 + printk(KERN_INFO "Ignoring P6 Local APIC Spurious Interrupt Bug...\n");
11198 #endif
11199 }
11200
11201 -unsigned long patch_espfix_desc(unsigned long uesp,
11202 - unsigned long kesp)
11203 +unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp)
11204 {
11205 struct desc_struct *gdt = __get_cpu_var(gdt_page).gdt;
11206 unsigned long base = (kesp - uesp) & -THREAD_SIZE;
11207 unsigned long new_kesp = kesp - base;
11208 unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT;
11209 __u64 desc = *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS];
11210 +
11211 /* Set up base for espfix segment */
11212 - desc &= 0x00f0ff0000000000ULL;
11213 - desc |= ((((__u64)base) << 16) & 0x000000ffffff0000ULL) |
11214 + desc &= 0x00f0ff0000000000ULL;
11215 + desc |= ((((__u64)base) << 16) & 0x000000ffffff0000ULL) |
11216 ((((__u64)base) << 32) & 0xff00000000000000ULL) |
11217 ((((__u64)lim_pages) << 32) & 0x000f000000000000ULL) |
11218 (lim_pages & 0xffff);
11219 *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS] = desc;
11220 +
11221 return new_kesp;
11222 }
11223 #endif
11224
11225 /*
11226 - * 'math_state_restore()' saves the current math information in the
11227 + * 'math_state_restore()' saves the current math information in the
11228 * old math state array, and gets the new ones from the current task
11229 *
11230 * Careful.. There are problems with IBM-designed IRQ13 behaviour.
11231 @@ -1087,9 +1117,22 @@ asmlinkage void math_state_restore(void)
11232 struct thread_info *thread = current_thread_info();
11233 struct task_struct *tsk = thread->task;
11234
11235 + if (!tsk_used_math(tsk)) {
11236 + local_irq_enable();
11237 + /*
11238 + * does a slab alloc which can sleep
11239 + */
11240 + if (init_fpu(tsk)) {
11241 + /*
11242 + * ran out of memory!
11243 + */
11244 + do_group_exit(SIGKILL);
11245 + return;
11246 + }
11247 + local_irq_disable();
11248 + }
11249 +
11250 /* NB. 'clts' is done for us by Xen during virtual trap. */
11251 - if (!tsk_used_math(tsk))
11252 - init_fpu(tsk);
11253 restore_fpu(tsk);
11254 thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */
11255 tsk->fpu_counter++;
11256 @@ -1100,15 +1143,15 @@ EXPORT_SYMBOL_GPL(math_state_restore);
11257
11258 asmlinkage void math_emulate(long arg)
11259 {
11260 - printk(KERN_EMERG "math-emulation not enabled and no coprocessor found.\n");
11261 - printk(KERN_EMERG "killing %s.\n",current->comm);
11262 - force_sig(SIGFPE,current);
11263 + printk(KERN_EMERG
11264 + "math-emulation not enabled and no coprocessor found.\n");
11265 + printk(KERN_EMERG "killing %s.\n", current->comm);
11266 + force_sig(SIGFPE, current);
11267 schedule();
11268 }
11269
11270 #endif /* CONFIG_MATH_EMULATION */
11271
11272 -
11273 /*
11274 * NB. All these are "trap gates" (i.e. events_mask isn't set) except
11275 * for those that specify <dpl>|4 in the second field.
11276 @@ -1146,25 +1189,21 @@ void __init trap_init(void)
11277 if (ret)
11278 printk("HYPERVISOR_set_trap_table failed: error %d\n", ret);
11279
11280 - /*
11281 - * Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned.
11282 - * Generate a build-time error if the alignment is wrong.
11283 - */
11284 - BUILD_BUG_ON(offsetof(struct task_struct, thread.i387.fxsave) & 15);
11285 if (cpu_has_fxsr) {
11286 printk(KERN_INFO "Enabling fast FPU save and restore... ");
11287 set_in_cr4(X86_CR4_OSFXSR);
11288 printk("done.\n");
11289 }
11290 if (cpu_has_xmm) {
11291 - printk(KERN_INFO "Enabling unmasked SIMD FPU exception "
11292 - "support... ");
11293 + printk(KERN_INFO
11294 + "Enabling unmasked SIMD FPU exception support... ");
11295 set_in_cr4(X86_CR4_OSXMMEXCPT);
11296 printk("done.\n");
11297 }
11298
11299 + init_thread_xstate();
11300 /*
11301 - * Should be a barrier for any external CPU state.
11302 + * Should be a barrier for any external CPU state:
11303 */
11304 cpu_init();
11305 }
11306 @@ -1183,6 +1222,7 @@ void __cpuinit smp_trap_init(trap_info_t
11307 static int __init kstack_setup(char *s)
11308 {
11309 kstack_depth_to_print = simple_strtoul(s, NULL, 0);
11310 +
11311 return 1;
11312 }
11313 __setup("kstack=", kstack_setup);
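The 32-bit NMI hunks above dispatch on the NMI reason byte: a clear 0xc0 field is handed to the unknown-NMI path (now preceded by a DIE_NMIUNKNOWN notification), while bit 0x80 reports a memory parity/SERR condition and bit 0x40 an I/O channel check. A small stand-alone sketch of that dispatch follows; the reason byte is passed in by hand rather than read via get_nmi_reason(), and the message text is illustrative.

#include <stdio.h>

/* Dispatch an NMI "reason" byte the way the default_do_nmi() path in
 * the hunks above does: bit 7 -> memory parity/SERR, bit 6 -> I/O
 * channel check, neither -> unknown NMI. */
static void decode_nmi_reason(unsigned char reason)
{
    if (!(reason & 0xc0)) {
        printf("reason %#04x: unknown NMI\n", reason);
        return;
    }
    if (reason & 0x80)
        printf("reason %#04x: memory parity error / SERR\n", reason);
    if (reason & 0x40)
        printf("reason %#04x: I/O check error (IOCHK)\n", reason);
}

int main(void)
{
    decode_nmi_reason(0x00);    /* unknown                */
    decode_nmi_reason(0x80);    /* parity/SERR            */
    decode_nmi_reason(0x40);    /* I/O check              */
    decode_nmi_reason(0xc0);    /* both asserted at once  */
    return 0;
}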
11314 --- a/arch/x86/kernel/traps_64-xen.c
11315 +++ b/arch/x86/kernel/traps_64-xen.c
11316 @@ -33,6 +33,8 @@
11317 #include <linux/kdebug.h>
11318 #include <linux/utsname.h>
11319
11320 +#include <mach_traps.h>
11321 +
11322 #if defined(CONFIG_EDAC)
11323 #include <linux/edac.h>
11324 #endif
11325 @@ -601,10 +603,16 @@ void die(const char * str, struct pt_reg
11326 }
11327
11328 #if defined(CONFIG_X86_LOCAL_APIC) || defined(CONFIG_SYSCTL)
11329 -void __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic)
11330 +notrace __kprobes void
11331 +die_nmi(char *str, struct pt_regs *regs, int do_panic)
11332 {
11333 - unsigned long flags = oops_begin();
11334 + unsigned long flags;
11335 +
11336 + if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) ==
11337 + NOTIFY_STOP)
11338 + return;
11339
11340 + flags = oops_begin();
11341 /*
11342 * We are in trouble anyway, lets at least try
11343 * to get a message out.
11344 @@ -769,7 +777,7 @@ asmlinkage void __kprobes do_general_pro
11345 die("general protection fault", regs, error_code);
11346 }
11347
11348 -static __kprobes void
11349 +static notrace __kprobes void
11350 mem_parity_error(unsigned char reason, struct pt_regs * regs)
11351 {
11352 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
11353 @@ -792,7 +800,7 @@ mem_parity_error(unsigned char reason, s
11354 clear_mem_error(reason);
11355 }
11356
11357 -static __kprobes void
11358 +static notrace __kprobes void
11359 io_check_error(unsigned char reason, struct pt_regs * regs)
11360 {
11361 printk("NMI: IOCK error (debug interrupt?)\n");
11362 @@ -802,9 +810,11 @@ io_check_error(unsigned char reason, str
11363 clear_io_check_error(reason);
11364 }
11365
11366 -static __kprobes void
11367 +static notrace __kprobes void
11368 unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
11369 {
11370 + if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
11371 + return;
11372 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
11373 reason);
11374 printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n");
11375 @@ -817,7 +827,7 @@ unknown_nmi_error(unsigned char reason,
11376
11377 /* Runs on IST stack. This code must keep interrupts off all the time.
11378 Nested NMIs are prevented by the CPU. */
11379 -asmlinkage __kprobes void default_do_nmi(struct pt_regs *regs)
11380 +asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs)
11381 {
11382 unsigned char reason = 0;
11383 int cpu;
11384 @@ -1117,11 +1127,25 @@ asmlinkage void __attribute__((weak)) mc
11385 asmlinkage void math_state_restore(void)
11386 {
11387 struct task_struct *me = current;
11388 +
11389 + if (!used_math()) {
11390 + local_irq_enable();
11391 + /*
11392 + * does a slab alloc which can sleep
11393 + */
11394 + if (init_fpu(me)) {
11395 + /*
11396 + * ran out of memory!
11397 + */
11398 + do_group_exit(SIGKILL);
11399 + return;
11400 + }
11401 + local_irq_disable();
11402 + }
11403 +
11404 /* clts(); */ /* 'clts' is done for us by Xen during virtual trap. */
11405
11406 - if (!used_math())
11407 - init_fpu(me);
11408 - restore_fpu_checking(&me->thread.i387.fxsave);
11409 + restore_fpu_checking(&me->thread.xstate->fxsave);
11410 task_thread_info(me)->status |= TS_USEDFPU;
11411 me->fpu_counter++;
11412 }
11413 @@ -1168,6 +1192,10 @@ void __init trap_init(void)
11414 printk("HYPERVISOR_set_trap_table failed: error %d\n", ret);
11415
11416 /*
11417 + * initialize the per thread extended state:
11418 + */
11419 + init_thread_xstate();
11420 + /*
11421 * Should be a barrier for any external CPU state.
11422 */
11423 cpu_init();
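Both trap files now call notify_die(DIE_NMIUNKNOWN, ...) before printing the "unknown reason" message, which lets other kernel code claim such NMIs by returning NOTIFY_STOP from a die notifier. The module-style sketch below shows one possible consumer of that hook; the module name, the log message and the decision to swallow every unknown NMI are illustrative, not something this patch introduces.

#include <linux/module.h>
#include <linux/kdebug.h>
#include <linux/notifier.h>

/* Claim unknown NMIs: returning NOTIFY_STOP makes the DIE_NMIUNKNOWN
 * notify_die() call in unknown_nmi_error() bail out, so the default
 * "Dazed and confused" / panic_on_unrecovered_nmi path is skipped. */
static int unknown_nmi_claim(struct notifier_block *nb,
                             unsigned long val, void *data)
{
    struct die_args *args = data;

    if (val != DIE_NMIUNKNOWN)
        return NOTIFY_DONE;

    printk(KERN_INFO "unknown_nmi_claim: claiming NMI, reason %02lx\n",
           (unsigned long)args->err);
    return NOTIFY_STOP;
}

static struct notifier_block unknown_nmi_nb = {
    .notifier_call = unknown_nmi_claim,
};

static int __init unknown_nmi_claim_init(void)
{
    return register_die_notifier(&unknown_nmi_nb);
}

static void __exit unknown_nmi_claim_exit(void)
{
    unregister_die_notifier(&unknown_nmi_nb);
}

module_init(unknown_nmi_claim_init);
module_exit(unknown_nmi_claim_exit);
MODULE_LICENSE("GPL");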
11424 --- a/arch/x86/kernel/vsyscall_64-xen.c
11425 +++ b/arch/x86/kernel/vsyscall_64-xen.c
11426 @@ -216,7 +216,7 @@ vgetcpu(unsigned *cpu, unsigned *node, s
11427 return 0;
11428 }
11429
11430 -long __vsyscall(3) venosys_1(void)
11431 +static long __vsyscall(3) venosys_1(void)
11432 {
11433 return -ENOSYS;
11434 }
11435 --- a/arch/x86/mm/fault-xen.c
11436 +++ b/arch/x86/mm/fault-xen.c
11437 @@ -510,6 +510,11 @@ static int vmalloc_fault(unsigned long a
11438 unsigned long pgd_paddr;
11439 pmd_t *pmd_k;
11440 pte_t *pte_k;
11441 +
11442 + /* Make sure we are in vmalloc area */
11443 + if (!(address >= VMALLOC_START && address < VMALLOC_END))
11444 + return -1;
11445 +
11446 /*
11447 * Synchronize this task's top level page-table
11448 * with the 'reference' page table.
11449 @@ -671,7 +676,7 @@ void __kprobes do_page_fault(struct pt_r
11450 #ifdef CONFIG_X86_32
11451 /* It's safe to allow irq's after cr2 has been saved and the vmalloc
11452 fault has been handled. */
11453 - if (regs->flags & (X86_EFLAGS_IF|VM_MASK))
11454 + if (regs->flags & (X86_EFLAGS_IF | X86_VM_MASK))
11455 local_irq_enable();
11456
11457 /*
11458 @@ -1018,9 +1023,5 @@ void vmalloc_sync_all(void)
11459 if (address == start)
11460 start = address + PGDIR_SIZE;
11461 }
11462 - /* Check that there is no need to do the same for the modules area. */
11463 - BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
11464 - BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
11465 - (__START_KERNEL & PGDIR_MASK)));
11466 #endif
11467 }
11468 --- a/arch/x86/mm/highmem_32-xen.c
11469 +++ b/arch/x86/mm/highmem_32-xen.c
11470 @@ -200,6 +200,5 @@ EXPORT_SYMBOL(kmap);
11471 EXPORT_SYMBOL(kunmap);
11472 EXPORT_SYMBOL(kmap_atomic);
11473 EXPORT_SYMBOL(kunmap_atomic);
11474 -EXPORT_SYMBOL(kmap_atomic_to_page);
11475 EXPORT_SYMBOL(clear_highpage);
11476 EXPORT_SYMBOL(copy_highpage);
11477 --- a/arch/x86/mm/init_32-xen.c
11478 +++ b/arch/x86/mm/init_32-xen.c
11479 @@ -1,5 +1,4 @@
11480 /*
11481 - * linux/arch/i386/mm/init.c
11482 *
11483 * Copyright (C) 1995 Linus Torvalds
11484 *
11485 @@ -22,6 +21,7 @@
11486 #include <linux/init.h>
11487 #include <linux/highmem.h>
11488 #include <linux/pagemap.h>
11489 +#include <linux/pci.h>
11490 #include <linux/pfn.h>
11491 #include <linux/poison.h>
11492 #include <linux/bootmem.h>
11493 @@ -54,6 +54,8 @@
11494
11495 unsigned int __VMALLOC_RESERVE = 128 << 20;
11496
11497 +unsigned long max_pfn_mapped;
11498 +
11499 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
11500 unsigned long highstart_pfn, highend_pfn;
11501
11502 @@ -73,7 +75,7 @@ static pmd_t * __init one_md_table_init(
11503 if (!(__pgd_val(*pgd) & _PAGE_PRESENT)) {
11504 pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
11505
11506 - paravirt_alloc_pd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
11507 + paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
11508 make_lowmem_page_readonly(pmd_table, XENFEAT_writable_page_tables);
11509 set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
11510 pud = pud_offset(pgd, 0);
11511 @@ -107,7 +109,7 @@ static pte_t * __init one_page_table_ini
11512 (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE);
11513 }
11514
11515 - paravirt_alloc_pt(&init_mm, __pa(page_table) >> PAGE_SHIFT);
11516 + paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT);
11517 make_lowmem_page_readonly(page_table,
11518 XENFEAT_writable_page_tables);
11519 set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
11520 @@ -209,8 +211,13 @@ static void __init kernel_physical_mappi
11521 /*
11522 * Map with big pages if possible, otherwise
11523 * create normal page tables:
11524 + *
11525 + * Don't use a large page for the first 2/4MB of memory
11526 + * because there are often fixed size MTRRs in there
11527 + * and overlapping MTRRs into large pages can cause
11528 + * slowdowns.
11529 */
11530 - if (cpu_has_pse) {
11531 + if (cpu_has_pse && !(pgd_idx == 0 && pmd_idx == 0)) {
11532 unsigned int addr2;
11533 pgprot_t prot = PAGE_KERNEL_LARGE;
11534
11535 @@ -224,6 +231,7 @@ static void __init kernel_physical_mappi
11536 set_pmd(pmd, pfn_pmd(pfn, prot));
11537
11538 pfn += PTRS_PER_PTE;
11539 + max_pfn_mapped = pfn;
11540 continue;
11541 }
11542 pte = one_page_table_init(pmd);
11543 @@ -241,6 +249,7 @@ static void __init kernel_physical_mappi
11544
11545 set_pte(pte, pfn_pte(pfn, prot));
11546 }
11547 + max_pfn_mapped = pfn;
11548 pte_ofs = 0;
11549 }
11550 pmd_idx = 0;
11551 @@ -262,6 +271,25 @@ static inline int page_kills_ppro(unsign
11552
11553 #endif
11554
11555 +/*
11556 + * devmem_is_allowed() checks to see if /dev/mem access to a certain address
11557 + * is valid. The argument is a physical page number.
11558 + *
11559 + *
11560 + * On x86, access has to be given to the first megabyte of ram because that area
11561 + * contains bios code and data regions used by X and dosemu and similar apps.
11562 + * Access has to be given to non-kernel-ram areas as well, these contain the PCI
11563 + * mmio resources as well as potential bios/acpi data regions.
11564 + */
11565 +int devmem_is_allowed(unsigned long pagenr)
11566 +{
11567 + if (pagenr <= 256)
11568 + return 1;
11569 + if (mfn_to_local_pfn(pagenr) >= max_pfn)
11570 + return 1;
11571 + return 0;
11572 +}
11573 +
11574 #ifdef CONFIG_HIGHMEM
11575 pte_t *kmap_pte;
11576 pgprot_t kmap_prot;
11577 @@ -303,48 +331,18 @@ static void __init permanent_kmaps_init(
11578 pkmap_page_table = pte;
11579 }
11580
11581 -static void __meminit free_new_highpage(struct page *page, int pfn)
11582 -{
11583 - init_page_count(page);
11584 - if (pfn < xen_start_info->nr_pages)
11585 - __free_page(page);
11586 - totalhigh_pages++;
11587 -}
11588 -
11589 void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro)
11590 {
11591 if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) {
11592 ClearPageReserved(page);
11593 - free_new_highpage(page, pfn);
11594 + init_page_count(page);
11595 + if (pfn < xen_start_info->nr_pages)
11596 + __free_page(page);
11597 + totalhigh_pages++;
11598 } else
11599 SetPageReserved(page);
11600 }
11601
11602 -static int __meminit
11603 -add_one_highpage_hotplug(struct page *page, unsigned long pfn)
11604 -{
11605 - free_new_highpage(page, pfn);
11606 - totalram_pages++;
11607 -#ifdef CONFIG_FLATMEM
11608 - max_mapnr = max(pfn, max_mapnr);
11609 -#endif
11610 - num_physpages++;
11611 -
11612 - return 0;
11613 -}
11614 -
11615 -/*
11616 - * Not currently handling the NUMA case.
11617 - * Assuming single node and all memory that
11618 - * has been added dynamically that would be
11619 - * onlined here is in HIGHMEM.
11620 - */
11621 -void __meminit online_page(struct page *page)
11622 -{
11623 - ClearPageReserved(page);
11624 - add_one_highpage_hotplug(page, page_to_pfn(page));
11625 -}
11626 -
11627 #ifndef CONFIG_NUMA
11628 static void __init set_highmem_pages_init(int bad_ppro)
11629 {
11630 @@ -459,15 +457,13 @@ void zap_low_mappings(void)
11631 {
11632 int i;
11633
11634 - save_pg_dir();
11635 -
11636 /*
11637 * Zap initial low-memory mappings.
11638 *
11639 * Note that "pgd_clear()" doesn't do it for
11640 * us, because pgd_clear() is a no-op on i386.
11641 */
11642 - for (i = 0; i < USER_PTRS_PER_PGD; i++) {
11643 + for (i = 0; i < KERNEL_PGD_BOUNDARY; i++) {
11644 #if defined(CONFIG_X86_PAE) && !defined(CONFIG_XEN)
11645 set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page)));
11646 #else
11647 @@ -572,9 +568,9 @@ void __init paging_init(void)
11648
11649 /*
11650 * Test if the WP bit works in supervisor mode. It isn't supported on 386's
11651 - * and also on some strange 486's (NexGen etc.). All 586+'s are OK. This
11652 - * used to involve black magic jumps to work around some nasty CPU bugs,
11653 - * but fortunately the switch to using exceptions got rid of all that.
11654 + * and also on some strange 486's. All 586+'s are OK. This used to involve
11655 + * black magic jumps to work around some nasty CPU bugs, but fortunately the
11656 + * switch to using exceptions got rid of all that.
11657 */
11658 static void __init test_wp_bit(void)
11659 {
11660 @@ -605,9 +601,7 @@ void __init mem_init(void)
11661 int tmp, bad_ppro;
11662 unsigned long pfn;
11663
11664 -#if defined(CONFIG_SWIOTLB)
11665 - swiotlb_init();
11666 -#endif
11667 + pci_iommu_alloc();
11668
11669 #ifdef CONFIG_FLATMEM
11670 BUG_ON(!mem_map);
11671 @@ -710,16 +704,8 @@ void __init mem_init(void)
11672 test_wp_bit();
11673
11674 cpa_init();
11675 -
11676 - /*
11677 - * Subtle. SMP is doing it's boot stuff late (because it has to
11678 - * fork idle threads) - but it also needs low mappings for the
11679 - * protected-mode entry to work. We zap these entries only after
11680 - * the WP-bit has been tested.
11681 - */
11682 -#ifndef CONFIG_SMP
11683 + save_pg_dir();
11684 zap_low_mappings();
11685 -#endif
11686
11687 SetPagePinned(virt_to_page(init_mm.pgd));
11688 }
11689 @@ -769,25 +755,17 @@ void mark_rodata_ro(void)
11690 unsigned long start = PFN_ALIGN(_text);
11691 unsigned long size = PFN_ALIGN(_etext) - start;
11692
11693 -#ifndef CONFIG_KPROBES
11694 -#ifdef CONFIG_HOTPLUG_CPU
11695 - /* It must still be possible to apply SMP alternatives. */
11696 - if (num_possible_cpus() <= 1)
11697 -#endif
11698 - {
11699 - set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
11700 - printk(KERN_INFO "Write protecting the kernel text: %luk\n",
11701 - size >> 10);
11702 + set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
11703 + printk(KERN_INFO "Write protecting the kernel text: %luk\n",
11704 + size >> 10);
11705
11706 #ifdef CONFIG_CPA_DEBUG
11707 - printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n",
11708 - start, start+size);
11709 - set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT);
11710 + printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n",
11711 + start, start+size);
11712 + set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT);
11713
11714 - printk(KERN_INFO "Testing CPA: write protecting again\n");
11715 - set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
11716 -#endif
11717 - }
11718 + printk(KERN_INFO "Testing CPA: write protecting again\n");
11719 + set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
11720 #endif
11721 start += size;
11722 size = (unsigned long)__end_rodata - start;
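The new devmem_is_allowed() above encodes the /dev/mem policy its comment describes: page numbers up to 256 (the low first megabyte) stay accessible for BIOS/X/dosemu users, and anything that does not correspond to local RAM (PCI MMIO, BIOS/ACPI regions) is allowed as well. Below is a user-space sketch of the same rule; max_pfn and the Xen mfn-to-pfn translation are replaced by stand-in values so the sketch compiles and runs on its own.

#include <stdio.h>

/* Stand-ins for the kernel state used by devmem_is_allowed() in the
 * hunk above: pretend the domain owns 64k pages of local RAM and that
 * machine frames map 1:1 onto local pfns (purely illustrative). */
#define FAKE_MAX_PFN    0x10000UL

static unsigned long fake_mfn_to_local_pfn(unsigned long mfn)
{
    return mfn;
}

/* Same shape as the patched helper: the first megabyte is always
 * allowed, and so is anything that is not local RAM; everything else
 * is refused. */
static int devmem_is_allowed(unsigned long pagenr)
{
    if (pagenr <= 256)
        return 1;
    if (fake_mfn_to_local_pfn(pagenr) >= FAKE_MAX_PFN)
        return 1;
    return 0;
}

int main(void)
{
    unsigned long probes[] = { 0, 256, 257, 0x8000, 0x10000, 0x20000 };
    unsigned int i;

    for (i = 0; i < sizeof(probes) / sizeof(probes[0]); i++)
        printf("page %#8lx: %s\n", probes[i],
               devmem_is_allowed(probes[i]) ? "allowed" : "refused");
    return 0;
}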
11723 --- a/arch/x86/mm/init_64-xen.c
11724 +++ b/arch/x86/mm/init_64-xen.c
11725 @@ -52,9 +52,6 @@
11726
11727 #include <xen/features.h>
11728
11729 -const struct dma_mapping_ops *dma_ops;
11730 -EXPORT_SYMBOL(dma_ops);
11731 -
11732 #if CONFIG_XEN_COMPAT <= 0x030002
11733 unsigned int __kernel_page_user;
11734 EXPORT_SYMBOL(__kernel_page_user);
11735 @@ -68,6 +65,28 @@ extern unsigned long start_pfn;
11736 extern pmd_t level2_fixmap_pgt[PTRS_PER_PMD];
11737 extern pte_t level1_fixmap_pgt[PTRS_PER_PTE];
11738
11739 +int direct_gbpages __meminitdata
11740 +#ifdef CONFIG_DIRECT_GBPAGES
11741 + = 1
11742 +#endif
11743 +;
11744 +
11745 +#ifndef CONFIG_XEN
11746 +static int __init parse_direct_gbpages_off(char *arg)
11747 +{
11748 + direct_gbpages = 0;
11749 + return 0;
11750 +}
11751 +early_param("nogbpages", parse_direct_gbpages_off);
11752 +
11753 +static int __init parse_direct_gbpages_on(char *arg)
11754 +{
11755 + direct_gbpages = 1;
11756 + return 0;
11757 +}
11758 +early_param("gbpages", parse_direct_gbpages_on);
11759 +#endif
11760 +
11761 /*
11762 * Use this until direct mapping is established, i.e. before __va() is
11763 * available in init_memory_mapping().
11764 @@ -135,9 +154,6 @@ void show_mem(void)
11765
11766 printk(KERN_INFO "Mem-info:\n");
11767 show_free_areas();
11768 - printk(KERN_INFO "Free swap: %6ldkB\n",
11769 - nr_swap_pages << (PAGE_SHIFT-10));
11770 -
11771 for_each_online_pgdat(pgdat) {
11772 for (i = 0; i < pgdat->node_spanned_pages; ++i) {
11773 /*
11774 @@ -328,7 +344,7 @@ void __init cleanup_highmap(void)
11775 pmd_t *last_pmd = pmd + PTRS_PER_PMD;
11776
11777 for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) {
11778 - if (!pmd_present(*pmd))
11779 + if (pmd_none(*pmd))
11780 continue;
11781 if (vaddr < (unsigned long) _text || vaddr > end)
11782 set_pmd(pmd, __pmd(0));
11783 @@ -337,8 +353,7 @@ void __init cleanup_highmap(void)
11784 #endif
11785
11786 /* NOTE: this is meant to be run only at boot */
11787 -void __init
11788 -__set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
11789 +void __init __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
11790 {
11791 unsigned long address = __fix_to_virt(idx);
11792
11793 @@ -463,7 +478,7 @@ __meminit void early_iounmap(void *addr,
11794 }
11795 #endif
11796
11797 -static void __meminit
11798 +static unsigned long __meminit
11799 phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
11800 {
11801 int i = pmd_index(address);
11802 @@ -503,21 +518,26 @@ phys_pmd_init(pmd_t *pmd_page, unsigned
11803 set_pmd(pmd, __pmd(pte_phys | _KERNPG_TABLE));
11804 }
11805 }
11806 + return address;
11807 }
11808
11809 -static void __meminit
11810 +static unsigned long __meminit
11811 phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
11812 {
11813 pmd_t *pmd = pmd_offset(pud, 0);
11814 + unsigned long last_map_addr;
11815 +
11816 spin_lock(&init_mm.page_table_lock);
11817 - phys_pmd_init(pmd, address, end);
11818 + last_map_addr = phys_pmd_init(pmd, address, end);
11819 spin_unlock(&init_mm.page_table_lock);
11820 __flush_tlb_all();
11821 + return last_map_addr;
11822 }
11823
11824 -static void __meminit
11825 +static unsigned long __meminit
11826 phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
11827 {
11828 + unsigned long last_map_addr = end;
11829 int i = pud_index(addr);
11830
11831 for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) {
11832 @@ -529,7 +549,15 @@ phys_pud_init(pud_t *pud_page, unsigned
11833 break;
11834
11835 if (__pud_val(*pud)) {
11836 - phys_pmd_update(pud, addr, end);
11837 + if (!pud_large(*pud))
11838 + last_map_addr = phys_pmd_update(pud, addr, end);
11839 + continue;
11840 + }
11841 +
11842 + if (direct_gbpages) {
11843 + set_pte((pte_t *)pud,
11844 + pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
11845 + last_map_addr = (addr & PUD_MASK) + PUD_SIZE;
11846 continue;
11847 }
11848
11849 @@ -537,12 +565,14 @@ phys_pud_init(pud_t *pud_page, unsigned
11850
11851 spin_lock(&init_mm.page_table_lock);
11852 *pud = __pud(pmd_phys | _KERNPG_TABLE);
11853 - phys_pmd_init(pmd, addr, end);
11854 + last_map_addr = phys_pmd_init(pmd, addr, end);
11855 spin_unlock(&init_mm.page_table_lock);
11856
11857 early_make_page_readonly(pmd, XENFEAT_writable_page_tables);
11858 }
11859 __flush_tlb_all();
11860 +
11861 + return last_map_addr >> PAGE_SHIFT;
11862 }
11863
11864 void __init xen_init_pt(void)
11865 @@ -763,16 +793,136 @@ static void __init xen_finish_init_mappi
11866 table_end = start_pfn;
11867 }
11868
11869 +static void __init init_gbpages(void)
11870 +{
11871 + if (direct_gbpages && cpu_has_gbpages)
11872 + printk(KERN_INFO "Using GB pages for direct mapping\n");
11873 + else
11874 + direct_gbpages = 0;
11875 +}
11876 +
11877 +#ifdef CONFIG_MEMTEST_BOOTPARAM
11878 +
11879 +static void __init memtest(unsigned long start_phys, unsigned long size,
11880 + unsigned pattern)
11881 +{
11882 + unsigned long i;
11883 + unsigned long *start;
11884 + unsigned long start_bad;
11885 + unsigned long last_bad;
11886 + unsigned long val;
11887 + unsigned long start_phys_aligned;
11888 + unsigned long count;
11889 + unsigned long incr;
11890 +
11891 + switch (pattern) {
11892 + case 0:
11893 + val = 0UL;
11894 + break;
11895 + case 1:
11896 + val = -1UL;
11897 + break;
11898 + case 2:
11899 + val = 0x5555555555555555UL;
11900 + break;
11901 + case 3:
11902 + val = 0xaaaaaaaaaaaaaaaaUL;
11903 + break;
11904 + default:
11905 + return;
11906 + }
11907 +
11908 + incr = sizeof(unsigned long);
11909 + start_phys_aligned = ALIGN(start_phys, incr);
11910 + count = (size - (start_phys_aligned - start_phys))/incr;
11911 + start = __va(start_phys_aligned);
11912 + start_bad = 0;
11913 + last_bad = 0;
11914 +
11915 + for (i = 0; i < count; i++)
11916 + start[i] = val;
11917 + for (i = 0; i < count; i++, start++, start_phys_aligned += incr) {
11918 + if (*start != val) {
11919 + if (start_phys_aligned == last_bad + incr) {
11920 + last_bad += incr;
11921 + } else {
11922 + if (start_bad) {
11923 + printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved",
11924 + val, start_bad, last_bad + incr);
11925 + reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
11926 + }
11927 + start_bad = last_bad = start_phys_aligned;
11928 + }
11929 + }
11930 + }
11931 + if (start_bad) {
11932 + printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved",
11933 + val, start_bad, last_bad + incr);
11934 + reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
11935 + }
11936 +
11937 +}
11938 +
11939 +static int memtest_pattern __initdata = CONFIG_MEMTEST_BOOTPARAM_VALUE;
11940 +
11941 +static int __init parse_memtest(char *arg)
11942 +{
11943 + if (arg)
11944 + memtest_pattern = simple_strtoul(arg, NULL, 0);
11945 + return 0;
11946 +}
11947 +
11948 +early_param("memtest", parse_memtest);
11949 +
11950 +static void __init early_memtest(unsigned long start, unsigned long end)
11951 +{
11952 + u64 t_start, t_size;
11953 + unsigned pattern;
11954 +
11955 + if (!memtest_pattern)
11956 + return;
11957 +
11958 + printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern);
11959 + for (pattern = 0; pattern < memtest_pattern; pattern++) {
11960 + t_start = start;
11961 + t_size = 0;
11962 + while (t_start < end) {
11963 + t_start = find_e820_area_size(t_start, &t_size, 1);
11964 +
11965 + /* done ? */
11966 + if (t_start >= end)
11967 + break;
11968 + if (t_start + t_size > end)
11969 + t_size = end - t_start;
11970 +
11971 + printk(KERN_CONT "\n %016llx - %016llx pattern %d",
11972 + (unsigned long long)t_start,
11973 + (unsigned long long)t_start + t_size, pattern);
11974 +
11975 + memtest(t_start, t_size, pattern);
11976 +
11977 + t_start += t_size;
11978 + }
11979 + }
11980 + printk(KERN_CONT "\n");
11981 +}
11982 +#else
11983 +static void __init early_memtest(unsigned long start, unsigned long end)
11984 +{
11985 +}
11986 +#endif
11987 +
11988 /*
11989 * Setup the direct mapping of the physical memory at PAGE_OFFSET.
11990 * This runs before bootmem is initialized and gets pages directly from
11991 * the physical memory. To access them they are temporarily mapped.
11992 */
11993 -void __init_refok init_memory_mapping(unsigned long start, unsigned long end)
11994 +unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned long end)
11995 {
11996 - unsigned long next;
11997 + unsigned long next, last_map_addr = end;
11998 + unsigned long start_phys = start, end_phys = end;
11999
12000 - pr_debug("init_memory_mapping\n");
12001 + printk(KERN_INFO "init_memory_mapping\n");
12002
12003 /*
12004 * Find space for the kernel direct mapping tables.
12005 @@ -781,8 +931,10 @@ void __init_refok init_memory_mapping(un
12006 * memory mapped. Unfortunately this is done currently before the
12007 * nodes are discovered.
12008 */
12009 - if (!after_bootmem)
12010 + if (!after_bootmem) {
12011 + init_gbpages();
12012 find_early_table_space(end);
12013 + }
12014
12015 start = (unsigned long)__va(start);
12016 end = (unsigned long)__va(end);
12017 @@ -799,7 +951,7 @@ void __init_refok init_memory_mapping(un
12018 next = start + PGDIR_SIZE;
12019 if (next > end)
12020 next = end;
12021 - phys_pud_init(pud, __pa(start), __pa(next));
12022 + last_map_addr = phys_pud_init(pud, __pa(start), __pa(next));
12023 if (!after_bootmem) {
12024 early_make_page_readonly(pud, XENFEAT_writable_page_tables);
12025 set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
12026 @@ -816,6 +968,11 @@ void __init_refok init_memory_mapping(un
12027 if (!after_bootmem)
12028 reserve_early(table_start << PAGE_SHIFT,
12029 table_end << PAGE_SHIFT, "PGTABLE");
12030 +
12031 + if (!after_bootmem)
12032 + early_memtest(start_phys, end_phys);
12033 +
12034 + return last_map_addr;
12035 }
12036
12037 #ifndef CONFIG_NUMA
12038 @@ -839,15 +996,6 @@ void __init paging_init(void)
12039 /*
12040 * Memory hotplug specific functions
12041 */
12042 -void online_page(struct page *page)
12043 -{
12044 - ClearPageReserved(page);
12045 - init_page_count(page);
12046 - __free_page(page);
12047 - totalram_pages++;
12048 - num_physpages++;
12049 -}
12050 -
12051 #ifdef CONFIG_MEMORY_HOTPLUG
12052 /*
12053 * Memory is added always to NORMAL zone. This means you will never get
12054 @@ -857,11 +1005,13 @@ int arch_add_memory(int nid, u64 start,
12055 {
12056 struct pglist_data *pgdat = NODE_DATA(nid);
12057 struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
12058 - unsigned long start_pfn = start >> PAGE_SHIFT;
12059 + unsigned long last_mapped_pfn, start_pfn = start >> PAGE_SHIFT;
12060 unsigned long nr_pages = size >> PAGE_SHIFT;
12061 int ret;
12062
12063 - init_memory_mapping(start, start + size-1);
12064 + last_mapped_pfn = init_memory_mapping(start, start + size-1);
12065 + if (last_mapped_pfn > max_pfn_mapped)
12066 + max_pfn_mapped = last_mapped_pfn;
12067
12068 ret = __add_pages(zone, start_pfn, nr_pages);
12069 WARN_ON(1);
12070 @@ -880,6 +1030,26 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to
12071
12072 #endif /* CONFIG_MEMORY_HOTPLUG */
12073
12074 +/*
12075 + * devmem_is_allowed() checks to see if /dev/mem access to a certain address
12076 + * is valid. The argument is a physical page number.
12077 + *
12078 + *
12079 + * On x86, access has to be given to the first megabyte of ram because that area
12080 + * contains bios code and data regions used by X and dosemu and similar apps.
12081 + * Access has to be given to non-kernel-ram areas as well, these contain the PCI
12082 + * mmio resources as well as potential bios/acpi data regions.
12083 + */
12084 +int devmem_is_allowed(unsigned long pagenr)
12085 +{
12086 + if (pagenr <= 256)
12087 + return 1;
12088 + if (mfn_to_local_pfn(pagenr) >= max_pfn)
12089 + return 1;
12090 + return 0;
12091 +}
12092 +
12093 +
12094 static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel,
12095 kcore_modules, kcore_vsyscall;
12096
12097 @@ -988,24 +1158,7 @@ EXPORT_SYMBOL_GPL(rodata_test_data);
12098
12099 void mark_rodata_ro(void)
12100 {
12101 - unsigned long start = (unsigned long)_stext, end;
12102 -
12103 -#ifdef CONFIG_HOTPLUG_CPU
12104 - /* It must still be possible to apply SMP alternatives. */
12105 - if (num_possible_cpus() > 1)
12106 - start = (unsigned long)_etext;
12107 -#endif
12108 -
12109 -#ifdef CONFIG_KPROBES
12110 - start = (unsigned long)__start_rodata;
12111 -#endif
12112 -
12113 - end = (unsigned long)__end_rodata;
12114 - start = (start + PAGE_SIZE - 1) & PAGE_MASK;
12115 - end &= PAGE_MASK;
12116 - if (end <= start)
12117 - return;
12118 -
12119 + unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata);
12120
12121 printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
12122 (end - start) >> 10);
12123 @@ -1028,6 +1181,7 @@ void mark_rodata_ro(void)
12124 set_memory_ro(start, (end-start) >> PAGE_SHIFT);
12125 #endif
12126 }
12127 +
12128 #endif
12129
12130 #ifdef CONFIG_BLK_DEV_INITRD
12131 @@ -1040,7 +1194,7 @@ void free_initrd_mem(unsigned long start
12132 void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
12133 {
12134 #ifdef CONFIG_NUMA
12135 - int nid = phys_to_nid(phys);
12136 + int nid, next_nid;
12137 #endif
12138 unsigned long pfn = phys >> PAGE_SHIFT;
12139
12140 @@ -1049,7 +1203,7 @@ void __init reserve_bootmem_generic(unsi
12141 * This can happen with kdump kernels when accessing
12142 * firmware tables:
12143 */
12144 - if (pfn < end_pfn_map)
12145 + if (pfn < max_pfn_mapped)
12146 return;
12147
12148 printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
12149 @@ -1059,10 +1213,16 @@ void __init reserve_bootmem_generic(unsi
12150
12151 /* Should check here against the e820 map to avoid double free */
12152 #ifdef CONFIG_NUMA
12153 - reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT);
12154 + nid = phys_to_nid(phys);
12155 + next_nid = phys_to_nid(phys + len - 1);
12156 + if (nid == next_nid)
12157 + reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT);
12158 + else
12159 + reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
12160 #else
12161 reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
12162 #endif
12163 +
12164 #ifndef CONFIG_XEN
12165 if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
12166 static unsigned long dma_reserve __initdata;
12167 @@ -1160,6 +1320,10 @@ const char *arch_vma_name(struct vm_area
12168 /*
12169 * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
12170 */
12171 +static long __meminitdata addr_start, addr_end;
12172 +static void __meminitdata *p_start, *p_end;
12173 +static int __meminitdata node_start;
12174 +
12175 int __meminit
12176 vmemmap_populate(struct page *start_page, unsigned long size, int node)
12177 {
12178 @@ -1194,12 +1358,32 @@ vmemmap_populate(struct page *start_page
12179 PAGE_KERNEL_LARGE);
12180 set_pmd(pmd, __pmd_ma(__pte_val(entry)));
12181
12182 - printk(KERN_DEBUG " [%lx-%lx] PMD ->%p on node %d\n",
12183 - addr, addr + PMD_SIZE - 1, p, node);
12184 + /* check to see if we have contiguous blocks */
12185 + if (p_end != p || node_start != node) {
12186 + if (p_start)
12187 + printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
12188 + addr_start, addr_end-1, p_start, p_end-1, node_start);
12189 + addr_start = addr;
12190 + node_start = node;
12191 + p_start = p;
12192 + }
12193 + addr_end = addr + PMD_SIZE;
12194 + p_end = p + PMD_SIZE;
12195 } else {
12196 vmemmap_verify((pte_t *)pmd, node, addr, next);
12197 }
12198 }
12199 return 0;
12200 }
12201 +
12202 +void __meminit vmemmap_populate_print_last(void)
12203 +{
12204 + if (p_start) {
12205 + printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
12206 + addr_start, addr_end-1, p_start, p_end-1, node_start);
12207 + p_start = NULL;
12208 + p_end = NULL;
12209 + node_start = 0;
12210 + }
12211 +}
12212 #endif
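The early_memtest()/memtest() code added above fills each free e820 range with a pattern, reads it back, and reserves any bad stretch it finds via reserve_early(). The user-space sketch below exercises the fill-and-verify core over an ordinary heap buffer; the deliberately corrupted word stands in for failing RAM, and the run coalescing mirrors the start_bad/last_bad bookkeeping in the patch (unsigned long long is used here only to keep the 64-bit patterns portable).

#include <stdio.h>
#include <stdlib.h>

/* Fill a buffer with 'val', read it back, and report contiguous runs
 * of mismatching words -- the same shape as the start_bad/last_bad
 * logic in the memtest() hunk above. */
static void memtest_buffer(unsigned long long *buf, size_t count,
                           unsigned long long val)
{
    size_t i, start_bad = 0, last_bad = 0;
    int in_bad_run = 0;

    for (i = 0; i < count; i++)
        buf[i] = val;

    /* Simulate a stuck bit somewhere in the middle of the region. */
    buf[count / 2] ^= 1ULL;

    for (i = 0; i < count; i++) {
        if (buf[i] == val)
            continue;
        if (in_bad_run && i == last_bad + 1) {
            last_bad = i;               /* extend the current bad run */
            continue;
        }
        if (in_bad_run)
            printf("pattern %#llx: bad words %zu-%zu\n",
                   val, start_bad, last_bad);
        start_bad = last_bad = i;       /* start a new bad run */
        in_bad_run = 1;
    }
    if (in_bad_run)
        printf("pattern %#llx: bad words %zu-%zu\n",
               val, start_bad, last_bad);
}

int main(void)
{
    static const unsigned long long patterns[] = {
        0ULL, ~0ULL, 0x5555555555555555ULL, 0xaaaaaaaaaaaaaaaaULL,
    };
    size_t count = 1 << 16;
    unsigned long long *buf = malloc(count * sizeof(*buf));
    unsigned int p;

    if (!buf)
        return 1;
    for (p = 0; p < sizeof(patterns) / sizeof(patterns[0]); p++)
        memtest_buffer(buf, count, patterns[p]);
    free(buf);
    return 0;
}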
12213 --- a/arch/x86/mm/ioremap-xen.c
12214 +++ b/arch/x86/mm/ioremap-xen.c
12215 @@ -20,14 +20,11 @@
12216 #include <asm/pgtable.h>
12217 #include <asm/tlbflush.h>
12218 #include <asm/pgalloc.h>
12219 +#include <asm/pat.h>
12220
12221 -enum ioremap_mode {
12222 - IOR_MODE_UNCACHED,
12223 - IOR_MODE_CACHED,
12224 -};
12225 -
12226 -#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
12227 +#ifdef CONFIG_X86_64
12228
12229 +#ifndef CONFIG_XEN
12230 unsigned long __phys_addr(unsigned long x)
12231 {
12232 if (x >= __START_KERNEL_map)
12233 @@ -35,6 +32,19 @@ unsigned long __phys_addr(unsigned long
12234 return x - PAGE_OFFSET;
12235 }
12236 EXPORT_SYMBOL(__phys_addr);
12237 +#endif
12238 +
12239 +static inline int phys_addr_valid(unsigned long addr)
12240 +{
12241 + return addr < (1UL << boot_cpu_data.x86_phys_bits);
12242 +}
12243 +
12244 +#else
12245 +
12246 +static inline int phys_addr_valid(unsigned long addr)
12247 +{
12248 + return 1;
12249 +}
12250
12251 #endif
12252
12253 @@ -92,7 +102,8 @@ static int __direct_remap_pfn_range(stru
12254 * Fill in the machine address: PTE ptr is done later by
12255 * apply_to_page_range().
12256 */
12257 - v->val = __pte_val(pfn_pte_ma(mfn, prot)) | _PAGE_IO;
12258 + v->val = __pte_val(pte_mkspecial(pfn_pte_ma(mfn, prot)))
12259 + | _PAGE_IO;
12260
12261 mfn++;
12262 address += PAGE_SIZE;
12263 @@ -189,10 +200,9 @@ int touch_pte_range(struct mm_struct *mm
12264
12265 EXPORT_SYMBOL(touch_pte_range);
12266
12267 -#ifdef CONFIG_X86_32
12268 int page_is_ram(unsigned long pagenr)
12269 {
12270 - unsigned long addr, end;
12271 + resource_size_t addr, end;
12272 int i;
12273
12274 #ifndef CONFIG_XEN
12275 @@ -228,31 +238,51 @@ int page_is_ram(unsigned long pagenr)
12276 }
12277 return 0;
12278 }
12279 -#endif
12280
12281 /*
12282 * Fix up the linear direct mapping of the kernel to avoid cache attribute
12283 * conflicts.
12284 */
12285 static int ioremap_change_attr(unsigned long vaddr, unsigned long size,
12286 - enum ioremap_mode mode)
12287 + unsigned long prot_val)
12288 {
12289 unsigned long nrpages = size >> PAGE_SHIFT;
12290 int err;
12291
12292 - switch (mode) {
12293 - case IOR_MODE_UNCACHED:
12294 + switch (prot_val) {
12295 + case _PAGE_CACHE_UC:
12296 default:
12297 - err = set_memory_uc(vaddr, nrpages);
12298 + err = _set_memory_uc(vaddr, nrpages);
12299 + break;
12300 + case _PAGE_CACHE_WC:
12301 + err = _set_memory_wc(vaddr, nrpages);
12302 break;
12303 - case IOR_MODE_CACHED:
12304 - err = set_memory_wb(vaddr, nrpages);
12305 + case _PAGE_CACHE_WB:
12306 + err = _set_memory_wb(vaddr, nrpages);
12307 break;
12308 }
12309
12310 return err;
12311 }
12312
12313 +int ioremap_check_change_attr(unsigned long mfn, unsigned long size,
12314 + unsigned long prot_val)
12315 +{
12316 + unsigned long sz;
12317 + int rc;
12318 +
12319 + for (sz = rc = 0; sz < size && !rc; ++mfn, sz += PAGE_SIZE) {
12320 + unsigned long pfn = mfn_to_local_pfn(mfn);
12321 +
12322 + if (pfn >= max_pfn_mapped)
12323 + continue;
12324 + rc = ioremap_change_attr((unsigned long)__va(pfn << PAGE_SHIFT),
12325 + PAGE_SIZE, prot_val);
12326 + }
12327 +
12328 + return rc;
12329 +}
12330 +
12331 /*
12332 * Remap an arbitrary physical address space into the kernel virtual
12333 * address space. Needed when the kernel wants to access high addresses
12334 @@ -262,12 +292,15 @@ static int ioremap_change_attr(unsigned
12335 * have to convert them into an offset in a page-aligned mapping, but the
12336 * caller shouldn't need to know that small detail.
12337 */
12338 -static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size,
12339 - enum ioremap_mode mode)
12340 +static void __iomem *__ioremap_caller(resource_size_t phys_addr,
12341 + unsigned long size, unsigned long prot_val, void *caller)
12342 {
12343 - unsigned long mfn, offset, last_addr, vaddr;
12344 + unsigned long mfn, offset, vaddr;
12345 + resource_size_t last_addr;
12346 struct vm_struct *area;
12347 + unsigned long new_prot_val;
12348 pgprot_t prot;
12349 + int retval;
12350 domid_t domid = DOMID_IO;
12351
12352 /* Don't allow wraparound or zero size */
12353 @@ -275,6 +308,13 @@ static void __iomem *__ioremap(resource_
12354 if (!size || last_addr < phys_addr)
12355 return NULL;
12356
12357 + if (!phys_addr_valid(phys_addr)) {
12358 + printk(KERN_WARNING "ioremap: invalid physical address %llx\n",
12359 + (unsigned long long)phys_addr);
12360 + WARN_ON_ONCE(1);
12361 + return NULL;
12362 + }
12363 +
12364 /*
12365 * Don't remap the low PCI/ISA area, it's always mapped..
12366 */
12367 @@ -287,55 +327,86 @@ static void __iomem *__ioremap(resource_
12368 for (mfn = PFN_DOWN(phys_addr); mfn < PFN_UP(last_addr); mfn++) {
12369 unsigned long pfn = mfn_to_local_pfn(mfn);
12370
12371 - if (pfn >= max_pfn)
12372 - continue;
12373 + if (pfn_valid(pfn)) {
12374 + if (!PageReserved(pfn_to_page(pfn)))
12375 + return NULL;
12376 + domid = DOMID_SELF;
12377 + }
12378 + }
12379 + WARN_ON_ONCE(domid == DOMID_SELF);
12380
12381 - domid = DOMID_SELF;
12382 + /*
12383 + * Mappings have to be page-aligned
12384 + */
12385 + offset = phys_addr & ~PAGE_MASK;
12386 + phys_addr &= PAGE_MASK;
12387 + size = PAGE_ALIGN(last_addr+1) - phys_addr;
12388
12389 - if (pfn >= max_pfn_mapped) /* bogus */
12390 - continue;
12391 + retval = reserve_memtype(phys_addr, phys_addr + size,
12392 + prot_val, &new_prot_val);
12393 + if (retval) {
12394 + pr_debug("Warning: reserve_memtype returned %d\n", retval);
12395 + return NULL;
12396 + }
12397
12398 - if (pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn)))
12399 + if (prot_val != new_prot_val) {
12400 + /*
12401 + * Do not fallback to certain memory types with certain
12402 + * requested type:
12403 + * - request is uc-, return cannot be write-back
12404 + * - request is uc-, return cannot be write-combine
12405 + * - request is write-combine, return cannot be write-back
12406 + */
12407 + if ((prot_val == _PAGE_CACHE_UC_MINUS &&
12408 + (new_prot_val == _PAGE_CACHE_WB ||
12409 + new_prot_val == _PAGE_CACHE_WC)) ||
12410 + (prot_val == _PAGE_CACHE_WC &&
12411 + new_prot_val == _PAGE_CACHE_WB)) {
12412 + pr_debug(
12413 + "ioremap error for 0x%llx-0x%llx, requested 0x%lx, got 0x%lx\n",
12414 + (unsigned long long)phys_addr,
12415 + (unsigned long long)(phys_addr + size),
12416 + prot_val, new_prot_val);
12417 + free_memtype(phys_addr, phys_addr + size);
12418 return NULL;
12419 + }
12420 + prot_val = new_prot_val;
12421 }
12422
12423 - switch (mode) {
12424 - case IOR_MODE_UNCACHED:
12425 + switch (prot_val) {
12426 + case _PAGE_CACHE_UC:
12427 default:
12428 - /*
12429 - * FIXME: we will use UC MINUS for now, as video fb drivers
12430 - * depend on it. Upcoming ioremap_wc() will fix this behavior.
12431 - */
12432 + prot = PAGE_KERNEL_NOCACHE;
12433 + break;
12434 + case _PAGE_CACHE_UC_MINUS:
12435 prot = PAGE_KERNEL_UC_MINUS;
12436 break;
12437 - case IOR_MODE_CACHED:
12438 + case _PAGE_CACHE_WC:
12439 + prot = PAGE_KERNEL_WC;
12440 + break;
12441 + case _PAGE_CACHE_WB:
12442 prot = PAGE_KERNEL;
12443 break;
12444 }
12445
12446 /*
12447 - * Mappings have to be page-aligned
12448 - */
12449 - offset = phys_addr & ~PAGE_MASK;
12450 - phys_addr &= PAGE_MASK;
12451 - size = PAGE_ALIGN(last_addr+1) - phys_addr;
12452 -
12453 - /*
12454 * Ok, go for it..
12455 */
12456 - area = get_vm_area(size, VM_IOREMAP | (mode << 20));
12457 + area = get_vm_area_caller(size, VM_IOREMAP, caller);
12458 if (!area)
12459 return NULL;
12460 area->phys_addr = phys_addr;
12461 vaddr = (unsigned long) area->addr;
12462 if (__direct_remap_pfn_range(&init_mm, vaddr, PFN_DOWN(phys_addr),
12463 size, prot, domid)) {
12464 + free_memtype(phys_addr, phys_addr + size);
12465 free_vm_area(area);
12466 return NULL;
12467 }
12468
12469 - if (ioremap_change_attr(vaddr, size, mode) < 0) {
12470 - iounmap((void __iomem *) vaddr);
12471 + if (ioremap_change_attr(vaddr, size, prot_val) < 0) {
12472 + free_memtype(phys_addr, phys_addr + size);
12473 + vunmap(area->addr);
12474 return NULL;
12475 }
12476
12477 @@ -365,16 +436,72 @@ static void __iomem *__ioremap(resource_
12478 */
12479 void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size)
12480 {
12481 - return __ioremap(phys_addr, size, IOR_MODE_UNCACHED);
12482 + /*
12483 + * Ideally, this should be:
12484 + * pat_wc_enabled ? _PAGE_CACHE_UC : _PAGE_CACHE_UC_MINUS;
12485 + *
12486 + * Till we fix all X drivers to use ioremap_wc(), we will use
12487 + * UC MINUS.
12488 + */
12489 + unsigned long val = _PAGE_CACHE_UC_MINUS;
12490 +
12491 + return __ioremap_caller(phys_addr, size, val,
12492 + __builtin_return_address(0));
12493 }
12494 EXPORT_SYMBOL(ioremap_nocache);
12495
12496 +/**
12497 + * ioremap_wc - map memory into CPU space write combined
12498 + * @offset: bus address of the memory
12499 + * @size: size of the resource to map
12500 + *
12501 + * This version of ioremap ensures that the memory is marked write combining.
12502 + * Write combining allows faster writes to some hardware devices.
12503 + *
12504 + * Must be freed with iounmap.
12505 + */
12506 +void __iomem *ioremap_wc(unsigned long phys_addr, unsigned long size)
12507 +{
12508 + if (pat_wc_enabled)
12509 + return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC,
12510 + __builtin_return_address(0));
12511 + else
12512 + return ioremap_nocache(phys_addr, size);
12513 +}
12514 +EXPORT_SYMBOL(ioremap_wc);
12515 +
12516 void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
12517 {
12518 - return __ioremap(phys_addr, size, IOR_MODE_CACHED);
12519 + return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WB,
12520 + __builtin_return_address(0));
12521 }
12522 EXPORT_SYMBOL(ioremap_cache);
12523
12524 +#ifndef CONFIG_XEN
12525 +static void __iomem *ioremap_default(resource_size_t phys_addr,
12526 + unsigned long size)
12527 +{
12528 + unsigned long flags;
12529 + void *ret;
12530 + int err;
12531 +
12532 + /*
12533 + * - WB for WB-able memory and no other conflicting mappings
12534 + * - UC_MINUS for non-WB-able memory with no other conflicting mappings
12535 + * - Inherit from confliting mappings otherwise
12536 + */
12537 + err = reserve_memtype(phys_addr, phys_addr + size, -1, &flags);
12538 + if (err < 0)
12539 + return NULL;
12540 +
12541 + ret = (void *) __ioremap_caller(phys_addr, size, flags,
12542 + __builtin_return_address(0));
12543 +
12544 + free_memtype(phys_addr, phys_addr + size);
12545 + return (void __iomem *)ret;
12546 +}
12547 +#endif
12548 +
12549 /**
12550 * iounmap - Free a IO remapping
12551 * @addr: virtual address from ioremap_*
12552 @@ -417,15 +544,7 @@ void iounmap(volatile void __iomem *addr
12553 return;
12554 }
12555
12556 - if ((p->flags >> 20) != IOR_MODE_CACHED) {
12557 - unsigned long n = get_vm_area_size(p) >> PAGE_SHIFT;
12558 - unsigned long mfn = p->phys_addr;
12559 - unsigned long va = (unsigned long)addr;
12560 -
12561 - for (; n > 0; n--, mfn++, va += PAGE_SIZE)
12562 - if (mfn_to_local_pfn(mfn) < max_pfn)
12563 - set_memory_wb(va, 1);
12564 - }
12565 + free_memtype(p->phys_addr, p->phys_addr + get_vm_area_size(p));
12566
12567 /* Finally remove it */
12568 o = remove_vm_area((void *)addr);
12569 @@ -434,6 +553,37 @@ void iounmap(volatile void __iomem *addr
12570 }
12571 EXPORT_SYMBOL(iounmap);
12572
12573 +#ifndef CONFIG_XEN
12574 +/*
12575 + * Convert a physical pointer to a virtual kernel pointer for /dev/mem
12576 + * access
12577 + */
12578 +void *xlate_dev_mem_ptr(unsigned long phys)
12579 +{
12580 + void *addr;
12581 + unsigned long start = phys & PAGE_MASK;
12582 +
12583 + /* If page is RAM, we can use __va. Otherwise ioremap and unmap. */
12584 + if (page_is_ram(start >> PAGE_SHIFT))
12585 + return __va(phys);
12586 +
12587 + addr = (void *)ioremap_default(start, PAGE_SIZE);
12588 + if (addr)
12589 + addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK));
12590 +
12591 + return addr;
12592 +}
12593 +
12594 +void unxlate_dev_mem_ptr(unsigned long phys, void *addr)
12595 +{
12596 + if (page_is_ram(phys >> PAGE_SHIFT))
12597 + return;
12598 +
12599 + iounmap((void __iomem *)((unsigned long)addr & PAGE_MASK));
12600 + return;
12601 +}
12602 +#endif
12603 +
12604 int __initdata early_ioremap_debug;
12605
12606 static int __init early_ioremap_debug_setup(char *str)
12607 @@ -445,8 +595,8 @@ static int __init early_ioremap_debug_se
12608 early_param("early_ioremap_debug", early_ioremap_debug_setup);
12609
12610 static __initdata int after_paging_init;
12611 -static __initdata pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)]
12612 - __attribute__((aligned(PAGE_SIZE)));
12613 +static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)]
12614 + __section(.bss.page_aligned);
12615
12616 #ifdef CONFIG_X86_32
12617 static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
12618 @@ -461,8 +611,8 @@ static inline pmd_t * __init early_iorem
12619 }
12620 #else
12621 #define early_ioremap_pmd early_get_pmd
12622 +#undef make_lowmem_page_readonly
12623 #define make_lowmem_page_readonly early_make_page_readonly
12624 -#define make_lowmem_page_writable make_page_writable
12625 #endif
12626
12627 static inline pte_t * __init early_ioremap_pte(unsigned long addr)
12628 @@ -511,7 +661,7 @@ void __init early_ioremap_clear(void)
12629 pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
12630 pmd_clear(pmd);
12631 make_lowmem_page_writable(bm_pte, XENFEAT_writable_page_tables);
12632 - /* paravirt_release_pt(__pa(bm_pte) >> PAGE_SHIFT); */
12633 + /* paravirt_release_pte(__pa(bm_pte) >> PAGE_SHIFT); */
12634 __flush_tlb_all();
12635 }
12636
12637 @@ -652,10 +802,11 @@ void __init early_iounmap(void *addr, un
12638 unsigned long offset;
12639 unsigned int nrpages;
12640 enum fixed_addresses idx;
12641 - unsigned int nesting;
12642 + int nesting;
12643
12644 nesting = --early_ioremap_nested;
12645 - WARN_ON(nesting < 0);
12646 + if (WARN_ON(nesting < 0))
12647 + return;
12648
12649 if (early_ioremap_debug) {
12650 printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr,
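
The ioremap hunks above route every request through __ioremap_caller() with an explicit _PAGE_CACHE_* type: ioremap_nocache() keeps asking for UC- for now, and ioremap_wc() asks for WC only when PAT is usable, falling back to UC- otherwise. The stand-alone sketch below only illustrates that selection; it is plain userspace C, and the names pat_enabled, nocache_type and wc_type are invented for the example rather than taken from the patch.

#include <stdio.h>

enum cache_type { WB, WC, UC_MINUS, UC };

static int pat_enabled = 1;		/* stands in for pat_wc_enabled */

/* ioremap_nocache() style: keep using UC- until callers are audited for WC. */
static enum cache_type nocache_type(void)
{
	return UC_MINUS;
}

/* ioremap_wc() style: WC only when PAT is usable, otherwise same as nocache. */
static enum cache_type wc_type(void)
{
	return pat_enabled ? WC : nocache_type();
}

int main(void)
{
	printf("nocache=%d wc=%d\n", nocache_type(), wc_type());
	return 0;
}
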
12651 --- a/arch/x86/mm/pageattr-xen.c
12652 +++ b/arch/x86/mm/pageattr-xen.c
12653 @@ -9,6 +9,8 @@
12654 #include <linux/slab.h>
12655 #include <linux/mm.h>
12656 #include <linux/interrupt.h>
12657 +#include <linux/seq_file.h>
12658 +#include <linux/debugfs.h>
12659
12660 #include <asm/e820.h>
12661 #include <asm/processor.h>
12662 @@ -17,370 +19,7 @@
12663 #include <asm/uaccess.h>
12664 #include <asm/pgalloc.h>
12665 #include <asm/proto.h>
12666 -#include <asm/mmu_context.h>
12667 -
12668 -#ifndef CONFIG_X86_64
12669 -#define TASK_SIZE64 TASK_SIZE
12670 -#endif
12671 -
12672 -static void _pin_lock(struct mm_struct *mm, int lock) {
12673 - if (lock)
12674 - spin_lock(&mm->page_table_lock);
12675 -#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
12676 - /* While mm->page_table_lock protects us against insertions and
12677 - * removals of higher level page table pages, it doesn't protect
12678 - * against updates of pte-s. Such updates, however, require the
12679 - * pte pages to be in consistent state (unpinned+writable or
12680 - * pinned+readonly). The pinning and attribute changes, however
12681 - * cannot be done atomically, which is why such updates must be
12682 - * prevented from happening concurrently.
12683 - * Note that no pte lock can ever elsewhere be acquired nesting
12684 - * with an already acquired one in the same mm, or with the mm's
12685 - * page_table_lock already acquired, as that would break in the
12686 - * non-split case (where all these are actually resolving to the
12687 - * one page_table_lock). Thus acquiring all of them here is not
12688 - * going to result in dead locks, and the order of acquires
12689 - * doesn't matter.
12690 - */
12691 - {
12692 - pgd_t *pgd = mm->pgd;
12693 - unsigned g;
12694 -
12695 - for (g = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
12696 - pud_t *pud;
12697 - unsigned u;
12698 -
12699 - if (pgd_none(*pgd))
12700 - continue;
12701 - pud = pud_offset(pgd, 0);
12702 - for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
12703 - pmd_t *pmd;
12704 - unsigned m;
12705 -
12706 - if (pud_none(*pud))
12707 - continue;
12708 - pmd = pmd_offset(pud, 0);
12709 - for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
12710 - spinlock_t *ptl;
12711 -
12712 - if (pmd_none(*pmd))
12713 - continue;
12714 - ptl = pte_lockptr(0, pmd);
12715 - if (lock)
12716 - spin_lock(ptl);
12717 - else
12718 - spin_unlock(ptl);
12719 - }
12720 - }
12721 - }
12722 - }
12723 -#endif
12724 - if (!lock)
12725 - spin_unlock(&mm->page_table_lock);
12726 -}
12727 -#define pin_lock(mm) _pin_lock(mm, 1)
12728 -#define pin_unlock(mm) _pin_lock(mm, 0)
12729 -
12730 -#define PIN_BATCH sizeof(void *)
12731 -static DEFINE_PER_CPU(multicall_entry_t[PIN_BATCH], pb_mcl);
12732 -
12733 -static inline unsigned int pgd_walk_set_prot(struct page *page, pgprot_t flags,
12734 - unsigned int cpu, unsigned int seq)
12735 -{
12736 - unsigned long pfn = page_to_pfn(page);
12737 -
12738 - if (PageHighMem(page)) {
12739 - if (pgprot_val(flags) & _PAGE_RW)
12740 - ClearPagePinned(page);
12741 - else
12742 - SetPagePinned(page);
12743 - } else {
12744 - MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
12745 - (unsigned long)__va(pfn << PAGE_SHIFT),
12746 - pfn_pte(pfn, flags), 0);
12747 - if (unlikely(++seq == PIN_BATCH)) {
12748 - if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
12749 - PIN_BATCH, NULL)))
12750 - BUG();
12751 - seq = 0;
12752 - }
12753 - }
12754 -
12755 - return seq;
12756 -}
12757 -
12758 -static void pgd_walk(pgd_t *pgd_base, pgprot_t flags)
12759 -{
12760 - pgd_t *pgd = pgd_base;
12761 - pud_t *pud;
12762 - pmd_t *pmd;
12763 - int g,u,m;
12764 - unsigned int cpu, seq;
12765 - multicall_entry_t *mcl;
12766 -
12767 - if (xen_feature(XENFEAT_auto_translated_physmap))
12768 - return;
12769 -
12770 - cpu = get_cpu();
12771 -
12772 - /*
12773 - * Cannot iterate up to USER_PTRS_PER_PGD on x86-64 as these pagetables
12774 - * may not be the 'current' task's pagetables (e.g., current may be
12775 - * 32-bit, but the pagetables may be for a 64-bit task).
12776 - * Subtracting 1 from TASK_SIZE64 means the loop limit is correct
12777 - * regardless of whether TASK_SIZE64 is a multiple of PGDIR_SIZE.
12778 - */
12779 - for (g = 0, seq = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
12780 - if (pgd_none(*pgd))
12781 - continue;
12782 - pud = pud_offset(pgd, 0);
12783 - if (PTRS_PER_PUD > 1) /* not folded */
12784 - seq = pgd_walk_set_prot(virt_to_page(pud),flags,cpu,seq);
12785 - for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
12786 - if (pud_none(*pud))
12787 - continue;
12788 - pmd = pmd_offset(pud, 0);
12789 - if (PTRS_PER_PMD > 1) /* not folded */
12790 - seq = pgd_walk_set_prot(virt_to_page(pmd),flags,cpu,seq);
12791 - for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
12792 - if (pmd_none(*pmd))
12793 - continue;
12794 - seq = pgd_walk_set_prot(pmd_page(*pmd),flags,cpu,seq);
12795 - }
12796 - }
12797 - }
12798 -
12799 - mcl = per_cpu(pb_mcl, cpu);
12800 -#ifdef CONFIG_X86_64
12801 - if (unlikely(seq > PIN_BATCH - 2)) {
12802 - if (unlikely(HYPERVISOR_multicall_check(mcl, seq, NULL)))
12803 - BUG();
12804 - seq = 0;
12805 - }
12806 - MULTI_update_va_mapping(mcl + seq,
12807 - (unsigned long)__user_pgd(pgd_base),
12808 - pfn_pte(virt_to_phys(__user_pgd(pgd_base))>>PAGE_SHIFT, flags),
12809 - 0);
12810 - MULTI_update_va_mapping(mcl + seq + 1,
12811 - (unsigned long)pgd_base,
12812 - pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
12813 - UVMF_TLB_FLUSH);
12814 - if (unlikely(HYPERVISOR_multicall_check(mcl, seq + 2, NULL)))
12815 - BUG();
12816 -#else
12817 - if (likely(seq != 0)) {
12818 - MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
12819 - (unsigned long)pgd_base,
12820 - pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
12821 - UVMF_TLB_FLUSH);
12822 - if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
12823 - seq + 1, NULL)))
12824 - BUG();
12825 - } else if(HYPERVISOR_update_va_mapping((unsigned long)pgd_base,
12826 - pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
12827 - UVMF_TLB_FLUSH))
12828 - BUG();
12829 -#endif
12830 -
12831 - put_cpu();
12832 -}
12833 -
12834 -static void __pgd_pin(pgd_t *pgd)
12835 -{
12836 - pgd_walk(pgd, PAGE_KERNEL_RO);
12837 - kmap_flush_unused();
12838 - xen_pgd_pin(__pa(pgd)); /* kernel */
12839 -#ifdef CONFIG_X86_64
12840 - xen_pgd_pin(__pa(__user_pgd(pgd))); /* user */
12841 -#endif
12842 - SetPagePinned(virt_to_page(pgd));
12843 -}
12844 -
12845 -static void __pgd_unpin(pgd_t *pgd)
12846 -{
12847 - xen_pgd_unpin(__pa(pgd));
12848 -#ifdef CONFIG_X86_64
12849 - xen_pgd_unpin(__pa(__user_pgd(pgd)));
12850 -#endif
12851 - pgd_walk(pgd, PAGE_KERNEL);
12852 - ClearPagePinned(virt_to_page(pgd));
12853 -}
12854 -
12855 -void pgd_test_and_unpin(pgd_t *pgd)
12856 -{
12857 - if (PagePinned(virt_to_page(pgd)))
12858 - __pgd_unpin(pgd);
12859 -}
12860 -
12861 -void mm_pin(struct mm_struct *mm)
12862 -{
12863 - if (xen_feature(XENFEAT_writable_page_tables))
12864 - return;
12865 -
12866 - pin_lock(mm);
12867 - __pgd_pin(mm->pgd);
12868 - pin_unlock(mm);
12869 -}
12870 -
12871 -void mm_unpin(struct mm_struct *mm)
12872 -{
12873 - if (xen_feature(XENFEAT_writable_page_tables))
12874 - return;
12875 -
12876 - pin_lock(mm);
12877 - __pgd_unpin(mm->pgd);
12878 - pin_unlock(mm);
12879 -}
12880 -
12881 -void mm_pin_all(void)
12882 -{
12883 - struct page *page;
12884 - unsigned long flags;
12885 -
12886 - if (xen_feature(XENFEAT_writable_page_tables))
12887 - return;
12888 -
12889 - /*
12890 - * Allow uninterrupted access to the pgd_list. Also protects
12891 - * __pgd_pin() by disabling preemption.
12892 - * All other CPUs must be at a safe point (e.g., in stop_machine
12893 - * or offlined entirely).
12894 - */
12895 - spin_lock_irqsave(&pgd_lock, flags);
12896 - list_for_each_entry(page, &pgd_list, lru) {
12897 - if (!PagePinned(page))
12898 - __pgd_pin((pgd_t *)page_address(page));
12899 - }
12900 - spin_unlock_irqrestore(&pgd_lock, flags);
12901 -}
12902 -
12903 -void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
12904 -{
12905 - if (!PagePinned(virt_to_page(mm->pgd)))
12906 - mm_pin(mm);
12907 -}
12908 -
12909 -void arch_exit_mmap(struct mm_struct *mm)
12910 -{
12911 - struct task_struct *tsk = current;
12912 -
12913 - task_lock(tsk);
12914 -
12915 - /*
12916 - * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
12917 - * *much* faster this way, as no tlb flushes means bigger wrpt batches.
12918 - */
12919 - if (tsk->active_mm == mm) {
12920 - tsk->active_mm = &init_mm;
12921 - atomic_inc(&init_mm.mm_count);
12922 -
12923 - switch_mm(mm, &init_mm, tsk);
12924 -
12925 - atomic_dec(&mm->mm_count);
12926 - BUG_ON(atomic_read(&mm->mm_count) == 0);
12927 - }
12928 -
12929 - task_unlock(tsk);
12930 -
12931 - if (PagePinned(virt_to_page(mm->pgd))
12932 - && atomic_read(&mm->mm_count) == 1
12933 - && !mm->context.has_foreign_mappings)
12934 - mm_unpin(mm);
12935 -}
12936 -
12937 -static void _pte_free(struct page *page, unsigned int order)
12938 -{
12939 - BUG_ON(order);
12940 - __pte_free(page);
12941 -}
12942 -
12943 -pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
12944 -{
12945 - struct page *pte;
12946 -
12947 -#ifdef CONFIG_HIGHPTE
12948 - pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
12949 -#else
12950 - pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
12951 -#endif
12952 - if (pte) {
12953 - pgtable_page_ctor(pte);
12954 - SetPageForeign(pte, _pte_free);
12955 - init_page_count(pte);
12956 - }
12957 - return pte;
12958 -}
12959 -
12960 -void __pte_free(pgtable_t pte)
12961 -{
12962 - if (!PageHighMem(pte)) {
12963 - unsigned long va = (unsigned long)page_address(pte);
12964 - unsigned int level;
12965 - pte_t *ptep = lookup_address(va, &level);
12966 -
12967 - BUG_ON(!ptep || level != PG_LEVEL_4K || !pte_present(*ptep));
12968 - if (!pte_write(*ptep)
12969 - && HYPERVISOR_update_va_mapping(va,
12970 - mk_pte(pte, PAGE_KERNEL),
12971 - 0))
12972 - BUG();
12973 - } else
12974 -#ifdef CONFIG_HIGHPTE
12975 - ClearPagePinned(pte);
12976 -#else
12977 - BUG();
12978 -#endif
12979 -
12980 - ClearPageForeign(pte);
12981 - init_page_count(pte);
12982 - pgtable_page_dtor(pte);
12983 - __free_page(pte);
12984 -}
12985 -
12986 -#if PAGETABLE_LEVELS >= 3
12987 -static void _pmd_free(struct page *page, unsigned int order)
12988 -{
12989 - BUG_ON(order);
12990 - __pmd_free(page);
12991 -}
12992 -
12993 -pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address)
12994 -{
12995 - struct page *pmd;
12996 -
12997 - pmd = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
12998 - if (!pmd)
12999 - return NULL;
13000 - SetPageForeign(pmd, _pmd_free);
13001 - init_page_count(pmd);
13002 - return page_address(pmd);
13003 -}
13004 -
13005 -void __pmd_free(pgtable_t pmd)
13006 -{
13007 - unsigned long va = (unsigned long)page_address(pmd);
13008 - unsigned int level;
13009 - pte_t *ptep = lookup_address(va, &level);
13010 -
13011 - BUG_ON(!ptep || level != PG_LEVEL_4K || !pte_present(*ptep));
13012 - if (!pte_write(*ptep)
13013 - && HYPERVISOR_update_va_mapping(va, mk_pte(pmd, PAGE_KERNEL), 0))
13014 - BUG();
13015 -
13016 - ClearPageForeign(pmd);
13017 - init_page_count(pmd);
13018 - __free_page(pmd);
13019 -}
13020 -#endif
13021 -
13022 -/* blktap and gntdev need this, as otherwise they would implicitly (and
13023 - * needlessly, as they never use it) reference init_mm. */
13024 -pte_t xen_ptep_get_and_clear_full(struct vm_area_struct *vma,
13025 - unsigned long addr, pte_t *ptep, int full)
13026 -{
13027 - return ptep_get_and_clear_full(vma->vm_mm, addr, ptep, full);
13028 -}
13029 -EXPORT_SYMBOL_GPL(xen_ptep_get_and_clear_full);
13030 +#include <asm/pat.h>
13031
13032 /*
13033 * The current flushing context - we pass it instead of 5 arguments:
13034 @@ -392,6 +31,7 @@ struct cpa_data {
13035 int numpages;
13036 int flushtlb;
13037 unsigned long pfn;
13038 + unsigned force_split : 1;
13039 };
13040
13041 #ifdef CONFIG_X86_64
13042 @@ -637,6 +277,9 @@ try_preserve_large_page(pte_t *kpte, uns
13043 int i, do_split = 1;
13044 unsigned int level;
13045
13046 + if (cpa->force_split)
13047 + return 1;
13048 +
13049 spin_lock_irqsave(&pgd_lock, flags);
13050 /*
13051 * Check for races, another CPU might have split this page
13052 @@ -856,9 +499,7 @@ static int split_large_page(pte_t *kpte,
13053 goto out_unlock;
13054
13055 pbase = (pte_t *)page_address(base);
13056 -#ifdef CONFIG_X86_32
13057 - paravirt_alloc_pt(&init_mm, page_to_pfn(base));
13058 -#endif
13059 + paravirt_alloc_pte(&init_mm, page_to_pfn(base));
13060 ref_prot = pte_pgprot(pte_clrhuge(*kpte));
13061
13062 #ifdef CONFIG_X86_64
13063 @@ -918,7 +559,7 @@ static int __change_page_attr(struct cpa
13064 repeat:
13065 kpte = lookup_address(address, &level);
13066 if (!kpte)
13067 - return primary ? -EINVAL : 0;
13068 + return 0;
13069
13070 old_pte = *kpte;
13071 if (!__pte_val(old_pte)) {
13072 @@ -1077,7 +718,8 @@ static inline int cache_attr(pgprot_t at
13073 }
13074
13075 static int change_page_attr_set_clr(unsigned long addr, int numpages,
13076 - pgprot_t mask_set, pgprot_t mask_clr)
13077 + pgprot_t mask_set, pgprot_t mask_clr,
13078 + int force_split)
13079 {
13080 struct cpa_data cpa;
13081 int ret, cache, checkalias;
13082 @@ -1088,7 +730,7 @@ static int change_page_attr_set_clr(unsi
13083 */
13084 mask_set = canon_pgprot(mask_set);
13085 mask_clr = canon_pgprot(mask_clr);
13086 - if (!pgprot_val(mask_set) && !pgprot_val(mask_clr))
13087 + if (!pgprot_val(mask_set) && !pgprot_val(mask_clr) && !force_split)
13088 return 0;
13089
13090 /* Ensure we are PAGE_SIZE aligned */
13091 @@ -1105,6 +747,7 @@ static int change_page_attr_set_clr(unsi
13092 cpa.mask_set = mask_set;
13093 cpa.mask_clr = mask_clr;
13094 cpa.flushtlb = 0;
13095 + cpa.force_split = force_split;
13096
13097 /* No alias checking for _NX bit modifications */
13098 checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
13099 @@ -1143,26 +786,67 @@ out:
13100 static inline int change_page_attr_set(unsigned long addr, int numpages,
13101 pgprot_t mask)
13102 {
13103 - return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0));
13104 + return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0);
13105 }
13106
13107 static inline int change_page_attr_clear(unsigned long addr, int numpages,
13108 pgprot_t mask)
13109 {
13110 - return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask);
13111 + return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0);
13112 }
13113
13114 -int set_memory_uc(unsigned long addr, int numpages)
13115 +int _set_memory_uc(unsigned long addr, int numpages)
13116 {
13117 + /*
13118 + * for now UC MINUS. see comments in ioremap_nocache()
13119 + */
13120 return change_page_attr_set(addr, numpages,
13121 - __pgprot(_PAGE_PCD));
13122 + __pgprot(_PAGE_CACHE_UC_MINUS));
13123 +}
13124 +
13125 +int set_memory_uc(unsigned long addr, int numpages)
13126 +{
13127 + /*
13128 + * for now UC MINUS. see comments in ioremap_nocache()
13129 + */
13130 + if (reserve_memtype(addr, addr + numpages * PAGE_SIZE,
13131 + _PAGE_CACHE_UC_MINUS, NULL))
13132 + return -EINVAL;
13133 +
13134 + return _set_memory_uc(addr, numpages);
13135 }
13136 EXPORT_SYMBOL(set_memory_uc);
13137
13138 -int set_memory_wb(unsigned long addr, int numpages)
13139 +int _set_memory_wc(unsigned long addr, int numpages)
13140 +{
13141 + return change_page_attr_set(addr, numpages,
13142 + __pgprot(_PAGE_CACHE_WC));
13143 +}
13144 +
13145 +int set_memory_wc(unsigned long addr, int numpages)
13146 +{
13147 + if (!pat_wc_enabled)
13148 + return set_memory_uc(addr, numpages);
13149 +
13150 + if (reserve_memtype(addr, addr + numpages * PAGE_SIZE,
13151 + _PAGE_CACHE_WC, NULL))
13152 + return -EINVAL;
13153 +
13154 + return _set_memory_wc(addr, numpages);
13155 +}
13156 +EXPORT_SYMBOL(set_memory_wc);
13157 +
13158 +int _set_memory_wb(unsigned long addr, int numpages)
13159 {
13160 return change_page_attr_clear(addr, numpages,
13161 - __pgprot(_PAGE_PCD | _PAGE_PWT));
13162 + __pgprot(_PAGE_CACHE_MASK));
13163 +}
13164 +
13165 +int set_memory_wb(unsigned long addr, int numpages)
13166 +{
13167 + free_memtype(addr, addr + numpages * PAGE_SIZE);
13168 +
13169 + return _set_memory_wb(addr, numpages);
13170 }
13171 EXPORT_SYMBOL(set_memory_wb);
13172
13173 @@ -1193,6 +877,12 @@ int set_memory_np(unsigned long addr, in
13174 return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_PRESENT));
13175 }
13176
13177 +int set_memory_4k(unsigned long addr, int numpages)
13178 +{
13179 + return change_page_attr_set_clr(addr, numpages, __pgprot(0),
13180 + __pgprot(0), 1);
13181 +}
13182 +
13183 int set_pages_uc(struct page *page, int numpages)
13184 {
13185 unsigned long addr = (unsigned long)page_address(page);
13186 @@ -1302,6 +992,45 @@ void kernel_map_pages(struct page *page,
13187 cpa_fill_pool(NULL);
13188 }
13189
13190 +#ifdef CONFIG_DEBUG_FS
13191 +static int dpa_show(struct seq_file *m, void *v)
13192 +{
13193 + seq_puts(m, "DEBUG_PAGEALLOC\n");
13194 + seq_printf(m, "pool_size : %lu\n", pool_size);
13195 + seq_printf(m, "pool_pages : %lu\n", pool_pages);
13196 + seq_printf(m, "pool_low : %lu\n", pool_low);
13197 + seq_printf(m, "pool_used : %lu\n", pool_used);
13198 + seq_printf(m, "pool_failed : %lu\n", pool_failed);
13199 +
13200 + return 0;
13201 +}
13202 +
13203 +static int dpa_open(struct inode *inode, struct file *filp)
13204 +{
13205 + return single_open(filp, dpa_show, NULL);
13206 +}
13207 +
13208 +static const struct file_operations dpa_fops = {
13209 + .open = dpa_open,
13210 + .read = seq_read,
13211 + .llseek = seq_lseek,
13212 + .release = single_release,
13213 +};
13214 +
13215 +static int __init debug_pagealloc_proc_init(void)
13216 +{
13217 + struct dentry *de;
13218 +
13219 + de = debugfs_create_file("debug_pagealloc", 0600, NULL, NULL,
13220 + &dpa_fops);
13221 + if (!de)
13222 + return -ENOMEM;
13223 +
13224 + return 0;
13225 +}
13226 +__initcall(debug_pagealloc_proc_init);
13227 +#endif
13228 +
13229 #ifdef CONFIG_HIBERNATION
13230
13231 bool kernel_page_present(struct page *page)
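
In the pageattr hunks above, set_memory_uc() and set_memory_wc() now reserve the range in the PAT memtype tracker before changing page attributes, and set_memory_wb() drops the reservation before restoring WB. A minimal sketch of that pairing, assuming stub helpers: reserve_region(), release_region() and apply_attr() below are invented stand-ins, not kernel interfaces.

#include <stdio.h>

static int reserve_region(unsigned long start, unsigned long end)
{
	/* In the kernel this is reserve_memtype(); the stub always succeeds. */
	printf("reserve %#lx-%#lx\n", start, end);
	return 0;
}

static void release_region(unsigned long start, unsigned long end)
{
	/* Counterpart of free_memtype() in the kernel. */
	printf("release %#lx-%#lx\n", start, end);
}

static int apply_attr(const char *attr)
{
	/* Stands in for the change_page_attr_set/clear step. */
	printf("change page attributes -> %s\n", attr);
	return 0;
}

/* Mirrors set_memory_wc(): reserve first, only then touch page attributes. */
static int set_wc(unsigned long start, unsigned long end)
{
	if (reserve_region(start, end))
		return -1;
	return apply_attr("WC");
}

/* Mirrors set_memory_wb(): drop the reservation, then restore write-back. */
static int set_wb(unsigned long start, unsigned long end)
{
	release_region(start, end);
	return apply_attr("WB");
}

int main(void)
{
	set_wc(0x10000, 0x11000);
	set_wb(0x10000, 0x11000);
	return 0;
}
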
13232 --- /dev/null
13233 +++ b/arch/x86/mm/pat-xen.c
13234 @@ -0,0 +1,602 @@
13235 +/*
13236 + * Handle caching attributes in page tables (PAT)
13237 + *
13238 + * Authors: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
13239 + * Suresh B Siddha <suresh.b.siddha@intel.com>
13240 + *
13241 + * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen.
13242 + */
13243 +
13244 +#include <linux/mm.h>
13245 +#include <linux/kernel.h>
13246 +#include <linux/gfp.h>
13247 +#include <linux/fs.h>
13248 +#include <linux/bootmem.h>
13249 +
13250 +#include <asm/msr.h>
13251 +#include <asm/tlbflush.h>
13252 +#include <asm/processor.h>
13253 +#include <asm/page.h>
13254 +#include <asm/pgtable.h>
13255 +#include <asm/pat.h>
13256 +#include <asm/e820.h>
13257 +#include <asm/cacheflush.h>
13258 +#include <asm/fcntl.h>
13259 +#include <asm/mtrr.h>
13260 +#include <asm/io.h>
13261 +
13262 +#ifdef CONFIG_X86_PAT
13263 +int __read_mostly pat_wc_enabled = 1;
13264 +
13265 +void __cpuinit pat_disable(char *reason)
13266 +{
13267 + pat_wc_enabled = 0;
13268 + printk(KERN_INFO "%s\n", reason);
13269 +}
13270 +
13271 +static int __init nopat(char *str)
13272 +{
13273 + pat_disable("PAT support disabled.");
13274 + return 0;
13275 +}
13276 +early_param("nopat", nopat);
13277 +#endif
13278 +
13279 +static u64 __read_mostly boot_pat_state;
13280 +
13281 +enum {
13282 + PAT_UC = 0, /* uncached */
13283 + PAT_WC = 1, /* Write combining */
13284 + PAT_WT = 4, /* Write Through */
13285 + PAT_WP = 5, /* Write Protected */
13286 + PAT_WB = 6, /* Write Back (default) */
13287 +	PAT_UC_MINUS = 7,	/* UC, but can be overridden by MTRR */
13288 +};
13289 +
13290 +#define PAT(x,y) ((u64)PAT_ ## y << ((x)*8))
13291 +
13292 +void pat_init(void)
13293 +{
13294 + u64 pat;
13295 +
13296 + if (!pat_wc_enabled)
13297 + return;
13298 +
13299 + /* Paranoia check. */
13300 + if (!cpu_has_pat) {
13301 + printk(KERN_ERR "PAT enabled, but CPU feature cleared\n");
13302 + /*
13303 + * Panic if this happens on the secondary CPU, and we
13304 + * switched to PAT on the boot CPU. We have no way to
13305 + * undo PAT.
13306 + */
13307 + BUG_ON(boot_pat_state);
13308 + }
13309 +
13310 +#ifndef CONFIG_XEN
13311 + /* Set PWT to Write-Combining. All other bits stay the same */
13312 + /*
13313 + * PTE encoding used in Linux:
13314 + * PAT
13315 + * |PCD
13316 + * ||PWT
13317 + * |||
13318 + * 000 WB _PAGE_CACHE_WB
13319 + * 001 WC _PAGE_CACHE_WC
13320 + * 010 UC- _PAGE_CACHE_UC_MINUS
13321 + * 011 UC _PAGE_CACHE_UC
13322 + * PAT bit unused
13323 + */
13324 + pat = PAT(0,WB) | PAT(1,WC) | PAT(2,UC_MINUS) | PAT(3,UC) |
13325 + PAT(4,WB) | PAT(5,WC) | PAT(6,UC_MINUS) | PAT(7,UC);
13326 +
13327 + /* Boot CPU check */
13328 + if (!boot_pat_state)
13329 + rdmsrl(MSR_IA32_CR_PAT, boot_pat_state);
13330 +
13331 + wrmsrl(MSR_IA32_CR_PAT, pat);
13332 +#else
13333 + /*
13334 + * PAT settings are part of the hypervisor interface, and their
13335 + * assignment cannot be changed.
13336 + */
13337 + rdmsrl(MSR_IA32_CR_PAT, pat);
13338 + if (!boot_pat_state)
13339 + boot_pat_state = pat;
13340 +#endif
13341 + printk(KERN_INFO "x86 PAT enabled: cpu %d, old 0x%Lx, new 0x%Lx\n",
13342 + smp_processor_id(), boot_pat_state, pat);
13343 +}
13344 +
13345 +#undef PAT
13346 +
13347 +static char *cattr_name(unsigned long flags)
13348 +{
13349 + switch (flags & _PAGE_CACHE_MASK) {
13350 + case _PAGE_CACHE_UC: return "uncached";
13351 + case _PAGE_CACHE_UC_MINUS: return "uncached-minus";
13352 + case _PAGE_CACHE_WB: return "write-back";
13353 + case _PAGE_CACHE_WC: return "write-combining";
13354 + case _PAGE_CACHE_WP: return "write-protected";
13355 + case _PAGE_CACHE_WT: return "write-through";
13356 + default: return "broken";
13357 + }
13358 +}
13359 +
13360 +/*
13361 + * The global memtype list keeps track of memory type for specific
13362 + * physical memory areas. Conflicting memory types in different
13363 + * mappings can cause CPU cache corruption. To avoid this we keep track.
13364 + *
13365 + * The list is sorted based on starting address and can contain multiple
13366 + * entries for each address (this allows reference counting for overlapping
13367 + * areas). All the aliases have the same cache attributes of course.
13368 + * Zero attributes are represented as holes.
13369 + *
13370 + * Currently the data structure is a list because the number of mappings
13371 + * is expected to be relatively small. If this should become a problem,
13372 + * it could be changed to an rbtree or similar.
13373 + *
13374 + * memtype_lock protects the whole list.
13375 + */
13376 +
13377 +struct memtype {
13378 + u64 start;
13379 + u64 end;
13380 + unsigned long type;
13381 + struct list_head nd;
13382 +};
13383 +
13384 +static LIST_HEAD(memtype_list);
13385 +static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */
13386 +
13387 +/*
13388 + * Does intersection of PAT memory type and MTRR memory type and returns
13389 + * the resulting memory type as PAT understands it.
13390 + * (Type in pat and mtrr will not have the same value)
13391 + * The intersection is based on "Effective Memory Type" tables in IA-32
13392 + * SDM vol 3a
13393 + */
13394 +static int pat_x_mtrr_type(u64 start, u64 end, unsigned long prot,
13395 + unsigned long *ret_prot)
13396 +{
13397 + unsigned long pat_type;
13398 + u8 mtrr_type;
13399 +
13400 + pat_type = prot & _PAGE_CACHE_MASK;
13401 + prot &= (~_PAGE_CACHE_MASK);
13402 +
13403 + /*
13404 + * We return the PAT request directly for types where PAT takes
13405 + * precedence with respect to MTRR and for UC_MINUS.
13406 + * Consistency checks with other PAT requests is done later
13407 + * while going through memtype list.
13408 + */
13409 + if (pat_type == _PAGE_CACHE_WC) {
13410 + *ret_prot = prot | _PAGE_CACHE_WC;
13411 + return 0;
13412 + } else if (pat_type == _PAGE_CACHE_UC_MINUS) {
13413 + *ret_prot = prot | _PAGE_CACHE_UC_MINUS;
13414 + return 0;
13415 + } else if (pat_type == _PAGE_CACHE_UC) {
13416 + *ret_prot = prot | _PAGE_CACHE_UC;
13417 + return 0;
13418 + }
13419 +
13420 + /*
13421 + * Look for MTRR hint to get the effective type in case where PAT
13422 + * request is for WB.
13423 + */
13424 + mtrr_type = mtrr_type_lookup(start, end);
13425 +
13426 + if (mtrr_type == MTRR_TYPE_UNCACHABLE) {
13427 + *ret_prot = prot | _PAGE_CACHE_UC;
13428 + } else if (mtrr_type == MTRR_TYPE_WRCOMB) {
13429 + *ret_prot = prot | _PAGE_CACHE_WC;
13430 + } else {
13431 + *ret_prot = prot | _PAGE_CACHE_WB;
13432 + }
13433 +
13434 + return 0;
13435 +}
13436 +
13437 +/*
13438 + * req_type typically has one of the:
13439 + * - _PAGE_CACHE_WB
13440 + * - _PAGE_CACHE_WC
13441 + * - _PAGE_CACHE_UC_MINUS
13442 + * - _PAGE_CACHE_UC
13443 + *
13444 + * req_type will have a special case value '-1', when the requester wants to inherit
13445 + * the memory type from mtrr (if WB), existing PAT, defaulting to UC_MINUS.
13446 + *
13447 + * If ret_type is NULL, function will return an error if it cannot reserve the
13448 + * region with req_type. If ret_type is non-null, function will return
13449 + * available type in ret_type in case of no error. In case of any error
13450 + * it will return a negative return value.
13451 + */
13452 +int reserve_memtype(u64 start, u64 end, unsigned long req_type,
13453 + unsigned long *ret_type)
13454 +{
13455 + struct memtype *new_entry = NULL;
13456 + struct memtype *parse;
13457 + unsigned long actual_type;
13458 + int err = 0;
13459 +
13460 + /* Only track when pat_wc_enabled */
13461 + if (!pat_wc_enabled) {
13462 + /* This is identical to page table setting without PAT */
13463 + if (ret_type) {
13464 + if (req_type == -1) {
13465 + *ret_type = _PAGE_CACHE_WB;
13466 + } else {
13467 + *ret_type = req_type;
13468 + }
13469 + }
13470 + return 0;
13471 + }
13472 +
13473 + /* Low ISA region is always mapped WB in page table. No need to track */
13474 + if (start >= ISA_START_ADDRESS && (end - 1) <= ISA_END_ADDRESS) {
13475 + if (ret_type)
13476 + *ret_type = _PAGE_CACHE_WB;
13477 +
13478 + return 0;
13479 + }
13480 +
13481 + if (req_type == -1) {
13482 + /*
13483 + * Call mtrr_lookup to get the type hint. This is an
13484 + * optimization for /dev/mem mmap'ers into WB memory (BIOS
13485 + * tools and ACPI tools). Use WB request for WB memory and use
13486 + * UC_MINUS otherwise.
13487 + */
13488 + u8 mtrr_type = mtrr_type_lookup(start, end);
13489 +
13490 + if (mtrr_type == MTRR_TYPE_WRBACK) {
13491 + req_type = _PAGE_CACHE_WB;
13492 + actual_type = _PAGE_CACHE_WB;
13493 + } else {
13494 + req_type = _PAGE_CACHE_UC_MINUS;
13495 + actual_type = _PAGE_CACHE_UC_MINUS;
13496 + }
13497 + } else {
13498 + req_type &= _PAGE_CACHE_MASK;
13499 + err = pat_x_mtrr_type(start, end, req_type, &actual_type);
13500 + }
13501 +
13502 + if (err) {
13503 + if (ret_type)
13504 + *ret_type = actual_type;
13505 +
13506 + return -EINVAL;
13507 + }
13508 +
13509 + new_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL);
13510 + if (!new_entry)
13511 + return -ENOMEM;
13512 +
13513 + new_entry->start = start;
13514 + new_entry->end = end;
13515 + new_entry->type = actual_type;
13516 +
13517 + if (ret_type)
13518 + *ret_type = actual_type;
13519 +
13520 + spin_lock(&memtype_lock);
13521 +
13522 + /* Search for existing mapping that overlaps the current range */
13523 + list_for_each_entry(parse, &memtype_list, nd) {
13524 + struct memtype *saved_ptr;
13525 +
13526 + if (parse->start >= end) {
13527 + pr_debug("New Entry\n");
13528 + list_add(&new_entry->nd, parse->nd.prev);
13529 + new_entry = NULL;
13530 + break;
13531 + }
13532 +
13533 + if (start <= parse->start && end >= parse->start) {
13534 + if (actual_type != parse->type && ret_type) {
13535 + actual_type = parse->type;
13536 + *ret_type = actual_type;
13537 + new_entry->type = actual_type;
13538 + }
13539 +
13540 + if (actual_type != parse->type) {
13541 + printk(
13542 + KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
13543 + current->comm, current->pid,
13544 + start, end,
13545 + cattr_name(actual_type),
13546 + cattr_name(parse->type));
13547 + err = -EBUSY;
13548 + break;
13549 + }
13550 +
13551 + saved_ptr = parse;
13552 + /*
13553 + * Check to see whether the request overlaps more
13554 + * than one entry in the list
13555 + */
13556 + list_for_each_entry_continue(parse, &memtype_list, nd) {
13557 + if (end <= parse->start) {
13558 + break;
13559 + }
13560 +
13561 + if (actual_type != parse->type) {
13562 + printk(
13563 + KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
13564 + current->comm, current->pid,
13565 + start, end,
13566 + cattr_name(actual_type),
13567 + cattr_name(parse->type));
13568 + err = -EBUSY;
13569 + break;
13570 + }
13571 + }
13572 +
13573 + if (err) {
13574 + break;
13575 + }
13576 +
13577 + pr_debug("Overlap at 0x%Lx-0x%Lx\n",
13578 + saved_ptr->start, saved_ptr->end);
13579 + /* No conflict. Go ahead and add this new entry */
13580 + list_add(&new_entry->nd, saved_ptr->nd.prev);
13581 + new_entry = NULL;
13582 + break;
13583 + }
13584 +
13585 + if (start < parse->end) {
13586 + if (actual_type != parse->type && ret_type) {
13587 + actual_type = parse->type;
13588 + *ret_type = actual_type;
13589 + new_entry->type = actual_type;
13590 + }
13591 +
13592 + if (actual_type != parse->type) {
13593 + printk(
13594 + KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
13595 + current->comm, current->pid,
13596 + start, end,
13597 + cattr_name(actual_type),
13598 + cattr_name(parse->type));
13599 + err = -EBUSY;
13600 + break;
13601 + }
13602 +
13603 + saved_ptr = parse;
13604 + /*
13605 + * Check to see whether the request overlaps more
13606 + * than one entry in the list
13607 + */
13608 + list_for_each_entry_continue(parse, &memtype_list, nd) {
13609 + if (end <= parse->start) {
13610 + break;
13611 + }
13612 +
13613 + if (actual_type != parse->type) {
13614 + printk(
13615 + KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
13616 + current->comm, current->pid,
13617 + start, end,
13618 + cattr_name(actual_type),
13619 + cattr_name(parse->type));
13620 + err = -EBUSY;
13621 + break;
13622 + }
13623 + }
13624 +
13625 + if (err) {
13626 + break;
13627 + }
13628 +
13629 +			pr_debug("Overlap at 0x%Lx-0x%Lx\n",
13630 + saved_ptr->start, saved_ptr->end);
13631 + /* No conflict. Go ahead and add this new entry */
13632 + list_add(&new_entry->nd, &saved_ptr->nd);
13633 + new_entry = NULL;
13634 + break;
13635 + }
13636 + }
13637 +
13638 + if (err) {
13639 + printk(KERN_INFO
13640 + "reserve_memtype failed 0x%Lx-0x%Lx, track %s, req %s\n",
13641 + start, end, cattr_name(new_entry->type),
13642 + cattr_name(req_type));
13643 + kfree(new_entry);
13644 + spin_unlock(&memtype_lock);
13645 + return err;
13646 + }
13647 +
13648 + if (new_entry) {
13649 + /* No conflict. Not yet added to the list. Add to the tail */
13650 + list_add_tail(&new_entry->nd, &memtype_list);
13651 + pr_debug("New Entry\n");
13652 + }
13653 +
13654 + if (ret_type) {
13655 + pr_debug(
13656 + "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n",
13657 + start, end, cattr_name(actual_type),
13658 + cattr_name(req_type), cattr_name(*ret_type));
13659 + } else {
13660 + pr_debug(
13661 + "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s\n",
13662 + start, end, cattr_name(actual_type),
13663 + cattr_name(req_type));
13664 + }
13665 +
13666 + spin_unlock(&memtype_lock);
13667 + return err;
13668 +}
13669 +
13670 +int free_memtype(u64 start, u64 end)
13671 +{
13672 + struct memtype *ml;
13673 + int err = -EINVAL;
13674 +
13675 + /* Only track when pat_wc_enabled */
13676 + if (!pat_wc_enabled) {
13677 + return 0;
13678 + }
13679 +
13680 + /* Low ISA region is always mapped WB. No need to track */
13681 + if (start >= ISA_START_ADDRESS && end <= ISA_END_ADDRESS) {
13682 + return 0;
13683 + }
13684 +
13685 + spin_lock(&memtype_lock);
13686 + list_for_each_entry(ml, &memtype_list, nd) {
13687 + if (ml->start == start && ml->end == end) {
13688 + list_del(&ml->nd);
13689 + kfree(ml);
13690 + err = 0;
13691 + break;
13692 + }
13693 + }
13694 + spin_unlock(&memtype_lock);
13695 +
13696 + if (err) {
13697 + printk(KERN_INFO "%s:%d freeing invalid memtype %Lx-%Lx\n",
13698 + current->comm, current->pid, start, end);
13699 + }
13700 +
13701 + pr_debug("free_memtype request 0x%Lx-0x%Lx\n", start, end);
13702 + return err;
13703 +}
13704 +
13705 +
13706 +/*
13707 + * /dev/mem mmap interface. The memtype used for mapping varies:
13708 + * - Use UC for mappings with O_SYNC flag
13709 + * - Without O_SYNC flag, if there is any conflict in reserve_memtype,
13710 + * inherit the memtype from existing mapping.
13711 + * - Else use UC_MINUS memtype (for backward compatibility with existing
13712 + *   X drivers).
13713 + */
13714 +pgprot_t phys_mem_access_prot(struct file *file, unsigned long mfn,
13715 + unsigned long size, pgprot_t vma_prot)
13716 +{
13717 + return vma_prot;
13718 +}
13719 +
13720 +#ifdef CONFIG_NONPROMISC_DEVMEM
13721 +/* This check is done in drivers/char/mem.c in case of NONPROMISC_DEVMEM */
13722 +static inline int range_is_allowed(unsigned long mfn, unsigned long size)
13723 +{
13724 + return 1;
13725 +}
13726 +#else
13727 +static inline int range_is_allowed(unsigned long mfn, unsigned long size)
13728 +{
13729 + u64 from = ((u64)mfn) << PAGE_SHIFT;
13730 + u64 to = from + size;
13731 + u64 cursor = from;
13732 +
13733 + while (cursor < to) {
13734 + if (!devmem_is_allowed(mfn)) {
13735 + printk(KERN_INFO
13736 + "Program %s tried to access /dev/mem between %Lx->%Lx.\n",
13737 + current->comm, from, to);
13738 + return 0;
13739 + }
13740 + cursor += PAGE_SIZE;
13741 + mfn++;
13742 + }
13743 + return 1;
13744 +}
13745 +#endif /* CONFIG_NONPROMISC_DEVMEM */
13746 +
13747 +int phys_mem_access_prot_allowed(struct file *file, unsigned long mfn,
13748 + unsigned long size, pgprot_t *vma_prot)
13749 +{
13750 + u64 addr = (u64)mfn << PAGE_SHIFT;
13751 + unsigned long flags = _PAGE_CACHE_UC_MINUS;
13752 + int retval;
13753 +
13754 + if (!range_is_allowed(mfn, size))
13755 + return 0;
13756 +
13757 + if (file->f_flags & O_SYNC) {
13758 + flags = _PAGE_CACHE_UC;
13759 + }
13760 +
13761 +#ifndef CONFIG_X86_32
13762 +#ifndef CONFIG_XEN /* Xen sets correct MTRR type on non-RAM for us. */
13763 + /*
13764 + * On the PPro and successors, the MTRRs are used to set
13765 + * memory types for physical addresses outside main memory,
13766 + * so blindly setting UC or PWT on those pages is wrong.
13767 + * For Pentiums and earlier, the surround logic should disable
13768 + * caching for the high addresses through the KEN pin, but
13769 + * we maintain the tradition of paranoia in this code.
13770 + */
13771 + if (!pat_wc_enabled &&
13772 + ! ( test_bit(X86_FEATURE_MTRR, boot_cpu_data.x86_capability) ||
13773 + test_bit(X86_FEATURE_K6_MTRR, boot_cpu_data.x86_capability) ||
13774 + test_bit(X86_FEATURE_CYRIX_ARR, boot_cpu_data.x86_capability) ||
13775 + test_bit(X86_FEATURE_CENTAUR_MCR, boot_cpu_data.x86_capability)) &&
13776 +	    (mfn << PAGE_SHIFT) >= __pa(high_memory)) {
13777 + flags = _PAGE_CACHE_UC;
13778 + }
13779 +#endif
13780 +#endif
13781 +
13782 + /*
13783 + * With O_SYNC, we can only take UC mapping. Fail if we cannot.
13784 + * Without O_SYNC, we want to get
13785 + * - WB for WB-able memory and no other conflicting mappings
13786 + * - UC_MINUS for non-WB-able memory with no other conflicting mappings
13787 +	 * - Inherit from conflicting mappings otherwise
13788 + */
13789 + if (flags != _PAGE_CACHE_UC_MINUS) {
13790 + retval = reserve_memtype(addr, addr + size, flags, NULL);
13791 + } else {
13792 + retval = reserve_memtype(addr, addr + size, -1, &flags);
13793 + }
13794 +
13795 + if (retval < 0)
13796 + return 0;
13797 +
13798 + if (ioremap_check_change_attr(mfn, size, flags) < 0) {
13799 + free_memtype(addr, addr + size);
13800 + printk(KERN_INFO
13801 + "%s:%d /dev/mem ioremap_change_attr failed %s for %Lx-%Lx\n",
13802 + current->comm, current->pid,
13803 + cattr_name(flags),
13804 + addr, addr + size);
13805 + return 0;
13806 + }
13807 +
13808 + *vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) |
13809 + flags);
13810 + return 1;
13811 +}
13812 +
13813 +void map_devmem(unsigned long mfn, unsigned long size, pgprot_t vma_prot)
13814 +{
13815 + u64 addr = (u64)mfn << PAGE_SHIFT;
13816 + unsigned long flags;
13817 + unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK);
13818 +
13819 + reserve_memtype(addr, addr + size, want_flags, &flags);
13820 + if (flags != want_flags) {
13821 + printk(KERN_INFO
13822 + "%s:%d /dev/mem expected mapping type %s for %Lx-%Lx, got %s\n",
13823 + current->comm, current->pid,
13824 + cattr_name(want_flags),
13825 + addr, (unsigned long long)(addr + size),
13826 + cattr_name(flags));
13827 + }
13828 +}
13829 +
13830 +void unmap_devmem(unsigned long mfn, unsigned long size, pgprot_t vma_prot)
13831 +{
13832 + u64 addr = (u64)mfn << PAGE_SHIFT;
13833 +
13834 + free_memtype(addr, addr + size);
13835 +}
13836 +
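
pat-xen.c above tracks reservations in a list kept sorted by start address and rejects an overlapping request whose cache type differs from what is already recorded. The model below is a deliberately simplified userspace approximation of that bookkeeping; struct range and track_range() are illustrative names, and the kernel's memtype list handles more cases (reference-counted aliases, per-entry freeing, locking).

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct range {
	unsigned long long start, end;
	const char *type;		/* "WB", "WC", "UC-", "UC" */
	struct range *next;
};

static struct range *head;

/* Returns 0 when the range is tracked, -1 on a conflicting overlap. */
static int track_range(unsigned long long start, unsigned long long end,
		       const char *type)
{
	struct range **pp = &head, *r, *n;

	/* Reject any overlap whose type differs from the request. */
	for (r = head; r; r = r->next)
		if (r->start < end && start < r->end && strcmp(r->type, type)) {
			fprintf(stderr, "conflict %llx-%llx %s<->%s\n",
				start, end, type, r->type);
			return -1;
		}

	/* Insert in front of the first entry that starts at or after us. */
	while (*pp && (*pp)->start < start)
		pp = &(*pp)->next;

	n = malloc(sizeof(*n));		/* freeing is omitted in this sketch */
	if (!n)
		return -1;
	n->start = start;
	n->end = end;
	n->type = type;
	n->next = *pp;
	*pp = n;
	return 0;
}

int main(void)
{
	printf("%d\n", track_range(0x1000, 0x2000, "WC"));
	printf("%d\n", track_range(0x1000, 0x2000, "WC"));	/* same type: allowed */
	printf("%d\n", track_range(0x1800, 0x3000, "UC"));	/* type conflict: rejected */
	return 0;
}
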
13837 --- a/arch/x86/mm/pgtable_32-xen.c
13838 +++ b/arch/x86/mm/pgtable_32-xen.c
13839 @@ -1,7 +1,3 @@
13840 -/*
13841 - * linux/arch/i386/mm/pgtable.c
13842 - */
13843 -
13844 #include <linux/sched.h>
13845 #include <linux/kernel.h>
13846 #include <linux/errno.h>
13847 @@ -41,7 +37,6 @@ void show_mem(void)
13848
13849 printk(KERN_INFO "Mem-info:\n");
13850 show_free_areas();
13851 - printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
13852 for_each_online_pgdat(pgdat) {
13853 pgdat_resize_lock(pgdat, &flags);
13854 for (i = 0; i < pgdat->node_spanned_pages; ++i) {
13855 @@ -157,243 +152,6 @@ void __init reserve_top_address(unsigned
13856 __VMALLOC_RESERVE += reserve;
13857 }
13858
13859 -pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
13860 -{
13861 - pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
13862 - if (pte)
13863 - make_lowmem_page_readonly(pte, XENFEAT_writable_page_tables);
13864 - return pte;
13865 -}
13866 -
13867 -/*
13868 - * List of all pgd's needed for non-PAE so it can invalidate entries
13869 - * in both cached and uncached pgd's; not needed for PAE since the
13870 - * kernel pmd is shared. If PAE were not to share the pmd a similar
13871 - * tactic would be needed. This is essentially codepath-based locking
13872 - * against pageattr.c; it is the unique case in which a valid change
13873 - * of kernel pagetables can't be lazily synchronized by vmalloc faults.
13874 - * vmalloc faults work because attached pagetables are never freed.
13875 - * -- wli
13876 - */
13877 -static inline void pgd_list_add(pgd_t *pgd)
13878 -{
13879 - struct page *page = virt_to_page(pgd);
13880 -
13881 - list_add(&page->lru, &pgd_list);
13882 -}
13883 -
13884 -static inline void pgd_list_del(pgd_t *pgd)
13885 -{
13886 - struct page *page = virt_to_page(pgd);
13887 -
13888 - list_del(&page->lru);
13889 -}
13890 -
13891 -#define UNSHARED_PTRS_PER_PGD \
13892 - (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD)
13893 -
13894 -static void pgd_ctor(void *p)
13895 -{
13896 - pgd_t *pgd = p;
13897 - unsigned long flags;
13898 -
13899 - pgd_test_and_unpin(pgd);
13900 -
13901 - /* Clear usermode parts of PGD */
13902 - memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
13903 -
13904 - spin_lock_irqsave(&pgd_lock, flags);
13905 -
13906 - /* If the pgd points to a shared pagetable level (either the
13907 - ptes in non-PAE, or shared PMD in PAE), then just copy the
13908 - references from swapper_pg_dir. */
13909 - if (PAGETABLE_LEVELS == 2 ||
13910 - (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD)) {
13911 - clone_pgd_range(pgd + USER_PTRS_PER_PGD,
13912 - swapper_pg_dir + USER_PTRS_PER_PGD,
13913 - KERNEL_PGD_PTRS);
13914 - paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
13915 - __pa(swapper_pg_dir) >> PAGE_SHIFT,
13916 - USER_PTRS_PER_PGD,
13917 - KERNEL_PGD_PTRS);
13918 - }
13919 -
13920 - /* list required to sync kernel mapping updates */
13921 - if (PAGETABLE_LEVELS == 2)
13922 - pgd_list_add(pgd);
13923 -
13924 - spin_unlock_irqrestore(&pgd_lock, flags);
13925 -}
13926 -
13927 -static void pgd_dtor(void *pgd)
13928 -{
13929 - unsigned long flags; /* can be called from interrupt context */
13930 -
13931 - if (!SHARED_KERNEL_PMD) {
13932 - spin_lock_irqsave(&pgd_lock, flags);
13933 - pgd_list_del(pgd);
13934 - spin_unlock_irqrestore(&pgd_lock, flags);
13935 - }
13936 -
13937 - pgd_test_and_unpin(pgd);
13938 -}
13939 -
13940 -#ifdef CONFIG_X86_PAE
13941 -/*
13942 - * Mop up any pmd pages which may still be attached to the pgd.
13943 - * Normally they will be freed by munmap/exit_mmap, but any pmd we
13944 - * preallocate which never got a corresponding vma will need to be
13945 - * freed manually.
13946 - */
13947 -static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
13948 -{
13949 - int i;
13950 -
13951 - for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) {
13952 - pgd_t pgd = pgdp[i];
13953 -
13954 - if (__pgd_val(pgd) != 0) {
13955 - pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
13956 -
13957 - pgdp[i] = xen_make_pgd(0);
13958 -
13959 - paravirt_release_pd(pgd_val(pgd) >> PAGE_SHIFT);
13960 - pmd_free(mm, pmd);
13961 - }
13962 - }
13963 -}
13964 -
13965 -/*
13966 - * In PAE mode, we need to do a cr3 reload (=tlb flush) when
13967 - * updating the top-level pagetable entries to guarantee the
13968 - * processor notices the update. Since this is expensive, and
13969 - * all 4 top-level entries are used almost immediately in a
13970 - * new process's life, we just pre-populate them here.
13971 - *
13972 - * Also, if we're in a paravirt environment where the kernel pmd is
13973 - * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
13974 - * and initialize the kernel pmds here.
13975 - */
13976 -static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
13977 -{
13978 - pud_t *pud;
13979 - pmd_t *pmds[UNSHARED_PTRS_PER_PGD];
13980 - unsigned long addr, flags;
13981 - int i;
13982 -
13983 - /*
13984 - * We can race save/restore (if we sleep during a GFP_KERNEL memory
13985 - * allocation). We therefore store virtual addresses of pmds as they
13986 - * do not change across save/restore, and poke the machine addresses
13987 - * into the pgdir under the pgd_lock.
13988 - */
13989 - for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD; i++, addr += PUD_SIZE) {
13990 - pmds[i] = pmd_alloc_one(mm, addr);
13991 - if (!pmds[i])
13992 - goto out_oom;
13993 - }
13994 -
13995 - spin_lock_irqsave(&pgd_lock, flags);
13996 -
13997 - /* Protect against save/restore: move below 4GB under pgd_lock. */
13998 - if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)
13999 - && xen_create_contiguous_region((unsigned long)pgd, 0, 32)) {
14000 - spin_unlock_irqrestore(&pgd_lock, flags);
14001 -out_oom:
14002 - while (i--)
14003 - pmd_free(mm, pmds[i]);
14004 - return 0;
14005 - }
14006 -
14007 - /* Copy kernel pmd contents and write-protect the new pmds. */
14008 - pud = pud_offset(pgd, 0);
14009 - for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD;
14010 - i++, pud++, addr += PUD_SIZE) {
14011 - if (i >= USER_PTRS_PER_PGD) {
14012 - memcpy(pmds[i],
14013 - (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
14014 - sizeof(pmd_t) * PTRS_PER_PMD);
14015 - make_lowmem_page_readonly(
14016 - pmds[i], XENFEAT_writable_page_tables);
14017 - }
14018 -
14019 - /* It is safe to poke machine addresses of pmds under the pgd_lock. */
14020 - pud_populate(mm, pud, pmds[i]);
14021 - }
14022 -
14023 - /* List required to sync kernel mapping updates and
14024 - * to pin/unpin on save/restore. */
14025 - pgd_list_add(pgd);
14026 -
14027 - spin_unlock_irqrestore(&pgd_lock, flags);
14028 -
14029 - return 1;
14030 -}
14031 -#else /* !CONFIG_X86_PAE */
14032 -/* No need to prepopulate any pagetable entries in non-PAE modes. */
14033 -static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
14034 -{
14035 - return 1;
14036 -}
14037 -
14038 -static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
14039 -{
14040 -}
14041 -#endif /* CONFIG_X86_PAE */
14042 -
14043 -pgd_t *pgd_alloc(struct mm_struct *mm)
14044 -{
14045 - pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
14046 -
14047 - /* so that alloc_pd can use it */
14048 - mm->pgd = pgd;
14049 - if (pgd)
14050 - pgd_ctor(pgd);
14051 -
14052 - if (pgd && !pgd_prepopulate_pmd(mm, pgd)) {
14053 - free_page((unsigned long)pgd);
14054 - pgd = NULL;
14055 - }
14056 -
14057 - return pgd;
14058 -}
14059 -
14060 -void pgd_free(struct mm_struct *mm, pgd_t *pgd)
14061 -{
14062 - /*
14063 - * After this the pgd should not be pinned for the duration of this
14064 - * function's execution. We should never sleep and thus never race:
14065 - * 1. User pmds will not become write-protected under our feet due
14066 - * to a concurrent mm_pin_all().
14067 - * 2. The machine addresses in PGD entries will not become invalid
14068 - * due to a concurrent save/restore.
14069 - */
14070 - pgd_dtor(pgd);
14071 -
14072 - if (PTRS_PER_PMD > 1 && !xen_feature(XENFEAT_pae_pgdir_above_4gb))
14073 - xen_destroy_contiguous_region((unsigned long)pgd, 0);
14074 -
14075 - pgd_mop_up_pmds(mm, pgd);
14076 - free_page((unsigned long)pgd);
14077 -}
14078 -
14079 -void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
14080 -{
14081 - pgtable_page_dtor(pte);
14082 - paravirt_release_pt(page_to_pfn(pte));
14083 - tlb_remove_page(tlb, pte);
14084 -}
14085 -
14086 -#ifdef CONFIG_X86_PAE
14087 -
14088 -void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
14089 -{
14090 - paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
14091 - tlb_remove_page(tlb, virt_to_page(pmd));
14092 -}
14093 -
14094 -#endif
14095 -
14096 void make_lowmem_page_readonly(void *va, unsigned int feature)
14097 {
14098 pte_t *pte;
14099 --- /dev/null
14100 +++ b/arch/x86/mm/pgtable-xen.c
14101 @@ -0,0 +1,709 @@
14102 +#include <linux/mm.h>
14103 +#include <linux/module.h>
14104 +#include <xen/features.h>
14105 +#include <asm/pgalloc.h>
14106 +#include <asm/pgtable.h>
14107 +#include <asm/tlb.h>
14108 +#include <asm/hypervisor.h>
14109 +#include <asm/mmu_context.h>
14110 +
14111 +pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
14112 +{
14113 + pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
14114 + if (pte)
14115 + make_lowmem_page_readonly(pte, XENFEAT_writable_page_tables);
14116 + return pte;
14117 +}
14118 +
14119 +static void _pte_free(struct page *page, unsigned int order)
14120 +{
14121 + BUG_ON(order);
14122 + __pte_free(page);
14123 +}
14124 +
14125 +pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
14126 +{
14127 + struct page *pte;
14128 +
14129 +#ifdef CONFIG_HIGHPTE
14130 + pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
14131 +#else
14132 + pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
14133 +#endif
14134 + if (pte) {
14135 + pgtable_page_ctor(pte);
14136 + SetPageForeign(pte, _pte_free);
14137 + init_page_count(pte);
14138 + }
14139 + return pte;
14140 +}
14141 +
14142 +void __pte_free(pgtable_t pte)
14143 +{
14144 + if (!PageHighMem(pte)) {
14145 + unsigned long va = (unsigned long)page_address(pte);
14146 + unsigned int level;
14147 + pte_t *ptep = lookup_address(va, &level);
14148 +
14149 + BUG_ON(!ptep || level != PG_LEVEL_4K || !pte_present(*ptep));
14150 + if (!pte_write(*ptep)
14151 + && HYPERVISOR_update_va_mapping(va,
14152 + mk_pte(pte, PAGE_KERNEL),
14153 + 0))
14154 + BUG();
14155 + } else
14156 +#ifdef CONFIG_HIGHPTE
14157 + ClearPagePinned(pte);
14158 +#else
14159 + BUG();
14160 +#endif
14161 +
14162 + ClearPageForeign(pte);
14163 + init_page_count(pte);
14164 + pgtable_page_dtor(pte);
14165 + __free_page(pte);
14166 +}
14167 +
14168 +void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
14169 +{
14170 + pgtable_page_dtor(pte);
14171 + paravirt_release_pte(page_to_pfn(pte));
14172 + tlb_remove_page(tlb, pte);
14173 +}
14174 +
14175 +#if PAGETABLE_LEVELS > 2
14176 +static void _pmd_free(struct page *page, unsigned int order)
14177 +{
14178 + BUG_ON(order);
14179 + __pmd_free(page);
14180 +}
14181 +
14182 +pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address)
14183 +{
14184 + struct page *pmd;
14185 +
14186 + pmd = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
14187 + if (!pmd)
14188 + return NULL;
14189 + SetPageForeign(pmd, _pmd_free);
14190 + init_page_count(pmd);
14191 + return page_address(pmd);
14192 +}
14193 +
14194 +void __pmd_free(pgtable_t pmd)
14195 +{
14196 + unsigned long va = (unsigned long)page_address(pmd);
14197 + unsigned int level;
14198 + pte_t *ptep = lookup_address(va, &level);
14199 +
14200 + BUG_ON(!ptep || level != PG_LEVEL_4K || !pte_present(*ptep));
14201 + if (!pte_write(*ptep)
14202 + && HYPERVISOR_update_va_mapping(va, mk_pte(pmd, PAGE_KERNEL), 0))
14203 + BUG();
14204 +
14205 + ClearPageForeign(pmd);
14206 + init_page_count(pmd);
14207 + __free_page(pmd);
14208 +}
14209 +
14210 +void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
14211 +{
14212 + paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);
14213 + tlb_remove_page(tlb, virt_to_page(pmd));
14214 +}
14215 +
14216 +#if PAGETABLE_LEVELS > 3
14217 +void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
14218 +{
14219 + paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
14220 + tlb_remove_page(tlb, virt_to_page(pud));
14221 +}
14222 +#endif /* PAGETABLE_LEVELS > 3 */
14223 +#endif /* PAGETABLE_LEVELS > 2 */
14224 +
14225 +#ifndef CONFIG_X86_64
14226 +#define TASK_SIZE64 TASK_SIZE
14227 +#endif
14228 +
14229 +static void _pin_lock(struct mm_struct *mm, int lock) {
14230 + if (lock)
14231 + spin_lock(&mm->page_table_lock);
14232 +#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
14233 + /* While mm->page_table_lock protects us against insertions and
14234 + * removals of higher level page table pages, it doesn't protect
14235 + * against updates of pte-s. Such updates, however, require the
14236 + * pte pages to be in consistent state (unpinned+writable or
14237 + * pinned+readonly). The pinning and attribute changes, however
14238 + * cannot be done atomically, which is why such updates must be
14239 + * prevented from happening concurrently.
14240 + * Note that no pte lock can ever elsewhere be acquired nesting
14241 + * with an already acquired one in the same mm, or with the mm's
14242 + * page_table_lock already acquired, as that would break in the
14243 + * non-split case (where all these are actually resolving to the
14244 + * one page_table_lock). Thus acquiring all of them here is not
14245 + * going to result in dead locks, and the order of acquires
14246 + * doesn't matter.
14247 + */
14248 + {
14249 + pgd_t *pgd = mm->pgd;
14250 + unsigned g;
14251 +
14252 + for (g = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
14253 + pud_t *pud;
14254 + unsigned u;
14255 +
14256 + if (pgd_none(*pgd))
14257 + continue;
14258 + pud = pud_offset(pgd, 0);
14259 + for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
14260 + pmd_t *pmd;
14261 + unsigned m;
14262 +
14263 + if (pud_none(*pud))
14264 + continue;
14265 + pmd = pmd_offset(pud, 0);
14266 + for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
14267 + spinlock_t *ptl;
14268 +
14269 + if (pmd_none(*pmd))
14270 + continue;
14271 + ptl = pte_lockptr(0, pmd);
14272 + if (lock)
14273 + spin_lock(ptl);
14274 + else
14275 + spin_unlock(ptl);
14276 + }
14277 + }
14278 + }
14279 + }
14280 +#endif
14281 + if (!lock)
14282 + spin_unlock(&mm->page_table_lock);
14283 +}
14284 +#define pin_lock(mm) _pin_lock(mm, 1)
14285 +#define pin_unlock(mm) _pin_lock(mm, 0)
14286 +
14287 +#define PIN_BATCH sizeof(void *)
14288 +static DEFINE_PER_CPU(multicall_entry_t[PIN_BATCH], pb_mcl);
14289 +
14290 +static inline unsigned int pgd_walk_set_prot(struct page *page, pgprot_t flags,
14291 + unsigned int cpu, unsigned int seq)
14292 +{
14293 + unsigned long pfn = page_to_pfn(page);
14294 +
14295 + if (PageHighMem(page)) {
14296 + if (pgprot_val(flags) & _PAGE_RW)
14297 + ClearPagePinned(page);
14298 + else
14299 + SetPagePinned(page);
14300 + } else {
14301 + MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
14302 + (unsigned long)__va(pfn << PAGE_SHIFT),
14303 + pfn_pte(pfn, flags), 0);
14304 + if (unlikely(++seq == PIN_BATCH)) {
14305 + if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
14306 + PIN_BATCH, NULL)))
14307 + BUG();
14308 + seq = 0;
14309 + }
14310 + }
14311 +
14312 + return seq;
14313 +}
14314 +
14315 +static void pgd_walk(pgd_t *pgd_base, pgprot_t flags)
14316 +{
14317 + pgd_t *pgd = pgd_base;
14318 + pud_t *pud;
14319 + pmd_t *pmd;
14320 + int g,u,m;
14321 + unsigned int cpu, seq;
14322 + multicall_entry_t *mcl;
14323 +
14324 + if (xen_feature(XENFEAT_auto_translated_physmap))
14325 + return;
14326 +
14327 + cpu = get_cpu();
14328 +
14329 + /*
14330 + * Cannot iterate up to USER_PTRS_PER_PGD on x86-64 as these pagetables
14331 + * may not be the 'current' task's pagetables (e.g., current may be
14332 + * 32-bit, but the pagetables may be for a 64-bit task).
14333 + * Subtracting 1 from TASK_SIZE64 means the loop limit is correct
14334 + * regardless of whether TASK_SIZE64 is a multiple of PGDIR_SIZE.
14335 + */
14336 + for (g = 0, seq = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
14337 + if (pgd_none(*pgd))
14338 + continue;
14339 + pud = pud_offset(pgd, 0);
14340 + if (PTRS_PER_PUD > 1) /* not folded */
14341 + seq = pgd_walk_set_prot(virt_to_page(pud),flags,cpu,seq);
14342 + for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
14343 + if (pud_none(*pud))
14344 + continue;
14345 + pmd = pmd_offset(pud, 0);
14346 + if (PTRS_PER_PMD > 1) /* not folded */
14347 + seq = pgd_walk_set_prot(virt_to_page(pmd),flags,cpu,seq);
14348 + for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
14349 + if (pmd_none(*pmd))
14350 + continue;
14351 + seq = pgd_walk_set_prot(pmd_page(*pmd),flags,cpu,seq);
14352 + }
14353 + }
14354 + }
14355 +
14356 + mcl = per_cpu(pb_mcl, cpu);
14357 +#ifdef CONFIG_X86_64
14358 + if (unlikely(seq > PIN_BATCH - 2)) {
14359 + if (unlikely(HYPERVISOR_multicall_check(mcl, seq, NULL)))
14360 + BUG();
14361 + seq = 0;
14362 + }
14363 + MULTI_update_va_mapping(mcl + seq,
14364 + (unsigned long)__user_pgd(pgd_base),
14365 + pfn_pte(virt_to_phys(__user_pgd(pgd_base))>>PAGE_SHIFT, flags),
14366 + 0);
14367 + MULTI_update_va_mapping(mcl + seq + 1,
14368 + (unsigned long)pgd_base,
14369 + pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
14370 + UVMF_TLB_FLUSH);
14371 + if (unlikely(HYPERVISOR_multicall_check(mcl, seq + 2, NULL)))
14372 + BUG();
14373 +#else
14374 + if (likely(seq != 0)) {
14375 + MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
14376 + (unsigned long)pgd_base,
14377 + pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
14378 + UVMF_TLB_FLUSH);
14379 + if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
14380 + seq + 1, NULL)))
14381 + BUG();
14382 + } else if(HYPERVISOR_update_va_mapping((unsigned long)pgd_base,
14383 + pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
14384 + UVMF_TLB_FLUSH))
14385 + BUG();
14386 +#endif
14387 +
14388 + put_cpu();
14389 +}
14390 +
14391 +static void __pgd_pin(pgd_t *pgd)
14392 +{
14393 + pgd_walk(pgd, PAGE_KERNEL_RO);
14394 + kmap_flush_unused();
14395 + xen_pgd_pin(__pa(pgd)); /* kernel */
14396 +#ifdef CONFIG_X86_64
14397 + xen_pgd_pin(__pa(__user_pgd(pgd))); /* user */
14398 +#endif
14399 + SetPagePinned(virt_to_page(pgd));
14400 +}
14401 +
14402 +static void __pgd_unpin(pgd_t *pgd)
14403 +{
14404 + xen_pgd_unpin(__pa(pgd));
14405 +#ifdef CONFIG_X86_64
14406 + xen_pgd_unpin(__pa(__user_pgd(pgd)));
14407 +#endif
14408 + pgd_walk(pgd, PAGE_KERNEL);
14409 + ClearPagePinned(virt_to_page(pgd));
14410 +}
14411 +
14412 +static void pgd_test_and_unpin(pgd_t *pgd)
14413 +{
14414 + if (PagePinned(virt_to_page(pgd)))
14415 + __pgd_unpin(pgd);
14416 +}
14417 +
14418 +void mm_pin(struct mm_struct *mm)
14419 +{
14420 + if (xen_feature(XENFEAT_writable_page_tables))
14421 + return;
14422 +
14423 + pin_lock(mm);
14424 + __pgd_pin(mm->pgd);
14425 + pin_unlock(mm);
14426 +}
14427 +
14428 +void mm_unpin(struct mm_struct *mm)
14429 +{
14430 + if (xen_feature(XENFEAT_writable_page_tables))
14431 + return;
14432 +
14433 + pin_lock(mm);
14434 + __pgd_unpin(mm->pgd);
14435 + pin_unlock(mm);
14436 +}
14437 +
14438 +void mm_pin_all(void)
14439 +{
14440 + struct page *page;
14441 + unsigned long flags;
14442 +
14443 + if (xen_feature(XENFEAT_writable_page_tables))
14444 + return;
14445 +
14446 + /*
14447 + * Allow uninterrupted access to the pgd_list. Also protects
14448 + * __pgd_pin() by disabling preemption.
14449 + * All other CPUs must be at a safe point (e.g., in stop_machine
14450 + * or offlined entirely).
14451 + */
14452 + spin_lock_irqsave(&pgd_lock, flags);
14453 + list_for_each_entry(page, &pgd_list, lru) {
14454 + if (!PagePinned(page))
14455 + __pgd_pin((pgd_t *)page_address(page));
14456 + }
14457 + spin_unlock_irqrestore(&pgd_lock, flags);
14458 +}
14459 +
14460 +void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
14461 +{
14462 + if (!PagePinned(virt_to_page(mm->pgd)))
14463 + mm_pin(mm);
14464 +}
14465 +
14466 +void arch_exit_mmap(struct mm_struct *mm)
14467 +{
14468 + struct task_struct *tsk = current;
14469 +
14470 + task_lock(tsk);
14471 +
14472 + /*
14473 + * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
14474 + * *much* faster this way, as no tlb flushes means bigger wrpt batches.
14475 + */
14476 + if (tsk->active_mm == mm) {
14477 + tsk->active_mm = &init_mm;
14478 + atomic_inc(&init_mm.mm_count);
14479 +
14480 + switch_mm(mm, &init_mm, tsk);
14481 +
14482 + atomic_dec(&mm->mm_count);
14483 + BUG_ON(atomic_read(&mm->mm_count) == 0);
14484 + }
14485 +
14486 + task_unlock(tsk);
14487 +
14488 + if (PagePinned(virt_to_page(mm->pgd))
14489 + && atomic_read(&mm->mm_count) == 1
14490 + && !mm->context.has_foreign_mappings)
14491 + mm_unpin(mm);
14492 +}
14493 +
14494 +static inline void pgd_list_add(pgd_t *pgd)
14495 +{
14496 + struct page *page = virt_to_page(pgd);
14497 +
14498 + list_add(&page->lru, &pgd_list);
14499 +}
14500 +
14501 +static inline void pgd_list_del(pgd_t *pgd)
14502 +{
14503 + struct page *page = virt_to_page(pgd);
14504 +
14505 + list_del(&page->lru);
14506 +}
14507 +
14508 +#define UNSHARED_PTRS_PER_PGD \
14509 + (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
14510 +
14511 +static void pgd_ctor(void *p)
14512 +{
14513 + pgd_t *pgd = p;
14514 + unsigned long flags;
14515 +
14516 + pgd_test_and_unpin(pgd);
14517 +
14518 + /* Clear usermode parts of PGD */
14519 + memset(pgd, 0, KERNEL_PGD_BOUNDARY*sizeof(pgd_t));
14520 +
14521 + spin_lock_irqsave(&pgd_lock, flags);
14522 +
14523 + /* If the pgd points to a shared pagetable level (either the
14524 + ptes in non-PAE, or shared PMD in PAE), then just copy the
14525 + references from swapper_pg_dir. */
14526 + if (PAGETABLE_LEVELS == 2 ||
14527 + (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD) ||
14528 + PAGETABLE_LEVELS == 4) {
14529 + clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
14530 + swapper_pg_dir + KERNEL_PGD_BOUNDARY,
14531 + KERNEL_PGD_PTRS);
14532 + paravirt_alloc_pmd_clone(__pa(pgd) >> PAGE_SHIFT,
14533 + __pa(swapper_pg_dir) >> PAGE_SHIFT,
14534 + KERNEL_PGD_BOUNDARY,
14535 + KERNEL_PGD_PTRS);
14536 + }
14537 +
14538 +#ifdef CONFIG_X86_64
14539 + /* set level3_user_pgt for vsyscall area */
14540 + __user_pgd(pgd)[pgd_index(VSYSCALL_START)] =
14541 + __pgd(__pa_symbol(level3_user_pgt) | _PAGE_TABLE);
14542 +#endif
14543 +
14544 +#ifndef CONFIG_X86_PAE
14545 + /* list required to sync kernel mapping updates */
14546 + if (!SHARED_KERNEL_PMD)
14547 + pgd_list_add(pgd);
14548 +#endif
14549 +
14550 + spin_unlock_irqrestore(&pgd_lock, flags);
14551 +}
14552 +
14553 +static void pgd_dtor(void *pgd)
14554 +{
14555 + unsigned long flags; /* can be called from interrupt context */
14556 +
14557 + if (!SHARED_KERNEL_PMD) {
14558 + spin_lock_irqsave(&pgd_lock, flags);
14559 + pgd_list_del(pgd);
14560 + spin_unlock_irqrestore(&pgd_lock, flags);
14561 + }
14562 +
14563 + pgd_test_and_unpin(pgd);
14564 +}
14565 +
14566 +/*
14567 + * List of all pgd's needed for non-PAE so it can invalidate entries
14568 + * in both cached and uncached pgd's; not needed for PAE since the
14569 + * kernel pmd is shared. If PAE were not to share the pmd a similar
14570 + * tactic would be needed. This is essentially codepath-based locking
14571 + * against pageattr.c; it is the unique case in which a valid change
14572 + * of kernel pagetables can't be lazily synchronized by vmalloc faults.
14573 + * vmalloc faults work because attached pagetables are never freed.
14574 + * -- wli
14575 + */
14576 +
14577 +#ifdef CONFIG_X86_PAE
14578 +/*
14579 + * Mop up any pmd pages which may still be attached to the pgd.
14580 + * Normally they will be freed by munmap/exit_mmap, but any pmd we
14581 + * preallocate which never got a corresponding vma will need to be
14582 + * freed manually.
14583 + */
14584 +static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
14585 +{
14586 + int i;
14587 +
14588 + for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) {
14589 + pgd_t pgd = pgdp[i];
14590 +
14591 + if (__pgd_val(pgd) != 0) {
14592 + pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
14593 +
14594 + pgdp[i] = xen_make_pgd(0);
14595 +
14596 + paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
14597 + pmd_free(mm, pmd);
14598 + }
14599 + }
14600 +
14601 + if (!xen_feature(XENFEAT_pae_pgdir_above_4gb))
14602 + xen_destroy_contiguous_region((unsigned long)pgdp, 0);
14603 +}
14604 +
14605 +/*
14606 + * In PAE mode, we need to do a cr3 reload (=tlb flush) when
14607 + * updating the top-level pagetable entries to guarantee the
14608 + * processor notices the update. Since this is expensive, and
14609 + * all 4 top-level entries are used almost immediately in a
14610 + * new process's life, we just pre-populate them here.
14611 + *
14612 + * Also, if we're in a paravirt environment where the kernel pmd is
14613 + * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
14614 + * and initialize the kernel pmds here.
14615 + */
14616 +static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
14617 +{
14618 + pud_t *pud;
14619 + pmd_t *pmds[UNSHARED_PTRS_PER_PGD];
14620 + unsigned long addr, flags;
14621 + int i;
14622 +
14623 + /*
14624 + * We can race save/restore (if we sleep during a GFP_KERNEL memory
14625 + * allocation). We therefore store virtual addresses of pmds as they
14626 + * do not change across save/restore, and poke the machine addresses
14627 + * into the pgdir under the pgd_lock.
14628 + */
14629 + for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD; i++, addr += PUD_SIZE) {
14630 + pmds[i] = pmd_alloc_one(mm, addr);
14631 + if (!pmds[i])
14632 + goto out_oom;
14633 + }
14634 +
14635 + spin_lock_irqsave(&pgd_lock, flags);
14636 +
14637 + /* Protect against save/restore: move below 4GB under pgd_lock. */
14638 + if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)
14639 + && xen_create_contiguous_region((unsigned long)pgd, 0, 32)) {
14640 + spin_unlock_irqrestore(&pgd_lock, flags);
14641 +out_oom:
14642 + while (i--)
14643 + pmd_free(mm, pmds[i]);
14644 + return 0;
14645 + }
14646 +
14647 + /* Copy kernel pmd contents and write-protect the new pmds. */
14648 + pud = pud_offset(pgd, 0);
14649 + for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD;
14650 + i++, pud++, addr += PUD_SIZE) {
14651 + if (i >= KERNEL_PGD_BOUNDARY) {
14652 + memcpy(pmds[i],
14653 + (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
14654 + sizeof(pmd_t) * PTRS_PER_PMD);
14655 + make_lowmem_page_readonly(
14656 + pmds[i], XENFEAT_writable_page_tables);
14657 + }
14658 +
14659 + /* It is safe to poke machine addresses of pmds under the pgd_lock. */
14660 + pud_populate(mm, pud, pmds[i]);
14661 + }
14662 +
14663 + /* List required to sync kernel mapping updates and
14664 + * to pin/unpin on save/restore. */
14665 + pgd_list_add(pgd);
14666 +
14667 + spin_unlock_irqrestore(&pgd_lock, flags);
14668 +
14669 + return 1;
14670 +}
14671 +
14672 +void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
14673 +{
14674 + struct page *page = virt_to_page(pmd);
14675 + unsigned long pfn = page_to_pfn(page);
14676 +
14677 + paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
14678 +
14679 + /* Note: almost everything apart from _PAGE_PRESENT is
14680 + reserved at the pmd (PDPT) level. */
14681 + if (PagePinned(virt_to_page(mm->pgd))) {
14682 + BUG_ON(PageHighMem(page));
14683 + BUG_ON(HYPERVISOR_update_va_mapping(
14684 + (unsigned long)__va(pfn << PAGE_SHIFT),
14685 + pfn_pte(pfn, PAGE_KERNEL_RO), 0));
14686 + set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
14687 + } else
14688 + *pudp = __pud(__pa(pmd) | _PAGE_PRESENT);
14689 +
14690 + /*
14691 + * According to Intel App note "TLBs, Paging-Structure Caches,
14692 + * and Their Invalidation", April 2007, document 317080-001,
14693 + * section 8.1: in PAE mode we explicitly have to flush the
14694 + * TLB via cr3 if the top-level pgd is changed...
14695 + */
14696 + if (mm == current->active_mm)
14697 + xen_tlb_flush();
14698 +}
14699 +#else /* !CONFIG_X86_PAE */
14700 +/* No need to prepopulate any pagetable entries in non-PAE modes. */
14701 +static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
14702 +{
14703 + return 1;
14704 +}
14705 +
14706 +static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgd)
14707 +{
14708 +}
14709 +#endif /* CONFIG_X86_PAE */
14710 +
14711 +#ifdef CONFIG_X86_64
14712 +/* We allocate two contiguous pages for kernel and user. */
14713 +#define PGD_ORDER 1
14714 +#else
14715 +#define PGD_ORDER 0
14716 +#endif
14717 +
14718 +pgd_t *pgd_alloc(struct mm_struct *mm)
14719 +{
14720 + pgd_t *pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, PGD_ORDER);
14721 +
14722 + /* so that alloc_pd can use it */
14723 + mm->pgd = pgd;
14724 + if (pgd)
14725 + pgd_ctor(pgd);
14726 +
14727 + if (pgd && !pgd_prepopulate_pmd(mm, pgd)) {
14728 + free_pages((unsigned long)pgd, PGD_ORDER);
14729 + pgd = NULL;
14730 + }
14731 +
14732 + return pgd;
14733 +}
14734 +
14735 +void pgd_free(struct mm_struct *mm, pgd_t *pgd)
14736 +{
14737 + /*
14738 + * After this the pgd should not be pinned for the duration of this
14739 + * function's execution. We should never sleep and thus never race:
14740 + * 1. User pmds will not become write-protected under our feet due
14741 + * to a concurrent mm_pin_all().
14742 + * 2. The machine addresses in PGD entries will not become invalid
14743 + * due to a concurrent save/restore.
14744 + */
14745 + pgd_dtor(pgd);
14746 +
14747 + pgd_mop_up_pmds(mm, pgd);
14748 + free_pages((unsigned long)pgd, PGD_ORDER);
14749 +}
14750 +
14751 +/* blktap and gntdev need this, as otherwise they would implicitly (and
14752 + * needlessly, as they never use it) reference init_mm. */
14753 +pte_t xen_ptep_get_and_clear_full(struct vm_area_struct *vma,
14754 + unsigned long addr, pte_t *ptep, int full)
14755 +{
14756 + return ptep_get_and_clear_full(vma->vm_mm, addr, ptep, full);
14757 +}
14758 +EXPORT_SYMBOL_GPL(xen_ptep_get_and_clear_full);
14759 +
14760 +int ptep_set_access_flags(struct vm_area_struct *vma,
14761 + unsigned long address, pte_t *ptep,
14762 + pte_t entry, int dirty)
14763 +{
14764 + int changed = !pte_same(*ptep, entry);
14765 +
14766 + if (changed && dirty) {
14767 + if (likely(vma->vm_mm == current->mm)) {
14768 + if (HYPERVISOR_update_va_mapping(address,
14769 + entry,
14770 + (unsigned long)vma->vm_mm->cpu_vm_mask.bits|
14771 + UVMF_INVLPG|UVMF_MULTI))
14772 + BUG();
14773 + } else {
14774 + xen_l1_entry_update(ptep, entry);
14775 + flush_tlb_page(vma, address);
14776 + }
14777 + }
14778 +
14779 + return changed;
14780 +}
14781 +
14782 +int ptep_test_and_clear_young(struct vm_area_struct *vma,
14783 + unsigned long addr, pte_t *ptep)
14784 +{
14785 + int ret = 0;
14786 +
14787 + if (pte_young(*ptep))
14788 + ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
14789 + &ptep->pte);
14790 +
14791 + if (ret)
14792 + pte_update(vma->vm_mm, addr, ptep);
14793 +
14794 + return ret;
14795 +}
14796 +
14797 +int ptep_clear_flush_young(struct vm_area_struct *vma,
14798 + unsigned long address, pte_t *ptep)
14799 +{
14800 + pte_t pte = *ptep;
14801 + int young = pte_young(pte);
14802 +
14803 + pte = pte_mkold(pte);
14804 + if (PagePinned(virt_to_page(vma->vm_mm->pgd)))
14805 + ptep_set_access_flags(vma, address, ptep, pte, young);
14806 + else if (young)
14807 + ptep->pte_low = pte.pte_low;
14808 +
14809 + return young;
14810 +}
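For context: mm_pin()/mm_unpin() above are not called from generic code. In this tree the pin is taken lazily, either at fork time (arch_dup_mmap() above) or from the context-switch path before the new page tables are loaded into cr3. A rough sketch of that second call site, assuming the usual Xen switch_mm() layout (illustrative only, not part of this patch):

	static inline void example_switch_mm(struct mm_struct *prev,
					     struct mm_struct *next,
					     struct task_struct *tsk)
	{
		/* Write-protect and register the incoming page tables with
		 * Xen before they can be made the active ones. */
		if (!PagePinned(virt_to_page(next->pgd)))
			mm_pin(next);

		/* ... followed by the usual cr3 load / mmuext operations ... */
	}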
14811 --- a/arch/x86/pci/i386.c
14812 +++ b/arch/x86/pci/i386.c
14813 @@ -328,10 +328,14 @@ int pci_mmap_page_range(struct pci_dev *
14814 flags = new_flags;
14815 }
14816
14817 +#ifndef CONFIG_XEN
14818 if (((vma->vm_pgoff < max_low_pfn_mapped) ||
14819 (vma->vm_pgoff >= (1UL<<(32 - PAGE_SHIFT)) &&
14820 vma->vm_pgoff < max_pfn_mapped)) &&
14821 ioremap_change_attr((unsigned long)__va(addr), len, flags)) {
14822 +#else
14823 + if (ioremap_check_change_attr(vma->vm_pgoff, len, flags)) {
14824 +#endif
14825 free_memtype(addr, addr + len);
14826 return -EINVAL;
14827 }
14828 --- a/arch/x86/pci/irq-xen.c
14829 +++ b/arch/x86/pci/irq-xen.c
14830 @@ -140,9 +140,11 @@ static void __init pirq_peer_trick(void)
14831 busmap[e->bus] = 1;
14832 }
14833 for(i = 1; i < 256; i++) {
14834 + int node;
14835 if (!busmap[i] || pci_find_bus(0, i))
14836 continue;
14837 - if (pci_scan_bus_with_sysdata(i))
14838 + node = get_mp_bus_to_node(i);
14839 + if (pci_scan_bus_on_node(i, &pci_root_ops, node))
14840 printk(KERN_INFO "PCI: Discovered primary peer "
14841 "bus %02x [IRQ]\n", i);
14842 }
14843 @@ -204,7 +206,7 @@ static int pirq_ali_get(struct pci_dev *
14844 {
14845 static const unsigned char irqmap[16] = { 0, 9, 3, 10, 4, 5, 7, 6, 1, 11, 0, 12, 0, 14, 0, 15 };
14846
14847 - WARN_ON_ONCE(pirq >= 16);
14848 + WARN_ON_ONCE(pirq > 16);
14849 return irqmap[read_config_nybble(router, 0x48, pirq-1)];
14850 }
14851
14852 @@ -213,7 +215,7 @@ static int pirq_ali_set(struct pci_dev *
14853 static const unsigned char irqmap[16] = { 0, 8, 0, 2, 4, 5, 7, 6, 0, 1, 3, 9, 11, 0, 13, 15 };
14854 unsigned int val = irqmap[irq];
14855
14856 - WARN_ON_ONCE(pirq >= 16);
14857 + WARN_ON_ONCE(pirq > 16);
14858 if (val) {
14859 write_config_nybble(router, 0x48, pirq-1, val);
14860 return 1;
14861 @@ -264,7 +266,7 @@ static int pirq_via586_get(struct pci_de
14862 {
14863 static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 };
14864
14865 - WARN_ON_ONCE(pirq >= 5);
14866 + WARN_ON_ONCE(pirq > 5);
14867 return read_config_nybble(router, 0x55, pirqmap[pirq-1]);
14868 }
14869
14870 @@ -272,7 +274,7 @@ static int pirq_via586_set(struct pci_de
14871 {
14872 static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 };
14873
14874 - WARN_ON_ONCE(pirq >= 5);
14875 + WARN_ON_ONCE(pirq > 5);
14876 write_config_nybble(router, 0x55, pirqmap[pirq-1], irq);
14877 return 1;
14878 }
14879 @@ -286,7 +288,7 @@ static int pirq_ite_get(struct pci_dev *
14880 {
14881 static const unsigned char pirqmap[4] = { 1, 0, 2, 3 };
14882
14883 - WARN_ON_ONCE(pirq >= 4);
14884 + WARN_ON_ONCE(pirq > 4);
14885 return read_config_nybble(router,0x43, pirqmap[pirq-1]);
14886 }
14887
14888 @@ -294,7 +296,7 @@ static int pirq_ite_set(struct pci_dev *
14889 {
14890 static const unsigned char pirqmap[4] = { 1, 0, 2, 3 };
14891
14892 - WARN_ON_ONCE(pirq >= 4);
14893 + WARN_ON_ONCE(pirq > 4);
14894 write_config_nybble(router, 0x43, pirqmap[pirq-1], irq);
14895 return 1;
14896 }
14897 @@ -623,6 +625,13 @@ static __init int via_router_probe(struc
14898 */
14899 device = PCI_DEVICE_ID_VIA_8235;
14900 break;
14901 + case PCI_DEVICE_ID_VIA_8237:
14902 + /**
14903 + * Asus a7v600 bios wrongly reports 8237
14904 + * as 586-compatible
14905 + */
14906 + device = PCI_DEVICE_ID_VIA_8237;
14907 + break;
14908 }
14909 }
14910
14911 --- a/arch/x86/vdso/vdso32-setup-xen.c
14912 +++ b/arch/x86/vdso/vdso32-setup-xen.c
14913 @@ -164,7 +164,7 @@ static __init void relocate_vdso(Elf32_E
14914 Elf32_Shdr *shdr;
14915 int i;
14916
14917 - BUG_ON(memcmp(ehdr->e_ident, ELFMAG, 4) != 0 ||
14918 + BUG_ON(memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0 ||
14919 !elf_check_arch_ia32(ehdr) ||
14920 ehdr->e_type != ET_DYN);
14921
14922 @@ -233,8 +233,12 @@ void syscall32_cpu_init(void)
14923 BUG();
14924 #endif
14925
14926 - if (use_sysenter < 0)
14927 - use_sysenter = (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL);
14928 + if (use_sysenter < 0) {
14929 + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
14930 + use_sysenter = 1;
14931 + if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR)
14932 + use_sysenter = 1;
14933 + }
14934 }
14935
14936 #define compat_uses_vma 1
14937 @@ -337,8 +341,6 @@ int __init sysenter_setup(void)
14938
14939 #ifdef CONFIG_X86_32
14940 gate_vma_init();
14941 -
14942 - printk("Compat vDSO mapped to %08lx.\n", __fix_to_virt(FIX_VDSO));
14943 #endif
14944
14945 #if defined(CONFIG_X86_64) && CONFIG_XEN_COMPAT < 0x030200
14946 @@ -383,6 +385,9 @@ int arch_setup_additional_pages(struct l
14947 int ret = 0;
14948 bool compat;
14949
14950 + if (vdso_enabled == VDSO_DISABLED)
14951 + return 0;
14952 +
14953 down_write(&mm->mmap_sem);
14954
14955 /* Test compat mode once here, in case someone
14956 --- a/drivers/acpi/processor_core.c
14957 +++ b/drivers/acpi/processor_core.c
14958 @@ -657,7 +657,7 @@ static int acpi_processor_get_info(struc
14959 * of /proc/cpuinfo
14960 */
14961 status = acpi_evaluate_object(pr->handle, "_SUN", NULL, &buffer);
14962 - if (ACPI_SUCCESS(status))
14963 + if (ACPI_SUCCESS(status) && pr->id != -1)
14964 arch_fix_phys_package_id(pr->id, object.integer.value);
14965
14966 return 0;
14967 --- a/drivers/input/xen-kbdfront.c
14968 +++ b/drivers/input/xen-kbdfront.c
14969 @@ -325,7 +325,6 @@ static struct xenbus_device_id xenkbd_id
14970
14971 static struct xenbus_driver xenkbd = {
14972 .name = "vkbd",
14973 - .owner = THIS_MODULE,
14974 .ids = xenkbd_ids,
14975 .probe = xenkbd_probe,
14976 .remove = xenkbd_remove,
14977 --- a/drivers/oprofile/cpu_buffer.c
14978 +++ b/drivers/oprofile/cpu_buffer.c
14979 @@ -310,7 +310,7 @@ void oprofile_add_trace(unsigned long pc
14980 #ifdef CONFIG_XEN
14981 int oprofile_add_domain_switch(int32_t domain_id)
14982 {
14983 - struct oprofile_cpu_buffer * cpu_buf = &cpu_buffer[smp_processor_id()];
14984 + struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(cpu_buffer);
14985
14986 /* should have space for switching into and out of domain
14987 (2 slots each) plus one sample and one cpu mode switch */
14988 --- a/drivers/pci/msi-xen.c
14989 +++ b/drivers/pci/msi-xen.c
14990 @@ -588,7 +588,7 @@ int pci_enable_msi(struct pci_dev* dev)
14991 EXPORT_SYMBOL(pci_enable_msi);
14992
14993 extern void pci_frontend_disable_msi(struct pci_dev* dev);
14994 -void pci_disable_msi(struct pci_dev* dev)
14995 +void pci_msi_shutdown(struct pci_dev* dev)
14996 {
14997 int pirq;
14998
14999 @@ -617,6 +617,10 @@ void pci_disable_msi(struct pci_dev* dev
15000 pci_intx_for_msi(dev, 1);
15001 dev->msi_enabled = 0;
15002 }
15003 +void pci_disable_msi(struct pci_dev* dev)
15004 +{
15005 + pci_msi_shutdown(dev);
15006 +}
15007 EXPORT_SYMBOL(pci_disable_msi);
15008
15009 /**
15010 @@ -719,7 +723,7 @@ int pci_enable_msix(struct pci_dev* dev,
15011 EXPORT_SYMBOL(pci_enable_msix);
15012
15013 extern void pci_frontend_disable_msix(struct pci_dev* dev);
15014 -void pci_disable_msix(struct pci_dev* dev)
15015 +void pci_msix_shutdown(struct pci_dev* dev)
15016 {
15017 if (!pci_msi_enable)
15018 return;
15019 @@ -756,6 +760,10 @@ void pci_disable_msix(struct pci_dev* de
15020 pci_intx_for_msi(dev, 1);
15021 dev->msix_enabled = 0;
15022 }
15023 +void pci_disable_msix(struct pci_dev* dev)
15024 +{
15025 + pci_msix_shutdown(dev);
15026 +}
15027 EXPORT_SYMBOL(pci_disable_msix);
15028
15029 /**
15030 --- a/drivers/video/Kconfig
15031 +++ b/drivers/video/Kconfig
15032 @@ -2047,7 +2047,7 @@ config FB_VIRTUAL
15033
15034 config XEN_FBDEV_FRONTEND
15035 tristate "Xen virtual frame buffer support"
15036 - depends on FB && XEN
15037 + depends on FB && PARAVIRT_XEN
15038 select FB_SYS_FILLRECT
15039 select FB_SYS_COPYAREA
15040 select FB_SYS_IMAGEBLIT
15041 --- a/drivers/video/xen-fbfront.c
15042 +++ b/drivers/video/xen-fbfront.c
15043 @@ -670,7 +670,6 @@ static struct xenbus_device_id xenfb_ids
15044
15045 static struct xenbus_driver xenfb = {
15046 .name = "vfb",
15047 - .owner = THIS_MODULE,
15048 .ids = xenfb_ids,
15049 .probe = xenfb_probe,
15050 .remove = xenfb_remove,
15051 --- a/drivers/xen/blkfront/blkfront.c
15052 +++ b/drivers/xen/blkfront/blkfront.c
15053 @@ -282,7 +282,9 @@ static void backend_changed(struct xenbu
15054 break;
15055
15056 case XenbusStateClosing:
15057 - bd = bdget(info->dev);
15058 + if (!info->gd)
15059 + break;
15060 + bd = bdget_disk(info->gd, 0);
15061 if (bd == NULL)
15062 xenbus_dev_fatal(dev, -ENODEV, "bdget failed");
15063
15064 --- a/drivers/xen/blkfront/block.h
15065 +++ b/drivers/xen/blkfront/block.h
15066 @@ -96,7 +96,6 @@ struct blk_shadow {
15067 struct blkfront_info
15068 {
15069 struct xenbus_device *xbdev;
15070 - dev_t dev;
15071 struct gendisk *gd;
15072 int vdevice;
15073 blkif_vdev_t handle;
15074 --- a/drivers/xen/blkfront/vbd.c
15075 +++ b/drivers/xen/blkfront/vbd.c
15076 @@ -246,17 +246,32 @@ xlvbd_init_blk_queue(struct gendisk *gd,
15077 return 0;
15078 }
15079
15080 -static int
15081 -xlvbd_alloc_gendisk(int major, int minor, blkif_sector_t capacity, int vdevice,
15082 - u16 vdisk_info, u16 sector_size,
15083 - struct blkfront_info *info)
15084 +int
15085 +xlvbd_add(blkif_sector_t capacity, int vdevice, u16 vdisk_info,
15086 + u16 sector_size, struct blkfront_info *info)
15087 {
15088 + int major, minor;
15089 struct gendisk *gd;
15090 struct xlbd_major_info *mi;
15091 int nr_minors = 1;
15092 int err = -ENODEV;
15093 unsigned int offset;
15094
15095 + if ((vdevice>>EXT_SHIFT) > 1) {
15096 + /* this is above the extended range; something is wrong */
15097 + printk(KERN_WARNING "blkfront: vdevice 0x%x is above the extended range; ignoring\n", vdevice);
15098 + return -ENODEV;
15099 + }
15100 +
15101 + if (!VDEV_IS_EXTENDED(vdevice)) {
15102 + major = BLKIF_MAJOR(vdevice);
15103 + minor = BLKIF_MINOR(vdevice);
15104 + }
15105 + else {
15106 + major = 202;
15107 + minor = BLKIF_MINOR_EXT(vdevice);
15108 + }
15109 +
15110 BUG_ON(info->gd != NULL);
15111 BUG_ON(info->mi != NULL);
15112 BUG_ON(info->rq != NULL);
15113 @@ -337,41 +352,6 @@ xlvbd_alloc_gendisk(int major, int minor
15114 return err;
15115 }
15116
15117 -int
15118 -xlvbd_add(blkif_sector_t capacity, int vdevice, u16 vdisk_info,
15119 - u16 sector_size, struct blkfront_info *info)
15120 -{
15121 - struct block_device *bd;
15122 - int err = 0;
15123 - int major, minor;
15124 -
15125 - if ((vdevice>>EXT_SHIFT) > 1) {
15126 - /* this is above the extended range; something is wrong */
15127 - printk(KERN_WARNING "blkfront: vdevice 0x%x is above the extended range; ignoring\n", vdevice);
15128 - return -ENODEV;
15129 - }
15130 -
15131 - if (!VDEV_IS_EXTENDED(vdevice)) {
15132 - major = BLKIF_MAJOR(vdevice);
15133 - minor = BLKIF_MINOR(vdevice);
15134 - }
15135 - else {
15136 - major = 202;
15137 - minor = BLKIF_MINOR_EXT(vdevice);
15138 - }
15139 -
15140 - info->dev = MKDEV(major, minor);
15141 - bd = bdget(info->dev);
15142 - if (bd == NULL)
15143 - return -ENODEV;
15144 -
15145 - err = xlvbd_alloc_gendisk(major, minor, capacity, vdevice, vdisk_info,
15146 - sector_size, info);
15147 -
15148 - bdput(bd);
15149 - return err;
15150 -}
15151 -
15152 void
15153 xlvbd_del(struct blkfront_info *info)
15154 {
15155 --- a/drivers/xen/blktap/blktap.c
15156 +++ b/drivers/xen/blktap/blktap.c
15157 @@ -111,6 +111,7 @@ typedef struct tap_blkif {
15158 unsigned long mode; /*current switching mode */
15159 int minor; /*Minor number for tapdisk device */
15160 pid_t pid; /*tapdisk process id */
15161 + struct pid_namespace *pid_ns; /*... and its corresponding namespace */
15162 enum { RUNNING, CLEANSHUTDOWN } status; /*Detect a clean userspace
15163 shutdown */
15164 unsigned long *idx_map; /*Record the user ring id to kern
15165 @@ -295,16 +296,14 @@ static inline int OFFSET_TO_SEG(int offs
15166 * BLKTAP VM OPS
15167 */
15168
15169 -static struct page *blktap_nopage(struct vm_area_struct *vma,
15170 - unsigned long address,
15171 - int *type)
15172 +static int blktap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
15173 {
15174 /*
15175 * if the page has not been mapped in by the driver then return
15176 - * NOPAGE_SIGBUS to the domain.
15177 + * VM_FAULT_SIGBUS to the domain.
15178 */
15179
15180 - return NOPAGE_SIGBUS;
15181 + return VM_FAULT_SIGBUS;
15182 }
15183
15184 static pte_t blktap_clear_pte(struct vm_area_struct *vma,
15185 @@ -390,7 +389,7 @@ static pte_t blktap_clear_pte(struct vm_
15186 }
15187
15188 struct vm_operations_struct blktap_vm_ops = {
15189 - nopage: blktap_nopage,
15190 + fault: blktap_fault,
15191 zap_pte: blktap_clear_pte,
15192 };
15193
15194 @@ -483,9 +482,8 @@ found:
15195 tapfds[minor] = info;
15196
15197 if ((class = get_xen_class()) != NULL)
15198 - class_device_create(class, NULL,
15199 - MKDEV(blktap_major, minor), NULL,
15200 - "blktap%d", minor);
15201 + device_create(class, NULL, MKDEV(blktap_major, minor),
15202 + "blktap%d", minor);
15203 }
15204
15205 out:
15206 @@ -527,7 +525,7 @@ void signal_tapdisk(int idx)
15207 return;
15208
15209 if (info->pid > 0) {
15210 - ptask = find_task_by_pid(info->pid);
15211 + ptask = find_task_by_pid_ns(info->pid, info->pid_ns);
15212 if (ptask)
15213 info->status = CLEANSHUTDOWN;
15214 }
15215 @@ -773,8 +771,9 @@ static int blktap_ioctl(struct inode *in
15216 {
15217 if (info) {
15218 info->pid = (pid_t)arg;
15219 - DPRINTK("blktap: pid received %d\n",
15220 - info->pid);
15221 + info->pid_ns = current->nsproxy->pid_ns;
15222 + DPRINTK("blktap: pid received %p:%d\n",
15223 + info->pid_ns, info->pid);
15224 }
15225 return 0;
15226 }
15227 @@ -1687,9 +1686,7 @@ static int __init blkif_init(void)
15228 * We only create the device when a request of a new device is
15229 * made.
15230 */
15231 - class_device_create(class, NULL,
15232 - MKDEV(blktap_major, 0), NULL,
15233 - "blktap0");
15234 + device_create(class, NULL, MKDEV(blktap_major, 0), "blktap0");
15235 } else {
15236 /* this is bad, but not fatal */
15237 WPRINTK("blktap: sysfs xen_class not created\n");
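The pid handling change above follows the 2.6.26 pattern of recording the caller's pid namespace when the pid is handed in and reusing it for the later task lookup. The general shape, with made-up names standing in for the blktap-specific parts:

	struct example_owner {
		pid_t pid;
		struct pid_namespace *pid_ns;
	};

	static void example_set_owner(struct example_owner *o, pid_t pid)
	{
		o->pid = pid;
		o->pid_ns = current->nsproxy->pid_ns;	/* namespace the pid is valid in */
	}

	static struct task_struct *example_get_owner(struct example_owner *o)
	{
		/* Caller is expected to hold rcu_read_lock() or tasklist_lock. */
		return find_task_by_pid_ns(o->pid, o->pid_ns);
	}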
15238 --- a/drivers/xen/char/mem.c
15239 +++ b/drivers/xen/char/mem.c
15240 @@ -33,6 +33,27 @@ static inline int uncached_access(struct
15241 return 0;
15242 }
15243
15244 +static inline int range_is_allowed(unsigned long pfn, unsigned long size)
15245 +{
15246 +#ifdef CONFIG_NONPROMISC_DEVMEM
15247 + u64 from = ((u64)pfn) << PAGE_SHIFT;
15248 + u64 to = from + size;
15249 + u64 cursor = from;
15250 +
15251 + while (cursor < to) {
15252 + if (!devmem_is_allowed(pfn)) {
15253 + printk(KERN_INFO
15254 + "Program %s tried to access /dev/mem between %Lx->%Lx.\n",
15255 + current->comm, from, to);
15256 + return 0;
15257 + }
15258 + cursor += PAGE_SIZE;
15259 + pfn++;
15260 + }
15261 +#endif
15262 + return 1;
15263 +}
15264 +
15265 /*
15266 * This funcion reads the *physical* memory. The f_pos points directly to the
15267 * memory location.
15268 @@ -55,6 +76,9 @@ static ssize_t read_mem(struct file * fi
15269
15270 sz = min_t(unsigned long, sz, count);
15271
15272 + if (!range_is_allowed(p >> PAGE_SHIFT, count))
15273 + return -EPERM;
15274 +
15275 v = ioremap(p, sz);
15276 if (IS_ERR(v) || v == NULL) {
15277 /*
15278 @@ -103,6 +127,9 @@ static ssize_t write_mem(struct file * f
15279
15280 sz = min_t(unsigned long, sz, count);
15281
15282 + if (!range_is_allowed(p >> PAGE_SHIFT, sz))
15283 + return -EPERM;
15284 +
15285 v = ioremap(p, sz);
15286 if (v == NULL)
15287 break;
15288 @@ -131,6 +158,23 @@ static ssize_t write_mem(struct file * f
15289 }
15290
15291 #ifndef ARCH_HAS_DEV_MEM_MMAP_MEM
15292 +static void mmap_mem_open(struct vm_area_struct *vma)
15293 +{
15294 + map_devmem(vma->vm_pgoff, vma->vm_end - vma->vm_start,
15295 + vma->vm_page_prot);
15296 +}
15297 +
15298 +static void mmap_mem_close(struct vm_area_struct *vma)
15299 +{
15300 + unmap_devmem(vma->vm_pgoff, vma->vm_end - vma->vm_start,
15301 + vma->vm_page_prot);
15302 +}
15303 +
15304 +static struct vm_operations_struct mmap_mem_ops = {
15305 + .open = mmap_mem_open,
15306 + .close = mmap_mem_close
15307 +};
15308 +
15309 static int xen_mmap_mem(struct file * file, struct vm_area_struct * vma)
15310 {
15311 size_t size = vma->vm_end - vma->vm_start;
15312 @@ -138,6 +182,15 @@ static int xen_mmap_mem(struct file * fi
15313 if (uncached_access(file))
15314 vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
15315
15316 + if (!range_is_allowed(vma->vm_pgoff, size))
15317 + return -EPERM;
15318 +
15319 + if (!phys_mem_access_prot_allowed(file, vma->vm_pgoff, size,
15320 + &vma->vm_page_prot))
15321 + return -EINVAL;
15322 +
15323 + vma->vm_ops = &mmap_mem_ops;
15324 +
15325 /* We want to return the real error code, not EAGAIN. */
15326 return direct_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
15327 size, vma->vm_page_prot, DOMID_IO);
15328 --- a/drivers/xen/console/console.c
15329 +++ b/drivers/xen/console/console.c
15330 @@ -536,16 +536,18 @@ static int xencons_write(
15331 return i;
15332 }
15333
15334 -static void xencons_put_char(struct tty_struct *tty, u_char ch)
15335 +static int xencons_put_char(struct tty_struct *tty, u_char ch)
15336 {
15337 unsigned long flags;
15338 + int ret;
15339
15340 if (DUMMY_TTY(tty))
15341 - return;
15342 + return 0;
15343
15344 spin_lock_irqsave(&xencons_lock, flags);
15345 - (void)__xencons_put_char(ch);
15346 + ret = __xencons_put_char(ch);
15347 spin_unlock_irqrestore(&xencons_lock, flags);
15348 + return ret;
15349 }
15350
15351 static void xencons_flush_chars(struct tty_struct *tty)
15352 @@ -567,7 +569,7 @@ static void xencons_wait_until_sent(stru
15353 if (DUMMY_TTY(tty))
15354 return;
15355
15356 - while (DRV(tty->driver)->chars_in_buffer(tty)) {
15357 + while (tty_chars_in_buffer(tty)) {
15358 set_current_state(TASK_INTERRUPTIBLE);
15359 schedule_timeout(1);
15360 if (signal_pending(current))
15361 @@ -616,8 +618,7 @@ static void xencons_close(struct tty_str
15362
15363 tty->closing = 1;
15364 tty_wait_until_sent(tty, 0);
15365 - if (DRV(tty->driver)->flush_buffer != NULL)
15366 - DRV(tty->driver)->flush_buffer(tty);
15367 + tty_driver_flush_buffer(tty);
15368 if (tty->ldisc.flush_buffer != NULL)
15369 tty->ldisc.flush_buffer(tty);
15370 tty->closing = 0;
15371 --- a/drivers/xen/core/machine_kexec.c
15372 +++ b/drivers/xen/core/machine_kexec.c
15373 @@ -90,6 +90,9 @@ void __init xen_machine_kexec_setup_reso
15374 xen_hypervisor_res.start = range.start;
15375 xen_hypervisor_res.end = range.start + range.size - 1;
15376 xen_hypervisor_res.flags = IORESOURCE_BUSY | IORESOURCE_MEM;
15377 +#ifdef CONFIG_X86_64
15378 + insert_resource(&iomem_resource, &xen_hypervisor_res);
15379 +#endif
15380
15381 /* fill in crashk_res if range is reserved by hypervisor */
15382
15383 @@ -102,6 +105,9 @@ void __init xen_machine_kexec_setup_reso
15384 if (range.size) {
15385 crashk_res.start = range.start;
15386 crashk_res.end = range.start + range.size - 1;
15387 +#ifdef CONFIG_X86_64
15388 + insert_resource(&iomem_resource, &crashk_res);
15389 +#endif
15390 }
15391
15392 /* get physical address of vmcoreinfo */
15393 @@ -146,11 +152,13 @@ void __init xen_machine_kexec_setup_reso
15394 return;
15395 }
15396
15397 +#ifndef CONFIG_X86_64
15398 void __init xen_machine_kexec_register_resources(struct resource *res)
15399 {
15400 request_resource(res, &xen_hypervisor_res);
15401 machine_kexec_register_resources(res);
15402 }
15403 +#endif
15404
15405 static void setup_load_arg(xen_kexec_image_t *xki, struct kimage *image)
15406 {
15407 --- a/drivers/xen/core/machine_reboot.c
15408 +++ b/drivers/xen/core/machine_reboot.c
15409 @@ -52,6 +52,14 @@ void machine_power_off(void)
15410 HYPERVISOR_shutdown(SHUTDOWN_poweroff);
15411 }
15412
15413 +#ifdef CONFIG_KEXEC
15414 +#include <asm/reboot.h>
15415 +void machine_crash_shutdown(struct pt_regs *regs)
15416 +{
15417 + native_machine_crash_shutdown(regs);
15418 +}
15419 +#endif
15420 +
15421 int reboot_thru_bios = 0; /* for dmi_scan.c */
15422 EXPORT_SYMBOL(machine_restart);
15423 EXPORT_SYMBOL(machine_halt);
15424 --- a/drivers/xen/core/smpboot.c
15425 +++ b/drivers/xen/core/smpboot.c
15426 @@ -57,17 +57,16 @@ static DEFINE_PER_CPU(int, callfunc_irq)
15427 static char resched_name[NR_CPUS][15];
15428 static char callfunc_name[NR_CPUS][15];
15429
15430 -u8 cpu_2_logical_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
15431 +#ifdef CONFIG_X86_LOCAL_APIC
15432 +#define set_cpu_to_apicid(cpu, apicid) (per_cpu(x86_cpu_to_apicid, cpu) = (apicid))
15433 +#else
15434 +#define set_cpu_to_apicid(cpu, apicid)
15435 +#endif
15436
15437 DEFINE_PER_CPU(cpumask_t, cpu_sibling_map);
15438 DEFINE_PER_CPU(cpumask_t, cpu_core_map);
15439 EXPORT_PER_CPU_SYMBOL(cpu_core_map);
15440
15441 -#if defined(__i386__)
15442 -DEFINE_PER_CPU(u8, x86_cpu_to_apicid) = BAD_APICID;
15443 -EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid);
15444 -#endif
15445 -
15446 void __init prefill_possible_map(void)
15447 {
15448 int i, rc;
15449 @@ -158,7 +157,7 @@ static int __cpuinit xen_smp_intr_init(u
15450 }
15451
15452 #ifdef CONFIG_HOTPLUG_CPU
15453 -static void xen_smp_intr_exit(unsigned int cpu)
15454 +static void __cpuexit xen_smp_intr_exit(unsigned int cpu)
15455 {
15456 if (cpu != 0)
15457 local_teardown_timer(cpu);
15458 @@ -267,8 +266,7 @@ void __init smp_prepare_cpus(unsigned in
15459 boot_cpu_data.apicid = apicid;
15460 cpu_data(0) = boot_cpu_data;
15461
15462 - cpu_2_logical_apicid[0] = apicid;
15463 - per_cpu(x86_cpu_to_apicid, 0) = apicid;
15464 + set_cpu_to_apicid(0, apicid);
15465
15466 current_thread_info()->cpu = 0;
15467
15468 @@ -323,8 +321,7 @@ void __init smp_prepare_cpus(unsigned in
15469 cpu_data(cpu).cpu_index = cpu;
15470 cpu_data(cpu).apicid = apicid;
15471
15472 - cpu_2_logical_apicid[cpu] = apicid;
15473 - per_cpu(x86_cpu_to_apicid, cpu) = apicid;
15474 + set_cpu_to_apicid(cpu, apicid);
15475
15476 #ifdef __x86_64__
15477 cpu_pda(cpu)->pcurrent = idle;
15478 @@ -379,7 +376,7 @@ static int __init initialize_cpu_present
15479 }
15480 core_initcall(initialize_cpu_present_map);
15481
15482 -int __cpu_disable(void)
15483 +int __cpuexit __cpu_disable(void)
15484 {
15485 cpumask_t map = cpu_online_map;
15486 unsigned int cpu = smp_processor_id();
15487 @@ -396,7 +393,7 @@ int __cpu_disable(void)
15488 return 0;
15489 }
15490
15491 -void __cpu_die(unsigned int cpu)
15492 +void __cpuexit __cpu_die(unsigned int cpu)
15493 {
15494 while (HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL)) {
15495 current->state = TASK_UNINTERRUPTIBLE;
15496 --- a/drivers/xen/core/xen_proc.c
15497 +++ b/drivers/xen/core/xen_proc.c
15498 @@ -8,7 +8,7 @@ static struct proc_dir_entry *xen_base;
15499 struct proc_dir_entry *create_xen_proc_entry(const char *name, mode_t mode)
15500 {
15501 if ( xen_base == NULL )
15502 - if ( (xen_base = proc_mkdir("xen", &proc_root)) == NULL )
15503 + if ( (xen_base = proc_mkdir("xen", NULL)) == NULL )
15504 panic("Couldn't create /proc/xen");
15505 return create_proc_entry(name, mode, xen_base);
15506 }
15507 --- a/drivers/xen/fbfront/xenfb.c
15508 +++ b/drivers/xen/fbfront/xenfb.c
15509 @@ -94,7 +94,7 @@ struct xenfb_info
15510 * only mappings. The former creates unfaulted pages. Preserves
15511 * invariant. The latter removes pages. Preserves invariant.
15512 *
15513 - * 3. Holding both locks: xenfb_vm_nopage(). Extends the dirty
15514 + * 3. Holding both locks: xenfb_vm_fault(). Extends the dirty
15515 * rectangle and updates mappings consistently. Preserves
15516 * invariant.
15517 *
15518 @@ -113,13 +113,13 @@ struct xenfb_info
15519 *
15520 * But FIXME: the invariant is too weak. It misses that the fault
15521 * record in mappings must be consistent with the mapping of pages in
15522 - * the associated address space! do_no_page() updates the PTE after
15523 - * xenfb_vm_nopage() returns, i.e. outside the critical region. This
15524 + * the associated address space! __do_fault() updates the PTE after
15525 + * xenfb_vm_fault() returns, i.e. outside the critical region. This
15526 * allows the following race:
15527 *
15528 * X writes to some address in the Xen frame buffer
15529 - * Fault - call do_no_page()
15530 - * call xenfb_vm_nopage()
15531 + * Fault - call __do_fault()
15532 + * call xenfb_vm_fault()
15533 * grab mm_lock
15534 * map->faults++;
15535 * release mm_lock
15536 @@ -386,18 +386,17 @@ static void xenfb_vm_close(struct vm_are
15537 mutex_unlock(&info->mm_lock);
15538 }
15539
15540 -static struct page *xenfb_vm_nopage(struct vm_area_struct *vma,
15541 - unsigned long vaddr, int *type)
15542 +static int xenfb_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
15543 {
15544 struct xenfb_mapping *map = vma->vm_private_data;
15545 struct xenfb_info *info = map->info;
15546 - int pgnr = (vaddr - vma->vm_start) >> PAGE_SHIFT;
15547 + int pgnr = ((long)vmf->virtual_address - vma->vm_start) >> PAGE_SHIFT;
15548 unsigned long flags;
15549 struct page *page;
15550 int y1, y2;
15551
15552 if (pgnr >= info->nr_pages)
15553 - return NOPAGE_SIGBUS;
15554 + return VM_FAULT_SIGBUS;
15555
15556 mutex_lock(&info->mm_lock);
15557 spin_lock_irqsave(&info->dirty_lock, flags);
15558 @@ -413,16 +412,15 @@ static struct page *xenfb_vm_nopage(stru
15559 spin_unlock_irqrestore(&info->dirty_lock, flags);
15560 mutex_unlock(&info->mm_lock);
15561
15562 - if (type)
15563 - *type = VM_FAULT_MINOR;
15564 + vmf->page = page;
15565
15566 - return page;
15567 + return VM_FAULT_MINOR;
15568 }
15569
15570 static struct vm_operations_struct xenfb_vm_ops = {
15571 .open = xenfb_vm_open,
15572 .close = xenfb_vm_close,
15573 - .nopage = xenfb_vm_nopage,
15574 + .fault = xenfb_vm_fault,
15575 };
15576
15577 static int xenfb_mmap(struct fb_info *fb_info, struct vm_area_struct *vma)
15578 --- a/drivers/xen/gntdev/gntdev.c
15579 +++ b/drivers/xen/gntdev/gntdev.c
15580 @@ -392,7 +392,7 @@ nomem_out:
15581 static int __init gntdev_init(void)
15582 {
15583 struct class *class;
15584 - struct class_device *device;
15585 + struct device *device;
15586
15587 if (!is_running_on_xen()) {
15588 printk(KERN_ERR "You must be running Xen to use gntdev\n");
15589 @@ -417,8 +417,8 @@ static int __init gntdev_init(void)
15590 return 0;
15591 }
15592
15593 - device = class_device_create(class, NULL, MKDEV(gntdev_major, 0),
15594 - NULL, GNTDEV_NAME);
15595 + device = device_create(class, NULL, MKDEV(gntdev_major, 0),
15596 + GNTDEV_NAME);
15597 if (IS_ERR(device)) {
15598 printk(KERN_ERR "Error creating gntdev device in xen_class\n");
15599 printk(KERN_ERR "gntdev created with major number = %d\n",
15600 @@ -435,7 +435,7 @@ static void __exit gntdev_exit(void)
15601 {
15602 struct class *class;
15603 if ((class = get_xen_class()) != NULL)
15604 - class_device_destroy(class, MKDEV(gntdev_major, 0));
15605 + device_destroy(class, MKDEV(gntdev_major, 0));
15606 unregister_chrdev(gntdev_major, GNTDEV_NAME);
15607 }
15608
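Both the blktap and gntdev hunks above replace the removed class_device API with device_create(), whose 2.6.26 form takes the class, an optional parent device, the dev_t and a printf-style name (sketch with illustrative names):

	static struct device *example_create_node(struct class *cls,
						  int major, int minor)
	{
		/* No parent device and no driver data in the 2.6.26 API. */
		return device_create(cls, NULL, MKDEV(major, minor),
				     "example%d", minor);
	}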
15609 --- a/drivers/xen/Kconfig
15610 +++ b/drivers/xen/Kconfig
15611 @@ -2,8 +2,6 @@
15612 # This Kconfig describe xen options
15613 #
15614
15615 -mainmenu "Xen Configuration"
15616 -
15617 config XEN
15618 bool
15619
15620 --- a/drivers/xen/Makefile
15621 +++ b/drivers/xen/Makefile
15622 @@ -1,5 +1,8 @@
15623 -obj-$(CONFIG_PARAVIRT_XEN) += grant-table.o
15624 +obj-$(CONFIG_PARAVIRT_XEN) += grant-table.o features.o events.o
15625 +xen-xencomm-$(CONFIG_PARAVIRT_XEN) := xencomm.o
15626 +xen-balloon-$(CONFIG_PARAVIRT_XEN) := balloon.o
15627
15628 +xen-balloon-$(CONFIG_XEN) := balloon/
15629 obj-$(CONFIG_XEN) += core/
15630 obj-$(CONFIG_XEN) += console/
15631 obj-$(CONFIG_XEN) += evtchn/
15632 @@ -7,7 +10,8 @@ obj-y += xenbus/
15633 obj-$(CONFIG_XEN) += char/
15634
15635 obj-$(CONFIG_XEN) += util.o
15636 -obj-$(CONFIG_XEN_BALLOON) += balloon/
15637 +obj-$(CONFIG_XEN_XENCOMM) += $(xen-xencomm-y)
15638 +obj-$(CONFIG_XEN_BALLOON) += $(xen-balloon-y)
15639 obj-$(CONFIG_XEN_BLKDEV_BACKEND) += blkback/
15640 obj-$(CONFIG_XEN_BLKDEV_TAP) += blktap/
15641 obj-$(CONFIG_XEN_NETDEV_BACKEND) += netback/
15642 --- a/drivers/xen/netfront/netfront.c
15643 +++ b/drivers/xen/netfront/netfront.c
15644 @@ -1464,8 +1464,7 @@ err:
15645 }
15646 }
15647
15648 - while ((skb = __skb_dequeue(&errq)))
15649 - kfree_skb(skb);
15650 + __skb_queue_purge(&errq);
15651
15652 while ((skb = __skb_dequeue(&rxq)) != NULL) {
15653 struct page *page = NETFRONT_SKB_CB(skb)->page;
15654 @@ -1630,8 +1629,7 @@ static void netif_release_rx_bufs_flip(s
15655 }
15656 }
15657
15658 - while ((skb = __skb_dequeue(&free_list)) != NULL)
15659 - dev_kfree_skb(skb);
15660 + __skb_queue_purge(&free_list);
15661
15662 spin_unlock_bh(&np->rx_lock);
15663 }
15664 --- a/drivers/xen/privcmd/privcmd.c
15665 +++ b/drivers/xen/privcmd/privcmd.c
15666 @@ -261,15 +261,13 @@ static long privcmd_ioctl(struct file *f
15667 }
15668
15669 #ifndef HAVE_ARCH_PRIVCMD_MMAP
15670 -static struct page *privcmd_nopage(struct vm_area_struct *vma,
15671 - unsigned long address,
15672 - int *type)
15673 +static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
15674 {
15675 - return NOPAGE_SIGBUS;
15676 + return VM_FAULT_SIGBUS;
15677 }
15678
15679 static struct vm_operations_struct privcmd_vm_ops = {
15680 - .nopage = privcmd_nopage
15681 + .fault = privcmd_fault
15682 };
15683
15684 static int privcmd_mmap(struct file * file, struct vm_area_struct * vma)
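The blktap, xenfb and privcmd hunks above all perform the same ->nopage to ->fault conversion required by 2.6.26: the handler now receives a struct vm_fault, reports the backing page through vmf->page, and returns a VM_FAULT_* code instead of a page pointer. The generic shape, with a hypothetical lookup helper standing in for the driver-specific part:

	/* Hypothetical helper: however the driver finds the backing page. */
	static struct page *example_lookup_page(void *priv, unsigned long pgnr);

	static int example_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
	{
		unsigned long pgnr = ((unsigned long)vmf->virtual_address -
				      vma->vm_start) >> PAGE_SHIFT;
		struct page *page = example_lookup_page(vma->vm_private_data, pgnr);

		if (!page)
			return VM_FAULT_SIGBUS;	/* was: return NOPAGE_SIGBUS; */

		vmf->page = page;		/* was: return page; */
		return 0;			/* was: *type = VM_FAULT_MINOR; */
	}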
15685 --- a/drivers/xen/xenbus/xenbus_client.c
15686 +++ b/drivers/xen/xenbus/xenbus_client.c
15687 @@ -440,7 +440,7 @@ int xenbus_map_ring_valloc(struct xenbus
15688
15689 *vaddr = NULL;
15690
15691 - area = alloc_vm_area(PAGE_SIZE);
15692 + area = xen_alloc_vm_area(PAGE_SIZE);
15693 if (!area)
15694 return -ENOMEM;
15695
15696 @@ -450,7 +450,7 @@ int xenbus_map_ring_valloc(struct xenbus
15697 BUG();
15698
15699 if (op.status != GNTST_okay) {
15700 - free_vm_area(area);
15701 + xen_free_vm_area(area);
15702 xenbus_dev_fatal(dev, op.status,
15703 "mapping in shared page %d from domain %d",
15704 gnt_ref, dev->otherend_id);
15705 @@ -549,7 +549,7 @@ int xenbus_unmap_ring_vfree(struct xenbu
15706 BUG();
15707
15708 if (op.status == GNTST_okay)
15709 - free_vm_area(area);
15710 + xen_free_vm_area(area);
15711 else
15712 xenbus_dev_error(dev, op.status,
15713 "unmapping page at handle %d error %d",
15714 --- a/drivers/xen/xenbus/xenbus_probe.c
15715 +++ b/drivers/xen/xenbus/xenbus_probe.c
15716 @@ -173,7 +173,7 @@ static int read_backend_details(struct x
15717 return read_otherend_details(xendev, "backend-id", "backend");
15718 }
15719
15720 -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16) && (defined(CONFIG_XEN) || defined(MODULE))
15721 +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16)
15722 static int xenbus_uevent_frontend(struct device *dev, struct kobj_uevent_env *env)
15723 {
15724 struct xenbus_device *xdev;
15725 @@ -185,8 +185,10 @@ static int xenbus_uevent_frontend(struct
15726 return -ENODEV;
15727
15728 /* stuff we want to pass to /sbin/hotplug */
15729 +#if defined(CONFIG_XEN) || defined(MODULE)
15730 add_uevent_var(env, "XENBUS_TYPE=%s", xdev->devicetype);
15731 add_uevent_var(env, "XENBUS_PATH=%s", xdev->nodename);
15732 +#endif
15733 add_uevent_var(env, "MODALIAS=xen:%s", xdev->devicetype);
15734
15735 return 0;
15736 @@ -207,10 +209,8 @@ static struct xen_bus_type xenbus_fronte
15737 .probe = xenbus_dev_probe,
15738 .remove = xenbus_dev_remove,
15739 .shutdown = xenbus_dev_shutdown,
15740 -#if defined(CONFIG_XEN) || defined(MODULE)
15741 .uevent = xenbus_uevent_frontend,
15742 #endif
15743 -#endif
15744 },
15745 #if defined(CONFIG_XEN) || defined(MODULE)
15746 .dev = {
15747 @@ -519,6 +519,15 @@ static ssize_t xendev_show_devtype(struc
15748 }
15749 DEVICE_ATTR(devtype, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_devtype, NULL);
15750
15751 +static ssize_t xendev_show_modalias(struct device *dev,
15752 +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,13)
15753 + struct device_attribute *attr,
15754 +#endif
15755 + char *buf)
15756 +{
15757 + return sprintf(buf, "xen:%s\n", to_xenbus_device(dev)->devicetype);
15758 +}
15759 +DEVICE_ATTR(modalias, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_modalias, NULL);
15760
15761 int xenbus_probe_node(struct xen_bus_type *bus,
15762 const char *type,
15763 @@ -579,10 +588,16 @@ int xenbus_probe_node(struct xen_bus_typ
15764
15765 err = device_create_file(&xendev->dev, &dev_attr_devtype);
15766 if (err)
15767 - goto fail_remove_file;
15768 + goto fail_remove_nodename;
15769 +
15770 + err = device_create_file(&xendev->dev, &dev_attr_modalias);
15771 + if (err)
15772 + goto fail_remove_devtype;
15773
15774 return 0;
15775 -fail_remove_file:
15776 +fail_remove_devtype:
15777 + device_remove_file(&xendev->dev, &dev_attr_devtype);
15778 +fail_remove_nodename:
15779 device_remove_file(&xendev->dev, &dev_attr_nodename);
15780 fail_unregister:
15781 device_unregister(&xendev->dev);
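The new modalias attribute and the MODALIAS uevent variable above let udev autoload frontend drivers by device type; a frontend only has to export a matching alias, for example (illustrative, as done by the network frontend):

	MODULE_ALIAS("xen:vif");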
15782 --- a/fs/aio.c
15783 +++ b/fs/aio.c
15784 @@ -1255,6 +1255,7 @@ static void io_destroy(struct kioctx *io
15785 #ifdef CONFIG_EPOLL
15786 /* forget the poll file, but it's up to the user to close it */
15787 if (ioctx->file) {
15788 + fput(ioctx->file);
15789 ioctx->file->private_data = 0;
15790 ioctx->file = 0;
15791 }
15792 @@ -1279,6 +1280,7 @@ static int aio_queue_fd_close(struct ino
15793 spin_lock_irq(&ioctx->ctx_lock);
15794 ioctx->file = 0;
15795 spin_unlock_irq(&ioctx->ctx_lock);
15796 + fput(file);
15797 }
15798 return 0;
15799 }
15800 @@ -1314,16 +1316,17 @@ static const struct file_operations aioq
15801
15802 static int make_aio_fd(struct kioctx *ioctx)
15803 {
15804 - int error, fd;
15805 - struct inode *inode;
15806 + int fd;
15807 struct file *file;
15808
15809 - error = anon_inode_getfd(&fd, &inode, &file, "[aioq]",
15810 - &aioq_fops, ioctx);
15811 - if (error)
15812 - return error;
15813 + fd = anon_inode_getfd("[aioq]", &aioq_fops, ioctx);
15814 + if (fd < 0)
15815 + return fd;
15816
15817 /* associate the file with the IO context */
15818 + file = fget(fd);
15819 + if (!file)
15820 + return -EBADF;
15821 file->private_data = ioctx;
15822 ioctx->file = file;
15823 init_waitqueue_head(&ioctx->poll_wait);
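The aio hunk adapts to the 2.6.26 anon_inode_getfd(), which now returns the descriptor (or a negative error) instead of filling in fd/inode/file pointers, so the struct file has to be re-looked-up with fget(); the reference fget() takes is what the fput() calls added earlier in the hunk balance. A condensed sketch of the pattern (names are illustrative):

	static int example_make_fd(const struct file_operations *fops, void *priv)
	{
		struct file *file;
		int fd = anon_inode_getfd("[example]", fops, priv);

		if (fd < 0)
			return fd;		/* errors now come back as negative fds */

		file = fget(fd);		/* takes a reference of its own ... */
		if (!file)
			return -EBADF;
		/* ... use or store the struct file, then balance with: */
		fput(file);
		return fd;
	}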
15824 --- a/include/asm-x86/dma-mapping.h
15825 +++ b/include/asm-x86/dma-mapping.h
15826 @@ -223,8 +223,13 @@ static inline dma_addr_t dma_map_page(st
15827 struct dma_mapping_ops *ops = get_dma_ops(dev);
15828
15829 BUG_ON(!valid_dma_direction(direction));
15830 +#ifndef CONFIG_XEN
15831 return ops->map_single(dev, page_to_phys(page) + offset,
15832 size, direction);
15833 +#else
15834 + return ops->map_single(dev, page_to_pseudophys(page) + offset,
15835 + size, direction);
15836 +#endif
15837 }
15838
15839 static inline void dma_unmap_page(struct device *dev, dma_addr_t addr,
15840 --- a/include/asm-x86/genapic_64.h
15841 +++ b/include/asm-x86/genapic_64.h
15842 @@ -46,5 +46,6 @@ extern struct genapic apic_x2apic_phys;
15843 extern int acpi_madt_oem_check(char *, char *);
15844
15845 +#ifndef CONFIG_XEN
15846 enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC};
15847 extern enum uv_system_type get_uv_system_type(void);
15848 extern int is_uv_system(void);
15849 @@ -55,6 +56,10 @@ DECLARE_PER_CPU(int, x2apic_extra_bits);
15850 extern void uv_cpu_init(void);
15851 extern void uv_system_init(void);
15852 extern int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip);
15853 +#else
15854 +#define is_uv_system() 0
15855 +#define uv_cpu_init() ((void)0)
15856 +#endif
15857
15858 extern void setup_apic_routing(void);
15859
15860 --- a/include/asm-x86/mach-xen/asm/desc.h
15861 +++ b/include/asm-x86/mach-xen/asm/desc.h
15862 @@ -64,8 +64,8 @@ static inline struct desc_struct *get_cp
15863 }
15864
15865 static inline void pack_gate(gate_desc *gate, unsigned char type,
15866 - unsigned long base, unsigned dpl, unsigned flags, unsigned short seg)
15867 -
15868 + unsigned long base, unsigned dpl, unsigned flags,
15869 + unsigned short seg)
15870 {
15871 gate->a = (seg << 16) | (base & 0xffff);
15872 gate->b = (base & 0xffff0000) |
15873 @@ -84,22 +84,23 @@ static inline int desc_empty(const void
15874 #define load_TR_desc() native_load_tr_desc()
15875 #define load_gdt(dtr) native_load_gdt(dtr)
15876 #define load_idt(dtr) native_load_idt(dtr)
15877 -#define load_tr(tr) __asm__ __volatile("ltr %0"::"m" (tr))
15878 -#define load_ldt(ldt) __asm__ __volatile("lldt %0"::"m" (ldt))
15879 +#define load_tr(tr) asm volatile("ltr %0"::"m" (tr))
15880 +#define load_ldt(ldt) asm volatile("lldt %0"::"m" (ldt))
15881
15882 #define store_gdt(dtr) native_store_gdt(dtr)
15883 #define store_idt(dtr) native_store_idt(dtr)
15884 #define store_tr(tr) (tr = native_store_tr())
15885 -#define store_ldt(ldt) __asm__ ("sldt %0":"=m" (ldt))
15886 +#define store_ldt(ldt) asm("sldt %0":"=m" (ldt))
15887
15888 #define load_TLS(t, cpu) native_load_tls(t, cpu)
15889 #define set_ldt native_set_ldt
15890
15891 -#define write_ldt_entry(dt, entry, desc) \
15892 - native_write_ldt_entry(dt, entry, desc)
15893 -#define write_gdt_entry(dt, entry, desc, type) \
15894 - native_write_gdt_entry(dt, entry, desc, type)
15895 -#define write_idt_entry(dt, entry, g) native_write_idt_entry(dt, entry, g)
15896 +#define write_ldt_entry(dt, entry, desc) \
15897 + native_write_ldt_entry(dt, entry, desc)
15898 +#define write_gdt_entry(dt, entry, desc, type) \
15899 + native_write_gdt_entry(dt, entry, desc, type)
15900 +#define write_idt_entry(dt, entry, g) \
15901 + native_write_idt_entry(dt, entry, g)
15902
15903 static inline void native_write_idt_entry(gate_desc *idt, int entry,
15904 const gate_desc *gate)
15905 @@ -138,8 +139,8 @@ static inline void pack_descriptor(struc
15906 {
15907 desc->a = ((base & 0xffff) << 16) | (limit & 0xffff);
15908 desc->b = (base & 0xff000000) | ((base & 0xff0000) >> 16) |
15909 - (limit & 0x000f0000) | ((type & 0xff) << 8) |
15910 - ((flags & 0xf) << 20);
15911 + (limit & 0x000f0000) | ((type & 0xff) << 8) |
15912 + ((flags & 0xf) << 20);
15913 desc->p = 1;
15914 }
15915
15916 @@ -160,7 +161,6 @@ static inline void set_tssldt_descriptor
15917 desc->base2 = (PTR_MIDDLE(addr) >> 8) & 0xFF;
15918 desc->base3 = PTR_HIGH(addr);
15919 #else
15920 -
15921 pack_descriptor((struct desc_struct *)d, addr, size, 0x80 | type, 0);
15922 #endif
15923 }
15924 @@ -178,7 +178,8 @@ static inline void __set_tss_desc(unsign
15925 * last valid byte
15926 */
15927 set_tssldt_descriptor(&tss, (unsigned long)addr, DESC_TSS,
15928 - IO_BITMAP_OFFSET + IO_BITMAP_BYTES + sizeof(unsigned long) - 1);
15929 + IO_BITMAP_OFFSET + IO_BITMAP_BYTES +
15930 + sizeof(unsigned long) - 1);
15931 write_gdt_entry(d, entry, &tss, DESC_TSS);
15932 }
15933
15934 @@ -187,16 +188,16 @@ static inline void __set_tss_desc(unsign
15935 static inline void native_set_ldt(const void *addr, unsigned int entries)
15936 {
15937 if (likely(entries == 0))
15938 - __asm__ __volatile__("lldt %w0"::"q" (0));
15939 + asm volatile("lldt %w0"::"q" (0));
15940 else {
15941 unsigned cpu = smp_processor_id();
15942 ldt_desc ldt;
15943
15944 - set_tssldt_descriptor(&ldt, (unsigned long)addr,
15945 - DESC_LDT, entries * sizeof(ldt) - 1);
15946 + set_tssldt_descriptor(&ldt, (unsigned long)addr, DESC_LDT,
15947 + entries * LDT_ENTRY_SIZE - 1);
15948 write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT,
15949 &ldt, DESC_LDT);
15950 - __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
15951 + asm volatile("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
15952 }
15953 }
15954
15955 @@ -261,15 +262,15 @@ static inline void xen_load_tls(struct t
15956 }
15957 #endif
15958
15959 -#define _LDT_empty(info) (\
15960 - (info)->base_addr == 0 && \
15961 - (info)->limit == 0 && \
15962 - (info)->contents == 0 && \
15963 - (info)->read_exec_only == 1 && \
15964 - (info)->seg_32bit == 0 && \
15965 - (info)->limit_in_pages == 0 && \
15966 - (info)->seg_not_present == 1 && \
15967 - (info)->useable == 0)
15968 +#define _LDT_empty(info) \
15969 + ((info)->base_addr == 0 && \
15970 + (info)->limit == 0 && \
15971 + (info)->contents == 0 && \
15972 + (info)->read_exec_only == 1 && \
15973 + (info)->seg_32bit == 0 && \
15974 + (info)->limit_in_pages == 0 && \
15975 + (info)->seg_not_present == 1 && \
15976 + (info)->useable == 0)
15977
15978 #ifdef CONFIG_X86_64
15979 #define LDT_empty(info) (_LDT_empty(info) && ((info)->lm == 0))
15980 @@ -309,7 +310,7 @@ static inline unsigned long get_desc_lim
15981
15982 #ifndef CONFIG_X86_NO_IDT
15983 static inline void _set_gate(int gate, unsigned type, void *addr,
15984 - unsigned dpl, unsigned ist, unsigned seg)
15985 + unsigned dpl, unsigned ist, unsigned seg)
15986 {
15987 gate_desc s;
15988 pack_gate(&s, type, (unsigned long)addr, dpl, ist, seg);
15989 @@ -393,10 +394,10 @@ static inline void set_system_gate_ist(i
15990 * Will read the base address of GDT_ENTRY_ESPFIX_SS and put it into %eax.
15991 */
15992 #define GET_DESC_BASE(idx, gdt, base, lo_w, lo_b, hi_b) \
15993 - movb idx*8+4(gdt), lo_b; \
15994 - movb idx*8+7(gdt), hi_b; \
15995 - shll $16, base; \
15996 - movw idx*8+2(gdt), lo_w;
15997 + movb idx * 8 + 4(gdt), lo_b; \
15998 + movb idx * 8 + 7(gdt), hi_b; \
15999 + shll $16, base; \
16000 + movw idx * 8 + 2(gdt), lo_w;
16001
16002
16003 #endif /* __ASSEMBLY__ */
16004 --- a/include/asm-x86/mach-xen/asm/dma-mapping_32.h
16005 +++ /dev/null
16006 @@ -1,141 +0,0 @@
16007 -#ifndef _ASM_I386_DMA_MAPPING_H
16008 -#define _ASM_I386_DMA_MAPPING_H
16009 -
16010 -/*
16011 - * IOMMU interface. See Documentation/DMA-mapping.txt and DMA-API.txt for
16012 - * documentation.
16013 - */
16014 -
16015 -#include <linux/mm.h>
16016 -#include <linux/scatterlist.h>
16017 -#include <asm/cache.h>
16018 -#include <asm/io.h>
16019 -#include <asm/swiotlb.h>
16020 -
16021 -static inline int
16022 -address_needs_mapping(struct device *hwdev, dma_addr_t addr)
16023 -{
16024 - dma_addr_t mask = 0xffffffff;
16025 - /* If the device has a mask, use it, otherwise default to 32 bits */
16026 - if (hwdev && hwdev->dma_mask)
16027 - mask = *hwdev->dma_mask;
16028 - return (addr & ~mask) != 0;
16029 -}
16030 -
16031 -extern int range_straddles_page_boundary(paddr_t p, size_t size);
16032 -
16033 -#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
16034 -#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
16035 -
16036 -void *dma_alloc_coherent(struct device *dev, size_t size,
16037 - dma_addr_t *dma_handle, gfp_t flag);
16038 -
16039 -void dma_free_coherent(struct device *dev, size_t size,
16040 - void *vaddr, dma_addr_t dma_handle);
16041 -
16042 -extern dma_addr_t
16043 -dma_map_single(struct device *dev, void *ptr, size_t size,
16044 - enum dma_data_direction direction);
16045 -
16046 -extern void
16047 -dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
16048 - enum dma_data_direction direction);
16049 -
16050 -extern int dma_map_sg(struct device *hwdev, struct scatterlist *sg,
16051 - int nents, enum dma_data_direction direction);
16052 -extern void dma_unmap_sg(struct device *hwdev, struct scatterlist *sg,
16053 - int nents, enum dma_data_direction direction);
16054 -
16055 -#ifdef CONFIG_HIGHMEM
16056 -extern dma_addr_t
16057 -dma_map_page(struct device *dev, struct page *page, unsigned long offset,
16058 - size_t size, enum dma_data_direction direction);
16059 -
16060 -extern void
16061 -dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size,
16062 - enum dma_data_direction direction);
16063 -#else
16064 -#define dma_map_page(dev, page, offset, size, dir) \
16065 - dma_map_single(dev, page_address(page) + (offset), (size), (dir))
16066 -#define dma_unmap_page dma_unmap_single
16067 -#endif
16068 -
16069 -extern void
16070 -dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size,
16071 - enum dma_data_direction direction);
16072 -
16073 -extern void
16074 -dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size,
16075 - enum dma_data_direction direction);
16076 -
16077 -static inline void
16078 -dma_sync_single_range_for_cpu(struct device *dev, dma_addr_t dma_handle,
16079 - unsigned long offset, size_t size,
16080 - enum dma_data_direction direction)
16081 -{
16082 - dma_sync_single_for_cpu(dev, dma_handle+offset, size, direction);
16083 -}
16084 -
16085 -static inline void
16086 -dma_sync_single_range_for_device(struct device *dev, dma_addr_t dma_handle,
16087 - unsigned long offset, size_t size,
16088 - enum dma_data_direction direction)
16089 -{
16090 - dma_sync_single_for_device(dev, dma_handle+offset, size, direction);
16091 -}
16092 -
16093 -extern void
16094 -dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems,
16095 - enum dma_data_direction direction);
16096 -
16097 -extern void
16098 -dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems,
16099 - enum dma_data_direction direction);
16100 -
16101 -extern int
16102 -dma_mapping_error(dma_addr_t dma_addr);
16103 -
16104 -extern int
16105 -dma_supported(struct device *dev, u64 mask);
16106 -
16107 -static inline int
16108 -dma_set_mask(struct device *dev, u64 mask)
16109 -{
16110 - if(!dev->dma_mask || !dma_supported(dev, mask))
16111 - return -EIO;
16112 -
16113 - *dev->dma_mask = mask;
16114 -
16115 - return 0;
16116 -}
16117 -
16118 -static inline int
16119 -dma_get_cache_alignment(void)
16120 -{
16121 - /* no easy way to get cache size on all x86, so return the
16122 - * maximum possible, to be safe */
16123 - return (1 << INTERNODE_CACHE_SHIFT);
16124 -}
16125 -
16126 -#define dma_is_consistent(d, h) (1)
16127 -
16128 -static inline void
16129 -dma_cache_sync(struct device *dev, void *vaddr, size_t size,
16130 - enum dma_data_direction direction)
16131 -{
16132 - flush_write_buffers();
16133 -}
16134 -
16135 -#define ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY
16136 -extern int
16137 -dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
16138 - dma_addr_t device_addr, size_t size, int flags);
16139 -
16140 -extern void
16141 -dma_release_declared_memory(struct device *dev);
16142 -
16143 -extern void *
16144 -dma_mark_declared_memory_occupied(struct device *dev,
16145 - dma_addr_t device_addr, size_t size);
16146 -
16147 -#endif
16148 --- a/include/asm-x86/mach-xen/asm/dma-mapping_64.h
16149 +++ /dev/null
16150 @@ -1,205 +0,0 @@
16151 -#ifndef _X8664_DMA_MAPPING_H
16152 -#define _X8664_DMA_MAPPING_H 1
16153 -
16154 -/*
16155 - * IOMMU interface. See Documentation/DMA-mapping.txt and DMA-API.txt for
16156 - * documentation.
16157 - */
16158 -
16159 -#include <linux/scatterlist.h>
16160 -#include <asm/io.h>
16161 -
16162 -struct dma_mapping_ops {
16163 - int (*mapping_error)(dma_addr_t dma_addr);
16164 - void* (*alloc_coherent)(struct device *dev, size_t size,
16165 - dma_addr_t *dma_handle, gfp_t gfp);
16166 - void (*free_coherent)(struct device *dev, size_t size,
16167 - void *vaddr, dma_addr_t dma_handle);
16168 - dma_addr_t (*map_single)(struct device *hwdev, void *ptr,
16169 - size_t size, int direction);
16170 - /* like map_single, but doesn't check the device mask */
16171 - dma_addr_t (*map_simple)(struct device *hwdev, char *ptr,
16172 - size_t size, int direction);
16173 - void (*unmap_single)(struct device *dev, dma_addr_t addr,
16174 - size_t size, int direction);
16175 - void (*sync_single_for_cpu)(struct device *hwdev,
16176 - dma_addr_t dma_handle, size_t size,
16177 - int direction);
16178 - void (*sync_single_for_device)(struct device *hwdev,
16179 - dma_addr_t dma_handle, size_t size,
16180 - int direction);
16181 - void (*sync_single_range_for_cpu)(struct device *hwdev,
16182 - dma_addr_t dma_handle, unsigned long offset,
16183 - size_t size, int direction);
16184 - void (*sync_single_range_for_device)(struct device *hwdev,
16185 - dma_addr_t dma_handle, unsigned long offset,
16186 - size_t size, int direction);
16187 - void (*sync_sg_for_cpu)(struct device *hwdev,
16188 - struct scatterlist *sg, int nelems,
16189 - int direction);
16190 - void (*sync_sg_for_device)(struct device *hwdev,
16191 - struct scatterlist *sg, int nelems,
16192 - int direction);
16193 - int (*map_sg)(struct device *hwdev, struct scatterlist *sg,
16194 - int nents, int direction);
16195 - void (*unmap_sg)(struct device *hwdev,
16196 - struct scatterlist *sg, int nents,
16197 - int direction);
16198 - int (*dma_supported)(struct device *hwdev, u64 mask);
16199 - int is_phys;
16200 -};
16201 -
16202 -extern dma_addr_t bad_dma_address;
16203 -extern const struct dma_mapping_ops* dma_ops;
16204 -extern int iommu_merge;
16205 -
16206 -#if 0
16207 -static inline int dma_mapping_error(dma_addr_t dma_addr)
16208 -{
16209 - if (dma_ops->mapping_error)
16210 - return dma_ops->mapping_error(dma_addr);
16211 -
16212 - return (dma_addr == bad_dma_address);
16213 -}
16214 -
16215 -#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
16216 -#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
16217 -
16218 -#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
16219 -#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
16220 -
16221 -extern void *dma_alloc_coherent(struct device *dev, size_t size,
16222 - dma_addr_t *dma_handle, gfp_t gfp);
16223 -extern void dma_free_coherent(struct device *dev, size_t size, void *vaddr,
16224 - dma_addr_t dma_handle);
16225 -
16226 -static inline dma_addr_t
16227 -dma_map_single(struct device *hwdev, void *ptr, size_t size,
16228 - int direction)
16229 -{
16230 - BUG_ON(!valid_dma_direction(direction));
16231 - return dma_ops->map_single(hwdev, ptr, size, direction);
16232 -}
16233 -
16234 -static inline void
16235 -dma_unmap_single(struct device *dev, dma_addr_t addr,size_t size,
16236 - int direction)
16237 -{
16238 - BUG_ON(!valid_dma_direction(direction));
16239 - dma_ops->unmap_single(dev, addr, size, direction);
16240 -}
16241 -
16242 -#define dma_map_page(dev,page,offset,size,dir) \
16243 - dma_map_single((dev), page_address(page)+(offset), (size), (dir))
16244 -
16245 -#define dma_unmap_page dma_unmap_single
16246 -
16247 -static inline void
16248 -dma_sync_single_for_cpu(struct device *hwdev, dma_addr_t dma_handle,
16249 - size_t size, int direction)
16250 -{
16251 - BUG_ON(!valid_dma_direction(direction));
16252 - if (dma_ops->sync_single_for_cpu)
16253 - dma_ops->sync_single_for_cpu(hwdev, dma_handle, size,
16254 - direction);
16255 - flush_write_buffers();
16256 -}
16257 -
16258 -static inline void
16259 -dma_sync_single_for_device(struct device *hwdev, dma_addr_t dma_handle,
16260 - size_t size, int direction)
16261 -{
16262 - BUG_ON(!valid_dma_direction(direction));
16263 - if (dma_ops->sync_single_for_device)
16264 - dma_ops->sync_single_for_device(hwdev, dma_handle, size,
16265 - direction);
16266 - flush_write_buffers();
16267 -}
16268 -
16269 -static inline void
16270 -dma_sync_single_range_for_cpu(struct device *hwdev, dma_addr_t dma_handle,
16271 - unsigned long offset, size_t size, int direction)
16272 -{
16273 - BUG_ON(!valid_dma_direction(direction));
16274 - if (dma_ops->sync_single_range_for_cpu) {
16275 - dma_ops->sync_single_range_for_cpu(hwdev, dma_handle, offset, size, direction);
16276 - }
16277 -
16278 - flush_write_buffers();
16279 -}
16280 -
16281 -static inline void
16282 -dma_sync_single_range_for_device(struct device *hwdev, dma_addr_t dma_handle,
16283 - unsigned long offset, size_t size, int direction)
16284 -{
16285 - BUG_ON(!valid_dma_direction(direction));
16286 - if (dma_ops->sync_single_range_for_device)
16287 - dma_ops->sync_single_range_for_device(hwdev, dma_handle,
16288 - offset, size, direction);
16289 -
16290 - flush_write_buffers();
16291 -}
16292 -
16293 -static inline void
16294 -dma_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg,
16295 - int nelems, int direction)
16296 -{
16297 - BUG_ON(!valid_dma_direction(direction));
16298 - if (dma_ops->sync_sg_for_cpu)
16299 - dma_ops->sync_sg_for_cpu(hwdev, sg, nelems, direction);
16300 - flush_write_buffers();
16301 -}
16302 -
16303 -static inline void
16304 -dma_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
16305 - int nelems, int direction)
16306 -{
16307 - BUG_ON(!valid_dma_direction(direction));
16308 - if (dma_ops->sync_sg_for_device) {
16309 - dma_ops->sync_sg_for_device(hwdev, sg, nelems, direction);
16310 - }
16311 -
16312 - flush_write_buffers();
16313 -}
16314 -
16315 -static inline int
16316 -dma_map_sg(struct device *hwdev, struct scatterlist *sg, int nents, int direction)
16317 -{
16318 - BUG_ON(!valid_dma_direction(direction));
16319 - return dma_ops->map_sg(hwdev, sg, nents, direction);
16320 -}
16321 -
16322 -static inline void
16323 -dma_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents,
16324 - int direction)
16325 -{
16326 - BUG_ON(!valid_dma_direction(direction));
16327 - dma_ops->unmap_sg(hwdev, sg, nents, direction);
16328 -}
16329 -
16330 -extern int dma_supported(struct device *hwdev, u64 mask);
16331 -
16332 -/* same for gart, swiotlb, and nommu */
16333 -static inline int dma_get_cache_alignment(void)
16334 -{
16335 - return boot_cpu_data.x86_clflush_size;
16336 -}
16337 -
16338 -#define dma_is_consistent(d, h) 1
16339 -
16340 -extern int dma_set_mask(struct device *dev, u64 mask);
16341 -
16342 -static inline void
16343 -dma_cache_sync(struct device *dev, void *vaddr, size_t size,
16344 - enum dma_data_direction dir)
16345 -{
16346 - flush_write_buffers();
16347 -}
16348 -
16349 -extern struct device fallback_dev;
16350 -extern int panic_on_overflow;
16351 -#endif
16352 -
16353 -#endif /* _X8664_DMA_MAPPING_H */
16354 -
16355 -#include "dma-mapping_32.h"
16356 --- a/include/asm-x86/mach-xen/asm/dma-mapping.h
16357 +++ b/include/asm-x86/mach-xen/asm/dma-mapping.h
16358 @@ -1,5 +1,17 @@
16359 -#ifdef CONFIG_X86_32
16360 -# include "dma-mapping_32.h"
16361 -#else
16362 -# include "dma-mapping_64.h"
16363 -#endif
16364 +#ifndef _ASM_DMA_MAPPING_H_
16365 +
16366 +#include "../../dma-mapping.h"
16367 +
16368 +static inline int
16369 +address_needs_mapping(struct device *hwdev, dma_addr_t addr)
16370 +{
16371 + dma_addr_t mask = 0xffffffff;
16372 + /* If the device has a mask, use it, otherwise default to 32 bits */
16373 + if (hwdev && hwdev->dma_mask)
16374 + mask = *hwdev->dma_mask;
16375 + return (addr & ~mask) != 0;
16376 +}
16377 +
16378 +extern int range_straddles_page_boundary(paddr_t p, size_t size);
16379 +
16380 +#endif /* _ASM_DMA_MAPPING_H_ */
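The address_needs_mapping() helper added above only checks whether an address has bits set beyond the device's DMA mask. A standalone, runnable illustration of that mask test (not patch content; values chosen for the example):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t mask = 0xffffffffULL;     /* device that can only address 32 bits */
	uint64_t low  = 0x7fffffffULL;     /* below 4 GiB: reachable directly      */
	uint64_t high = 0x100000000ULL;    /* above 4 GiB: needs remapping/bounce  */

	printf("low  needs mapping: %d\n", (low  & ~mask) != 0);   /* prints 0 */
	printf("high needs mapping: %d\n", (high & ~mask) != 0);   /* prints 1 */
	return 0;
}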
16381 --- a/include/asm-x86/mach-xen/asm/fixmap_32.h
16382 +++ b/include/asm-x86/mach-xen/asm/fixmap_32.h
16383 @@ -10,8 +10,8 @@
16384 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
16385 */
16386
16387 -#ifndef _ASM_FIXMAP_H
16388 -#define _ASM_FIXMAP_H
16389 +#ifndef _ASM_FIXMAP_32_H
16390 +#define _ASM_FIXMAP_32_H
16391
16392 /* used by vmalloc.c, vsyscall.lds.S.
16393 *
16394 @@ -102,8 +102,7 @@ enum fixed_addresses {
16395 */
16396 #define NR_FIX_BTMAPS 64
16397 #define FIX_BTMAPS_NESTING 4
16398 - FIX_BTMAP_END =
16399 - __end_of_permanent_fixed_addresses + 512 -
16400 + FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 512 -
16401 (__end_of_permanent_fixed_addresses & 511),
16402 FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_NESTING - 1,
16403 FIX_WP_TEST,
16404 @@ -114,19 +113,16 @@ enum fixed_addresses {
16405 };
16406
16407 extern void __set_fixmap(enum fixed_addresses idx,
16408 - maddr_t phys, pgprot_t flags);
16409 + maddr_t phys, pgprot_t flags);
16410 extern void reserve_top_address(unsigned long reserve);
16411
16412 -#define set_fixmap(idx, phys) \
16413 - __set_fixmap(idx, phys, PAGE_KERNEL)
16414 +#define set_fixmap(idx, phys) \
16415 + __set_fixmap(idx, phys, PAGE_KERNEL)
16416 /*
16417 * Some hardware wants to get fixmapped without caching.
16418 */
16419 -#define set_fixmap_nocache(idx, phys) \
16420 - __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
16421 -
16422 -#define clear_fixmap(idx) \
16423 - __set_fixmap(idx, 0, __pgprot(0))
16424 +#define set_fixmap_nocache(idx, phys) \
16425 + __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
16426
16427 #define FIXADDR_TOP ((unsigned long)__FIXADDR_TOP)
16428
16429 @@ -159,7 +155,7 @@ static __always_inline unsigned long fix
16430 if (idx >= __end_of_fixed_addresses)
16431 __this_fixmap_does_not_exist();
16432
16433 - return __fix_to_virt(idx);
16434 + return __fix_to_virt(idx);
16435 }
16436
16437 static inline unsigned long virt_to_fix(const unsigned long vaddr)
16438 --- a/include/asm-x86/mach-xen/asm/fixmap_64.h
16439 +++ b/include/asm-x86/mach-xen/asm/fixmap_64.h
16440 @@ -8,8 +8,8 @@
16441 * Copyright (C) 1998 Ingo Molnar
16442 */
16443
16444 -#ifndef _ASM_FIXMAP_H
16445 -#define _ASM_FIXMAP_H
16446 +#ifndef _ASM_FIXMAP_64_H
16447 +#define _ASM_FIXMAP_64_H
16448
16449 #include <linux/kernel.h>
16450 #include <asm/apicdef.h>
16451 @@ -35,7 +35,8 @@
16452
16453 enum fixed_addresses {
16454 VSYSCALL_LAST_PAGE,
16455 - VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1,
16456 + VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE
16457 + + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1,
16458 VSYSCALL_HPET,
16459 FIX_DBGP_BASE,
16460 FIX_EARLYCON_MEM_BASE,
16461 @@ -45,11 +46,12 @@ enum fixed_addresses {
16462 #endif
16463 #ifndef CONFIG_XEN
16464 FIX_IO_APIC_BASE_0,
16465 - FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1,
16466 + FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1,
16467 #endif
16468 #ifdef CONFIG_EFI
16469 FIX_EFI_IO_MAP_LAST_PAGE,
16470 - FIX_EFI_IO_MAP_FIRST_PAGE = FIX_EFI_IO_MAP_LAST_PAGE+MAX_EFI_IO_PAGES-1,
16471 + FIX_EFI_IO_MAP_FIRST_PAGE = FIX_EFI_IO_MAP_LAST_PAGE
16472 + + MAX_EFI_IO_PAGES - 1,
16473 #endif
16474 #ifdef CONFIG_ACPI
16475 FIX_ACPI_BEGIN,
16476 @@ -79,19 +81,16 @@ enum fixed_addresses {
16477 __end_of_fixed_addresses
16478 };
16479
16480 -extern void __set_fixmap (enum fixed_addresses idx,
16481 - unsigned long phys, pgprot_t flags);
16482 +extern void __set_fixmap(enum fixed_addresses idx,
16483 + unsigned long phys, pgprot_t flags);
16484
16485 -#define set_fixmap(idx, phys) \
16486 - __set_fixmap(idx, phys, PAGE_KERNEL)
16487 +#define set_fixmap(idx, phys) \
16488 + __set_fixmap(idx, phys, PAGE_KERNEL)
16489 /*
16490 * Some hardware wants to get fixmapped without caching.
16491 */
16492 -#define set_fixmap_nocache(idx, phys) \
16493 - __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
16494 -
16495 -#define clear_fixmap(idx) \
16496 - __set_fixmap(idx, 0, __pgprot(0))
16497 +#define set_fixmap_nocache(idx, phys) \
16498 + __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
16499
16500 #define FIXADDR_TOP (VSYSCALL_END-PAGE_SIZE)
16501 #define FIXADDR_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
16502 --- a/include/asm-x86/mach-xen/asm/fixmap.h
16503 +++ b/include/asm-x86/mach-xen/asm/fixmap.h
16504 @@ -1,5 +1,13 @@
16505 +#ifndef _ASM_FIXMAP_H
16506 +#define _ASM_FIXMAP_H
16507 +
16508 #ifdef CONFIG_X86_32
16509 # include "fixmap_32.h"
16510 #else
16511 # include "fixmap_64.h"
16512 #endif
16513 +
16514 +#define clear_fixmap(idx) \
16515 + __set_fixmap(idx, 0, __pgprot(0))
16516 +
16517 +#endif
16518 --- a/include/asm-x86/mach-xen/asm/highmem.h
16519 +++ b/include/asm-x86/mach-xen/asm/highmem.h
16520 @@ -8,7 +8,7 @@
16521 * Gerhard.Wichert@pdb.siemens.de
16522 *
16523 *
16524 - * Redesigned the x86 32-bit VM architecture to deal with
16525 + * Redesigned the x86 32-bit VM architecture to deal with
16526 * up to 16 Terabyte physical memory. With current x86 CPUs
16527 * we now support up to 64 Gigabytes physical RAM.
16528 *
16529 --- a/include/asm-x86/mach-xen/asm/io_32.h
16530 +++ b/include/asm-x86/mach-xen/asm/io_32.h
16531 @@ -50,12 +50,6 @@
16532 #include <asm/fixmap.h>
16533
16534 /*
16535 - * Convert a physical pointer to a virtual kernel pointer for /dev/mem
16536 - * access
16537 - */
16538 -#define xlate_dev_mem_ptr(p) __va(p)
16539 -
16540 -/*
16541 * Convert a virtual cached pointer to an uncached pointer
16542 */
16543 #define xlate_dev_kmem_ptr(p) p
16544 @@ -66,14 +60,14 @@
16545 *
16546 * The returned physical address is the physical (CPU) mapping for
16547 * the memory address given. It is only valid to use this function on
16548 - * addresses directly mapped or allocated via kmalloc.
16549 + * addresses directly mapped or allocated via kmalloc.
16550 *
16551 * This function does not give bus mappings for DMA transfers. In
16552 * almost all conceivable cases a device driver should not be using
16553 * this function
16554 */
16555 -
16556 -static inline unsigned long virt_to_phys(volatile void * address)
16557 +
16558 +static inline unsigned long virt_to_phys(volatile void *address)
16559 {
16560 return __pa(address);
16561 }
16562 @@ -91,7 +85,7 @@ static inline unsigned long virt_to_phys
16563 * this function
16564 */
16565
16566 -static inline void * phys_to_virt(unsigned long address)
16567 +static inline void *phys_to_virt(unsigned long address)
16568 {
16569 return __va(address);
16570 }
16571 @@ -152,11 +146,6 @@ extern void *early_ioremap(unsigned long
16572 extern void early_iounmap(void *addr, unsigned long size);
16573 extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
16574
16575 -/* Use early IO mappings for DMI because it's initialized early */
16576 -#define dmi_ioremap early_ioremap
16577 -#define dmi_iounmap early_iounmap
16578 -#define dmi_alloc alloc_bootmem
16579 -
16580 /*
16581 * ISA I/O bus memory addresses are 1:1 with the physical address.
16582 */
16583 @@ -182,16 +171,19 @@ extern void __iomem *fix_ioremap(unsigne
16584
16585 static inline unsigned char readb(const volatile void __iomem *addr)
16586 {
16587 - return *(volatile unsigned char __force *) addr;
16588 + return *(volatile unsigned char __force *)addr;
16589 }
16590 +
16591 static inline unsigned short readw(const volatile void __iomem *addr)
16592 {
16593 - return *(volatile unsigned short __force *) addr;
16594 + return *(volatile unsigned short __force *)addr;
16595 }
16596 +
16597 static inline unsigned int readl(const volatile void __iomem *addr)
16598 {
16599 return *(volatile unsigned int __force *) addr;
16600 }
16601 +
16602 #define readb_relaxed(addr) readb(addr)
16603 #define readw_relaxed(addr) readw(addr)
16604 #define readl_relaxed(addr) readl(addr)
16605 @@ -201,15 +193,17 @@ static inline unsigned int readl(const v
16606
16607 static inline void writeb(unsigned char b, volatile void __iomem *addr)
16608 {
16609 - *(volatile unsigned char __force *) addr = b;
16610 + *(volatile unsigned char __force *)addr = b;
16611 }
16612 +
16613 static inline void writew(unsigned short b, volatile void __iomem *addr)
16614 {
16615 - *(volatile unsigned short __force *) addr = b;
16616 + *(volatile unsigned short __force *)addr = b;
16617 }
16618 +
16619 static inline void writel(unsigned int b, volatile void __iomem *addr)
16620 {
16621 - *(volatile unsigned int __force *) addr = b;
16622 + *(volatile unsigned int __force *)addr = b;
16623 }
16624 #define __raw_writeb writeb
16625 #define __raw_writew writew
16626 @@ -252,12 +246,12 @@ memcpy_toio(volatile void __iomem *dst,
16627 * 1. Out of order aware processors
16628 * 2. Accidentally out of order processors (PPro errata #51)
16629 */
16630 -
16631 +
16632 #if defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE)
16633
16634 static inline void flush_write_buffers(void)
16635 {
16636 - __asm__ __volatile__ ("lock; addl $0,0(%%esp)": : :"memory");
16637 + asm volatile("lock; addl $0,0(%%esp)": : :"memory");
16638 }
16639
16640 #else
16641 @@ -274,7 +268,8 @@ extern void xen_io_delay(void);
16642 extern int io_delay_type;
16643 extern void io_delay_init(void);
16644
16645 -static inline void slow_down_io(void) {
16646 +static inline void slow_down_io(void)
16647 +{
16648 native_io_delay();
16649 #ifdef REALLY_SLOW_IO
16650 native_io_delay();
16651 @@ -283,52 +278,75 @@ static inline void slow_down_io(void) {
16652 #endif
16653 }
16654
16655 -#define __BUILDIO(bwl,bw,type) \
16656 -static inline void out##bwl(unsigned type value, int port) { \
16657 - out##bwl##_local(value, port); \
16658 -} \
16659 -static inline unsigned type in##bwl(int port) { \
16660 - return in##bwl##_local(port); \
16661 -}
16662 -
16663 -#define BUILDIO(bwl,bw,type) \
16664 -static inline void out##bwl##_local(unsigned type value, int port) { \
16665 - __asm__ __volatile__("out" #bwl " %" #bw "0, %w1" : : "a"(value), "Nd"(port)); \
16666 -} \
16667 -static inline unsigned type in##bwl##_local(int port) { \
16668 - unsigned type value; \
16669 - __asm__ __volatile__("in" #bwl " %w1, %" #bw "0" : "=a"(value) : "Nd"(port)); \
16670 - return value; \
16671 -} \
16672 -static inline void out##bwl##_local_p(unsigned type value, int port) { \
16673 - out##bwl##_local(value, port); \
16674 - slow_down_io(); \
16675 -} \
16676 -static inline unsigned type in##bwl##_local_p(int port) { \
16677 - unsigned type value = in##bwl##_local(port); \
16678 - slow_down_io(); \
16679 - return value; \
16680 -} \
16681 -__BUILDIO(bwl,bw,type) \
16682 -static inline void out##bwl##_p(unsigned type value, int port) { \
16683 - out##bwl(value, port); \
16684 - slow_down_io(); \
16685 -} \
16686 -static inline unsigned type in##bwl##_p(int port) { \
16687 - unsigned type value = in##bwl(port); \
16688 - slow_down_io(); \
16689 - return value; \
16690 -} \
16691 -static inline void outs##bwl(int port, const void *addr, unsigned long count) { \
16692 - __asm__ __volatile__("rep; outs" #bwl : "+S"(addr), "+c"(count) : "d"(port)); \
16693 -} \
16694 -static inline void ins##bwl(int port, void *addr, unsigned long count) { \
16695 - __asm__ __volatile__("rep; ins" #bwl : "+D"(addr), "+c"(count) : "d"(port)); \
16696 -}
16697 -
16698 -BUILDIO(b,b,char)
16699 -BUILDIO(w,w,short)
16700 -BUILDIO(l,,int)
16701 +#define __BUILDIO(bwl, bw, type) \
16702 +static inline void out##bwl(unsigned type value, int port) \
16703 +{ \
16704 + out##bwl##_local(value, port); \
16705 +} \
16706 + \
16707 +static inline unsigned type in##bwl(int port) \
16708 +{ \
16709 + return in##bwl##_local(port); \
16710 +}
16711 +
16712 +#define BUILDIO(bwl, bw, type) \
16713 +static inline void out##bwl##_local(unsigned type value, int port) \
16714 +{ \
16715 + asm volatile("out" #bwl " %" #bw "0, %w1" \
16716 + : : "a"(value), "Nd"(port)); \
16717 +} \
16718 + \
16719 +static inline unsigned type in##bwl##_local(int port) \
16720 +{ \
16721 + unsigned type value; \
16722 + asm volatile("in" #bwl " %w1, %" #bw "0" \
16723 + : "=a"(value) : "Nd"(port)); \
16724 + return value; \
16725 +} \
16726 + \
16727 +static inline void out##bwl##_local_p(unsigned type value, int port) \
16728 +{ \
16729 + out##bwl##_local(value, port); \
16730 + slow_down_io(); \
16731 +} \
16732 + \
16733 +static inline unsigned type in##bwl##_local_p(int port) \
16734 +{ \
16735 + unsigned type value = in##bwl##_local(port); \
16736 + slow_down_io(); \
16737 + return value; \
16738 +} \
16739 + \
16740 +__BUILDIO(bwl, bw, type) \
16741 + \
16742 +static inline void out##bwl##_p(unsigned type value, int port) \
16743 +{ \
16744 + out##bwl(value, port); \
16745 + slow_down_io(); \
16746 +} \
16747 + \
16748 +static inline unsigned type in##bwl##_p(int port) \
16749 +{ \
16750 + unsigned type value = in##bwl(port); \
16751 + slow_down_io(); \
16752 + return value; \
16753 +} \
16754 + \
16755 +static inline void outs##bwl(int port, const void *addr, unsigned long count) \
16756 +{ \
16757 + asm volatile("rep; outs" #bwl \
16758 + : "+S"(addr), "+c"(count) : "d"(port)); \
16759 +} \
16760 + \
16761 +static inline void ins##bwl(int port, void *addr, unsigned long count) \
16762 +{ \
16763 + asm volatile("rep; ins" #bwl \
16764 + : "+D"(addr), "+c"(count) : "d"(port)); \
16765 +}
16766 +
16767 +BUILDIO(b, b, char)
16768 +BUILDIO(w, w, short)
16769 +BUILDIO(l, , int)
16770
16771 /* We will be supplying our own /dev/mem implementation */
16772 #define ARCH_HAS_DEV_MEM
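To make the reworked BUILDIO()/__BUILDIO() machinery above easier to follow, this is roughly what BUILDIO(b, b, char) expands to for the port-local byte accessors (abridged sketch, not literal preprocessor output):

static inline void outb_local(unsigned char value, int port)
{
	asm volatile("outb %b0, %w1" : : "a"(value), "Nd"(port));
}

static inline unsigned char inb_local(int port)
{
	unsigned char value;

	asm volatile("inb %w1, %b0" : "=a"(value) : "Nd"(port));
	return value;
}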
16773 --- a/include/asm-x86/mach-xen/asm/io_64.h
16774 +++ b/include/asm-x86/mach-xen/asm/io_64.h
16775 @@ -55,60 +55,75 @@ static inline void slow_down_io(void)
16776 /*
16777 * Talk about misusing macros..
16778 */
16779 -#define __OUT1(s,x) \
16780 +#define __OUT1(s, x) \
16781 static inline void out##s(unsigned x value, unsigned short port) {
16782
16783 -#define __OUT2(s,s1,s2) \
16784 -__asm__ __volatile__ ("out" #s " %" s1 "0,%" s2 "1"
16785 +#define __OUT2(s, s1, s2) \
16786 +asm volatile ("out" #s " %" s1 "0,%" s2 "1"
16787
16788 #ifndef REALLY_SLOW_IO
16789 #define REALLY_SLOW_IO
16790 #define UNSET_REALLY_SLOW_IO
16791 #endif
16792
16793 -#define __OUT(s,s1,x) \
16794 -__OUT1(s,x) __OUT2(s,s1,"w") : : "a" (value), "Nd" (port)); } \
16795 -__OUT1(s##_p, x) __OUT2(s, s1, "w") : : "a" (value), "Nd" (port)); \
16796 - slow_down_io(); }
16797 -
16798 -#define __IN1(s) \
16799 -static inline RETURN_TYPE in##s(unsigned short port) { RETURN_TYPE _v;
16800 -
16801 -#define __IN2(s,s1,s2) \
16802 -__asm__ __volatile__ ("in" #s " %" s2 "1,%" s1 "0"
16803 -
16804 -#define __IN(s,s1,i...) \
16805 -__IN1(s) __IN2(s, s1, "w") : "=a" (_v) : "Nd" (port), ##i); return _v; } \
16806 -__IN1(s##_p) __IN2(s, s1, "w") : "=a" (_v) : "Nd" (port), ##i); \
16807 - slow_down_io(); return _v; }
16808 +#define __OUT(s, s1, x) \
16809 + __OUT1(s, x) __OUT2(s, s1, "w") : : "a" (value), "Nd" (port)); \
16810 + } \
16811 + __OUT1(s##_p, x) __OUT2(s, s1, "w") : : "a" (value), "Nd" (port)); \
16812 + slow_down_io(); \
16813 +}
16814 +
16815 +#define __IN1(s) \
16816 +static inline RETURN_TYPE in##s(unsigned short port) \
16817 +{ \
16818 + RETURN_TYPE _v;
16819 +
16820 +#define __IN2(s, s1, s2) \
16821 + asm volatile ("in" #s " %" s2 "1,%" s1 "0"
16822 +
16823 +#define __IN(s, s1, i...) \
16824 + __IN1(s) __IN2(s, s1, "w") : "=a" (_v) : "Nd" (port), ##i); \
16825 + return _v; \
16826 + } \
16827 + __IN1(s##_p) __IN2(s, s1, "w") : "=a" (_v) : "Nd" (port), ##i); \
16828 + slow_down_io(); \
16829 + return _v; }
16830
16831 #ifdef UNSET_REALLY_SLOW_IO
16832 #undef REALLY_SLOW_IO
16833 #endif
16834
16835 -#define __INS(s) \
16836 -static inline void ins##s(unsigned short port, void * addr, unsigned long count) \
16837 -{ __asm__ __volatile__ ("rep ; ins" #s \
16838 -: "=D" (addr), "=c" (count) : "d" (port),"0" (addr),"1" (count)); }
16839 -
16840 -#define __OUTS(s) \
16841 -static inline void outs##s(unsigned short port, const void * addr, unsigned long count) \
16842 -{ __asm__ __volatile__ ("rep ; outs" #s \
16843 -: "=S" (addr), "=c" (count) : "d" (port),"0" (addr),"1" (count)); }
16844 +#define __INS(s) \
16845 +static inline void ins##s(unsigned short port, void *addr, \
16846 + unsigned long count) \
16847 +{ \
16848 + asm volatile ("rep ; ins" #s \
16849 + : "=D" (addr), "=c" (count) \
16850 + : "d" (port), "0" (addr), "1" (count)); \
16851 +}
16852 +
16853 +#define __OUTS(s) \
16854 +static inline void outs##s(unsigned short port, const void *addr, \
16855 + unsigned long count) \
16856 +{ \
16857 + asm volatile ("rep ; outs" #s \
16858 + : "=S" (addr), "=c" (count) \
16859 + : "d" (port), "0" (addr), "1" (count)); \
16860 +}
16861
16862 #define RETURN_TYPE unsigned char
16863 -__IN(b,"")
16864 +__IN(b, "")
16865 #undef RETURN_TYPE
16866 #define RETURN_TYPE unsigned short
16867 -__IN(w,"")
16868 +__IN(w, "")
16869 #undef RETURN_TYPE
16870 #define RETURN_TYPE unsigned int
16871 -__IN(l,"")
16872 +__IN(l, "")
16873 #undef RETURN_TYPE
16874
16875 -__OUT(b,"b",char)
16876 -__OUT(w,"w",short)
16877 -__OUT(l,,int)
16878 +__OUT(b, "b", char)
16879 +__OUT(w, "w", short)
16880 +__OUT(l, , int)
16881
16882 __INS(b)
16883 __INS(w)
16884 @@ -129,12 +144,12 @@ __OUTS(l)
16885 * Change virtual addresses to physical addresses and vv.
16886 * These are pretty trivial
16887 */
16888 -static inline unsigned long virt_to_phys(volatile void * address)
16889 +static inline unsigned long virt_to_phys(volatile void *address)
16890 {
16891 return __pa(address);
16892 }
16893
16894 -static inline void * phys_to_virt(unsigned long address)
16895 +static inline void *phys_to_virt(unsigned long address)
16896 {
16897 return __va(address);
16898 }
16899 @@ -216,18 +231,22 @@ static inline __u8 __readb(const volatil
16900 {
16901 return *(__force volatile __u8 *)addr;
16902 }
16903 +
16904 static inline __u16 __readw(const volatile void __iomem *addr)
16905 {
16906 return *(__force volatile __u16 *)addr;
16907 }
16908 +
16909 static __always_inline __u32 __readl(const volatile void __iomem *addr)
16910 {
16911 return *(__force volatile __u32 *)addr;
16912 }
16913 +
16914 static inline __u64 __readq(const volatile void __iomem *addr)
16915 {
16916 return *(__force volatile __u64 *)addr;
16917 }
16918 +
16919 #define readb(x) __readb(x)
16920 #define readw(x) __readw(x)
16921 #define readl(x) __readl(x)
16922 @@ -247,37 +266,44 @@ static inline void __writel(__u32 b, vol
16923 {
16924 *(__force volatile __u32 *)addr = b;
16925 }
16926 +
16927 static inline void __writeq(__u64 b, volatile void __iomem *addr)
16928 {
16929 *(__force volatile __u64 *)addr = b;
16930 }
16931 +
16932 static inline void __writeb(__u8 b, volatile void __iomem *addr)
16933 {
16934 *(__force volatile __u8 *)addr = b;
16935 }
16936 +
16937 static inline void __writew(__u16 b, volatile void __iomem *addr)
16938 {
16939 *(__force volatile __u16 *)addr = b;
16940 }
16941 -#define writeq(val,addr) __writeq((val),(addr))
16942 -#define writel(val,addr) __writel((val),(addr))
16943 -#define writew(val,addr) __writew((val),(addr))
16944 -#define writeb(val,addr) __writeb((val),(addr))
16945 +
16946 +#define writeq(val, addr) __writeq((val), (addr))
16947 +#define writel(val, addr) __writel((val), (addr))
16948 +#define writew(val, addr) __writew((val), (addr))
16949 +#define writeb(val, addr) __writeb((val), (addr))
16950 #define __raw_writeb writeb
16951 #define __raw_writew writew
16952 #define __raw_writel writel
16953 #define __raw_writeq writeq
16954
16955 -void __memcpy_fromio(void*,unsigned long,unsigned);
16956 -void __memcpy_toio(unsigned long,const void*,unsigned);
16957 +void __memcpy_fromio(void *, unsigned long, unsigned);
16958 +void __memcpy_toio(unsigned long, const void *, unsigned);
16959
16960 -static inline void memcpy_fromio(void *to, const volatile void __iomem *from, unsigned len)
16961 +static inline void memcpy_fromio(void *to, const volatile void __iomem *from,
16962 + unsigned len)
16963 {
16964 - __memcpy_fromio(to,(unsigned long)from,len);
16965 + __memcpy_fromio(to, (unsigned long)from, len);
16966 }
16967 -static inline void memcpy_toio(volatile void __iomem *to, const void *from, unsigned len)
16968 +
16969 +static inline void memcpy_toio(volatile void __iomem *to, const void *from,
16970 + unsigned len)
16971 {
16972 - __memcpy_toio((unsigned long)to,from,len);
16973 + __memcpy_toio((unsigned long)to, from, len);
16974 }
16975
16976 void memset_io(volatile void __iomem *a, int b, size_t c);
16977 @@ -292,18 +318,12 @@ void memset_io(volatile void __iomem *a,
16978 */
16979 #define __ISA_IO_base ((char __iomem *)(fix_to_virt(FIX_ISAMAP_BEGIN)))
16980
16981 -#define flush_write_buffers()
16982 +#define flush_write_buffers()
16983
16984 extern int iommu_bio_merge;
16985 #define BIO_VMERGE_BOUNDARY iommu_bio_merge
16986
16987 /*
16988 - * Convert a physical pointer to a virtual kernel pointer for /dev/mem
16989 - * access
16990 - */
16991 -#define xlate_dev_mem_ptr(p) __va(p)
16992 -
16993 -/*
16994 * Convert a virtual cached pointer to an uncached pointer
16995 */
16996 #define xlate_dev_kmem_ptr(p) p
16997 --- a/include/asm-x86/mach-xen/asm/io.h
16998 +++ b/include/asm-x86/mach-xen/asm/io.h
16999 @@ -1,5 +1,22 @@
17000 +#ifndef _ASM_X86_IO_H
17001 +#define _ASM_X86_IO_H
17002 +
17003 +#define ARCH_HAS_IOREMAP_WC
17004 +
17005 #ifdef CONFIG_X86_32
17006 # include "io_32.h"
17007 #else
17008 # include "io_64.h"
17009 #endif
17010 +
17011 +extern void *xlate_dev_mem_ptr(unsigned long phys);
17012 +extern void unxlate_dev_mem_ptr(unsigned long phys, void *addr);
17013 +
17014 +extern void map_devmem(unsigned long pfn, unsigned long len, pgprot_t);
17015 +extern void unmap_devmem(unsigned long pfn, unsigned long len, pgprot_t);
17016 +
17017 +extern int ioremap_check_change_attr(unsigned long mfn, unsigned long size,
17018 + unsigned long prot_val);
17019 +extern void __iomem *ioremap_wc(unsigned long offset, unsigned long size);
17020 +
17021 +#endif /* _ASM_X86_IO_H */
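A hedged usage sketch for the write-combining mapping helper declared above (the physical address, size, and caller are invented for illustration and are not part of the patch):

static int example_map_fb(void)
{
	void __iomem *fb;

	fb = ioremap_wc(0xd0000000UL, 4UL << 20);	/* hypothetical 4 MiB framebuffer */
	if (!fb)
		return -ENOMEM;

	writel(0, fb);			/* write-combined MMIO store */
	iounmap(fb);
	return 0;
}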
17022 --- a/include/asm-x86/mach-xen/asm/irqflags.h
17023 +++ b/include/asm-x86/mach-xen/asm/irqflags.h
17024 @@ -137,11 +137,11 @@ sysexit_ecrit: /**** END OF SYSEXIT CRIT
17025 #endif /* __ASSEMBLY__ */
17026
17027 #ifndef __ASSEMBLY__
17028 -#define raw_local_save_flags(flags) \
17029 - do { (flags) = __raw_local_save_flags(); } while (0)
17030 +#define raw_local_save_flags(flags) \
17031 + do { (flags) = __raw_local_save_flags(); } while (0)
17032
17033 -#define raw_local_irq_save(flags) \
17034 - do { (flags) = __raw_local_irq_save(); } while (0)
17035 +#define raw_local_irq_save(flags) \
17036 + do { (flags) = __raw_local_irq_save(); } while (0)
17037
17038 static inline int raw_irqs_disabled_flags(unsigned long flags)
17039 {
17040 --- a/include/asm-x86/mach-xen/asm/mmu_context_32.h
17041 +++ b/include/asm-x86/mach-xen/asm/mmu_context_32.h
17042 @@ -94,7 +94,7 @@ static inline void switch_mm(struct mm_s
17043 BUG_ON(per_cpu(cpu_tlbstate, cpu).active_mm != next);
17044
17045 if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
17046 - /* We were in lazy tlb mode and leave_mm disabled
17047 + /* We were in lazy tlb mode and leave_mm disabled
17048 * tlb flush IPI delivery. We must reload %cr3.
17049 */
17050 load_cr3(next->pgd);
17051 @@ -107,10 +107,10 @@ static inline void switch_mm(struct mm_s
17052 #define deactivate_mm(tsk, mm) \
17053 asm("movl %0,%%gs": :"r" (0));
17054
17055 -#define activate_mm(prev, next) \
17056 - do { \
17057 - xen_activate_mm(prev, next); \
17058 - switch_mm((prev),(next),NULL); \
17059 - } while(0)
17060 +#define activate_mm(prev, next) \
17061 +do { \
17062 + xen_activate_mm(prev, next); \
17063 + switch_mm((prev), (next), NULL); \
17064 +} while (0)
17065
17066 #endif
17067 --- a/include/asm-x86/mach-xen/asm/mmu_context_64.h
17068 +++ b/include/asm-x86/mach-xen/asm/mmu_context_64.h
17069 @@ -21,7 +21,7 @@ void destroy_context(struct mm_struct *m
17070 static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
17071 {
17072 #if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
17073 - if (read_pda(mmu_state) == TLBSTATE_OK)
17074 + if (read_pda(mmu_state) == TLBSTATE_OK)
17075 write_pda(mmu_state, TLBSTATE_LAZY);
17076 #endif
17077 }
17078 @@ -62,7 +62,7 @@ extern void mm_pin(struct mm_struct *mm)
17079 extern void mm_unpin(struct mm_struct *mm);
17080 void mm_pin_all(void);
17081
17082 -static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
17083 +static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
17084 struct task_struct *tsk)
17085 {
17086 unsigned cpu = smp_processor_id();
17087 @@ -106,7 +106,7 @@ static inline void switch_mm(struct mm_s
17088 if (read_pda(active_mm) != next)
17089 BUG();
17090 if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
17091 - /* We were in lazy tlb mode and leave_mm disabled
17092 + /* We were in lazy tlb mode and leave_mm disabled
17093 * tlb flush IPI delivery. We must reload CR3
17094 * to make sure to use no freed page tables.
17095 */
17096 @@ -118,10 +118,11 @@ static inline void switch_mm(struct mm_s
17097 #endif
17098 }
17099
17100 -#define deactivate_mm(tsk,mm) do { \
17101 - load_gs_index(0); \
17102 - asm volatile("movl %0,%%fs"::"r"(0)); \
17103 -} while(0)
17104 +#define deactivate_mm(tsk, mm) \
17105 +do { \
17106 + load_gs_index(0); \
17107 + asm volatile("movl %0,%%fs"::"r"(0)); \
17108 +} while (0)
17109
17110 static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next)
17111 {
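Both mmu_context hunks above rewrap deactivate_mm()/activate_mm() as do { ... } while (0) blocks. A small standalone program showing why that wrapper matters for multi-statement macros (illustrative only, not from the patch):

#include <stdio.h>

#define RESET_BAD(a, b)  (a) = 0; (b) = 0		/* breaks under an un-braced if */
#define RESET_OK(a, b)   do { (a) = 0; (b) = 0; } while (0)

int main(void)
{
	int x = 1, y = 1;

	if (0)
		RESET_OK(x, y);		/* expands to a single statement: nothing runs */

	printf("%d %d\n", x, y);	/* prints "1 1"; RESET_BAD here would print "1 0" */
	return 0;
}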
17112 --- a/include/asm-x86/mach-xen/asm/page_64.h
17113 +++ b/include/asm-x86/mach-xen/asm/page_64.h
17114 @@ -5,7 +5,7 @@
17115
17116 #define THREAD_ORDER 1
17117 #define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER)
17118 -#define CURRENT_MASK (~(THREAD_SIZE-1))
17119 +#define CURRENT_MASK (~(THREAD_SIZE - 1))
17120
17121 #define EXCEPTION_STACK_ORDER 0
17122 #define EXCEPTION_STKSZ (PAGE_SIZE << EXCEPTION_STACK_ORDER)
17123 @@ -53,10 +53,10 @@
17124 #define __VIRTUAL_MASK_SHIFT 48
17125
17126 /*
17127 - * Kernel image size is limited to 128 MB (see level2_kernel_pgt in
17128 + * Kernel image size is limited to 512 MB (see level2_kernel_pgt in
17129 * arch/x86/kernel/head_64.S), and it is mapped here:
17130 */
17131 -#define KERNEL_IMAGE_SIZE (128*1024*1024)
17132 +#define KERNEL_IMAGE_SIZE (512 * 1024 * 1024)
17133 #define KERNEL_IMAGE_START _AC(0xffffffff80000000, UL)
17134
17135 #ifndef __ASSEMBLY__
17136 @@ -64,7 +64,6 @@ void clear_page(void *page);
17137 void copy_page(void *to, void *from);
17138
17139 extern unsigned long end_pfn;
17140 -extern unsigned long end_pfn_map;
17141
17142 static inline unsigned long __phys_addr(unsigned long x)
17143 {
17144 @@ -89,6 +88,9 @@ typedef union { pteval_t pte; unsigned i
17145
17146 #define vmemmap ((struct page *)VMEMMAP_START)
17147
17148 +extern unsigned long init_memory_mapping(unsigned long start,
17149 + unsigned long end);
17150 +
17151 #endif /* !__ASSEMBLY__ */
17152
17153 #ifdef CONFIG_FLATMEM
17154 --- a/include/asm-x86/mach-xen/asm/page.h
17155 +++ b/include/asm-x86/mach-xen/asm/page.h
17156 @@ -20,8 +20,16 @@
17157 #define _PAGE_BIT_IO 9
17158 #define _PAGE_IO (_AC(1, L)<<_PAGE_BIT_IO)
17159
17160 -#define PHYSICAL_PAGE_MASK (~(_AT(phys_addr_t, PAGE_SIZE) - 1) & __PHYSICAL_MASK)
17161 -#define PTE_MASK _AT(pteval_t, PHYSICAL_PAGE_MASK)
17162 +#define __PHYSICAL_MASK ((phys_addr_t)(1ULL << __PHYSICAL_MASK_SHIFT) - 1)
17163 +#define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1)
17164 +
17165 +/* Cast PAGE_MASK to a signed type so that it is sign-extended if
17166 + virtual addresses are 32-bits but physical addresses are larger
17167 + (ie, 32-bit PAE). */
17168 +#define PHYSICAL_PAGE_MASK (((signed long)PAGE_MASK) & __PHYSICAL_MASK)
17169 +
17170 +/* PTE_MASK extracts the PFN from a (pte|pmd|pud|pgd)val_t */
17171 +#define PTE_MASK ((pteval_t)PHYSICAL_PAGE_MASK)
17172
17173 #define PMD_PAGE_SIZE (_AC(1, UL) << PMD_SHIFT)
17174 #define PMD_PAGE_MASK (~(PMD_PAGE_SIZE-1))
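The PHYSICAL_PAGE_MASK comment above describes a sign-extension trick that is easy to get wrong; here is a standalone numeric check (not patch content, values picked for illustration):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int32_t  page_mask = ~(4096 - 1);	/* 0xfffff000 held in a signed 32-bit type */
	uint64_t phys      = 0x123456789ULL;	/* a physical address above 4 GiB (PAE)   */

	uint64_t zero_ext = phys & (uint32_t)page_mask;	/* widened unsigned: high bits lost */
	uint64_t sign_ext = phys & (int64_t)page_mask;	/* widened signed:   high bits kept */

	printf("%#llx %#llx\n",
	       (unsigned long long)zero_ext,	/* 0x23456000  */
	       (unsigned long long)sign_ext);	/* 0x123456000 */
	return 0;
}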
17175 @@ -34,19 +42,14 @@
17176 /* to align the pointer to the (next) page boundary */
17177 #define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK)
17178
17179 -#define __PHYSICAL_MASK _AT(phys_addr_t, (_AC(1,ULL) << __PHYSICAL_MASK_SHIFT) - 1)
17180 -#define __VIRTUAL_MASK ((_AC(1,UL) << __VIRTUAL_MASK_SHIFT) - 1)
17181 -
17182 #ifndef __ASSEMBLY__
17183 #include <linux/types.h>
17184 #endif
17185
17186 #ifdef CONFIG_X86_64
17187 #include <asm/page_64.h>
17188 -#define max_pfn_mapped end_pfn_map
17189 #else
17190 #include <asm/page_32.h>
17191 -#define max_pfn_mapped max_low_pfn
17192 #endif /* CONFIG_X86_64 */
17193
17194 #define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET)
17195 @@ -59,6 +62,9 @@
17196 #ifndef __ASSEMBLY__
17197
17198 extern int page_is_ram(unsigned long pagenr);
17199 +extern int devmem_is_allowed(unsigned long pagenr);
17200 +
17201 +extern unsigned long max_pfn_mapped;
17202
17203 struct page;
17204
17205 --- a/include/asm-x86/mach-xen/asm/pci_64.h
17206 +++ b/include/asm-x86/mach-xen/asm/pci_64.h
17207 @@ -1,12 +1,10 @@
17208 #ifndef __x8664_PCI_H
17209 #define __x8664_PCI_H
17210
17211 -
17212 #ifdef __KERNEL__
17213
17214 -
17215 #ifdef CONFIG_CALGARY_IOMMU
17216 -static inline void* pci_iommu(struct pci_bus *bus)
17217 +static inline void *pci_iommu(struct pci_bus *bus)
17218 {
17219 struct pci_sysdata *sd = bus->sysdata;
17220 return sd->iommu;
17221 @@ -19,13 +17,12 @@ static inline void set_pci_iommu(struct
17222 }
17223 #endif /* CONFIG_CALGARY_IOMMU */
17224
17225 +extern int (*pci_config_read)(int seg, int bus, int dev, int fn,
17226 + int reg, int len, u32 *value);
17227 +extern int (*pci_config_write)(int seg, int bus, int dev, int fn,
17228 + int reg, int len, u32 value);
17229
17230 -extern int (*pci_config_read)(int seg, int bus, int dev, int fn, int reg, int len, u32 *value);
17231 -extern int (*pci_config_write)(int seg, int bus, int dev, int fn, int reg, int len, u32 value);
17232 -
17233 -
17234 -
17235 -extern void pci_iommu_alloc(void);
17236 +extern void dma32_reserve_bootmem(void);
17237
17238 /* The PCI address space does equal the physical memory
17239 * address space. The networking and block device layers use
17240 @@ -82,5 +79,4 @@ extern void pci_iommu_alloc(void);
17241
17242 #endif /* __KERNEL__ */
17243
17244 -
17245 #endif /* __x8664_PCI_H */
17246 --- a/include/asm-x86/mach-xen/asm/pci.h
17247 +++ b/include/asm-x86/mach-xen/asm/pci.h
17248 @@ -8,14 +8,13 @@
17249 #include <asm/scatterlist.h>
17250 #include <asm/io.h>
17251
17252 -
17253 #ifdef __KERNEL__
17254
17255 struct pci_sysdata {
17256 int domain; /* PCI domain */
17257 int node; /* NUMA node */
17258 #ifdef CONFIG_X86_64
17259 - void* iommu; /* IOMMU private data */
17260 + void *iommu; /* IOMMU private data */
17261 #endif
17262 #ifdef CONFIG_XEN_PCIDEV_FRONTEND
17263 struct pcifront_device *pdev;
17264 @@ -23,6 +22,8 @@ struct pci_sysdata {
17265 };
17266
17267 /* scan a bus after allocating a pci_sysdata for it */
17268 +extern struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops,
17269 + int node);
17270 extern struct pci_bus *pci_scan_bus_with_sysdata(int busno);
17271
17272 static inline int pci_domain_nr(struct pci_bus *bus)
17273 @@ -36,6 +37,7 @@ static inline int pci_proc_domain(struct
17274 return pci_domain_nr(bus);
17275 }
17276
17277 +extern void pci_iommu_alloc(void);
17278
17279 /* Can be used to override the logic in pci_scan_bus for skipping
17280 already-configured bus numbers - to be used for buggy BIOSes
17281 @@ -57,7 +59,7 @@ extern unsigned long pci_mem_start;
17282 #define PCIBIOS_MIN_CARDBUS_IO 0x4000
17283
17284 void pcibios_config_init(void);
17285 -struct pci_bus * pcibios_scan_root(int bus);
17286 +struct pci_bus *pcibios_scan_root(int bus);
17287
17288 void pcibios_set_master(struct pci_dev *dev);
17289 void pcibios_penalize_isa_irq(int irq, int active);
17290 @@ -67,7 +69,8 @@ int pcibios_set_irq_routing(struct pci_d
17291
17292 #define HAVE_PCI_MMAP
17293 extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
17294 - enum pci_mmap_state mmap_state, int write_combine);
17295 + enum pci_mmap_state mmap_state,
17296 + int write_combine);
17297
17298
17299 #ifdef CONFIG_PCI
17300 --- a/include/asm-x86/mach-xen/asm/pgalloc_32.h
17301 +++ /dev/null
17302 @@ -1,111 +0,0 @@
17303 -#ifndef _I386_PGALLOC_H
17304 -#define _I386_PGALLOC_H
17305 -
17306 -#include <linux/threads.h>
17307 -#include <linux/mm.h> /* for struct page */
17308 -#include <linux/pagemap.h>
17309 -#include <asm/tlb.h>
17310 -#include <asm-generic/tlb.h>
17311 -#include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */
17312 -
17313 -#define paravirt_alloc_pt(mm, pfn) do { } while (0)
17314 -#define paravirt_alloc_pd(mm, pfn) do { } while (0)
17315 -#define paravirt_alloc_pd_clone(pfn, clonepfn, start, count) do { } while (0)
17316 -#define paravirt_release_pt(pfn) do { } while (0)
17317 -#define paravirt_release_pd(pfn) do { } while (0)
17318 -
17319 -static inline void pmd_populate_kernel(struct mm_struct *mm,
17320 - pmd_t *pmd, pte_t *pte)
17321 -{
17322 - paravirt_alloc_pt(mm, __pa(pte) >> PAGE_SHIFT);
17323 - set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE));
17324 -}
17325 -
17326 -static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte)
17327 -{
17328 - unsigned long pfn = page_to_pfn(pte);
17329 -
17330 - paravirt_alloc_pt(mm, pfn);
17331 - if (PagePinned(virt_to_page(mm->pgd))) {
17332 - if (!PageHighMem(pte))
17333 - BUG_ON(HYPERVISOR_update_va_mapping(
17334 - (unsigned long)__va(pfn << PAGE_SHIFT),
17335 - pfn_pte(pfn, PAGE_KERNEL_RO), 0));
17336 - else if (!test_and_set_bit(PG_pinned, &pte->flags))
17337 - kmap_flush_unused();
17338 - set_pmd(pmd, __pmd(((pmdval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE));
17339 - } else
17340 - *pmd = __pmd(((pmdval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE);
17341 -}
17342 -#define pmd_pgtable(pmd) pmd_page(pmd)
17343 -
17344 -/*
17345 - * Allocate and free page tables.
17346 - */
17347 -extern void pgd_test_and_unpin(pgd_t *);
17348 -extern pgd_t *pgd_alloc(struct mm_struct *);
17349 -extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
17350 -
17351 -extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long);
17352 -extern pgtable_t pte_alloc_one(struct mm_struct *, unsigned long);
17353 -
17354 -static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
17355 -{
17356 - make_lowmem_page_writable(pte, XENFEAT_writable_page_tables);
17357 - free_page((unsigned long)pte);
17358 -}
17359 -
17360 -extern void __pte_free(pgtable_t);
17361 -static inline void pte_free(struct mm_struct *mm, pgtable_t pte)
17362 -{
17363 - __pte_free(pte);
17364 -}
17365 -
17366 -
17367 -extern void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte);
17368 -
17369 -#ifdef CONFIG_X86_PAE
17370 -/*
17371 - * In the PAE case we free the pmds as part of the pgd.
17372 - */
17373 -extern pmd_t *pmd_alloc_one(struct mm_struct *, unsigned long);
17374 -
17375 -extern void __pmd_free(pgtable_t);
17376 -static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
17377 -{
17378 - BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
17379 - __pmd_free(virt_to_page(pmd));
17380 -}
17381 -
17382 -extern void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd);
17383 -
17384 -static inline void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
17385 -{
17386 - struct page *page = virt_to_page(pmd);
17387 - unsigned long pfn = page_to_pfn(page);
17388 -
17389 - paravirt_alloc_pd(mm, pfn);
17390 -
17391 - /* Note: almost everything apart from _PAGE_PRESENT is
17392 - reserved at the pmd (PDPT) level. */
17393 - if (PagePinned(virt_to_page(mm->pgd))) {
17394 - BUG_ON(PageHighMem(page));
17395 - BUG_ON(HYPERVISOR_update_va_mapping(
17396 - (unsigned long)__va(pfn << PAGE_SHIFT),
17397 - pfn_pte(pfn, PAGE_KERNEL_RO), 0));
17398 - set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
17399 - } else
17400 - *pudp = __pud(__pa(pmd) | _PAGE_PRESENT);
17401 -
17402 - /*
17403 - * According to Intel App note "TLBs, Paging-Structure Caches,
17404 - * and Their Invalidation", April 2007, document 317080-001,
17405 - * section 8.1: in PAE mode we explicitly have to flush the
17406 - * TLB via cr3 if the top-level pgd is changed...
17407 - */
17408 - if (mm == current->active_mm)
17409 - xen_tlb_flush();
17410 -}
17411 -#endif /* CONFIG_X86_PAE */
17412 -
17413 -#endif /* _I386_PGALLOC_H */
17414 --- a/include/asm-x86/mach-xen/asm/pgalloc_64.h
17415 +++ /dev/null
17416 @@ -1,179 +0,0 @@
17417 -#ifndef _X86_64_PGALLOC_H
17418 -#define _X86_64_PGALLOC_H
17419 -
17420 -#include <asm/pda.h>
17421 -#include <linux/threads.h>
17422 -#include <linux/mm.h>
17423 -#include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */
17424 -
17425 -pmd_t *early_get_pmd(unsigned long va);
17426 -void early_make_page_readonly(void *va, unsigned int feature);
17427 -
17428 -#define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD)
17429 -
17430 -#define pmd_populate_kernel(mm, pmd, pte) \
17431 - set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte)))
17432 -
17433 -static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
17434 -{
17435 - if (unlikely(PagePinned(virt_to_page((mm)->pgd)))) {
17436 - BUG_ON(HYPERVISOR_update_va_mapping(
17437 - (unsigned long)pmd,
17438 - pfn_pte(virt_to_phys(pmd)>>PAGE_SHIFT,
17439 - PAGE_KERNEL_RO), 0));
17440 - set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd)));
17441 - } else {
17442 - *(pud) = __pud(_PAGE_TABLE | __pa(pmd));
17443 - }
17444 -}
17445 -
17446 -/*
17447 - * We need to use the batch mode here, but pgd_pupulate() won't be
17448 - * be called frequently.
17449 - */
17450 -static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
17451 -{
17452 - if (unlikely(PagePinned(virt_to_page((mm)->pgd)))) {
17453 - BUG_ON(HYPERVISOR_update_va_mapping(
17454 - (unsigned long)pud,
17455 - pfn_pte(virt_to_phys(pud)>>PAGE_SHIFT,
17456 - PAGE_KERNEL_RO), 0));
17457 - set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud)));
17458 - set_pgd(__user_pgd(pgd), __pgd(_PAGE_TABLE | __pa(pud)));
17459 - } else {
17460 - *(pgd) = __pgd(_PAGE_TABLE | __pa(pud));
17461 - *(__user_pgd(pgd)) = *(pgd);
17462 - }
17463 -}
17464 -
17465 -#define pmd_pgtable(pmd) pmd_page(pmd)
17466 -
17467 -static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte)
17468 -{
17469 - if (unlikely(PagePinned(virt_to_page((mm)->pgd)))) {
17470 - BUG_ON(HYPERVISOR_update_va_mapping(
17471 - (unsigned long)__va(page_to_pfn(pte) << PAGE_SHIFT),
17472 - pfn_pte(page_to_pfn(pte), PAGE_KERNEL_RO), 0));
17473 - set_pmd(pmd, __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT)));
17474 - } else {
17475 - *(pmd) = __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT));
17476 - }
17477 -}
17478 -
17479 -extern void __pmd_free(pgtable_t);
17480 -static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
17481 -{
17482 - BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
17483 - __pmd_free(virt_to_page(pmd));
17484 -}
17485 -
17486 -extern pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr);
17487 -
17488 -static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
17489 -{
17490 - return (pud_t *)pmd_alloc_one(mm, addr);
17491 -}
17492 -
17493 -static inline void pud_free(struct mm_struct *mm, pud_t *pud)
17494 -{
17495 - BUG_ON((unsigned long)pud & (PAGE_SIZE-1));
17496 - __pmd_free(virt_to_page(pud));
17497 -}
17498 -
17499 -static inline void pgd_list_add(pgd_t *pgd)
17500 -{
17501 - struct page *page = virt_to_page(pgd);
17502 - unsigned long flags;
17503 -
17504 - spin_lock_irqsave(&pgd_lock, flags);
17505 - list_add(&page->lru, &pgd_list);
17506 - spin_unlock_irqrestore(&pgd_lock, flags);
17507 -}
17508 -
17509 -static inline void pgd_list_del(pgd_t *pgd)
17510 -{
17511 - struct page *page = virt_to_page(pgd);
17512 - unsigned long flags;
17513 -
17514 - spin_lock_irqsave(&pgd_lock, flags);
17515 - list_del(&page->lru);
17516 - spin_unlock_irqrestore(&pgd_lock, flags);
17517 -}
17518 -
17519 -extern void pgd_test_and_unpin(pgd_t *);
17520 -
17521 -static inline pgd_t *pgd_alloc(struct mm_struct *mm)
17522 -{
17523 - /*
17524 - * We allocate two contiguous pages for kernel and user.
17525 - */
17526 - unsigned boundary;
17527 - pgd_t *pgd = (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_REPEAT, 1);
17528 - if (!pgd)
17529 - return NULL;
17530 - pgd_list_add(pgd);
17531 - pgd_test_and_unpin(pgd);
17532 - /*
17533 - * Copy kernel pointers in from init.
17534 - * Could keep a freelist or slab cache of those because the kernel
17535 - * part never changes.
17536 - */
17537 - boundary = pgd_index(__PAGE_OFFSET);
17538 - memset(pgd, 0, boundary * sizeof(pgd_t));
17539 - memcpy(pgd + boundary,
17540 - init_level4_pgt + boundary,
17541 - (PTRS_PER_PGD - boundary) * sizeof(pgd_t));
17542 -
17543 - memset(__user_pgd(pgd), 0, PAGE_SIZE); /* clean up user pgd */
17544 - /*
17545 - * Set level3_user_pgt for vsyscall area
17546 - */
17547 - __user_pgd(pgd)[pgd_index(VSYSCALL_START)] =
17548 - __pgd(__pa_symbol(level3_user_pgt) | _PAGE_TABLE);
17549 - return pgd;
17550 -}
17551 -
17552 -static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
17553 -{
17554 - pgd_test_and_unpin(pgd);
17555 - pgd_list_del(pgd);
17556 - free_pages((unsigned long)pgd, 1);
17557 -}
17558 -
17559 -static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
17560 -{
17561 - pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
17562 - if (pte)
17563 - make_page_readonly(pte, XENFEAT_writable_page_tables);
17564 -
17565 - return pte;
17566 -}
17567 -
17568 -extern pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long addr);
17569 -
17570 -/* Should really implement gc for free page table pages. This could be
17571 - done with a reference count in struct page. */
17572 -
17573 -static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
17574 -{
17575 - BUG_ON((unsigned long)pte & (PAGE_SIZE-1));
17576 - make_page_writable(pte, XENFEAT_writable_page_tables);
17577 - free_page((unsigned long)pte);
17578 -}
17579 -
17580 -extern void __pte_free(pgtable_t);
17581 -static inline void pte_free(struct mm_struct *mm, pgtable_t pte)
17582 -{
17583 - __pte_free(pte);
17584 -}
17585 -
17586 -#define __pte_free_tlb(tlb,pte) \
17587 -do { \
17588 - pgtable_page_dtor((pte)); \
17589 - tlb_remove_page((tlb), (pte)); \
17590 -} while (0)
17591 -
17592 -#define __pmd_free_tlb(tlb,x) tlb_remove_page((tlb),virt_to_page(x))
17593 -#define __pud_free_tlb(tlb,x) tlb_remove_page((tlb),virt_to_page(x))
17594 -
17595 -#endif /* _X86_64_PGALLOC_H */
17596 --- a/include/asm-x86/mach-xen/asm/pgalloc.h
17597 +++ b/include/asm-x86/mach-xen/asm/pgalloc.h
17598 @@ -1,5 +1,149 @@
17599 -#ifdef CONFIG_X86_32
17600 -# include "pgalloc_32.h"
17601 -#else
17602 -# include "pgalloc_64.h"
17603 +#ifndef _ASM_X86_PGALLOC_H
17604 +#define _ASM_X86_PGALLOC_H
17605 +
17606 +#include <linux/threads.h>
17607 +#include <linux/mm.h> /* for struct page */
17608 +#include <linux/pagemap.h>
17609 +
17610 +#include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */
17611 +
17612 +static inline void paravirt_alloc_pte(struct mm_struct *mm, unsigned long pfn) {}
17613 +static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn) {}
17614 +static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long clonepfn,
17615 + unsigned long start, unsigned long count) {}
17616 +static inline void paravirt_alloc_pud(struct mm_struct *mm, unsigned long pfn) {}
17617 +static inline void paravirt_release_pte(unsigned long pfn) {}
17618 +static inline void paravirt_release_pmd(unsigned long pfn) {}
17619 +static inline void paravirt_release_pud(unsigned long pfn) {}
17620 +
17621 +#ifdef CONFIG_X86_64
17622 +void early_make_page_readonly(void *va, unsigned int feature);
17623 +pmd_t *early_get_pmd(unsigned long va);
17624 +#define make_lowmem_page_readonly make_page_readonly
17625 +#define make_lowmem_page_writable make_page_writable
17626 #endif
17627 +
17628 +/*
17629 + * Allocate and free page tables.
17630 + */
17631 +extern pgd_t *pgd_alloc(struct mm_struct *);
17632 +extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
17633 +
17634 +extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long);
17635 +extern pgtable_t pte_alloc_one(struct mm_struct *, unsigned long);
17636 +
17637 +/* Should really implement gc for free page table pages. This could be
17638 + done with a reference count in struct page. */
17639 +
17640 +static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
17641 +{
17642 + BUG_ON((unsigned long)pte & (PAGE_SIZE-1));
17643 + make_lowmem_page_writable(pte, XENFEAT_writable_page_tables);
17644 + free_page((unsigned long)pte);
17645 +}
17646 +
17647 +extern void __pte_free(pgtable_t);
17648 +static inline void pte_free(struct mm_struct *mm, struct page *pte)
17649 +{
17650 + __pte_free(pte);
17651 +}
17652 +
17653 +extern void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte);
17654 +
17655 +static inline void pmd_populate_kernel(struct mm_struct *mm,
17656 + pmd_t *pmd, pte_t *pte)
17657 +{
17658 + paravirt_alloc_pte(mm, __pa(pte) >> PAGE_SHIFT);
17659 + set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE));
17660 +}
17661 +
17662 +static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
17663 + struct page *pte)
17664 +{
17665 + unsigned long pfn = page_to_pfn(pte);
17666 +
17667 + paravirt_alloc_pte(mm, pfn);
17668 + if (PagePinned(virt_to_page(mm->pgd))) {
17669 + if (!PageHighMem(pte))
17670 + BUG_ON(HYPERVISOR_update_va_mapping(
17671 + (unsigned long)__va(pfn << PAGE_SHIFT),
17672 + pfn_pte(pfn, PAGE_KERNEL_RO), 0));
17673 +#ifndef CONFIG_X86_64
17674 + else if (!TestSetPagePinned(pte))
17675 + kmap_flush_unused();
17676 +#endif
17677 + set_pmd(pmd, __pmd(((pmdval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE));
17678 + } else
17679 + *pmd = __pmd(((pmdval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE);
17680 +}
17681 +
17682 +#define pmd_pgtable(pmd) pmd_page(pmd)
17683 +
17684 +#if PAGETABLE_LEVELS > 2
17685 +extern pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr);
17686 +extern void __pmd_free(pgtable_t);
17687 +
17688 +static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
17689 +{
17690 + BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
17691 + __pmd_free(virt_to_page(pmd));
17692 +}
17693 +
17694 +extern void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd);
17695 +
17696 +#ifdef CONFIG_X86_PAE
17697 +extern void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd);
17698 +#else /* !CONFIG_X86_PAE */
17699 +static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
17700 +{
17701 + paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
17702 + if (unlikely(PagePinned(virt_to_page((mm)->pgd)))) {
17703 + BUG_ON(HYPERVISOR_update_va_mapping(
17704 + (unsigned long)pmd,
17705 + pfn_pte(virt_to_phys(pmd)>>PAGE_SHIFT,
17706 + PAGE_KERNEL_RO), 0));
17707 + set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd)));
17708 + } else
17709 + *pud = __pud(_PAGE_TABLE | __pa(pmd));
17710 +}
17711 +#endif /* CONFIG_X86_PAE */
17712 +
17713 +#if PAGETABLE_LEVELS > 3
17714 +#define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD)
17715 +
17716 +/*
17717 + * We need to use the batch mode here, but pgd_populate() won't
17718 + * be called frequently.
17719 + */
17720 +static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
17721 +{
17722 + paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT);
17723 + if (unlikely(PagePinned(virt_to_page((mm)->pgd)))) {
17724 + BUG_ON(HYPERVISOR_update_va_mapping(
17725 + (unsigned long)pud,
17726 + pfn_pte(virt_to_phys(pud)>>PAGE_SHIFT,
17727 + PAGE_KERNEL_RO), 0));
17728 + set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud)));
17729 + set_pgd(__user_pgd(pgd), __pgd(_PAGE_TABLE | __pa(pud)));
17730 + } else {
17731 + *(pgd) = __pgd(_PAGE_TABLE | __pa(pud));
17732 + *__user_pgd(pgd) = *(pgd);
17733 + }
17734 +}
17735 +
17736 +static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
17737 +{
17738 + return (pud_t *)pmd_alloc_one(mm, addr);
17739 +}
17740 +
17741 +static inline void pud_free(struct mm_struct *mm, pud_t *pud)
17742 +{
17743 + BUG_ON((unsigned long)pud & (PAGE_SIZE-1));
17744 + __pmd_free(virt_to_page(pud));
17745 +}
17746 +
17747 +extern void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud);
17748 +#endif /* PAGETABLE_LEVELS > 3 */
17749 +#endif /* PAGETABLE_LEVELS > 2 */
17750 +
17751 +#endif /* _ASM_X86_PGALLOC_H */
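
As an illustration (not part of the patch itself): every populate helper above installs a lower-level table the same way, by shifting the table page's frame number up by PAGE_SHIFT and OR-ing in the _PAGE_TABLE flags (present, writable, user, accessed, dirty, defined later in this patch); when the owning mm's pgd is pinned by Xen, the new table is additionally remapped read-only via HYPERVISOR_update_va_mapping first. A minimal standalone C sketch of just the entry arithmetic, with the usual x86 flag values assumed rather than taken from the kernel headers:

/* Illustration only: mirrors __pmd(((pmdval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE)
 * with assumed flag values; compile and run in userspace, not kernel code. */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT      12
#define _PAGE_PRESENT   0x001
#define _PAGE_RW        0x002
#define _PAGE_USER      0x004
#define _PAGE_ACCESSED  0x020
#define _PAGE_DIRTY     0x040
#define _PAGE_TABLE     (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
                         _PAGE_ACCESSED | _PAGE_DIRTY)

int main(void)
{
        uint64_t pfn = 0x12345;                       /* frame holding the pte page */
        uint64_t pmd = (pfn << PAGE_SHIFT) | _PAGE_TABLE;

        printf("_PAGE_TABLE = 0x%llx\n", (unsigned long long)_PAGE_TABLE); /* 0x67 */
        printf("pmd entry   = 0x%llx\n", (unsigned long long)pmd);         /* 0x12345067 */
        return 0;
}

Running this prints the classic 0x67 flag value, which is what the pinned and unpinned branches of pmd_populate() both end up writing into the pmd slot.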
17752 --- a/include/asm-x86/mach-xen/asm/pgtable_32.h
17753 +++ b/include/asm-x86/mach-xen/asm/pgtable_32.h
17754 @@ -38,16 +38,13 @@ void paging_init(void);
17755 #ifdef CONFIG_X86_PAE
17756 # include <asm/pgtable-3level-defs.h>
17757 # define PMD_SIZE (1UL << PMD_SHIFT)
17758 -# define PMD_MASK (~(PMD_SIZE-1))
17759 +# define PMD_MASK (~(PMD_SIZE - 1))
17760 #else
17761 # include <asm/pgtable-2level-defs.h>
17762 #endif
17763
17764 #define PGDIR_SIZE (1UL << PGDIR_SHIFT)
17765 -#define PGDIR_MASK (~(PGDIR_SIZE-1))
17766 -
17767 -#define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT)
17768 -#define KERNEL_PGD_PTRS (PTRS_PER_PGD-USER_PGD_PTRS)
17769 +#define PGDIR_MASK (~(PGDIR_SIZE - 1))
17770
17771 /* Just any arbitrary offset to the start of the vmalloc VM area: the
17772 * current 8MB value just means that there will be a 8MB "hole" after the
17773 @@ -56,21 +53,22 @@ void paging_init(void);
17774 * The vmalloc() routines leaves a hole of 4kB between each vmalloced
17775 * area for the same reason. ;)
17776 */
17777 -#define VMALLOC_OFFSET (8*1024*1024)
17778 -#define VMALLOC_START (((unsigned long) high_memory + \
17779 - 2*VMALLOC_OFFSET-1) & ~(VMALLOC_OFFSET-1))
17780 +#define VMALLOC_OFFSET (8 * 1024 * 1024)
17781 +#define VMALLOC_START (((unsigned long)high_memory + 2 * VMALLOC_OFFSET - 1) \
17782 + & ~(VMALLOC_OFFSET - 1))
17783 #ifdef CONFIG_X86_PAE
17784 #define LAST_PKMAP 512
17785 #else
17786 #define LAST_PKMAP 1024
17787 #endif
17788
17789 -#define PKMAP_BASE ((FIXADDR_BOOT_START - PAGE_SIZE*(LAST_PKMAP + 1)) & PMD_MASK)
17790 +#define PKMAP_BASE ((FIXADDR_BOOT_START - PAGE_SIZE * (LAST_PKMAP + 1)) \
17791 + & PMD_MASK)
17792
17793 #ifdef CONFIG_HIGHMEM
17794 -# define VMALLOC_END (PKMAP_BASE-2*PAGE_SIZE)
17795 +# define VMALLOC_END (PKMAP_BASE - 2 * PAGE_SIZE)
17796 #else
17797 -# define VMALLOC_END (FIXADDR_START-2*PAGE_SIZE)
17798 +# define VMALLOC_END (FIXADDR_START - 2 * PAGE_SIZE)
17799 #endif
17800
17801 /*
17802 @@ -91,10 +89,10 @@ extern unsigned long pg0[];
17803 /* pmd_present doesn't just test the _PAGE_PRESENT bit since wr.p.t.
17804 can temporarily clear it. */
17805 #define pmd_present(x) (__pmd_val(x))
17806 -#define pmd_bad(x) ((__pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT))
17807 +#define pmd_bad(x) ((__pmd_val(x) & (~PTE_MASK & ~_PAGE_USER & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT))
17808 #else
17809 #define pmd_present(x) (__pmd_val(x) & _PAGE_PRESENT)
17810 -#define pmd_bad(x) ((__pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
17811 +#define pmd_bad(x) ((__pmd_val(x) & (~PTE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
17812 #endif
17813
17814
17815 @@ -107,32 +105,18 @@ extern unsigned long pg0[];
17816 #endif
17817
17818 /*
17819 - * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
17820 - *
17821 - * dst - pointer to pgd range anwhere on a pgd page
17822 - * src - ""
17823 - * count - the number of pgds to copy.
17824 - *
17825 - * dst and src can be on the same page, but the range must not overlap,
17826 - * and must not cross a page boundary.
17827 + * Macro to mark a page protection value as "uncacheable".
17828 + * On processors which do not support it, this is a no-op.
17829 */
17830 -static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
17831 -{
17832 - memcpy(dst, src, count * sizeof(pgd_t));
17833 -}
17834 -
17835 -/*
17836 - * Macro to mark a page protection value as "uncacheable". On processors which do not support
17837 - * it, this is a no-op.
17838 - */
17839 -#define pgprot_noncached(prot) ((boot_cpu_data.x86 > 3) \
17840 - ? (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT)) : (prot))
17841 +#define pgprot_noncached(prot) \
17842 + ((boot_cpu_data.x86 > 3) \
17843 + ? (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT)) \
17844 + : (prot))
17845
17846 /*
17847 * Conversion functions: convert a page and protection to a page entry,
17848 * and a page entry and page directory to the page they refer to.
17849 */
17850 -
17851 #define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
17852
17853 /*
17854 @@ -141,20 +125,20 @@ static inline void clone_pgd_range(pgd_t
17855 * this macro returns the index of the entry in the pgd page which would
17856 * control the given virtual address
17857 */
17858 -#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1))
17859 -#define pgd_index_k(addr) pgd_index(addr)
17860 +#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
17861 +#define pgd_index_k(addr) pgd_index((addr))
17862
17863 /*
17864 * pgd_offset() returns a (pgd_t *)
17865 * pgd_index() is used get the offset into the pgd page's array of pgd_t's;
17866 */
17867 -#define pgd_offset(mm, address) ((mm)->pgd+pgd_index(address))
17868 +#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address)))
17869
17870 /*
17871 * a shortcut which implies the use of the kernel's pgd, instead
17872 * of a process's
17873 */
17874 -#define pgd_offset_k(address) pgd_offset(&init_mm, address)
17875 +#define pgd_offset_k(address) pgd_offset(&init_mm, (address))
17876
17877 static inline int pud_large(pud_t pud) { return 0; }
17878
17879 @@ -164,8 +148,8 @@ static inline int pud_large(pud_t pud) {
17880 * this macro returns the index of the entry in the pmd page which would
17881 * control the given virtual address
17882 */
17883 -#define pmd_index(address) \
17884 - (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1))
17885 +#define pmd_index(address) \
17886 + (((address) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))
17887
17888 /*
17889 * the pte page can be thought of an array like this: pte_t[PTRS_PER_PTE]
17890 @@ -173,33 +157,36 @@ static inline int pud_large(pud_t pud) {
17891 * this macro returns the index of the entry in the pte page which would
17892 * control the given virtual address
17893 */
17894 -#define pte_index(address) \
17895 - (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
17896 -#define pte_offset_kernel(dir, address) \
17897 - ((pte_t *) pmd_page_vaddr(*(dir)) + pte_index(address))
17898 +#define pte_index(address) \
17899 + (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
17900 +#define pte_offset_kernel(dir, address) \
17901 + ((pte_t *)pmd_page_vaddr(*(dir)) + pte_index((address)))
17902
17903 -#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
17904 +#define pmd_page(pmd) (pfn_to_page(pmd_val((pmd)) >> PAGE_SHIFT))
17905
17906 -#define pmd_page_vaddr(pmd) \
17907 - ((unsigned long) __va(pmd_val(pmd) & PAGE_MASK))
17908 +#define pmd_page_vaddr(pmd) \
17909 + ((unsigned long)__va(pmd_val((pmd)) & PTE_MASK))
17910
17911 #if defined(CONFIG_HIGHPTE)
17912 -#define pte_offset_map(dir, address) \
17913 - ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE0) + pte_index(address))
17914 -#define pte_offset_map_nested(dir, address) \
17915 - ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE1) + pte_index(address))
17916 -#define pte_unmap(pte) kunmap_atomic(pte, KM_PTE0)
17917 -#define pte_unmap_nested(pte) kunmap_atomic(pte, KM_PTE1)
17918 -#else
17919 -#define pte_offset_map(dir, address) \
17920 - ((pte_t *)page_address(pmd_page(*(dir))) + pte_index(address))
17921 -#define pte_offset_map_nested(dir, address) pte_offset_map(dir, address)
17922 +#define pte_offset_map(dir, address) \
17923 + ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE0) + \
17924 + pte_index((address)))
17925 +#define pte_offset_map_nested(dir, address) \
17926 + ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE1) + \
17927 + pte_index((address)))
17928 +#define pte_unmap(pte) kunmap_atomic((pte), KM_PTE0)
17929 +#define pte_unmap_nested(pte) kunmap_atomic((pte), KM_PTE1)
17930 +#else
17931 +#define pte_offset_map(dir, address) \
17932 + ((pte_t *)page_address(pmd_page(*(dir))) + pte_index((address)))
17933 +#define pte_offset_map_nested(dir, address) pte_offset_map((dir), (address))
17934 #define pte_unmap(pte) do { } while (0)
17935 #define pte_unmap_nested(pte) do { } while (0)
17936 #endif
17937
17938 /* Clear a kernel PTE and flush it from the TLB */
17939 -#define kpte_clear_flush(ptep, vaddr) do { \
17940 +#define kpte_clear_flush(ptep, vaddr) \
17941 +do { \
17942 if (HYPERVISOR_update_va_mapping(vaddr, __pte(0), UVMF_INVLPG)) \
17943 BUG(); \
17944 } while (0)
17945 @@ -208,7 +195,7 @@ static inline int pud_large(pud_t pud) {
17946 * The i386 doesn't have any external MMU info: the kernel page
17947 * tables contain all the necessary information.
17948 */
17949 -#define update_mmu_cache(vma,address,pte) do { } while (0)
17950 +#define update_mmu_cache(vma, address, pte) do { } while (0)
17951
17952 void make_lowmem_page_readonly(void *va, unsigned int feature);
17953 void make_lowmem_page_writable(void *va, unsigned int feature);
17954 @@ -225,7 +212,7 @@ void make_lowmem_page_writable(void *va,
17955 #define kern_addr_valid(kaddr) (0)
17956 #endif
17957
17958 -#define io_remap_pfn_range(vma,from,pfn,size,prot) \
17959 -direct_remap_pfn_range(vma,from,pfn,size,prot,DOMID_IO)
17960 +#define io_remap_pfn_range(vma, from, pfn, size, prot) \
17961 + direct_remap_pfn_range(vma, from, pfn, size, prot, DOMID_IO)
17962
17963 #endif /* _I386_PGTABLE_H */
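
As an illustration (not part of the patch): the pgd_index()/pte_index() macros reformatted above simply slice bit fields out of a virtual address. A standalone sketch for the classic two-level (non-PAE) i386 layout, whose shift and table-size values are assumed here rather than taken from the patch:

/* Illustration only: two-level i386 index arithmetic with assumed constants. */
#include <stdio.h>

#define PAGE_SHIFT   12
#define PGDIR_SHIFT  22
#define PTRS_PER_PGD 1024
#define PTRS_PER_PTE 1024

#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
#define pte_index(address) (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))

int main(void)
{
        unsigned long va = 0xc0123456UL;    /* an arbitrary kernel virtual address */

        printf("pgd_index = %lu\n", pgd_index(va)); /* 0xc0123456 >> 22 = 768      */
        printf("pte_index = %lu\n", pte_index(va)); /* bits 12..21 = 0x123 = 291   */
        return 0;
}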
17964 --- a/include/asm-x86/mach-xen/asm/pgtable-3level.h
17965 +++ b/include/asm-x86/mach-xen/asm/pgtable-3level.h
17966 @@ -8,25 +8,28 @@
17967 * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
17968 */
17969
17970 -#define pte_ERROR(e) \
17971 - printk("%s:%d: bad pte %p(%016Lx pfn %08lx).\n", __FILE__, __LINE__, \
17972 - &(e), __pte_val(e), pte_pfn(e))
17973 -#define pmd_ERROR(e) \
17974 - printk("%s:%d: bad pmd %p(%016Lx pfn %08Lx).\n", __FILE__, __LINE__, \
17975 - &(e), __pmd_val(e), (pmd_val(e) & PTE_MASK) >> PAGE_SHIFT)
17976 -#define pgd_ERROR(e) \
17977 - printk("%s:%d: bad pgd %p(%016Lx pfn %08Lx).\n", __FILE__, __LINE__, \
17978 - &(e), __pgd_val(e), (pgd_val(e) & PTE_MASK) >> PAGE_SHIFT)
17979 -
17980 +#define pte_ERROR(e) \
17981 + printk("%s:%d: bad pte %p(%016Lx pfn %08lx).\n", \
17982 + __FILE__, __LINE__, &(e), __pte_val(e), pte_pfn(e))
17983 +#define pmd_ERROR(e) \
17984 + printk("%s:%d: bad pmd %p(%016Lx pfn %08Lx).\n", \
17985 + __FILE__, __LINE__, &(e), __pmd_val(e), \
17986 + (pmd_val(e) & PTE_MASK) >> PAGE_SHIFT)
17987 +#define pgd_ERROR(e) \
17988 + printk("%s:%d: bad pgd %p(%016Lx pfn %08Lx).\n", \
17989 + __FILE__, __LINE__, &(e), __pgd_val(e), \
17990 + (pgd_val(e) & PTE_MASK) >> PAGE_SHIFT)
17991
17992 static inline int pud_none(pud_t pud)
17993 {
17994 return __pud_val(pud) == 0;
17995 +
17996 }
17997 static inline int pud_bad(pud_t pud)
17998 {
17999 return (__pud_val(pud) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER)) != 0;
18000 }
18001 +
18002 static inline int pud_present(pud_t pud)
18003 {
18004 return __pud_val(pud) & _PAGE_PRESENT;
18005 @@ -48,12 +51,14 @@ static inline void xen_set_pte(pte_t *pt
18006
18007 static inline void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
18008 {
18009 - set_64bit((unsigned long long *)(ptep),__pte_val(pte));
18010 + set_64bit((unsigned long long *)(ptep), __pte_val(pte));
18011 }
18012 +
18013 static inline void xen_set_pmd(pmd_t *pmdp, pmd_t pmd)
18014 {
18015 xen_l2_entry_update(pmdp, pmd);
18016 }
18017 +
18018 static inline void xen_set_pud(pud_t *pudp, pud_t pud)
18019 {
18020 xen_l3_entry_update(pudp, pud);
18021 @@ -92,20 +97,19 @@ static inline void pud_clear(pud_t *pudp
18022 * current pgd to avoid unnecessary TLB flushes.
18023 */
18024 pgd = read_cr3();
18025 - if (__pa(pudp) >= pgd && __pa(pudp) < (pgd + sizeof(pgd_t)*PTRS_PER_PGD))
18026 + if (__pa(pudp) >= pgd && __pa(pudp) <
18027 + (pgd + sizeof(pgd_t)*PTRS_PER_PGD))
18028 xen_tlb_flush();
18029 }
18030
18031 -#define pud_page(pud) \
18032 -((struct page *) __va(pud_val(pud) & PAGE_MASK))
18033 +#define pud_page(pud) ((struct page *) __va(pud_val(pud) & PTE_MASK))
18034
18035 -#define pud_page_vaddr(pud) \
18036 -((unsigned long) __va(pud_val(pud) & PAGE_MASK))
18037 +#define pud_page_vaddr(pud) ((unsigned long) __va(pud_val(pud) & PTE_MASK))
18038
18039
18040 /* Find an entry in the second-level page table.. */
18041 -#define pmd_offset(pud, address) ((pmd_t *) pud_page(*(pud)) + \
18042 - pmd_index(address))
18043 +#define pmd_offset(pud, address) ((pmd_t *)pud_page(*(pud)) + \
18044 + pmd_index(address))
18045
18046 #ifdef CONFIG_SMP
18047 static inline pte_t xen_ptep_get_and_clear(pte_t *ptep, pte_t res)
18048 @@ -150,7 +154,8 @@ static inline int pte_none(pte_t pte)
18049 * put the 32 bits of offset into the high part.
18050 */
18051 #define pte_to_pgoff(pte) ((pte).pte_high)
18052 -#define pgoff_to_pte(off) ((pte_t) { { .pte_low = _PAGE_FILE, .pte_high = (off) } })
18053 +#define pgoff_to_pte(off) \
18054 + ((pte_t) { { .pte_low = _PAGE_FILE, .pte_high = (off) } })
18055 #define PTE_FILE_MAX_BITS 32
18056
18057 /* Encode and de-code a swap entry */
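
As an illustration (not part of the patch): the PAE nonlinear-file encoding reformatted above keeps the whole 32-bit file offset in pte_high and marks the low word with _PAGE_FILE only. A standalone sketch with simplified types:

/* Illustration only: PAE pgoff_to_pte()/pte_to_pgoff() with a stand-in pte_t. */
#include <stdio.h>
#include <stdint.h>

#define _PAGE_FILE 0x40   /* dirty-bit position reused when the pte is not present */

typedef struct { uint32_t pte_low, pte_high; } pte_t;

#define pgoff_to_pte(off) ((pte_t){ .pte_low = _PAGE_FILE, .pte_high = (off) })
#define pte_to_pgoff(pte) ((pte).pte_high)

int main(void)
{
        pte_t pte = pgoff_to_pte(0xdeadbeef);

        printf("pte = %08x:%08x, pgoff = 0x%x\n",
               pte.pte_high, pte.pte_low, pte_to_pgoff(pte));
        /* prints deadbeef:00000040, 0xdeadbeef */
        return 0;
}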
18058 --- a/include/asm-x86/mach-xen/asm/pgtable_64.h
18059 +++ b/include/asm-x86/mach-xen/asm/pgtable_64.h
18060 @@ -31,7 +31,7 @@ extern void paging_init(void);
18061
18062 #endif /* !__ASSEMBLY__ */
18063
18064 -#define SHARED_KERNEL_PMD 1
18065 +#define SHARED_KERNEL_PMD 0
18066
18067 /*
18068 * PGDIR_SHIFT determines what a top-level page table entry can map
18069 @@ -59,18 +59,20 @@ extern void paging_init(void);
18070
18071 #ifndef __ASSEMBLY__
18072
18073 -#define pte_ERROR(e) \
18074 - printk("%s:%d: bad pte %p(%016lx pfn %010lx).\n", __FILE__, __LINE__, \
18075 - &(e), __pte_val(e), pte_pfn(e))
18076 -#define pmd_ERROR(e) \
18077 - printk("%s:%d: bad pmd %p(%016lx pfn %010lx).\n", __FILE__, __LINE__, \
18078 - &(e), __pmd_val(e), pmd_pfn(e))
18079 -#define pud_ERROR(e) \
18080 - printk("%s:%d: bad pud %p(%016lx pfn %010lx).\n", __FILE__, __LINE__, \
18081 - &(e), __pud_val(e), (pud_val(e) & __PHYSICAL_MASK) >> PAGE_SHIFT)
18082 -#define pgd_ERROR(e) \
18083 - printk("%s:%d: bad pgd %p(%016lx pfn %010lx).\n", __FILE__, __LINE__, \
18084 - &(e), __pgd_val(e), (pgd_val(e) & __PHYSICAL_MASK) >> PAGE_SHIFT)
18085 +#define pte_ERROR(e) \
18086 + printk("%s:%d: bad pte %p(%016lx pfn %010lx).\n", \
18087 + __FILE__, __LINE__, &(e), __pte_val(e), pte_pfn(e))
18088 +#define pmd_ERROR(e) \
18089 + printk("%s:%d: bad pmd %p(%016lx pfn %010lx).\n", \
18090 + __FILE__, __LINE__, &(e), __pmd_val(e), pmd_pfn(e))
18091 +#define pud_ERROR(e) \
18092 + printk("%s:%d: bad pud %p(%016lx pfn %010lx).\n", \
18093 + __FILE__, __LINE__, &(e), __pud_val(e), \
18094 + (pud_val(e) & __PHYSICAL_MASK) >> PAGE_SHIFT)
18095 +#define pgd_ERROR(e) \
18096 + printk("%s:%d: bad pgd %p(%016lx pfn %010lx).\n", \
18097 + __FILE__, __LINE__, &(e), __pgd_val(e), \
18098 + (pgd_val(e) & __PHYSICAL_MASK) >> PAGE_SHIFT)
18099
18100 #define pgd_none(x) (!__pgd_val(x))
18101 #define pud_none(x) (!__pud_val(x))
18102 @@ -125,7 +127,7 @@ static inline void xen_set_pgd(pgd_t *pg
18103 xen_l4_entry_update(pgdp, pgd);
18104 }
18105
18106 -static inline void xen_pgd_clear(pgd_t * pgd)
18107 +static inline void xen_pgd_clear(pgd_t *pgd)
18108 {
18109 xen_set_pgd(pgd, xen_make_pgd(0));
18110 xen_set_pgd(__user_pgd(pgd), xen_make_pgd(0));
18111 @@ -135,43 +137,43 @@ static inline void xen_pgd_clear(pgd_t *
18112
18113 #endif /* !__ASSEMBLY__ */
18114
18115 -#define PMD_SIZE (_AC(1,UL) << PMD_SHIFT)
18116 -#define PMD_MASK (~(PMD_SIZE-1))
18117 -#define PUD_SIZE (_AC(1,UL) << PUD_SHIFT)
18118 -#define PUD_MASK (~(PUD_SIZE-1))
18119 -#define PGDIR_SIZE (_AC(1,UL) << PGDIR_SHIFT)
18120 -#define PGDIR_MASK (~(PGDIR_SIZE-1))
18121 +#define PMD_SIZE (_AC(1, UL) << PMD_SHIFT)
18122 +#define PMD_MASK (~(PMD_SIZE - 1))
18123 +#define PUD_SIZE (_AC(1, UL) << PUD_SHIFT)
18124 +#define PUD_MASK (~(PUD_SIZE - 1))
18125 +#define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT)
18126 +#define PGDIR_MASK (~(PGDIR_SIZE - 1))
18127
18128
18129 -#define MAXMEM _AC(0x3fffffffffff, UL)
18130 +#define MAXMEM _AC(0x00003fffffffffff, UL)
18131 #define VMALLOC_START _AC(0xffffc20000000000, UL)
18132 #define VMALLOC_END _AC(0xffffe1ffffffffff, UL)
18133 #define VMEMMAP_START _AC(0xffffe20000000000, UL)
18134 -#define MODULES_VADDR _AC(0xffffffff88000000, UL)
18135 +#define MODULES_VADDR _AC(0xffffffffa0000000, UL)
18136 #define MODULES_END _AC(0xfffffffffff00000, UL)
18137 #define MODULES_LEN (MODULES_END - MODULES_VADDR)
18138
18139 #ifndef __ASSEMBLY__
18140
18141 -static inline unsigned long pgd_bad(pgd_t pgd)
18142 +static inline int pgd_bad(pgd_t pgd)
18143 {
18144 - return __pgd_val(pgd) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER);
18145 + return (__pgd_val(pgd) & ~(PTE_MASK | _PAGE_USER)) != _KERNPG_TABLE;
18146 }
18147
18148 -static inline unsigned long pud_bad(pud_t pud)
18149 +static inline int pud_bad(pud_t pud)
18150 {
18151 - return __pud_val(pud) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER);
18152 + return (__pud_val(pud) & ~(PTE_MASK | _PAGE_USER)) != _KERNPG_TABLE;
18153 }
18154
18155 -static inline unsigned long pmd_bad(pmd_t pmd)
18156 +static inline int pmd_bad(pmd_t pmd)
18157 {
18158 - return __pmd_val(pmd) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER);
18159 + return (__pmd_val(pmd) & ~(PTE_MASK | _PAGE_USER)) != _KERNPG_TABLE;
18160 }
18161
18162 #define pte_none(x) (!(x).pte)
18163 #define pte_present(x) ((x).pte & (_PAGE_PRESENT | _PAGE_PROTNONE))
18164
18165 -#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT)) /* FIXME: is this right? */
18166 +#define pages_to_mb(x) ((x) >> (20 - PAGE_SHIFT)) /* FIXME: is this right? */
18167
18168 #define __pte_mfn(_pte) (((_pte).pte & PTE_MASK) >> PAGE_SHIFT)
18169 #define pte_mfn(_pte) ((_pte).pte & _PAGE_PRESENT ? \
18170 @@ -181,13 +183,13 @@ static inline unsigned long pmd_bad(pmd_
18171 mfn_to_local_pfn(__pte_mfn(_pte)) : \
18172 __pte_mfn(_pte))
18173
18174 -#define pte_page(x) pfn_to_page(pte_pfn(x))
18175 +#define pte_page(x) pfn_to_page(pte_pfn((x)))
18176
18177 /*
18178 * Macro to mark a page protection value as "uncacheable".
18179 */
18180 -#define pgprot_noncached(prot) (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT))
18181 -
18182 +#define pgprot_noncached(prot) \
18183 + (__pgprot(pgprot_val((prot)) | _PAGE_PCD | _PAGE_PWT))
18184
18185 /*
18186 * Conversion functions: convert a page and protection to a page entry,
18187 @@ -197,36 +199,39 @@ static inline unsigned long pmd_bad(pmd_
18188 /*
18189 * Level 4 access.
18190 */
18191 -#define pgd_page_vaddr(pgd) ((unsigned long) __va(pgd_val(pgd) & PTE_MASK))
18192 -#define pgd_page(pgd) (pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT))
18193 -#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1))
18194 -#define pgd_offset(mm, addr) ((mm)->pgd + pgd_index(addr))
18195 -#define pgd_offset_k(address) (init_level4_pgt + pgd_index(address))
18196 +#define pgd_page_vaddr(pgd) \
18197 + ((unsigned long)__va((unsigned long)pgd_val((pgd)) & PTE_MASK))
18198 +#define pgd_page(pgd) (pfn_to_page(pgd_val((pgd)) >> PAGE_SHIFT))
18199 +#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
18200 +#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address)))
18201 +#define pgd_offset_k(address) (init_level4_pgt + pgd_index((address)))
18202 #define pgd_present(pgd) (__pgd_val(pgd) & _PAGE_PRESENT)
18203 static inline int pgd_large(pgd_t pgd) { return 0; }
18204 #define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE)
18205
18206 /* PUD - Level3 access */
18207 /* to find an entry in a page-table-directory. */
18208 -#define pud_page_vaddr(pud) ((unsigned long) __va(pud_val(pud) & PHYSICAL_PAGE_MASK))
18209 -#define pud_page(pud) (pfn_to_page(pud_val(pud) >> PAGE_SHIFT))
18210 -#define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
18211 -#define pud_offset(pgd, address) ((pud_t *) pgd_page_vaddr(*(pgd)) + pud_index(address))
18212 +#define pud_page_vaddr(pud) \
18213 + ((unsigned long)__va(pud_val((pud)) & PHYSICAL_PAGE_MASK))
18214 +#define pud_page(pud) (pfn_to_page(pud_val((pud)) >> PAGE_SHIFT))
18215 +#define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD - 1))
18216 +#define pud_offset(pgd, address) \
18217 + ((pud_t *)pgd_page_vaddr(*(pgd)) + pud_index((address)))
18218 #define pud_present(pud) (__pud_val(pud) & _PAGE_PRESENT)
18219
18220 static inline int pud_large(pud_t pte)
18221 {
18222 - return (__pud_val(pte) & (_PAGE_PSE|_PAGE_PRESENT)) ==
18223 - (_PAGE_PSE|_PAGE_PRESENT);
18224 + return (__pud_val(pte) & (_PAGE_PSE | _PAGE_PRESENT)) ==
18225 + (_PAGE_PSE | _PAGE_PRESENT);
18226 }
18227
18228 /* PMD - Level 2 access */
18229 -#define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val(pmd) & PTE_MASK))
18230 -#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
18231 +#define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val((pmd)) & PTE_MASK))
18232 +#define pmd_page(pmd) (pfn_to_page(pmd_val((pmd)) >> PAGE_SHIFT))
18233
18234 -#define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1))
18235 -#define pmd_offset(dir, address) ((pmd_t *) pud_page_vaddr(*(dir)) + \
18236 - pmd_index(address))
18237 +#define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))
18238 +#define pmd_offset(dir, address) ((pmd_t *)pud_page_vaddr(*(dir)) + \
18239 + pmd_index(address))
18240 #define pmd_none(x) (!__pmd_val(x))
18241 #if CONFIG_XEN_COMPAT <= 0x030002
18242 /* pmd_present doesn't just test the _PAGE_PRESENT bit since wr.p.t.
18243 @@ -235,43 +240,56 @@ static inline int pud_large(pud_t pte)
18244 #else
18245 #define pmd_present(x) (__pmd_val(x) & _PAGE_PRESENT)
18246 #endif
18247 -#define pfn_pmd(nr,prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val(prot)))
18248 -#define pmd_pfn(x) ((pmd_val(x) & __PHYSICAL_MASK) >> PAGE_SHIFT)
18249 +#define pfn_pmd(nr, prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val((prot))))
18250 +#define pmd_pfn(x) ((pmd_val((x)) & __PHYSICAL_MASK) >> PAGE_SHIFT)
18251
18252 #define pte_to_pgoff(pte) ((__pte_val(pte) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT)
18253 -#define pgoff_to_pte(off) ((pte_t) { .pte = ((off) << PAGE_SHIFT) | _PAGE_FILE })
18254 +#define pgoff_to_pte(off) ((pte_t) { .pte = ((off) << PAGE_SHIFT) | \
18255 + _PAGE_FILE })
18256 #define PTE_FILE_MAX_BITS __PHYSICAL_MASK_SHIFT
18257
18258 /* PTE - Level 1 access. */
18259
18260 /* page, protection -> pte */
18261 -#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
18262 -
18263 -#define pte_index(address) \
18264 - (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
18265 +#define mk_pte(page, pgprot) pfn_pte(page_to_pfn((page)), (pgprot))
18266 +
18267 +#define pte_index(address) (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
18268 #define pte_offset_kernel(dir, address) ((pte_t *) pmd_page_vaddr(*(dir)) + \
18269 - pte_index(address))
18270 + pte_index((address)))
18271
18272 /* x86-64 always has all page tables mapped. */
18273 -#define pte_offset_map(dir,address) pte_offset_kernel(dir,address)
18274 -#define pte_offset_map_nested(dir,address) pte_offset_kernel(dir,address)
18275 +#define pte_offset_map(dir, address) pte_offset_kernel((dir), (address))
18276 +#define pte_offset_map_nested(dir, address) pte_offset_kernel((dir), (address))
18277 #define pte_unmap(pte) /* NOP */
18278 -#define pte_unmap_nested(pte) /* NOP */
18279 +#define pte_unmap_nested(pte) /* NOP */
18280 +
18281 +#define update_mmu_cache(vma, address, pte) do { } while (0)
18282
18283 -#define update_mmu_cache(vma,address,pte) do { } while (0)
18284 +extern int direct_gbpages;
18285
18286 /* Encode and de-code a swap entry */
18287 -#define __swp_type(x) (((x).val >> 1) & 0x3f)
18288 -#define __swp_offset(x) ((x).val >> 8)
18289 -#define __swp_entry(type, offset) ((swp_entry_t) { ((type) << 1) | ((offset) << 8) })
18290 +#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE
18291 +#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1)
18292 +#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1)
18293 +#else
18294 +#define SWP_TYPE_BITS (_PAGE_BIT_PROTNONE - _PAGE_BIT_PRESENT - 1)
18295 +#define SWP_OFFSET_SHIFT (_PAGE_BIT_FILE + 1)
18296 +#endif
18297 +
18298 +#define __swp_type(x) (((x).val >> (_PAGE_BIT_PRESENT + 1)) \
18299 + & ((1U << SWP_TYPE_BITS) - 1))
18300 +#define __swp_offset(x) ((x).val >> SWP_OFFSET_SHIFT)
18301 +#define __swp_entry(type, offset) ((swp_entry_t) { \
18302 + ((type) << (_PAGE_BIT_PRESENT + 1)) \
18303 + | ((offset) << SWP_OFFSET_SHIFT) })
18304 #define __pte_to_swp_entry(pte) ((swp_entry_t) { __pte_val(pte) })
18305 #define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val })
18306
18307 -extern int kern_addr_valid(unsigned long addr);
18308 +extern int kern_addr_valid(unsigned long addr);
18309 extern void cleanup_highmap(void);
18310
18311 -#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
18312 - direct_remap_pfn_range(vma,vaddr,pfn,size,prot,DOMID_IO)
18313 +#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
18314 + direct_remap_pfn_range(vma, vaddr, pfn, size, prot, DOMID_IO)
18315
18316 #define HAVE_ARCH_UNMAPPED_AREA
18317 #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
18318 @@ -284,8 +302,10 @@ extern void cleanup_highmap(void);
18319
18320 /* fs/proc/kcore.c */
18321 #define kc_vaddr_to_offset(v) ((v) & __VIRTUAL_MASK)
18322 -#define kc_offset_to_vaddr(o) \
18323 - (((o) & (1UL << (__VIRTUAL_MASK_SHIFT-1))) ? ((o) | (~__VIRTUAL_MASK)) : (o))
18324 +#define kc_offset_to_vaddr(o) \
18325 + (((o) & (1UL << (__VIRTUAL_MASK_SHIFT - 1))) \
18326 + ? ((o) | ~__VIRTUAL_MASK) \
18327 + : (o))
18328
18329 #define __HAVE_ARCH_PTE_SAME
18330 #endif /* !__ASSEMBLY__ */
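
As an illustration (not part of the patch): the reworked swap-entry macros above derive their layout from the software bit positions set up in pgtable.h below (_PAGE_BIT_PRESENT = 0, _PAGE_BIT_FILE = 6, _PAGE_BIT_PROTNONE = 8), giving 5 type bits starting at bit 1 and the offset starting at bit 9, so the present, file and protnone bits stay clear in a swapped-out pte. A standalone sketch of the packing:

/* Illustration only: swap-entry packing with the bit positions from this patch. */
#include <stdio.h>

#define _PAGE_BIT_PRESENT  0
#define _PAGE_BIT_FILE     6
#define _PAGE_BIT_PROTNONE 8

#define SWP_TYPE_BITS    (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1)   /* 5 */
#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1)                   /* 9 */

#define __swp_entry(type, offset) (((unsigned long)(type) << (_PAGE_BIT_PRESENT + 1)) | \
                                   ((unsigned long)(offset) << SWP_OFFSET_SHIFT))
#define __swp_type(val)   (((val) >> (_PAGE_BIT_PRESENT + 1)) & ((1U << SWP_TYPE_BITS) - 1))
#define __swp_offset(val) ((val) >> SWP_OFFSET_SHIFT)

int main(void)
{
        unsigned long e = __swp_entry(3, 0x1000);    /* type 3, page offset 0x1000 */

        printf("entry  = 0x%lx\n", e);               /* (3 << 1) | (0x1000 << 9) = 0x200006 */
        printf("type   = %lu\n", __swp_type(e));     /* 3 */
        printf("offset = 0x%lx\n", __swp_offset(e)); /* 0x1000 */
        return 0;
}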
18331 --- a/include/asm-x86/mach-xen/asm/pgtable.h
18332 +++ b/include/asm-x86/mach-xen/asm/pgtable.h
18333 @@ -1,17 +1,15 @@
18334 #ifndef _ASM_X86_PGTABLE_H
18335 #define _ASM_X86_PGTABLE_H
18336
18337 -#define USER_PTRS_PER_PGD ((TASK_SIZE-1)/PGDIR_SIZE+1)
18338 #define FIRST_USER_ADDRESS 0
18339
18340 -#define _PAGE_BIT_PRESENT 0
18341 -#define _PAGE_BIT_RW 1
18342 -#define _PAGE_BIT_USER 2
18343 -#define _PAGE_BIT_PWT 3
18344 -#define _PAGE_BIT_PCD 4
18345 -#define _PAGE_BIT_ACCESSED 5
18346 -#define _PAGE_BIT_DIRTY 6
18347 -#define _PAGE_BIT_FILE 6
18348 +#define _PAGE_BIT_PRESENT 0 /* is present */
18349 +#define _PAGE_BIT_RW 1 /* writeable */
18350 +#define _PAGE_BIT_USER 2 /* userspace addressable */
18351 +#define _PAGE_BIT_PWT 3 /* page write through */
18352 +#define _PAGE_BIT_PCD 4 /* page cache disabled */
18353 +#define _PAGE_BIT_ACCESSED 5 /* was accessed (raised by CPU) */
18354 +#define _PAGE_BIT_DIRTY 6 /* was written to (raised by CPU) */
18355 #define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */
18356 #define _PAGE_BIT_PAT 7 /* on 4KB pages */
18357 #define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
18358 @@ -22,6 +20,14 @@
18359 #define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
18360 #define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
18361
18362 +/* If _PAGE_BIT_PRESENT is clear, we use these: */
18363 +
18364 +/* set: nonlinear file mapping, saved PTE; unset:swap */
18365 +#define _PAGE_BIT_FILE _PAGE_BIT_DIRTY
18366 +
18367 +/* if the user mapped it with PROT_NONE; pte_present gives true */
18368 +#define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL
18369 +
18370 /*
18371 * Note: we use _AC(1, L) instead of _AC(1, UL) so that we get a
18372 * sign-extended value on 32-bit with all 1's in the upper word,
18373 @@ -48,10 +54,8 @@
18374 #define _PAGE_NX 0
18375 #endif
18376
18377 -/* If _PAGE_PRESENT is clear, we use these: */
18378 -#define _PAGE_FILE _PAGE_DIRTY /* nonlinear file mapping, saved PTE; unset:swap */
18379 -#define _PAGE_PROTNONE _PAGE_PSE /* if the user mapped it with PROT_NONE;
18380 - pte_present gives true */
18381 +#define _PAGE_FILE (_AC(1, L)<<_PAGE_BIT_FILE)
18382 +#define _PAGE_PROTNONE (_AC(1, L)<<_PAGE_BIT_PROTNONE)
18383
18384 #ifndef __ASSEMBLY__
18385 #if defined(CONFIG_X86_64) && CONFIG_XEN_COMPAT <= 0x030002
18386 @@ -61,20 +65,42 @@ extern unsigned int __kernel_page_user;
18387 #endif
18388 #endif
18389
18390 -#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
18391 -#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY | __kernel_page_user)
18392 +#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
18393 + _PAGE_ACCESSED | _PAGE_DIRTY)
18394 +#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \
18395 + _PAGE_DIRTY | __kernel_page_user)
18396 +
18397 +/* Set of bits not changed in pte_modify */
18398 +#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_CACHE_MASK | _PAGE_IO | \
18399 + _PAGE_ACCESSED | _PAGE_DIRTY)
18400
18401 -#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_IO)
18402 +/*
18403 + * PAT settings are part of the hypervisor interface, which sets the
18404 + * MSR to 0x050100070406 (i.e. WB, WT, UC-, UC, WC, WP [, UC, UC]).
18405 + */
18406 +#define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT | _PAGE_PAT)
18407 +#define _PAGE_CACHE_WB (0)
18408 +#define _PAGE_CACHE_WT (_PAGE_PWT)
18409 +#define _PAGE_CACHE_WC (_PAGE_PAT)
18410 +#define _PAGE_CACHE_WP (_PAGE_PAT | _PAGE_PWT)
18411 +#define _PAGE_CACHE_UC_MINUS (_PAGE_PCD)
18412 +#define _PAGE_CACHE_UC (_PAGE_PCD | _PAGE_PWT)
18413
18414 #define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
18415 -#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
18416 +#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
18417 + _PAGE_ACCESSED | _PAGE_NX)
18418
18419 -#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
18420 -#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
18421 -#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
18422 +#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | \
18423 + _PAGE_USER | _PAGE_ACCESSED)
18424 +#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
18425 + _PAGE_ACCESSED | _PAGE_NX)
18426 +#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
18427 + _PAGE_ACCESSED)
18428 #define PAGE_COPY PAGE_COPY_NOEXEC
18429 -#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
18430 -#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
18431 +#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | \
18432 + _PAGE_ACCESSED | _PAGE_NX)
18433 +#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
18434 + _PAGE_ACCESSED)
18435
18436 #ifdef CONFIG_X86_32
18437 #define _PAGE_KERNEL_EXEC \
18438 @@ -93,6 +119,7 @@ extern pteval_t __PAGE_KERNEL, __PAGE_KE
18439 #define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW)
18440 #define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW)
18441 #define __PAGE_KERNEL_EXEC_NOCACHE (__PAGE_KERNEL_EXEC | _PAGE_PCD | _PAGE_PWT)
18442 +#define __PAGE_KERNEL_WC (__PAGE_KERNEL | _PAGE_CACHE_WC)
18443 #define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD | _PAGE_PWT)
18444 #define __PAGE_KERNEL_UC_MINUS (__PAGE_KERNEL | _PAGE_PCD)
18445 #define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER)
18446 @@ -109,6 +136,7 @@ extern pteval_t __PAGE_KERNEL, __PAGE_KE
18447 #define PAGE_KERNEL_RO MAKE_GLOBAL(__PAGE_KERNEL_RO)
18448 #define PAGE_KERNEL_EXEC MAKE_GLOBAL(__PAGE_KERNEL_EXEC)
18449 #define PAGE_KERNEL_RX MAKE_GLOBAL(__PAGE_KERNEL_RX)
18450 +#define PAGE_KERNEL_WC MAKE_GLOBAL(__PAGE_KERNEL_WC)
18451 #define PAGE_KERNEL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_NOCACHE)
18452 #define PAGE_KERNEL_UC_MINUS MAKE_GLOBAL(__PAGE_KERNEL_UC_MINUS)
18453 #define PAGE_KERNEL_EXEC_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_EXEC_NOCACHE)
18454 @@ -142,7 +170,7 @@ extern pteval_t __PAGE_KERNEL, __PAGE_KE
18455 * ZERO_PAGE is a global shared page that is always zero: used
18456 * for zero-mapped memory areas etc..
18457 */
18458 -extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)];
18459 +extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)];
18460 #define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
18461
18462 extern spinlock_t pgd_lock;
18463 @@ -152,30 +180,111 @@ extern struct list_head pgd_list;
18464 * The following only work if pte_present() is true.
18465 * Undefined behaviour if not..
18466 */
18467 -static inline int pte_dirty(pte_t pte) { return __pte_val(pte) & _PAGE_DIRTY; }
18468 -static inline int pte_young(pte_t pte) { return __pte_val(pte) & _PAGE_ACCESSED; }
18469 -static inline int pte_write(pte_t pte) { return __pte_val(pte) & _PAGE_RW; }
18470 -static inline int pte_file(pte_t pte) { return __pte_val(pte) & _PAGE_FILE; }
18471 -static inline int pte_huge(pte_t pte) { return __pte_val(pte) & _PAGE_PSE; }
18472 -static inline int pte_global(pte_t pte) { return 0; }
18473 -static inline int pte_exec(pte_t pte) { return !(__pte_val(pte) & _PAGE_NX); }
18474 -
18475 -static inline int pmd_large(pmd_t pte) {
18476 - return (__pmd_val(pte) & (_PAGE_PSE|_PAGE_PRESENT)) ==
18477 - (_PAGE_PSE|_PAGE_PRESENT);
18478 -}
18479 -
18480 -static inline pte_t pte_mkclean(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_DIRTY); }
18481 -static inline pte_t pte_mkold(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_ACCESSED); }
18482 -static inline pte_t pte_wrprotect(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_RW); }
18483 -static inline pte_t pte_mkexec(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_NX); }
18484 -static inline pte_t pte_mkdirty(pte_t pte) { return __pte_ma(__pte_val(pte) | _PAGE_DIRTY); }
18485 -static inline pte_t pte_mkyoung(pte_t pte) { return __pte_ma(__pte_val(pte) | _PAGE_ACCESSED); }
18486 -static inline pte_t pte_mkwrite(pte_t pte) { return __pte_ma(__pte_val(pte) | _PAGE_RW); }
18487 -static inline pte_t pte_mkhuge(pte_t pte) { return __pte_ma(__pte_val(pte) | _PAGE_PSE); }
18488 -static inline pte_t pte_clrhuge(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_PSE); }
18489 -static inline pte_t pte_mkglobal(pte_t pte) { return pte; }
18490 -static inline pte_t pte_clrglobal(pte_t pte) { return pte; }
18491 +static inline int pte_dirty(pte_t pte)
18492 +{
18493 + return __pte_val(pte) & _PAGE_DIRTY;
18494 +}
18495 +
18496 +static inline int pte_young(pte_t pte)
18497 +{
18498 + return __pte_val(pte) & _PAGE_ACCESSED;
18499 +}
18500 +
18501 +static inline int pte_write(pte_t pte)
18502 +{
18503 + return __pte_val(pte) & _PAGE_RW;
18504 +}
18505 +
18506 +static inline int pte_file(pte_t pte)
18507 +{
18508 + return __pte_val(pte) & _PAGE_FILE;
18509 +}
18510 +
18511 +static inline int pte_huge(pte_t pte)
18512 +{
18513 + return __pte_val(pte) & _PAGE_PSE;
18514 +}
18515 +
18516 +static inline int pte_global(pte_t pte)
18517 +{
18518 + return 0;
18519 +}
18520 +
18521 +static inline int pte_exec(pte_t pte)
18522 +{
18523 + return !(__pte_val(pte) & _PAGE_NX);
18524 +}
18525 +
18526 +static inline int pte_special(pte_t pte)
18527 +{
18528 + return 0;
18529 +}
18530 +
18531 +static inline int pmd_large(pmd_t pte)
18532 +{
18533 + return (__pmd_val(pte) & (_PAGE_PSE | _PAGE_PRESENT)) ==
18534 + (_PAGE_PSE | _PAGE_PRESENT);
18535 +}
18536 +
18537 +static inline pte_t pte_mkclean(pte_t pte)
18538 +{
18539 + return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_DIRTY);
18540 +}
18541 +
18542 +static inline pte_t pte_mkold(pte_t pte)
18543 +{
18544 + return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_ACCESSED);
18545 +}
18546 +
18547 +static inline pte_t pte_wrprotect(pte_t pte)
18548 +{
18549 + return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_RW);
18550 +}
18551 +
18552 +static inline pte_t pte_mkexec(pte_t pte)
18553 +{
18554 + return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_NX);
18555 +}
18556 +
18557 +static inline pte_t pte_mkdirty(pte_t pte)
18558 +{
18559 + return __pte_ma(__pte_val(pte) | _PAGE_DIRTY);
18560 +}
18561 +
18562 +static inline pte_t pte_mkyoung(pte_t pte)
18563 +{
18564 + return __pte_ma(__pte_val(pte) | _PAGE_ACCESSED);
18565 +}
18566 +
18567 +static inline pte_t pte_mkwrite(pte_t pte)
18568 +{
18569 + return __pte_ma(__pte_val(pte) | _PAGE_RW);
18570 +}
18571 +
18572 +static inline pte_t pte_mkhuge(pte_t pte)
18573 +{
18574 + return __pte_ma(__pte_val(pte) | _PAGE_PSE);
18575 +}
18576 +
18577 +static inline pte_t pte_clrhuge(pte_t pte)
18578 +{
18579 + return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_PSE);
18580 +}
18581 +
18582 +static inline pte_t pte_mkglobal(pte_t pte)
18583 +{
18584 + return pte;
18585 +}
18586 +
18587 +static inline pte_t pte_clrglobal(pte_t pte)
18588 +{
18589 + return pte;
18590 +}
18591 +
18592 +static inline pte_t pte_mkspecial(pte_t pte)
18593 +{
18594 + return pte;
18595 +}
18596
18597 extern pteval_t __supported_pte_mask;
18598
18599 @@ -202,15 +311,33 @@ static inline pte_t pte_modify(pte_t pte
18600 pteval_t val = pte_val(pte);
18601
18602 val &= _PAGE_CHG_MASK;
18603 - val |= pgprot_val(newprot) & __supported_pte_mask;
18604 + val |= pgprot_val(newprot) & (~_PAGE_CHG_MASK) & __supported_pte_mask;
18605
18606 return __pte(val);
18607 }
18608
18609 -#define pte_pgprot(x) __pgprot(pte_val(x) & (0xfff | _PAGE_NX))
18610 +/* mprotect needs to preserve PAT bits when updating vm_page_prot */
18611 +#define pgprot_modify pgprot_modify
18612 +static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
18613 +{
18614 + pgprotval_t preservebits = pgprot_val(oldprot) & _PAGE_CHG_MASK;
18615 + pgprotval_t addbits = pgprot_val(newprot);
18616 + return __pgprot(preservebits | addbits);
18617 +}
18618 +
18619 +#define pte_pgprot(x) __pgprot(__pte_val(x) & ~PTE_MASK)
18620
18621 #define canon_pgprot(p) __pgprot(pgprot_val(p) & __supported_pte_mask)
18622
18623 +#ifndef __ASSEMBLY__
18624 +#define __HAVE_PHYS_MEM_ACCESS_PROT
18625 +struct file;
18626 +pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
18627 + unsigned long size, pgprot_t vma_prot);
18628 +int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
18629 + unsigned long size, pgprot_t *vma_prot);
18630 +#endif
18631 +
18632 #define set_pte(ptep, pte) xen_set_pte(ptep, pte)
18633 #define set_pte_at(mm, addr, ptep, pte) xen_set_pte_at(mm, addr, ptep, pte)
18634
18635 @@ -246,6 +373,9 @@ static inline pte_t pte_modify(pte_t pte
18636 # include "pgtable_64.h"
18637 #endif
18638
18639 +#define KERNEL_PGD_BOUNDARY pgd_index(PAGE_OFFSET)
18640 +#define KERNEL_PGD_PTRS (PTRS_PER_PGD - KERNEL_PGD_BOUNDARY)
18641 +
18642 #ifndef __ASSEMBLY__
18643
18644 enum {
18645 @@ -312,46 +442,17 @@ static inline void xen_pte_clear(struct
18646 * bit at the same time.
18647 */
18648 #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
18649 -#define ptep_set_access_flags(vma, address, ptep, entry, dirty) \
18650 -({ \
18651 - int __changed = !pte_same(*(ptep), entry); \
18652 - if (__changed && (dirty)) { \
18653 - if ( likely((vma)->vm_mm == current->mm) ) { \
18654 - BUG_ON(HYPERVISOR_update_va_mapping(address, \
18655 - entry, \
18656 - (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
18657 - UVMF_INVLPG|UVMF_MULTI)); \
18658 - } else { \
18659 - xen_l1_entry_update(ptep, entry); \
18660 - flush_tlb_page(vma, address); \
18661 - } \
18662 - } \
18663 - __changed; \
18664 -})
18665 +extern int ptep_set_access_flags(struct vm_area_struct *vma,
18666 + unsigned long address, pte_t *ptep,
18667 + pte_t entry, int dirty);
18668
18669 #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
18670 -#define ptep_test_and_clear_young(vma, addr, ptep) ({ \
18671 - int __ret = 0; \
18672 - if (pte_young(*(ptep))) \
18673 - __ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, \
18674 - &(ptep)->pte); \
18675 - if (__ret) \
18676 - pte_update((vma)->vm_mm, addr, ptep); \
18677 - __ret; \
18678 -})
18679 +extern int ptep_test_and_clear_young(struct vm_area_struct *vma,
18680 + unsigned long addr, pte_t *ptep);
18681
18682 #define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
18683 -#define ptep_clear_flush_young(vma, address, ptep) \
18684 -({ \
18685 - pte_t __pte = *(ptep); \
18686 - int __young = pte_young(__pte); \
18687 - __pte = pte_mkold(__pte); \
18688 - if (PagePinned(virt_to_page((vma)->vm_mm->pgd))) \
18689 - (void)ptep_set_access_flags(vma, address, ptep, __pte, __young); \
18690 - else if (__young) \
18691 - (ptep)->pte_low = __pte.pte_low; \
18692 - __young; \
18693 -})
18694 +extern int ptep_clear_flush_young(struct vm_area_struct *vma,
18695 + unsigned long address, pte_t *ptep);
18696
18697 #define __HAVE_ARCH_PTEP_CLEAR_FLUSH
18698 #define ptep_clear_flush(vma, addr, ptep) \
18699 @@ -370,7 +471,8 @@ static inline void xen_pte_clear(struct
18700 })
18701
18702 #define __HAVE_ARCH_PTEP_GET_AND_CLEAR
18703 -static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
18704 +static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
18705 + pte_t *ptep)
18706 {
18707 pte_t pte = *ptep;
18708 if (!pte_none(pte)
18709 @@ -398,13 +500,29 @@ static inline pte_t ptep_get_and_clear(s
18710 pte_t xen_ptep_get_and_clear_full(struct vm_area_struct *, unsigned long, pte_t *, int);
18711
18712 #define __HAVE_ARCH_PTEP_SET_WRPROTECT
18713 -static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
18714 +static inline void ptep_set_wrprotect(struct mm_struct *mm,
18715 + unsigned long addr, pte_t *ptep)
18716 {
18717 pte_t pte = *ptep;
18718 if (pte_write(pte))
18719 set_pte_at(mm, addr, ptep, pte_wrprotect(pte));
18720 }
18721
18722 +/*
18723 + * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
18724 + *
18725 + * dst - pointer to pgd range anywhere on a pgd page
18726 + * src - ""
18727 + * count - the number of pgds to copy.
18728 + *
18729 + * dst and src can be on the same page, but the range must not overlap,
18730 + * and must not cross a page boundary.
18731 + */
18732 +static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
18733 +{
18734 + memcpy(dst, src, count * sizeof(pgd_t));
18735 +}
18736 +
18737 #define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) \
18738 xen_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable)
18739
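
As an illustration (not part of the patch): clone_pgd_range(), moved into this header above, is a plain memcpy() of pgd slots; together with the KERNEL_PGD_BOUNDARY/KERNEL_PGD_PTRS definitions added earlier in this file it is what lets a new pgd share the kernel's upper mappings. A userspace sketch, assuming the usual 3G/1G 32-bit non-PAE split:

/* Illustration only: copying the kernel slots of a reference pgd into a new one. */
#include <stdio.h>
#include <string.h>

typedef unsigned long pgd_t;

#define PGDIR_SHIFT  22
#define PTRS_PER_PGD 1024
#define PAGE_OFFSET  0xC0000000UL

#define pgd_index(address)   (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
#define KERNEL_PGD_BOUNDARY  pgd_index(PAGE_OFFSET)
#define KERNEL_PGD_PTRS      (PTRS_PER_PGD - KERNEL_PGD_BOUNDARY)

static void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
{
        memcpy(dst, src, count * sizeof(pgd_t));     /* same body as in the patch */
}

int main(void)
{
        static pgd_t reference_pgd[PTRS_PER_PGD];    /* stands in for swapper_pg_dir */
        static pgd_t new_pgd[PTRS_PER_PGD];

        reference_pgd[KERNEL_PGD_BOUNDARY] = 0x1067; /* fake kernel entry */

        clone_pgd_range(new_pgd + KERNEL_PGD_BOUNDARY,
                        reference_pgd + KERNEL_PGD_BOUNDARY,
                        KERNEL_PGD_PTRS);

        printf("KERNEL_PGD_BOUNDARY = %lu\n", (unsigned long)KERNEL_PGD_BOUNDARY); /* 768 */
        printf("KERNEL_PGD_PTRS     = %lu\n", (unsigned long)KERNEL_PGD_PTRS);     /* 256 */
        printf("copied entry        = 0x%lx\n", new_pgd[KERNEL_PGD_BOUNDARY]);     /* 0x1067 */
        return 0;
}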
18740 --- a/include/asm-x86/mach-xen/asm/processor.h
18741 +++ b/include/asm-x86/mach-xen/asm/processor.h
18742 @@ -3,10 +3,6 @@
18743
18744 #include <asm/processor-flags.h>
18745
18746 -/* migration helpers, for KVM - will be removed in 2.6.25: */
18747 -#include <asm/vm86.h>
18748 -#define Xgt_desc_struct desc_ptr
18749 -
18750 /* Forward declaration, a strange C thing */
18751 struct task_struct;
18752 struct mm_struct;
18753 @@ -24,6 +20,7 @@ struct mm_struct;
18754 #include <asm/msr.h>
18755 #include <asm/desc_defs.h>
18756 #include <asm/nops.h>
18757 +
18758 #include <linux/personality.h>
18759 #include <linux/cpumask.h>
18760 #include <linux/cache.h>
18761 @@ -38,16 +35,18 @@ struct mm_struct;
18762 static inline void *current_text_addr(void)
18763 {
18764 void *pc;
18765 - asm volatile("mov $1f,%0\n1:":"=r" (pc));
18766 +
18767 + asm volatile("mov $1f, %0; 1:":"=r" (pc));
18768 +
18769 return pc;
18770 }
18771
18772 #ifdef CONFIG_X86_VSMP
18773 -#define ARCH_MIN_TASKALIGN (1 << INTERNODE_CACHE_SHIFT)
18774 -#define ARCH_MIN_MMSTRUCT_ALIGN (1 << INTERNODE_CACHE_SHIFT)
18775 +# define ARCH_MIN_TASKALIGN (1 << INTERNODE_CACHE_SHIFT)
18776 +# define ARCH_MIN_MMSTRUCT_ALIGN (1 << INTERNODE_CACHE_SHIFT)
18777 #else
18778 -#define ARCH_MIN_TASKALIGN 16
18779 -#define ARCH_MIN_MMSTRUCT_ALIGN 0
18780 +# define ARCH_MIN_TASKALIGN 16
18781 +# define ARCH_MIN_MMSTRUCT_ALIGN 0
18782 #endif
18783
18784 /*
18785 @@ -57,68 +56,80 @@ static inline void *current_text_addr(vo
18786 */
18787
18788 struct cpuinfo_x86 {
18789 - __u8 x86; /* CPU family */
18790 - __u8 x86_vendor; /* CPU vendor */
18791 - __u8 x86_model;
18792 - __u8 x86_mask;
18793 + __u8 x86; /* CPU family */
18794 + __u8 x86_vendor; /* CPU vendor */
18795 + __u8 x86_model;
18796 + __u8 x86_mask;
18797 #ifdef CONFIG_X86_32
18798 - char wp_works_ok; /* It doesn't on 386's */
18799 - char hlt_works_ok; /* Problems on some 486Dx4's and old 386's */
18800 - char hard_math;
18801 - char rfu;
18802 - char fdiv_bug;
18803 - char f00f_bug;
18804 - char coma_bug;
18805 - char pad0;
18806 + char wp_works_ok; /* It doesn't on 386's */
18807 +
18808 + /* Problems on some 486Dx4's and old 386's: */
18809 + char hlt_works_ok;
18810 + char hard_math;
18811 + char rfu;
18812 + char fdiv_bug;
18813 + char f00f_bug;
18814 + char coma_bug;
18815 + char pad0;
18816 #else
18817 - /* number of 4K pages in DTLB/ITLB combined(in pages)*/
18818 - int x86_tlbsize;
18819 - __u8 x86_virt_bits, x86_phys_bits;
18820 - /* cpuid returned core id bits */
18821 - __u8 x86_coreid_bits;
18822 - /* Max extended CPUID function supported */
18823 - __u32 extended_cpuid_level;
18824 -#endif
18825 - int cpuid_level; /* Maximum supported CPUID level, -1=no CPUID */
18826 - __u32 x86_capability[NCAPINTS];
18827 - char x86_vendor_id[16];
18828 - char x86_model_id[64];
18829 - int x86_cache_size; /* in KB - valid for CPUS which support this
18830 - call */
18831 - int x86_cache_alignment; /* In bytes */
18832 - int x86_power;
18833 - unsigned long loops_per_jiffy;
18834 + /* Number of 4K pages in DTLB/ITLB combined(in pages): */
18835 + int x86_tlbsize;
18836 + __u8 x86_virt_bits;
18837 + __u8 x86_phys_bits;
18838 + /* CPUID returned core id bits: */
18839 + __u8 x86_coreid_bits;
18840 + /* Max extended CPUID function supported: */
18841 + __u32 extended_cpuid_level;
18842 +#endif
18843 + /* Maximum supported CPUID level, -1=no CPUID: */
18844 + int cpuid_level;
18845 + __u32 x86_capability[NCAPINTS];
18846 + char x86_vendor_id[16];
18847 + char x86_model_id[64];
18848 + /* in KB - valid for CPUS which support this call: */
18849 + int x86_cache_size;
18850 + int x86_cache_alignment; /* In bytes */
18851 + int x86_power;
18852 + unsigned long loops_per_jiffy;
18853 #ifdef CONFIG_SMP
18854 - cpumask_t llc_shared_map; /* cpus sharing the last level cache */
18855 + /* cpus sharing the last level cache: */
18856 + cpumask_t llc_shared_map;
18857 #endif
18858 - u16 x86_max_cores; /* cpuid returned max cores value */
18859 - u16 apicid;
18860 - u16 x86_clflush_size;
18861 + /* cpuid returned max cores value: */
18862 + u16 x86_max_cores;
18863 + u16 apicid;
18864 + u16 initial_apicid;
18865 + u16 x86_clflush_size;
18866 #ifdef CONFIG_SMP
18867 - u16 booted_cores; /* number of cores as seen by OS */
18868 - u16 phys_proc_id; /* Physical processor id. */
18869 - u16 cpu_core_id; /* Core id */
18870 - u16 cpu_index; /* index into per_cpu list */
18871 + /* number of cores as seen by the OS: */
18872 + u16 booted_cores;
18873 + /* Physical processor id: */
18874 + u16 phys_proc_id;
18875 + /* Core id: */
18876 + u16 cpu_core_id;
18877 + /* Index into per_cpu list: */
18878 + u16 cpu_index;
18879 #endif
18880 } __attribute__((__aligned__(SMP_CACHE_BYTES)));
18881
18882 -#define X86_VENDOR_INTEL 0
18883 -#define X86_VENDOR_CYRIX 1
18884 -#define X86_VENDOR_AMD 2
18885 -#define X86_VENDOR_UMC 3
18886 -#define X86_VENDOR_NEXGEN 4
18887 -#define X86_VENDOR_CENTAUR 5
18888 -#define X86_VENDOR_TRANSMETA 7
18889 -#define X86_VENDOR_NSC 8
18890 -#define X86_VENDOR_NUM 9
18891 -#define X86_VENDOR_UNKNOWN 0xff
18892 +#define X86_VENDOR_INTEL 0
18893 +#define X86_VENDOR_CYRIX 1
18894 +#define X86_VENDOR_AMD 2
18895 +#define X86_VENDOR_UMC 3
18896 +#define X86_VENDOR_CENTAUR 5
18897 +#define X86_VENDOR_TRANSMETA 7
18898 +#define X86_VENDOR_NSC 8
18899 +#define X86_VENDOR_NUM 9
18900 +
18901 +#define X86_VENDOR_UNKNOWN 0xff
18902
18903 /*
18904 * capabilities of CPUs
18905 */
18906 -extern struct cpuinfo_x86 boot_cpu_data;
18907 -extern struct cpuinfo_x86 new_cpu_data;
18908 -extern __u32 cleared_cpu_caps[NCAPINTS];
18909 +extern struct cpuinfo_x86 boot_cpu_data;
18910 +extern struct cpuinfo_x86 new_cpu_data;
18911 +
18912 +extern __u32 cleared_cpu_caps[NCAPINTS];
18913
18914 #ifdef CONFIG_SMP
18915 DECLARE_PER_CPU(struct cpuinfo_x86, cpu_info);
18916 @@ -129,7 +140,18 @@ DECLARE_PER_CPU(struct cpuinfo_x86, cpu_
18917 #define current_cpu_data boot_cpu_data
18918 #endif
18919
18920 -void cpu_detect(struct cpuinfo_x86 *c);
18921 +static inline int hlt_works(int cpu)
18922 +{
18923 +#ifdef CONFIG_X86_32
18924 + return cpu_data(cpu).hlt_works_ok;
18925 +#else
18926 + return 1;
18927 +#endif
18928 +}
18929 +
18930 +#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
18931 +
18932 +extern void cpu_detect(struct cpuinfo_x86 *c);
18933
18934 extern void identify_cpu(struct cpuinfo_x86 *);
18935 extern void identify_boot_cpu(void);
18936 @@ -149,12 +171,12 @@ static inline void xen_cpuid(unsigned in
18937 unsigned int *ecx, unsigned int *edx)
18938 {
18939 /* ecx is often an input as well as an output. */
18940 - __asm__(XEN_CPUID
18941 - : "=a" (*eax),
18942 - "=b" (*ebx),
18943 - "=c" (*ecx),
18944 - "=d" (*edx)
18945 - : "0" (*eax), "2" (*ecx));
18946 + asm(XEN_CPUID
18947 + : "=a" (*eax),
18948 + "=b" (*ebx),
18949 + "=c" (*ecx),
18950 + "=d" (*edx)
18951 + : "0" (*eax), "2" (*ecx));
18952 }
18953
18954 static inline void load_cr3(pgd_t *pgdir)
18955 @@ -166,57 +188,70 @@ static inline void load_cr3(pgd_t *pgdir
18956 #ifdef CONFIG_X86_32
18957 /* This is the TSS defined by the hardware. */
18958 struct x86_hw_tss {
18959 - unsigned short back_link, __blh;
18960 - unsigned long sp0;
18961 - unsigned short ss0, __ss0h;
18962 - unsigned long sp1;
18963 - unsigned short ss1, __ss1h; /* ss1 caches MSR_IA32_SYSENTER_CS */
18964 - unsigned long sp2;
18965 - unsigned short ss2, __ss2h;
18966 - unsigned long __cr3;
18967 - unsigned long ip;
18968 - unsigned long flags;
18969 - unsigned long ax, cx, dx, bx;
18970 - unsigned long sp, bp, si, di;
18971 - unsigned short es, __esh;
18972 - unsigned short cs, __csh;
18973 - unsigned short ss, __ssh;
18974 - unsigned short ds, __dsh;
18975 - unsigned short fs, __fsh;
18976 - unsigned short gs, __gsh;
18977 - unsigned short ldt, __ldth;
18978 - unsigned short trace, io_bitmap_base;
18979 + unsigned short back_link, __blh;
18980 + unsigned long sp0;
18981 + unsigned short ss0, __ss0h;
18982 + unsigned long sp1;
18983 + /* ss1 caches MSR_IA32_SYSENTER_CS: */
18984 + unsigned short ss1, __ss1h;
18985 + unsigned long sp2;
18986 + unsigned short ss2, __ss2h;
18987 + unsigned long __cr3;
18988 + unsigned long ip;
18989 + unsigned long flags;
18990 + unsigned long ax;
18991 + unsigned long cx;
18992 + unsigned long dx;
18993 + unsigned long bx;
18994 + unsigned long sp;
18995 + unsigned long bp;
18996 + unsigned long si;
18997 + unsigned long di;
18998 + unsigned short es, __esh;
18999 + unsigned short cs, __csh;
19000 + unsigned short ss, __ssh;
19001 + unsigned short ds, __dsh;
19002 + unsigned short fs, __fsh;
19003 + unsigned short gs, __gsh;
19004 + unsigned short ldt, __ldth;
19005 + unsigned short trace;
19006 + unsigned short io_bitmap_base;
19007 +
19008 } __attribute__((packed));
19009 extern struct tss_struct doublefault_tss;
19010 #else
19011 struct x86_hw_tss {
19012 - u32 reserved1;
19013 - u64 sp0;
19014 - u64 sp1;
19015 - u64 sp2;
19016 - u64 reserved2;
19017 - u64 ist[7];
19018 - u32 reserved3;
19019 - u32 reserved4;
19020 - u16 reserved5;
19021 - u16 io_bitmap_base;
19022 + u32 reserved1;
19023 + u64 sp0;
19024 + u64 sp1;
19025 + u64 sp2;
19026 + u64 reserved2;
19027 + u64 ist[7];
19028 + u32 reserved3;
19029 + u32 reserved4;
19030 + u16 reserved5;
19031 + u16 io_bitmap_base;
19032 +
19033 } __attribute__((packed)) ____cacheline_aligned;
19034 #endif
19035 #endif /* CONFIG_X86_NO_TSS */
19036
19037 /*
19038 - * Size of io_bitmap.
19039 + * IO-bitmap sizes:
19040 */
19041 -#define IO_BITMAP_BITS 65536
19042 -#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
19043 -#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
19044 -#define IO_BITMAP_OFFSET offsetof(struct tss_struct, io_bitmap)
19045 -#define INVALID_IO_BITMAP_OFFSET 0x8000
19046 -#define INVALID_IO_BITMAP_OFFSET_LAZY 0x9000
19047 +#define IO_BITMAP_BITS 65536
19048 +#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
19049 +#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
19050 +#define IO_BITMAP_OFFSET offsetof(struct tss_struct, io_bitmap)
19051 +#define INVALID_IO_BITMAP_OFFSET 0x8000
19052 +#define INVALID_IO_BITMAP_OFFSET_LAZY 0x9000
19053
19054 #ifndef CONFIG_X86_NO_TSS
19055 struct tss_struct {
19056 - struct x86_hw_tss x86_tss;
19057 + /*
19058 + * The hardware state:
19059 + */
19060 + struct x86_hw_tss x86_tss;
19061
19062 /*
19063 * The extra 1 is there because the CPU will access an
19064 @@ -224,136 +259,162 @@ struct tss_struct {
19065 * bitmap. The extra byte must be all 1 bits, and must
19066 * be within the limit.
19067 */
19068 - unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
19069 + unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
19070 /*
19071 * Cache the current maximum and the last task that used the bitmap:
19072 */
19073 - unsigned long io_bitmap_max;
19074 - struct thread_struct *io_bitmap_owner;
19075 + unsigned long io_bitmap_max;
19076 + struct thread_struct *io_bitmap_owner;
19077 +
19078 /*
19079 - * pads the TSS to be cacheline-aligned (size is 0x100)
19080 + * Pad the TSS to be cacheline-aligned (size is 0x100):
19081 */
19082 - unsigned long __cacheline_filler[35];
19083 + unsigned long __cacheline_filler[35];
19084 /*
19085 - * .. and then another 0x100 bytes for emergency kernel stack
19086 + * .. and then another 0x100 bytes for the emergency kernel stack:
19087 */
19088 - unsigned long stack[64];
19089 + unsigned long stack[64];
19090 +
19091 } __attribute__((packed));
19092
19093 DECLARE_PER_CPU(struct tss_struct, init_tss);
19094
19095 -/* Save the original ist values for checking stack pointers during debugging */
19096 +/*
19097 + * Save the original ist values for checking stack pointers during debugging
19098 + */
19099 struct orig_ist {
19100 - unsigned long ist[7];
19101 + unsigned long ist[7];
19102 };
19103 #endif /* CONFIG_X86_NO_TSS */
19104
19105 #define MXCSR_DEFAULT 0x1f80
19106
19107 struct i387_fsave_struct {
19108 - u32 cwd;
19109 - u32 swd;
19110 - u32 twd;
19111 - u32 fip;
19112 - u32 fcs;
19113 - u32 foo;
19114 - u32 fos;
19115 - u32 st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */
19116 - u32 status; /* software status information */
19117 + u32 cwd; /* FPU Control Word */
19118 + u32 swd; /* FPU Status Word */
19119 + u32 twd; /* FPU Tag Word */
19120 + u32 fip; /* FPU IP Offset */
19121 + u32 fcs; /* FPU IP Selector */
19122 + u32 foo; /* FPU Operand Pointer Offset */
19123 + u32 fos; /* FPU Operand Pointer Selector */
19124 +
19125 + /* 8*10 bytes for each FP-reg = 80 bytes: */
19126 + u32 st_space[20];
19127 +
19128 + /* Software status information [not touched by FSAVE ]: */
19129 + u32 status;
19130 };
19131
19132 struct i387_fxsave_struct {
19133 - u16 cwd;
19134 - u16 swd;
19135 - u16 twd;
19136 - u16 fop;
19137 + u16 cwd; /* Control Word */
19138 + u16 swd; /* Status Word */
19139 + u16 twd; /* Tag Word */
19140 + u16 fop; /* Last Instruction Opcode */
19141 union {
19142 struct {
19143 - u64 rip;
19144 - u64 rdp;
19145 + u64 rip; /* Instruction Pointer */
19146 + u64 rdp; /* Data Pointer */
19147 };
19148 struct {
19149 - u32 fip;
19150 - u32 fcs;
19151 - u32 foo;
19152 - u32 fos;
19153 + u32 fip; /* FPU IP Offset */
19154 + u32 fcs; /* FPU IP Selector */
19155 + u32 foo; /* FPU Operand Offset */
19156 + u32 fos; /* FPU Operand Selector */
19157 };
19158 };
19159 - u32 mxcsr;
19160 - u32 mxcsr_mask;
19161 - u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
19162 - u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */
19163 - u32 padding[24];
19164 + u32 mxcsr; /* MXCSR Register State */
19165 + u32 mxcsr_mask; /* MXCSR Mask */
19166 +
19167 + /* 8*16 bytes for each FP-reg = 128 bytes: */
19168 + u32 st_space[32];
19169 +
19170 + /* 16*16 bytes for each XMM-reg = 256 bytes: */
19171 + u32 xmm_space[64];
19172 +
19173 + u32 padding[24];
19174 +
19175 } __attribute__((aligned(16)));
19176
19177 struct i387_soft_struct {
19178 - u32 cwd;
19179 - u32 swd;
19180 - u32 twd;
19181 - u32 fip;
19182 - u32 fcs;
19183 - u32 foo;
19184 - u32 fos;
19185 - u32 st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */
19186 - u8 ftop, changed, lookahead, no_update, rm, alimit;
19187 - struct info *info;
19188 - u32 entry_eip;
19189 + u32 cwd;
19190 + u32 swd;
19191 + u32 twd;
19192 + u32 fip;
19193 + u32 fcs;
19194 + u32 foo;
19195 + u32 fos;
19196 + /* 8*10 bytes for each FP-reg = 80 bytes: */
19197 + u32 st_space[20];
19198 + u8 ftop;
19199 + u8 changed;
19200 + u8 lookahead;
19201 + u8 no_update;
19202 + u8 rm;
19203 + u8 alimit;
19204 + struct info *info;
19205 + u32 entry_eip;
19206 };
19207
19208 -union i387_union {
19209 +union thread_xstate {
19210 struct i387_fsave_struct fsave;
19211 struct i387_fxsave_struct fxsave;
19212 - struct i387_soft_struct soft;
19213 + struct i387_soft_struct soft;
19214 };
19215
19216 -#ifdef CONFIG_X86_32
19217 -DECLARE_PER_CPU(u8, cpu_llc_id);
19218 -#elif !defined(CONFIG_X86_NO_TSS)
19219 +#if defined(CONFIG_X86_64) && !defined(CONFIG_X86_NO_TSS)
19220 DECLARE_PER_CPU(struct orig_ist, orig_ist);
19221 #endif
19222
19223 extern void print_cpu_info(struct cpuinfo_x86 *);
19224 +extern unsigned int xstate_size;
19225 +extern void free_thread_xstate(struct task_struct *);
19226 +extern struct kmem_cache *task_xstate_cachep;
19227 extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
19228 extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
19229 extern unsigned short num_cache_leaves;
19230
19231 struct thread_struct {
19232 -/* cached TLS descriptors. */
19233 - struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
19234 - unsigned long sp0;
19235 - unsigned long sp;
19236 + /* Cached TLS descriptors: */
19237 + struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
19238 + unsigned long sp0;
19239 + unsigned long sp;
19240 #ifdef CONFIG_X86_32
19241 - unsigned long sysenter_cs;
19242 + unsigned long sysenter_cs;
19243 #else
19244 - unsigned long usersp; /* Copy from PDA */
19245 - unsigned short es, ds, fsindex, gsindex;
19246 -#endif
19247 - unsigned long ip;
19248 - unsigned long fs;
19249 - unsigned long gs;
19250 -/* Hardware debugging registers */
19251 - unsigned long debugreg0;
19252 - unsigned long debugreg1;
19253 - unsigned long debugreg2;
19254 - unsigned long debugreg3;
19255 - unsigned long debugreg6;
19256 - unsigned long debugreg7;
19257 -/* fault info */
19258 - unsigned long cr2, trap_no, error_code;
19259 -/* floating point info */
19260 - union i387_union i387 __attribute__((aligned(16)));;
19261 + unsigned long usersp; /* Copy from PDA */
19262 + unsigned short es;
19263 + unsigned short ds;
19264 + unsigned short fsindex;
19265 + unsigned short gsindex;
19266 +#endif
19267 + unsigned long ip;
19268 + unsigned long fs;
19269 + unsigned long gs;
19270 + /* Hardware debugging registers: */
19271 + unsigned long debugreg0;
19272 + unsigned long debugreg1;
19273 + unsigned long debugreg2;
19274 + unsigned long debugreg3;
19275 + unsigned long debugreg6;
19276 + unsigned long debugreg7;
19277 + /* Fault info: */
19278 + unsigned long cr2;
19279 + unsigned long trap_no;
19280 + unsigned long error_code;
19281 + /* floating point and extended processor state */
19282 + union thread_xstate *xstate;
19283 #ifdef CONFIG_X86_32
19284 -/* virtual 86 mode info */
19285 + /* Virtual 86 mode info */
19286 struct vm86_struct __user *vm86_info;
19287 unsigned long screen_bitmap;
19288 unsigned long v86flags, v86mask, saved_sp0;
19289 unsigned int saved_fs, saved_gs;
19290 #endif
19291 -/* IO permissions */
19292 - unsigned long *io_bitmap_ptr;
19293 - unsigned long iopl;
19294 -/* max allowed port in the bitmap, in bytes: */
19295 - unsigned io_bitmap_max;
19296 + /* IO permissions: */
19297 + unsigned long *io_bitmap_ptr;
19298 + unsigned long iopl;
19299 + /* Max allowed port in the bitmap, in bytes: */
19300 + unsigned io_bitmap_max;
19301 /* MSR_IA32_DEBUGCTLMSR value to switch in if TIF_DEBUGCTLMSR is set. */
19302 unsigned long debugctlmsr;
19303 /* Debug Store - if not 0 points to a DS Save Area configuration;
19304 @@ -384,12 +445,12 @@ static inline void xen_set_iopl_mask(uns
19305 }
19306
19307 #ifndef CONFIG_X86_NO_TSS
19308 -static inline void native_load_sp0(struct tss_struct *tss,
19309 - struct thread_struct *thread)
19310 +static inline void
19311 +native_load_sp0(struct tss_struct *tss, struct thread_struct *thread)
19312 {
19313 tss->x86_tss.sp0 = thread->sp0;
19314 #ifdef CONFIG_X86_32
19315 - /* Only happens when SEP is enabled, no need to test "SEP"arately */
19316 + /* Only happens when SEP is enabled, no need to test "SEP"arately: */
19317 if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
19318 tss->x86_tss.ss1 = thread->sysenter_cs;
19319 wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
19320 @@ -403,8 +464,8 @@ static inline void native_load_sp0(struc
19321 } while (0)
19322 #endif
19323
19324 -#define __cpuid xen_cpuid
19325 -#define paravirt_enabled() 0
19326 +#define __cpuid xen_cpuid
19327 +#define paravirt_enabled() 0
19328
19329 /*
19330 * These special macros can be used to get or set a debugging register
19331 @@ -424,11 +485,12 @@ static inline void native_load_sp0(struc
19332 * enable), so that any CPU's that boot up
19333 * after us can get the correct flags.
19334 */
19335 -extern unsigned long mmu_cr4_features;
19336 +extern unsigned long mmu_cr4_features;
19337
19338 static inline void set_in_cr4(unsigned long mask)
19339 {
19340 unsigned cr4;
19341 +
19342 mmu_cr4_features |= mask;
19343 cr4 = read_cr4();
19344 cr4 |= mask;
19345 @@ -438,6 +500,7 @@ static inline void set_in_cr4(unsigned l
19346 static inline void clear_in_cr4(unsigned long mask)
19347 {
19348 unsigned cr4;
19349 +
19350 mmu_cr4_features &= ~mask;
19351 cr4 = read_cr4();
19352 cr4 &= ~mask;
19353 @@ -445,42 +508,42 @@ static inline void clear_in_cr4(unsigned
19354 }
19355
19356 struct microcode_header {
19357 - unsigned int hdrver;
19358 - unsigned int rev;
19359 - unsigned int date;
19360 - unsigned int sig;
19361 - unsigned int cksum;
19362 - unsigned int ldrver;
19363 - unsigned int pf;
19364 - unsigned int datasize;
19365 - unsigned int totalsize;
19366 - unsigned int reserved[3];
19367 + unsigned int hdrver;
19368 + unsigned int rev;
19369 + unsigned int date;
19370 + unsigned int sig;
19371 + unsigned int cksum;
19372 + unsigned int ldrver;
19373 + unsigned int pf;
19374 + unsigned int datasize;
19375 + unsigned int totalsize;
19376 + unsigned int reserved[3];
19377 };
19378
19379 struct microcode {
19380 - struct microcode_header hdr;
19381 - unsigned int bits[0];
19382 + struct microcode_header hdr;
19383 + unsigned int bits[0];
19384 };
19385
19386 -typedef struct microcode microcode_t;
19387 -typedef struct microcode_header microcode_header_t;
19388 +typedef struct microcode microcode_t;
19389 +typedef struct microcode_header microcode_header_t;
19390
19391 /* microcode format is extended from prescott processors */
19392 struct extended_signature {
19393 - unsigned int sig;
19394 - unsigned int pf;
19395 - unsigned int cksum;
19396 + unsigned int sig;
19397 + unsigned int pf;
19398 + unsigned int cksum;
19399 };
19400
19401 struct extended_sigtable {
19402 - unsigned int count;
19403 - unsigned int cksum;
19404 - unsigned int reserved[3];
19405 + unsigned int count;
19406 + unsigned int cksum;
19407 + unsigned int reserved[3];
19408 struct extended_signature sigs[0];
19409 };
19410
19411 typedef struct {
19412 - unsigned long seg;
19413 + unsigned long seg;
19414 } mm_segment_t;
19415
19416
19417 @@ -492,7 +555,7 @@ extern int kernel_thread(int (*fn)(void
19418 /* Free all resources held by a thread. */
19419 extern void release_thread(struct task_struct *);
19420
19421 -/* Prepare to copy thread state - unlazy all lazy status */
19422 +/* Prepare to copy thread state - unlazy all lazy state */
19423 extern void prepare_to_copy(struct task_struct *tsk);
19424
19425 unsigned long get_wchan(struct task_struct *p);
19426 @@ -529,118 +592,138 @@ static inline unsigned int cpuid_eax(uns
19427 unsigned int eax, ebx, ecx, edx;
19428
19429 cpuid(op, &eax, &ebx, &ecx, &edx);
19430 +
19431 return eax;
19432 }
19433 +
19434 static inline unsigned int cpuid_ebx(unsigned int op)
19435 {
19436 unsigned int eax, ebx, ecx, edx;
19437
19438 cpuid(op, &eax, &ebx, &ecx, &edx);
19439 +
19440 return ebx;
19441 }
19442 +
19443 static inline unsigned int cpuid_ecx(unsigned int op)
19444 {
19445 unsigned int eax, ebx, ecx, edx;
19446
19447 cpuid(op, &eax, &ebx, &ecx, &edx);
19448 +
19449 return ecx;
19450 }
19451 +
19452 static inline unsigned int cpuid_edx(unsigned int op)
19453 {
19454 unsigned int eax, ebx, ecx, edx;
19455
19456 cpuid(op, &eax, &ebx, &ecx, &edx);
19457 +
19458 return edx;
19459 }
19460
19461 /* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
19462 static inline void rep_nop(void)
19463 {
19464 - __asm__ __volatile__("rep;nop": : :"memory");
19465 + asm volatile("rep; nop" ::: "memory");
19466 }
19467
19468 -/* Stop speculative execution */
19469 +static inline void cpu_relax(void)
19470 +{
19471 + rep_nop();
19472 +}
19473 +
19474 +/* Stop speculative execution: */
19475 static inline void sync_core(void)
19476 {
19477 int tmp;
19478 +
19479 asm volatile("cpuid" : "=a" (tmp) : "0" (1)
19480 - : "ebx", "ecx", "edx", "memory");
19481 + : "ebx", "ecx", "edx", "memory");
19482 }
19483
19484 -#define cpu_relax() rep_nop()
19485 -
19486 static inline void __monitor(const void *eax, unsigned long ecx,
19487 - unsigned long edx)
19488 + unsigned long edx)
19489 {
19490 - /* "monitor %eax,%ecx,%edx;" */
19491 - asm volatile(
19492 - ".byte 0x0f,0x01,0xc8;"
19493 - : :"a" (eax), "c" (ecx), "d"(edx));
19494 + /* "monitor %eax, %ecx, %edx;" */
19495 + asm volatile(".byte 0x0f, 0x01, 0xc8;"
19496 + :: "a" (eax), "c" (ecx), "d"(edx));
19497 }
19498
19499 static inline void __mwait(unsigned long eax, unsigned long ecx)
19500 {
19501 - /* "mwait %eax,%ecx;" */
19502 - asm volatile(
19503 - ".byte 0x0f,0x01,0xc9;"
19504 - : :"a" (eax), "c" (ecx));
19505 + /* "mwait %eax, %ecx;" */
19506 + asm volatile(".byte 0x0f, 0x01, 0xc9;"
19507 + :: "a" (eax), "c" (ecx));
19508 }
19509
19510 static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
19511 {
19512 - /* "mwait %eax,%ecx;" */
19513 - asm volatile(
19514 - "sti; .byte 0x0f,0x01,0xc9;"
19515 - : :"a" (eax), "c" (ecx));
19516 + trace_hardirqs_on();
19517 + /* "mwait %eax, %ecx;" */
19518 + asm volatile("sti; .byte 0x0f, 0x01, 0xc9;"
19519 + :: "a" (eax), "c" (ecx));
19520 }
19521
19522 extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
19523
19524 -extern int force_mwait;
19525 +extern int force_mwait;
19526
19527 extern void select_idle_routine(const struct cpuinfo_x86 *c);
19528
19529 -extern unsigned long boot_option_idle_override;
19530 +extern unsigned long boot_option_idle_override;
19531
19532 extern void enable_sep_cpu(void);
19533 extern int sysenter_setup(void);
19534
19535 /* Defined in head.S */
19536 -extern struct desc_ptr early_gdt_descr;
19537 +extern struct desc_ptr early_gdt_descr;
19538
19539 extern void cpu_set_gdt(int);
19540 extern void switch_to_new_gdt(void);
19541 extern void cpu_init(void);
19542 extern void init_gdt(int cpu);
19543
19544 -/* from system description table in BIOS. Mostly for MCA use, but
19545 - * others may find it useful. */
19546 -extern unsigned int machine_id;
19547 -extern unsigned int machine_submodel_id;
19548 -extern unsigned int BIOS_revision;
19549 +static inline void update_debugctlmsr(unsigned long debugctlmsr)
19550 +{
19551 +#ifndef CONFIG_X86_DEBUGCTLMSR
19552 + if (boot_cpu_data.x86 < 6)
19553 + return;
19554 +#endif
19555 + wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr);
19556 +}
19557
19558 -/* Boot loader type from the setup header */
19559 -extern int bootloader_type;
19560 +/*
19561 + * from system description table in BIOS. Mostly for MCA use, but
19562 + * others may find it useful:
19563 + */
19564 +extern unsigned int machine_id;
19565 +extern unsigned int machine_submodel_id;
19566 +extern unsigned int BIOS_revision;
19567 +
19568 +/* Boot loader type from the setup header: */
19569 +extern int bootloader_type;
19570
19571 -extern char ignore_fpu_irq;
19572 -#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
19573 +extern char ignore_fpu_irq;
19574
19575 #define HAVE_ARCH_PICK_MMAP_LAYOUT 1
19576 #define ARCH_HAS_PREFETCHW
19577 #define ARCH_HAS_SPINLOCK_PREFETCH
19578
19579 #ifdef CONFIG_X86_32
19580 -#define BASE_PREFETCH ASM_NOP4
19581 -#define ARCH_HAS_PREFETCH
19582 +# define BASE_PREFETCH ASM_NOP4
19583 +# define ARCH_HAS_PREFETCH
19584 #else
19585 -#define BASE_PREFETCH "prefetcht0 (%1)"
19586 +# define BASE_PREFETCH "prefetcht0 (%1)"
19587 #endif
19588
19589 -/* Prefetch instructions for Pentium III and AMD Athlon */
19590 -/* It's not worth to care about 3dnow! prefetches for the K6
19591 - because they are microcoded there and very slow.
19592 - However we don't do prefetches for pre XP Athlons currently
19593 - That should be fixed. */
19594 +/*
19595 + * Prefetch instructions for Pentium III (+) and AMD Athlon (+)
19596 + *
19597 + * It's not worth to care about 3dnow prefetches for the K6
19598 + * because they are microcoded there and very slow.
19599 + */
19600 static inline void prefetch(const void *x)
19601 {
19602 alternative_input(BASE_PREFETCH,
19603 @@ -649,8 +732,11 @@ static inline void prefetch(const void *
19604 "r" (x));
19605 }
19606
19607 -/* 3dnow! prefetch to get an exclusive cache line. Useful for
19608 - spinlocks to avoid one state transition in the cache coherency protocol. */
19609 +/*
19610 + * 3dnow prefetch to get an exclusive cache line.
19611 + * Useful for spinlocks to avoid one state transition in the
19612 + * cache coherency protocol:
19613 + */
19614 static inline void prefetchw(const void *x)
19615 {
19616 alternative_input(BASE_PREFETCH,
19617 @@ -659,21 +745,25 @@ static inline void prefetchw(const void
19618 "r" (x));
19619 }
19620
19621 -#define spin_lock_prefetch(x) prefetchw(x)
19622 +static inline void spin_lock_prefetch(const void *x)
19623 +{
19624 + prefetchw(x);
19625 +}
19626 +
19627 #ifdef CONFIG_X86_32
19628 /*
19629 * User space process size: 3GB (default).
19630 */
19631 -#define TASK_SIZE (PAGE_OFFSET)
19632 -#define STACK_TOP TASK_SIZE
19633 -#define STACK_TOP_MAX STACK_TOP
19634 -
19635 -#define INIT_THREAD { \
19636 - .sp0 = sizeof(init_stack) + (long)&init_stack, \
19637 - .vm86_info = NULL, \
19638 - .sysenter_cs = __KERNEL_CS, \
19639 - .io_bitmap_ptr = NULL, \
19640 - .fs = __KERNEL_PERCPU, \
19641 +#define TASK_SIZE PAGE_OFFSET
19642 +#define STACK_TOP TASK_SIZE
19643 +#define STACK_TOP_MAX STACK_TOP
19644 +
19645 +#define INIT_THREAD { \
19646 + .sp0 = sizeof(init_stack) + (long)&init_stack, \
19647 + .vm86_info = NULL, \
19648 + .sysenter_cs = __KERNEL_CS, \
19649 + .io_bitmap_ptr = NULL, \
19650 + .fs = __KERNEL_PERCPU, \
19651 }
19652
19653 /*
19654 @@ -682,28 +772,15 @@ static inline void prefetchw(const void
19655 * permission bitmap. The extra byte must be all 1 bits, and must
19656 * be within the limit.
19657 */
19658 -#define INIT_TSS { \
19659 - .x86_tss = { \
19660 +#define INIT_TSS { \
19661 + .x86_tss = { \
19662 .sp0 = sizeof(init_stack) + (long)&init_stack, \
19663 - .ss0 = __KERNEL_DS, \
19664 - .ss1 = __KERNEL_CS, \
19665 - .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \
19666 - }, \
19667 - .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, \
19668 -}
19669 -
19670 -#define start_thread(regs, new_eip, new_esp) do { \
19671 - __asm__("movl %0,%%gs": :"r" (0)); \
19672 - regs->fs = 0; \
19673 - set_fs(USER_DS); \
19674 - regs->ds = __USER_DS; \
19675 - regs->es = __USER_DS; \
19676 - regs->ss = __USER_DS; \
19677 - regs->cs = __USER_CS; \
19678 - regs->ip = new_eip; \
19679 - regs->sp = new_esp; \
19680 -} while (0)
19681 -
19682 + .ss0 = __KERNEL_DS, \
19683 + .ss1 = __KERNEL_CS, \
19684 + .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \
19685 + }, \
19686 + .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, \
19687 +}
19688
19689 extern unsigned long thread_saved_pc(struct task_struct *tsk);
19690
19691 @@ -731,24 +808,24 @@ extern unsigned long thread_saved_pc(str
19692 __regs__ - 1; \
19693 })
19694
19695 -#define KSTK_ESP(task) (task_pt_regs(task)->sp)
19696 +#define KSTK_ESP(task) (task_pt_regs(task)->sp)
19697
19698 #else
19699 /*
19700 * User space process size. 47bits minus one guard page.
19701 */
19702 -#define TASK_SIZE64 (0x800000000000UL - 4096)
19703 +#define TASK_SIZE64 ((1UL << 47) - PAGE_SIZE)
19704
19705 /* This decides where the kernel will search for a free chunk of vm
19706 * space during mmap's.
19707 */
19708 -#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? \
19709 - 0xc0000000 : 0xFFFFe000)
19710 +#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? \
19711 + 0xc0000000 : 0xFFFFe000)
19712
19713 -#define TASK_SIZE (test_thread_flag(TIF_IA32) ? \
19714 - IA32_PAGE_OFFSET : TASK_SIZE64)
19715 -#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? \
19716 - IA32_PAGE_OFFSET : TASK_SIZE64)
19717 +#define TASK_SIZE (test_thread_flag(TIF_IA32) ? \
19718 + IA32_PAGE_OFFSET : TASK_SIZE64)
19719 +#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? \
19720 + IA32_PAGE_OFFSET : TASK_SIZE64)
19721
19722 #define STACK_TOP TASK_SIZE
19723 #define STACK_TOP_MAX TASK_SIZE64
19724 @@ -761,33 +838,32 @@ extern unsigned long thread_saved_pc(str
19725 .x86_tss.sp0 = (unsigned long)&init_stack + sizeof(init_stack) \
19726 }
19727
19728 -#define start_thread(regs, new_rip, new_rsp) do { \
19729 - asm volatile("movl %0,%%fs; movl %0,%%es; movl %0,%%ds": :"r" (0)); \
19730 - load_gs_index(0); \
19731 - (regs)->ip = (new_rip); \
19732 - (regs)->sp = (new_rsp); \
19733 - write_pda(oldrsp, (new_rsp)); \
19734 - (regs)->cs = __USER_CS; \
19735 - (regs)->ss = __USER_DS; \
19736 - (regs)->flags = 0x200; \
19737 - set_fs(USER_DS); \
19738 -} while (0)
19739 -
19740 /*
19741 * Return saved PC of a blocked thread.
19742 * What is this good for? it will be always the scheduler or ret_from_fork.
19743 */
19744 -#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.sp - 8))
19745 +#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.sp - 8))
19746
19747 -#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1)
19748 -#define KSTK_ESP(tsk) -1 /* sorry. doesn't work for syscall. */
19749 +#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1)
19750 +#define KSTK_ESP(tsk) -1 /* sorry. doesn't work for syscall. */
19751 #endif /* CONFIG_X86_64 */
19752
19753 -/* This decides where the kernel will search for a free chunk of vm
19754 +extern void start_thread(struct pt_regs *regs, unsigned long new_ip,
19755 + unsigned long new_sp);
19756 +
19757 +/*
19758 + * This decides where the kernel will search for a free chunk of vm
19759 * space during mmap's.
19760 */
19761 #define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3))
19762
19763 -#define KSTK_EIP(task) (task_pt_regs(task)->ip)
19764 +#define KSTK_EIP(task) (task_pt_regs(task)->ip)
19765 +
19766 +/* Get/set a process' ability to use the timestamp counter instruction */
19767 +#define GET_TSC_CTL(adr) get_tsc_mode((adr))
19768 +#define SET_TSC_CTL(val) set_tsc_mode((val))
19769 +
19770 +extern int get_tsc_mode(unsigned long adr);
19771 +extern int set_tsc_mode(unsigned int val);
19772
19773 #endif
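
Most of the processor.h changes above are re-indentation, but a few are functional: the per-thread FPU state becomes a pointer (union thread_xstate *xstate) allocated from task_xstate_cachep, cpu_relax() becomes a real inline around rep_nop(), and start_thread() turns into a C function. A user-space sketch of the cpuid helper / cpu_relax() pattern as it appears in the header follows; it is a hypothetical test program, not part of the patch, and assumes an x86 build with GCC.

    /* cpuid_demo.c - user-space sketch of the cpuid_*() / cpu_relax() pattern
     * shown in the processor.h hunks above.  Build: gcc -o cpuid_demo cpuid_demo.c
     * (x86/x86-64 only; illustrative only, not part of the patch). */
    #include <stdio.h>

    static inline void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx,
                             unsigned int *ecx, unsigned int *edx)
    {
            /* "=a"/"=b"/"=c"/"=d" bind the four CPUID result registers;
             * "0" feeds op into EAX, "2" zeroes ECX before the instruction. */
            asm volatile("cpuid"
                         : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx)
                         : "0" (op), "2" (0));
    }

    static inline unsigned int cpuid_eax(unsigned int op)
    {
            unsigned int eax, ebx, ecx, edx;

            cpuid(op, &eax, &ebx, &ecx, &edx);

            return eax;
    }

    static inline void cpu_relax(void)
    {
            /* REP NOP (PAUSE), exactly as in rep_nop()/cpu_relax() above */
            asm volatile("rep; nop" ::: "memory");
    }

    int main(void)
    {
            printf("max basic cpuid leaf: %#x\n", cpuid_eax(0));
            cpu_relax();
            return 0;
    }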
19774 --- a/include/asm-x86/mach-xen/asm/segment.h
19775 +++ b/include/asm-x86/mach-xen/asm/segment.h
19776 @@ -191,13 +191,14 @@
19777 #define SEGMENT_TI_MASK 0x4
19778
19779 #define IDT_ENTRIES 256
19780 +#define NUM_EXCEPTION_VECTORS 32
19781 #define GDT_SIZE (GDT_ENTRIES * 8)
19782 #define GDT_ENTRY_TLS_ENTRIES 3
19783 #define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8)
19784
19785 #ifdef __KERNEL__
19786 #ifndef __ASSEMBLY__
19787 -extern const char early_idt_handlers[IDT_ENTRIES][10];
19788 +extern const char early_idt_handlers[NUM_EXCEPTION_VECTORS][10];
19789 #endif
19790 #endif
19791
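
The segment.h hunk above only has to cover the 32 architecturally defined exception vectors (0..31) with early IDT stubs instead of all 256 IDT entries; the 10-byte stub width comes from the array's second dimension. A throwaway arithmetic check, as a hypothetical test program rather than anything in the patch:

    /* idt_size_demo.c - size saved by shrinking early_idt_handlers[] from
     * IDT_ENTRIES to NUM_EXCEPTION_VECTORS entries (illustrative only). */
    #include <stdio.h>

    #define IDT_ENTRIES             256
    #define NUM_EXCEPTION_VECTORS    32     /* vectors 0..31 are CPU exceptions */
    #define EARLY_IDT_STUB_BYTES     10     /* second dimension of the array above */

    int main(void)
    {
            printf("old table: %d bytes\n", IDT_ENTRIES * EARLY_IDT_STUB_BYTES);
            printf("new table: %d bytes\n", NUM_EXCEPTION_VECTORS * EARLY_IDT_STUB_BYTES);
            return 0;
    }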
19792 --- a/include/asm-x86/mach-xen/asm/smp_32.h
19793 +++ /dev/null
19794 @@ -1,178 +0,0 @@
19795 -#ifndef __ASM_SMP_H
19796 -#define __ASM_SMP_H
19797 -
19798 -#ifndef __ASSEMBLY__
19799 -#include <linux/cpumask.h>
19800 -#include <linux/init.h>
19801 -
19802 -/*
19803 - * We need the APIC definitions automatically as part of 'smp.h'
19804 - */
19805 -#ifdef CONFIG_X86_LOCAL_APIC
19806 -# include <asm/mpspec.h>
19807 -# include <asm/apic.h>
19808 -# ifdef CONFIG_X86_IO_APIC
19809 -# include <asm/io_apic.h>
19810 -# endif
19811 -#endif
19812 -
19813 -#define cpu_callout_map cpu_possible_map
19814 -#define cpu_callin_map cpu_possible_map
19815 -
19816 -extern int smp_num_siblings;
19817 -extern unsigned int num_processors;
19818 -
19819 -extern void smp_alloc_memory(void);
19820 -extern void lock_ipi_call_lock(void);
19821 -extern void unlock_ipi_call_lock(void);
19822 -
19823 -extern void (*mtrr_hook) (void);
19824 -extern void zap_low_mappings (void);
19825 -
19826 -DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
19827 -DECLARE_PER_CPU(cpumask_t, cpu_core_map);
19828 -DECLARE_PER_CPU(u8, cpu_llc_id);
19829 -DECLARE_PER_CPU(u8, x86_cpu_to_apicid);
19830 -
19831 -#ifdef CONFIG_HOTPLUG_CPU
19832 -extern void cpu_exit_clear(void);
19833 -extern void cpu_uninit(void);
19834 -#endif
19835 -
19836 -#ifdef CONFIG_SMP
19837 -
19838 -#ifndef CONFIG_XEN
19839 -
19840 -/* Globals due to paravirt */
19841 -extern void set_cpu_sibling_map(int cpu);
19842 -
19843 -struct smp_ops
19844 -{
19845 - void (*smp_prepare_boot_cpu)(void);
19846 - void (*smp_prepare_cpus)(unsigned max_cpus);
19847 - int (*cpu_up)(unsigned cpu);
19848 - void (*smp_cpus_done)(unsigned max_cpus);
19849 -
19850 - void (*smp_send_stop)(void);
19851 - void (*smp_send_reschedule)(int cpu);
19852 - int (*smp_call_function_mask)(cpumask_t mask,
19853 - void (*func)(void *info), void *info,
19854 - int wait);
19855 -};
19856 -
19857 -extern struct smp_ops smp_ops;
19858 -
19859 -static inline void smp_prepare_boot_cpu(void)
19860 -{
19861 - smp_ops.smp_prepare_boot_cpu();
19862 -}
19863 -static inline void smp_prepare_cpus(unsigned int max_cpus)
19864 -{
19865 - smp_ops.smp_prepare_cpus(max_cpus);
19866 -}
19867 -static inline int __cpu_up(unsigned int cpu)
19868 -{
19869 - return smp_ops.cpu_up(cpu);
19870 -}
19871 -static inline void smp_cpus_done(unsigned int max_cpus)
19872 -{
19873 - smp_ops.smp_cpus_done(max_cpus);
19874 -}
19875 -
19876 -static inline void smp_send_stop(void)
19877 -{
19878 - smp_ops.smp_send_stop();
19879 -}
19880 -static inline void smp_send_reschedule(int cpu)
19881 -{
19882 - smp_ops.smp_send_reschedule(cpu);
19883 -}
19884 -static inline int smp_call_function_mask(cpumask_t mask,
19885 - void (*func) (void *info), void *info,
19886 - int wait)
19887 -{
19888 - return smp_ops.smp_call_function_mask(mask, func, info, wait);
19889 -}
19890 -
19891 -void native_smp_prepare_boot_cpu(void);
19892 -void native_smp_prepare_cpus(unsigned int max_cpus);
19893 -int native_cpu_up(unsigned int cpunum);
19894 -void native_smp_cpus_done(unsigned int max_cpus);
19895 -
19896 -#ifndef CONFIG_PARAVIRT
19897 -#define startup_ipi_hook(phys_apicid, start_eip, start_esp) do { } while (0)
19898 -#endif
19899 -
19900 -#else /* CONFIG_XEN */
19901 -
19902 -void xen_smp_send_stop(void);
19903 -void xen_smp_send_reschedule(int cpu);
19904 -int xen_smp_call_function_mask(cpumask_t mask,
19905 - void (*func) (void *info), void *info,
19906 - int wait);
19907 -
19908 -#define smp_send_stop xen_smp_send_stop
19909 -#define smp_send_reschedule xen_smp_send_reschedule
19910 -#define smp_call_function_mask xen_smp_call_function_mask
19911 -
19912 -extern void prefill_possible_map(void);
19913 -
19914 -#endif /* CONFIG_XEN */
19915 -
19916 -extern int __cpu_disable(void);
19917 -extern void __cpu_die(unsigned int cpu);
19918 -
19919 -/*
19920 - * This function is needed by all SMP systems. It must _always_ be valid
19921 - * from the initial startup. We map APIC_BASE very early in page_setup(),
19922 - * so this is correct in the x86 case.
19923 - */
19924 -DECLARE_PER_CPU(int, cpu_number);
19925 -#define raw_smp_processor_id() (x86_read_percpu(cpu_number))
19926 -
19927 -#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
19928 -
19929 -#define safe_smp_processor_id() smp_processor_id()
19930 -
19931 -/* We don't mark CPUs online until __cpu_up(), so we need another measure */
19932 -static inline int num_booting_cpus(void)
19933 -{
19934 - return cpus_weight(cpu_callout_map);
19935 -}
19936 -
19937 -#else /* CONFIG_SMP */
19938 -
19939 -#define safe_smp_processor_id() 0
19940 -#define cpu_physical_id(cpu) boot_cpu_physical_apicid
19941 -
19942 -#endif /* !CONFIG_SMP */
19943 -
19944 -#ifdef CONFIG_X86_LOCAL_APIC
19945 -
19946 -static __inline int logical_smp_processor_id(void)
19947 -{
19948 - /* we don't want to mark this access volatile - bad code generation */
19949 - return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR));
19950 -}
19951 -
19952 -# ifdef APIC_DEFINITION
19953 -extern int hard_smp_processor_id(void);
19954 -# else
19955 -# include <mach_apicdef.h>
19956 -static inline int hard_smp_processor_id(void)
19957 -{
19958 - /* we don't want to mark this access volatile - bad code generation */
19959 - return GET_APIC_ID(*(u32 *)(APIC_BASE + APIC_ID));
19960 -}
19961 -# endif /* APIC_DEFINITION */
19962 -
19963 -#else /* CONFIG_X86_LOCAL_APIC */
19964 -
19965 -# ifndef CONFIG_SMP
19966 -# define hard_smp_processor_id() 0
19967 -# endif
19968 -
19969 -#endif /* CONFIG_X86_LOCAL_APIC */
19970 -
19971 -#endif /* !ASSEMBLY */
19972 -#endif
19973 --- a/include/asm-x86/mach-xen/asm/smp_64.h
19974 +++ /dev/null
19975 @@ -1,103 +0,0 @@
19976 -#ifndef __ASM_SMP_H
19977 -#define __ASM_SMP_H
19978 -
19979 -#include <linux/cpumask.h>
19980 -#include <linux/init.h>
19981 -
19982 -#ifdef CONFIG_X86_LOCAL_APIC
19983 -/*
19984 - * We need the APIC definitions automatically as part of 'smp.h'
19985 - */
19986 -#include <asm/apic.h>
19987 -#ifdef CONFIG_X86_IO_APIC
19988 -#include <asm/io_apic.h>
19989 -#endif
19990 -#include <asm/mpspec.h>
19991 -#endif
19992 -#include <asm/pda.h>
19993 -#include <asm/thread_info.h>
19994 -
19995 -extern cpumask_t cpu_initialized;
19996 -
19997 -extern int smp_num_siblings;
19998 -extern unsigned int num_processors;
19999 -
20000 -extern void smp_alloc_memory(void);
20001 -extern void lock_ipi_call_lock(void);
20002 -extern void unlock_ipi_call_lock(void);
20003 -
20004 -extern int smp_call_function_mask(cpumask_t mask, void (*func)(void *),
20005 - void *info, int wait);
20006 -
20007 -DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
20008 -DECLARE_PER_CPU(cpumask_t, cpu_core_map);
20009 -DECLARE_PER_CPU(u16, cpu_llc_id);
20010 -DECLARE_PER_CPU(u16, x86_cpu_to_apicid);
20011 -DECLARE_PER_CPU(u16, x86_bios_cpu_apicid);
20012 -
20013 -#ifdef CONFIG_X86_LOCAL_APIC
20014 -static inline int cpu_present_to_apicid(int mps_cpu)
20015 -{
20016 - if (cpu_present(mps_cpu))
20017 - return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu);
20018 - else
20019 - return BAD_APICID;
20020 -}
20021 -#endif
20022 -
20023 -#ifdef CONFIG_SMP
20024 -
20025 -#define SMP_TRAMPOLINE_BASE 0x6000
20026 -
20027 -extern int __cpu_disable(void);
20028 -extern void __cpu_die(unsigned int cpu);
20029 -extern void prefill_possible_map(void);
20030 -extern unsigned __cpuinitdata disabled_cpus;
20031 -
20032 -#define raw_smp_processor_id() read_pda(cpunumber)
20033 -#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
20034 -
20035 -#define stack_smp_processor_id() \
20036 - ({ \
20037 - struct thread_info *ti; \
20038 - __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \
20039 - ti->cpu; \
20040 -})
20041 -
20042 -/*
20043 - * On x86 all CPUs are mapped 1:1 to the APIC space. This simplifies
20044 - * scheduling and IPI sending and compresses data structures.
20045 - */
20046 -static inline int num_booting_cpus(void)
20047 -{
20048 - return cpus_weight(cpu_possible_map);
20049 -}
20050 -
20051 -extern void smp_send_reschedule(int cpu);
20052 -
20053 -#else /* CONFIG_SMP */
20054 -
20055 -extern unsigned int boot_cpu_id;
20056 -#define cpu_physical_id(cpu) boot_cpu_id
20057 -#define stack_smp_processor_id() 0
20058 -
20059 -#endif /* !CONFIG_SMP */
20060 -
20061 -#define safe_smp_processor_id() smp_processor_id()
20062 -
20063 -#ifdef CONFIG_X86_LOCAL_APIC
20064 -static __inline int logical_smp_processor_id(void)
20065 -{
20066 - /* we don't want to mark this access volatile - bad code generation */
20067 - return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR));
20068 -}
20069 -
20070 -static inline int hard_smp_processor_id(void)
20071 -{
20072 - /* we don't want to mark this access volatile - bad code generation */
20073 - return GET_APIC_ID(*(u32 *)(APIC_BASE + APIC_ID));
20074 -}
20075 -#endif
20076 -
20077 -#endif
20078 -
20079 --- a/include/asm-x86/mach-xen/asm/smp.h
20080 +++ b/include/asm-x86/mach-xen/asm/smp.h
20081 @@ -1,5 +1,227 @@
20082 -#ifdef CONFIG_X86_32
20083 -# include "smp_32.h"
20084 +#ifndef _ASM_X86_SMP_H_
20085 +#define _ASM_X86_SMP_H_
20086 +#ifndef __ASSEMBLY__
20087 +#include <linux/cpumask.h>
20088 +#include <linux/init.h>
20089 +#include <asm/percpu.h>
20090 +
20091 +/*
20092 + * We need the APIC definitions automatically as part of 'smp.h'
20093 + */
20094 +#ifdef CONFIG_X86_LOCAL_APIC
20095 +# include <asm/mpspec.h>
20096 +# include <asm/apic.h>
20097 +# ifdef CONFIG_X86_IO_APIC
20098 +# include <asm/io_apic.h>
20099 +# endif
20100 +#endif
20101 +#include <asm/pda.h>
20102 +#include <asm/thread_info.h>
20103 +
20104 +#define cpu_callout_map cpu_possible_map
20105 +extern cpumask_t cpu_initialized;
20106 +#define cpu_callin_map cpu_possible_map
20107 +
20108 +extern void (*mtrr_hook)(void);
20109 +extern void zap_low_mappings(void);
20110 +
20111 +extern int smp_num_siblings;
20112 +extern unsigned int num_processors;
20113 +extern cpumask_t cpu_initialized;
20114 +
20115 +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
20116 +extern u16 x86_cpu_to_apicid_init[];
20117 +extern u16 x86_bios_cpu_apicid_init[];
20118 +extern void *x86_cpu_to_apicid_early_ptr;
20119 +extern void *x86_bios_cpu_apicid_early_ptr;
20120 #else
20121 -# include "smp_64.h"
20122 +#define x86_cpu_to_apicid_early_ptr NULL
20123 +#define x86_bios_cpu_apicid_early_ptr NULL
20124 +#endif
20125 +
20126 +DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
20127 +DECLARE_PER_CPU(cpumask_t, cpu_core_map);
20128 +DECLARE_PER_CPU(u16, cpu_llc_id);
20129 +DECLARE_PER_CPU(u16, x86_cpu_to_apicid);
20130 +DECLARE_PER_CPU(u16, x86_bios_cpu_apicid);
20131 +
20132 +#ifdef CONFIG_SMP
20133 +
20134 +#ifndef CONFIG_XEN
20135 +
20136 +/* Static state in head.S used to set up a CPU */
20137 +extern struct {
20138 + void *sp;
20139 + unsigned short ss;
20140 +} stack_start;
20141 +
20142 +struct smp_ops {
20143 + void (*smp_prepare_boot_cpu)(void);
20144 + void (*smp_prepare_cpus)(unsigned max_cpus);
20145 + int (*cpu_up)(unsigned cpu);
20146 + void (*smp_cpus_done)(unsigned max_cpus);
20147 +
20148 + void (*smp_send_stop)(void);
20149 + void (*smp_send_reschedule)(int cpu);
20150 + int (*smp_call_function_mask)(cpumask_t mask,
20151 + void (*func)(void *info), void *info,
20152 + int wait);
20153 +};
20154 +
20155 +/* Globals due to paravirt */
20156 +extern void set_cpu_sibling_map(int cpu);
20157 +
20158 +#ifndef CONFIG_PARAVIRT
20159 +#define startup_ipi_hook(phys_apicid, start_eip, start_esp) do { } while (0)
20160 +#endif
20161 +extern struct smp_ops smp_ops;
20162 +
20163 +static inline void smp_send_stop(void)
20164 +{
20165 + smp_ops.smp_send_stop();
20166 +}
20167 +
20168 +static inline void smp_prepare_boot_cpu(void)
20169 +{
20170 + smp_ops.smp_prepare_boot_cpu();
20171 +}
20172 +
20173 +static inline void smp_prepare_cpus(unsigned int max_cpus)
20174 +{
20175 + smp_ops.smp_prepare_cpus(max_cpus);
20176 +}
20177 +
20178 +static inline void smp_cpus_done(unsigned int max_cpus)
20179 +{
20180 + smp_ops.smp_cpus_done(max_cpus);
20181 +}
20182 +
20183 +static inline int __cpu_up(unsigned int cpu)
20184 +{
20185 + return smp_ops.cpu_up(cpu);
20186 +}
20187 +
20188 +static inline void smp_send_reschedule(int cpu)
20189 +{
20190 + smp_ops.smp_send_reschedule(cpu);
20191 +}
20192 +
20193 +static inline int smp_call_function_mask(cpumask_t mask,
20194 + void (*func) (void *info), void *info,
20195 + int wait)
20196 +{
20197 + return smp_ops.smp_call_function_mask(mask, func, info, wait);
20198 +}
20199 +
20200 +void native_smp_prepare_boot_cpu(void);
20201 +void native_smp_prepare_cpus(unsigned int max_cpus);
20202 +void native_smp_cpus_done(unsigned int max_cpus);
20203 +int native_cpu_up(unsigned int cpunum);
20204 +
20205 +#else /* CONFIG_XEN */
20206 +
20207 +void xen_smp_send_stop(void);
20208 +void xen_smp_send_reschedule(int cpu);
20209 +int xen_smp_call_function_mask(cpumask_t mask,
20210 + void (*func) (void *info), void *info,
20211 + int wait);
20212 +
20213 +#define smp_send_stop xen_smp_send_stop
20214 +#define smp_send_reschedule xen_smp_send_reschedule
20215 +#define smp_call_function_mask xen_smp_call_function_mask
20216 +
20217 +extern void prefill_possible_map(void);
20218 +
20219 +#endif /* CONFIG_XEN */
20220 +
20221 +extern int __cpu_disable(void);
20222 +extern void __cpu_die(unsigned int cpu);
20223 +
20224 +extern void prefill_possible_map(void);
20225 +
20226 +void smp_store_cpu_info(int id);
20227 +#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
20228 +
20229 +/* We don't mark CPUs online until __cpu_up(), so we need another measure */
20230 +static inline int num_booting_cpus(void)
20231 +{
20232 + return cpus_weight(cpu_callout_map);
20233 +}
20234 +#endif /* CONFIG_SMP */
20235 +
20236 +extern unsigned disabled_cpus __cpuinitdata;
20237 +
20238 +#ifdef CONFIG_X86_32_SMP
20239 +/*
20240 + * This function is needed by all SMP systems. It must _always_ be valid
20241 + * from the initial startup. We map APIC_BASE very early in page_setup(),
20242 + * so this is correct in the x86 case.
20243 + */
20244 +DECLARE_PER_CPU(int, cpu_number);
20245 +#define raw_smp_processor_id() (x86_read_percpu(cpu_number))
20246 +#define safe_smp_processor_id() smp_processor_id()
20247 +
20248 +#elif defined(CONFIG_X86_64_SMP)
20249 +#define raw_smp_processor_id() read_pda(cpunumber)
20250 +
20251 +#define stack_smp_processor_id() \
20252 +({ \
20253 + struct thread_info *ti; \
20254 + __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \
20255 + ti->cpu; \
20256 +})
20257 +#define safe_smp_processor_id() smp_processor_id()
20258 +
20259 +#else /* !CONFIG_X86_32_SMP && !CONFIG_X86_64_SMP */
20260 +#define cpu_physical_id(cpu) boot_cpu_physical_apicid
20261 +#define safe_smp_processor_id() 0
20262 +#define stack_smp_processor_id() 0
20263 +#endif
20264 +
20265 +#ifdef CONFIG_X86_LOCAL_APIC
20266 +
20267 +static inline int logical_smp_processor_id(void)
20268 +{
20269 + /* we don't want to mark this access volatile - bad code generation */
20270 + return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR));
20271 +}
20272 +
20273 +#ifndef CONFIG_X86_64
20274 +static inline unsigned int read_apic_id(void)
20275 +{
20276 + return *(u32 *)(APIC_BASE + APIC_ID);
20277 +}
20278 +#else
20279 +extern unsigned int read_apic_id(void);
20280 +#endif
20281 +
20282 +
20283 +# ifdef APIC_DEFINITION
20284 +extern int hard_smp_processor_id(void);
20285 +# else
20286 +# include <mach_apicdef.h>
20287 +static inline int hard_smp_processor_id(void)
20288 +{
20289 + /* we don't want to mark this access volatile - bad code generation */
20290 + return GET_APIC_ID(read_apic_id());
20291 +}
20292 +# endif /* APIC_DEFINITION */
20293 +
20294 +#else /* CONFIG_X86_LOCAL_APIC */
20295 +
20296 +# ifndef CONFIG_SMP
20297 +# define hard_smp_processor_id() 0
20298 +# endif
20299 +
20300 +#endif /* CONFIG_X86_LOCAL_APIC */
20301 +
20302 +#ifdef CONFIG_HOTPLUG_CPU
20303 +extern void cpu_exit_clear(void);
20304 +extern void cpu_uninit(void);
20305 +#endif
20306 +
20307 +extern void smp_alloc_memory(void);
20308 +extern void lock_ipi_call_lock(void);
20309 +extern void unlock_ipi_call_lock(void);
20310 +#endif /* __ASSEMBLY__ */
20311 #endif
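
The unified smp.h above keeps the native smp_ops function-pointer table and, under CONFIG_XEN, maps smp_send_stop()/smp_send_reschedule()/smp_call_function_mask() directly to the xen_* implementations. The standalone sketch below only illustrates that ops-table indirection; the backend names are made up and it is not kernel code.

    /* smp_ops_demo.c - standalone sketch of the function-pointer "ops" pattern
     * used by struct smp_ops in the unified smp.h above (illustrative only). */
    #include <stdio.h>

    struct smp_ops {
            void (*smp_send_stop)(void);
            void (*smp_send_reschedule)(int cpu);
    };

    /* "native" backend - hypothetical stand-ins for the real implementations */
    static void native_send_stop(void)          { puts("native: stop CPUs"); }
    static void native_send_reschedule(int cpu) { printf("native: kick CPU %d\n", cpu); }

    static struct smp_ops smp_ops = {
            .smp_send_stop       = native_send_stop,
            .smp_send_reschedule = native_send_reschedule,
    };

    /* thin wrappers, mirroring the static inlines in the header */
    static inline void smp_send_stop(void)          { smp_ops.smp_send_stop(); }
    static inline void smp_send_reschedule(int cpu) { smp_ops.smp_send_reschedule(cpu); }

    int main(void)
    {
            smp_send_reschedule(1);
            smp_send_stop();
            return 0;
    }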
20312 --- a/include/asm-x86/mach-xen/asm/spinlock.h
20313 +++ b/include/asm-x86/mach-xen/asm/spinlock.h
20314 @@ -88,7 +88,7 @@ extern void xen_spin_kick(raw_spinlock_t
20315 : "memory", "cc")
20316
20317
20318 -static inline int __raw_spin_trylock(raw_spinlock_t *lock)
20319 +static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock)
20320 {
20321 int tmp, new;
20322
20323 @@ -107,7 +107,7 @@ static inline int __raw_spin_trylock(raw
20324 return tmp;
20325 }
20326
20327 -static inline void __raw_spin_unlock(raw_spinlock_t *lock)
20328 +static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock)
20329 {
20330 unsigned int token;
20331 unsigned char kick;
20332 @@ -155,7 +155,7 @@ static inline void __raw_spin_unlock(raw
20333 : "memory", "cc"); \
20334 } while (0)
20335
20336 -static inline int __raw_spin_trylock(raw_spinlock_t *lock)
20337 +static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock)
20338 {
20339 int tmp;
20340 int new;
20341 @@ -177,7 +177,7 @@ static inline int __raw_spin_trylock(raw
20342 return tmp;
20343 }
20344
20345 -static inline void __raw_spin_unlock(raw_spinlock_t *lock)
20346 +static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock)
20347 {
20348 unsigned int token, tmp;
20349 bool kick;
20350 @@ -197,19 +197,19 @@ static inline void __raw_spin_unlock(raw
20351
20352 static inline int __raw_spin_is_locked(raw_spinlock_t *lock)
20353 {
20354 - int tmp = *(volatile signed int *)(&(lock)->slock);
20355 + int tmp = ACCESS_ONCE(lock->slock);
20356
20357 return !!(((tmp >> TICKET_SHIFT) ^ tmp) & ((1 << TICKET_SHIFT) - 1));
20358 }
20359
20360 static inline int __raw_spin_is_contended(raw_spinlock_t *lock)
20361 {
20362 - int tmp = *(volatile signed int *)(&(lock)->slock);
20363 + int tmp = ACCESS_ONCE(lock->slock);
20364
20365 return (((tmp >> TICKET_SHIFT) - tmp) & ((1 << TICKET_SHIFT) - 1)) > 1;
20366 }
20367
20368 -static inline void __raw_spin_lock(raw_spinlock_t *lock)
20369 +static __always_inline void __raw_spin_lock(raw_spinlock_t *lock)
20370 {
20371 unsigned int token, count;
20372 bool free;
20373 @@ -223,8 +223,8 @@ static inline void __raw_spin_lock(raw_s
20374 } while (unlikely(!count) && !xen_spin_wait(lock, token));
20375 }
20376
20377 -static inline void __raw_spin_lock_flags(raw_spinlock_t *lock,
20378 - unsigned long flags)
20379 +static __always_inline void __raw_spin_lock_flags(raw_spinlock_t *lock,
20380 + unsigned long flags)
20381 {
20382 unsigned int token, count;
20383 bool free;
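
The spinlock.h hunks above replace the open-coded volatile casts with ACCESS_ONCE() and force the fast-path helpers __always_inline; the locked/contended tests themselves stay the usual ticket-lock checks. Below is a standalone sketch of those two checks; the 8-bit ticket layout and the TICKET_SHIFT value are assumptions, loosely modelled on the 2.6.26 ticket lock, and the program is illustrative only.

    /* ticket_demo.c - standalone sketch of the ticket-lock state checks that the
     * spinlock.h hunk rewrites to use ACCESS_ONCE() (field layout assumed). */
    #include <stdio.h>

    #define TICKET_SHIFT 8
    #define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))

    typedef struct {
            int slock;      /* low byte: ticket now served, next byte: next free ticket */
    } raw_spinlock_t;

    static inline int __raw_spin_is_locked(raw_spinlock_t *lock)
    {
            int tmp = ACCESS_ONCE(lock->slock);

            /* locked while the "next" and "serving" tickets differ */
            return !!(((tmp >> TICKET_SHIFT) ^ tmp) & ((1 << TICKET_SHIFT) - 1));
    }

    static inline int __raw_spin_is_contended(raw_spinlock_t *lock)
    {
            int tmp = ACCESS_ONCE(lock->slock);

            /* contended when more than one ticket separates them */
            return (((tmp >> TICKET_SHIFT) - tmp) & ((1 << TICKET_SHIFT) - 1)) > 1;
    }

    int main(void)
    {
            raw_spinlock_t unlocked  = { .slock = 0x0303 };  /* next == serving   */
            raw_spinlock_t locked    = { .slock = 0x0403 };  /* one holder        */
            raw_spinlock_t contended = { .slock = 0x0603 };  /* several waiters   */

            printf("unlocked:  locked=%d contended=%d\n",
                   __raw_spin_is_locked(&unlocked), __raw_spin_is_contended(&unlocked));
            printf("locked:    locked=%d contended=%d\n",
                   __raw_spin_is_locked(&locked), __raw_spin_is_contended(&locked));
            printf("contended: locked=%d contended=%d\n",
                   __raw_spin_is_locked(&contended), __raw_spin_is_contended(&contended));
            return 0;
    }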
20384 --- a/include/asm-x86/mach-xen/asm/swiotlb_32.h
20385 +++ /dev/null
20386 @@ -1,43 +0,0 @@
20387 -#ifndef _ASM_SWIOTLB_H
20388 -#define _ASM_SWIOTLB_H 1
20389 -
20390 -/* SWIOTLB interface */
20391 -
20392 -extern dma_addr_t swiotlb_map_single(struct device *hwdev, void *ptr, size_t size,
20393 - int dir);
20394 -extern void swiotlb_unmap_single(struct device *hwdev, dma_addr_t dev_addr,
20395 - size_t size, int dir);
20396 -extern void swiotlb_sync_single_for_cpu(struct device *hwdev,
20397 - dma_addr_t dev_addr,
20398 - size_t size, int dir);
20399 -extern void swiotlb_sync_single_for_device(struct device *hwdev,
20400 - dma_addr_t dev_addr,
20401 - size_t size, int dir);
20402 -extern void swiotlb_sync_sg_for_cpu(struct device *hwdev,
20403 - struct scatterlist *sg, int nelems,
20404 - int dir);
20405 -extern void swiotlb_sync_sg_for_device(struct device *hwdev,
20406 - struct scatterlist *sg, int nelems,
20407 - int dir);
20408 -extern int swiotlb_map_sg(struct device *hwdev, struct scatterlist *sg,
20409 - int nents, int direction);
20410 -extern void swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sg,
20411 - int nents, int direction);
20412 -extern int swiotlb_dma_mapping_error(dma_addr_t dma_addr);
20413 -#ifdef CONFIG_HIGHMEM
20414 -extern dma_addr_t swiotlb_map_page(struct device *hwdev, struct page *page,
20415 - unsigned long offset, size_t size,
20416 - enum dma_data_direction direction);
20417 -extern void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dma_address,
20418 - size_t size, enum dma_data_direction direction);
20419 -#endif
20420 -extern int swiotlb_dma_supported(struct device *hwdev, u64 mask);
20421 -extern void swiotlb_init(void);
20422 -
20423 -#ifdef CONFIG_SWIOTLB
20424 -extern int swiotlb;
20425 -#else
20426 -#define swiotlb 0
20427 -#endif
20428 -
20429 -#endif
20430 --- a/include/asm-x86/mach-xen/asm/swiotlb.h
20431 +++ b/include/asm-x86/mach-xen/asm/swiotlb.h
20432 @@ -1,5 +1,8 @@
20433 -#ifdef CONFIG_X86_32
20434 -# include "swiotlb_32.h"
20435 -#else
20436 -# include "../../swiotlb.h"
20437 -#endif
20438 +#ifndef _ASM_SWIOTLB_H
20439 +
20440 +#include "../../swiotlb.h"
20441 +
20442 +dma_addr_t swiotlb_map_single_phys(struct device *, phys_addr_t, size_t size,
20443 + int dir);
20444 +
20445 +#endif /* _ASM_SWIOTLB_H */
20446 --- a/include/asm-x86/mach-xen/asm/system.h
20447 +++ b/include/asm-x86/mach-xen/asm/system.h
20448 @@ -28,22 +28,44 @@ struct task_struct *__switch_to(struct t
20449 * Saving eflags is important. It switches not only IOPL between tasks,
20450 * it also protects other tasks from NT leaking through sysenter etc.
20451 */
20452 -#define switch_to(prev, next, last) do { \
20453 - unsigned long esi, edi; \
20454 - asm volatile("pushfl\n\t" /* Save flags */ \
20455 - "pushl %%ebp\n\t" \
20456 - "movl %%esp,%0\n\t" /* save ESP */ \
20457 - "movl %5,%%esp\n\t" /* restore ESP */ \
20458 - "movl $1f,%1\n\t" /* save EIP */ \
20459 - "pushl %6\n\t" /* restore EIP */ \
20460 - "jmp __switch_to\n" \
20461 +#define switch_to(prev, next, last) \
20462 +do { \
20463 + /* \
20464 + * Context-switching clobbers all registers, so we clobber \
20465 + * them explicitly, via unused output variables. \
20466 + * (EAX and EBP is not listed because EBP is saved/restored \
20467 + * explicitly for wchan access and EAX is the return value of \
20468 + * __switch_to()) \
20469 + */ \
20470 + unsigned long ebx, ecx, edx, esi, edi; \
20471 + \
20472 + asm volatile("pushfl\n\t" /* save flags */ \
20473 + "pushl %%ebp\n\t" /* save EBP */ \
20474 + "movl %%esp,%[prev_sp]\n\t" /* save ESP */ \
20475 + "movl %[next_sp],%%esp\n\t" /* restore ESP */ \
20476 + "movl $1f,%[prev_ip]\n\t" /* save EIP */ \
20477 + "pushl %[next_ip]\n\t" /* restore EIP */ \
20478 + "jmp __switch_to\n" /* regparm call */ \
20479 "1:\t" \
20480 - "popl %%ebp\n\t" \
20481 - "popfl" \
20482 - :"=m" (prev->thread.sp), "=m" (prev->thread.ip), \
20483 - "=a" (last), "=S" (esi), "=D" (edi) \
20484 - :"m" (next->thread.sp), "m" (next->thread.ip), \
20485 - "2" (prev), "d" (next)); \
20486 + "popl %%ebp\n\t" /* restore EBP */ \
20487 + "popfl\n" /* restore flags */ \
20488 + \
20489 + /* output parameters */ \
20490 + : [prev_sp] "=m" (prev->thread.sp), \
20491 + [prev_ip] "=m" (prev->thread.ip), \
20492 + "=a" (last), \
20493 + \
20494 + /* clobbered output registers: */ \
20495 + "=b" (ebx), "=c" (ecx), "=d" (edx), \
20496 + "=S" (esi), "=D" (edi) \
20497 + \
20498 + /* input parameters: */ \
20499 + : [next_sp] "m" (next->thread.sp), \
20500 + [next_ip] "m" (next->thread.ip), \
20501 + \
20502 + /* regparm parameters for __switch_to(): */ \
20503 + [prev] "a" (prev), \
20504 + [next] "d" (next)); \
20505 } while (0)
20506
20507 /*
20508 @@ -123,30 +145,29 @@ extern void load_gs_index(unsigned);
20509 */
20510 #define loadsegment(seg, value) \
20511 asm volatile("\n" \
20512 - "1:\t" \
20513 - "movl %k0,%%" #seg "\n" \
20514 - "2:\n" \
20515 - ".section .fixup,\"ax\"\n" \
20516 - "3:\t" \
20517 - "movl %k1, %%" #seg "\n\t" \
20518 - "jmp 2b\n" \
20519 - ".previous\n" \
20520 - _ASM_EXTABLE(1b,3b) \
20521 - : :"r" (value), "r" (0))
20522 + "1:\t" \
20523 + "movl %k0,%%" #seg "\n" \
20524 + "2:\n" \
20525 + ".section .fixup,\"ax\"\n" \
20526 + "3:\t" \
20527 + "movl %k1, %%" #seg "\n\t" \
20528 + "jmp 2b\n" \
20529 + ".previous\n" \
20530 + _ASM_EXTABLE(1b,3b) \
20531 + : :"r" (value), "r" (0))
20532
20533
20534 /*
20535 * Save a segment register away
20536 */
20537 -#define savesegment(seg, value) \
20538 +#define savesegment(seg, value) \
20539 asm volatile("mov %%" #seg ",%0":"=rm" (value))
20540
20541 static inline unsigned long get_limit(unsigned long segment)
20542 {
20543 unsigned long __limit;
20544 - __asm__("lsll %1,%0"
20545 - :"=r" (__limit):"r" (segment));
20546 - return __limit+1;
20547 + asm("lsll %1,%0" : "=r" (__limit) : "r" (segment));
20548 + return __limit + 1;
20549 }
20550
20551 static inline void xen_clts(void)
20552 @@ -171,13 +192,13 @@ static unsigned long __force_order;
20553 static inline unsigned long xen_read_cr0(void)
20554 {
20555 unsigned long val;
20556 - asm volatile("mov %%cr0,%0\n\t" :"=r" (val), "=m" (__force_order));
20557 + asm volatile("mov %%cr0,%0\n\t" : "=r" (val), "=m" (__force_order));
20558 return val;
20559 }
20560
20561 static inline void xen_write_cr0(unsigned long val)
20562 {
20563 - asm volatile("mov %0,%%cr0": :"r" (val), "m" (__force_order));
20564 + asm volatile("mov %0,%%cr0": : "r" (val), "m" (__force_order));
20565 }
20566
20567 #define xen_read_cr2() (current_vcpu_info()->arch.cr2)
20568 @@ -186,7 +207,7 @@ static inline void xen_write_cr0(unsigne
20569 static inline unsigned long xen_read_cr3(void)
20570 {
20571 unsigned long val;
20572 - asm volatile("mov %%cr3,%0\n\t" :"=r" (val), "=m" (__force_order));
20573 + asm volatile("mov %%cr3,%0\n\t" : "=r" (val), "=m" (__force_order));
20574 #ifdef CONFIG_X86_32
20575 return mfn_to_pfn(xen_cr3_to_pfn(val)) << PAGE_SHIFT;
20576 #else
20577 @@ -201,13 +222,13 @@ static inline void xen_write_cr3(unsigne
20578 #else
20579 val = phys_to_machine(val);
20580 #endif
20581 - asm volatile("mov %0,%%cr3": :"r" (val), "m" (__force_order));
20582 + asm volatile("mov %0,%%cr3": : "r" (val), "m" (__force_order));
20583 }
20584
20585 static inline unsigned long xen_read_cr4(void)
20586 {
20587 unsigned long val;
20588 - asm volatile("mov %%cr4,%0\n\t" :"=r" (val), "=m" (__force_order));
20589 + asm volatile("mov %%cr4,%0\n\t" : "=r" (val), "=m" (__force_order));
20590 return val;
20591 }
20592
20593 @@ -215,7 +236,7 @@ static inline unsigned long xen_read_cr4
20594
20595 static inline void xen_write_cr4(unsigned long val)
20596 {
20597 - asm volatile("mov %0,%%cr4": :"r" (val), "m" (__force_order));
20598 + asm volatile("mov %0,%%cr4": : "r" (val), "m" (__force_order));
20599 }
20600
20601 #ifdef CONFIG_X86_64
20602 @@ -234,6 +255,7 @@ static inline void xen_wbinvd(void)
20603 {
20604 asm volatile("wbinvd": : :"memory");
20605 }
20606 +
20607 #define read_cr0() (xen_read_cr0())
20608 #define write_cr0(x) (xen_write_cr0(x))
20609 #define read_cr2() (xen_read_cr2())
20610 @@ -260,7 +282,7 @@ static inline void clflush(volatile void
20611 asm volatile("clflush %0" : "+m" (*(volatile char __force *)__p));
20612 }
20613
20614 -#define nop() __asm__ __volatile__ ("nop")
20615 +#define nop() asm volatile ("nop")
20616
20617 void disable_hlt(void);
20618 void enable_hlt(void);
20619 @@ -280,16 +302,7 @@ void default_idle(void);
20620 */
20621 #ifdef CONFIG_X86_32
20622 /*
20623 - * For now, "wmb()" doesn't actually do anything, as all
20624 - * Intel CPU's follow what Intel calls a *Processor Order*,
20625 - * in which all writes are seen in the program order even
20626 - * outside the CPU.
20627 - *
20628 - * I expect future Intel CPU's to have a weaker ordering,
20629 - * but I'd also expect them to finally get their act together
20630 - * and add some real memory barriers if so.
20631 - *
20632 - * Some non intel clones support out of order store. wmb() ceases to be a
20633 + * Some non-Intel clones support out of order store. wmb() ceases to be a
20634 * nop for these.
20635 */
20636 #define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2)
20637 @@ -368,7 +381,7 @@ void default_idle(void);
20638 # define smp_wmb() barrier()
20639 #endif
20640 #define smp_read_barrier_depends() read_barrier_depends()
20641 -#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
20642 +#define set_mb(var, value) do { (void)xchg(&var, value); } while (0)
20643 #else
20644 #define smp_mb() barrier()
20645 #define smp_rmb() barrier()
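
The rewritten switch_to() above performs the same save/restore sequence as before; the visible change is GCC's named asm operands ([prev_sp], [next_ip], ...) plus explicitly clobbered registers instead of positional %0..%6 references. A minimal standalone illustration of named operands follows; it is a hypothetical x86-only example, unrelated to the actual context-switch code.

    /* named_asm_demo.c - standalone illustration of the named asm operands
     * ("[prev_sp]", "[next_ip]", ...) that the switch_to() rewrite above uses. */
    #include <stdio.h>

    static unsigned long named_operand_add(unsigned long a, unsigned long b)
    {
            unsigned long sum;

            /* add the two inputs via LEA, using symbolic names instead of %0/%1 */
            asm("lea (%[x],%[y]), %[out]"
                : [out] "=r" (sum)
                : [x] "r" (a), [y] "r" (b));

            return sum;
    }

    int main(void)
    {
            printf("40 + 2 = %lu\n", named_operand_add(40, 2));
            return 0;
    }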
20646 --- a/include/asm-x86/mach-xen/asm/tlbflush.h
20647 +++ b/include/asm-x86/mach-xen/asm/tlbflush.h
20648 @@ -86,8 +86,7 @@ static inline void flush_tlb_range(struc
20649 #define TLBSTATE_LAZY 2
20650
20651 #ifdef CONFIG_X86_32
20652 -struct tlb_state
20653 -{
20654 +struct tlb_state {
20655 struct mm_struct *active_mm;
20656 int state;
20657 char __cacheline_padding[L1_CACHE_BYTES-8];
20658 --- a/include/asm-x86/mach-xen/asm/vga.h
20659 +++ b/include/asm-x86/mach-xen/asm/vga.h
20660 @@ -12,9 +12,9 @@
20661 * access the videoram directly without any black magic.
20662 */
20663
20664 -#define VGA_MAP_MEM(x,s) (unsigned long)isa_bus_to_virt(x)
20665 +#define VGA_MAP_MEM(x, s) (unsigned long)isa_bus_to_virt(x)
20666
20667 #define vga_readb(x) (*(x))
20668 -#define vga_writeb(x,y) (*(y) = (x))
20669 +#define vga_writeb(x, y) (*(y) = (x))
20670
20671 #endif
20672 --- a/include/asm-x86/mach-xen/asm/xor_64.h
20673 +++ b/include/asm-x86/mach-xen/asm/xor_64.h
20674 @@ -1,20 +1,23 @@
20675 /*
20676 - * x86-64 changes / gcc fixes from Andi Kleen.
20677 + * x86-64 changes / gcc fixes from Andi Kleen.
20678 * Copyright 2002 Andi Kleen, SuSE Labs.
20679 *
20680 * This hasn't been optimized for the hammer yet, but there are likely
20681 * no advantages to be gotten from x86-64 here anyways.
20682 */
20683
20684 -typedef struct { unsigned long a,b; } __attribute__((aligned(16))) xmm_store_t;
20685 +typedef struct {
20686 + unsigned long a, b;
20687 +} __attribute__((aligned(16))) xmm_store_t;
20688
20689 -/* Doesn't use gcc to save the XMM registers, because there is no easy way to
20690 +/* Doesn't use gcc to save the XMM registers, because there is no easy way to
20691 tell it to do a clts before the register saving. */
20692 -#define XMMS_SAVE do { \
20693 +#define XMMS_SAVE \
20694 +do { \
20695 preempt_disable(); \
20696 if (!(current_thread_info()->status & TS_USEDFPU)) \
20697 clts(); \
20698 - __asm__ __volatile__ ( \
20699 + asm volatile( \
20700 "movups %%xmm0,(%1) ;\n\t" \
20701 "movups %%xmm1,0x10(%1) ;\n\t" \
20702 "movups %%xmm2,0x20(%1) ;\n\t" \
20703 @@ -22,10 +25,11 @@ typedef struct { unsigned long a,b; } __
20704 : "=&r" (cr0) \
20705 : "r" (xmm_save) \
20706 : "memory"); \
20707 -} while(0)
20708 +} while (0)
20709
20710 -#define XMMS_RESTORE do { \
20711 - asm volatile ( \
20712 +#define XMMS_RESTORE \
20713 +do { \
20714 + asm volatile( \
20715 "sfence ;\n\t" \
20716 "movups (%1),%%xmm0 ;\n\t" \
20717 "movups 0x10(%1),%%xmm1 ;\n\t" \
20718 @@ -37,72 +41,72 @@ typedef struct { unsigned long a,b; } __
20719 if (!(current_thread_info()->status & TS_USEDFPU)) \
20720 stts(); \
20721 preempt_enable(); \
20722 -} while(0)
20723 +} while (0)
20724
20725 #define OFFS(x) "16*("#x")"
20726 #define PF_OFFS(x) "256+16*("#x")"
20727 #define PF0(x) " prefetchnta "PF_OFFS(x)"(%[p1]) ;\n"
20728 -#define LD(x,y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n"
20729 -#define ST(x,y) " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n"
20730 +#define LD(x, y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n"
20731 +#define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n"
20732 #define PF1(x) " prefetchnta "PF_OFFS(x)"(%[p2]) ;\n"
20733 #define PF2(x) " prefetchnta "PF_OFFS(x)"(%[p3]) ;\n"
20734 #define PF3(x) " prefetchnta "PF_OFFS(x)"(%[p4]) ;\n"
20735 #define PF4(x) " prefetchnta "PF_OFFS(x)"(%[p5]) ;\n"
20736 #define PF5(x) " prefetchnta "PF_OFFS(x)"(%[p6]) ;\n"
20737 -#define XO1(x,y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n"
20738 -#define XO2(x,y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n"
20739 -#define XO3(x,y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n"
20740 -#define XO4(x,y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n"
20741 -#define XO5(x,y) " xorps "OFFS(x)"(%[p6]), %%xmm"#y" ;\n"
20742 +#define XO1(x, y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n"
20743 +#define XO2(x, y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n"
20744 +#define XO3(x, y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n"
20745 +#define XO4(x, y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n"
20746 +#define XO5(x, y) " xorps "OFFS(x)"(%[p6]), %%xmm"#y" ;\n"
20747
20748
20749 static void
20750 xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
20751 {
20752 - unsigned int lines = bytes >> 8;
20753 + unsigned int lines = bytes >> 8;
20754 unsigned long cr0;
20755 xmm_store_t xmm_save[4];
20756
20757 XMMS_SAVE;
20758
20759 - asm volatile (
20760 + asm volatile(
20761 #undef BLOCK
20762 #define BLOCK(i) \
20763 - LD(i,0) \
20764 - LD(i+1,1) \
20765 + LD(i, 0) \
20766 + LD(i + 1, 1) \
20767 PF1(i) \
20768 - PF1(i+2) \
20769 - LD(i+2,2) \
20770 - LD(i+3,3) \
20771 - PF0(i+4) \
20772 - PF0(i+6) \
20773 - XO1(i,0) \
20774 - XO1(i+1,1) \
20775 - XO1(i+2,2) \
20776 - XO1(i+3,3) \
20777 - ST(i,0) \
20778 - ST(i+1,1) \
20779 - ST(i+2,2) \
20780 - ST(i+3,3) \
20781 + PF1(i + 2) \
20782 + LD(i + 2, 2) \
20783 + LD(i + 3, 3) \
20784 + PF0(i + 4) \
20785 + PF0(i + 6) \
20786 + XO1(i, 0) \
20787 + XO1(i + 1, 1) \
20788 + XO1(i + 2, 2) \
20789 + XO1(i + 3, 3) \
20790 + ST(i, 0) \
20791 + ST(i + 1, 1) \
20792 + ST(i + 2, 2) \
20793 + ST(i + 3, 3) \
20794
20795
20796 PF0(0)
20797 PF0(2)
20798
20799 " .align 32 ;\n"
20800 - " 1: ;\n"
20801 + " 1: ;\n"
20802
20803 BLOCK(0)
20804 BLOCK(4)
20805 BLOCK(8)
20806 BLOCK(12)
20807
20808 - " addq %[inc], %[p1] ;\n"
20809 - " addq %[inc], %[p2] ;\n"
20810 + " addq %[inc], %[p1] ;\n"
20811 + " addq %[inc], %[p2] ;\n"
20812 " decl %[cnt] ; jnz 1b"
20813 : [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines)
20814 - : [inc] "r" (256UL)
20815 - : "memory");
20816 + : [inc] "r" (256UL)
20817 + : "memory");
20818
20819 XMMS_RESTORE;
20820 }
20821 @@ -117,52 +121,52 @@ xor_sse_3(unsigned long bytes, unsigned
20822
20823 XMMS_SAVE;
20824
20825 - __asm__ __volatile__ (
20826 + asm volatile(
20827 #undef BLOCK
20828 #define BLOCK(i) \
20829 PF1(i) \
20830 - PF1(i+2) \
20831 - LD(i,0) \
20832 - LD(i+1,1) \
20833 - LD(i+2,2) \
20834 - LD(i+3,3) \
20835 + PF1(i + 2) \
20836 + LD(i, 0) \
20837 + LD(i + 1, 1) \
20838 + LD(i + 2, 2) \
20839 + LD(i + 3, 3) \
20840 PF2(i) \
20841 - PF2(i+2) \
20842 - PF0(i+4) \
20843 - PF0(i+6) \
20844 - XO1(i,0) \
20845 - XO1(i+1,1) \
20846 - XO1(i+2,2) \
20847 - XO1(i+3,3) \
20848 - XO2(i,0) \
20849 - XO2(i+1,1) \
20850 - XO2(i+2,2) \
20851 - XO2(i+3,3) \
20852 - ST(i,0) \
20853 - ST(i+1,1) \
20854 - ST(i+2,2) \
20855 - ST(i+3,3) \
20856 + PF2(i + 2) \
20857 + PF0(i + 4) \
20858 + PF0(i + 6) \
20859 + XO1(i, 0) \
20860 + XO1(i + 1, 1) \
20861 + XO1(i + 2, 2) \
20862 + XO1(i + 3, 3) \
20863 + XO2(i, 0) \
20864 + XO2(i + 1, 1) \
20865 + XO2(i + 2, 2) \
20866 + XO2(i + 3, 3) \
20867 + ST(i, 0) \
20868 + ST(i + 1, 1) \
20869 + ST(i + 2, 2) \
20870 + ST(i + 3, 3) \
20871
20872
20873 PF0(0)
20874 PF0(2)
20875
20876 " .align 32 ;\n"
20877 - " 1: ;\n"
20878 + " 1: ;\n"
20879
20880 BLOCK(0)
20881 BLOCK(4)
20882 BLOCK(8)
20883 BLOCK(12)
20884
20885 - " addq %[inc], %[p1] ;\n"
20886 - " addq %[inc], %[p2] ;\n"
20887 - " addq %[inc], %[p3] ;\n"
20888 + " addq %[inc], %[p1] ;\n"
20889 + " addq %[inc], %[p2] ;\n"
20890 + " addq %[inc], %[p3] ;\n"
20891 " decl %[cnt] ; jnz 1b"
20892 : [cnt] "+r" (lines),
20893 [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
20894 : [inc] "r" (256UL)
20895 - : "memory");
20896 + : "memory");
20897 XMMS_RESTORE;
20898 }
20899
20900 @@ -171,64 +175,64 @@ xor_sse_4(unsigned long bytes, unsigned
20901 unsigned long *p3, unsigned long *p4)
20902 {
20903 unsigned int lines = bytes >> 8;
20904 - xmm_store_t xmm_save[4];
20905 + xmm_store_t xmm_save[4];
20906 unsigned long cr0;
20907
20908 XMMS_SAVE;
20909
20910 - __asm__ __volatile__ (
20911 + asm volatile(
20912 #undef BLOCK
20913 #define BLOCK(i) \
20914 PF1(i) \
20915 - PF1(i+2) \
20916 - LD(i,0) \
20917 - LD(i+1,1) \
20918 - LD(i+2,2) \
20919 - LD(i+3,3) \
20920 + PF1(i + 2) \
20921 + LD(i, 0) \
20922 + LD(i + 1, 1) \
20923 + LD(i + 2, 2) \
20924 + LD(i + 3, 3) \
20925 PF2(i) \
20926 - PF2(i+2) \
20927 - XO1(i,0) \
20928 - XO1(i+1,1) \
20929 - XO1(i+2,2) \
20930 - XO1(i+3,3) \
20931 + PF2(i + 2) \
20932 + XO1(i, 0) \
20933 + XO1(i + 1, 1) \
20934 + XO1(i + 2, 2) \
20935 + XO1(i + 3, 3) \
20936 PF3(i) \
20937 - PF3(i+2) \
20938 - PF0(i+4) \
20939 - PF0(i+6) \
20940 - XO2(i,0) \
20941 - XO2(i+1,1) \
20942 - XO2(i+2,2) \
20943 - XO2(i+3,3) \
20944 - XO3(i,0) \
20945 - XO3(i+1,1) \
20946 - XO3(i+2,2) \
20947 - XO3(i+3,3) \
20948 - ST(i,0) \
20949 - ST(i+1,1) \
20950 - ST(i+2,2) \
20951 - ST(i+3,3) \
20952 + PF3(i + 2) \
20953 + PF0(i + 4) \
20954 + PF0(i + 6) \
20955 + XO2(i, 0) \
20956 + XO2(i + 1, 1) \
20957 + XO2(i + 2, 2) \
20958 + XO2(i + 3, 3) \
20959 + XO3(i, 0) \
20960 + XO3(i + 1, 1) \
20961 + XO3(i + 2, 2) \
20962 + XO3(i + 3, 3) \
20963 + ST(i, 0) \
20964 + ST(i + 1, 1) \
20965 + ST(i + 2, 2) \
20966 + ST(i + 3, 3) \
20967
20968
20969 PF0(0)
20970 PF0(2)
20971
20972 " .align 32 ;\n"
20973 - " 1: ;\n"
20974 + " 1: ;\n"
20975
20976 BLOCK(0)
20977 BLOCK(4)
20978 BLOCK(8)
20979 BLOCK(12)
20980
20981 - " addq %[inc], %[p1] ;\n"
20982 - " addq %[inc], %[p2] ;\n"
20983 - " addq %[inc], %[p3] ;\n"
20984 - " addq %[inc], %[p4] ;\n"
20985 + " addq %[inc], %[p1] ;\n"
20986 + " addq %[inc], %[p2] ;\n"
20987 + " addq %[inc], %[p3] ;\n"
20988 + " addq %[inc], %[p4] ;\n"
20989 " decl %[cnt] ; jnz 1b"
20990 : [cnt] "+c" (lines),
20991 [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
20992 : [inc] "r" (256UL)
20993 - : "memory" );
20994 + : "memory" );
20995
20996 XMMS_RESTORE;
20997 }
20998 @@ -237,70 +241,70 @@ static void
20999 xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
21000 unsigned long *p3, unsigned long *p4, unsigned long *p5)
21001 {
21002 - unsigned int lines = bytes >> 8;
21003 + unsigned int lines = bytes >> 8;
21004 xmm_store_t xmm_save[4];
21005 unsigned long cr0;
21006
21007 XMMS_SAVE;
21008
21009 - __asm__ __volatile__ (
21010 + asm volatile(
21011 #undef BLOCK
21012 #define BLOCK(i) \
21013 PF1(i) \
21014 - PF1(i+2) \
21015 - LD(i,0) \
21016 - LD(i+1,1) \
21017 - LD(i+2,2) \
21018 - LD(i+3,3) \
21019 + PF1(i + 2) \
21020 + LD(i, 0) \
21021 + LD(i + 1, 1) \
21022 + LD(i + 2, 2) \
21023 + LD(i + 3, 3) \
21024 PF2(i) \
21025 - PF2(i+2) \
21026 - XO1(i,0) \
21027 - XO1(i+1,1) \
21028 - XO1(i+2,2) \
21029 - XO1(i+3,3) \
21030 + PF2(i + 2) \
21031 + XO1(i, 0) \
21032 + XO1(i + 1, 1) \
21033 + XO1(i + 2, 2) \
21034 + XO1(i + 3, 3) \
21035 PF3(i) \
21036 - PF3(i+2) \
21037 - XO2(i,0) \
21038 - XO2(i+1,1) \
21039 - XO2(i+2,2) \
21040 - XO2(i+3,3) \
21041 + PF3(i + 2) \
21042 + XO2(i, 0) \
21043 + XO2(i + 1, 1) \
21044 + XO2(i + 2, 2) \
21045 + XO2(i + 3, 3) \
21046 PF4(i) \
21047 - PF4(i+2) \
21048 - PF0(i+4) \
21049 - PF0(i+6) \
21050 - XO3(i,0) \
21051 - XO3(i+1,1) \
21052 - XO3(i+2,2) \
21053 - XO3(i+3,3) \
21054 - XO4(i,0) \
21055 - XO4(i+1,1) \
21056 - XO4(i+2,2) \
21057 - XO4(i+3,3) \
21058 - ST(i,0) \
21059 - ST(i+1,1) \
21060 - ST(i+2,2) \
21061 - ST(i+3,3) \
21062 + PF4(i + 2) \
21063 + PF0(i + 4) \
21064 + PF0(i + 6) \
21065 + XO3(i, 0) \
21066 + XO3(i + 1, 1) \
21067 + XO3(i + 2, 2) \
21068 + XO3(i + 3, 3) \
21069 + XO4(i, 0) \
21070 + XO4(i + 1, 1) \
21071 + XO4(i + 2, 2) \
21072 + XO4(i + 3, 3) \
21073 + ST(i, 0) \
21074 + ST(i + 1, 1) \
21075 + ST(i + 2, 2) \
21076 + ST(i + 3, 3) \
21077
21078
21079 PF0(0)
21080 PF0(2)
21081
21082 " .align 32 ;\n"
21083 - " 1: ;\n"
21084 + " 1: ;\n"
21085
21086 BLOCK(0)
21087 BLOCK(4)
21088 BLOCK(8)
21089 BLOCK(12)
21090
21091 - " addq %[inc], %[p1] ;\n"
21092 - " addq %[inc], %[p2] ;\n"
21093 - " addq %[inc], %[p3] ;\n"
21094 - " addq %[inc], %[p4] ;\n"
21095 - " addq %[inc], %[p5] ;\n"
21096 + " addq %[inc], %[p1] ;\n"
21097 + " addq %[inc], %[p2] ;\n"
21098 + " addq %[inc], %[p3] ;\n"
21099 + " addq %[inc], %[p4] ;\n"
21100 + " addq %[inc], %[p5] ;\n"
21101 " decl %[cnt] ; jnz 1b"
21102 : [cnt] "+c" (lines),
21103 - [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4),
21104 + [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4),
21105 [p5] "+r" (p5)
21106 : [inc] "r" (256UL)
21107 : "memory");
21108 @@ -309,18 +313,18 @@ xor_sse_5(unsigned long bytes, unsigned
21109 }
21110
21111 static struct xor_block_template xor_block_sse = {
21112 - .name = "generic_sse",
21113 - .do_2 = xor_sse_2,
21114 - .do_3 = xor_sse_3,
21115 - .do_4 = xor_sse_4,
21116 - .do_5 = xor_sse_5,
21117 + .name = "generic_sse",
21118 + .do_2 = xor_sse_2,
21119 + .do_3 = xor_sse_3,
21120 + .do_4 = xor_sse_4,
21121 + .do_5 = xor_sse_5,
21122 };
21123
21124 #undef XOR_TRY_TEMPLATES
21125 -#define XOR_TRY_TEMPLATES \
21126 - do { \
21127 - xor_speed(&xor_block_sse); \
21128 - } while (0)
21129 +#define XOR_TRY_TEMPLATES \
21130 +do { \
21131 + xor_speed(&xor_block_sse); \
21132 +} while (0)
21133
21134 /* We force the use of the SSE xor block because it can write around L2.
21135 We may also be able to load into the L1 only depending on how the cpu
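The assembly above walks the buffers in 256-byte chunks (lines = bytes >> 8, [inc] = 256UL): each BLOCK loads four 16-byte xmm registers from p1, XORs in the corresponding data from the other source buffer(s), and stores the result back to p1, with prefetches running ahead of the stream. Purely as an illustration — not part of the patch — here is a minimal plain-C model of what xor_sse_2() computes; the name xor_ref_2 and the demo in main() are invented for the sketch, and bytes is assumed to be a multiple of 256 as in the kernel callers.

#include <stddef.h>
#include <stdio.h>

/* Reference (non-SSE) equivalent of xor_sse_2: XOR 'bytes' bytes of p2
 * into p1, processed in 256-byte "lines" just like the assembly. */
static void xor_ref_2(unsigned long bytes, unsigned long *p1,
                      const unsigned long *p2)
{
        unsigned int lines = bytes >> 8;              /* 256 bytes per line */
        size_t words = 256 / sizeof(unsigned long);   /* words per line */

        while (lines--) {
                for (size_t i = 0; i < words; i++)
                        p1[i] ^= p2[i];
                p1 += words;
                p2 += words;
        }
}

int main(void)
{
        unsigned long a[64], b[64];       /* 512 bytes each on LP64 */

        for (int i = 0; i < 64; i++) {
                a[i] = (unsigned long)i;
                b[i] = 0xffUL;
        }
        xor_ref_2(sizeof(a), a, b);
        printf("a[0] = %#lx, a[63] = %#lx\n", a[0], a[63]); /* 0xff, 0xc0 */
        return 0;
}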
21136 --- a/include/asm-x86/scatterlist.h
21137 +++ b/include/asm-x86/scatterlist.h
21138 @@ -24,7 +24,7 @@ struct scatterlist {
21139 * returns.
21140 */
21141 #define sg_dma_address(sg) ((sg)->dma_address)
21142 -#ifdef CONFIG_X86_32
21143 +#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
21144 # define sg_dma_len(sg) ((sg)->length)
21145 #else
21146 # define sg_dma_len(sg) ((sg)->dma_length)
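With this change a 32-bit Xen kernel behaves like the 64-bit case: sg_dma_len() reads the separate dma_length field, presumably because swiotlb bounce-buffering is in play even for 32-bit Xen and the DMA-visible length can differ from the CPU-side length. A toy standalone model of the two macro variants (the struct is cut down to just the fields the macros touch; the _native/_xen names are invented for the demo):

#include <stdio.h>

/* Cut-down scatterlist with only the fields sg_dma_len() uses. */
struct scatterlist {
        unsigned int length;       /* CPU-side length */
        unsigned int dma_length;   /* length after DMA mapping */
};

/* Plain 32-bit variant vs. the Xen/64-bit variant from the patch. */
#define sg_dma_len_native(sg) ((sg)->length)
#define sg_dma_len_xen(sg)    ((sg)->dma_length)

int main(void)
{
        struct scatterlist sg = { .length = 4096, .dma_length = 2048 };

        printf("native: %u, xen: %u\n",
               sg_dma_len_native(&sg), sg_dma_len_xen(&sg));
        return 0;
}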
21147 --- a/include/linux/page-flags.h
21148 +++ b/include/linux/page-flags.h
21149 @@ -276,18 +276,25 @@ static inline void SetPageUptodate(struc
21150
21151 CLEARPAGEFLAG(Uptodate, uptodate)
21152
21153 -#define PageForeign(page) test_bit(PG_foreign, &(page)->flags)
21154 -#define SetPageForeign(_page, dtor) do { \
21155 - set_bit(PG_foreign, &(_page)->flags); \
21156 - BUG_ON((dtor) == (void (*)(struct page *, unsigned int))0); \
21157 - (_page)->index = (long)(dtor); \
21158 -} while (0)
21159 -#define ClearPageForeign(page) do { \
21160 - clear_bit(PG_foreign, &(page)->flags); \
21161 - (page)->index = 0; \
21162 -} while (0)
21163 -#define PageForeignDestructor(_page, order) \
21164 - ((void (*)(struct page *, unsigned int))(_page)->index)(_page, order)
21165 +#ifdef CONFIG_XEN
21166 +TESTPAGEFLAG(Foreign, foreign)
21167 +static inline void SetPageForeign(struct page *page,
21168 + void (*dtor)(struct page *, unsigned int))
21169 +{
21170 + BUG_ON(!dtor);
21171 + set_bit(PG_foreign, &page->flags);
21172 + page->index = (long)dtor;
21173 +}
21174 +static inline void ClearPageForeign(struct page *page)
21175 +{
21176 + clear_bit(PG_foreign, &page->flags);
21177 + page->index = 0;
21178 +}
21179 +static inline void PageForeignDestructor(struct page *page, unsigned int order)
21180 +{
21181 + ((void (*)(struct page *, unsigned int))page->index)(page, order);
21182 +}
21183 +#endif
21184
21185 extern void cancel_dirty_page(struct page *page, unsigned int account_size);
21186
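The rewrite above replaces the PageForeign macros with typed inline functions but keeps the same trick: the destructor pointer is stashed in page->index and called back through a cast when the foreign page is released. A standalone sketch of just that mechanism follows; the two-field struct page, the bit value, and the demo_* names are invented for the example, and the function-pointer/long round trip mirrors what the kernel code itself does rather than anything ISO C guarantees.

#include <assert.h>
#include <stdio.h>

/* Minimal stand-in for struct page: a flag word plus the index field
 * that the Xen code reuses to hold the destructor pointer. */
struct page {
        unsigned long flags;
        long index;
};

#define PG_foreign (1UL << 20)    /* arbitrary bit chosen for the demo */

static void demo_set_foreign(struct page *page,
                             void (*dtor)(struct page *, unsigned int))
{
        assert(dtor != NULL);     /* mirrors BUG_ON(!dtor) */
        page->flags |= PG_foreign;
        page->index = (long)dtor; /* destructor lives in ->index */
}

static void demo_foreign_destructor(struct page *page, unsigned int order)
{
        /* Call back through the pointer stored by demo_set_foreign(). */
        ((void (*)(struct page *, unsigned int))page->index)(page, order);
}

static void demo_dtor(struct page *page, unsigned int order)
{
        printf("releasing foreign page %p, order %u\n", (void *)page, order);
}

int main(void)
{
        struct page pg = { 0, 0 };

        demo_set_foreign(&pg, demo_dtor);
        if (pg.flags & PG_foreign)
                demo_foreign_destructor(&pg, 0);
        return 0;
}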
21187 --- a/include/xen/balloon.h
21188 +++ b/include/xen/balloon.h
21189 @@ -31,9 +31,12 @@
21190 * IN THE SOFTWARE.
21191 */
21192
21193 -#ifndef __ASM_BALLOON_H__
21194 -#define __ASM_BALLOON_H__
21195 +#ifndef __XEN_BALLOON_H__
21196 +#define __XEN_BALLOON_H__
21197
21198 +#include <linux/spinlock.h>
21199 +
21200 +#if !defined(CONFIG_PARAVIRT_XEN) || defined(HAVE_XEN_PLATFORM_COMPAT_H)
21201 /*
21202 * Inform the balloon driver that it should allow some slop for device-driver
21203 * memory activities.
21204 @@ -53,5 +56,6 @@ void balloon_release_driver_page(struct
21205 extern spinlock_t balloon_lock;
21206 #define balloon_lock(__flags) spin_lock_irqsave(&balloon_lock, __flags)
21207 #define balloon_unlock(__flags) spin_unlock_irqrestore(&balloon_lock, __flags)
21208 +#endif
21209
21210 -#endif /* __ASM_BALLOON_H__ */
21211 +#endif /* __XEN_BALLOON_H__ */
21212 --- a/include/xen/interface/grant_table.h
21213 +++ b/include/xen/interface/grant_table.h
21214 @@ -193,6 +193,7 @@ struct gnttab_map_grant_ref {
21215 grant_handle_t handle;
21216 uint64_t dev_bus_addr;
21217 };
21218 +DEFINE_XEN_GUEST_HANDLE_STRUCT(gnttab_map_grant_ref);
21219 typedef struct gnttab_map_grant_ref gnttab_map_grant_ref_t;
21220 DEFINE_XEN_GUEST_HANDLE(gnttab_map_grant_ref_t);
21221
21222 @@ -216,6 +217,7 @@ struct gnttab_unmap_grant_ref {
21223 /* OUT parameters. */
21224 int16_t status; /* GNTST_* */
21225 };
21226 +DEFINE_XEN_GUEST_HANDLE_STRUCT(gnttab_unmap_grant_ref);
21227 typedef struct gnttab_unmap_grant_ref gnttab_unmap_grant_ref_t;
21228 DEFINE_XEN_GUEST_HANDLE(gnttab_unmap_grant_ref_t);
21229
21230 @@ -237,6 +239,7 @@ struct gnttab_setup_table {
21231 int16_t status; /* GNTST_* */
21232 XEN_GUEST_HANDLE(ulong) frame_list;
21233 };
21234 +DEFINE_XEN_GUEST_HANDLE_STRUCT(gnttab_setup_table);
21235 typedef struct gnttab_setup_table gnttab_setup_table_t;
21236 DEFINE_XEN_GUEST_HANDLE(gnttab_setup_table_t);
21237
21238 @@ -251,6 +254,7 @@ struct gnttab_dump_table {
21239 /* OUT parameters. */
21240 int16_t status; /* GNTST_* */
21241 };
21242 +DEFINE_XEN_GUEST_HANDLE_STRUCT(gnttab_dump_table);
21243 typedef struct gnttab_dump_table gnttab_dump_table_t;
21244 DEFINE_XEN_GUEST_HANDLE(gnttab_dump_table_t);
21245
21246 @@ -271,6 +275,7 @@ struct gnttab_transfer {
21247 /* OUT parameters. */
21248 int16_t status;
21249 };
21250 +DEFINE_XEN_GUEST_HANDLE_STRUCT(gnttab_transfer);
21251 typedef struct gnttab_transfer gnttab_transfer_t;
21252 DEFINE_XEN_GUEST_HANDLE(gnttab_transfer_t);
21253
21254 @@ -314,6 +319,7 @@ typedef struct gnttab_copy {
21255 /* OUT parameters. */
21256 int16_t status;
21257 } gnttab_copy_t;
21258 +DEFINE_XEN_GUEST_HANDLE_STRUCT(gnttab_copy);
21259 DEFINE_XEN_GUEST_HANDLE(gnttab_copy_t);
21260
21261 /*
21262 @@ -332,6 +338,7 @@ struct gnttab_query_size {
21263 uint32_t max_nr_frames;
21264 int16_t status; /* GNTST_* */
21265 };
21266 +DEFINE_XEN_GUEST_HANDLE_STRUCT(gnttab_query_size);
21267 typedef struct gnttab_query_size gnttab_query_size_t;
21268 DEFINE_XEN_GUEST_HANDLE(gnttab_query_size_t);
21269
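The added DEFINE_XEN_GUEST_HANDLE_STRUCT() lines sit next to the existing *_t handle definitions, presumably so that code written against the mainline paravirt headers (which name handles after the struct) and code written against the classic Xen headers (which use the *_t typedefs) can share this file; the same pattern recurs in the vcpu.h hunks further down. Purely as an illustration of the idea — the macro body below is hypothetical, not copied from the Xen headers — a guest handle can be modelled as a thin wrapper around a pointer into guest memory:

#include <stdio.h>

/* Hypothetical stand-ins for the real Xen macros: a guest handle is
 * modelled here as a struct wrapping a pointer to the payload type. */
#define DEMO_DEFINE_GUEST_HANDLE_STRUCT(name) \
        typedef struct { struct name *p; } demo_guest_handle_##name

struct demo_setup_table {
        unsigned int nr_frames;
};

DEMO_DEFINE_GUEST_HANDLE_STRUCT(demo_setup_table);

#define demo_set_guest_handle(hnd, val) ((hnd).p = (val))

int main(void)
{
        struct demo_setup_table setup = { .nr_frames = 4 };
        demo_guest_handle_demo_setup_table hnd;

        demo_set_guest_handle(hnd, &setup);
        printf("handle refers to a table with %u frames\n", hnd.p->nr_frames);
        return 0;
}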
21270 --- a/include/xen/interface/io/fbif.h
21271 +++ b/include/xen/interface/io/fbif.h
21272 @@ -150,7 +150,12 @@ struct xenfb_page
21273 * framebuffer with a max resolution of 12,800x10,240. Should
21274 * be enough for a while with room leftover for expansion.
21275 */
21276 +#ifndef CONFIG_PARAVIRT_XEN
21277 unsigned long pd[256];
21278 +#else
21279 + /* Two directory pages should be enough for a while. */
21280 + unsigned long pd[2];
21281 +#endif
21282 };
21283
21284 /*
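The comment above sizes pd[256] for a 12,800x10,240 framebuffer, and the arithmetic checks out under the usual assumptions (4 KiB pages, 8-byte page-directory entries, 32 bpp): each directory page maps 512 framebuffer pages, i.e. 2 MiB, so 256 of them map 512 MiB, comfortably above the ~500 MB such a framebuffer needs, while the two-page CONFIG_PARAVIRT_XEN variant maps 4 MiB, which is ample for the small default mode the mainline frontend uses. A throwaway check of those numbers (the 8-byte entry size is an assumption; a 32-bit guest would double the capacities):

#include <stdio.h>

int main(void)
{
        const unsigned long page_size  = 4096;                  /* assumed 4 KiB pages */
        const unsigned long entry_size = sizeof(unsigned long); /* 8 on x86-64 */
        const unsigned long per_dir_page = (page_size / entry_size) * page_size;

        unsigned long classic = 256 * per_dir_page;         /* pd[256] variant */
        unsigned long pvops   =   2 * per_dir_page;         /* pd[2] variant   */
        unsigned long needed  = 12800UL * 10240UL * 4;      /* 32 bpp          */

        printf("pd[256] maps %lu MiB, pd[2] maps %lu MiB, "
               "12800x10240@32bpp needs %lu MiB\n",
               classic >> 20, pvops >> 20, needed >> 20);
        return 0;
}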
21285 --- a/include/xen/interface/memory.h
21286 +++ b/include/xen/interface/memory.h
21287 @@ -62,7 +62,7 @@ struct xen_memory_reservation {
21288 * OUT: GMFN bases of extents that were allocated
21289 * (NB. This command also updates the mach_to_phys translation table)
21290 */
21291 - XEN_GUEST_HANDLE(ulong) extent_start;
21292 + XEN_GUEST_HANDLE(xen_pfn_t) extent_start;
21293
21294 /* Number of extents, and size/alignment of each (2^extent_order pages). */
21295 xen_ulong_t nr_extents;
21296 @@ -82,7 +82,6 @@ struct xen_memory_reservation {
21297 domid_t domid;
21298
21299 };
21300 -DEFINE_XEN_GUEST_HANDLE_STRUCT(xen_memory_reservation);
21301 typedef struct xen_memory_reservation xen_memory_reservation_t;
21302 DEFINE_XEN_GUEST_HANDLE(xen_memory_reservation_t);
21303
21304 @@ -168,7 +167,11 @@ struct xen_machphys_mfn_list {
21305 * any large discontiguities in the machine address space, 2MB gaps in
21306 * the machphys table will be represented by an MFN base of zero.
21307 */
21308 +#ifndef CONFIG_PARAVIRT_XEN
21309 XEN_GUEST_HANDLE(xen_pfn_t) extent_start;
21310 +#else
21311 + ulong extent_start;
21312 +#endif
21313
21314 /*
21315 * Number of extents written to the above array. This will be smaller
21316 @@ -176,7 +179,6 @@ struct xen_machphys_mfn_list {
21317 */
21318 unsigned int nr_extents;
21319 };
21320 -DEFINE_XEN_GUEST_HANDLE_STRUCT(xen_machphys_mfn_list);
21321 typedef struct xen_machphys_mfn_list xen_machphys_mfn_list_t;
21322 DEFINE_XEN_GUEST_HANDLE(xen_machphys_mfn_list_t);
21323
21324 @@ -216,7 +218,6 @@ struct xen_add_to_physmap {
21325 /* GPFN where the source mapping page should appear. */
21326 xen_pfn_t gpfn;
21327 };
21328 -DEFINE_XEN_GUEST_HANDLE_STRUCT(xen_add_to_physmap);
21329 typedef struct xen_add_to_physmap xen_add_to_physmap_t;
21330 DEFINE_XEN_GUEST_HANDLE(xen_add_to_physmap_t);
21331
21332 @@ -249,13 +250,21 @@ struct xen_translate_gpfn_list {
21333 xen_ulong_t nr_gpfns;
21334
21335 /* List of GPFNs to translate. */
21336 +#ifndef CONFIG_PARAVIRT_XEN
21337 XEN_GUEST_HANDLE(xen_pfn_t) gpfn_list;
21338 +#else
21339 + ulong gpfn_list;
21340 +#endif
21341
21342 /*
21343 * Output list to contain MFN translations. May be the same as the input
21344 * list (in which case each input GPFN is overwritten with the output MFN).
21345 */
21346 +#ifndef CONFIG_PARAVIRT_XEN
21347 XEN_GUEST_HANDLE(xen_pfn_t) mfn_list;
21348 +#else
21349 + ulong mfn_list;
21350 +#endif
21351 };
21352 DEFINE_XEN_GUEST_HANDLE_STRUCT(xen_translate_gpfn_list);
21353 typedef struct xen_translate_gpfn_list xen_translate_gpfn_list_t;
21354 --- a/include/xen/interface/vcpu.h
21355 +++ b/include/xen/interface/vcpu.h
21356 @@ -85,6 +85,7 @@ struct vcpu_runstate_info {
21357 */
21358 uint64_t time[4];
21359 };
21360 +DEFINE_XEN_GUEST_HANDLE_STRUCT(vcpu_runstate_info);
21361 typedef struct vcpu_runstate_info vcpu_runstate_info_t;
21362 DEFINE_XEN_GUEST_HANDLE(vcpu_runstate_info_t);
21363
21364 @@ -140,6 +141,7 @@ DEFINE_XEN_GUEST_HANDLE(vcpu_register_ru
21365 struct vcpu_set_periodic_timer {
21366 uint64_t period_ns;
21367 };
21368 +DEFINE_XEN_GUEST_HANDLE_STRUCT(vcpu_set_periodic_timer);
21369 typedef struct vcpu_set_periodic_timer vcpu_set_periodic_timer_t;
21370 DEFINE_XEN_GUEST_HANDLE(vcpu_set_periodic_timer_t);
21371
21372 @@ -153,6 +155,7 @@ struct vcpu_set_singleshot_timer {
21373 uint64_t timeout_abs_ns; /* Absolute system time value in nanoseconds. */
21374 uint32_t flags; /* VCPU_SSHOTTMR_??? */
21375 };
21376 +DEFINE_XEN_GUEST_HANDLE_STRUCT(vcpu_set_singleshot_timer);
21377 typedef struct vcpu_set_singleshot_timer vcpu_set_singleshot_timer_t;
21378 DEFINE_XEN_GUEST_HANDLE(vcpu_set_singleshot_timer_t);
21379
21380 @@ -176,6 +179,7 @@ struct vcpu_register_vcpu_info {
21381 uint32_t offset; /* offset within page */
21382 uint32_t rsvd; /* unused */
21383 };
21384 +DEFINE_XEN_GUEST_HANDLE_STRUCT(vcpu_register_vcpu_info);
21385 typedef struct vcpu_register_vcpu_info vcpu_register_vcpu_info_t;
21386 DEFINE_XEN_GUEST_HANDLE(vcpu_register_vcpu_info_t);
21387
21388 --- a/lib/swiotlb-xen.c
21389 +++ b/lib/swiotlb-xen.c
21390 @@ -20,6 +20,7 @@
21391 #include <linux/ctype.h>
21392 #include <linux/init.h>
21393 #include <linux/bootmem.h>
21394 +#include <linux/iommu-helper.h>
21395 #include <linux/highmem.h>
21396 #include <asm/io.h>
21397 #include <asm/pci.h>
21398 @@ -288,15 +289,6 @@ __sync_single(struct phys_addr buffer, c
21399 }
21400 }
21401
21402 -static inline unsigned int is_span_boundary(unsigned int index,
21403 - unsigned int nslots,
21404 - unsigned long offset_slots,
21405 - unsigned long max_slots)
21406 -{
21407 - unsigned long offset = (offset_slots + index) & (max_slots - 1);
21408 - return offset + nslots > max_slots;
21409 -}
21410 -
21411 /*
21412 * Allocates bounce buffer and returns its kernel virtual address.
21413 */
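Together with the <linux/iommu-helper.h> include added above, this hunk drops swiotlb's private is_span_boundary() in favour of the common iommu_is_span_boundary() helper; the boundary rule is the same one the removed helper implemented: a candidate run of nslots slots, offset into the aperture, must not cross a max_slots boundary. A standalone restatement of the removed logic with a couple of sanity checks (the name span_crosses_boundary is invented; max_slots is assumed to be a power of two, as in the caller):

#include <assert.h>
#include <stdio.h>

/* Same logic as the removed local helper: does a run of 'nslots' slots
 * starting at 'index' (offset by 'offset_slots') cross a 'max_slots'
 * boundary?  max_slots must be a power of two. */
static unsigned int span_crosses_boundary(unsigned int index,
                                          unsigned int nslots,
                                          unsigned long offset_slots,
                                          unsigned long max_slots)
{
        unsigned long offset = (offset_slots + index) & (max_slots - 1);

        return offset + nslots > max_slots;
}

int main(void)
{
        /* With an 8-slot boundary: 3 slots starting at slot 5 fit exactly,
         * 3 slots starting at slot 6 spill over the boundary. */
        assert(!span_crosses_boundary(5, 3, 0, 8));
        assert(span_crosses_boundary(6, 3, 0, 8));
        printf("boundary checks behave as expected\n");
        return 0;
}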
21414 @@ -335,61 +327,53 @@ map_single(struct device *hwdev, struct
21415 * request and allocate a buffer from that IO TLB pool.
21416 */
21417 spin_lock_irqsave(&io_tlb_lock, flags);
21418 - {
21419 - index = ALIGN(io_tlb_index, stride);
21420 - if (index >= iotlb_nslabs)
21421 - index = 0;
21422 - wrap = index;
21423 + index = ALIGN(io_tlb_index, stride);
21424 + if (index >= iotlb_nslabs)
21425 + index = 0;
21426 + wrap = index;
21427
21428 - do {
21429 - while (is_span_boundary(index, nslots, offset_slots,
21430 - max_slots)) {
21431 - index += stride;
21432 - if (index >= iotlb_nslabs)
21433 - index = 0;
21434 - if (index == wrap)
21435 - goto not_found;
21436 - }
21437 + do {
21438 + while (iommu_is_span_boundary(index, nslots, offset_slots,
21439 + max_slots)) {
21440 + index += stride;
21441 + if (index >= iotlb_nslabs)
21442 + index = 0;
21443 + if (index == wrap)
21444 + goto not_found;
21445 + }
21446 +
21447 + /*
21448 + * If we find a slot that indicates we have 'nslots' number of
21449 + * contiguous buffers, we allocate the buffers from that slot
21450 + * and mark the entries as '0' indicating unavailable.
21451 + */
21452 + if (io_tlb_list[index] >= nslots) {
21453 + int count = 0;
21454 +
21455 + for (i = index; i < (int) (index + nslots); i++)
21456 + io_tlb_list[i] = 0;
21457 + for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE - 1) && io_tlb_list[i]; i--)
21458 + io_tlb_list[i] = ++count;
21459 + dma_addr = iotlb_virt_start + (index << IO_TLB_SHIFT);
21460
21461 /*
21462 - * If we find a slot that indicates we have 'nslots'
21463 - * number of contiguous buffers, we allocate the
21464 - * buffers from that slot and mark the entries as '0'
21465 - * indicating unavailable.
21466 + * Update the indices to avoid searching in the next
21467 + * round.
21468 */
21469 - if (io_tlb_list[index] >= nslots) {
21470 - int count = 0;
21471 -
21472 - for (i = index; i < (int)(index + nslots); i++)
21473 - io_tlb_list[i] = 0;
21474 - for (i = index - 1;
21475 - (OFFSET(i, IO_TLB_SEGSIZE) !=
21476 - IO_TLB_SEGSIZE -1) && io_tlb_list[i];
21477 - i--)
21478 - io_tlb_list[i] = ++count;
21479 - dma_addr = iotlb_virt_start +
21480 - (index << IO_TLB_SHIFT);
21481 -
21482 - /*
21483 - * Update the indices to avoid searching in
21484 - * the next round.
21485 - */
21486 - io_tlb_index =
21487 - ((index + nslots) < iotlb_nslabs
21488 - ? (index + nslots) : 0);
21489 + io_tlb_index = ((index + nslots) < iotlb_nslabs
21490 + ? (index + nslots) : 0);
21491
21492 - goto found;
21493 - }
21494 - index += stride;
21495 - if (index >= iotlb_nslabs)
21496 - index = 0;
21497 - } while (index != wrap);
21498 + goto found;
21499 + }
21500 + index += stride;
21501 + if (index >= iotlb_nslabs)
21502 + index = 0;
21503 + } while (index != wrap);
21504
21505 - not_found:
21506 - spin_unlock_irqrestore(&io_tlb_lock, flags);
21507 - return NULL;
21508 - }
21509 - found:
21510 +not_found:
21511 + spin_unlock_irqrestore(&io_tlb_lock, flags);
21512 + return NULL;
21513 +found:
21514 spin_unlock_irqrestore(&io_tlb_lock, flags);
21515
21516 /*
21517 @@ -502,11 +486,13 @@ swiotlb_full(struct device *dev, size_t
21518 * Once the device is given the dma address, the device owns this memory until
21519 * either swiotlb_unmap_single or swiotlb_dma_sync_single is performed.
21520 */
21521 -dma_addr_t
21522 -swiotlb_map_single(struct device *hwdev, void *ptr, size_t size, int dir)
21523 -{
21524 - dma_addr_t dev_addr = gnttab_dma_map_page(virt_to_page(ptr)) +
21525 - offset_in_page(ptr);
21526 +static dma_addr_t
21527 +_swiotlb_map_single(struct device *hwdev, phys_addr_t paddr, size_t size,
21528 + int dir, struct dma_attrs *attrs)
21529 +{
21530 + struct page *page = pfn_to_page(paddr >> PAGE_SHIFT);
21531 + dma_addr_t dev_addr = gnttab_dma_map_page(page) +
21532 + offset_in_page(paddr);
21533 void *map;
21534 struct phys_addr buffer;
21535
21536 @@ -517,7 +503,7 @@ swiotlb_map_single(struct device *hwdev,
21537 * we can safely return the device addr and not worry about bounce
21538 * buffering it.
21539 */
21540 - if (!range_straddles_page_boundary(__pa(ptr), size) &&
21541 + if (!range_straddles_page_boundary(paddr, size) &&
21542 !address_needs_mapping(hwdev, dev_addr))
21543 return dev_addr;
21544
21545 @@ -525,8 +511,8 @@ swiotlb_map_single(struct device *hwdev,
21546 * Oh well, have to allocate and map a bounce buffer.
21547 */
21548 gnttab_dma_unmap_page(dev_addr);
21549 - buffer.page = virt_to_page(ptr);
21550 - buffer.offset = (unsigned long)ptr & ~PAGE_MASK;
21551 + buffer.page = page;
21552 + buffer.offset = offset_in_page(paddr);
21553 map = map_single(hwdev, buffer, size, dir);
21554 if (!map) {
21555 swiotlb_full(hwdev, size, dir, 1);
21556 @@ -537,6 +523,26 @@ swiotlb_map_single(struct device *hwdev,
21557 return dev_addr;
21558 }
21559
21560 +dma_addr_t
21561 +swiotlb_map_single_attrs(struct device *hwdev, void *ptr, size_t size,
21562 + int dir, struct dma_attrs *attrs)
21563 +{
21564 + return _swiotlb_map_single(hwdev, virt_to_phys(ptr), size, dir, attrs);
21565 +}
21566 +EXPORT_SYMBOL(swiotlb_map_single_attrs);
21567 +
21568 +dma_addr_t
21569 +swiotlb_map_single(struct device *hwdev, void *ptr, size_t size, int dir)
21570 +{
21571 + return _swiotlb_map_single(hwdev, virt_to_phys(ptr), size, dir, NULL);
21572 +}
21573 +
21574 +dma_addr_t
21575 +swiotlb_map_single_phys(struct device *hwdev, phys_addr_t paddr, size_t size, int dir)
21576 +{
21577 + return _swiotlb_map_single(hwdev, paddr, size, dir, NULL);
21578 +}
21579 +
21580 /*
21581 * Unmap a single streaming mode DMA translation. The dma_addr and size must
21582 * match what was provided for in a previous swiotlb_map_single call. All
21583 @@ -546,8 +552,8 @@ swiotlb_map_single(struct device *hwdev,
21584 * whatever the device wrote there.
21585 */
21586 void
21587 -swiotlb_unmap_single(struct device *hwdev, dma_addr_t dev_addr, size_t size,
21588 - int dir)
21589 +swiotlb_unmap_single_attrs(struct device *hwdev, dma_addr_t dev_addr,
21590 + size_t size, int dir, struct dma_attrs *attrs)
21591 {
21592 BUG_ON(dir == DMA_NONE);
21593 if (in_swiotlb_aperture(dev_addr))
21594 @@ -555,7 +561,14 @@ swiotlb_unmap_single(struct device *hwde
21595 else
21596 gnttab_dma_unmap_page(dev_addr);
21597 }
21598 +EXPORT_SYMBOL(swiotlb_unmap_single_attrs);
21599
21600 +void
21601 +swiotlb_unmap_single(struct device *hwdev, dma_addr_t dev_addr, size_t size,
21602 + int dir)
21603 +{
21604 + return swiotlb_unmap_single_attrs(hwdev, dev_addr, size, dir, NULL);
21605 +}
21606 /*
21607 * Make physical memory consistent for a single streaming mode DMA translation
21608 * after a transfer.
21609 @@ -584,6 +597,26 @@ swiotlb_sync_single_for_device(struct de
21610 sync_single(hwdev, bus_to_virt(dev_addr), size, dir);
21611 }
21612
21613 +void
21614 +swiotlb_sync_single_range_for_cpu(struct device *hwdev, dma_addr_t dev_addr,
21615 + unsigned long offset, size_t size, int dir)
21616 +{
21617 + BUG_ON(dir == DMA_NONE);
21618 + if (in_swiotlb_aperture(dev_addr))
21619 + sync_single(hwdev, bus_to_virt(dev_addr + offset), size, dir);
21620 +}
21621 +
21622 +void
21623 +swiotlb_sync_single_range_for_device(struct device *hwdev, dma_addr_t dev_addr,
21624 + unsigned long offset, size_t size, int dir)
21625 +{
21626 + BUG_ON(dir == DMA_NONE);
21627 + if (in_swiotlb_aperture(dev_addr))
21628 + sync_single(hwdev, bus_to_virt(dev_addr + offset), size, dir);
21629 +}
21630 +
21631 +void swiotlb_unmap_sg_attrs(struct device *, struct scatterlist *, int, int,
21632 + struct dma_attrs *);
21633 /*
21634 * Map a set of buffers described by scatterlist in streaming mode for DMA.
21635 * This is the scatter-gather version of the above swiotlb_map_single
21636 @@ -601,8 +634,8 @@ swiotlb_sync_single_for_device(struct de
21637 * same here.
21638 */
21639 int
21640 -swiotlb_map_sg(struct device *hwdev, struct scatterlist *sgl, int nelems,
21641 - int dir)
21642 +swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems,
21643 + int dir, struct dma_attrs *attrs)
21644 {
21645 struct scatterlist *sg;
21646 struct phys_addr buffer;
21647 @@ -626,7 +659,8 @@ swiotlb_map_sg(struct device *hwdev, str
21648 /* Don't panic here, we expect map_sg users
21649 to do proper error handling. */
21650 swiotlb_full(hwdev, sg->length, dir, 0);
21651 - swiotlb_unmap_sg(hwdev, sgl, i, dir);
21652 + swiotlb_unmap_sg_attrs(hwdev, sgl, i, dir,
21653 + attrs);
21654 sgl[0].dma_length = 0;
21655 return 0;
21656 }
21657 @@ -637,14 +671,22 @@ swiotlb_map_sg(struct device *hwdev, str
21658 }
21659 return nelems;
21660 }
21661 +EXPORT_SYMBOL(swiotlb_map_sg_attrs);
21662 +
21663 +int
21664 +swiotlb_map_sg(struct device *hwdev, struct scatterlist *sgl, int nelems,
21665 + int dir)
21666 +{
21667 + return swiotlb_map_sg_attrs(hwdev, sgl, nelems, dir, NULL);
21668 +}
21669
21670 /*
21671 * Unmap a set of streaming mode DMA translations. Again, cpu read rules
21672 * concerning calls here are the same as for swiotlb_unmap_single() above.
21673 */
21674 void
21675 -swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nelems,
21676 - int dir)
21677 +swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl,
21678 + int nelems, int dir, struct dma_attrs *attrs)
21679 {
21680 struct scatterlist *sg;
21681 int i;
21682 @@ -659,6 +701,14 @@ swiotlb_unmap_sg(struct device *hwdev, s
21683 gnttab_dma_unmap_page(sg->dma_address);
21684 }
21685 }
21686 +EXPORT_SYMBOL(swiotlb_unmap_sg_attrs);
21687 +
21688 +void
21689 +swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nelems,
21690 + int dir)
21691 +{
21692 + return swiotlb_unmap_sg_attrs(hwdev, sgl, nelems, dir, NULL);
21693 +}
21694
21695 /*
21696 * Make physical memory consistent for a set of streaming mode DMA translations
21697 @@ -699,46 +749,6 @@ swiotlb_sync_sg_for_device(struct device
21698 }
21699 }
21700
21701 -#ifdef CONFIG_HIGHMEM
21702 -
21703 -dma_addr_t
21704 -swiotlb_map_page(struct device *hwdev, struct page *page,
21705 - unsigned long offset, size_t size,
21706 - enum dma_data_direction direction)
21707 -{
21708 - struct phys_addr buffer;
21709 - dma_addr_t dev_addr;
21710 - char *map;
21711 -
21712 - dev_addr = gnttab_dma_map_page(page) + offset;
21713 - if (address_needs_mapping(hwdev, dev_addr)) {
21714 - gnttab_dma_unmap_page(dev_addr);
21715 - buffer.page = page;
21716 - buffer.offset = offset;
21717 - map = map_single(hwdev, buffer, size, direction);
21718 - if (!map) {
21719 - swiotlb_full(hwdev, size, direction, 1);
21720 - map = io_tlb_overflow_buffer;
21721 - }
21722 - dev_addr = (dma_addr_t)virt_to_bus(map);
21723 - }
21724 -
21725 - return dev_addr;
21726 -}
21727 -
21728 -void
21729 -swiotlb_unmap_page(struct device *hwdev, dma_addr_t dma_address,
21730 - size_t size, enum dma_data_direction direction)
21731 -{
21732 - BUG_ON(direction == DMA_NONE);
21733 - if (in_swiotlb_aperture(dma_address))
21734 - unmap_single(hwdev, bus_to_virt(dma_address), size, direction);
21735 - else
21736 - gnttab_dma_unmap_page(dma_address);
21737 -}
21738 -
21739 -#endif
21740 -
21741 int
21742 swiotlb_dma_mapping_error(dma_addr_t dma_addr)
21743 {