Acked-by: Jeff Mahoney <jeffm@suse.com>
Automatically created from "patches.kernel.org/patch-2.6.26" by xen-port-patches.py
----
- arch/x86/Kconfig | 10
- arch/x86/ia32/ia32entry-xen.S | 14
- arch/x86/kernel/Makefile | 5
- arch/x86/kernel/acpi/Makefile | 2
- arch/x86/kernel/acpi/boot.c | 8
- arch/x86/kernel/acpi/sleep-xen.c | 87 +
- arch/x86/kernel/cpu/common-xen.c | 158 +--
- arch/x86/kernel/cpu/mtrr/main-xen.c | 138 +++
- arch/x86/kernel/e820_32-xen.c | 32
- arch/x86/kernel/e820_64-xen.c | 197 +++-
- arch/x86/kernel/early_printk-xen.c | 24
- arch/x86/kernel/entry_32-xen.S | 44
- arch/x86/kernel/entry_64-xen.S | 8
- arch/x86/kernel/genapic_64-xen.c | 55 +
- arch/x86/kernel/genapic_xen_64.c | 4
- arch/x86/kernel/head64-xen.c | 101 +-
- arch/x86/kernel/head_32-xen.S | 2
- arch/x86/kernel/init_task-xen.c | 1
- arch/x86/kernel/io_apic_32-xen.c | 155 +--
- arch/x86/kernel/io_apic_64-xen.c | 67 -
- arch/x86/kernel/ipi-xen.c | 232 +++++
- arch/x86/kernel/irq_32-xen.c | 6
- arch/x86/kernel/machine_kexec_64.c | 2
- arch/x86/kernel/microcode-xen.c | 2
- arch/x86/kernel/mmconf-fam10h_64.c | 10
- arch/x86/kernel/mpparse-xen.c | 1104 ++++++++++++++++++++++++
- arch/x86/kernel/mpparse_32-xen.c | 1161 --------------------------
- arch/x86/kernel/mpparse_64-xen.c | 879 -------------------
- arch/x86/kernel/pci-dma-xen.c | 735 +++++++++-------
- arch/x86/kernel/pci-nommu-xen.c | 103 ++
- arch/x86/kernel/process-xen.c | 188 ++++
- arch/x86/kernel/process_32-xen.c | 146 +--
- arch/x86/kernel/process_64-xen.c | 165 ++-
- arch/x86/kernel/setup-xen.c | 141 +++
- arch/x86/kernel/setup64-xen.c | 103 --
- arch/x86/kernel/setup_32-xen.c | 127 ++
- arch/x86/kernel/setup_64-xen.c | 303 +++---
- arch/x86/kernel/smp-xen.c | 329 +++++++
- arch/x86/kernel/smp_32-xen.c | 647 --------------
- arch/x86/kernel/smp_64-xen.c | 554 ------------
- arch/x86/kernel/time_32-xen.c | 2
- arch/x86/kernel/traps_32-xen.c | 592 +++++++------
- arch/x86/kernel/traps_64-xen.c | 46 -
- arch/x86/kernel/vsyscall_64-xen.c | 2
- arch/x86/mm/fault-xen.c | 11
- arch/x86/mm/highmem_32-xen.c | 1
- arch/x86/mm/init_32-xen.c | 122 +-
- arch/x86/mm/init_64-xen.c | 292 +++++-
- arch/x86/mm/ioremap-xen.c | 269 ++++--
- arch/x86/mm/pageattr-xen.c | 481 ++--------
- arch/x86/mm/pat-xen.c | 602 +++++++++++++
- arch/x86/mm/pgtable-xen.c | 709 +++++++++++++++
- arch/x86/mm/pgtable_32-xen.c | 242 -----
- arch/x86/pci/i386.c | 4
- arch/x86/pci/irq-xen.c | 23
- arch/x86/vdso/vdso32-setup-xen.c | 15
- drivers/acpi/processor_core.c | 2
- drivers/input/xen-kbdfront.c | 1
- drivers/oprofile/cpu_buffer.c | 2
- drivers/pci/msi-xen.c | 12
- drivers/video/Kconfig | 2
- drivers/video/xen-fbfront.c | 1
- drivers/xen/Kconfig | 2
- drivers/xen/Makefile | 8
- drivers/xen/blkfront/blkfront.c | 4
- drivers/xen/blkfront/block.h | 1
- drivers/xen/blkfront/vbd.c | 58 -
- drivers/xen/blktap/blktap.c | 27
- drivers/xen/char/mem.c | 53 +
- drivers/xen/console/console.c | 13
- drivers/xen/core/machine_kexec.c | 8
- drivers/xen/core/machine_reboot.c | 8
- drivers/xen/core/smpboot.c | 23
- drivers/xen/core/xen_proc.c | 2
- drivers/xen/fbfront/xenfb.c | 24
- drivers/xen/gntdev/gntdev.c | 8
- drivers/xen/netfront/netfront.c | 6
- drivers/xen/privcmd/privcmd.c | 8
- drivers/xen/xenbus/xenbus_client.c | 6
- drivers/xen/xenbus/xenbus_probe.c | 25
- fs/aio.c | 15
- include/asm-x86/dma-mapping.h | 5
- include/asm-x86/genapic_64.h | 5
- include/asm-x86/mach-xen/asm/desc.h | 65 -
- include/asm-x86/mach-xen/asm/dma-mapping.h | 22
- include/asm-x86/mach-xen/asm/dma-mapping_32.h | 141 ---
- include/asm-x86/mach-xen/asm/dma-mapping_64.h | 205 ----
- include/asm-x86/mach-xen/asm/fixmap.h | 8
- include/asm-x86/mach-xen/asm/fixmap_32.h | 22
- include/asm-x86/mach-xen/asm/fixmap_64.h | 27
- include/asm-x86/mach-xen/asm/highmem.h | 2
- include/asm-x86/mach-xen/asm/io.h | 17
- include/asm-x86/mach-xen/asm/io_32.h | 156 +--
- include/asm-x86/mach-xen/asm/io_64.h | 124 +-
- include/asm-x86/mach-xen/asm/irqflags.h | 8
- include/asm-x86/mach-xen/asm/mmu_context_32.h | 12
- include/asm-x86/mach-xen/asm/mmu_context_64.h | 15
- include/asm-x86/mach-xen/asm/page.h | 20
- include/asm-x86/mach-xen/asm/page_64.h | 10
- include/asm-x86/mach-xen/asm/pci.h | 11
- include/asm-x86/mach-xen/asm/pci_64.h | 16
- include/asm-x86/mach-xen/asm/pgalloc.h | 152 +++
- include/asm-x86/mach-xen/asm/pgalloc_32.h | 111 --
- include/asm-x86/mach-xen/asm/pgalloc_64.h | 179 ----
- include/asm-x86/mach-xen/asm/pgtable-3level.h | 43
- include/asm-x86/mach-xen/asm/pgtable.h | 292 ++++--
- include/asm-x86/mach-xen/asm/pgtable_32.h | 107 +-
- include/asm-x86/mach-xen/asm/pgtable_64.h | 156 +--
- include/asm-x86/mach-xen/asm/processor.h | 688 ++++++++-------
- include/asm-x86/mach-xen/asm/segment.h | 3
- include/asm-x86/mach-xen/asm/smp.h | 228 +++++
- include/asm-x86/mach-xen/asm/smp_32.h | 178 ---
- include/asm-x86/mach-xen/asm/smp_64.h | 103 --
- include/asm-x86/mach-xen/asm/spinlock.h | 18
- include/asm-x86/mach-xen/asm/swiotlb.h | 13
- include/asm-x86/mach-xen/asm/swiotlb_32.h | 43
- include/asm-x86/mach-xen/asm/system.h | 107 +-
- include/asm-x86/mach-xen/asm/tlbflush.h | 3
- include/asm-x86/mach-xen/asm/vga.h | 4
- include/asm-x86/mach-xen/asm/xor_64.h | 294 +++---
- include/asm-x86/scatterlist.h | 2
- include/linux/page-flags.h | 31
- include/xen/balloon.h | 10
- include/xen/interface/grant_table.h | 7
- include/xen/interface/io/fbif.h | 5
- include/xen/interface/memory.h | 17
- include/xen/interface/vcpu.h | 4
- lib/swiotlb-xen.c | 236 ++---
- 128 files changed, 8046 insertions(+), 7660 deletions(-)
-
---- a/arch/x86/ia32/ia32entry-xen.S
-+++ b/arch/x86/ia32/ia32entry-xen.S
+--- sle11-2009-05-14.orig/arch/x86/Kconfig 2009-03-16 16:33:40.000000000 +0100
++++ sle11-2009-05-14/arch/x86/Kconfig 2009-03-16 16:38:05.000000000 +0100
+@@ -28,7 +28,7 @@ config X86
+ select HAVE_DYNAMIC_FTRACE
+ select HAVE_FTRACE
+ select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) && !XEN
+- select HAVE_ARCH_KGDB if !X86_VOYAGER
++ select HAVE_ARCH_KGDB if !X86_VOYAGER && !XEN
+ select HAVE_ARCH_TRACEHOOK
+ select HAVE_GENERIC_DMA_COHERENT if X86_32
+ select HAVE_EFFICIENT_UNALIGNED_ACCESS
+@@ -486,6 +486,7 @@ config PARAVIRT_DEBUG
+
+ config MEMTEST
+ bool "Memtest"
++ depends on !XEN
+ help
+ This option adds a kernel parameter 'memtest', which allows memtest
+ to be set.
+@@ -1007,7 +1008,7 @@ config X86_PAE
+ config DIRECT_GBPAGES
+ bool "Enable 1GB pages for kernel pagetables" if EMBEDDED
+ default y
+- depends on X86_64
++ depends on X86_64 && !XEN
+ help
+ Allow the kernel linear mapping to use 1GB pages on CPUs that
+ support it. This can improve the kernel's performance a tiny bit by
+@@ -1349,8 +1350,7 @@ source kernel/Kconfig.hz
+
+ config KEXEC
+ bool "kexec system call"
+- depends on X86_BIOS_REBOOT
+- depends on !XEN_UNPRIVILEGED_GUEST
++ depends on X86_BIOS_REBOOT || (XEN && !XEN_UNPRIVILEGED_GUEST)
+ help
+ kexec is a system call that implements the ability to shutdown your
+ current kernel, and to start another kernel. It is like a reboot
+@@ -1948,6 +1948,4 @@ source "crypto/Kconfig"
+
+ source "arch/x86/kvm/Kconfig"
+
+-source "drivers/xen/Kconfig"
+-
+ source "lib/Kconfig"
+--- sle11-2009-05-14.orig/arch/x86/ia32/ia32entry-xen.S 2009-03-16 16:33:40.000000000 +0100
++++ sle11-2009-05-14/arch/x86/ia32/ia32entry-xen.S 2009-03-16 16:38:05.000000000 +0100
@@ -129,12 +129,14 @@ sysenter_tracesys:
SAVE_REST
CLEAR_RREGS
.quad sys_alarm
.quad sys_fstat /* (old)fstat */
.quad sys_pause
---- a/arch/x86/Kconfig
-+++ b/arch/x86/Kconfig
-@@ -28,6 +28,6 @@ config X86
- select HAVE_DYNAMIC_FTRACE
- select HAVE_FTRACE
- select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) && !XEN
-- select HAVE_ARCH_KGDB if !X86_VOYAGER
-+ select HAVE_ARCH_KGDB if !X86_VOYAGER && !XEN
- select HAVE_GENERIC_DMA_COHERENT if X86_32
- select HAVE_EFFICIENT_UNALIGNED_ACCESS
-@@ -482,6 +482,7 @@ config PARAVIRT_DEBUG
-
- config MEMTEST
- bool "Memtest"
-+ depends on !XEN
- help
- This option adds a kernel parameter 'memtest', which allows memtest
- to be set.
-@@ -1345,8 +1346,7 @@ source kernel/Kconfig.hz
+--- sle11-2009-05-14.orig/arch/x86/kernel/Makefile 2009-03-16 16:33:40.000000000 +0100
++++ sle11-2009-05-14/arch/x86/kernel/Makefile 2009-03-16 16:38:05.000000000 +0100
+@@ -122,8 +122,7 @@ ifeq ($(CONFIG_X86_64),y)
- config KEXEC
- bool "kexec system call"
-- depends on X86_BIOS_REBOOT
-- depends on !XEN_UNPRIVILEGED_GUEST
-+ depends on X86_BIOS_REBOOT || (XEN && !XEN_UNPRIVILEGED_GUEST)
- help
- kexec is a system call that implements the ability to shutdown your
- current kernel, and to start another kernel. It is like a reboot
-@@ -1944,6 +1944,4 @@ source "crypto/Kconfig"
+ obj-$(CONFIG_XEN) += nmi_64.o
+ time_64-$(CONFIG_XEN) += time_32.o
+- pci-dma_64-$(CONFIG_XEN) += pci-dma_32.o
+ endif
- source "arch/x86/kvm/Kconfig"
+-disabled-obj-$(CONFIG_XEN) := early-quirks.o hpet.o i8253.o i8259_$(BITS).o reboot.o \
+- smpboot_$(BITS).o tsc_$(BITS).o tsc_sync.o
++disabled-obj-$(CONFIG_XEN) := crash.o early-quirks.o hpet.o i8253.o i8259_$(BITS).o \
++ pci-swiotlb_64.o reboot.o smpboot.o tlb_$(BITS).o tsc_$(BITS).o tsc_sync.o vsmp_64.o
+--- sle11-2009-05-14.orig/arch/x86/kernel/acpi/Makefile 2008-12-01 11:11:08.000000000 +0100
++++ sle11-2009-05-14/arch/x86/kernel/acpi/Makefile 2009-03-16 16:38:05.000000000 +0100
+@@ -15,4 +15,4 @@ $(obj)/wakeup_rm.o: $(obj)/realmode/w
+ $(obj)/realmode/wakeup.bin: FORCE
+ $(Q)$(MAKE) $(build)=$(obj)/realmode
--source "drivers/xen/Kconfig"
--
- source "lib/Kconfig"
---- a/arch/x86/kernel/acpi/boot.c
-+++ b/arch/x86/kernel/acpi/boot.c
+-disabled-obj-$(CONFIG_XEN) := cstate.o wakeup_$(BITS).o
++disabled-obj-$(CONFIG_XEN) := cstate.o wakeup_%.o
+--- sle11-2009-05-14.orig/arch/x86/kernel/acpi/boot.c 2009-03-16 16:33:40.000000000 +0100
++++ sle11-2009-05-14/arch/x86/kernel/acpi/boot.c 2009-03-16 16:38:05.000000000 +0100
@@ -251,19 +251,23 @@ static int __init acpi_parse_madt(struct
static void __cpuinit acpi_register_lapic(int id, u8 enabled)
static int __init early_acpi_parse_madt_lapic_addr_ovr(void)
{
---- a/arch/x86/kernel/acpi/Makefile
-+++ b/arch/x86/kernel/acpi/Makefile
-@@ -15,4 +15,4 @@ $(obj)/wakeup_rm.o: $(obj)/realmode/w
- $(obj)/realmode/wakeup.bin: FORCE
- $(Q)$(MAKE) $(build)=$(obj)/realmode
-
--disabled-obj-$(CONFIG_XEN) := cstate.o wakeup_$(BITS).o
-+disabled-obj-$(CONFIG_XEN) := cstate.o wakeup_%.o
---- a/arch/x86/kernel/acpi/sleep-xen.c
-+++ b/arch/x86/kernel/acpi/sleep-xen.c
+--- sle11-2009-05-14.orig/arch/x86/kernel/acpi/sleep-xen.c 2009-03-16 16:33:40.000000000 +0100
++++ sle11-2009-05-14/arch/x86/kernel/acpi/sleep-xen.c 2009-03-16 16:38:05.000000000 +0100
@@ -10,15 +10,19 @@
#include <linux/dmi.h>
#include <linux/cpumask.h>
#endif
}
---- a/arch/x86/kernel/cpu/common-xen.c
-+++ b/arch/x86/kernel/cpu/common-xen.c
+--- sle11-2009-05-14.orig/arch/x86/kernel/cpu/common-xen.c 2009-03-16 16:33:40.000000000 +0100
++++ sle11-2009-05-14/arch/x86/kernel/cpu/common-xen.c 2009-03-16 16:38:05.000000000 +0100
@@ -5,7 +5,6 @@
#include <linux/module.h>
#include <linux/percpu.h>
void __cpuinit cpu_uninit(void)
{
int cpu = raw_smp_processor_id();
---- a/arch/x86/kernel/cpu/mtrr/main-xen.c
-+++ b/arch/x86/kernel/cpu/mtrr/main-xen.c
+--- sle11-2009-05-14.orig/arch/x86/kernel/cpu/mtrr/main-xen.c 2009-03-16 16:33:40.000000000 +0100
++++ sle11-2009-05-14/arch/x86/kernel/cpu/mtrr/main-xen.c 2009-03-16 16:38:05.000000000 +0100
@@ -35,6 +35,8 @@ struct mtrr_ops *mtrr_if = &generic_mtrr
unsigned int num_var_ranges;
unsigned int mtrr_usage_table[MAX_VAR_RANGES];
}
void mtrr_ap_init(void)
---- a/arch/x86/kernel/e820_32-xen.c
-+++ b/arch/x86/kernel/e820_32-xen.c
+--- sle11-2009-05-14.orig/arch/x86/kernel/e820_32-xen.c 2009-03-16 16:33:40.000000000 +0100
++++ sle11-2009-05-14/arch/x86/kernel/e820_32-xen.c 2009-03-16 16:38:05.000000000 +0100
@@ -469,7 +469,7 @@ int __init sanitize_e820_map(struct e820
* thinkpad 560x, for example, does not cooperate with the memory
* detection code.)
saved_max_pfn = max_pfn;
#endif
e820.nr_map = 0;
---- a/arch/x86/kernel/e820_64-xen.c
-+++ b/arch/x86/kernel/e820_64-xen.c
+--- sle11-2009-05-14.orig/arch/x86/kernel/e820_64-xen.c 2009-03-16 16:33:40.000000000 +0100
++++ sle11-2009-05-14/arch/x86/kernel/e820_64-xen.c 2009-03-16 16:38:05.000000000 +0100
@@ -40,11 +40,11 @@ struct e820map machine_e820;
unsigned long end_pfn;
e820.nr_map = 0;
userdef = 1;
return 0;
---- a/arch/x86/kernel/early_printk-xen.c
-+++ b/arch/x86/kernel/early_printk-xen.c
+--- sle11-2009-05-14.orig/arch/x86/kernel/early_printk-xen.c 2009-03-16 16:33:40.000000000 +0100
++++ sle11-2009-05-14/arch/x86/kernel/early_printk-xen.c 2009-03-16 16:38:05.000000000 +0100
@@ -13,7 +13,7 @@
#ifndef CONFIG_XEN
#ifdef CONFIG_XEN
} else if (!strncmp(buf, "xen", 3)) {
early_console = &xenboot_console;
---- a/arch/x86/kernel/entry_32-xen.S
-+++ b/arch/x86/kernel/entry_32-xen.S
+--- sle11-2009-05-14.orig/arch/x86/kernel/entry_32-xen.S 2009-05-14 11:18:32.000000000 +0200
++++ sle11-2009-05-14/arch/x86/kernel/entry_32-xen.S 2009-03-16 16:38:05.000000000 +0100
@@ -1,5 +1,4 @@
/*
- * linux/arch/i386/entry.S
GET_THREAD_INFO(%ebp)
movl $-EFAULT,PT_EAX(%esp)
jmp resume_userspace
---- a/arch/x86/kernel/entry_64-xen.S
-+++ b/arch/x86/kernel/entry_64-xen.S
+--- sle11-2009-05-14.orig/arch/x86/kernel/entry_64-xen.S 2009-03-16 16:33:40.000000000 +0100
++++ sle11-2009-05-14/arch/x86/kernel/entry_64-xen.S 2009-03-16 16:38:05.000000000 +0100
@@ -338,19 +338,17 @@ badsys:
/* Do syscall tracing */
tracesys:
/* Use IRET because user could have changed frame */
/*
---- a/arch/x86/kernel/genapic_64-xen.c
-+++ b/arch/x86/kernel/genapic_64-xen.c
+--- sle11-2009-05-14.orig/arch/x86/kernel/genapic_64-xen.c 2009-03-16 16:33:40.000000000 +0100
++++ sle11-2009-05-14/arch/x86/kernel/genapic_64-xen.c 2009-03-16 16:38:05.000000000 +0100
@@ -15,6 +15,7 @@
#include <linux/kernel.h>
#include <linux/ctype.h>
+ return uv_system_type != UV_NONE;
+}
+#endif
---- a/arch/x86/kernel/genapic_xen_64.c
-+++ b/arch/x86/kernel/genapic_xen_64.c
+--- sle11-2009-05-14.orig/arch/x86/kernel/genapic_xen_64.c 2008-12-15 11:27:22.000000000 +0100
++++ sle11-2009-05-14/arch/x86/kernel/genapic_xen_64.c 2009-03-16 16:38:05.000000000 +0100
@@ -72,9 +72,7 @@ static cpumask_t xen_target_cpus(void)
static cpumask_t xen_vector_allocation_domain(int cpu)
}
/*
---- a/arch/x86/kernel/head_32-xen.S
-+++ b/arch/x86/kernel/head_32-xen.S
-@@ -69,7 +69,7 @@ ENTRY(startup_32)
- cld # gcc2 wants the direction flag cleared at all times
-
- pushl $0 # fake return address for unwinder
-- jmp start_kernel
-+ jmp i386_start_kernel
-
- #define HYPERCALL_PAGE_OFFSET 0x1000
- .org HYPERCALL_PAGE_OFFSET
---- a/arch/x86/kernel/head64-xen.c
-+++ b/arch/x86/kernel/head64-xen.c
+--- sle11-2009-05-14.orig/arch/x86/kernel/head64-xen.c 2009-03-16 16:33:40.000000000 +0100
++++ sle11-2009-05-14/arch/x86/kernel/head64-xen.c 2009-03-16 16:38:05.000000000 +0100
@@ -17,6 +17,7 @@
#include <linux/string.h>
#include <linux/percpu.h>
/*
* At this point everything still needed from the boot loader
---- a/arch/x86/kernel/init_task-xen.c
-+++ b/arch/x86/kernel/init_task-xen.c
+--- sle11-2009-05-14.orig/arch/x86/kernel/head_32-xen.S 2009-03-16 16:33:40.000000000 +0100
++++ sle11-2009-05-14/arch/x86/kernel/head_32-xen.S 2009-03-16 16:38:05.000000000 +0100
+@@ -69,7 +69,7 @@ ENTRY(startup_32)
+ cld # gcc2 wants the direction flag cleared at all times
+
+ pushl $0 # fake return address for unwinder
+- jmp start_kernel
++ jmp i386_start_kernel
+
+ #define HYPERCALL_PAGE_OFFSET 0x1000
+ .org HYPERCALL_PAGE_OFFSET
+--- sle11-2009-05-14.orig/arch/x86/kernel/init_task-xen.c 2009-03-16 16:33:40.000000000 +0100
++++ sle11-2009-05-14/arch/x86/kernel/init_task-xen.c 2009-03-16 16:38:05.000000000 +0100
@@ -11,7 +11,6 @@
#include <asm/desc.h>
static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
#ifdef CONFIG_X86_XEN
---- a/arch/x86/kernel/io_apic_32-xen.c
-+++ b/arch/x86/kernel/io_apic_32-xen.c
+--- sle11-2009-05-14.orig/arch/x86/kernel/io_apic_32-xen.c 2009-03-16 16:33:40.000000000 +0100
++++ sle11-2009-05-14/arch/x86/kernel/io_apic_32-xen.c 2009-03-16 16:38:05.000000000 +0100
@@ -88,6 +88,16 @@ int sis_apic_bug = -1;
*/
int nr_ioapic_registers[MAX_IO_APICS];
return 0;
}
---- a/arch/x86/kernel/io_apic_64-xen.c
-+++ b/arch/x86/kernel/io_apic_64-xen.c
+--- sle11-2009-05-14.orig/arch/x86/kernel/io_apic_64-xen.c 2009-03-16 16:33:40.000000000 +0100
++++ sle11-2009-05-14/arch/x86/kernel/io_apic_64-xen.c 2009-03-16 16:38:05.000000000 +0100
@@ -43,13 +43,15 @@
#include <asm/smp.h>
#include <asm/desc.h>
mem += sizeof(struct resource) * nr_ioapics;
for (i = 0; i < nr_ioapics; i++) {
---- /dev/null
-+++ b/arch/x86/kernel/ipi-xen.c
+--- /dev/null 1970-01-01 00:00:00.000000000 +0000
++++ sle11-2009-05-14/arch/x86/kernel/ipi-xen.c 2009-03-16 16:38:05.000000000 +0100
@@ -0,0 +1,232 @@
+#include <linux/cpumask.h>
+#include <linux/interrupt.h>
+}
+#endif
+#endif
---- a/arch/x86/kernel/irq_32-xen.c
-+++ b/arch/x86/kernel/irq_32-xen.c
+--- sle11-2009-05-14.orig/arch/x86/kernel/irq_32-xen.c 2009-03-16 16:33:40.000000000 +0100
++++ sle11-2009-05-14/arch/x86/kernel/irq_32-xen.c 2009-03-16 16:38:05.000000000 +0100
@@ -79,7 +79,7 @@ unsigned int do_IRQ(struct pt_regs *regs
if (unlikely((unsigned)irq >= NR_IRQS)) {
asmlinkage void do_softirq(void)
{
unsigned long flags;
---- a/arch/x86/kernel/machine_kexec_64.c
-+++ b/arch/x86/kernel/machine_kexec_64.c
+--- sle11-2009-05-14.orig/arch/x86/kernel/machine_kexec_64.c 2009-03-16 16:33:40.000000000 +0100
++++ sle11-2009-05-14/arch/x86/kernel/machine_kexec_64.c 2009-03-16 16:38:05.000000000 +0100
@@ -120,8 +120,6 @@ int __init machine_kexec_setup_resources
return 0;
}
#else /* CONFIG_XEN */
#define x__pmd(x) __pmd(x)
---- a/arch/x86/kernel/Makefile
-+++ b/arch/x86/kernel/Makefile
-@@ -122,8 +122,7 @@ ifeq ($(CONFIG_X86_64),y)
-
- obj-$(CONFIG_XEN) += nmi_64.o
- time_64-$(CONFIG_XEN) += time_32.o
-- pci-dma_64-$(CONFIG_XEN) += pci-dma_32.o
- endif
-
--disabled-obj-$(CONFIG_XEN) := early-quirks.o hpet.o i8253.o i8259_$(BITS).o reboot.o \
-- smpboot_$(BITS).o tsc_$(BITS).o tsc_sync.o
-+disabled-obj-$(CONFIG_XEN) := early-quirks.o hpet.o i8253.o i8259_$(BITS).o \
-+ pci-swiotlb_64.o reboot.o smpboot.o tlb_$(BITS).o tsc_$(BITS).o tsc_sync.o vsmp_64.o
---- a/arch/x86/kernel/microcode-xen.c
-+++ b/arch/x86/kernel/microcode-xen.c
+--- sle11-2009-05-14.orig/arch/x86/kernel/microcode-xen.c 2009-03-16 16:33:40.000000000 +0100
++++ sle11-2009-05-14/arch/x86/kernel/microcode-xen.c 2009-03-16 16:38:05.000000000 +0100
@@ -162,7 +162,7 @@ static int request_microcode(void)
c->x86, c->x86_model, c->x86_mask);
 error = request_firmware(&firmware, name, &microcode_pdev->dev);
return error;
}
---- a/arch/x86/kernel/mmconf-fam10h_64.c
-+++ b/arch/x86/kernel/mmconf-fam10h_64.c
+--- sle11-2009-05-14.orig/arch/x86/kernel/mmconf-fam10h_64.c 2009-05-14 10:56:29.000000000 +0200
++++ sle11-2009-05-14/arch/x86/kernel/mmconf-fam10h_64.c 2009-03-16 16:38:05.000000000 +0100
@@ -219,6 +219,16 @@ void __cpuinit fam10h_check_enable_mmcfg
val |= fam10h_pci_mmconf_base | (8 << FAM10H_MMIO_CONF_BUSRANGE_SHIFT) |
FAM10H_MMIO_CONF_ENABLE;
}
static int __devinit set_check_enable_amd_mmconf(const struct dmi_system_id *d)
---- a/arch/x86/kernel/mpparse_32-xen.c
-+++ /dev/null
-@@ -1,1161 +0,0 @@
--/*
-- * Intel Multiprocessor Specification 1.1 and 1.4
-- * compliant MP-table parsing routines.
-- *
-- * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
-- * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
-- *
-- * Fixes
-- * Erich Boleyn : MP v1.4 and additional changes.
-- * Alan Cox : Added EBDA scanning
-- * Ingo Molnar : various cleanups and rewrites
-- * Maciej W. Rozycki: Bits for default MP configurations
-- * Paul Diefenbaugh: Added full ACPI support
-- */
--
--#include <linux/mm.h>
--#include <linux/init.h>
--#include <linux/acpi.h>
--#include <linux/delay.h>
--#include <linux/bootmem.h>
--#include <linux/kernel_stat.h>
--#include <linux/mc146818rtc.h>
--#include <linux/bitops.h>
--
--#include <asm/smp.h>
--#include <asm/acpi.h>
--#include <asm/mtrr.h>
--#include <asm/mpspec.h>
--#include <asm/io_apic.h>
--
--#include <mach_apic.h>
--#include <mach_apicdef.h>
--#include <mach_mpparse.h>
--#include <bios_ebda.h>
--
--/* Have we found an MP table */
--int smp_found_config;
--unsigned int __cpuinitdata maxcpus = NR_CPUS;
--
--/*
-- * Various Linux-internal data structures created from the
-- * MP-table.
-- */
--int apic_version [MAX_APICS];
--int mp_bus_id_to_type [MAX_MP_BUSSES];
--int mp_bus_id_to_node [MAX_MP_BUSSES];
--int mp_bus_id_to_local [MAX_MP_BUSSES];
--int quad_local_to_mp_bus_id [NR_CPUS/4][4];
--int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
--static int mp_current_pci_id;
--
--/* I/O APIC entries */
--struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
--
--/* # of MP IRQ source entries */
--struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
--
--/* MP IRQ source entries */
--int mp_irq_entries;
--
--int nr_ioapics;
--
--int pic_mode;
--unsigned long mp_lapic_addr;
--
--unsigned int def_to_bigsmp = 0;
--
--/* Processor that is doing the boot up */
--unsigned int boot_cpu_physical_apicid = -1U;
--/* Internal processor count */
--unsigned int num_processors;
--
--/* Bitmask of physically existing CPUs */
--physid_mask_t phys_cpu_present_map;
--
--u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
--
--/*
-- * Intel MP BIOS table parsing routines:
-- */
--
--
--/*
-- * Checksum an MP configuration block.
-- */
--
--static int __init mpf_checksum(unsigned char *mp, int len)
--{
-- int sum = 0;
--
-- while (len--)
-- sum += *mp++;
--
-- return sum & 0xFF;
--}
--
--/*
-- * Have to match translation table entries to main table entries by counter
-- * hence the mpc_record variable .... can't see a less disgusting way of
-- * doing this ....
-- */
--
--static int mpc_record;
--static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY] __cpuinitdata;
--
--#ifndef CONFIG_XEN
--static void __cpuinit MP_processor_info (struct mpc_config_processor *m)
--{
-- int ver, apicid;
-- physid_mask_t phys_cpu;
--
-- if (!(m->mpc_cpuflag & CPU_ENABLED))
-- return;
--
-- apicid = mpc_apic_id(m, translation_table[mpc_record]);
--
-- if (m->mpc_featureflag&(1<<0))
-- Dprintk(" Floating point unit present.\n");
-- if (m->mpc_featureflag&(1<<7))
-- Dprintk(" Machine Exception supported.\n");
-- if (m->mpc_featureflag&(1<<8))
-- Dprintk(" 64 bit compare & exchange supported.\n");
-- if (m->mpc_featureflag&(1<<9))
-- Dprintk(" Internal APIC present.\n");
-- if (m->mpc_featureflag&(1<<11))
-- Dprintk(" SEP present.\n");
-- if (m->mpc_featureflag&(1<<12))
-- Dprintk(" MTRR present.\n");
-- if (m->mpc_featureflag&(1<<13))
-- Dprintk(" PGE present.\n");
-- if (m->mpc_featureflag&(1<<14))
-- Dprintk(" MCA present.\n");
-- if (m->mpc_featureflag&(1<<15))
-- Dprintk(" CMOV present.\n");
-- if (m->mpc_featureflag&(1<<16))
-- Dprintk(" PAT present.\n");
-- if (m->mpc_featureflag&(1<<17))
-- Dprintk(" PSE present.\n");
-- if (m->mpc_featureflag&(1<<18))
-- Dprintk(" PSN present.\n");
-- if (m->mpc_featureflag&(1<<19))
-- Dprintk(" Cache Line Flush Instruction present.\n");
-- /* 20 Reserved */
-- if (m->mpc_featureflag&(1<<21))
-- Dprintk(" Debug Trace and EMON Store present.\n");
-- if (m->mpc_featureflag&(1<<22))
-- Dprintk(" ACPI Thermal Throttle Registers present.\n");
-- if (m->mpc_featureflag&(1<<23))
-- Dprintk(" MMX present.\n");
-- if (m->mpc_featureflag&(1<<24))
-- Dprintk(" FXSR present.\n");
-- if (m->mpc_featureflag&(1<<25))
-- Dprintk(" XMM present.\n");
-- if (m->mpc_featureflag&(1<<26))
-- Dprintk(" Willamette New Instructions present.\n");
-- if (m->mpc_featureflag&(1<<27))
-- Dprintk(" Self Snoop present.\n");
-- if (m->mpc_featureflag&(1<<28))
-- Dprintk(" HT present.\n");
-- if (m->mpc_featureflag&(1<<29))
-- Dprintk(" Thermal Monitor present.\n");
-- /* 30, 31 Reserved */
--
--
-- if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
-- Dprintk(" Bootup CPU\n");
-- boot_cpu_physical_apicid = m->mpc_apicid;
-- }
--
-- ver = m->mpc_apicver;
--
-- /*
-- * Validate version
-- */
-- if (ver == 0x0) {
-- printk(KERN_WARNING "BIOS bug, APIC version is 0 for CPU#%d! "
-- "fixing up to 0x10. (tell your hw vendor)\n",
-- m->mpc_apicid);
-- ver = 0x10;
-- }
-- apic_version[m->mpc_apicid] = ver;
--
-- phys_cpu = apicid_to_cpu_present(apicid);
-- physids_or(phys_cpu_present_map, phys_cpu_present_map, phys_cpu);
--
-- if (num_processors >= NR_CPUS) {
-- printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
-- " Processor ignored.\n", NR_CPUS);
-- return;
-- }
--
-- if (num_processors >= maxcpus) {
-- printk(KERN_WARNING "WARNING: maxcpus limit of %i reached."
-- " Processor ignored.\n", maxcpus);
-- return;
-- }
--
-- cpu_set(num_processors, cpu_possible_map);
-- num_processors++;
--
-- /*
-- * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y
-- * but we need to work other dependencies like SMP_SUSPEND etc
-- * before this can be done without some confusion.
-- * if (CPU_HOTPLUG_ENABLED || num_processors > 8)
-- * - Ashok Raj <ashok.raj@intel.com>
-- */
-- if (num_processors > 8) {
-- switch (boot_cpu_data.x86_vendor) {
-- case X86_VENDOR_INTEL:
-- if (!APIC_XAPIC(ver)) {
-- def_to_bigsmp = 0;
-- break;
-- }
-- /* If P4 and above fall through */
-- case X86_VENDOR_AMD:
-- def_to_bigsmp = 1;
-- }
-- }
-- bios_cpu_apicid[num_processors - 1] = m->mpc_apicid;
--}
--#else
--static void __cpuinit MP_processor_info (struct mpc_config_processor *m)
--{
-- num_processors++;
--}
--#endif /* CONFIG_XEN */
--
--static void __init MP_bus_info (struct mpc_config_bus *m)
--{
-- char str[7];
--
-- memcpy(str, m->mpc_bustype, 6);
-- str[6] = 0;
--
-- mpc_oem_bus_info(m, str, translation_table[mpc_record]);
--
--#if MAX_MP_BUSSES < 256
-- if (m->mpc_busid >= MAX_MP_BUSSES) {
-- printk(KERN_WARNING "MP table busid value (%d) for bustype %s "
-- " is too large, max. supported is %d\n",
-- m->mpc_busid, str, MAX_MP_BUSSES - 1);
-- return;
-- }
--#endif
--
-- if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA)-1) == 0) {
-- mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
-- } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA)-1) == 0) {
-- mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA;
-- } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI)-1) == 0) {
-- mpc_oem_pci_bus(m, translation_table[mpc_record]);
-- mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
-- mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
-- mp_current_pci_id++;
-- } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA)-1) == 0) {
-- mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA;
-- } else {
-- printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
-- }
--}
--
--static void __init MP_ioapic_info (struct mpc_config_ioapic *m)
--{
-- if (!(m->mpc_flags & MPC_APIC_USABLE))
-- return;
--
-- printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n",
-- m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
-- if (nr_ioapics >= MAX_IO_APICS) {
-- printk(KERN_CRIT "Max # of I/O APICs (%d) exceeded (found %d).\n",
-- MAX_IO_APICS, nr_ioapics);
-- panic("Recompile kernel with bigger MAX_IO_APICS!.\n");
-- }
-- if (!m->mpc_apicaddr) {
-- printk(KERN_ERR "WARNING: bogus zero I/O APIC address"
-- " found in MP table, skipping!\n");
-- return;
-- }
-- mp_ioapics[nr_ioapics] = *m;
-- nr_ioapics++;
--}
--
--static void __init MP_intsrc_info (struct mpc_config_intsrc *m)
--{
-- mp_irqs [mp_irq_entries] = *m;
-- Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
-- " IRQ %02x, APIC ID %x, APIC INT %02x\n",
-- m->mpc_irqtype, m->mpc_irqflag & 3,
-- (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
-- m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
-- if (++mp_irq_entries == MAX_IRQ_SOURCES)
-- panic("Max # of irq sources exceeded!!\n");
--}
--
--static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m)
--{
-- Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
-- " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
-- m->mpc_irqtype, m->mpc_irqflag & 3,
-- (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid,
-- m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
--}
--
--#ifdef CONFIG_X86_NUMAQ
--static void __init MP_translation_info (struct mpc_config_translation *m)
--{
-- printk(KERN_INFO "Translation: record %d, type %d, quad %d, global %d, local %d\n", mpc_record, m->trans_type, m->trans_quad, m->trans_global, m->trans_local);
--
-- if (mpc_record >= MAX_MPC_ENTRY)
-- printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
-- else
-- translation_table[mpc_record] = m; /* stash this for later */
-- if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
-- node_set_online(m->trans_quad);
--}
--
--/*
-- * Read/parse the MPC oem tables
-- */
--
--static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable, \
-- unsigned short oemsize)
--{
-- int count = sizeof (*oemtable); /* the header size */
-- unsigned char *oemptr = ((unsigned char *)oemtable)+count;
--
-- mpc_record = 0;
-- printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n", oemtable);
-- if (memcmp(oemtable->oem_signature,MPC_OEM_SIGNATURE,4))
-- {
-- printk(KERN_WARNING "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
-- oemtable->oem_signature[0],
-- oemtable->oem_signature[1],
-- oemtable->oem_signature[2],
-- oemtable->oem_signature[3]);
-- return;
-- }
-- if (mpf_checksum((unsigned char *)oemtable,oemtable->oem_length))
-- {
-- printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
-- return;
-- }
-- while (count < oemtable->oem_length) {
-- switch (*oemptr) {
-- case MP_TRANSLATION:
-- {
-- struct mpc_config_translation *m=
-- (struct mpc_config_translation *)oemptr;
-- MP_translation_info(m);
-- oemptr += sizeof(*m);
-- count += sizeof(*m);
-- ++mpc_record;
-- break;
-- }
-- default:
-- {
-- printk(KERN_WARNING "Unrecognised OEM table entry type! - %d\n", (int) *oemptr);
-- return;
-- }
-- }
-- }
--}
--
--static inline void mps_oem_check(struct mp_config_table *mpc, char *oem,
-- char *productid)
--{
-- if (strncmp(oem, "IBM NUMA", 8))
-- printk("Warning! May not be a NUMA-Q system!\n");
-- if (mpc->mpc_oemptr)
-- smp_read_mpc_oem((struct mp_config_oemtable *) mpc->mpc_oemptr,
-- mpc->mpc_oemsize);
--}
--#endif /* CONFIG_X86_NUMAQ */
--
--/*
-- * Read/parse the MPC
-- */
--
--static int __init smp_read_mpc(struct mp_config_table *mpc)
--{
-- char str[16];
-- char oem[10];
-- int count=sizeof(*mpc);
-- unsigned char *mpt=((unsigned char *)mpc)+count;
--
-- if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) {
-- printk(KERN_ERR "SMP mptable: bad signature [0x%x]!\n",
-- *(u32 *)mpc->mpc_signature);
-- return 0;
-- }
-- if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) {
-- printk(KERN_ERR "SMP mptable: checksum error!\n");
-- return 0;
-- }
-- if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) {
-- printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n",
-- mpc->mpc_spec);
-- return 0;
-- }
-- if (!mpc->mpc_lapic) {
-- printk(KERN_ERR "SMP mptable: null local APIC address!\n");
-- return 0;
-- }
-- memcpy(oem,mpc->mpc_oem,8);
-- oem[8]=0;
-- printk(KERN_INFO "OEM ID: %s ",oem);
--
-- memcpy(str,mpc->mpc_productid,12);
-- str[12]=0;
-- printk("Product ID: %s ",str);
--
-- mps_oem_check(mpc, oem, str);
--
-- printk("APIC at: 0x%X\n", mpc->mpc_lapic);
--
-- /*
-- * Save the local APIC address (it might be non-default) -- but only
-- * if we're not using ACPI.
-- */
-- if (!acpi_lapic)
-- mp_lapic_addr = mpc->mpc_lapic;
--
-- /*
-- * Now process the configuration blocks.
-- */
-- mpc_record = 0;
-- while (count < mpc->mpc_length) {
-- switch(*mpt) {
-- case MP_PROCESSOR:
-- {
-- struct mpc_config_processor *m=
-- (struct mpc_config_processor *)mpt;
-- /* ACPI may have already provided this data */
-- if (!acpi_lapic)
-- MP_processor_info(m);
-- mpt += sizeof(*m);
-- count += sizeof(*m);
-- break;
-- }
-- case MP_BUS:
-- {
-- struct mpc_config_bus *m=
-- (struct mpc_config_bus *)mpt;
-- MP_bus_info(m);
-- mpt += sizeof(*m);
-- count += sizeof(*m);
-- break;
-- }
-- case MP_IOAPIC:
-- {
-- struct mpc_config_ioapic *m=
-- (struct mpc_config_ioapic *)mpt;
-- MP_ioapic_info(m);
-- mpt+=sizeof(*m);
-- count+=sizeof(*m);
-- break;
-- }
-- case MP_INTSRC:
-- {
-- struct mpc_config_intsrc *m=
-- (struct mpc_config_intsrc *)mpt;
--
-- MP_intsrc_info(m);
-- mpt+=sizeof(*m);
-- count+=sizeof(*m);
-- break;
-- }
-- case MP_LINTSRC:
-- {
-- struct mpc_config_lintsrc *m=
-- (struct mpc_config_lintsrc *)mpt;
-- MP_lintsrc_info(m);
-- mpt+=sizeof(*m);
-- count+=sizeof(*m);
-- break;
-- }
-- default:
-- {
-- count = mpc->mpc_length;
-- break;
-- }
-- }
-- ++mpc_record;
-- }
-- setup_apic_routing();
-- if (!num_processors)
-- printk(KERN_ERR "SMP mptable: no processors registered!\n");
-- return num_processors;
--}
--
--static int __init ELCR_trigger(unsigned int irq)
--{
-- unsigned int port;
--
-- port = 0x4d0 + (irq >> 3);
-- return (inb(port) >> (irq & 7)) & 1;
--}
--
--static void __init construct_default_ioirq_mptable(int mpc_default_type)
--{
-- struct mpc_config_intsrc intsrc;
-- int i;
-- int ELCR_fallback = 0;
--
-- intsrc.mpc_type = MP_INTSRC;
-- intsrc.mpc_irqflag = 0; /* conforming */
-- intsrc.mpc_srcbus = 0;
-- intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
--
-- intsrc.mpc_irqtype = mp_INT;
--
-- /*
-- * If true, we have an ISA/PCI system with no IRQ entries
-- * in the MP table. To prevent the PCI interrupts from being set up
-- * incorrectly, we try to use the ELCR. The sanity check to see if
-- * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can
-- * never be level sensitive, so we simply see if the ELCR agrees.
-- * If it does, we assume it's valid.
-- */
-- if (mpc_default_type == 5) {
-- printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n");
--
-- if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13))
-- printk(KERN_WARNING "ELCR contains invalid data... not using ELCR\n");
-- else {
-- printk(KERN_INFO "Using ELCR to identify PCI interrupts\n");
-- ELCR_fallback = 1;
-- }
-- }
--
-- for (i = 0; i < 16; i++) {
-- switch (mpc_default_type) {
-- case 2:
-- if (i == 0 || i == 13)
-- continue; /* IRQ0 & IRQ13 not connected */
-- /* fall through */
-- default:
-- if (i == 2)
-- continue; /* IRQ2 is never connected */
-- }
--
-- if (ELCR_fallback) {
-- /*
-- * If the ELCR indicates a level-sensitive interrupt, we
-- * copy that information over to the MP table in the
-- * irqflag field (level sensitive, active high polarity).
-- */
-- if (ELCR_trigger(i))
-- intsrc.mpc_irqflag = 13;
-- else
-- intsrc.mpc_irqflag = 0;
-- }
--
-- intsrc.mpc_srcbusirq = i;
-- intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */
-- MP_intsrc_info(&intsrc);
-- }
--
-- intsrc.mpc_irqtype = mp_ExtINT;
-- intsrc.mpc_srcbusirq = 0;
-- intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */
-- MP_intsrc_info(&intsrc);
--}
--
--static inline void __init construct_default_ISA_mptable(int mpc_default_type)
--{
-- struct mpc_config_processor processor;
-- struct mpc_config_bus bus;
-- struct mpc_config_ioapic ioapic;
-- struct mpc_config_lintsrc lintsrc;
-- int linttypes[2] = { mp_ExtINT, mp_NMI };
-- int i;
--
-- /*
-- * local APIC has default address
-- */
-- mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
--
-- /*
-- * 2 CPUs, numbered 0 & 1.
-- */
-- processor.mpc_type = MP_PROCESSOR;
-- /* Either an integrated APIC or a discrete 82489DX. */
-- processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
-- processor.mpc_cpuflag = CPU_ENABLED;
-- processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
-- (boot_cpu_data.x86_model << 4) |
-- boot_cpu_data.x86_mask;
-- processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
-- processor.mpc_reserved[0] = 0;
-- processor.mpc_reserved[1] = 0;
-- for (i = 0; i < 2; i++) {
-- processor.mpc_apicid = i;
-- MP_processor_info(&processor);
-- }
--
-- bus.mpc_type = MP_BUS;
-- bus.mpc_busid = 0;
-- switch (mpc_default_type) {
-- default:
-- printk("???\n");
-- printk(KERN_ERR "Unknown standard configuration %d\n",
-- mpc_default_type);
-- /* fall through */
-- case 1:
-- case 5:
-- memcpy(bus.mpc_bustype, "ISA ", 6);
-- break;
-- case 2:
-- case 6:
-- case 3:
-- memcpy(bus.mpc_bustype, "EISA ", 6);
-- break;
-- case 4:
-- case 7:
-- memcpy(bus.mpc_bustype, "MCA ", 6);
-- }
-- MP_bus_info(&bus);
-- if (mpc_default_type > 4) {
-- bus.mpc_busid = 1;
-- memcpy(bus.mpc_bustype, "PCI ", 6);
-- MP_bus_info(&bus);
-- }
--
-- ioapic.mpc_type = MP_IOAPIC;
-- ioapic.mpc_apicid = 2;
-- ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
-- ioapic.mpc_flags = MPC_APIC_USABLE;
-- ioapic.mpc_apicaddr = 0xFEC00000;
-- MP_ioapic_info(&ioapic);
--
-- /*
-- * We set up most of the low 16 IO-APIC pins according to MPS rules.
-- */
-- construct_default_ioirq_mptable(mpc_default_type);
--
-- lintsrc.mpc_type = MP_LINTSRC;
-- lintsrc.mpc_irqflag = 0; /* conforming */
-- lintsrc.mpc_srcbusid = 0;
-- lintsrc.mpc_srcbusirq = 0;
-- lintsrc.mpc_destapic = MP_APIC_ALL;
-- for (i = 0; i < 2; i++) {
-- lintsrc.mpc_irqtype = linttypes[i];
-- lintsrc.mpc_destapiclint = i;
-- MP_lintsrc_info(&lintsrc);
-- }
--}
--
--static struct intel_mp_floating *mpf_found;
--
--/*
-- * Scan the memory blocks for an SMP configuration block.
-- */
--void __init get_smp_config (void)
--{
-- struct intel_mp_floating *mpf = mpf_found;
--
-- /*
-- * ACPI supports both logical (e.g. Hyper-Threading) and physical
-- * processors, where MPS only supports physical.
-- */
-- if (acpi_lapic && acpi_ioapic) {
-- printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n");
-- return;
-- }
-- else if (acpi_lapic)
-- printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n");
--
-- printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification);
-- if (mpf->mpf_feature2 & (1<<7)) {
-- printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
-- pic_mode = 1;
-- } else {
-- printk(KERN_INFO " Virtual Wire compatibility mode.\n");
-- pic_mode = 0;
-- }
--
-- /*
-- * Now see if we need to read further.
-- */
-- if (mpf->mpf_feature1 != 0) {
--
-- printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1);
-- construct_default_ISA_mptable(mpf->mpf_feature1);
--
-- } else if (mpf->mpf_physptr) {
--
-- /*
-- * Read the physical hardware table. Anything here will
-- * override the defaults.
-- */
-- if (!smp_read_mpc(isa_bus_to_virt(mpf->mpf_physptr))) {
-- smp_found_config = 0;
-- printk(KERN_ERR "BIOS bug, MP table errors detected!...\n");
-- printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n");
-- return;
-- }
-- /*
-- * If there are no explicit MP IRQ entries, then we are
-- * broken. We set up most of the low 16 IO-APIC pins to
-- * ISA defaults and hope it will work.
-- */
-- if (!mp_irq_entries) {
-- struct mpc_config_bus bus;
--
-- printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n");
--
-- bus.mpc_type = MP_BUS;
-- bus.mpc_busid = 0;
-- memcpy(bus.mpc_bustype, "ISA ", 6);
-- MP_bus_info(&bus);
--
-- construct_default_ioirq_mptable(0);
-- }
--
-- } else
-- BUG();
--
-- printk(KERN_INFO "Processors: %d\n", num_processors);
-- /*
-- * Only use the first configuration found.
-- */
--}
--
--static int __init smp_scan_config (unsigned long base, unsigned long length)
--{
-- unsigned long *bp = isa_bus_to_virt(base);
-- struct intel_mp_floating *mpf;
--
-- printk(KERN_INFO "Scan SMP from %p for %ld bytes.\n", bp,length);
-- if (sizeof(*mpf) != 16)
-- printk("Error: MPF size\n");
--
-- while (length > 0) {
-- mpf = (struct intel_mp_floating *)bp;
-- if ((*bp == SMP_MAGIC_IDENT) &&
-- (mpf->mpf_length == 1) &&
-- !mpf_checksum((unsigned char *)bp, 16) &&
-- ((mpf->mpf_specification == 1)
-- || (mpf->mpf_specification == 4)) ) {
--
-- smp_found_config = 1;
--#ifndef CONFIG_XEN
-- printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
-- mpf, virt_to_phys(mpf));
-- reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE,
-- BOOTMEM_DEFAULT);
-- if (mpf->mpf_physptr) {
-- /*
-- * We cannot access to MPC table to compute
-- * table size yet, as only few megabytes from
-- * the bottom is mapped now.
-- * PC-9800's MPC table places on the very last
-- * of physical memory; so that simply reserving
-- * PAGE_SIZE from mpg->mpf_physptr yields BUG()
-- * in reserve_bootmem.
-- */
-- unsigned long size = PAGE_SIZE;
-- unsigned long end = max_low_pfn * PAGE_SIZE;
-- if (mpf->mpf_physptr + size > end)
-- size = end - mpf->mpf_physptr;
-- reserve_bootmem(mpf->mpf_physptr, size,
-- BOOTMEM_DEFAULT);
-- }
--#else
-- printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
-- mpf, ((void *)bp - isa_bus_to_virt(base)) + base);
--#endif
--
-- mpf_found = mpf;
-- return 1;
-- }
-- bp += 4;
-- length -= 16;
-- }
-- return 0;
--}
--
--void __init find_smp_config (void)
--{
--#ifndef CONFIG_XEN
-- unsigned int address;
--#endif
--
-- /*
-- * FIXME: Linux assumes you have 640K of base ram..
-- * this continues the error...
-- *
-- * 1) Scan the bottom 1K for a signature
-- * 2) Scan the top 1K of base RAM
-- * 3) Scan the 64K of bios
-- */
-- if (smp_scan_config(0x0,0x400) ||
-- smp_scan_config(639*0x400,0x400) ||
-- smp_scan_config(0xF0000,0x10000))
-- return;
-- /*
-- * If it is an SMP machine we should know now, unless the
-- * configuration is in an EISA/MCA bus machine with an
-- * extended bios data area.
-- *
-- * there is a real-mode segmented pointer pointing to the
-- * 4K EBDA area at 0x40E, calculate and scan it here.
-- *
-- * NOTE! There are Linux loaders that will corrupt the EBDA
-- * area, and as such this kind of SMP config may be less
-- * trustworthy, simply because the SMP table may have been
-- * stomped on during early boot. These loaders are buggy and
-- * should be fixed.
-- *
-- * MP1.4 SPEC states to only scan first 1K of 4K EBDA.
-- */
--
--#ifndef CONFIG_XEN
-- address = get_bios_ebda();
-- if (address)
-- smp_scan_config(address, 0x400);
--#endif
--}
--
--int es7000_plat;
--
--/* --------------------------------------------------------------------------
-- ACPI-based MP Configuration
-- -------------------------------------------------------------------------- */
--
--#ifdef CONFIG_ACPI
--
--void __init mp_register_lapic_address(u64 address)
--{
--#ifndef CONFIG_XEN
-- mp_lapic_addr = (unsigned long) address;
--
-- set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
--
-- if (boot_cpu_physical_apicid == -1U)
-- boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
--
-- Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid);
--#endif
--}
--
--void __cpuinit mp_register_lapic (u8 id, u8 enabled)
--{
-- struct mpc_config_processor processor;
-- int boot_cpu = 0;
--
-- if (MAX_APICS - id <= 0) {
-- printk(KERN_WARNING "Processor #%d invalid (max %d)\n",
-- id, MAX_APICS);
-- return;
-- }
--
-- if (id == boot_cpu_physical_apicid)
-- boot_cpu = 1;
--
--#ifndef CONFIG_XEN
-- processor.mpc_type = MP_PROCESSOR;
-- processor.mpc_apicid = id;
-- processor.mpc_apicver = GET_APIC_VERSION(apic_read(APIC_LVR));
-- processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0);
-- processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0);
-- processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
-- (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
-- processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
-- processor.mpc_reserved[0] = 0;
-- processor.mpc_reserved[1] = 0;
--#endif
--
-- MP_processor_info(&processor);
--}
--
--#ifdef CONFIG_X86_IO_APIC
--
--#define MP_ISA_BUS 0
--#define MP_MAX_IOAPIC_PIN 127
--
--static struct mp_ioapic_routing {
-- int apic_id;
-- int gsi_base;
-- int gsi_end;
-- u32 pin_programmed[4];
--} mp_ioapic_routing[MAX_IO_APICS];
--
--static int mp_find_ioapic (int gsi)
--{
-- int i = 0;
--
-- /* Find the IOAPIC that manages this GSI. */
-- for (i = 0; i < nr_ioapics; i++) {
-- if ((gsi >= mp_ioapic_routing[i].gsi_base)
-- && (gsi <= mp_ioapic_routing[i].gsi_end))
-- return i;
-- }
--
-- printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
--
-- return -1;
--}
--
--void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base)
--{
-- int idx = 0;
-- int tmpid;
--
-- if (nr_ioapics >= MAX_IO_APICS) {
-- printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
-- "(found %d)\n", MAX_IO_APICS, nr_ioapics);
-- panic("Recompile kernel with bigger MAX_IO_APICS!\n");
-- }
-- if (!address) {
-- printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
-- " found in MADT table, skipping!\n");
-- return;
-- }
--
-- idx = nr_ioapics++;
--
-- mp_ioapics[idx].mpc_type = MP_IOAPIC;
-- mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
-- mp_ioapics[idx].mpc_apicaddr = address;
--
--#ifndef CONFIG_XEN
-- set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
--#endif
-- if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
-- && !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
-- tmpid = io_apic_get_unique_id(idx, id);
-- else
-- tmpid = id;
-- if (tmpid == -1) {
-- nr_ioapics--;
-- return;
-- }
-- mp_ioapics[idx].mpc_apicid = tmpid;
-- mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
--
-- /*
-- * Build basic GSI lookup table to facilitate gsi->io_apic lookups
-- * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
-- */
-- mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
-- mp_ioapic_routing[idx].gsi_base = gsi_base;
-- mp_ioapic_routing[idx].gsi_end = gsi_base +
-- io_apic_get_redir_entries(idx);
--
-- printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
-- "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
-- mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
-- mp_ioapic_routing[idx].gsi_base,
-- mp_ioapic_routing[idx].gsi_end);
--}
--
--void __init
--mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
--{
-- struct mpc_config_intsrc intsrc;
-- int ioapic = -1;
-- int pin = -1;
--
-- /*
-- * Convert 'gsi' to 'ioapic.pin'.
-- */
-- ioapic = mp_find_ioapic(gsi);
-- if (ioapic < 0)
-- return;
-- pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
--
-- /*
-- * TBD: This check is for faulty timer entries, where the override
-- * erroneously sets the trigger to level, resulting in a HUGE
-- * increase of timer interrupts!
-- */
-- if ((bus_irq == 0) && (trigger == 3))
-- trigger = 1;
--
-- intsrc.mpc_type = MP_INTSRC;
-- intsrc.mpc_irqtype = mp_INT;
-- intsrc.mpc_irqflag = (trigger << 2) | polarity;
-- intsrc.mpc_srcbus = MP_ISA_BUS;
-- intsrc.mpc_srcbusirq = bus_irq; /* IRQ */
-- intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */
-- intsrc.mpc_dstirq = pin; /* INTIN# */
--
-- Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n",
-- intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
-- (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
-- intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq);
--
-- mp_irqs[mp_irq_entries] = intsrc;
-- if (++mp_irq_entries == MAX_IRQ_SOURCES)
-- panic("Max # of irq sources exceeded!\n");
--}
--
--void __init mp_config_acpi_legacy_irqs (void)
--{
-- struct mpc_config_intsrc intsrc;
-- int i = 0;
-- int ioapic = -1;
--
-- /*
-- * Fabricate the legacy ISA bus (bus #31).
-- */
-- mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
-- Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
--
-- /*
-- * Older generations of ES7000 have no legacy identity mappings
-- */
-- if (es7000_plat == 1)
-- return;
--
-- /*
-- * Locate the IOAPIC that manages the ISA IRQs (0-15).
-- */
-- ioapic = mp_find_ioapic(0);
-- if (ioapic < 0)
-- return;
--
-- intsrc.mpc_type = MP_INTSRC;
-- intsrc.mpc_irqflag = 0; /* Conforming */
-- intsrc.mpc_srcbus = MP_ISA_BUS;
-- intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
--
-- /*
-- * Use the default configuration for the IRQs 0-15. Unless
-- * overridden by (MADT) interrupt source override entries.
-- */
-- for (i = 0; i < 16; i++) {
-- int idx;
--
-- for (idx = 0; idx < mp_irq_entries; idx++) {
-- struct mpc_config_intsrc *irq = mp_irqs + idx;
--
-- /* Do we already have a mapping for this ISA IRQ? */
-- if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i)
-- break;
--
-- /* Do we already have a mapping for this IOAPIC pin */
-- if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
-- (irq->mpc_dstirq == i))
-- break;
-- }
--
-- if (idx != mp_irq_entries) {
-- printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
-- continue; /* IRQ already used */
-- }
--
-- intsrc.mpc_irqtype = mp_INT;
-- intsrc.mpc_srcbusirq = i; /* Identity mapped */
-- intsrc.mpc_dstirq = i;
--
-- Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, "
-- "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
-- (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
-- intsrc.mpc_srcbusirq, intsrc.mpc_dstapic,
-- intsrc.mpc_dstirq);
--
-- mp_irqs[mp_irq_entries] = intsrc;
-- if (++mp_irq_entries == MAX_IRQ_SOURCES)
-- panic("Max # of irq sources exceeded!\n");
-- }
--}
--
--#define MAX_GSI_NUM 4096
--#define IRQ_COMPRESSION_START 64
--
--int mp_register_gsi(u32 gsi, int triggering, int polarity)
--{
-- int ioapic = -1;
-- int ioapic_pin = 0;
-- int idx, bit = 0;
-- static int pci_irq = IRQ_COMPRESSION_START;
-- /*
-- * Mapping between Global System Interrupts, which
-- * represent all possible interrupts, and IRQs
-- * assigned to actual devices.
-- */
-- static int gsi_to_irq[MAX_GSI_NUM];
--
-- /* Don't set up the ACPI SCI because it's already set up */
-- if (acpi_gbl_FADT.sci_interrupt == gsi)
-- return gsi;
--
-- ioapic = mp_find_ioapic(gsi);
-- if (ioapic < 0) {
-- printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
-- return gsi;
-- }
--
-- ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
--
-- if (ioapic_renumber_irq)
-- gsi = ioapic_renumber_irq(ioapic, gsi);
--
-- /*
-- * Avoid pin reprogramming. PRTs typically include entries
-- * with redundant pin->gsi mappings (but unique PCI devices);
-- * we only program the IOAPIC on the first.
-- */
-- bit = ioapic_pin % 32;
-- idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32);
-- if (idx > 3) {
-- printk(KERN_ERR "Invalid reference to IOAPIC pin "
-- "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
-- ioapic_pin);
-- return gsi;
-- }
-- if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
-- Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
-- mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
-- return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
-- }
--
-- mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
--
-- /*
-- * For GSI >= 64, use IRQ compression
-- */
-- if ((gsi >= IRQ_COMPRESSION_START)
-- && (triggering == ACPI_LEVEL_SENSITIVE)) {
-- /*
-- * For PCI devices assign IRQs in order, avoiding gaps
-- * due to unused I/O APIC pins.
-- */
-- int irq = gsi;
-- if (gsi < MAX_GSI_NUM) {
-- /*
-- * Retain the VIA chipset work-around (gsi > 15), but
-- * avoid a problem where the 8254 timer (IRQ0) is setup
-- * via an override (so it's not on pin 0 of the ioapic),
-- * and at the same time, the pin 0 interrupt is a PCI
-- * type. The gsi > 15 test could cause these two pins
-- * to be shared as IRQ0, and they are not shareable.
-- * So test for this condition, and if necessary, avoid
-- * the pin collision.
-- */
-- if (gsi > 15 || (gsi == 0 && !timer_uses_ioapic_pin_0))
-- gsi = pci_irq++;
-- /*
-- * Don't assign IRQ used by ACPI SCI
-- */
-- if (gsi == acpi_gbl_FADT.sci_interrupt)
-- gsi = pci_irq++;
-- gsi_to_irq[irq] = gsi;
-- } else {
-- printk(KERN_ERR "GSI %u is too high\n", gsi);
-- return gsi;
-- }
-- }
--
-- io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
-- triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
-- polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
-- return gsi;
--}
--
--#endif /* CONFIG_X86_IO_APIC */
--#endif /* CONFIG_ACPI */
---- a/arch/x86/kernel/mpparse_64-xen.c
-+++ /dev/null
-@@ -1,879 +0,0 @@
--/*
-- * Intel Multiprocessor Specification 1.1 and 1.4
-- * compliant MP-table parsing routines.
-- *
-- * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
-- * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
-- *
-- * Fixes
-- * Erich Boleyn : MP v1.4 and additional changes.
-- * Alan Cox : Added EBDA scanning
-- * Ingo Molnar : various cleanups and rewrites
-- * Maciej W. Rozycki: Bits for default MP configurations
-- * Paul Diefenbaugh: Added full ACPI support
-- */
--
--#include <linux/mm.h>
--#include <linux/init.h>
--#include <linux/delay.h>
--#include <linux/bootmem.h>
--#include <linux/kernel_stat.h>
--#include <linux/mc146818rtc.h>
--#include <linux/acpi.h>
--#include <linux/module.h>
--
--#include <asm/smp.h>
--#include <asm/mtrr.h>
--#include <asm/mpspec.h>
--#include <asm/pgalloc.h>
--#include <asm/io_apic.h>
--#include <asm/proto.h>
--#include <asm/acpi.h>
--
--/* Have we found an MP table */
--int smp_found_config;
--
--/*
-- * Various Linux-internal data structures created from the
-- * MP-table.
-- */
--DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
--int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
--
--static int mp_current_pci_id = 0;
--/* I/O APIC entries */
--struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
--
--/* # of MP IRQ source entries */
--struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
--
--/* MP IRQ source entries */
--int mp_irq_entries;
--
--int nr_ioapics;
--unsigned long mp_lapic_addr = 0;
--
--
--
--/* Processor that is doing the boot up */
--unsigned int boot_cpu_id = -1U;
--EXPORT_SYMBOL(boot_cpu_id);
--
--/* Internal processor count */
--unsigned int num_processors;
--
--unsigned disabled_cpus __cpuinitdata;
--
--/* Bitmask of physically existing CPUs */
--physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE;
--
--#ifndef CONFIG_XEN
--u16 x86_bios_cpu_apicid_init[NR_CPUS] __initdata
-- = { [0 ... NR_CPUS-1] = BAD_APICID };
--void *x86_bios_cpu_apicid_early_ptr;
--#endif
--DEFINE_PER_CPU(u16, x86_bios_cpu_apicid) = BAD_APICID;
--EXPORT_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
--
--
--/*
-- * Intel MP BIOS table parsing routines:
-- */
--
--/*
-- * Checksum an MP configuration block.
-- */
--
--static int __init mpf_checksum(unsigned char *mp, int len)
--{
-- int sum = 0;
--
-- while (len--)
-- sum += *mp++;
--
-- return sum & 0xFF;
--}
--
--#ifndef CONFIG_XEN
--static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
--{
-- int cpu;
-- cpumask_t tmp_map;
-- char *bootup_cpu = "";
--
-- if (!(m->mpc_cpuflag & CPU_ENABLED)) {
-- disabled_cpus++;
-- return;
-- }
-- if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
-- bootup_cpu = " (Bootup-CPU)";
-- boot_cpu_id = m->mpc_apicid;
-- }
--
-- printk(KERN_INFO "Processor #%d%s\n", m->mpc_apicid, bootup_cpu);
--
-- if (num_processors >= NR_CPUS) {
-- printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
-- " Processor ignored.\n", NR_CPUS);
-- return;
-- }
--
-- num_processors++;
-- cpus_complement(tmp_map, cpu_present_map);
-- cpu = first_cpu(tmp_map);
--
-- physid_set(m->mpc_apicid, phys_cpu_present_map);
-- if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
-- /*
-- * x86_bios_cpu_apicid is required to have processors listed
-- * in same order as logical cpu numbers. Hence the first
-- * entry is BSP, and so on.
-- */
-- cpu = 0;
-- }
-- /* are we being called early in kernel startup? */
-- if (x86_cpu_to_apicid_early_ptr) {
-- u16 *cpu_to_apicid = x86_cpu_to_apicid_early_ptr;
-- u16 *bios_cpu_apicid = x86_bios_cpu_apicid_early_ptr;
--
-- cpu_to_apicid[cpu] = m->mpc_apicid;
-- bios_cpu_apicid[cpu] = m->mpc_apicid;
-- } else {
-- per_cpu(x86_cpu_to_apicid, cpu) = m->mpc_apicid;
-- per_cpu(x86_bios_cpu_apicid, cpu) = m->mpc_apicid;
-- }
--
-- cpu_set(cpu, cpu_possible_map);
-- cpu_set(cpu, cpu_present_map);
--}
--#else
--static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
--{
-- num_processors++;
--}
--#endif /* CONFIG_XEN */
--
--static void __init MP_bus_info (struct mpc_config_bus *m)
--{
-- char str[7];
--
-- memcpy(str, m->mpc_bustype, 6);
-- str[6] = 0;
-- Dprintk("Bus #%d is %s\n", m->mpc_busid, str);
--
-- if (strncmp(str, "ISA", 3) == 0) {
-- set_bit(m->mpc_busid, mp_bus_not_pci);
-- } else if (strncmp(str, "PCI", 3) == 0) {
-- clear_bit(m->mpc_busid, mp_bus_not_pci);
-- mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
-- mp_current_pci_id++;
-- } else {
-- printk(KERN_ERR "Unknown bustype %s\n", str);
-- }
--}
--
--static int bad_ioapic(unsigned long address)
--{
-- if (nr_ioapics >= MAX_IO_APICS) {
-- printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
-- "(found %d)\n", MAX_IO_APICS, nr_ioapics);
-- panic("Recompile kernel with bigger MAX_IO_APICS!\n");
-- }
-- if (!address) {
-- printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
-- " found in table, skipping!\n");
-- return 1;
-- }
-- return 0;
--}
--
--static void __init MP_ioapic_info (struct mpc_config_ioapic *m)
--{
-- if (!(m->mpc_flags & MPC_APIC_USABLE))
-- return;
--
-- printk("I/O APIC #%d at 0x%X.\n",
-- m->mpc_apicid, m->mpc_apicaddr);
--
-- if (bad_ioapic(m->mpc_apicaddr))
-- return;
--
-- mp_ioapics[nr_ioapics] = *m;
-- nr_ioapics++;
--}
--
--static void __init MP_intsrc_info (struct mpc_config_intsrc *m)
--{
-- mp_irqs [mp_irq_entries] = *m;
-- Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
-- " IRQ %02x, APIC ID %x, APIC INT %02x\n",
-- m->mpc_irqtype, m->mpc_irqflag & 3,
-- (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
-- m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
-- if (++mp_irq_entries >= MAX_IRQ_SOURCES)
-- panic("Max # of irq sources exceeded!!\n");
--}
--
--static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m)
--{
-- Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
-- " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
-- m->mpc_irqtype, m->mpc_irqflag & 3,
-- (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid,
-- m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
--}
--
--/*
-- * Read/parse the MPC
-- */
--
--static int __init smp_read_mpc(struct mp_config_table *mpc)
--{
-- char str[16];
-- int count=sizeof(*mpc);
-- unsigned char *mpt=((unsigned char *)mpc)+count;
--
-- if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) {
-- printk("MPTABLE: bad signature [%c%c%c%c]!\n",
-- mpc->mpc_signature[0],
-- mpc->mpc_signature[1],
-- mpc->mpc_signature[2],
-- mpc->mpc_signature[3]);
-- return 0;
-- }
-- if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) {
-- printk("MPTABLE: checksum error!\n");
-- return 0;
-- }
-- if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) {
-- printk(KERN_ERR "MPTABLE: bad table version (%d)!!\n",
-- mpc->mpc_spec);
-- return 0;
-- }
-- if (!mpc->mpc_lapic) {
-- printk(KERN_ERR "MPTABLE: null local APIC address!\n");
-- return 0;
-- }
-- memcpy(str,mpc->mpc_oem,8);
-- str[8] = 0;
-- printk(KERN_INFO "MPTABLE: OEM ID: %s ",str);
--
-- memcpy(str,mpc->mpc_productid,12);
-- str[12] = 0;
-- printk("MPTABLE: Product ID: %s ",str);
--
-- printk("MPTABLE: APIC at: 0x%X\n",mpc->mpc_lapic);
--
-- /* save the local APIC address, it might be non-default */
-- if (!acpi_lapic)
-- mp_lapic_addr = mpc->mpc_lapic;
--
-- /*
-- * Now process the configuration blocks.
-- */
-- while (count < mpc->mpc_length) {
-- switch(*mpt) {
-- case MP_PROCESSOR:
-- {
-- struct mpc_config_processor *m=
-- (struct mpc_config_processor *)mpt;
-- if (!acpi_lapic)
-- MP_processor_info(m);
-- mpt += sizeof(*m);
-- count += sizeof(*m);
-- break;
-- }
-- case MP_BUS:
-- {
-- struct mpc_config_bus *m=
-- (struct mpc_config_bus *)mpt;
-- MP_bus_info(m);
-- mpt += sizeof(*m);
-- count += sizeof(*m);
-- break;
-- }
-- case MP_IOAPIC:
-- {
-- struct mpc_config_ioapic *m=
-- (struct mpc_config_ioapic *)mpt;
-- MP_ioapic_info(m);
-- mpt += sizeof(*m);
-- count += sizeof(*m);
-- break;
-- }
-- case MP_INTSRC:
-- {
-- struct mpc_config_intsrc *m=
-- (struct mpc_config_intsrc *)mpt;
--
-- MP_intsrc_info(m);
-- mpt += sizeof(*m);
-- count += sizeof(*m);
-- break;
-- }
-- case MP_LINTSRC:
-- {
-- struct mpc_config_lintsrc *m=
-- (struct mpc_config_lintsrc *)mpt;
-- MP_lintsrc_info(m);
-- mpt += sizeof(*m);
-- count += sizeof(*m);
-- break;
-- }
-- }
-- }
-- setup_apic_routing();
-- if (!num_processors)
-- printk(KERN_ERR "MPTABLE: no processors registered!\n");
-- return num_processors;
--}
--
--static int __init ELCR_trigger(unsigned int irq)
--{
-- unsigned int port;
--
-- port = 0x4d0 + (irq >> 3);
-- return (inb(port) >> (irq & 7)) & 1;
--}
--
--static void __init construct_default_ioirq_mptable(int mpc_default_type)
--{
-- struct mpc_config_intsrc intsrc;
-- int i;
-- int ELCR_fallback = 0;
--
-- intsrc.mpc_type = MP_INTSRC;
-- intsrc.mpc_irqflag = 0; /* conforming */
-- intsrc.mpc_srcbus = 0;
-- intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
--
-- intsrc.mpc_irqtype = mp_INT;
--
-- /*
-- * If true, we have an ISA/PCI system with no IRQ entries
-- * in the MP table. To prevent the PCI interrupts from being set up
-- * incorrectly, we try to use the ELCR. The sanity check to see if
-- * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can
-- * never be level sensitive, so we simply see if the ELCR agrees.
-- * If it does, we assume it's valid.
-- */
-- if (mpc_default_type == 5) {
-- printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n");
--
-- if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13))
-- printk(KERN_ERR "ELCR contains invalid data... not using ELCR\n");
-- else {
-- printk(KERN_INFO "Using ELCR to identify PCI interrupts\n");
-- ELCR_fallback = 1;
-- }
-- }
--
-- for (i = 0; i < 16; i++) {
-- switch (mpc_default_type) {
-- case 2:
-- if (i == 0 || i == 13)
-- continue; /* IRQ0 & IRQ13 not connected */
-- /* fall through */
-- default:
-- if (i == 2)
-- continue; /* IRQ2 is never connected */
-- }
--
-- if (ELCR_fallback) {
-- /*
-- * If the ELCR indicates a level-sensitive interrupt, we
-- * copy that information over to the MP table in the
-- * irqflag field (level sensitive, active high polarity).
-- */
-- if (ELCR_trigger(i))
-- intsrc.mpc_irqflag = 13;
-- else
-- intsrc.mpc_irqflag = 0;
-- }
--
-- intsrc.mpc_srcbusirq = i;
-- intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */
-- MP_intsrc_info(&intsrc);
-- }
--
-- intsrc.mpc_irqtype = mp_ExtINT;
-- intsrc.mpc_srcbusirq = 0;
-- intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */
-- MP_intsrc_info(&intsrc);
--}
--
--static inline void __init construct_default_ISA_mptable(int mpc_default_type)
--{
-- struct mpc_config_processor processor;
-- struct mpc_config_bus bus;
-- struct mpc_config_ioapic ioapic;
-- struct mpc_config_lintsrc lintsrc;
-- int linttypes[2] = { mp_ExtINT, mp_NMI };
-- int i;
--
-- /*
-- * local APIC has default address
-- */
-- mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
--
-- /*
-- * 2 CPUs, numbered 0 & 1.
-- */
-- processor.mpc_type = MP_PROCESSOR;
-- processor.mpc_apicver = 0;
-- processor.mpc_cpuflag = CPU_ENABLED;
-- processor.mpc_cpufeature = 0;
-- processor.mpc_featureflag = 0;
-- processor.mpc_reserved[0] = 0;
-- processor.mpc_reserved[1] = 0;
-- for (i = 0; i < 2; i++) {
-- processor.mpc_apicid = i;
-- MP_processor_info(&processor);
-- }
--
-- bus.mpc_type = MP_BUS;
-- bus.mpc_busid = 0;
-- switch (mpc_default_type) {
-- default:
-- printk(KERN_ERR "???\nUnknown standard configuration %d\n",
-- mpc_default_type);
-- /* fall through */
-- case 1:
-- case 5:
-- memcpy(bus.mpc_bustype, "ISA ", 6);
-- break;
-- }
-- MP_bus_info(&bus);
-- if (mpc_default_type > 4) {
-- bus.mpc_busid = 1;
-- memcpy(bus.mpc_bustype, "PCI ", 6);
-- MP_bus_info(&bus);
-- }
--
-- ioapic.mpc_type = MP_IOAPIC;
-- ioapic.mpc_apicid = 2;
-- ioapic.mpc_apicver = 0;
-- ioapic.mpc_flags = MPC_APIC_USABLE;
-- ioapic.mpc_apicaddr = 0xFEC00000;
-- MP_ioapic_info(&ioapic);
--
-- /*
-- * We set up most of the low 16 IO-APIC pins according to MPS rules.
-- */
-- construct_default_ioirq_mptable(mpc_default_type);
--
-- lintsrc.mpc_type = MP_LINTSRC;
-- lintsrc.mpc_irqflag = 0; /* conforming */
-- lintsrc.mpc_srcbusid = 0;
-- lintsrc.mpc_srcbusirq = 0;
-- lintsrc.mpc_destapic = MP_APIC_ALL;
-- for (i = 0; i < 2; i++) {
-- lintsrc.mpc_irqtype = linttypes[i];
-- lintsrc.mpc_destapiclint = i;
-- MP_lintsrc_info(&lintsrc);
-- }
--}
--
--static struct intel_mp_floating *mpf_found;
--
--/*
-- * Scan the memory blocks for an SMP configuration block.
-- */
--void __init get_smp_config (void)
--{
-- struct intel_mp_floating *mpf = mpf_found;
--
-- /*
-- * ACPI supports both logical (e.g. Hyper-Threading) and physical
-- * processors, where MPS only supports physical.
-- */
-- if (acpi_lapic && acpi_ioapic) {
-- printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n");
-- return;
-- }
-- else if (acpi_lapic)
-- printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n");
--
-- printk("Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification);
--
-- /*
-- * Now see if we need to read further.
-- */
-- if (mpf->mpf_feature1 != 0) {
--
-- printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1);
-- construct_default_ISA_mptable(mpf->mpf_feature1);
--
-- } else if (mpf->mpf_physptr) {
--
-- /*
-- * Read the physical hardware table. Anything here will
-- * override the defaults.
-- */
-- if (!smp_read_mpc(isa_bus_to_virt(mpf->mpf_physptr))) {
-- smp_found_config = 0;
-- printk(KERN_ERR "BIOS bug, MP table errors detected!...\n");
-- printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n");
-- return;
-- }
-- /*
-- * If there are no explicit MP IRQ entries, then we are
-- * broken. We set up most of the low 16 IO-APIC pins to
-- * ISA defaults and hope it will work.
-- */
-- if (!mp_irq_entries) {
-- struct mpc_config_bus bus;
--
-- printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n");
--
-- bus.mpc_type = MP_BUS;
-- bus.mpc_busid = 0;
-- memcpy(bus.mpc_bustype, "ISA ", 6);
-- MP_bus_info(&bus);
--
-- construct_default_ioirq_mptable(0);
-- }
--
-- } else
-- BUG();
--
-- printk(KERN_INFO "Processors: %d\n", num_processors);
-- /*
-- * Only use the first configuration found.
-- */
--}
--
--static int __init smp_scan_config (unsigned long base, unsigned long length)
--{
-- extern void __bad_mpf_size(void);
-- unsigned int *bp = isa_bus_to_virt(base);
-- struct intel_mp_floating *mpf;
--
-- Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length);
-- if (sizeof(*mpf) != 16)
-- __bad_mpf_size();
--
-- while (length > 0) {
-- mpf = (struct intel_mp_floating *)bp;
-- if ((*bp == SMP_MAGIC_IDENT) &&
-- (mpf->mpf_length == 1) &&
-- !mpf_checksum((unsigned char *)bp, 16) &&
-- ((mpf->mpf_specification == 1)
-- || (mpf->mpf_specification == 4)) ) {
--
-- smp_found_config = 1;
-- mpf_found = mpf;
-- return 1;
-- }
-- bp += 4;
-- length -= 16;
-- }
-- return 0;
--}
--
--void __init find_smp_config(void)
--{
-- unsigned int address;
--
-- /*
-- * FIXME: Linux assumes you have 640K of base ram..
-- * this continues the error...
-- *
-- * 1) Scan the bottom 1K for a signature
-- * 2) Scan the top 1K of base RAM
-- * 3) Scan the 64K of bios
-- */
-- if (smp_scan_config(0x0,0x400) ||
-- smp_scan_config(639*0x400,0x400) ||
-- smp_scan_config(0xF0000,0x10000))
-- return;
-- /*
-- * If it is an SMP machine we should know now.
-- *
-- * there is a real-mode segmented pointer pointing to the
-- * 4K EBDA area at 0x40E, calculate and scan it here.
-- *
-- * NOTE! There are Linux loaders that will corrupt the EBDA
-- * area, and as such this kind of SMP config may be less
-- * trustworthy, simply because the SMP table may have been
-- * stomped on during early boot. These loaders are buggy and
-- * should be fixed.
-- */
--
-- address = *(unsigned short *)phys_to_virt(0x40E);
-- address <<= 4;
-- if (smp_scan_config(address, 0x1000))
-- return;
--
-- /* If we have come this far, we did not find an MP table */
-- printk(KERN_INFO "No mptable found.\n");
--}
--
--/* --------------------------------------------------------------------------
-- ACPI-based MP Configuration
-- -------------------------------------------------------------------------- */
--
--#ifdef CONFIG_ACPI
--
--void __init mp_register_lapic_address(u64 address)
--{
--#ifndef CONFIG_XEN
-- mp_lapic_addr = (unsigned long) address;
-- set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
-- if (boot_cpu_id == -1U)
-- boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID));
--#endif
--}
--
--void __cpuinit mp_register_lapic (u8 id, u8 enabled)
--{
-- struct mpc_config_processor processor;
-- int boot_cpu = 0;
--
-- if (id == boot_cpu_id)
-- boot_cpu = 1;
--
--#ifndef CONFIG_XEN
-- processor.mpc_type = MP_PROCESSOR;
-- processor.mpc_apicid = id;
-- processor.mpc_apicver = 0;
-- processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0);
-- processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0);
-- processor.mpc_cpufeature = 0;
-- processor.mpc_featureflag = 0;
-- processor.mpc_reserved[0] = 0;
-- processor.mpc_reserved[1] = 0;
--#endif
--
-- MP_processor_info(&processor);
--}
--
--#define MP_ISA_BUS 0
--#define MP_MAX_IOAPIC_PIN 127
--
--static struct mp_ioapic_routing {
-- int apic_id;
-- int gsi_start;
-- int gsi_end;
-- u32 pin_programmed[4];
--} mp_ioapic_routing[MAX_IO_APICS];
--
--static int mp_find_ioapic(int gsi)
--{
-- int i = 0;
--
-- /* Find the IOAPIC that manages this GSI. */
-- for (i = 0; i < nr_ioapics; i++) {
-- if ((gsi >= mp_ioapic_routing[i].gsi_start)
-- && (gsi <= mp_ioapic_routing[i].gsi_end))
-- return i;
-- }
--
-- printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
-- return -1;
--}
--
--static u8 uniq_ioapic_id(u8 id)
--{
-- int i;
-- DECLARE_BITMAP(used, 256);
-- bitmap_zero(used, 256);
-- for (i = 0; i < nr_ioapics; i++) {
-- struct mpc_config_ioapic *ia = &mp_ioapics[i];
-- __set_bit(ia->mpc_apicid, used);
-- }
-- if (!test_bit(id, used))
-- return id;
-- return find_first_zero_bit(used, 256);
--}
--
--void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base)
--{
-- int idx = 0;
--
-- if (bad_ioapic(address))
-- return;
--
-- idx = nr_ioapics;
--
-- mp_ioapics[idx].mpc_type = MP_IOAPIC;
-- mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
-- mp_ioapics[idx].mpc_apicaddr = address;
--
--#ifndef CONFIG_XEN
-- set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
--#endif
-- mp_ioapics[idx].mpc_apicid = uniq_ioapic_id(id);
-- mp_ioapics[idx].mpc_apicver = 0;
--
-- /*
-- * Build basic IRQ lookup table to facilitate gsi->io_apic lookups
-- * and to prevent reprogramming of IOAPIC pins (PCI IRQs).
-- */
-- mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
-- mp_ioapic_routing[idx].gsi_start = gsi_base;
-- mp_ioapic_routing[idx].gsi_end = gsi_base +
-- io_apic_get_redir_entries(idx);
--
-- printk(KERN_INFO "IOAPIC[%d]: apic_id %d, address 0x%x, "
-- "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
-- mp_ioapics[idx].mpc_apicaddr,
-- mp_ioapic_routing[idx].gsi_start,
-- mp_ioapic_routing[idx].gsi_end);
--
-- nr_ioapics++;
--}
--
--void __init
--mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
--{
-- struct mpc_config_intsrc intsrc;
-- int ioapic = -1;
-- int pin = -1;
--
-- /*
-- * Convert 'gsi' to 'ioapic.pin'.
-- */
-- ioapic = mp_find_ioapic(gsi);
-- if (ioapic < 0)
-- return;
-- pin = gsi - mp_ioapic_routing[ioapic].gsi_start;
--
-- /*
-- * TBD: This check is for faulty timer entries, where the override
-- * erroneously sets the trigger to level, resulting in a HUGE
-- * increase of timer interrupts!
-- */
-- if ((bus_irq == 0) && (trigger == 3))
-- trigger = 1;
--
-- intsrc.mpc_type = MP_INTSRC;
-- intsrc.mpc_irqtype = mp_INT;
-- intsrc.mpc_irqflag = (trigger << 2) | polarity;
-- intsrc.mpc_srcbus = MP_ISA_BUS;
-- intsrc.mpc_srcbusirq = bus_irq; /* IRQ */
-- intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */
-- intsrc.mpc_dstirq = pin; /* INTIN# */
--
-- Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n",
-- intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
-- (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
-- intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq);
--
-- mp_irqs[mp_irq_entries] = intsrc;
-- if (++mp_irq_entries == MAX_IRQ_SOURCES)
-- panic("Max # of irq sources exceeded!\n");
--}
--
--void __init mp_config_acpi_legacy_irqs(void)
--{
-- struct mpc_config_intsrc intsrc;
-- int i = 0;
-- int ioapic = -1;
--
-- /*
-- * Fabricate the legacy ISA bus (bus #31).
-- */
-- set_bit(MP_ISA_BUS, mp_bus_not_pci);
--
-- /*
-- * Locate the IOAPIC that manages the ISA IRQs (0-15).
-- */
-- ioapic = mp_find_ioapic(0);
-- if (ioapic < 0)
-- return;
--
-- intsrc.mpc_type = MP_INTSRC;
-- intsrc.mpc_irqflag = 0; /* Conforming */
-- intsrc.mpc_srcbus = MP_ISA_BUS;
-- intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
--
-- /*
-- * Use the default configuration for the IRQs 0-15. Unless
-- * overridden by (MADT) interrupt source override entries.
-- */
-- for (i = 0; i < 16; i++) {
-- int idx;
--
-- for (idx = 0; idx < mp_irq_entries; idx++) {
-- struct mpc_config_intsrc *irq = mp_irqs + idx;
--
-- /* Do we already have a mapping for this ISA IRQ? */
-- if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i)
-- break;
--
-- /* Do we already have a mapping for this IOAPIC pin */
-- if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
-- (irq->mpc_dstirq == i))
-- break;
-- }
--
-- if (idx != mp_irq_entries) {
-- printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
-- continue; /* IRQ already used */
-- }
--
-- intsrc.mpc_irqtype = mp_INT;
-- intsrc.mpc_srcbusirq = i; /* Identity mapped */
-- intsrc.mpc_dstirq = i;
--
-- Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, "
-- "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
-- (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
-- intsrc.mpc_srcbusirq, intsrc.mpc_dstapic,
-- intsrc.mpc_dstirq);
--
-- mp_irqs[mp_irq_entries] = intsrc;
-- if (++mp_irq_entries == MAX_IRQ_SOURCES)
-- panic("Max # of irq sources exceeded!\n");
-- }
--}
--
--int mp_register_gsi(u32 gsi, int triggering, int polarity)
--{
-- int ioapic = -1;
-- int ioapic_pin = 0;
-- int idx, bit = 0;
--
-- if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
-- return gsi;
--
-- /* Don't set up the ACPI SCI because it's already set up */
-- if (acpi_gbl_FADT.sci_interrupt == gsi)
-- return gsi;
--
-- ioapic = mp_find_ioapic(gsi);
-- if (ioapic < 0) {
-- printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
-- return gsi;
-- }
--
-- ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_start;
--
-- /*
-- * Avoid pin reprogramming. PRTs typically include entries
-- * with redundant pin->gsi mappings (but unique PCI devices);
-- * we only program the IOAPIC on the first.
-- */
-- bit = ioapic_pin % 32;
-- idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32);
-- if (idx > 3) {
-- printk(KERN_ERR "Invalid reference to IOAPIC pin "
-- "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
-- ioapic_pin);
-- return gsi;
-- }
-- if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
-- Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
-- mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
-- return gsi;
-- }
--
-- mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
--
-- io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
-- triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
-- polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
-- return gsi;
--}
--#endif /*CONFIG_ACPI*/
---- /dev/null
-+++ b/arch/x86/kernel/mpparse-xen.c
-@@ -0,0 +1,1104 @@
-+/*
-+ * Intel Multiprocessor Specification 1.1 and 1.4
-+ * compliant MP-table parsing routines.
-+ *
-+ * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
-+ * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
-+ * (c) 2008 Alexey Starikovskiy <astarikovskiy@suse.de>
-+ */
-+
-+#include <linux/mm.h>
-+#include <linux/init.h>
-+#include <linux/delay.h>
-+#include <linux/bootmem.h>
-+#include <linux/kernel_stat.h>
-+#include <linux/mc146818rtc.h>
-+#include <linux/bitops.h>
-+#include <linux/acpi.h>
-+#include <linux/module.h>
-+
-+#include <asm/smp.h>
-+#include <asm/mtrr.h>
-+#include <asm/mpspec.h>
-+#include <asm/pgalloc.h>
-+#include <asm/io_apic.h>
-+#include <asm/proto.h>
-+#include <asm/acpi.h>
-+#include <asm/bios_ebda.h>
-+
-+#include <mach_apic.h>
-+#ifdef CONFIG_X86_32
-+#include <mach_apicdef.h>
-+#include <mach_mpparse.h>
-+#endif
-+
-+/* Have we found an MP table */
-+int smp_found_config;
-+
-+/*
-+ * Various Linux-internal data structures created from the
-+ * MP-table.
-+ */
-+#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
-+int mp_bus_id_to_type[MAX_MP_BUSSES];
-+#endif
-+
-+DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
-+int mp_bus_id_to_pci_bus[MAX_MP_BUSSES] = {[0 ... MAX_MP_BUSSES - 1] = -1 };
-+
-+static int mp_current_pci_id;
-+
-+int pic_mode;
-+
-+/*
-+ * Intel MP BIOS table parsing routines:
-+ */
-+
-+/*
-+ * Checksum an MP configuration block.
-+ */
-+
-+static int __init mpf_checksum(unsigned char *mp, int len)
-+{
-+ int sum = 0;
-+
-+ while (len--)
-+ sum += *mp++;
-+
-+ return sum & 0xFF;
-+}
-+
-+#ifdef CONFIG_X86_NUMAQ
-+/*
-+ * Have to match translation table entries to main table entries by counter
-+ * hence the mpc_record variable .... can't see a less disgusting way of
-+ * doing this ....
-+ */
-+
-+static int mpc_record;
-+static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY]
-+ __cpuinitdata;
-+#endif
-+
-+#ifndef CONFIG_XEN
-+static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
-+{
-+ int apicid;
-+ char *bootup_cpu = "";
-+
-+ if (!(m->mpc_cpuflag & CPU_ENABLED)) {
-+ disabled_cpus++;
-+ return;
-+ }
-+#ifdef CONFIG_X86_NUMAQ
-+ apicid = mpc_apic_id(m, translation_table[mpc_record]);
-+#else
-+ apicid = m->mpc_apicid;
-+#endif
-+ if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
-+ bootup_cpu = " (Bootup-CPU)";
-+ boot_cpu_physical_apicid = m->mpc_apicid;
-+ }
-+
-+ printk(KERN_INFO "Processor #%d%s\n", m->mpc_apicid, bootup_cpu);
-+ generic_processor_info(apicid, m->mpc_apicver);
-+}
-+#else
-+static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
-+{
-+ num_processors++;
-+}
-+#endif /* CONFIG_XEN */
-+
-+static void __init MP_bus_info(struct mpc_config_bus *m)
-+{
-+ char str[7];
-+
-+ memcpy(str, m->mpc_bustype, 6);
-+ str[6] = 0;
-+
-+#ifdef CONFIG_X86_NUMAQ
-+ mpc_oem_bus_info(m, str, translation_table[mpc_record]);
-+#else
-+ Dprintk("Bus #%d is %s\n", m->mpc_busid, str);
-+#endif
-+
-+#if MAX_MP_BUSSES < 256
-+ if (m->mpc_busid >= MAX_MP_BUSSES) {
-+ printk(KERN_WARNING "MP table busid value (%d) for bustype %s "
-+ " is too large, max. supported is %d\n",
-+ m->mpc_busid, str, MAX_MP_BUSSES - 1);
-+ return;
-+ }
-+#endif
-+
-+ if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA) - 1) == 0) {
-+ set_bit(m->mpc_busid, mp_bus_not_pci);
-+#if defined(CONFIG_EISA) || defined (CONFIG_MCA)
-+ mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
-+#endif
-+ } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) {
-+#ifdef CONFIG_X86_NUMAQ
-+ mpc_oem_pci_bus(m, translation_table[mpc_record]);
-+#endif
-+ clear_bit(m->mpc_busid, mp_bus_not_pci);
-+ mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
-+ mp_current_pci_id++;
-+#if defined(CONFIG_EISA) || defined (CONFIG_MCA)
-+ mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
-+ } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) {
-+ mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA;
-+ } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA) - 1) == 0) {
-+ mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA;
-+#endif
-+ } else
-+ printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
-+}
-+
-+#ifdef CONFIG_X86_IO_APIC
-+
-+static int bad_ioapic(unsigned long address)
-+{
-+ if (nr_ioapics >= MAX_IO_APICS) {
-+ printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
-+ "(found %d)\n", MAX_IO_APICS, nr_ioapics);
-+ panic("Recompile kernel with bigger MAX_IO_APICS!\n");
-+ }
-+ if (!address) {
-+ printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
-+ " found in table, skipping!\n");
-+ return 1;
-+ }
-+ return 0;
-+}
-+
-+static void __init MP_ioapic_info(struct mpc_config_ioapic *m)
-+{
-+ if (!(m->mpc_flags & MPC_APIC_USABLE))
-+ return;
-+
-+ printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n",
-+ m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
-+
-+ if (bad_ioapic(m->mpc_apicaddr))
-+ return;
-+
-+ mp_ioapics[nr_ioapics] = *m;
-+ nr_ioapics++;
-+}
-+
-+static void __init MP_intsrc_info(struct mpc_config_intsrc *m)
-+{
-+ mp_irqs[mp_irq_entries] = *m;
-+ Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
-+ " IRQ %02x, APIC ID %x, APIC INT %02x\n",
-+ m->mpc_irqtype, m->mpc_irqflag & 3,
-+ (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
-+ m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
-+ if (++mp_irq_entries == MAX_IRQ_SOURCES)
-+ panic("Max # of irq sources exceeded!!\n");
-+}
-+
-+#endif
-+
-+static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m)
-+{
-+ Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
-+ " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
-+ m->mpc_irqtype, m->mpc_irqflag & 3,
-+ (m->mpc_irqflag >> 2) & 3, m->mpc_srcbusid,
-+ m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
-+}
-+
-+#ifdef CONFIG_X86_NUMAQ
-+static void __init MP_translation_info(struct mpc_config_translation *m)
-+{
-+ printk(KERN_INFO
-+ "Translation: record %d, type %d, quad %d, global %d, local %d\n",
-+ mpc_record, m->trans_type, m->trans_quad, m->trans_global,
-+ m->trans_local);
-+
-+ if (mpc_record >= MAX_MPC_ENTRY)
-+ printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
-+ else
-+ translation_table[mpc_record] = m; /* stash this for later */
-+ if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
-+ node_set_online(m->trans_quad);
-+}
-+
-+/*
-+ * Read/parse the MPC oem tables
-+ */
-+
-+static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable,
-+ unsigned short oemsize)
-+{
-+ int count = sizeof(*oemtable); /* the header size */
-+ unsigned char *oemptr = ((unsigned char *)oemtable) + count;
-+
-+ mpc_record = 0;
-+ printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n",
-+ oemtable);
-+ if (memcmp(oemtable->oem_signature, MPC_OEM_SIGNATURE, 4)) {
-+ printk(KERN_WARNING
-+ "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
-+ oemtable->oem_signature[0], oemtable->oem_signature[1],
-+ oemtable->oem_signature[2], oemtable->oem_signature[3]);
-+ return;
-+ }
-+ if (mpf_checksum((unsigned char *)oemtable, oemtable->oem_length)) {
-+ printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
-+ return;
-+ }
-+ while (count < oemtable->oem_length) {
-+ switch (*oemptr) {
-+ case MP_TRANSLATION:
-+ {
-+ struct mpc_config_translation *m =
-+ (struct mpc_config_translation *)oemptr;
-+ MP_translation_info(m);
-+ oemptr += sizeof(*m);
-+ count += sizeof(*m);
-+ ++mpc_record;
-+ break;
-+ }
-+ default:
-+ {
-+ printk(KERN_WARNING
-+ "Unrecognised OEM table entry type! - %d\n",
-+ (int)*oemptr);
-+ return;
-+ }
-+ }
-+ }
-+}
-+
-+static inline void mps_oem_check(struct mp_config_table *mpc, char *oem,
-+ char *productid)
-+{
-+ if (strncmp(oem, "IBM NUMA", 8))
-+ printk("Warning! May not be a NUMA-Q system!\n");
-+ if (mpc->mpc_oemptr)
-+ smp_read_mpc_oem((struct mp_config_oemtable *)mpc->mpc_oemptr,
-+ mpc->mpc_oemsize);
-+}
-+#endif /* CONFIG_X86_NUMAQ */
-+
-+/*
-+ * Read/parse the MPC
-+ */
-+
-+static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
-+{
-+ char str[16];
-+ char oem[10];
-+ int count = sizeof(*mpc);
-+ unsigned char *mpt = ((unsigned char *)mpc) + count;
-+
-+ if (memcmp(mpc->mpc_signature, MPC_SIGNATURE, 4)) {
-+ printk(KERN_ERR "MPTABLE: bad signature [%c%c%c%c]!\n",
-+ mpc->mpc_signature[0], mpc->mpc_signature[1],
-+ mpc->mpc_signature[2], mpc->mpc_signature[3]);
-+ return 0;
-+ }
-+ if (mpf_checksum((unsigned char *)mpc, mpc->mpc_length)) {
-+ printk(KERN_ERR "MPTABLE: checksum error!\n");
-+ return 0;
-+ }
-+ if (mpc->mpc_spec != 0x01 && mpc->mpc_spec != 0x04) {
-+ printk(KERN_ERR "MPTABLE: bad table version (%d)!!\n",
-+ mpc->mpc_spec);
-+ return 0;
-+ }
-+ if (!mpc->mpc_lapic) {
-+ printk(KERN_ERR "MPTABLE: null local APIC address!\n");
-+ return 0;
-+ }
-+ memcpy(oem, mpc->mpc_oem, 8);
-+ oem[8] = 0;
-+ printk(KERN_INFO "MPTABLE: OEM ID: %s ", oem);
-+
-+ memcpy(str, mpc->mpc_productid, 12);
-+ str[12] = 0;
-+ printk("Product ID: %s ", str);
-+
-+#ifdef CONFIG_X86_32
-+ mps_oem_check(mpc, oem, str);
-+#endif
-+ printk(KERN_INFO "MPTABLE: Product ID: %s ", str);
-+
-+ printk(KERN_INFO "MPTABLE: APIC at: 0x%X\n", mpc->mpc_lapic);
-+
-+ /* save the local APIC address, it might be non-default */
-+ if (!acpi_lapic)
-+ mp_lapic_addr = mpc->mpc_lapic;
-+
-+ if (early)
-+ return 1;
-+
-+ /*
-+ * Now process the configuration blocks.
-+ */
-+#ifdef CONFIG_X86_NUMAQ
-+ mpc_record = 0;
-+#endif
-+ while (count < mpc->mpc_length) {
-+ switch (*mpt) {
-+ case MP_PROCESSOR:
-+ {
-+ struct mpc_config_processor *m =
-+ (struct mpc_config_processor *)mpt;
-+ /* ACPI may have already provided this data */
-+ if (!acpi_lapic)
-+ MP_processor_info(m);
-+ mpt += sizeof(*m);
-+ count += sizeof(*m);
-+ break;
-+ }
-+ case MP_BUS:
-+ {
-+ struct mpc_config_bus *m =
-+ (struct mpc_config_bus *)mpt;
-+ MP_bus_info(m);
-+ mpt += sizeof(*m);
-+ count += sizeof(*m);
-+ break;
-+ }
-+ case MP_IOAPIC:
-+ {
-+#ifdef CONFIG_X86_IO_APIC
-+ struct mpc_config_ioapic *m =
-+ (struct mpc_config_ioapic *)mpt;
-+ MP_ioapic_info(m);
-+#endif
-+ mpt += sizeof(struct mpc_config_ioapic);
-+ count += sizeof(struct mpc_config_ioapic);
-+ break;
-+ }
-+ case MP_INTSRC:
-+ {
-+#ifdef CONFIG_X86_IO_APIC
-+ struct mpc_config_intsrc *m =
-+ (struct mpc_config_intsrc *)mpt;
-+
-+ MP_intsrc_info(m);
-+#endif
-+ mpt += sizeof(struct mpc_config_intsrc);
-+ count += sizeof(struct mpc_config_intsrc);
-+ break;
-+ }
-+ case MP_LINTSRC:
-+ {
-+ struct mpc_config_lintsrc *m =
-+ (struct mpc_config_lintsrc *)mpt;
-+ MP_lintsrc_info(m);
-+ mpt += sizeof(*m);
-+ count += sizeof(*m);
-+ break;
-+ }
-+ default:
-+ /* wrong mptable */
-+ printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n");
-+ printk(KERN_ERR "type %x\n", *mpt);
-+ print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16,
-+ 1, mpc, mpc->mpc_length, 1);
-+ count = mpc->mpc_length;
-+ break;
-+ }
-+#ifdef CONFIG_X86_NUMAQ
-+ ++mpc_record;
-+#endif
-+ }
-+ setup_apic_routing();
-+ if (!num_processors)
-+ printk(KERN_ERR "MPTABLE: no processors registered!\n");
-+ return num_processors;
-+}
-+
-+#ifdef CONFIG_X86_IO_APIC
-+
-+static int __init ELCR_trigger(unsigned int irq)
-+{
-+ unsigned int port;
-+
-+ port = 0x4d0 + (irq >> 3);
-+ return (inb(port) >> (irq & 7)) & 1;
-+}
-+
-+static void __init construct_default_ioirq_mptable(int mpc_default_type)
-+{
-+ struct mpc_config_intsrc intsrc;
-+ int i;
-+ int ELCR_fallback = 0;
-+
-+ intsrc.mpc_type = MP_INTSRC;
-+ intsrc.mpc_irqflag = 0; /* conforming */
-+ intsrc.mpc_srcbus = 0;
-+ intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
-+
-+ intsrc.mpc_irqtype = mp_INT;
-+
-+ /*
-+ * If true, we have an ISA/PCI system with no IRQ entries
-+ * in the MP table. To prevent the PCI interrupts from being set up
-+ * incorrectly, we try to use the ELCR. The sanity check to see if
-+ * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can
-+ * never be level sensitive, so we simply see if the ELCR agrees.
-+ * If it does, we assume it's valid.
-+ */
-+ if (mpc_default_type == 5) {
-+ printk(KERN_INFO "ISA/PCI bus type with no IRQ information... "
-+ "falling back to ELCR\n");
-+
-+ if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) ||
-+ ELCR_trigger(13))
-+ printk(KERN_ERR "ELCR contains invalid data... "
-+ "not using ELCR\n");
-+ else {
-+ printk(KERN_INFO
-+ "Using ELCR to identify PCI interrupts\n");
-+ ELCR_fallback = 1;
-+ }
-+ }
-+
-+ for (i = 0; i < 16; i++) {
-+ switch (mpc_default_type) {
-+ case 2:
-+ if (i == 0 || i == 13)
-+ continue; /* IRQ0 & IRQ13 not connected */
-+ /* fall through */
-+ default:
-+ if (i == 2)
-+ continue; /* IRQ2 is never connected */
-+ }
-+
-+ if (ELCR_fallback) {
-+ /*
-+ * If the ELCR indicates a level-sensitive interrupt, we
-+ * copy that information over to the MP table in the
-+ * irqflag field (level sensitive, active high polarity).
-+ */
-+ if (ELCR_trigger(i))
-+ intsrc.mpc_irqflag = 13;
-+ else
-+ intsrc.mpc_irqflag = 0;
-+ }
-+
-+ intsrc.mpc_srcbusirq = i;
-+ intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */
-+ MP_intsrc_info(&intsrc);
-+ }
-+
-+ intsrc.mpc_irqtype = mp_ExtINT;
-+ intsrc.mpc_srcbusirq = 0;
-+ intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */
-+ MP_intsrc_info(&intsrc);
-+}
-+
-+#endif
-+
-+static inline void __init construct_default_ISA_mptable(int mpc_default_type)
-+{
-+ struct mpc_config_processor processor;
-+ struct mpc_config_bus bus;
-+#ifdef CONFIG_X86_IO_APIC
-+ struct mpc_config_ioapic ioapic;
-+#endif
-+ struct mpc_config_lintsrc lintsrc;
-+ int linttypes[2] = { mp_ExtINT, mp_NMI };
-+ int i;
-+
-+ /*
-+ * local APIC has default address
-+ */
-+ mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
-+
-+ /*
-+ * 2 CPUs, numbered 0 & 1.
-+ */
-+ processor.mpc_type = MP_PROCESSOR;
-+ /* Either an integrated APIC or a discrete 82489DX. */
-+ processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
-+ processor.mpc_cpuflag = CPU_ENABLED;
-+ processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
-+ (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
-+ processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
-+ processor.mpc_reserved[0] = 0;
-+ processor.mpc_reserved[1] = 0;
-+ for (i = 0; i < 2; i++) {
-+ processor.mpc_apicid = i;
-+ MP_processor_info(&processor);
-+ }
-+
-+ bus.mpc_type = MP_BUS;
-+ bus.mpc_busid = 0;
-+ switch (mpc_default_type) {
-+ default:
-+ printk(KERN_ERR "???\nUnknown standard configuration %d\n",
-+ mpc_default_type);
-+ /* fall through */
-+ case 1:
-+ case 5:
-+ memcpy(bus.mpc_bustype, "ISA ", 6);
-+ break;
-+ case 2:
-+ case 6:
-+ case 3:
-+ memcpy(bus.mpc_bustype, "EISA ", 6);
-+ break;
-+ case 4:
-+ case 7:
-+ memcpy(bus.mpc_bustype, "MCA ", 6);
-+ }
-+ MP_bus_info(&bus);
-+ if (mpc_default_type > 4) {
-+ bus.mpc_busid = 1;
-+ memcpy(bus.mpc_bustype, "PCI ", 6);
-+ MP_bus_info(&bus);
-+ }
-+
-+#ifdef CONFIG_X86_IO_APIC
-+ ioapic.mpc_type = MP_IOAPIC;
-+ ioapic.mpc_apicid = 2;
-+ ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
-+ ioapic.mpc_flags = MPC_APIC_USABLE;
-+ ioapic.mpc_apicaddr = 0xFEC00000;
-+ MP_ioapic_info(&ioapic);
-+
-+ /*
-+ * We set up most of the low 16 IO-APIC pins according to MPS rules.
-+ */
-+ construct_default_ioirq_mptable(mpc_default_type);
-+#endif
-+ lintsrc.mpc_type = MP_LINTSRC;
-+ lintsrc.mpc_irqflag = 0; /* conforming */
-+ lintsrc.mpc_srcbusid = 0;
-+ lintsrc.mpc_srcbusirq = 0;
-+ lintsrc.mpc_destapic = MP_APIC_ALL;
-+ for (i = 0; i < 2; i++) {
-+ lintsrc.mpc_irqtype = linttypes[i];
-+ lintsrc.mpc_destapiclint = i;
-+ MP_lintsrc_info(&lintsrc);
-+ }
-+}
-+
-+static struct intel_mp_floating *mpf_found;
-+
-+/*
-+ * Scan the memory blocks for an SMP configuration block.
-+ */
-+static void __init __get_smp_config(unsigned early)
-+{
-+ struct intel_mp_floating *mpf = mpf_found;
-+
-+ if (acpi_lapic && early)
-+ return;
-+ /*
-+ * ACPI supports both logical (e.g. Hyper-Threading) and physical
-+ * processors, where MPS only supports physical.
-+ */
-+ if (acpi_lapic && acpi_ioapic) {
-+ printk(KERN_INFO "Using ACPI (MADT) for SMP configuration "
-+ "information\n");
-+ return;
-+ } else if (acpi_lapic)
-+ printk(KERN_INFO "Using ACPI for processor (LAPIC) "
-+ "configuration information\n");
-+
-+ printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n",
-+ mpf->mpf_specification);
-+#ifdef CONFIG_X86_32
-+ if (mpf->mpf_feature2 & (1 << 7)) {
-+ printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
-+ pic_mode = 1;
-+ } else {
-+ printk(KERN_INFO " Virtual Wire compatibility mode.\n");
-+ pic_mode = 0;
-+ }
-+#endif
-+ /*
-+ * Now see if we need to read further.
-+ */
-+ if (mpf->mpf_feature1 != 0) {
-+ if (early) {
-+ /*
-+ * local APIC has default address
-+ */
-+ mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
-+ return;
-+ }
-+
-+ printk(KERN_INFO "Default MP configuration #%d\n",
-+ mpf->mpf_feature1);
-+ construct_default_ISA_mptable(mpf->mpf_feature1);
-+
-+ } else if (mpf->mpf_physptr) {
-+
-+ /*
-+ * Read the physical hardware table. Anything here will
-+ * override the defaults.
-+ */
-+ if (!smp_read_mpc(isa_bus_to_virt(mpf->mpf_physptr), early)) {
-+ smp_found_config = 0;
-+ printk(KERN_ERR
-+ "BIOS bug, MP table errors detected!...\n");
-+ printk(KERN_ERR "... disabling SMP support. "
-+ "(tell your hw vendor)\n");
-+ return;
-+ }
-+
-+ if (early)
-+ return;
-+#ifdef CONFIG_X86_IO_APIC
-+ /*
-+ * If there are no explicit MP IRQ entries, then we are
-+ * broken. We set up most of the low 16 IO-APIC pins to
-+ * ISA defaults and hope it will work.
-+ */
-+ if (!mp_irq_entries) {
-+ struct mpc_config_bus bus;
-+
-+ printk(KERN_ERR "BIOS bug, no explicit IRQ entries, "
-+ "using default mptable. "
-+ "(tell your hw vendor)\n");
-+
-+ bus.mpc_type = MP_BUS;
-+ bus.mpc_busid = 0;
-+ memcpy(bus.mpc_bustype, "ISA ", 6);
-+ MP_bus_info(&bus);
-+
-+ construct_default_ioirq_mptable(0);
-+ }
-+#endif
-+ } else
-+ BUG();
-+
-+ if (!early)
-+ printk(KERN_INFO "Processors: %d\n", num_processors);
-+ /*
-+ * Only use the first configuration found.
-+ */
-+}
-+
-+void __init early_get_smp_config(void)
-+{
-+ __get_smp_config(1);
-+}
-+
-+void __init get_smp_config(void)
-+{
-+ __get_smp_config(0);
-+}
-+
-+static int __init smp_scan_config(unsigned long base, unsigned long length,
-+ unsigned reserve)
-+{
-+ unsigned int *bp = isa_bus_to_virt(base);
-+ struct intel_mp_floating *mpf;
-+
-+ Dprintk("Scan SMP from %p for %ld bytes.\n", bp, length);
-+ BUILD_BUG_ON(sizeof(*mpf) != 16);
-+
-+ while (length > 0) {
-+ mpf = (struct intel_mp_floating *)bp;
-+ if ((*bp == SMP_MAGIC_IDENT) &&
-+ (mpf->mpf_length == 1) &&
-+ !mpf_checksum((unsigned char *)bp, 16) &&
-+ ((mpf->mpf_specification == 1)
-+ || (mpf->mpf_specification == 4))) {
-+
-+ smp_found_config = 1;
-+ mpf_found = mpf;
-+#ifdef CONFIG_X86_32
-+#ifndef CONFIG_XEN
-+ printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
-+ mpf, virt_to_phys(mpf));
-+ reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE,
-+ BOOTMEM_DEFAULT);
-+ if (mpf->mpf_physptr) {
-+ /*
-+ * We cannot access to MPC table to compute
-+ * table size yet, as only few megabytes from
-+ * the bottom is mapped now.
-+ * PC-9800's MPC table places on the very last
-+ * of physical memory; so that simply reserving
-+ * PAGE_SIZE from mpg->mpf_physptr yields BUG()
-+ * in reserve_bootmem.
-+ */
-+ unsigned long size = PAGE_SIZE;
-+ unsigned long end = max_low_pfn * PAGE_SIZE;
-+ if (mpf->mpf_physptr + size > end)
-+ size = end - mpf->mpf_physptr;
-+ reserve_bootmem(mpf->mpf_physptr, size,
-+ BOOTMEM_DEFAULT);
-+ }
-+#else
-+ printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
-+ mpf, ((void *)bp - isa_bus_to_virt(base)) + base);
-+#endif
-+#elif !defined(CONFIG_XEN)
-+ if (!reserve)
-+ return 1;
-+
-+ reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE);
-+ if (mpf->mpf_physptr)
-+ reserve_bootmem_generic(mpf->mpf_physptr,
-+ PAGE_SIZE);
-+#endif
-+ return 1;
-+ }
-+ bp += 4;
-+ length -= 16;
-+ }
-+ return 0;
-+}
-+
-+static void __init __find_smp_config(unsigned reserve)
-+{
-+#ifndef CONFIG_XEN
-+ unsigned int address;
-+#endif
-+
-+ /*
-+ * FIXME: Linux assumes you have 640K of base ram..
-+ * this continues the error...
-+ *
-+ * 1) Scan the bottom 1K for a signature
-+ * 2) Scan the top 1K of base RAM
-+ * 3) Scan the 64K of bios
-+ */
-+ if (smp_scan_config(0x0, 0x400, reserve) ||
-+ smp_scan_config(639 * 0x400, 0x400, reserve) ||
-+ smp_scan_config(0xF0000, 0x10000, reserve))
-+ return;
-+ /*
-+ * If it is an SMP machine we should know now, unless the
-+ * configuration is in an EISA/MCA bus machine with an
-+ * extended bios data area.
-+ *
-+ * there is a real-mode segmented pointer pointing to the
-+ * 4K EBDA area at 0x40E, calculate and scan it here.
-+ *
-+ * NOTE! There are Linux loaders that will corrupt the EBDA
-+ * area, and as such this kind of SMP config may be less
-+ * trustworthy, simply because the SMP table may have been
-+ * stomped on during early boot. These loaders are buggy and
-+ * should be fixed.
-+ *
-+ * MP1.4 SPEC states to only scan first 1K of 4K EBDA.
-+ */
-+
-+#ifndef CONFIG_XEN
-+ address = get_bios_ebda();
-+ if (address)
-+ smp_scan_config(address, 0x400, reserve);
-+#endif
-+}
-+
-+void __init early_find_smp_config(void)
-+{
-+ __find_smp_config(0);
-+}
-+
-+void __init find_smp_config(void)
-+{
-+ __find_smp_config(1);
-+}
-+
-+/* --------------------------------------------------------------------------
-+ ACPI-based MP Configuration
-+ -------------------------------------------------------------------------- */
-+
-+/*
-+ * Keep this outside and initialized to 0, for !CONFIG_ACPI builds:
-+ */
-+int es7000_plat;
-+
-+#ifdef CONFIG_ACPI
-+
-+#ifdef CONFIG_X86_IO_APIC
-+
-+#define MP_ISA_BUS 0
-+
-+extern struct mp_ioapic_routing mp_ioapic_routing[MAX_IO_APICS];
-+
-+static int mp_find_ioapic(int gsi)
-+{
-+ int i = 0;
-+
-+ /* Find the IOAPIC that manages this GSI. */
-+ for (i = 0; i < nr_ioapics; i++) {
-+ if ((gsi >= mp_ioapic_routing[i].gsi_base)
-+ && (gsi <= mp_ioapic_routing[i].gsi_end))
-+ return i;
-+ }
-+
-+ printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
-+ return -1;
-+}
-+
-+static u8 __init uniq_ioapic_id(u8 id)
-+{
-+#ifdef CONFIG_X86_32
-+ if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
-+ !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
-+ return io_apic_get_unique_id(nr_ioapics, id);
-+ else
-+ return id;
-+#else
-+ int i;
-+ DECLARE_BITMAP(used, 256);
-+ bitmap_zero(used, 256);
-+ for (i = 0; i < nr_ioapics; i++) {
-+ struct mpc_config_ioapic *ia = &mp_ioapics[i];
-+ __set_bit(ia->mpc_apicid, used);
-+ }
-+ if (!test_bit(id, used))
-+ return id;
-+ return find_first_zero_bit(used, 256);
-+#endif
-+}
-+
-+void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
-+{
-+ int idx = 0;
-+
-+ if (bad_ioapic(address))
-+ return;
-+
-+ idx = nr_ioapics;
-+
-+ mp_ioapics[idx].mpc_type = MP_IOAPIC;
-+ mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
-+ mp_ioapics[idx].mpc_apicaddr = address;
-+
-+#ifndef CONFIG_XEN
-+ set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
-+#endif
-+ mp_ioapics[idx].mpc_apicid = uniq_ioapic_id(id);
-+#ifdef CONFIG_X86_32
-+ mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
-+#else
-+ mp_ioapics[idx].mpc_apicver = 0;
-+#endif
-+ /*
-+ * Build basic GSI lookup table to facilitate gsi->io_apic lookups
-+ * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
-+ */
-+ mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
-+ mp_ioapic_routing[idx].gsi_base = gsi_base;
-+ mp_ioapic_routing[idx].gsi_end = gsi_base +
-+ io_apic_get_redir_entries(idx);
-+
-+ printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
-+ "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
-+ mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
-+ mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end);
-+
-+ nr_ioapics++;
-+}
-+
-+void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
-+{
-+ struct mpc_config_intsrc intsrc;
-+ int ioapic = -1;
-+ int pin = -1;
-+
-+ /*
-+ * Convert 'gsi' to 'ioapic.pin'.
-+ */
-+ ioapic = mp_find_ioapic(gsi);
-+ if (ioapic < 0)
-+ return;
-+ pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
-+
-+ /*
-+ * TBD: This check is for faulty timer entries, where the override
-+ * erroneously sets the trigger to level, resulting in a HUGE
-+ * increase of timer interrupts!
-+ */
-+ if ((bus_irq == 0) && (trigger == 3))
-+ trigger = 1;
-+
-+ intsrc.mpc_type = MP_INTSRC;
-+ intsrc.mpc_irqtype = mp_INT;
-+ intsrc.mpc_irqflag = (trigger << 2) | polarity;
-+ intsrc.mpc_srcbus = MP_ISA_BUS;
-+ intsrc.mpc_srcbusirq = bus_irq; /* IRQ */
-+ intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */
-+ intsrc.mpc_dstirq = pin; /* INTIN# */
-+
-+ MP_intsrc_info(&intsrc);
-+}
-+
-+void __init mp_config_acpi_legacy_irqs(void)
-+{
-+ struct mpc_config_intsrc intsrc;
-+ int i = 0;
-+ int ioapic = -1;
-+
-+#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
-+ /*
-+ * Fabricate the legacy ISA bus (bus #31).
-+ */
-+ mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
-+#endif
-+ set_bit(MP_ISA_BUS, mp_bus_not_pci);
-+ Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
-+
-+ /*
-+ * Older generations of ES7000 have no legacy identity mappings
-+ */
-+ if (es7000_plat == 1)
-+ return;
-+
-+ /*
-+ * Locate the IOAPIC that manages the ISA IRQs (0-15).
-+ */
-+ ioapic = mp_find_ioapic(0);
-+ if (ioapic < 0)
-+ return;
-+
-+ intsrc.mpc_type = MP_INTSRC;
-+ intsrc.mpc_irqflag = 0; /* Conforming */
-+ intsrc.mpc_srcbus = MP_ISA_BUS;
-+#ifdef CONFIG_X86_IO_APIC
-+ intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
-+#endif
-+ /*
-+ * Use the default configuration for the IRQs 0-15. Unless
-+ * overridden by (MADT) interrupt source override entries.
-+ */
-+ for (i = 0; i < 16; i++) {
-+ int idx;
-+
-+ for (idx = 0; idx < mp_irq_entries; idx++) {
-+ struct mpc_config_intsrc *irq = mp_irqs + idx;
-+
-+ /* Do we already have a mapping for this ISA IRQ? */
-+ if (irq->mpc_srcbus == MP_ISA_BUS
-+ && irq->mpc_srcbusirq == i)
-+ break;
-+
-+ /* Do we already have a mapping for this IOAPIC pin */
-+ if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
-+ (irq->mpc_dstirq == i))
-+ break;
-+ }
-+
-+ if (idx != mp_irq_entries) {
-+ printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
-+ continue; /* IRQ already used */
-+ }
-+
-+ intsrc.mpc_irqtype = mp_INT;
-+ intsrc.mpc_srcbusirq = i; /* Identity mapped */
-+ intsrc.mpc_dstirq = i;
-+
-+ MP_intsrc_info(&intsrc);
-+ }
-+}
-+
-+int mp_register_gsi(u32 gsi, int triggering, int polarity)
-+{
-+ int ioapic;
-+ int ioapic_pin;
-+#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
-+#define MAX_GSI_NUM 4096
-+#define IRQ_COMPRESSION_START 64
-+
-+ static int pci_irq = IRQ_COMPRESSION_START;
-+ /*
-+ * Mapping between Global System Interrupts, which
-+ * represent all possible interrupts, and IRQs
-+ * assigned to actual devices.
-+ */
-+ static int gsi_to_irq[MAX_GSI_NUM];
-+#else
-+
-+ if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
-+ return gsi;
-+#endif
-+
-+ /* Don't set up the ACPI SCI because it's already set up */
-+ if (acpi_gbl_FADT.sci_interrupt == gsi)
-+ return gsi;
-+
-+ ioapic = mp_find_ioapic(gsi);
-+ if (ioapic < 0) {
-+ printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
-+ return gsi;
-+ }
-+
-+ ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
-+
-+#ifndef CONFIG_X86_32
-+ if (ioapic_renumber_irq)
-+ gsi = ioapic_renumber_irq(ioapic, gsi);
-+#endif
-+
-+ /*
-+ * Avoid pin reprogramming. PRTs typically include entries
-+ * with redundant pin->gsi mappings (but unique PCI devices);
-+ * we only program the IOAPIC on the first.
-+ */
-+ if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
-+ printk(KERN_ERR "Invalid reference to IOAPIC pin "
-+ "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
-+ ioapic_pin);
-+ return gsi;
-+ }
-+ if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) {
-+ Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
-+ mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
-+#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
-+ return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
-+#else
-+ return gsi;
-+#endif
-+ }
-+
-+ set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed);
-+#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
-+ /*
-+ * For GSI >= 64, use IRQ compression
-+ */
-+ if ((gsi >= IRQ_COMPRESSION_START)
-+ && (triggering == ACPI_LEVEL_SENSITIVE)) {
-+ /*
-+ * For PCI devices assign IRQs in order, avoiding gaps
-+ * due to unused I/O APIC pins.
-+ */
-+ int irq = gsi;
-+ if (gsi < MAX_GSI_NUM) {
-+ /*
-+ * Retain the VIA chipset work-around (gsi > 15), but
-+ * avoid a problem where the 8254 timer (IRQ0) is setup
-+ * via an override (so it's not on pin 0 of the ioapic),
-+ * and at the same time, the pin 0 interrupt is a PCI
-+ * type. The gsi > 15 test could cause these two pins
-+ * to be shared as IRQ0, and they are not shareable.
-+ * So test for this condition, and if necessary, avoid
-+ * the pin collision.
-+ */
-+ gsi = pci_irq++;
-+ /*
-+ * Don't assign IRQ used by ACPI SCI
-+ */
-+ if (gsi == acpi_gbl_FADT.sci_interrupt)
-+ gsi = pci_irq++;
-+ gsi_to_irq[irq] = gsi;
-+ } else {
-+ printk(KERN_ERR "GSI %u is too high\n", gsi);
-+ return gsi;
-+ }
-+ }
-+#endif
-+ io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
-+ triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
-+ polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
-+ return gsi;
-+}
-+
-+#endif /* CONFIG_X86_IO_APIC */
-+#endif /* CONFIG_ACPI */
---- a/arch/x86/kernel/pci-dma-xen.c
-+++ b/arch/x86/kernel/pci-dma-xen.c
-@@ -1,283 +1,251 @@
--/*
-- * Dynamic DMA mapping support.
-- *
-- * On i386 there is no hardware dynamic DMA address translation,
-- * so consistent alloc/free are merely page allocation/freeing.
-- * The rest of the dynamic DMA mapping interface is implemented
-- * in asm/pci.h.
-- */
--
--#include <linux/types.h>
--#include <linux/mm.h>
--#include <linux/string.h>
-+#include <linux/dma-mapping.h>
-+#include <linux/dmar.h>
-+#include <linux/bootmem.h>
- #include <linux/pci.h>
--#include <linux/module.h>
--#include <linux/version.h>
--#include <asm/io.h>
--#include <xen/balloon.h>
--#include <xen/gnttab.h>
--#include <asm/swiotlb.h>
--#include <asm/tlbflush.h>
--#include <asm/swiotlb_32.h>
--#include <asm/gnttab_dma.h>
--#include <asm/bug.h>
-
--#ifdef __x86_64__
--#include <asm/iommu.h>
-+#include <asm/proto.h>
-+#include <asm/dma.h>
-+#include <asm/gart.h>
-+#include <asm/calgary.h>
-+
-+int forbid_dac __read_mostly;
-+EXPORT_SYMBOL(forbid_dac);
-+
-+const struct dma_mapping_ops *dma_ops;
-+EXPORT_SYMBOL(dma_ops);
-+
-+static int iommu_sac_force __read_mostly;
-+
-+#ifdef CONFIG_IOMMU_DEBUG
-+int panic_on_overflow __read_mostly = 1;
-+int force_iommu __read_mostly = 1;
-+#else
-+int panic_on_overflow __read_mostly = 0;
-+int force_iommu __read_mostly = 0;
-+#endif
-
- int iommu_merge __read_mostly = 0;
--EXPORT_SYMBOL(iommu_merge);
-
--dma_addr_t bad_dma_address __read_mostly;
--EXPORT_SYMBOL(bad_dma_address);
-+int no_iommu __read_mostly;
-+/* Set this to 1 if there is a HW IOMMU in the system */
-+int iommu_detected __read_mostly = 0;
-
- /* This tells the BIO block layer to assume merging. Default to off
- because we cannot guarantee merging later. */
- int iommu_bio_merge __read_mostly = 0;
- EXPORT_SYMBOL(iommu_bio_merge);
-
--int force_iommu __read_mostly= 0;
-+dma_addr_t bad_dma_address __read_mostly = 0;
-+EXPORT_SYMBOL(bad_dma_address);
-
--__init int iommu_setup(char *p)
--{
-- return 1;
--}
-+/* Dummy device used for NULL arguments (normally ISA). Better would
-+ be probably a smaller DMA mask, but this is bug-to-bug compatible
-+ to older i386. */
-+struct device fallback_dev = {
-+ .bus_id = "fallback device",
-+ .coherent_dma_mask = DMA_32BIT_MASK,
-+ .dma_mask = &fallback_dev.coherent_dma_mask,
-+};
-
--void __init pci_iommu_alloc(void)
-+int dma_set_mask(struct device *dev, u64 mask)
- {
--#ifdef CONFIG_SWIOTLB
-- pci_swiotlb_init();
--#endif
--}
-+ if (!dev->dma_mask || !dma_supported(dev, mask))
-+ return -EIO;
-+
-+ *dev->dma_mask = mask;
-
--static int __init pci_iommu_init(void)
--{
-- no_iommu_init();
- return 0;
- }
-+EXPORT_SYMBOL(dma_set_mask);
-
--/* Must execute after PCI subsystem */
--fs_initcall(pci_iommu_init);
--#endif
--
--struct dma_coherent_mem {
-- void *virt_base;
-- u32 device_base;
-- int size;
-- int flags;
-- unsigned long *bitmap;
--};
--
--#define IOMMU_BUG_ON(test) \
--do { \
-- if (unlikely(test)) { \
-- printk(KERN_ALERT "Fatal DMA error! " \
-- "Please use 'swiotlb=force'\n"); \
-- BUG(); \
-- } \
--} while (0)
-+#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
-+static __initdata void *dma32_bootmem_ptr;
-+static unsigned long dma32_bootmem_size __initdata = (128ULL<<20);
-
--static int check_pages_physically_contiguous(unsigned long pfn,
-- unsigned int offset,
-- size_t length)
-+static int __init parse_dma32_size_opt(char *p)
- {
-- unsigned long next_mfn;
-- int i;
-- int nr_pages;
--
-- next_mfn = pfn_to_mfn(pfn);
-- nr_pages = (offset + length + PAGE_SIZE-1) >> PAGE_SHIFT;
--
-- for (i = 1; i < nr_pages; i++) {
-- if (pfn_to_mfn(++pfn) != ++next_mfn)
-- return 0;
-- }
-- return 1;
-+ if (!p)
-+ return -EINVAL;
-+ dma32_bootmem_size = memparse(p, &p);
-+ return 0;
- }
-+early_param("dma32_size", parse_dma32_size_opt);
-
--int range_straddles_page_boundary(paddr_t p, size_t size)
-+void __init dma32_reserve_bootmem(void)
- {
-- unsigned long pfn = p >> PAGE_SHIFT;
-- unsigned int offset = p & ~PAGE_MASK;
-+ unsigned long size, align;
-+ if (end_pfn <= MAX_DMA32_PFN)
-+ return;
-
-- return ((offset + size > PAGE_SIZE) &&
-- !check_pages_physically_contiguous(pfn, offset, size));
-+ align = 64ULL<<20;
-+ size = round_up(dma32_bootmem_size, align);
-+ dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align,
-+ __pa(MAX_DMA_ADDRESS));
-+ if (dma32_bootmem_ptr)
-+ dma32_bootmem_size = size;
-+ else
-+ dma32_bootmem_size = 0;
- }
--
--int
--dma_map_sg(struct device *hwdev, struct scatterlist *sgl, int nents,
-- enum dma_data_direction direction)
-+static void __init dma32_free_bootmem(void)
- {
-- int i, rc;
-+ int node;
-+
-+ if (end_pfn <= MAX_DMA32_PFN)
-+ return;
-
-- BUG_ON(!valid_dma_direction(direction));
-- WARN_ON(nents == 0 || sgl->length == 0);
-+ if (!dma32_bootmem_ptr)
-+ return;
-
-- if (swiotlb) {
-- rc = swiotlb_map_sg(hwdev, sgl, nents, direction);
-- } else {
-- struct scatterlist *sg;
--
-- for_each_sg(sgl, sg, nents, i) {
-- BUG_ON(!sg_page(sg));
-- sg->dma_address =
-- gnttab_dma_map_page(sg_page(sg)) + sg->offset;
-- sg->dma_length = sg->length;
-- IOMMU_BUG_ON(address_needs_mapping(
-- hwdev, sg->dma_address));
-- IOMMU_BUG_ON(range_straddles_page_boundary(
-- page_to_pseudophys(sg_page(sg)) + sg->offset,
-- sg->length));
-- }
-- rc = nents;
-- }
-+ for_each_online_node(node)
-+ free_bootmem_node(NODE_DATA(node), __pa(dma32_bootmem_ptr),
-+ dma32_bootmem_size);
-
-- flush_write_buffers();
-- return rc;
-+ dma32_bootmem_ptr = NULL;
-+ dma32_bootmem_size = 0;
- }
--EXPORT_SYMBOL(dma_map_sg);
-+#else
-+#define dma32_free_bootmem() ((void)0)
-+#endif
-
--void
--dma_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nents,
-- enum dma_data_direction direction)
--{
-- int i;
-+static const struct dma_mapping_ops swiotlb_dma_ops = {
-+ .mapping_error = swiotlb_dma_mapping_error,
-+ .map_single = swiotlb_map_single_phys,
-+ .unmap_single = swiotlb_unmap_single,
-+ .sync_single_for_cpu = swiotlb_sync_single_for_cpu,
-+ .sync_single_for_device = swiotlb_sync_single_for_device,
-+ .sync_single_range_for_cpu = swiotlb_sync_single_range_for_cpu,
-+ .sync_single_range_for_device = swiotlb_sync_single_range_for_device,
-+ .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu,
-+ .sync_sg_for_device = swiotlb_sync_sg_for_device,
-+ .map_sg = swiotlb_map_sg,
-+ .unmap_sg = swiotlb_unmap_sg,
-+ .dma_supported = swiotlb_dma_supported
-+};
-
-- BUG_ON(!valid_dma_direction(direction));
-- if (swiotlb)
-- swiotlb_unmap_sg(hwdev, sgl, nents, direction);
-- else {
-- struct scatterlist *sg;
-+void __init pci_iommu_alloc(void)
-+{
-+ /* free the range so iommu could get some range less than 4G */
-+ dma32_free_bootmem();
-+ /*
-+ * The order of these functions is important for
-+ * fall-back/fail-over reasons
-+ */
-+#ifdef CONFIG_GART_IOMMU
-+ gart_iommu_hole_init();
-+#endif
-
-- for_each_sg(sgl, sg, nents, i)
-- gnttab_dma_unmap_page(sg->dma_address);
-- }
--}
--EXPORT_SYMBOL(dma_unmap_sg);
-+#ifdef CONFIG_CALGARY_IOMMU
-+ detect_calgary();
-+#endif
-
--#ifdef CONFIG_HIGHMEM
--dma_addr_t
--dma_map_page(struct device *dev, struct page *page, unsigned long offset,
-- size_t size, enum dma_data_direction direction)
--{
-- dma_addr_t dma_addr;
-+ detect_intel_iommu();
-
-- BUG_ON(!valid_dma_direction(direction));
-+#ifdef CONFIG_SWIOTLB
-+ swiotlb_init();
- if (swiotlb) {
-- dma_addr = swiotlb_map_page(
-- dev, page, offset, size, direction);
-- } else {
-- dma_addr = gnttab_dma_map_page(page) + offset;
-- IOMMU_BUG_ON(address_needs_mapping(dev, dma_addr));
-+ printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n");
-+ dma_ops = &swiotlb_dma_ops;
- }
--
-- return dma_addr;
-+#endif
- }
--EXPORT_SYMBOL(dma_map_page);
-
--void
--dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size,
-- enum dma_data_direction direction)
-+/*
-+ * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter
-+ * documentation.
-+ */
-+static __init int iommu_setup(char *p)
- {
-- BUG_ON(!valid_dma_direction(direction));
-- if (swiotlb)
-- swiotlb_unmap_page(dev, dma_address, size, direction);
-- else
-- gnttab_dma_unmap_page(dma_address);
--}
--EXPORT_SYMBOL(dma_unmap_page);
--#endif /* CONFIG_HIGHMEM */
-+ iommu_merge = 1;
-
--int
--dma_mapping_error(dma_addr_t dma_addr)
--{
-- if (swiotlb)
-- return swiotlb_dma_mapping_error(dma_addr);
-- return 0;
--}
--EXPORT_SYMBOL(dma_mapping_error);
-+ if (!p)
-+ return -EINVAL;
-
--int
--dma_supported(struct device *dev, u64 mask)
--{
-- if (swiotlb)
-- return swiotlb_dma_supported(dev, mask);
-- /*
-- * By default we'll BUG when an infeasible DMA is requested, and
-- * request swiotlb=force (see IOMMU_BUG_ON).
-- */
-- return 1;
--}
--EXPORT_SYMBOL(dma_supported);
-+ while (*p) {
-+ if (!strncmp(p, "off", 3))
-+ no_iommu = 1;
-+ /* gart_parse_options has more force support */
-+ if (!strncmp(p, "force", 5))
-+ force_iommu = 1;
-+ if (!strncmp(p, "noforce", 7)) {
-+ iommu_merge = 0;
-+ force_iommu = 0;
-+ }
-
--void *dma_alloc_coherent(struct device *dev, size_t size,
-- dma_addr_t *dma_handle, gfp_t gfp)
--{
-- void *ret;
-- struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
-- unsigned int order = get_order(size);
-- unsigned long vstart;
-- u64 mask;
-+ if (!strncmp(p, "biomerge", 8)) {
-+ iommu_bio_merge = 4096;
-+ iommu_merge = 1;
-+ force_iommu = 1;
-+ }
-+ if (!strncmp(p, "panic", 5))
-+ panic_on_overflow = 1;
-+ if (!strncmp(p, "nopanic", 7))
-+ panic_on_overflow = 0;
-+ if (!strncmp(p, "merge", 5)) {
-+ iommu_merge = 1;
-+ force_iommu = 1;
-+ }
-+ if (!strncmp(p, "nomerge", 7))
-+ iommu_merge = 0;
-+ if (!strncmp(p, "forcesac", 8))
-+ iommu_sac_force = 1;
-+ if (!strncmp(p, "allowdac", 8))
-+ forbid_dac = 0;
-+ if (!strncmp(p, "nodac", 5))
-+ forbid_dac = -1;
-+ if (!strncmp(p, "usedac", 6)) {
-+ forbid_dac = -1;
-+ return 1;
-+ }
-+#ifdef CONFIG_SWIOTLB
-+ if (!strncmp(p, "soft", 4))
-+ swiotlb = 1;
-+#endif
-
-- /* ignore region specifiers */
-- gfp &= ~(__GFP_DMA | __GFP_HIGHMEM);
-+#ifdef CONFIG_GART_IOMMU
-+ gart_parse_options(p);
-+#endif
-
-- if (mem) {
-- int page = bitmap_find_free_region(mem->bitmap, mem->size,
-- order);
-- if (page >= 0) {
-- *dma_handle = mem->device_base + (page << PAGE_SHIFT);
-- ret = mem->virt_base + (page << PAGE_SHIFT);
-- memset(ret, 0, size);
-- return ret;
-- }
-- if (mem->flags & DMA_MEMORY_EXCLUSIVE)
-- return NULL;
-+#ifdef CONFIG_CALGARY_IOMMU
-+ if (!strncmp(p, "calgary", 7))
-+ use_calgary = 1;
-+#endif /* CONFIG_CALGARY_IOMMU */
-+
-+ p += strcspn(p, ",");
-+ if (*p == ',')
-+ ++p;
- }
-+ return 0;
-+}
-+early_param("iommu", iommu_setup);
-
-- if (dev == NULL || (dev->coherent_dma_mask < 0xffffffff))
-- gfp |= GFP_DMA;
--
-- vstart = __get_free_pages(gfp, order);
-- ret = (void *)vstart;
-+static int check_pages_physically_contiguous(unsigned long pfn,
-+ unsigned int offset,
-+ size_t length)
-+{
-+ unsigned long next_mfn;
-+ int i;
-+ int nr_pages;
-
-- if (dev != NULL && dev->coherent_dma_mask)
-- mask = dev->coherent_dma_mask;
-- else
-- mask = 0xffffffff;
-+ next_mfn = pfn_to_mfn(pfn);
-+ nr_pages = (offset + length + PAGE_SIZE-1) >> PAGE_SHIFT;
-
-- if (ret != NULL) {
-- if (xen_create_contiguous_region(vstart, order,
-- fls64(mask)) != 0) {
-- free_pages(vstart, order);
-- return NULL;
-- }
-- memset(ret, 0, size);
-- *dma_handle = virt_to_bus(ret);
-+ for (i = 1; i < nr_pages; i++) {
-+ if (pfn_to_mfn(++pfn) != ++next_mfn)
-+ return 0;
- }
-- return ret;
-+ return 1;
- }
--EXPORT_SYMBOL(dma_alloc_coherent);
-
--void dma_free_coherent(struct device *dev, size_t size,
-- void *vaddr, dma_addr_t dma_handle)
-+int range_straddles_page_boundary(paddr_t p, size_t size)
- {
-- struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
-- int order = get_order(size);
--
-- WARN_ON(irqs_disabled()); /* for portability */
-- if (mem && vaddr >= mem->virt_base && vaddr < (mem->virt_base + (mem->size << PAGE_SHIFT))) {
-- int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
-+ unsigned long pfn = p >> PAGE_SHIFT;
-+ unsigned int offset = p & ~PAGE_MASK;
-
-- bitmap_release_region(mem->bitmap, page, order);
-- } else {
-- xen_destroy_contiguous_region((unsigned long)vaddr, order);
-- free_pages((unsigned long)vaddr, order);
-- }
-+ return ((offset + size > PAGE_SIZE) &&
-+ !check_pages_physically_contiguous(pfn, offset, size));
- }
--EXPORT_SYMBOL(dma_free_coherent);
-
--#ifdef ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY
-+#ifdef CONFIG_X86_32
- int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
- dma_addr_t device_addr, size_t size, int flags)
- {
-@@ -327,8 +295,8 @@ EXPORT_SYMBOL(dma_declare_coherent_memor
- void dma_release_declared_memory(struct device *dev)
- {
- struct dma_coherent_mem *mem = dev->dma_mem;
--
-- if(!mem)
-+
-+ if (!mem)
- return;
- dev->dma_mem = NULL;
- iounmap(mem->virt_base);
-@@ -341,8 +309,10 @@ void *dma_mark_declared_memory_occupied(
- dma_addr_t device_addr, size_t size)
- {
- struct dma_coherent_mem *mem = dev->dma_mem;
-- int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1) >> PAGE_SHIFT;
- int pos, err;
-+ int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1);
-+
-+ pages >>= PAGE_SHIFT;
-
- if (!mem)
- return ERR_PTR(-EINVAL);
-@@ -354,103 +324,270 @@ void *dma_mark_declared_memory_occupied(
- return mem->virt_base + (pos << PAGE_SHIFT);
- }
- EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
--#endif /* ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY */
--
--#if defined(CONFIG_PCI) && !defined(CONFIG_XEN)
--/* Many VIA bridges seem to corrupt data for DAC. Disable it here */
-
--int forbid_dac;
--EXPORT_SYMBOL(forbid_dac);
--
--static __devinit void via_no_dac(struct pci_dev *dev)
-+static int dma_alloc_from_coherent_mem(struct device *dev, ssize_t size,
-+ dma_addr_t *dma_handle, void **ret)
- {
-- if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) {
-- printk(KERN_INFO "PCI: VIA PCI bridge detected. Disabling DAC.\n");
-- forbid_dac = 1;
-+ struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
-+ int order = get_order(size);
-+
-+ if (mem) {
-+ int page = bitmap_find_free_region(mem->bitmap, mem->size,
-+ order);
-+ if (page >= 0) {
-+ *dma_handle = mem->device_base + (page << PAGE_SHIFT);
-+ *ret = mem->virt_base + (page << PAGE_SHIFT);
-+ memset(*ret, 0, size);
-+ }
-+ if (mem->flags & DMA_MEMORY_EXCLUSIVE)
-+ *ret = NULL;
- }
-+ return (mem != NULL);
- }
--DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac);
-
--static int check_iommu(char *s)
-+static int dma_release_coherent(struct device *dev, int order, void *vaddr)
- {
-- if (!strcmp(s, "usedac")) {
-- forbid_dac = -1;
-+ struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
-+
-+ if (mem && vaddr >= mem->virt_base && vaddr <
-+ (mem->virt_base + (mem->size << PAGE_SHIFT))) {
-+ int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
-+
-+ bitmap_release_region(mem->bitmap, page, order);
- return 1;
- }
- return 0;
- }
--__setup("iommu=", check_iommu);
-+#else
-+#define dma_alloc_from_coherent_mem(dev, size, handle, ret) (0)
-+#define dma_release_coherent(dev, order, vaddr) (0)
-+#endif /* CONFIG_X86_32 */
-+
-+int dma_supported(struct device *dev, u64 mask)
-+{
-+#ifdef CONFIG_PCI
-+ if (mask > 0xffffffff && forbid_dac > 0) {
-+ printk(KERN_INFO "PCI: Disallowing DAC for device %s\n",
-+ dev->bus_id);
-+ return 0;
-+ }
- #endif
-
--dma_addr_t
--dma_map_single(struct device *dev, void *ptr, size_t size,
-- enum dma_data_direction direction)
-+ if (dma_ops->dma_supported)
-+ return dma_ops->dma_supported(dev, mask);
-+
-+ /* Copied from i386. Doesn't make much sense, because it will
-+ only work for pci_alloc_coherent.
-+ The caller just has to use GFP_DMA in this case. */
-+ if (mask < DMA_24BIT_MASK)
-+ return 0;
-+
-+ /* Tell the device to use SAC when IOMMU force is on. This
-+ allows the driver to use cheaper accesses in some cases.
-+
-+ Problem with this is that if we overflow the IOMMU area and
-+ return DAC as fallback address the device may not handle it
-+ correctly.
-+
-+ As a special case some controllers have a 39bit address
-+ mode that is as efficient as 32bit (aic79xx). Don't force
-+ SAC for these. Assume all masks <= 40 bits are of this
-+ type. Normally this doesn't make any difference, but gives
-+ more gentle handling of IOMMU overflow. */
-+ if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) {
-+ printk(KERN_INFO "%s: Force SAC with mask %Lx\n",
-+ dev->bus_id, mask);
-+ return 0;
-+ }
-+
-+ return 1;
-+}
-+EXPORT_SYMBOL(dma_supported);
-+
-+/* Allocate DMA memory on node near device */
-+static struct page *
-+dma_alloc_pages(struct device *dev, gfp_t gfp, unsigned order)
- {
-- dma_addr_t dma;
-+ int node;
-
-- BUG_ON(!valid_dma_direction(direction));
-- WARN_ON(size == 0);
-+ node = dev_to_node(dev);
-
-- if (swiotlb) {
-- dma = swiotlb_map_single(dev, ptr, size, direction);
-- } else {
-- dma = gnttab_dma_map_page(virt_to_page(ptr)) +
-- offset_in_page(ptr);
-- IOMMU_BUG_ON(range_straddles_page_boundary(__pa(ptr), size));
-- IOMMU_BUG_ON(address_needs_mapping(dev, dma));
-- }
--
-- flush_write_buffers();
-- return dma;
--}
--EXPORT_SYMBOL(dma_map_single);
--
--void
--dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
-- enum dma_data_direction direction)
--{
-- BUG_ON(!valid_dma_direction(direction));
-- if (swiotlb)
-- swiotlb_unmap_single(dev, dma_addr, size, direction);
-- else
-- gnttab_dma_unmap_page(dma_addr);
-+ return alloc_pages_node(node, gfp, order);
-+}
-+
-+/*
-+ * Allocate memory for a coherent mapping.
-+ */
-+void *
-+dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
-+ gfp_t gfp)
-+{
-+ void *memory = NULL;
-+ struct page *page;
-+ unsigned long dma_mask = 0;
-+ int noretry = 0;
-+ unsigned int order = get_order(size);
-+
-+ /* ignore region specifiers */
-+ gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
-+
-+ if (dma_alloc_from_coherent_mem(dev, size, dma_handle, &memory))
-+ return memory;
-+
-+ if (!dev) {
-+ dev = &fallback_dev;
-+ gfp |= GFP_DMA;
-+ }
-+ dma_mask = dev->coherent_dma_mask;
-+ if (dma_mask == 0)
-+ dma_mask = (gfp & GFP_DMA) ? DMA_24BIT_MASK : DMA_32BIT_MASK;
-+
-+ /* Device not DMA able */
-+ if (dev->dma_mask == NULL)
-+ return NULL;
-+
-+ /* Don't invoke OOM killer or retry in lower 16MB DMA zone */
-+ if (gfp & __GFP_DMA)
-+ noretry = 1;
-+
-+#ifdef CONFIG_XEN
-+ gfp &= ~(__GFP_DMA | __GFP_DMA32);
-+#else
-+#ifdef CONFIG_X86_64
-+ /* Why <=? Even when the mask is smaller than 4GB it is often
-+ larger than 16MB and in this case we have a chance of
-+ finding fitting memory in the next higher zone first. If
-+ not retry with true GFP_DMA. -AK */
-+ if (dma_mask <= DMA_32BIT_MASK && !(gfp & GFP_DMA)) {
-+ gfp |= GFP_DMA32;
-+#endif
-+
-+ again:
-+#endif
-+ page = dma_alloc_pages(dev,
-+ noretry ? gfp | __GFP_NORETRY : gfp, order);
-+ if (page == NULL)
-+ return NULL;
-+
-+#ifndef CONFIG_XEN
-+ {
-+ int high, mmu;
-+ dma_addr_t bus = page_to_phys(page);
-+ memory = page_address(page);
-+ high = (bus + size) >= dma_mask;
-+ mmu = high;
-+ if (force_iommu && !(gfp & GFP_DMA))
-+ mmu = 1;
-+ else if (high) {
-+ free_pages((unsigned long)memory, order);
-+
-+ /* Don't use the 16MB ZONE_DMA unless absolutely
-+ needed. It's better to use remapping first. */
-+ if (dma_mask < DMA_32BIT_MASK && !(gfp & GFP_DMA)) {
-+ gfp = (gfp & ~GFP_DMA32) | GFP_DMA;
-+ goto again;
-+ }
-+
-+ /* Let low level make its own zone decisions */
-+ gfp &= ~(GFP_DMA32|GFP_DMA);
-+
-+ if (dma_ops->alloc_coherent)
-+ return dma_ops->alloc_coherent(dev, size,
-+ dma_handle, gfp);
-+ return NULL;
-+ }
-+
-+ memset(memory, 0, size);
-+ if (!mmu) {
-+ *dma_handle = bus;
-+ return memory;
-+ }
-+ }
-+
-+ if (dma_ops->alloc_coherent) {
-+ free_pages((unsigned long)memory, order);
-+ gfp &= ~(GFP_DMA|GFP_DMA32);
-+ return dma_ops->alloc_coherent(dev, size, dma_handle, gfp);
-+ }
-+
-+ if (dma_ops->map_simple) {
-+ *dma_handle = dma_ops->map_simple(dev, virt_to_bus(memory),
-+ size,
-+ PCI_DMA_BIDIRECTIONAL);
-+ if (*dma_handle != bad_dma_address)
-+ return memory;
-+ }
-+#else
-+ memory = page_address(page);
-+ if (xen_create_contiguous_region((unsigned long)memory, order,
-+ fls64(dma_mask)) == 0) {
-+ memset(memory, 0, size);
-+ *dma_handle = virt_to_bus(memory);
-+ return memory;
-+ }
-+#endif
-+
-+ if (panic_on_overflow)
-+ panic("dma_alloc_coherent: IOMMU overflow by %lu bytes\n",
-+ (unsigned long)size);
-+ free_pages((unsigned long)memory, order);
-+ return NULL;
- }
--EXPORT_SYMBOL(dma_unmap_single);
-+EXPORT_SYMBOL(dma_alloc_coherent);
-
--void
--dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size,
-- enum dma_data_direction direction)
-+/*
-+ * Unmap coherent memory.
-+ * The caller must ensure that the device has finished accessing the mapping.
-+ */
-+void dma_free_coherent(struct device *dev, size_t size,
-+ void *vaddr, dma_addr_t bus)
- {
-- if (swiotlb)
-- swiotlb_sync_single_for_cpu(dev, dma_handle, size, direction);
-+ int order = get_order(size);
-+ WARN_ON(irqs_disabled()); /* for portability */
-+ if (dma_release_coherent(dev, order, vaddr))
-+ return;
-+#ifndef CONFIG_XEN
-+ if (dma_ops->unmap_single)
-+ dma_ops->unmap_single(dev, bus, size, 0);
-+#endif
-+ xen_destroy_contiguous_region((unsigned long)vaddr, order);
-+ free_pages((unsigned long)vaddr, order);
- }
--EXPORT_SYMBOL(dma_sync_single_for_cpu);
-+EXPORT_SYMBOL(dma_free_coherent);
-
--void
--dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size,
-- enum dma_data_direction direction)
-+static int __init pci_iommu_init(void)
- {
-- if (swiotlb)
-- swiotlb_sync_single_for_device(dev, dma_handle, size, direction);
-+#ifdef CONFIG_CALGARY_IOMMU
-+ calgary_iommu_init();
-+#endif
-+
-+ intel_iommu_init();
-+
-+#ifdef CONFIG_GART_IOMMU
-+ gart_iommu_init();
-+#endif
-+
-+ no_iommu_init();
-+ return 0;
- }
--EXPORT_SYMBOL(dma_sync_single_for_device);
-
--void
--dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems,
-- enum dma_data_direction direction)
-+void pci_iommu_shutdown(void)
- {
-- if (swiotlb)
-- swiotlb_sync_sg_for_cpu(dev,sg,nelems,direction);
-- flush_write_buffers();
-+ gart_iommu_shutdown();
- }
--EXPORT_SYMBOL(dma_sync_sg_for_cpu);
-+/* Must execute after PCI subsystem */
-+fs_initcall(pci_iommu_init);
-+
-+#ifdef CONFIG_PCI
-+/* Many VIA bridges seem to corrupt data for DAC. Disable it here */
-
--void
--dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems,
-- enum dma_data_direction direction)
-+static __devinit void via_no_dac(struct pci_dev *dev)
- {
-- if (swiotlb)
-- swiotlb_sync_sg_for_device(dev,sg,nelems,direction);
-- flush_write_buffers();
-+ if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) {
-+ printk(KERN_INFO "PCI: VIA PCI bridge detected."
-+ "Disabling DAC.\n");
-+ forbid_dac = 1;
-+ }
- }
--EXPORT_SYMBOL(dma_sync_sg_for_device);
-+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac);
-+#endif
---- /dev/null
-+++ b/arch/x86/kernel/pci-nommu-xen.c
-@@ -0,0 +1,103 @@
-+#include <linux/dma-mapping.h>
-+#include <linux/dmar.h>
-+#include <linux/bootmem.h>
-+#include <linux/pci.h>
-+
-+#include <xen/gnttab.h>
-+
-+#include <asm/proto.h>
-+#include <asm/dma.h>
-+#include <asm/swiotlb.h>
-+#include <asm/tlbflush.h>
-+#include <asm/gnttab_dma.h>
-+#include <asm/bug.h>
-+
-+#define IOMMU_BUG_ON(test) \
-+do { \
-+ if (unlikely(test)) { \
-+ printk(KERN_ALERT "Fatal DMA error! " \
-+ "Please use 'swiotlb=force'\n"); \
-+ BUG(); \
-+ } \
-+} while (0)
-+
-+static int
-+gnttab_map_sg(struct device *hwdev, struct scatterlist *sgl, int nents,
-+ int direction)
-+{
-+ unsigned int i;
-+ struct scatterlist *sg;
-+
-+ WARN_ON(nents == 0 || sgl->length == 0);
-+
-+ for_each_sg(sgl, sg, nents, i) {
-+ BUG_ON(!sg_page(sg));
-+ sg->dma_address =
-+ gnttab_dma_map_page(sg_page(sg)) + sg->offset;
-+ sg->dma_length = sg->length;
-+ IOMMU_BUG_ON(address_needs_mapping(
-+ hwdev, sg->dma_address));
-+ IOMMU_BUG_ON(range_straddles_page_boundary(
-+ page_to_pseudophys(sg_page(sg)) + sg->offset,
-+ sg->length));
-+ }
-+
-+ return nents;
-+}
-+
-+static void
-+gnttab_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nents,
-+ int direction)
-+{
-+ unsigned int i;
-+ struct scatterlist *sg;
-+
-+ for_each_sg(sgl, sg, nents, i)
-+ gnttab_dma_unmap_page(sg->dma_address);
-+}
-+
-+static dma_addr_t
-+gnttab_map_single(struct device *dev, phys_addr_t paddr, size_t size,
-+ int direction)
-+{
-+ dma_addr_t dma;
-+
-+ WARN_ON(size == 0);
-+
-+ dma = gnttab_dma_map_page(pfn_to_page(paddr >> PAGE_SHIFT)) +
-+ offset_in_page(paddr);
-+ IOMMU_BUG_ON(range_straddles_page_boundary(paddr, size));
-+ IOMMU_BUG_ON(address_needs_mapping(dev, dma));
-+
-+ return dma;
-+}
-+
-+static void
-+gnttab_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
-+ int direction)
-+{
-+ gnttab_dma_unmap_page(dma_addr);
-+}
-+
-+static int nommu_mapping_error(dma_addr_t dma_addr)
-+{
-+ return (dma_addr == bad_dma_address);
-+}
-+
-+static const struct dma_mapping_ops nommu_dma_ops = {
-+ .map_single = gnttab_map_single,
-+ .unmap_single = gnttab_unmap_single,
-+ .map_sg = gnttab_map_sg,
-+ .unmap_sg = gnttab_unmap_sg,
-+ .dma_supported = swiotlb_dma_supported,
-+ .mapping_error = nommu_mapping_error
-+};
-+
-+void __init no_iommu_init(void)
-+{
-+ if (dma_ops)
-+ return;
-+
-+ force_iommu = 0; /* no HW IOMMU */
-+ dma_ops = &nommu_dma_ops;
-+}
---- a/arch/x86/kernel/process_32-xen.c
-+++ b/arch/x86/kernel/process_32-xen.c
-@@ -36,6 +36,7 @@
- #include <linux/personality.h>
- #include <linux/tick.h>
- #include <linux/percpu.h>
-+#include <linux/prctl.h>
-
- #include <asm/uaccess.h>
- #include <asm/pgtable.h>
-@@ -45,7 +46,6 @@
- #include <asm/processor.h>
- #include <asm/i387.h>
- #include <asm/desc.h>
--#include <asm/vm86.h>
- #ifdef CONFIG_MATH_EMULATION
- #include <asm/math_emu.h>
- #endif
-@@ -102,16 +102,6 @@ void enable_hlt(void)
-
- EXPORT_SYMBOL(enable_hlt);
-
--/*
-- * On SMP it's slightly faster (but much more power-consuming!)
-- * to poll the ->work.need_resched flag instead of waiting for the
-- * cross-CPU IPI to arrive. Use this option with caution.
-- */
--static void poll_idle(void)
--{
-- cpu_relax();
--}
--
- static void xen_idle(void)
- {
- current_thread_info()->status &= ~TS_POLLING;
-@@ -121,20 +111,10 @@ static void xen_idle(void)
- */
- smp_mb();
-
-- local_irq_disable();
-- if (!need_resched()) {
-- ktime_t t0, t1;
-- u64 t0n, t1n;
--
-- t0 = ktime_get();
-- t0n = ktime_to_ns(t0);
-+ if (!need_resched())
- safe_halt(); /* enables interrupts racelessly */
-- local_irq_disable();
-- t1 = ktime_get();
-- t1n = ktime_to_ns(t1);
-- sched_clock_idle_wakeup_event(t1n - t0n);
-- }
-- local_irq_enable();
-+ else
-+ local_irq_enable();
- current_thread_info()->status |= TS_POLLING;
- }
- #ifdef CONFIG_APM_MODULE
-@@ -142,7 +122,6 @@ EXPORT_SYMBOL(default_idle);
- #endif
-
- #ifdef CONFIG_HOTPLUG_CPU
--extern cpumask_t cpu_initialized;
- static inline void play_dead(void)
- {
- idle_task_exit();
-@@ -187,6 +166,7 @@ void cpu_idle(void)
- if (cpu_is_offline(cpu))
- play_dead();
-
-+ local_irq_disable();
- __get_cpu_var(irq_stat).idle_timestamp = jiffies;
- idle();
- }
-@@ -197,44 +177,6 @@ void cpu_idle(void)
- }
- }
-
--static void do_nothing(void *unused)
--{
--}
--
--/*
-- * cpu_idle_wait - Used to ensure that all the CPUs discard old value of
-- * pm_idle and update to new pm_idle value. Required while changing pm_idle
-- * handler on SMP systems.
-- *
-- * Caller must have changed pm_idle to the new value before the call. Old
-- * pm_idle value will not be used by any CPU after the return of this function.
-- */
--void cpu_idle_wait(void)
--{
-- smp_mb();
-- /* kick all the CPUs so that they exit out of pm_idle */
-- smp_call_function(do_nothing, NULL, 0, 1);
--}
--EXPORT_SYMBOL_GPL(cpu_idle_wait);
--
--void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
--{
--}
--
--static int __init idle_setup(char *str)
--{
-- if (!strcmp(str, "poll")) {
-- printk("using polling idle threads.\n");
-- pm_idle = poll_idle;
-- }
-- else
-- return -1;
--
-- boot_option_idle_override = 1;
-- return 0;
--}
--early_param("idle", idle_setup);
--
- void __show_registers(struct pt_regs *regs, int all)
- {
- unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
-@@ -260,7 +202,7 @@ void __show_registers(struct pt_regs *re
- init_utsname()->version);
-
- printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n",
-- 0xffff & regs->cs, regs->ip, regs->flags,
-+ (u16)regs->cs, regs->ip, regs->flags,
- smp_processor_id());
- print_symbol("EIP is at %s\n", regs->ip);
-
-@@ -269,8 +211,7 @@ void __show_registers(struct pt_regs *re
- printk("ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n",
- regs->si, regs->di, regs->bp, sp);
- printk(" DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n",
-- regs->ds & 0xffff, regs->es & 0xffff,
-- regs->fs & 0xffff, gs, ss);
-+ (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss);
-
- if (!all)
- return;
-@@ -367,6 +308,7 @@ void flush_thread(void)
- /*
- * Forget coprocessor state..
- */
-+ tsk->fpu_counter = 0;
- clear_fpu(tsk);
- clear_used_math();
- }
-@@ -437,11 +379,30 @@ int copy_thread(int nr, unsigned long cl
- return err;
- }
-
--#ifdef CONFIG_SECCOMP
-+void
-+start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
-+{
-+ __asm__("movl %0, %%gs" :: "r"(0));
-+ regs->fs = 0;
-+ set_fs(USER_DS);
-+ regs->ds = __USER_DS;
-+ regs->es = __USER_DS;
-+ regs->ss = __USER_DS;
-+ regs->cs = __USER_CS;
-+ regs->ip = new_ip;
-+ regs->sp = new_sp;
-+ /*
-+ * Free the old FP and other extended state
-+ */
-+ free_thread_xstate(current);
-+}
-+EXPORT_SYMBOL_GPL(start_thread);
-+
- static void hard_disable_TSC(void)
- {
- write_cr4(read_cr4() | X86_CR4_TSD);
- }
-+
- void disable_TSC(void)
- {
- preempt_disable();
-@@ -453,11 +414,47 @@ void disable_TSC(void)
- hard_disable_TSC();
- preempt_enable();
- }
-+
- static void hard_enable_TSC(void)
- {
- write_cr4(read_cr4() & ~X86_CR4_TSD);
- }
--#endif /* CONFIG_SECCOMP */
-+
-+static void enable_TSC(void)
-+{
-+ preempt_disable();
-+ if (test_and_clear_thread_flag(TIF_NOTSC))
-+ /*
-+ * Must flip the CPU state synchronously with
-+ * TIF_NOTSC in the current running context.
-+ */
-+ hard_enable_TSC();
-+ preempt_enable();
-+}
-+
-+int get_tsc_mode(unsigned long adr)
-+{
-+ unsigned int val;
-+
-+ if (test_thread_flag(TIF_NOTSC))
-+ val = PR_TSC_SIGSEGV;
-+ else
-+ val = PR_TSC_ENABLE;
-+
-+ return put_user(val, (unsigned int __user *)adr);
-+}
-+
-+int set_tsc_mode(unsigned int val)
-+{
-+ if (val == PR_TSC_SIGSEGV)
-+ disable_TSC();
-+ else if (val == PR_TSC_ENABLE)
-+ enable_TSC();
-+ else
-+ return -EINVAL;
-+
-+ return 0;
-+}
-
- static noinline void
- __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p)
-@@ -473,12 +470,12 @@ __switch_to_xtra(struct task_struct *pre
- /* we clear debugctl to make sure DS
- * is not in use when we change it */
- debugctl = 0;
-- wrmsrl(MSR_IA32_DEBUGCTLMSR, 0);
-+ update_debugctlmsr(0);
- wrmsr(MSR_IA32_DS_AREA, next->ds_area_msr, 0);
- }
-
- if (next->debugctlmsr != debugctl)
-- wrmsr(MSR_IA32_DEBUGCTLMSR, next->debugctlmsr, 0);
-+ update_debugctlmsr(next->debugctlmsr);
-
- if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
- set_debugreg(next->debugreg0, 0);
-@@ -490,7 +487,6 @@ __switch_to_xtra(struct task_struct *pre
- set_debugreg(next->debugreg7, 7);
- }
-
--#ifdef CONFIG_SECCOMP
- if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
- test_tsk_thread_flag(next_p, TIF_NOTSC)) {
- /* prev and next are different */
-@@ -499,7 +495,6 @@ __switch_to_xtra(struct task_struct *pre
- else
- hard_enable_TSC();
- }
--#endif
-
- #ifdef X86_BTS
- if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
-@@ -637,7 +632,7 @@ struct task_struct * __switch_to(struct
-
- /* we're going to use this soon, after a few expensive things */
- if (next_p->fpu_counter > 5)
-- prefetch(&next->i387.fxsave);
-+ prefetch(next->xstate);
-
- /*
- * Now maybe handle debug registers
-@@ -658,8 +653,11 @@ struct task_struct * __switch_to(struct
- /* If the task has used fpu the last 5 timeslices, just do a full
- * restore of the math state immediately to avoid the trap; the
- * chances of needing FPU soon are obviously high now
-+ *
-+ * tsk_used_math() checks prevent calling math_state_restore(),
-+ * which can sleep in the case of !tsk_used_math()
- */
-- if (next_p->fpu_counter > 5)
-+ if (tsk_used_math(next_p) && next_p->fpu_counter > 5)
- math_state_restore();
-
- /*
---- a/arch/x86/kernel/process_64-xen.c
-+++ b/arch/x86/kernel/process_64-xen.c
-@@ -39,6 +39,7 @@
- #include <linux/kprobes.h>
- #include <linux/kdebug.h>
- #include <linux/tick.h>
-+#include <linux/prctl.h>
-
- #include <asm/uaccess.h>
- #include <asm/pgtable.h>
-@@ -102,17 +103,6 @@ void exit_idle(void)
- __exit_idle();
- }
-
--/*
-- * On SMP it's slightly faster (but much more power-consuming!)
-- * to poll the ->need_resched flag instead of waiting for the
-- * cross-CPU IPI to arrive. Use this option with caution.
-- */
--static void poll_idle(void)
--{
-- local_irq_enable();
-- cpu_relax();
--}
--
- static void xen_idle(void)
- {
- current_thread_info()->status &= ~TS_POLLING;
-@@ -121,20 +111,10 @@ static void xen_idle(void)
- * test NEED_RESCHED:
- */
- smp_mb();
-- local_irq_disable();
-- if (!need_resched()) {
-- ktime_t t0, t1;
-- u64 t0n, t1n;
--
-- t0 = ktime_get();
-- t0n = ktime_to_ns(t0);
-+ if (!need_resched())
- safe_halt(); /* enables interrupts racelessly */
-- local_irq_disable();
-- t1 = ktime_get();
-- t1n = ktime_to_ns(t1);
-- sched_clock_idle_wakeup_event(t1n - t0n);
-- }
-- local_irq_enable();
-+ else
-+ local_irq_enable();
- current_thread_info()->status |= TS_POLLING;
- }
-
-@@ -195,45 +175,6 @@ void cpu_idle(void)
- }
- }
-
--static void do_nothing(void *unused)
--{
--}
--
--/*
-- * cpu_idle_wait - Used to ensure that all the CPUs discard old value of
-- * pm_idle and update to new pm_idle value. Required while changing pm_idle
-- * handler on SMP systems.
-- *
-- * Caller must have changed pm_idle to the new value before the call. Old
-- * pm_idle value will not be used by any CPU after the return of this function.
-- */
--void cpu_idle_wait(void)
--{
-- smp_mb();
-- /* kick all the CPUs so that they exit out of pm_idle */
-- smp_call_function(do_nothing, NULL, 0, 1);
--}
--EXPORT_SYMBOL_GPL(cpu_idle_wait);
--
--void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
--{
--}
--
--static int __init idle_setup(char *str)
--{
-- if (!strcmp(str, "poll")) {
-- printk("using polling idle threads.\n");
-- pm_idle = poll_idle;
-- } else if (!strcmp(str, "mwait"))
-- force_mwait = 1;
-- else
-- return -1;
--
-- boot_option_idle_override = 1;
-- return 0;
--}
--early_param("idle", idle_setup);
--
- /* Prints also some state that isn't saved in the pt_regs */
- void __show_regs(struct pt_regs * regs)
- {
-@@ -360,6 +301,7 @@ void flush_thread(void)
- /*
- * Forget coprocessor state..
- */
-+ tsk->fpu_counter = 0;
- clear_fpu(tsk);
- clear_used_math();
- }
-@@ -472,6 +414,83 @@ out:
- return err;
- }
-
-+void
-+start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
-+{
-+ asm volatile("movl %0, %%fs; movl %0, %%es; movl %0, %%ds" :: "r"(0));
-+ load_gs_index(0);
-+ regs->ip = new_ip;
-+ regs->sp = new_sp;
-+ write_pda(oldrsp, new_sp);
-+ regs->cs = __USER_CS;
-+ regs->ss = __USER_DS;
-+ regs->flags = 0x200;
-+ set_fs(USER_DS);
-+ /*
-+ * Free the old FP and other extended state
-+ */
-+ free_thread_xstate(current);
-+}
-+EXPORT_SYMBOL_GPL(start_thread);
-+
-+static void hard_disable_TSC(void)
-+{
-+ write_cr4(read_cr4() | X86_CR4_TSD);
-+}
-+
-+void disable_TSC(void)
-+{
-+ preempt_disable();
-+ if (!test_and_set_thread_flag(TIF_NOTSC))
-+ /*
-+ * Must flip the CPU state synchronously with
-+ * TIF_NOTSC in the current running context.
-+ */
-+ hard_disable_TSC();
-+ preempt_enable();
-+}
-+
-+static void hard_enable_TSC(void)
-+{
-+ write_cr4(read_cr4() & ~X86_CR4_TSD);
-+}
-+
-+static void enable_TSC(void)
-+{
-+ preempt_disable();
-+ if (test_and_clear_thread_flag(TIF_NOTSC))
-+ /*
-+ * Must flip the CPU state synchronously with
-+ * TIF_NOTSC in the current running context.
-+ */
-+ hard_enable_TSC();
-+ preempt_enable();
-+}
-+
-+int get_tsc_mode(unsigned long adr)
-+{
-+ unsigned int val;
-+
-+ if (test_thread_flag(TIF_NOTSC))
-+ val = PR_TSC_SIGSEGV;
-+ else
-+ val = PR_TSC_ENABLE;
-+
-+ return put_user(val, (unsigned int __user *)adr);
-+}
-+
-+int set_tsc_mode(unsigned int val)
-+{
-+ if (val == PR_TSC_SIGSEGV)
-+ disable_TSC();
-+ else if (val == PR_TSC_ENABLE)
-+ enable_TSC();
-+ else
-+ return -EINVAL;
-+
-+ return 0;
-+}
-+
- /*
- * This special macro can be used to load a debugging register
- */
-@@ -491,12 +510,12 @@ static inline void __switch_to_xtra(stru
- /* we clear debugctl to make sure DS
- * is not in use when we change it */
- debugctl = 0;
-- wrmsrl(MSR_IA32_DEBUGCTLMSR, 0);
-+ update_debugctlmsr(0);
- wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr);
- }
-
- if (next->debugctlmsr != debugctl)
-- wrmsrl(MSR_IA32_DEBUGCTLMSR, next->debugctlmsr);
-+ update_debugctlmsr(next->debugctlmsr);
-
- if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
- loaddebug(next, 0);
-@@ -508,6 +527,15 @@ static inline void __switch_to_xtra(stru
- loaddebug(next, 7);
- }
-
-+ if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
-+ test_tsk_thread_flag(next_p, TIF_NOTSC)) {
-+ /* prev and next are different */
-+ if (test_tsk_thread_flag(next_p, TIF_NOTSC))
-+ hard_disable_TSC();
-+ else
-+ hard_enable_TSC();
-+ }
-+
- #ifdef X86_BTS
- if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
- ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
-@@ -547,7 +575,7 @@ __switch_to(struct task_struct *prev_p,
-
- /* we're going to use this soon, after a few expensive things */
- if (next_p->fpu_counter>5)
-- prefetch(&next->i387.fxsave);
-+ prefetch(next->xstate);
-
- /*
- * This is basically '__unlazy_fpu', except that we queue a
-@@ -680,8 +708,11 @@ __switch_to(struct task_struct *prev_p,
- /* If the task has used fpu the last 5 timeslices, just do a full
- * restore of the math state immediately to avoid the trap; the
- * chances of needing FPU soon are obviously high now
-+ *
-+ * tsk_used_math() checks prevent calling math_state_restore(),
-+ * which can sleep in the case of !tsk_used_math()
- */
-- if (next_p->fpu_counter>5)
-+ if (tsk_used_math(next_p) && next_p->fpu_counter > 5)
- math_state_restore();
- return prev_p;
- }
---- /dev/null
-+++ b/arch/x86/kernel/process-xen.c
-@@ -0,0 +1,188 @@
-+#include <linux/errno.h>
-+#include <linux/kernel.h>
-+#include <linux/mm.h>
-+#include <linux/smp.h>
-+#include <linux/slab.h>
-+#include <linux/sched.h>
-+#include <linux/module.h>
-+#include <linux/pm.h>
-+
-+struct kmem_cache *task_xstate_cachep;
-+
-+int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
-+{
-+ *dst = *src;
-+ if (src->thread.xstate) {
-+ dst->thread.xstate = kmem_cache_alloc(task_xstate_cachep,
-+ GFP_KERNEL);
-+ if (!dst->thread.xstate)
-+ return -ENOMEM;
-+ WARN_ON((unsigned long)dst->thread.xstate & 15);
-+ memcpy(dst->thread.xstate, src->thread.xstate, xstate_size);
-+ }
-+ return 0;
-+}
-+
-+void free_thread_xstate(struct task_struct *tsk)
-+{
-+ if (tsk->thread.xstate) {
-+ kmem_cache_free(task_xstate_cachep, tsk->thread.xstate);
-+ tsk->thread.xstate = NULL;
-+ }
-+}
-+
-+void free_thread_info(struct thread_info *ti)
-+{
-+ free_thread_xstate(ti->task);
-+ free_pages((unsigned long)ti, get_order(THREAD_SIZE));
-+}
-+
-+void arch_task_cache_init(void)
-+{
-+ task_xstate_cachep =
-+ kmem_cache_create("task_xstate", xstate_size,
-+ __alignof__(union thread_xstate),
-+ SLAB_PANIC, NULL);
-+}
-+
-+static void do_nothing(void *unused)
-+{
-+}
-+
-+/*
-+ * cpu_idle_wait - Used to ensure that all the CPUs discard old value of
-+ * pm_idle and update to new pm_idle value. Required while changing pm_idle
-+ * handler on SMP systems.
-+ *
-+ * Caller must have changed pm_idle to the new value before the call. Old
-+ * pm_idle value will not be used by any CPU after the return of this function.
-+ */
-+void cpu_idle_wait(void)
-+{
-+ smp_mb();
-+ /* kick all the CPUs so that they exit out of pm_idle */
-+ smp_call_function(do_nothing, NULL, 0, 1);
-+}
-+EXPORT_SYMBOL_GPL(cpu_idle_wait);
-+
-+#ifndef CONFIG_XEN
-+/*
-+ * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
-+ * which can obviate IPI to trigger checking of need_resched.
-+ * We execute MONITOR against need_resched and enter optimized wait state
-+ * through MWAIT. Whenever someone changes need_resched, we would be woken
-+ * up from MWAIT (without an IPI).
-+ *
-+ * New with Core Duo processors, MWAIT can take some hints based on CPU
-+ * capability.
-+ */
-+void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
-+{
-+ if (!need_resched()) {
-+ __monitor((void *)&current_thread_info()->flags, 0, 0);
-+ smp_mb();
-+ if (!need_resched())
-+ __mwait(ax, cx);
-+ }
-+}
-+
-+/* Default MONITOR/MWAIT with no hints, used for default C1 state */
-+static void mwait_idle(void)
-+{
-+ if (!need_resched()) {
-+ __monitor((void *)&current_thread_info()->flags, 0, 0);
-+ smp_mb();
-+ if (!need_resched())
-+ __sti_mwait(0, 0);
-+ else
-+ local_irq_enable();
-+ } else
-+ local_irq_enable();
-+}
-+#endif
-+
-+/*
-+ * On SMP it's slightly faster (but much more power-consuming!)
-+ * to poll the ->work.need_resched flag instead of waiting for the
-+ * cross-CPU IPI to arrive. Use this option with caution.
-+ */
-+static void poll_idle(void)
-+{
-+ local_irq_enable();
-+ cpu_relax();
-+}
-+
-+#ifndef CONFIG_XEN
-+/*
-+ * mwait selection logic:
-+ *
-+ * It depends on the CPU. For AMD CPUs that support MWAIT this is
-+ * wrong. Family 0x10 and 0x11 CPUs will enter C1 on HLT. Powersavings
-+ * then depend on a clock divisor and current Pstate of the core. If
-+ * all cores of a processor are in halt state (C1) the processor can
-+ * enter the C1E (C1 enhanced) state. If mwait is used this will never
-+ * happen.
-+ *
-+ * idle=mwait overrides this decision and forces the usage of mwait.
-+ */
-+static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
-+{
-+ if (force_mwait)
-+ return 1;
-+
-+ if (c->x86_vendor == X86_VENDOR_AMD) {
-+ switch(c->x86) {
-+ case 0x10:
-+ case 0x11:
-+ return 0;
-+ }
-+ }
-+ return 1;
-+}
-+#endif
-+
-+void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
-+{
-+#ifndef CONFIG_XEN
-+ static int selected;
-+
-+ if (selected)
-+ return;
-+#ifdef CONFIG_X86_SMP
-+ if (pm_idle == poll_idle && smp_num_siblings > 1) {
-+ printk(KERN_WARNING "WARNING: polling idle and HT enabled,"
-+ " performance may degrade.\n");
-+ }
-+#endif
-+ if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) {
-+ /*
-+ * Skip, if setup has overridden idle.
-+ * One CPU supports mwait => All CPUs supports mwait
-+ */
-+ if (!pm_idle) {
-+ printk(KERN_INFO "using mwait in idle threads.\n");
-+ pm_idle = mwait_idle;
-+ }
-+ }
-+ selected = 1;
-+#endif
-+}
-+
-+static int __init idle_setup(char *str)
-+{
-+ if (!strcmp(str, "poll")) {
-+ printk("using polling idle threads.\n");
-+ pm_idle = poll_idle;
-+ }
-+#ifndef CONFIG_XEN
-+ else if (!strcmp(str, "mwait"))
-+ force_mwait = 1;
-+#endif
-+ else
-+ return -1;
-+
-+ boot_option_idle_override = 1;
-+ return 0;
-+}
-+early_param("idle", idle_setup);
-+
---- a/arch/x86/kernel/setup_32-xen.c
-+++ b/arch/x86/kernel/setup_32-xen.c
-@@ -39,6 +39,7 @@
- #include <linux/efi.h>
- #include <linux/init.h>
- #include <linux/edd.h>
-+#include <linux/iscsi_ibft.h>
- #include <linux/nodemask.h>
- #include <linux/kernel.h>
- #include <linux/percpu.h>
-@@ -49,6 +50,7 @@
- #include <linux/pfn.h>
- #include <linux/pci.h>
- #include <linux/init_ohci1394_dma.h>
-+#include <linux/kvm_para.h>
-
- #include <video/edid.h>
-
-@@ -70,8 +72,9 @@
- #include <xen/firmware.h>
- #include <xen/xencons.h>
- #include <setup_arch.h>
--#include <bios_ebda.h>
-+#include <asm/bios_ebda.h>
- #include <asm/cacheflush.h>
-+#include <asm/processor.h>
-
- #ifdef CONFIG_XEN
- #include <xen/interface/kexec.h>
-@@ -136,7 +139,12 @@ static struct resource standard_io_resou
- }, {
- .name = "keyboard",
- .start = 0x0060,
-- .end = 0x006f,
-+ .end = 0x0060,
-+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
-+}, {
-+ .name = "keyboard",
-+ .start = 0x0064,
-+ .end = 0x0064,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO
- }, {
- .name = "dma page reg",
-@@ -166,6 +174,8 @@ struct cpuinfo_x86 new_cpu_data __cpuini
- struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
- EXPORT_SYMBOL(boot_cpu_data);
-
-+unsigned int def_to_bigsmp;
-+
- #ifndef CONFIG_X86_PAE
- unsigned long mmu_cr4_features;
- #else
-@@ -204,7 +214,7 @@ EXPORT_SYMBOL(ist_info);
- extern void early_cpu_init(void);
- extern int root_mountflags;
-
--unsigned long saved_videomode;
-+unsigned long saved_video_mode;
-
- #define RAMDISK_IMAGE_START_MASK 0x07FF
- #define RAMDISK_PROMPT_FLAG 0x8000
-@@ -259,7 +269,7 @@ static inline void copy_edd(void)
- }
- #endif
-
--int __initdata user_defined_memmap = 0;
-+int __initdata user_defined_memmap;
-
- /*
- * "mem=nopentium" disables the 4MB page tables.
-@@ -420,20 +430,59 @@ unsigned long __init find_max_low_pfn(vo
- }
-
- #ifndef CONFIG_XEN
-+#define BIOS_LOWMEM_KILOBYTES 0x413
-+
- /*
-- * workaround for Dell systems that neglect to reserve EBDA
-+ * The BIOS places the EBDA/XBDA at the top of conventional
-+ * memory, and usually decreases the reported amount of
-+ * conventional memory (int 0x12) too. This also contains a
-+ * workaround for Dell systems that neglect to reserve EBDA.
-+ * The same workaround also avoids a problem with the AMD768MPX
-+ * chipset: reserve a page before VGA to prevent PCI prefetch
-+ * into it (errata #56). Usually the page is reserved anyways,
-+ * unless you have no PS/2 mouse plugged in.
- */
- static void __init reserve_ebda_region(void)
- {
-- unsigned int addr;
-- addr = get_bios_ebda();
-- if (addr)
-- reserve_bootmem(addr, PAGE_SIZE, BOOTMEM_DEFAULT);
-+ unsigned int lowmem, ebda_addr;
-+
-+ /* To determine the position of the EBDA and the */
-+ /* end of conventional memory, we need to look at */
-+ /* the BIOS data area. In a paravirtual environment */
-+ /* that area is absent. We'll just have to assume */
-+ /* that the paravirt case can handle memory setup */
-+ /* correctly, without our help. */
-+ if (paravirt_enabled())
-+ return;
-+
-+ /* end of low (conventional) memory */
-+ lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
-+ lowmem <<= 10;
-+
-+ /* start of EBDA area */
-+ ebda_addr = get_bios_ebda();
-+
-+ /* Fixup: bios puts an EBDA in the top 64K segment */
-+ /* of conventional memory, but does not adjust lowmem. */
-+ if ((lowmem - ebda_addr) <= 0x10000)
-+ lowmem = ebda_addr;
-+
-+ /* Fixup: bios does not report an EBDA at all. */
-+ /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
-+ if ((ebda_addr == 0) && (lowmem >= 0x9f000))
-+ lowmem = 0x9f000;
-+
-+ /* Paranoia: should never happen, but... */
-+ if ((lowmem == 0) || (lowmem >= 0x100000))
-+ lowmem = 0x9f000;
-+
-+ /* reserve all memory between lowmem and the 1MB mark */
-+ reserve_bootmem(lowmem, 0x100000 - lowmem, BOOTMEM_DEFAULT);
- }
- #endif
-
- #ifndef CONFIG_NEED_MULTIPLE_NODES
--void __init setup_bootmem_allocator(void);
-+static void __init setup_bootmem_allocator(void);
- static unsigned long __init setup_memory(void)
- {
- /*
-@@ -469,7 +518,7 @@ static unsigned long __init setup_memory
- return max_low_pfn;
- }
-
--void __init zone_sizes_init(void)
-+static void __init zone_sizes_init(void)
- {
- unsigned long max_zone_pfns[MAX_NR_ZONES];
- memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
-@@ -521,10 +570,16 @@ static void __init reserve_crashkernel(v
- (unsigned long)(crash_size >> 20),
- (unsigned long)(crash_base >> 20),
- (unsigned long)(total_mem >> 20));
-+
-+ if (reserve_bootmem(crash_base, crash_size,
-+ BOOTMEM_EXCLUSIVE) < 0) {
-+ printk(KERN_INFO "crashkernel reservation "
-+ "failed - memory is in use\n");
-+ return;
-+ }
-+
- crashk_res.start = crash_base;
- crashk_res.end = crash_base + crash_size - 1;
-- reserve_bootmem(crash_base, crash_size,
-- BOOTMEM_DEFAULT);
- } else
- printk(KERN_INFO "crashkernel reservation failed - "
- "you have to specify a base address\n");
-@@ -658,16 +713,9 @@ void __init setup_bootmem_allocator(void
- */
- reserve_bootmem(0, PAGE_SIZE, BOOTMEM_DEFAULT);
-
-- /* reserve EBDA region, it's a 4K region */
-+ /* reserve EBDA region */
- reserve_ebda_region();
-
-- /* could be an AMD 768MPX chipset. Reserve a page before VGA to prevent
-- PCI prefetch into it (errata #56). Usually the page is reserved anyways,
-- unless you have no PS/2 mouse plugged in. */
-- if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
-- boot_cpu_data.x86 == 6)
-- reserve_bootmem(0xa0000 - 4096, 4096, BOOTMEM_DEFAULT);
--
- #ifdef CONFIG_SMP
- /*
- * But first pinch a few for the stack/trampoline stuff
-@@ -689,6 +737,8 @@ void __init setup_bootmem_allocator(void
- #endif
- numa_kva_reserve();
- reserve_crashkernel();
-+
-+ reserve_ibft_region();
- }
-
- /*
-@@ -724,6 +774,18 @@ char * __init __attribute__((weak)) memo
- return machine_specific_memory_setup();
- }
-
-+#ifdef CONFIG_NUMA
-+/*
-+ * In the golden day, when everything among i386 and x86_64 will be
-+ * integrated, this will not live here
-+ */
-+void *x86_cpu_to_node_map_early_ptr;
-+int x86_cpu_to_node_map_init[NR_CPUS] = {
-+ [0 ... NR_CPUS-1] = NUMA_NO_NODE
-+};
-+DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE;
-+#endif
-+
- /*
- * Determine if we were loaded by an EFI loader. If so, then we have also been
- * passed the efi memmap, systab, etc., so we should use these data structures
-@@ -773,7 +835,7 @@ void __init setup_arch(char **cmdline_p)
- copy_edid();
- apm_info.bios = boot_params.apm_bios_info;
- ist_info = boot_params.ist_info;
-- saved_videomode = boot_params.hdr.vid_mode;
-+ saved_video_mode = boot_params.hdr.vid_mode;
- if( boot_params.sys_desc_table.length != 0 ) {
- set_mca_bus(boot_params.sys_desc_table.table[3] & 0x2);
- machine_id = boot_params.sys_desc_table.table[0];
-@@ -840,15 +902,19 @@ void __init setup_arch(char **cmdline_p)
- efi_init();
-
- /* update e820 for memory not covered by WB MTRRs */
-- find_max_pfn();
-+ propagate_e820_map();
- mtrr_bp_init();
- #ifndef CONFIG_XEN
- if (mtrr_trim_uncached_memory(max_pfn))
-- find_max_pfn();
-+ propagate_e820_map();
- #endif
-
- max_low_pfn = setup_memory();
-
-+#ifdef CONFIG_KVM_CLOCK
-+ kvmclock_init();
-+#endif
-+
- #ifdef CONFIG_VMI
- /*
- * Must be after max_low_pfn is determined, and before kernel
-@@ -856,6 +922,7 @@ void __init setup_arch(char **cmdline_p)
- */
- vmi_init();
- #endif
-+ kvm_guest_init();
-
- /*
- * NOTE: before this point _nobody_ is allowed to allocate
-@@ -977,6 +1044,18 @@ void __init setup_arch(char **cmdline_p)
-
- io_delay_init();
-
-+#if defined(CONFIG_X86_SMP) && !defined(CONFIG_XEN)
-+ /*
-+ * setup to use the early static init tables during kernel startup
-+ * X86_SMP will exclude sub-arches that don't deal well with it.
-+ */
-+ x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init;
-+ x86_bios_cpu_apicid_early_ptr = (void *)x86_bios_cpu_apicid_init;
-+#ifdef CONFIG_NUMA
-+ x86_cpu_to_node_map_early_ptr = (void *)x86_cpu_to_node_map_init;
-+#endif
-+#endif
-+
- #ifdef CONFIG_X86_GENERICARCH
- generic_apic_probe();
- #endif
---- a/arch/x86/kernel/setup_64-xen.c
-+++ b/arch/x86/kernel/setup_64-xen.c
-@@ -29,18 +29,22 @@
- #include <linux/crash_dump.h>
- #include <linux/root_dev.h>
- #include <linux/pci.h>
-+#include <asm/pci-direct.h>
- #include <linux/efi.h>
- #include <linux/acpi.h>
- #include <linux/kallsyms.h>
- #include <linux/edd.h>
-+#include <linux/iscsi_ibft.h>
- #include <linux/mmzone.h>
- #include <linux/kexec.h>
- #include <linux/cpufreq.h>
- #include <linux/dmi.h>
- #include <linux/dma-mapping.h>
- #include <linux/ctype.h>
-+#include <linux/sort.h>
- #include <linux/uaccess.h>
- #include <linux/init_ohci1394_dma.h>
-+#include <linux/kvm_para.h>
-
- #include <asm/mtrr.h>
- #include <asm/uaccess.h>
-@@ -58,7 +62,6 @@
- #include <asm/mmu_context.h>
- #include <asm/proto.h>
- #include <asm/setup.h>
--#include <asm/mach_apic.h>
- #include <asm/numa.h>
- #include <asm/sections.h>
- #include <asm/dmi.h>
-@@ -66,6 +69,9 @@
- #include <asm/mce.h>
- #include <asm/ds.h>
- #include <asm/topology.h>
-+#include <asm/pat.h>
-+
-+#include <mach_apic.h>
- #ifdef CONFIG_XEN
- #include <linux/percpu.h>
- #include <xen/interface/physdev.h>
-@@ -149,7 +155,7 @@ extern int root_mountflags;
-
- char __initdata command_line[COMMAND_LINE_SIZE];
-
--struct resource standard_io_resources[] = {
-+static struct resource standard_io_resources[] = {
- { .name = "dma1", .start = 0x00, .end = 0x1f,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO },
- { .name = "pic1", .start = 0x20, .end = 0x21,
-@@ -158,7 +164,9 @@ struct resource standard_io_resources[]
- .flags = IORESOURCE_BUSY | IORESOURCE_IO },
- { .name = "timer1", .start = 0x50, .end = 0x53,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO },
-- { .name = "keyboard", .start = 0x60, .end = 0x6f,
-+ { .name = "keyboard", .start = 0x60, .end = 0x60,
-+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
-+ { .name = "keyboard", .start = 0x64, .end = 0x64,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO },
- { .name = "dma page reg", .start = 0x80, .end = 0x8f,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO },
-@@ -224,8 +232,10 @@ contig_initmem_init(unsigned long start_
- e820_register_active_regions(0, start_pfn, end_pfn);
- #ifdef CONFIG_XEN
- free_bootmem_with_active_regions(0, xen_start_info->nr_pages);
-+ early_res_to_bootmem(0, xen_start_info->nr_pages<<PAGE_SHIFT);
- #else
- free_bootmem_with_active_regions(0, end_pfn);
-+ early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
- #endif
- reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
- }
-@@ -290,6 +300,7 @@ static void __init reserve_crashkernel(v
- (unsigned long)(total_mem >> 20));
- crashk_res.start = crash_base;
- crashk_res.end = crash_base + crash_size - 1;
-+ insert_resource(&iomem_resource, &crashk_res);
- }
- }
- #else
-@@ -306,6 +317,40 @@ void __attribute__((weak)) __init memory
- machine_specific_memory_setup();
- }
-
-+static void __init parse_setup_data(void)
-+{
-+ struct setup_data *data;
-+ unsigned long pa_data;
-+
-+ if (boot_params.hdr.version < 0x0209)
-+ return;
-+ pa_data = boot_params.hdr.setup_data;
-+ while (pa_data) {
-+ data = early_ioremap(pa_data, PAGE_SIZE);
-+ switch (data->type) {
-+ default:
-+ break;
-+ }
-+#ifndef CONFIG_DEBUG_BOOT_PARAMS
-+ free_early(pa_data, pa_data+sizeof(*data)+data->len);
-+#endif
-+ pa_data = data->next;
-+ early_iounmap(data, PAGE_SIZE);
-+ }
-+}
-+
-+#ifdef CONFIG_PCI_MMCONFIG
-+extern void __cpuinit fam10h_check_enable_mmcfg(void);
-+extern void __init check_enable_amd_mmconf_dmi(void);
-+#else
-+void __cpuinit fam10h_check_enable_mmcfg(void)
-+{
-+}
-+void __init check_enable_amd_mmconf_dmi(void)
-+{
-+}
-+#endif
-+
- /*
- * setup_arch - architecture-specific boot-time initializations
- *
-@@ -389,6 +434,8 @@ void __init setup_arch(char **cmdline_p)
- strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
- *cmdline_p = command_line;
-
-+ parse_setup_data();
-+
- parse_early_param();
-
- #ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
-@@ -398,6 +445,13 @@ void __init setup_arch(char **cmdline_p)
-
- finish_e820_parsing();
-
-+#ifndef CONFIG_XEN
-+ /* after parse_early_param, so could debug it */
-+ insert_resource(&iomem_resource, &code_resource);
-+ insert_resource(&iomem_resource, &data_resource);
-+ insert_resource(&iomem_resource, &bss_resource);
-+#endif
-+
- early_gart_iommu_check();
-
- e820_register_active_regions(0, 0, -1UL);
-@@ -420,15 +474,23 @@ void __init setup_arch(char **cmdline_p)
-
- check_efer();
-
-- init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT));
-+ max_pfn_mapped = init_memory_mapping(0, (max_pfn_mapped << PAGE_SHIFT));
- if (efi_enabled)
- efi_init();
-
-+#ifndef CONFIG_XEN
-+ vsmp_init();
-+#endif
-+
- if (is_initial_xendomain())
- dmi_scan_machine();
-
- io_delay_init();
-
-+#ifdef CONFIG_KVM_CLOCK
-+ kvmclock_init();
-+#endif
-+
- #if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
- /* setup to use the early static init tables during kernel startup */
- x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init;
-@@ -459,9 +521,9 @@ void __init setup_arch(char **cmdline_p)
- contig_initmem_init(0, end_pfn);
- #endif
-
-- early_res_to_bootmem();
--
- #ifndef CONFIG_XEN
-+ dma32_reserve_bootmem();
-+
- #ifdef CONFIG_ACPI_SLEEP
- /*
- * Reserve low memory region for sleep support.
-@@ -487,16 +549,17 @@ void __init setup_arch(char **cmdline_p)
- unsigned long end_of_mem = end_pfn << PAGE_SHIFT;
-
- if (ramdisk_end <= end_of_mem) {
--#ifndef CONFIG_XEN
-- reserve_bootmem_generic(ramdisk_image, ramdisk_size);
--#endif
-+ /*
-+ * don't need to reserve again, already reserved early
-+ * in x86_64_start_kernel, and early_res_to_bootmem
-+ * convert that to reserved in bootmem
-+ */
- initrd_start = ramdisk_image + PAGE_OFFSET;
- initrd_end = initrd_start+ramdisk_size;
- #ifdef CONFIG_XEN
- initrd_below_start_ok = 1;
- #endif
- } else {
-- /* Assumes everything on node 0 */
- free_bootmem(ramdisk_image, ramdisk_size);
- printk(KERN_ERR "initrd extends beyond end of memory "
- "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
-@@ -506,6 +569,9 @@ void __init setup_arch(char **cmdline_p)
- }
- #endif
- reserve_crashkernel();
-+
-+ reserve_ibft_region();
-+
- paging_init();
- map_vsyscall();
- #ifdef CONFIG_X86_LOCAL_APIC
-@@ -633,16 +699,16 @@ void __init setup_arch(char **cmdline_p)
- prefill_possible_map();
- #endif
-
-+ kvm_guest_init();
-+
- /*
- * We trust e820 completely. No explicit ROM probing in memory.
- */
- #ifdef CONFIG_XEN
- if (is_initial_xendomain())
-- e820_reserve_resources(machine_e820.map, machine_e820.nr_map,
-- &code_resource, &data_resource, &bss_resource);
-+ e820_reserve_resources(machine_e820.map, machine_e820.nr_map);
- #else
-- e820_reserve_resources(e820.map, e820.nr_map,
-- &code_resource, &data_resource, &bss_resource);
-+ e820_reserve_resources(e820.map, e820.nr_map);
- e820_mark_nosave_regions();
- #endif
-
-@@ -690,6 +756,9 @@ void __init setup_arch(char **cmdline_p)
- #endif
-
- #endif /* !CONFIG_XEN */
-+
-+ /* do this before identify_cpu for boot cpu */
-+ check_enable_amd_mmconf_dmi();
- }
-
- #ifdef CONFIG_XEN
-@@ -786,9 +855,9 @@ static void __cpuinit amd_detect_cmp(str
- bits = c->x86_coreid_bits;
-
- /* Low order bits define the core id (index of core in socket) */
-- c->cpu_core_id = c->phys_proc_id & ((1 << bits)-1);
-- /* Convert the APIC ID into the socket ID */
-- c->phys_proc_id = phys_pkg_id(bits);
-+ c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
-+ /* Convert the initial APIC ID into the socket ID */
-+ c->phys_proc_id = c->initial_apicid >> bits;
-
- #ifdef CONFIG_NUMA
- node = c->phys_proc_id;
-@@ -805,7 +874,7 @@ static void __cpuinit amd_detect_cmp(str
- If that doesn't result in a usable node fall back to the
- path for the previous case. */
-
-- int ht_nodeid = apicid - (cpu_data(0).phys_proc_id << bits);
-+ int ht_nodeid = c->initial_apicid;
-
- if (ht_nodeid >= 0 &&
- apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
-@@ -913,7 +982,7 @@ static void __cpuinit init_amd(struct cp
-
- /* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
- 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
-- clear_bit(0*32+31, (unsigned long *)&c->x86_capability);
-+ clear_cpu_cap(c, 0*32+31);
-
- /* On C+ stepping K8 rep microcode works well for copy/memset */
- level = cpuid_eax(1);
-@@ -955,9 +1024,25 @@ static void __cpuinit init_amd(struct cp
- /* MFENCE stops RDTSC speculation */
- set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
-
-+ if (c->x86 == 0x10)
-+ fam10h_check_enable_mmcfg();
-+
- #ifndef CONFIG_XEN
- if (amd_apic_timer_broken())
- disable_apic_timer = 1;
-+
-+ if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
-+ unsigned long long tseg;
-+
-+ /*
-+ * Split up direct mapping around the TSEG SMM area.
-+ * Don't do it for gbpages because there seems very little
-+ * benefit in doing so.
-+ */
-+ if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg) &&
-+ (tseg >> PMD_SHIFT) < (max_pfn_mapped >> (PMD_SHIFT-PAGE_SHIFT)))
-+ set_memory_4k((unsigned long)__va(tseg), 1);
-+ }
- #endif
- }
-
-@@ -1051,7 +1136,7 @@ static void __cpuinit early_init_intel(s
- {
- if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
- (c->x86 == 0x6 && c->x86_model >= 0x0e))
-- set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
-+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
- }
-
- static void __cpuinit init_intel(struct cpuinfo_x86 *c)
-@@ -1094,9 +1179,6 @@ static void __cpuinit init_intel(struct
-
- if (c->x86 == 15)
- c->x86_cache_alignment = c->x86_clflush_size * 2;
-- if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
-- (c->x86 == 0x6 && c->x86_model >= 0x0e))
-- set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
- if (c->x86 == 6)
- set_cpu_cap(c, X86_FEATURE_REP_GOOD);
- set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
-@@ -1105,6 +1187,32 @@ static void __cpuinit init_intel(struct
- srat_detect_node();
- }
-
-+static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
-+{
-+ if (c->x86 == 0x6 && c->x86_model >= 0xf)
-+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
-+}
-+
-+static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
-+{
-+ /* Cache sizes */
-+ unsigned n;
-+
-+ n = c->extended_cpuid_level;
-+ if (n >= 0x80000008) {
-+ unsigned eax = cpuid_eax(0x80000008);
-+ c->x86_virt_bits = (eax >> 8) & 0xff;
-+ c->x86_phys_bits = eax & 0xff;
-+ }
-+
-+ if (c->x86 == 0x6 && c->x86_model >= 0xf) {
-+ c->x86_cache_alignment = c->x86_clflush_size * 2;
-+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
-+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
-+ }
-+ set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
-+}
-+
- static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
- {
- char *v = c->x86_vendor_id;
-@@ -1113,6 +1221,8 @@ static void __cpuinit get_cpu_vendor(str
- c->x86_vendor = X86_VENDOR_AMD;
- else if (!strcmp(v, "GenuineIntel"))
- c->x86_vendor = X86_VENDOR_INTEL;
-+ else if (!strcmp(v, "CentaurHauls"))
-+ c->x86_vendor = X86_VENDOR_CENTAUR;
- else
- c->x86_vendor = X86_VENDOR_UNKNOWN;
- }
-@@ -1160,15 +1270,16 @@ static void __cpuinit early_identify_cpu
- c->x86 += (tfms >> 20) & 0xff;
- if (c->x86 >= 0x6)
- c->x86_model += ((tfms >> 16) & 0xF) << 4;
-- if (c->x86_capability[0] & (1<<19))
-+ if (test_cpu_cap(c, X86_FEATURE_CLFLSH))
- c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
- } else {
- /* Have CPUID level 0 only - unheard of */
- c->x86 = 4;
- }
-
-+ c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xff;
- #ifdef CONFIG_SMP
-- c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff;
-+ c->phys_proc_id = c->initial_apicid;
- #endif
- /* AMD-defined flags: level 0x80000001 */
- xlvl = cpuid_eax(0x80000000);
-@@ -1201,8 +1312,12 @@ static void __cpuinit early_identify_cpu
- case X86_VENDOR_INTEL:
- early_init_intel(c);
- break;
-+ case X86_VENDOR_CENTAUR:
-+ early_init_centaur(c);
-+ break;
- }
-
-+ validate_pat_support(c);
- }
-
- /*
-@@ -1237,6 +1352,10 @@ void __cpuinit identify_cpu(struct cpuin
- init_intel(c);
- break;
-
-+ case X86_VENDOR_CENTAUR:
-+ init_centaur(c);
-+ break;
-+
- case X86_VENDOR_UNKNOWN:
- default:
- display_cacheinfo(c);
-@@ -1266,14 +1385,24 @@ void __cpuinit identify_cpu(struct cpuin
- #endif
- select_idle_routine(c);
-
-- if (c != &boot_cpu_data)
-- mtrr_ap_init();
- #ifdef CONFIG_NUMA
- numa_add_cpu(smp_processor_id());
- #endif
-
- }
-
-+void __cpuinit identify_boot_cpu(void)
-+{
-+ identify_cpu(&boot_cpu_data);
-+}
-+
-+void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
-+{
-+ BUG_ON(c == &boot_cpu_data);
-+ identify_cpu(c);
-+ mtrr_ap_init();
-+}
-+
- static __init int setup_noclflush(char *arg)
- {
- setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
-@@ -1302,123 +1431,3 @@ static __init int setup_disablecpuid(cha
- return 1;
- }
- __setup("clearcpuid=", setup_disablecpuid);
--
--/*
-- * Get CPU information for use by the procfs.
-- */
--
--static int show_cpuinfo(struct seq_file *m, void *v)
--{
-- struct cpuinfo_x86 *c = v;
-- int cpu = 0, i;
--
--#ifdef CONFIG_SMP
-- cpu = c->cpu_index;
--#endif
--
-- seq_printf(m, "processor\t: %u\n"
-- "vendor_id\t: %s\n"
-- "cpu family\t: %d\n"
-- "model\t\t: %d\n"
-- "model name\t: %s\n",
-- (unsigned)cpu,
-- c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown",
-- c->x86,
-- (int)c->x86_model,
-- c->x86_model_id[0] ? c->x86_model_id : "unknown");
--
-- if (c->x86_mask || c->cpuid_level >= 0)
-- seq_printf(m, "stepping\t: %d\n", c->x86_mask);
-- else
-- seq_printf(m, "stepping\t: unknown\n");
--
-- if (cpu_has(c, X86_FEATURE_TSC)) {
-- unsigned int freq = cpufreq_quick_get((unsigned)cpu);
--
-- if (!freq)
-- freq = cpu_khz;
-- seq_printf(m, "cpu MHz\t\t: %u.%03u\n",
-- freq / 1000, (freq % 1000));
-- }
--
-- /* Cache size */
-- if (c->x86_cache_size >= 0)
-- seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size);
--
--#ifdef CONFIG_SMP
-- if (smp_num_siblings * c->x86_max_cores > 1) {
-- seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
-- seq_printf(m, "siblings\t: %d\n",
-- cpus_weight(per_cpu(cpu_core_map, cpu)));
-- seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
-- seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
-- }
--#endif
--
-- seq_printf(m,
-- "fpu\t\t: yes\n"
-- "fpu_exception\t: yes\n"
-- "cpuid level\t: %d\n"
-- "wp\t\t: yes\n"
-- "flags\t\t:",
-- c->cpuid_level);
--
-- for (i = 0; i < 32*NCAPINTS; i++)
-- if (cpu_has(c, i) && x86_cap_flags[i] != NULL)
-- seq_printf(m, " %s", x86_cap_flags[i]);
--
-- seq_printf(m, "\nbogomips\t: %lu.%02lu\n",
-- c->loops_per_jiffy/(500000/HZ),
-- (c->loops_per_jiffy/(5000/HZ)) % 100);
--
-- if (c->x86_tlbsize > 0)
-- seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize);
-- seq_printf(m, "clflush size\t: %d\n", c->x86_clflush_size);
-- seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment);
--
-- seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n",
-- c->x86_phys_bits, c->x86_virt_bits);
--
-- seq_printf(m, "power management:");
-- for (i = 0; i < 32; i++) {
-- if (c->x86_power & (1 << i)) {
-- if (i < ARRAY_SIZE(x86_power_flags) &&
-- x86_power_flags[i])
-- seq_printf(m, "%s%s",
-- x86_power_flags[i][0]?" ":"",
-- x86_power_flags[i]);
-- else
-- seq_printf(m, " [%d]", i);
-- }
-- }
--
-- seq_printf(m, "\n\n");
--
-- return 0;
--}
--
--static void *c_start(struct seq_file *m, loff_t *pos)
--{
-- if (*pos == 0) /* just in case, cpu 0 is not the first */
-- *pos = first_cpu(cpu_online_map);
-- if ((*pos) < NR_CPUS && cpu_online(*pos))
-- return &cpu_data(*pos);
-- return NULL;
--}
--
--static void *c_next(struct seq_file *m, void *v, loff_t *pos)
--{
-- *pos = next_cpu(*pos, cpu_online_map);
-- return c_start(m, pos);
--}
--
--static void c_stop(struct seq_file *m, void *v)
--{
--}
--
--const struct seq_operations cpuinfo_op = {
-- .start = c_start,
-- .next = c_next,
-- .stop = c_stop,
-- .show = show_cpuinfo,
--};
---- a/arch/x86/kernel/setup64-xen.c
-+++ b/arch/x86/kernel/setup64-xen.c
-@@ -15,6 +15,7 @@
- #include <linux/bootmem.h>
- #include <linux/bitops.h>
- #include <linux/module.h>
-+#include <linux/kgdb.h>
- #include <asm/pda.h>
- #include <asm/pgtable.h>
- #include <asm/processor.h>
-@@ -27,6 +28,7 @@
- #include <asm/proto.h>
- #include <asm/sections.h>
- #include <asm/setup.h>
-+#include <asm/genapic.h>
- #ifdef CONFIG_XEN
- #include <asm/hypervisor.h>
- #endif
-@@ -81,8 +83,8 @@ int force_personality32 = 0;
- Control non executable heap for 32bit processes.
- To control the stack too use noexec=off
-
--on PROT_READ does not imply PROT_EXEC for 32bit processes
--off PROT_READ implies PROT_EXEC (default)
-+on PROT_READ does not imply PROT_EXEC for 32bit processes (default)
-+off PROT_READ implies PROT_EXEC
- */
- static int __init nonx32_setup(char *str)
- {
-@@ -94,85 +96,6 @@ static int __init nonx32_setup(char *str
- }
- __setup("noexec32=", nonx32_setup);
-
--/*
-- * Copy data used in early init routines from the initial arrays to the
-- * per cpu data areas. These arrays then become expendable and the
-- * *_early_ptr's are zeroed indicating that the static arrays are gone.
-- */
--static void __init setup_per_cpu_maps(void)
--{
--#ifndef CONFIG_XEN
-- int cpu;
--
-- for_each_possible_cpu(cpu) {
--#ifdef CONFIG_SMP
-- if (per_cpu_offset(cpu)) {
--#endif
-- per_cpu(x86_cpu_to_apicid, cpu) =
-- x86_cpu_to_apicid_init[cpu];
-- per_cpu(x86_bios_cpu_apicid, cpu) =
-- x86_bios_cpu_apicid_init[cpu];
--#ifdef CONFIG_NUMA
-- per_cpu(x86_cpu_to_node_map, cpu) =
-- x86_cpu_to_node_map_init[cpu];
--#endif
--#ifdef CONFIG_SMP
-- }
-- else
-- printk(KERN_NOTICE "per_cpu_offset zero for cpu %d\n",
-- cpu);
--#endif
-- }
--
-- /* indicate the early static arrays will soon be gone */
-- x86_cpu_to_apicid_early_ptr = NULL;
-- x86_bios_cpu_apicid_early_ptr = NULL;
--#ifdef CONFIG_NUMA
-- x86_cpu_to_node_map_early_ptr = NULL;
--#endif
--#endif
--}
--
--/*
-- * Great future plan:
-- * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
-- * Always point %gs to its beginning
-- */
--void __init setup_per_cpu_areas(void)
--{
-- int i;
-- unsigned long size;
--
--#ifdef CONFIG_HOTPLUG_CPU
-- prefill_possible_map();
--#endif
--
-- /* Copy section for each CPU (we discard the original) */
-- size = PERCPU_ENOUGH_ROOM;
--
-- printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n", size);
-- for_each_cpu_mask (i, cpu_possible_map) {
-- char *ptr;
--#ifndef CONFIG_NEED_MULTIPLE_NODES
-- ptr = alloc_bootmem_pages(size);
--#else
-- int node = early_cpu_to_node(i);
--
-- if (!node_online(node) || !NODE_DATA(node))
-- ptr = alloc_bootmem_pages(size);
-- else
-- ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
--#endif
-- if (!ptr)
-- panic("Cannot allocate cpu data for CPU %d\n", i);
-- cpu_pda(i)->data_offset = ptr - __per_cpu_start;
-- memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
-- }
--
-- /* setup percpu data maps early */
-- setup_per_cpu_maps();
--}
--
- #ifdef CONFIG_XEN
- static void __init_refok switch_pt(int cpu)
- {
-@@ -410,6 +333,17 @@ void __cpuinit cpu_init (void)
- #endif
- load_LDT(&init_mm.context);
-
-+#ifdef CONFIG_KGDB
-+ /*
-+ * If the kgdb is connected no debug regs should be altered. This
-+ * is only applicable when KGDB and a KGDB I/O module are built
-+ * into the kernel and you are using early debugging with
-+ * kgdbwait. KGDB will control the kernel HW breakpoint registers.
-+ */
-+ if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
-+ arch_kgdb_ops.correct_hw_break();
-+ else {
-+#endif
- /*
- * Clear all 6 debug registers:
- */
-@@ -420,10 +354,17 @@ void __cpuinit cpu_init (void)
- set_debugreg(0UL, 3);
- set_debugreg(0UL, 6);
- set_debugreg(0UL, 7);
-+#ifdef CONFIG_KGDB
-+ /* If the kgdb is connected no debug regs should be altered. */
-+ }
-+#endif
-
- fpu_init();
-
- asm ("pushfq; popq %0" : "=rm" (kernel_eflags));
- if (raw_irqs_disabled())
- kernel_eflags &= ~X86_EFLAGS_IF;
-+
-+ if (is_uv_system())
-+ uv_cpu_init();
- }
---- /dev/null
-+++ b/arch/x86/kernel/setup-xen.c
-@@ -0,0 +1,141 @@
-+#include <linux/kernel.h>
-+#include <linux/module.h>
-+#include <linux/init.h>
-+#include <linux/bootmem.h>
-+#include <linux/percpu.h>
-+#include <asm/smp.h>
-+#include <asm/percpu.h>
-+#include <asm/sections.h>
-+#include <asm/processor.h>
-+#include <asm/setup.h>
-+#include <asm/topology.h>
-+#include <asm/mpspec.h>
-+#include <asm/apicdef.h>
-+
-+#ifdef CONFIG_X86_LOCAL_APIC
-+unsigned int num_processors;
-+unsigned disabled_cpus __cpuinitdata;
-+/* Processor that is doing the boot up */
-+unsigned int boot_cpu_physical_apicid = -1U;
-+EXPORT_SYMBOL(boot_cpu_physical_apicid);
-+
-+DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID;
-+EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid);
-+
-+/* Bitmask of physically existing CPUs */
-+physid_mask_t phys_cpu_present_map;
-+#endif
-+
-+#if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_X86_SMP)
-+/*
-+ * Copy data used in early init routines from the initial arrays to the
-+ * per cpu data areas. These arrays then become expendable and the
-+ * *_early_ptr's are zeroed indicating that the static arrays are gone.
-+ */
-+static void __init setup_per_cpu_maps(void)
-+{
-+#ifndef CONFIG_XEN
-+ int cpu;
-+
-+ for_each_possible_cpu(cpu) {
-+ per_cpu(x86_cpu_to_apicid, cpu) = x86_cpu_to_apicid_init[cpu];
-+ per_cpu(x86_bios_cpu_apicid, cpu) =
-+ x86_bios_cpu_apicid_init[cpu];
-+#ifdef CONFIG_NUMA
-+ per_cpu(x86_cpu_to_node_map, cpu) =
-+ x86_cpu_to_node_map_init[cpu];
-+#endif
-+ }
-+
-+ /* indicate the early static arrays will soon be gone */
-+ x86_cpu_to_apicid_early_ptr = NULL;
-+ x86_bios_cpu_apicid_early_ptr = NULL;
-+#ifdef CONFIG_NUMA
-+ x86_cpu_to_node_map_early_ptr = NULL;
-+#endif
-+#endif
-+}
-+
-+#ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP
-+cpumask_t *cpumask_of_cpu_map __read_mostly;
-+EXPORT_SYMBOL(cpumask_of_cpu_map);
-+
-+/* requires nr_cpu_ids to be initialized */
-+static void __init setup_cpumask_of_cpu(void)
-+{
-+ int i;
-+
-+ /* alloc_bootmem zeroes memory */
-+ cpumask_of_cpu_map = alloc_bootmem_low(sizeof(cpumask_t) * nr_cpu_ids);
-+ for (i = 0; i < nr_cpu_ids; i++)
-+ cpu_set(i, cpumask_of_cpu_map[i]);
-+}
-+#else
-+static inline void setup_cpumask_of_cpu(void) { }
-+#endif
-+
-+#ifdef CONFIG_X86_32
-+/*
-+ * Great future not-so-futuristic plan: make i386 and x86_64 do it
-+ * the same way
-+ */
-+unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
-+EXPORT_SYMBOL(__per_cpu_offset);
-+#endif
-+
-+/*
-+ * Great future plan:
-+ * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
-+ * Always point %gs to its beginning
-+ */
-+void __init setup_per_cpu_areas(void)
-+{
-+ int i, highest_cpu = 0;
-+ unsigned long size;
-+
-+#ifdef CONFIG_HOTPLUG_CPU
-+ prefill_possible_map();
-+#endif
-+
-+ /* Copy section for each CPU (we discard the original) */
-+ size = PERCPU_ENOUGH_ROOM;
-+ printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n",
-+ size);
-+
-+ for_each_possible_cpu(i) {
-+ char *ptr;
-+#ifndef CONFIG_NEED_MULTIPLE_NODES
-+ ptr = alloc_bootmem_pages(size);
-+#else
-+ int node = early_cpu_to_node(i);
-+ if (!node_online(node) || !NODE_DATA(node)) {
-+ ptr = alloc_bootmem_pages(size);
-+ printk(KERN_INFO
-+ "cpu %d has no node or node-local memory\n", i);
-+ }
-+ else
-+ ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
-+#endif
-+ if (!ptr)
-+ panic("Cannot allocate cpu data for CPU %d\n", i);
-+#ifdef CONFIG_X86_64
-+ cpu_pda(i)->data_offset = ptr - __per_cpu_start;
-+#else
-+ __per_cpu_offset[i] = ptr - __per_cpu_start;
-+#endif
-+ memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
-+
-+ highest_cpu = i;
-+ }
-+
-+ nr_cpu_ids = highest_cpu + 1;
-+ printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d\n", NR_CPUS, nr_cpu_ids);
-+
-+ /* Setup percpu data maps */
-+ setup_per_cpu_maps();
-+
-+ /* Setup cpumask_of_cpu map */
-+ setup_cpumask_of_cpu();
-+}
-+
-+#endif
---- a/arch/x86/kernel/smp_32-xen.c
-+++ /dev/null
-@@ -1,647 +0,0 @@
--/*
-- * Intel SMP support routines.
-- *
-- * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
-- * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
-- *
-- * This code is released under the GNU General Public License version 2 or
-- * later.
-- */
--
--#include <linux/init.h>
--
--#include <linux/mm.h>
--#include <linux/delay.h>
--#include <linux/spinlock.h>
--#include <linux/kernel_stat.h>
--#include <linux/mc146818rtc.h>
--#include <linux/cache.h>
--#include <linux/interrupt.h>
--#include <linux/cpu.h>
--#include <linux/module.h>
--
--#include <asm/mtrr.h>
--#include <asm/tlbflush.h>
--#include <asm/mmu_context.h>
--#if 0
--#include <mach_apic.h>
--#endif
--#include <xen/evtchn.h>
--
--/*
-- * Some notes on x86 processor bugs affecting SMP operation:
-- *
-- * Pentium, Pentium Pro, II, III (and all CPUs) have bugs.
-- * The Linux implications for SMP are handled as follows:
-- *
-- * Pentium III / [Xeon]
-- * None of the E1AP-E3AP errata are visible to the user.
-- *
-- * E1AP. see PII A1AP
-- * E2AP. see PII A2AP
-- * E3AP. see PII A3AP
-- *
-- * Pentium II / [Xeon]
-- * None of the A1AP-A3AP errata are visible to the user.
-- *
-- * A1AP. see PPro 1AP
-- * A2AP. see PPro 2AP
-- * A3AP. see PPro 7AP
-- *
-- * Pentium Pro
-- * None of 1AP-9AP errata are visible to the normal user,
-- * except occasional delivery of 'spurious interrupt' as trap #15.
-- * This is very rare and a non-problem.
-- *
-- * 1AP. Linux maps APIC as non-cacheable
-- * 2AP. worked around in hardware
-- * 3AP. fixed in C0 and above steppings microcode update.
-- * Linux does not use excessive STARTUP_IPIs.
-- * 4AP. worked around in hardware
-- * 5AP. symmetric IO mode (normal Linux operation) not affected.
-- * 'noapic' mode has vector 0xf filled out properly.
-- * 6AP. 'noapic' mode might be affected - fixed in later steppings
-- * 7AP. We do not assume writes to the LVT deassering IRQs
-- * 8AP. We do not enable low power mode (deep sleep) during MP bootup
-- * 9AP. We do not use mixed mode
-- *
-- * Pentium
-- * There is a marginal case where REP MOVS on 100MHz SMP
-- * machines with B stepping processors can fail. XXX should provide
-- * an L1cache=Writethrough or L1cache=off option.
-- *
-- * B stepping CPUs may hang. There are hardware work arounds
-- * for this. We warn about it in case your board doesn't have the work
-- * arounds. Basically that's so I can tell anyone with a B stepping
-- * CPU and SMP problems "tough".
-- *
-- * Specific items [From Pentium Processor Specification Update]
-- *
-- * 1AP. Linux doesn't use remote read
-- * 2AP. Linux doesn't trust APIC errors
-- * 3AP. We work around this
-- * 4AP. Linux never generated 3 interrupts of the same priority
-- * to cause a lost local interrupt.
-- * 5AP. Remote read is never used
-- * 6AP. not affected - worked around in hardware
-- * 7AP. not affected - worked around in hardware
-- * 8AP. worked around in hardware - we get explicit CS errors if not
-- * 9AP. only 'noapic' mode affected. Might generate spurious
-- * interrupts, we log only the first one and count the
-- * rest silently.
-- * 10AP. not affected - worked around in hardware
-- * 11AP. Linux reads the APIC between writes to avoid this, as per
-- * the documentation. Make sure you preserve this as it affects
-- * the C stepping chips too.
-- * 12AP. not affected - worked around in hardware
-- * 13AP. not affected - worked around in hardware
-- * 14AP. we always deassert INIT during bootup
-- * 15AP. not affected - worked around in hardware
-- * 16AP. not affected - worked around in hardware
-- * 17AP. not affected - worked around in hardware
-- * 18AP. not affected - worked around in hardware
-- * 19AP. not affected - worked around in BIOS
-- *
-- * If this sounds worrying believe me these bugs are either ___RARE___,
-- * or are signal timing bugs worked around in hardware and there's
-- * about nothing of note with C stepping upwards.
-- */
--
--DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate) ____cacheline_aligned = { &init_mm, 0, };
--
--/*
-- * the following functions deal with sending IPIs between CPUs.
-- *
-- * We use 'broadcast', CPU->CPU IPIs and self-IPIs too.
-- */
--
--static inline int __prepare_ICR (unsigned int shortcut, int vector)
--{
-- unsigned int icr = shortcut | APIC_DEST_LOGICAL;
--
-- switch (vector) {
-- default:
-- icr |= APIC_DM_FIXED | vector;
-- break;
-- case NMI_VECTOR:
-- icr |= APIC_DM_NMI;
-- break;
-- }
-- return icr;
--}
--
--static inline int __prepare_ICR2 (unsigned int mask)
--{
-- return SET_APIC_DEST_FIELD(mask);
--}
--
--DECLARE_PER_CPU(int, ipi_to_irq[NR_IPIS]);
--
--static inline void __send_IPI_one(unsigned int cpu, int vector)
--{
-- int irq = per_cpu(ipi_to_irq, cpu)[vector];
-- BUG_ON(irq < 0);
-- notify_remote_via_irq(irq);
--}
--
--void __send_IPI_shortcut(unsigned int shortcut, int vector)
--{
-- int cpu;
--
-- switch (shortcut) {
-- case APIC_DEST_SELF:
-- __send_IPI_one(smp_processor_id(), vector);
-- break;
-- case APIC_DEST_ALLBUT:
-- for (cpu = 0; cpu < NR_CPUS; ++cpu) {
-- if (cpu == smp_processor_id())
-- continue;
-- if (cpu_isset(cpu, cpu_online_map)) {
-- __send_IPI_one(cpu, vector);
-- }
-- }
-- break;
-- default:
-- printk("XXXXXX __send_IPI_shortcut %08x vector %d\n", shortcut,
-- vector);
-- break;
-- }
--}
--
--void send_IPI_self(int vector)
--{
-- __send_IPI_shortcut(APIC_DEST_SELF, vector);
--}
--
--/*
-- * This is only used on smaller machines.
-- */
--void send_IPI_mask_bitmask(cpumask_t mask, int vector)
--{
-- unsigned long flags;
-- unsigned int cpu;
--
-- local_irq_save(flags);
-- WARN_ON(cpus_addr(mask)[0] & ~cpus_addr(cpu_online_map)[0]);
--
-- for (cpu = 0; cpu < NR_CPUS; ++cpu) {
-- if (cpu_isset(cpu, mask)) {
-- __send_IPI_one(cpu, vector);
-- }
-- }
--
-- local_irq_restore(flags);
--}
--
--void send_IPI_mask_sequence(cpumask_t mask, int vector)
--{
--
-- send_IPI_mask_bitmask(mask, vector);
--}
--
--#include <mach_ipi.h> /* must come after the send_IPI functions above for inlining */
--
--#if 0 /* XEN */
--/*
-- * Smarter SMP flushing macros.
-- * c/o Linus Torvalds.
-- *
-- * These mean you can really definitely utterly forget about
-- * writing to user space from interrupts. (Its not allowed anyway).
-- *
-- * Optimizations Manfred Spraul <manfred@colorfullife.com>
-- */
--
--static cpumask_t flush_cpumask;
--static struct mm_struct * flush_mm;
--static unsigned long flush_va;
--static DEFINE_SPINLOCK(tlbstate_lock);
--
--/*
-- * We cannot call mmdrop() because we are in interrupt context,
-- * instead update mm->cpu_vm_mask.
-- *
-- * We need to reload %cr3 since the page tables may be going
-- * away from under us..
-- */
--void leave_mm(int cpu)
--{
-- if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
-- BUG();
-- cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask);
-- load_cr3(swapper_pg_dir);
--}
--EXPORT_SYMBOL_GPL(leave_mm);
--
--/*
-- *
-- * The flush IPI assumes that a thread switch happens in this order:
-- * [cpu0: the cpu that switches]
-- * 1) switch_mm() either 1a) or 1b)
-- * 1a) thread switch to a different mm
-- * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
-- * Stop ipi delivery for the old mm. This is not synchronized with
-- * the other cpus, but smp_invalidate_interrupt ignore flush ipis
-- * for the wrong mm, and in the worst case we perform a superfluous
-- * tlb flush.
-- * 1a2) set cpu_tlbstate to TLBSTATE_OK
-- * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
-- * was in lazy tlb mode.
-- * 1a3) update cpu_tlbstate[].active_mm
-- * Now cpu0 accepts tlb flushes for the new mm.
-- * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
-- * Now the other cpus will send tlb flush ipis.
-- * 1a4) change cr3.
-- * 1b) thread switch without mm change
-- * cpu_tlbstate[].active_mm is correct, cpu0 already handles
-- * flush ipis.
-- * 1b1) set cpu_tlbstate to TLBSTATE_OK
-- * 1b2) test_and_set the cpu bit in cpu_vm_mask.
-- * Atomically set the bit [other cpus will start sending flush ipis],
-- * and test the bit.
-- * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
-- * 2) switch %%esp, ie current
-- *
-- * The interrupt must handle 2 special cases:
-- * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
-- * - the cpu performs speculative tlb reads, i.e. even if the cpu only
-- * runs in kernel space, the cpu could load tlb entries for user space
-- * pages.
-- *
-- * The good news is that cpu_tlbstate is local to each cpu, no
-- * write/read ordering problems.
-- */
--
--/*
-- * TLB flush IPI:
-- *
-- * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
-- * 2) Leave the mm if we are in the lazy tlb mode.
-- */
--
--irqreturn_t smp_invalidate_interrupt(int irq, void *dev_id)
--{
-- unsigned long cpu;
--
-- cpu = get_cpu();
--
-- if (!cpu_isset(cpu, flush_cpumask))
-- goto out;
-- /*
-- * This was a BUG() but until someone can quote me the
-- * line from the intel manual that guarantees an IPI to
-- * multiple CPUs is retried _only_ on the erroring CPUs
-- * its staying as a return
-- *
-- * BUG();
-- */
--
-- if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) {
-- if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) {
-- if (flush_va == TLB_FLUSH_ALL)
-- local_flush_tlb();
-- else
-- __flush_tlb_one(flush_va);
-- } else
-- leave_mm(cpu);
-- }
-- smp_mb__before_clear_bit();
-- cpu_clear(cpu, flush_cpumask);
-- smp_mb__after_clear_bit();
--out:
-- put_cpu_no_resched();
-- __get_cpu_var(irq_stat).irq_tlb_count++;
--
-- return IRQ_HANDLED;
--}
--
--void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
-- unsigned long va)
--{
-- cpumask_t cpumask = *cpumaskp;
--
-- /*
-- * A couple of (to be removed) sanity checks:
-- *
-- * - current CPU must not be in mask
-- * - mask must exist :)
-- */
-- BUG_ON(cpus_empty(cpumask));
-- BUG_ON(cpu_isset(smp_processor_id(), cpumask));
-- BUG_ON(!mm);
--
--#ifdef CONFIG_HOTPLUG_CPU
-- /* If a CPU which we ran on has gone down, OK. */
-- cpus_and(cpumask, cpumask, cpu_online_map);
-- if (unlikely(cpus_empty(cpumask)))
-- return;
--#endif
--
-- /*
-- * i'm not happy about this global shared spinlock in the
-- * MM hot path, but we'll see how contended it is.
-- * AK: x86-64 has a faster method that could be ported.
-- */
-- spin_lock(&tlbstate_lock);
--
-- flush_mm = mm;
-- flush_va = va;
-- cpus_or(flush_cpumask, cpumask, flush_cpumask);
-- /*
-- * We have to send the IPI only to
-- * CPUs affected.
-- */
-- send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR);
--
-- while (!cpus_empty(flush_cpumask))
-- /* nothing. lockup detection does not belong here */
-- cpu_relax();
--
-- flush_mm = NULL;
-- flush_va = 0;
-- spin_unlock(&tlbstate_lock);
--}
--
--void flush_tlb_current_task(void)
--{
-- struct mm_struct *mm = current->mm;
-- cpumask_t cpu_mask;
--
-- preempt_disable();
-- cpu_mask = mm->cpu_vm_mask;
-- cpu_clear(smp_processor_id(), cpu_mask);
--
-- local_flush_tlb();
-- if (!cpus_empty(cpu_mask))
-- flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
-- preempt_enable();
--}
--
--void flush_tlb_mm (struct mm_struct * mm)
--{
-- cpumask_t cpu_mask;
--
-- preempt_disable();
-- cpu_mask = mm->cpu_vm_mask;
-- cpu_clear(smp_processor_id(), cpu_mask);
--
-- if (current->active_mm == mm) {
-- if (current->mm)
-- local_flush_tlb();
-- else
-- leave_mm(smp_processor_id());
-- }
-- if (!cpus_empty(cpu_mask))
-- flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
--
-- preempt_enable();
--}
--
--void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
--{
-- struct mm_struct *mm = vma->vm_mm;
-- cpumask_t cpu_mask;
--
-- preempt_disable();
-- cpu_mask = mm->cpu_vm_mask;
-- cpu_clear(smp_processor_id(), cpu_mask);
--
-- if (current->active_mm == mm) {
-- if(current->mm)
-- __flush_tlb_one(va);
-- else
-- leave_mm(smp_processor_id());
-- }
--
-- if (!cpus_empty(cpu_mask))
-- flush_tlb_others(cpu_mask, mm, va);
--
-- preempt_enable();
--}
--EXPORT_SYMBOL(flush_tlb_page);
--
--static void do_flush_tlb_all(void* info)
--{
-- unsigned long cpu = smp_processor_id();
--
-- __flush_tlb_all();
-- if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_LAZY)
-- leave_mm(cpu);
--}
--
--void flush_tlb_all(void)
--{
-- on_each_cpu(do_flush_tlb_all, NULL, 1, 1);
--}
--
--#endif /* XEN */
--
--/*
-- * this function sends a 'reschedule' IPI to another CPU.
-- * it goes straight through and wastes no time serializing
-- * anything. Worst case is that we lose a reschedule ...
-- */
--void xen_smp_send_reschedule(int cpu)
--{
-- WARN_ON(cpu_is_offline(cpu));
-- send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
--}
--
--/*
-- * Structure and data for smp_call_function(). This is designed to minimise
-- * static memory requirements. It also looks cleaner.
-- */
--static DEFINE_SPINLOCK(call_lock);
--
--struct call_data_struct {
-- void (*func) (void *info);
-- void *info;
-- atomic_t started;
-- atomic_t finished;
-- int wait;
--};
--
--void lock_ipi_call_lock(void)
--{
-- spin_lock_irq(&call_lock);
--}
--
--void unlock_ipi_call_lock(void)
--{
-- spin_unlock_irq(&call_lock);
--}
--
--static struct call_data_struct *call_data;
--
--static void __smp_call_function(void (*func) (void *info), void *info,
-- int nonatomic, int wait)
--{
-- struct call_data_struct data;
-- int cpus = num_online_cpus() - 1;
--
-- if (!cpus)
-- return;
--
-- data.func = func;
-- data.info = info;
-- atomic_set(&data.started, 0);
-- data.wait = wait;
-- if (wait)
-- atomic_set(&data.finished, 0);
--
-- call_data = &data;
-- mb();
--
-- /* Send a message to all other CPUs and wait for them to respond */
-- send_IPI_allbutself(CALL_FUNCTION_VECTOR);
--
-- /* Wait for response */
-- while (atomic_read(&data.started) != cpus)
-- cpu_relax();
--
-- if (wait)
-- while (atomic_read(&data.finished) != cpus)
-- cpu_relax();
--}
--
--
--/**
-- * smp_call_function_mask(): Run a function on a set of other CPUs.
-- * @mask: The set of cpus to run on. Must not include the current cpu.
-- * @func: The function to run. This must be fast and non-blocking.
-- * @info: An arbitrary pointer to pass to the function.
-- * @wait: If true, wait (atomically) until function has completed on other CPUs.
-- *
-- * Returns 0 on success, else a negative status code.
-- *
-- * If @wait is true, then returns once @func has returned; otherwise
-- * it returns just before the target cpu calls @func.
-- *
-- * You must not call this function with disabled interrupts or from a
-- * hardware interrupt handler or from a bottom half handler.
-- */
--int
--xen_smp_call_function_mask(cpumask_t mask,
-- void (*func)(void *), void *info,
-- int wait)
--{
-- struct call_data_struct data;
-- cpumask_t allbutself;
-- int cpus;
--
-- /* Can deadlock when called with interrupts disabled */
-- WARN_ON(irqs_disabled());
--
-- /* Holding any lock stops cpus from going down. */
-- spin_lock(&call_lock);
--
-- allbutself = cpu_online_map;
-- cpu_clear(smp_processor_id(), allbutself);
--
-- cpus_and(mask, mask, allbutself);
-- cpus = cpus_weight(mask);
--
-- if (!cpus) {
-- spin_unlock(&call_lock);
-- return 0;
-- }
--
-- data.func = func;
-- data.info = info;
-- atomic_set(&data.started, 0);
-- data.wait = wait;
-- if (wait)
-- atomic_set(&data.finished, 0);
--
-- call_data = &data;
-- mb();
--
-- /* Send a message to other CPUs */
-- if (cpus_equal(mask, allbutself))
-- send_IPI_allbutself(CALL_FUNCTION_VECTOR);
-- else
-- send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
--
-- /* Wait for response */
-- while (atomic_read(&data.started) != cpus)
-- cpu_relax();
--
-- if (wait)
-- while (atomic_read(&data.finished) != cpus)
-- cpu_relax();
-- spin_unlock(&call_lock);
--
-- return 0;
--}
--
--static void stop_this_cpu (void * dummy)
--{
-- local_irq_disable();
-- /*
-- * Remove this CPU:
-- */
-- cpu_clear(smp_processor_id(), cpu_online_map);
-- disable_all_local_evtchn();
-- if (cpu_data(smp_processor_id()).hlt_works_ok)
-- for(;;) halt();
-- for (;;);
--}
--
--/*
-- * this function calls the 'stop' function on all other CPUs in the system.
-- */
--
--void xen_smp_send_stop(void)
--{
-- /* Don't deadlock on the call lock in panic */
-- int nolock = !spin_trylock(&call_lock);
-- unsigned long flags;
--
-- local_irq_save(flags);
-- __smp_call_function(stop_this_cpu, NULL, 0, 0);
-- if (!nolock)
-- spin_unlock(&call_lock);
-- disable_all_local_evtchn();
-- local_irq_restore(flags);
--}
--
--/*
-- * Reschedule call back. Nothing to do,
-- * all the work is done automatically when
-- * we return from the interrupt.
-- */
--irqreturn_t smp_reschedule_interrupt(int irq, void *dev_id)
--{
-- __get_cpu_var(irq_stat).irq_resched_count++;
--
-- return IRQ_HANDLED;
--}
--
--#include <linux/kallsyms.h>
--irqreturn_t smp_call_function_interrupt(int irq, void *dev_id)
--{
-- void (*func) (void *info) = call_data->func;
-- void *info = call_data->info;
-- int wait = call_data->wait;
--
-- /*
-- * Notify initiating CPU that I've grabbed the data and am
-- * about to execute the function
-- */
-- mb();
-- atomic_inc(&call_data->started);
-- /*
-- * At this point the info structure may be out of scope unless wait==1
-- */
-- irq_enter();
-- (*func)(info);
-- __get_cpu_var(irq_stat).irq_call_count++;
-- irq_exit();
--
-- if (wait) {
-- mb();
-- atomic_inc(&call_data->finished);
-- }
--
-- return IRQ_HANDLED;
--}
---- a/arch/x86/kernel/smp_64-xen.c
-+++ /dev/null
-@@ -1,554 +0,0 @@
--/*
-- * Intel SMP support routines.
-- *
-- * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
-- * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
-- * (c) 2002,2003 Andi Kleen, SuSE Labs.
-- *
-- * This code is released under the GNU General Public License version 2 or
-- * later.
-- */
--
--#include <linux/init.h>
--
--#include <linux/mm.h>
--#include <linux/delay.h>
--#include <linux/spinlock.h>
--#include <linux/smp.h>
--#include <linux/kernel_stat.h>
--#include <linux/mc146818rtc.h>
--#include <linux/interrupt.h>
--
--#include <asm/mtrr.h>
--#include <asm/pgalloc.h>
--#include <asm/tlbflush.h>
--#include <asm/mach_apic.h>
--#include <asm/mmu_context.h>
--#include <asm/proto.h>
--#include <asm/apicdef.h>
--#include <asm/idle.h>
--#ifdef CONFIG_XEN
--#include <xen/evtchn.h>
--#endif
--
--#ifndef CONFIG_XEN
--/*
-- * Smarter SMP flushing macros.
-- * c/o Linus Torvalds.
-- *
-- * These mean you can really definitely utterly forget about
-- * writing to user space from interrupts. (Its not allowed anyway).
-- *
-- * Optimizations Manfred Spraul <manfred@colorfullife.com>
-- *
-- * More scalable flush, from Andi Kleen
-- *
-- * To avoid global state use 8 different call vectors.
-- * Each CPU uses a specific vector to trigger flushes on other
-- * CPUs. Depending on the received vector the target CPUs look into
-- * the right per cpu variable for the flush data.
-- *
-- * With more than 8 CPUs they are hashed to the 8 available
-- * vectors. The limited global vector space forces us to this right now.
-- * In future when interrupts are split into per CPU domains this could be
-- * fixed, at the cost of triggering multiple IPIs in some cases.
-- */
--
--union smp_flush_state {
-- struct {
-- cpumask_t flush_cpumask;
-- struct mm_struct *flush_mm;
-- unsigned long flush_va;
-- spinlock_t tlbstate_lock;
-- };
-- char pad[SMP_CACHE_BYTES];
--} ____cacheline_aligned;
--
--/* State is put into the per CPU data section, but padded
-- to a full cache line because other CPUs can access it and we don't
-- want false sharing in the per cpu data segment. */
--static DEFINE_PER_CPU(union smp_flush_state, flush_state);
--
--/*
-- * We cannot call mmdrop() because we are in interrupt context,
-- * instead update mm->cpu_vm_mask.
-- */
--void leave_mm(int cpu)
--{
-- if (read_pda(mmu_state) == TLBSTATE_OK)
-- BUG();
-- cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask);
-- load_cr3(swapper_pg_dir);
--}
--EXPORT_SYMBOL_GPL(leave_mm);
--
--/*
-- *
-- * The flush IPI assumes that a thread switch happens in this order:
-- * [cpu0: the cpu that switches]
-- * 1) switch_mm() either 1a) or 1b)
-- * 1a) thread switch to a different mm
-- * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
-- * Stop ipi delivery for the old mm. This is not synchronized with
-- * the other cpus, but smp_invalidate_interrupt ignore flush ipis
-- * for the wrong mm, and in the worst case we perform a superfluous
-- * tlb flush.
-- * 1a2) set cpu mmu_state to TLBSTATE_OK
-- * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
-- * was in lazy tlb mode.
-- * 1a3) update cpu active_mm
-- * Now cpu0 accepts tlb flushes for the new mm.
-- * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
-- * Now the other cpus will send tlb flush ipis.
-- * 1a4) change cr3.
-- * 1b) thread switch without mm change
-- * cpu active_mm is correct, cpu0 already handles
-- * flush ipis.
-- * 1b1) set cpu mmu_state to TLBSTATE_OK
-- * 1b2) test_and_set the cpu bit in cpu_vm_mask.
-- * Atomically set the bit [other cpus will start sending flush ipis],
-- * and test the bit.
-- * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
-- * 2) switch %%esp, ie current
-- *
-- * The interrupt must handle 2 special cases:
-- * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
-- * - the cpu performs speculative tlb reads, i.e. even if the cpu only
-- * runs in kernel space, the cpu could load tlb entries for user space
-- * pages.
-- *
-- * The good news is that cpu mmu_state is local to each cpu, no
-- * write/read ordering problems.
-- */
--
--/*
-- * TLB flush IPI:
-- *
-- * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
-- * 2) Leave the mm if we are in the lazy tlb mode.
-- *
-- * Interrupts are disabled.
-- */
--
--asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs)
--{
-- int cpu;
-- int sender;
-- union smp_flush_state *f;
--
-- cpu = smp_processor_id();
-- /*
-- * orig_rax contains the negated interrupt vector.
-- * Use that to determine where the sender put the data.
-- */
-- sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START;
-- f = &per_cpu(flush_state, sender);
--
-- if (!cpu_isset(cpu, f->flush_cpumask))
-- goto out;
-- /*
-- * This was a BUG() but until someone can quote me the
-- * line from the intel manual that guarantees an IPI to
-- * multiple CPUs is retried _only_ on the erroring CPUs
-- * its staying as a return
-- *
-- * BUG();
-- */
--
-- if (f->flush_mm == read_pda(active_mm)) {
-- if (read_pda(mmu_state) == TLBSTATE_OK) {
-- if (f->flush_va == TLB_FLUSH_ALL)
-- local_flush_tlb();
-- else
-- __flush_tlb_one(f->flush_va);
-- } else
-- leave_mm(cpu);
-- }
--out:
-- ack_APIC_irq();
-- cpu_clear(cpu, f->flush_cpumask);
-- add_pda(irq_tlb_count, 1);
--}
--
--void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
-- unsigned long va)
--{
-- int sender;
-- union smp_flush_state *f;
-- cpumask_t cpumask = *cpumaskp;
--
-- /* Caller has disabled preemption */
-- sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
-- f = &per_cpu(flush_state, sender);
--
-- /*
-- * Could avoid this lock when
-- * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
-- * probably not worth checking this for a cache-hot lock.
-- */
-- spin_lock(&f->tlbstate_lock);
--
-- f->flush_mm = mm;
-- f->flush_va = va;
-- cpus_or(f->flush_cpumask, cpumask, f->flush_cpumask);
--
-- /*
-- * We have to send the IPI only to
-- * CPUs affected.
-- */
-- send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR_START + sender);
--
-- while (!cpus_empty(f->flush_cpumask))
-- cpu_relax();
--
-- f->flush_mm = NULL;
-- f->flush_va = 0;
-- spin_unlock(&f->tlbstate_lock);
--}
--
--int __cpuinit init_smp_flush(void)
--{
-- int i;
--
-- for_each_cpu_mask(i, cpu_possible_map) {
-- spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock);
-- }
-- return 0;
--}
--core_initcall(init_smp_flush);
--
--void flush_tlb_current_task(void)
--{
-- struct mm_struct *mm = current->mm;
-- cpumask_t cpu_mask;
--
-- preempt_disable();
-- cpu_mask = mm->cpu_vm_mask;
-- cpu_clear(smp_processor_id(), cpu_mask);
--
-- local_flush_tlb();
-- if (!cpus_empty(cpu_mask))
-- flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
-- preempt_enable();
--}
--
--void flush_tlb_mm (struct mm_struct * mm)
--{
-- cpumask_t cpu_mask;
--
-- preempt_disable();
-- cpu_mask = mm->cpu_vm_mask;
-- cpu_clear(smp_processor_id(), cpu_mask);
--
-- if (current->active_mm == mm) {
-- if (current->mm)
-- local_flush_tlb();
-- else
-- leave_mm(smp_processor_id());
-- }
-- if (!cpus_empty(cpu_mask))
-- flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
--
-- preempt_enable();
--}
--
--void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
--{
-- struct mm_struct *mm = vma->vm_mm;
-- cpumask_t cpu_mask;
--
-- preempt_disable();
-- cpu_mask = mm->cpu_vm_mask;
-- cpu_clear(smp_processor_id(), cpu_mask);
--
-- if (current->active_mm == mm) {
-- if(current->mm)
-- __flush_tlb_one(va);
-- else
-- leave_mm(smp_processor_id());
-- }
--
-- if (!cpus_empty(cpu_mask))
-- flush_tlb_others(cpu_mask, mm, va);
--
-- preempt_enable();
--}
--
--static void do_flush_tlb_all(void* info)
--{
-- unsigned long cpu = smp_processor_id();
--
-- __flush_tlb_all();
-- if (read_pda(mmu_state) == TLBSTATE_LAZY)
-- leave_mm(cpu);
--}
--
--void flush_tlb_all(void)
--{
-- on_each_cpu(do_flush_tlb_all, NULL, 1, 1);
--}
--#endif /* Xen */
--
--/*
-- * this function sends a 'reschedule' IPI to another CPU.
-- * it goes straight through and wastes no time serializing
-- * anything. Worst case is that we lose a reschedule ...
-- */
--
--void smp_send_reschedule(int cpu)
--{
-- send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
--}
--
--/*
-- * Structure and data for smp_call_function(). This is designed to minimise
-- * static memory requirements. It also looks cleaner.
-- */
--static DEFINE_SPINLOCK(call_lock);
--
--struct call_data_struct {
-- void (*func) (void *info);
-- void *info;
-- atomic_t started;
-- atomic_t finished;
-- int wait;
--};
--
--static struct call_data_struct * call_data;
--
--void lock_ipi_call_lock(void)
--{
-- spin_lock_irq(&call_lock);
--}
--
--void unlock_ipi_call_lock(void)
--{
-- spin_unlock_irq(&call_lock);
--}
--
--/*
-- * this function sends a 'generic call function' IPI to all other CPU
-- * of the system defined in the mask.
-- */
--static int __smp_call_function_mask(cpumask_t mask,
-- void (*func)(void *), void *info,
-- int wait)
--{
-- struct call_data_struct data;
-- cpumask_t allbutself;
-- int cpus;
--
-- allbutself = cpu_online_map;
-- cpu_clear(smp_processor_id(), allbutself);
--
-- cpus_and(mask, mask, allbutself);
-- cpus = cpus_weight(mask);
--
-- if (!cpus)
-- return 0;
--
-- data.func = func;
-- data.info = info;
-- atomic_set(&data.started, 0);
-- data.wait = wait;
-- if (wait)
-- atomic_set(&data.finished, 0);
--
-- call_data = &data;
-- wmb();
--
-- /* Send a message to other CPUs */
-- if (cpus_equal(mask, allbutself))
-- send_IPI_allbutself(CALL_FUNCTION_VECTOR);
-- else
-- send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
--
-- /* Wait for response */
-- while (atomic_read(&data.started) != cpus)
-- cpu_relax();
--
-- if (!wait)
-- return 0;
--
-- while (atomic_read(&data.finished) != cpus)
-- cpu_relax();
--
-- return 0;
--}
--/**
-- * smp_call_function_mask(): Run a function on a set of other CPUs.
-- * @mask: The set of cpus to run on. Must not include the current cpu.
-- * @func: The function to run. This must be fast and non-blocking.
-- * @info: An arbitrary pointer to pass to the function.
-- * @wait: If true, wait (atomically) until function has completed on other CPUs.
-- *
-- * Returns 0 on success, else a negative status code.
-- *
-- * If @wait is true, then returns once @func has returned; otherwise
-- * it returns just before the target cpu calls @func.
-- *
-- * You must not call this function with disabled interrupts or from a
-- * hardware interrupt handler or from a bottom half handler.
-- */
--int smp_call_function_mask(cpumask_t mask,
-- void (*func)(void *), void *info,
-- int wait)
--{
-- int ret;
--
-- /* Can deadlock when called with interrupts disabled */
-- WARN_ON(irqs_disabled());
--
-- spin_lock(&call_lock);
-- ret = __smp_call_function_mask(mask, func, info, wait);
-- spin_unlock(&call_lock);
-- return ret;
--}
--EXPORT_SYMBOL(smp_call_function_mask);
--
--/*
-- * smp_call_function_single - Run a function on a specific CPU
-- * @func: The function to run. This must be fast and non-blocking.
-- * @info: An arbitrary pointer to pass to the function.
-- * @nonatomic: Currently unused.
-- * @wait: If true, wait until function has completed on other CPUs.
-- *
-- * Retrurns 0 on success, else a negative status code.
-- *
-- * Does not return until the remote CPU is nearly ready to execute <func>
-- * or is or has executed.
-- */
--
--int smp_call_function_single (int cpu, void (*func) (void *info), void *info,
-- int nonatomic, int wait)
--{
-- /* prevent preemption and reschedule on another processor */
-- int ret, me = get_cpu();
--
-- /* Can deadlock when called with interrupts disabled */
-- WARN_ON(irqs_disabled());
--
-- if (cpu == me) {
-- local_irq_disable();
-- func(info);
-- local_irq_enable();
-- put_cpu();
-- return 0;
-- }
--
-- ret = smp_call_function_mask(cpumask_of_cpu(cpu), func, info, wait);
--
-- put_cpu();
-- return ret;
--}
--EXPORT_SYMBOL(smp_call_function_single);
--
--/*
-- * smp_call_function - run a function on all other CPUs.
-- * @func: The function to run. This must be fast and non-blocking.
-- * @info: An arbitrary pointer to pass to the function.
-- * @nonatomic: currently unused.
-- * @wait: If true, wait (atomically) until function has completed on other
-- * CPUs.
-- *
-- * Returns 0 on success, else a negative status code. Does not return until
-- * remote CPUs are nearly ready to execute func or are or have executed.
-- *
-- * You must not call this function with disabled interrupts or from a
-- * hardware interrupt handler or from a bottom half handler.
-- * Actually there are a few legal cases, like panic.
-- */
--int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
-- int wait)
--{
-- return smp_call_function_mask(cpu_online_map, func, info, wait);
--}
--EXPORT_SYMBOL(smp_call_function);
--
--static void stop_this_cpu(void *dummy)
--{
-- local_irq_disable();
-- /*
-- * Remove this CPU:
-- */
-- cpu_clear(smp_processor_id(), cpu_online_map);
-- disable_all_local_evtchn();
-- for (;;)
-- halt();
--}
--
--void smp_send_stop(void)
--{
-- int nolock;
-- unsigned long flags;
--
--#ifndef CONFIG_XEN
-- if (reboot_force)
-- return;
--#endif
--
-- /* Don't deadlock on the call lock in panic */
-- nolock = !spin_trylock(&call_lock);
-- local_irq_save(flags);
-- __smp_call_function_mask(cpu_online_map, stop_this_cpu, NULL, 0);
-- if (!nolock)
-- spin_unlock(&call_lock);
-- disable_all_local_evtchn();
-- local_irq_restore(flags);
--}
--
--/*
-- * Reschedule call back. Nothing to do,
-- * all the work is done automatically when
-- * we return from the interrupt.
-- */
--#ifndef CONFIG_XEN
--asmlinkage void smp_reschedule_interrupt(void)
--#else
--asmlinkage irqreturn_t smp_reschedule_interrupt(int irq, void *ctx)
--#endif
--{
--#ifndef CONFIG_XEN
-- ack_APIC_irq();
--#endif
-- add_pda(irq_resched_count, 1);
--#ifdef CONFIG_XEN
-- return IRQ_HANDLED;
--#endif
--}
--
--#ifndef CONFIG_XEN
--asmlinkage void smp_call_function_interrupt(void)
--#else
--asmlinkage irqreturn_t smp_call_function_interrupt(int irq, void *ctx)
--#endif
--{
-- void (*func) (void *info) = call_data->func;
-- void *info = call_data->info;
-- int wait = call_data->wait;
--
--#ifndef CONFIG_XEN
-- ack_APIC_irq();
--#endif
-- /*
-- * Notify initiating CPU that I've grabbed the data and am
-- * about to execute the function
-- */
-- mb();
-- atomic_inc(&call_data->started);
-- /*
-- * At this point the info structure may be out of scope unless wait==1
-- */
-- exit_idle();
-- irq_enter();
-- (*func)(info);
-- add_pda(irq_call_count, 1);
-- irq_exit();
-- if (wait) {
-- mb();
-- atomic_inc(&call_data->finished);
-- }
--#ifdef CONFIG_XEN
-- return IRQ_HANDLED;
--#endif
--}
---- /dev/null
-+++ b/arch/x86/kernel/smp-xen.c
-@@ -0,0 +1,329 @@
+--- /dev/null 1970-01-01 00:00:00.000000000 +0000
++++ sle11-2009-05-14/arch/x86/kernel/mpparse-xen.c 2009-03-16 16:38:05.000000000 +0100
+@@ -0,0 +1,1101 @@
++/*
++ * Intel Multiprocessor Specification 1.1 and 1.4
++ * compliant MP-table parsing routines.
++ *
++ * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
++ * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
++ * (c) 2008 Alexey Starikovskiy <astarikovskiy@suse.de>
++ */
++
++#include <linux/mm.h>
++#include <linux/init.h>
++#include <linux/delay.h>
++#include <linux/bootmem.h>
++#include <linux/kernel_stat.h>
++#include <linux/mc146818rtc.h>
++#include <linux/bitops.h>
++#include <linux/acpi.h>
++#include <linux/module.h>
++
++#include <asm/smp.h>
++#include <asm/mtrr.h>
++#include <asm/mpspec.h>
++#include <asm/pgalloc.h>
++#include <asm/io_apic.h>
++#include <asm/proto.h>
++#include <asm/acpi.h>
++#include <asm/bios_ebda.h>
++
++#include <mach_apic.h>
++#ifdef CONFIG_X86_32
++#include <mach_apicdef.h>
++#include <mach_mpparse.h>
++#endif
++
++/* Have we found an MP table */
++int smp_found_config;
++
++/*
++ * Various Linux-internal data structures created from the
++ * MP-table.
++ */
++#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
++int mp_bus_id_to_type[MAX_MP_BUSSES];
++#endif
++
++DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
++int mp_bus_id_to_pci_bus[MAX_MP_BUSSES] = {[0 ... MAX_MP_BUSSES - 1] = -1 };
++
++static int mp_current_pci_id;
++
++int pic_mode;
++
++/*
++ * Intel MP BIOS table parsing routines:
++ */
++
++/*
++ * Checksum an MP configuration block.
++ */
++
++static int __init mpf_checksum(unsigned char *mp, int len)
++{
++ int sum = 0;
++
++ while (len--)
++ sum += *mp++;
++
++ return sum & 0xFF;
++}
++
++#ifdef CONFIG_X86_NUMAQ
++/*
++ * Have to match translation table entries to main table entries by counter
++ * hence the mpc_record variable .... can't see a less disgusting way of
++ * doing this ....
++ */
++
++static int mpc_record;
++static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY]
++ __cpuinitdata;
++#endif
++
++static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
++{
++#ifndef CONFIG_XEN
++ int apicid;
++ char *bootup_cpu = "";
++
++ if (!(m->mpc_cpuflag & CPU_ENABLED)) {
++ disabled_cpus++;
++ return;
++ }
++#ifdef CONFIG_X86_NUMAQ
++ apicid = mpc_apic_id(m, translation_table[mpc_record]);
++#else
++ apicid = m->mpc_apicid;
++#endif
++ if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
++ bootup_cpu = " (Bootup-CPU)";
++ boot_cpu_physical_apicid = m->mpc_apicid;
++ }
++
++ printk(KERN_INFO "Processor #%d%s\n", m->mpc_apicid, bootup_cpu);
++ generic_processor_info(apicid, m->mpc_apicver);
++#else /* CONFIG_XEN */
++ num_processors++;
++#endif
++}
++
++static void __init MP_bus_info(struct mpc_config_bus *m)
++{
++ char str[7];
++
++ memcpy(str, m->mpc_bustype, 6);
++ str[6] = 0;
++
++#ifdef CONFIG_X86_NUMAQ
++ mpc_oem_bus_info(m, str, translation_table[mpc_record]);
++#else
++ Dprintk("Bus #%d is %s\n", m->mpc_busid, str);
++#endif
++
++#if MAX_MP_BUSSES < 256
++ if (m->mpc_busid >= MAX_MP_BUSSES) {
++ printk(KERN_WARNING "MP table busid value (%d) for bustype %s "
++ " is too large, max. supported is %d\n",
++ m->mpc_busid, str, MAX_MP_BUSSES - 1);
++ return;
++ }
++#endif
++
++ if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA) - 1) == 0) {
++ set_bit(m->mpc_busid, mp_bus_not_pci);
++#if defined(CONFIG_EISA) || defined (CONFIG_MCA)
++ mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
++#endif
++ } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) {
++#ifdef CONFIG_X86_NUMAQ
++ mpc_oem_pci_bus(m, translation_table[mpc_record]);
++#endif
++ clear_bit(m->mpc_busid, mp_bus_not_pci);
++ mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
++ mp_current_pci_id++;
++#if defined(CONFIG_EISA) || defined (CONFIG_MCA)
++ mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
++ } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) {
++ mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA;
++ } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA) - 1) == 0) {
++ mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA;
++#endif
++ } else
++ printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
++}
++
++#ifdef CONFIG_X86_IO_APIC
++
++static int bad_ioapic(unsigned long address)
++{
++ if (nr_ioapics >= MAX_IO_APICS) {
++ printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
++ "(found %d)\n", MAX_IO_APICS, nr_ioapics);
++ panic("Recompile kernel with bigger MAX_IO_APICS!\n");
++ }
++ if (!address) {
++ printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
++ " found in table, skipping!\n");
++ return 1;
++ }
++ return 0;
++}
++
++static void __init MP_ioapic_info(struct mpc_config_ioapic *m)
++{
++ if (!(m->mpc_flags & MPC_APIC_USABLE))
++ return;
++
++ printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n",
++ m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
++
++ if (bad_ioapic(m->mpc_apicaddr))
++ return;
++
++ mp_ioapics[nr_ioapics] = *m;
++ nr_ioapics++;
++}
++
++static void __init MP_intsrc_info(struct mpc_config_intsrc *m)
++{
++ mp_irqs[mp_irq_entries] = *m;
++ Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
++ " IRQ %02x, APIC ID %x, APIC INT %02x\n",
++ m->mpc_irqtype, m->mpc_irqflag & 3,
++ (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
++ m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
++ if (++mp_irq_entries == MAX_IRQ_SOURCES)
++ panic("Max # of irq sources exceeded!!\n");
++}
++
++#endif
++
++static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m)
++{
++ Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
++ " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
++ m->mpc_irqtype, m->mpc_irqflag & 3,
++ (m->mpc_irqflag >> 2) & 3, m->mpc_srcbusid,
++ m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
++}
++
++#ifdef CONFIG_X86_NUMAQ
++static void __init MP_translation_info(struct mpc_config_translation *m)
++{
++ printk(KERN_INFO
++ "Translation: record %d, type %d, quad %d, global %d, local %d\n",
++ mpc_record, m->trans_type, m->trans_quad, m->trans_global,
++ m->trans_local);
++
++ if (mpc_record >= MAX_MPC_ENTRY)
++ printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
++ else
++ translation_table[mpc_record] = m; /* stash this for later */
++ if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
++ node_set_online(m->trans_quad);
++}
++
++/*
++ * Read/parse the MPC oem tables
++ */
++
++static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable,
++ unsigned short oemsize)
++{
++ int count = sizeof(*oemtable); /* the header size */
++ unsigned char *oemptr = ((unsigned char *)oemtable) + count;
++
++ mpc_record = 0;
++ printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n",
++ oemtable);
++ if (memcmp(oemtable->oem_signature, MPC_OEM_SIGNATURE, 4)) {
++ printk(KERN_WARNING
++ "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
++ oemtable->oem_signature[0], oemtable->oem_signature[1],
++ oemtable->oem_signature[2], oemtable->oem_signature[3]);
++ return;
++ }
++ if (mpf_checksum((unsigned char *)oemtable, oemtable->oem_length)) {
++ printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
++ return;
++ }
++ while (count < oemtable->oem_length) {
++ switch (*oemptr) {
++ case MP_TRANSLATION:
++ {
++ struct mpc_config_translation *m =
++ (struct mpc_config_translation *)oemptr;
++ MP_translation_info(m);
++ oemptr += sizeof(*m);
++ count += sizeof(*m);
++ ++mpc_record;
++ break;
++ }
++ default:
++ {
++ printk(KERN_WARNING
++ "Unrecognised OEM table entry type! - %d\n",
++ (int)*oemptr);
++ return;
++ }
++ }
++ }
++}
++
++static inline void mps_oem_check(struct mp_config_table *mpc, char *oem,
++ char *productid)
++{
++ if (strncmp(oem, "IBM NUMA", 8))
++ printk("Warning! May not be a NUMA-Q system!\n");
++ if (mpc->mpc_oemptr)
++ smp_read_mpc_oem((struct mp_config_oemtable *)mpc->mpc_oemptr,
++ mpc->mpc_oemsize);
++}
++#endif /* CONFIG_X86_NUMAQ */
++
++/*
++ * Read/parse the MPC
++ */
++
++static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
++{
++ char str[16];
++ char oem[10];
++ int count = sizeof(*mpc);
++ unsigned char *mpt = ((unsigned char *)mpc) + count;
++
++ if (memcmp(mpc->mpc_signature, MPC_SIGNATURE, 4)) {
++ printk(KERN_ERR "MPTABLE: bad signature [%c%c%c%c]!\n",
++ mpc->mpc_signature[0], mpc->mpc_signature[1],
++ mpc->mpc_signature[2], mpc->mpc_signature[3]);
++ return 0;
++ }
++ if (mpf_checksum((unsigned char *)mpc, mpc->mpc_length)) {
++ printk(KERN_ERR "MPTABLE: checksum error!\n");
++ return 0;
++ }
++ if (mpc->mpc_spec != 0x01 && mpc->mpc_spec != 0x04) {
++ printk(KERN_ERR "MPTABLE: bad table version (%d)!!\n",
++ mpc->mpc_spec);
++ return 0;
++ }
++ if (!mpc->mpc_lapic) {
++ printk(KERN_ERR "MPTABLE: null local APIC address!\n");
++ return 0;
++ }
++ memcpy(oem, mpc->mpc_oem, 8);
++ oem[8] = 0;
++ printk(KERN_INFO "MPTABLE: OEM ID: %s ", oem);
++
++ memcpy(str, mpc->mpc_productid, 12);
++ str[12] = 0;
++ printk("Product ID: %s ", str);
++
++#ifdef CONFIG_X86_32
++ mps_oem_check(mpc, oem, str);
++#endif
++ printk(KERN_INFO "MPTABLE: Product ID: %s ", str);
++
++ printk(KERN_INFO "MPTABLE: APIC at: 0x%X\n", mpc->mpc_lapic);
++
++ /* save the local APIC address, it might be non-default */
++ if (!acpi_lapic)
++ mp_lapic_addr = mpc->mpc_lapic;
++
++ if (early)
++ return 1;
++
++ /*
++ * Now process the configuration blocks.
++ */
++#ifdef CONFIG_X86_NUMAQ
++ mpc_record = 0;
++#endif
++ while (count < mpc->mpc_length) {
++ switch (*mpt) {
++ case MP_PROCESSOR:
++ {
++ struct mpc_config_processor *m =
++ (struct mpc_config_processor *)mpt;
++ /* ACPI may have already provided this data */
++ if (!acpi_lapic)
++ MP_processor_info(m);
++ mpt += sizeof(*m);
++ count += sizeof(*m);
++ break;
++ }
++ case MP_BUS:
++ {
++ struct mpc_config_bus *m =
++ (struct mpc_config_bus *)mpt;
++ MP_bus_info(m);
++ mpt += sizeof(*m);
++ count += sizeof(*m);
++ break;
++ }
++ case MP_IOAPIC:
++ {
++#ifdef CONFIG_X86_IO_APIC
++ struct mpc_config_ioapic *m =
++ (struct mpc_config_ioapic *)mpt;
++ MP_ioapic_info(m);
++#endif
++ mpt += sizeof(struct mpc_config_ioapic);
++ count += sizeof(struct mpc_config_ioapic);
++ break;
++ }
++ case MP_INTSRC:
++ {
++#ifdef CONFIG_X86_IO_APIC
++ struct mpc_config_intsrc *m =
++ (struct mpc_config_intsrc *)mpt;
++
++ MP_intsrc_info(m);
++#endif
++ mpt += sizeof(struct mpc_config_intsrc);
++ count += sizeof(struct mpc_config_intsrc);
++ break;
++ }
++ case MP_LINTSRC:
++ {
++ struct mpc_config_lintsrc *m =
++ (struct mpc_config_lintsrc *)mpt;
++ MP_lintsrc_info(m);
++ mpt += sizeof(*m);
++ count += sizeof(*m);
++ break;
++ }
++ default:
++ /* wrong mptable */
++ printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n");
++ printk(KERN_ERR "type %x\n", *mpt);
++ print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16,
++ 1, mpc, mpc->mpc_length, 1);
++ count = mpc->mpc_length;
++ break;
++ }
++#ifdef CONFIG_X86_NUMAQ
++ ++mpc_record;
++#endif
++ }
++ setup_apic_routing();
++ if (!num_processors)
++ printk(KERN_ERR "MPTABLE: no processors registered!\n");
++ return num_processors;
++}
++
++#ifdef CONFIG_X86_IO_APIC
++
++static int __init ELCR_trigger(unsigned int irq)
++{
++ unsigned int port;
++
++ port = 0x4d0 + (irq >> 3);
++ return (inb(port) >> (irq & 7)) & 1;
++}
++
++static void __init construct_default_ioirq_mptable(int mpc_default_type)
++{
++ struct mpc_config_intsrc intsrc;
++ int i;
++ int ELCR_fallback = 0;
++
++ intsrc.mpc_type = MP_INTSRC;
++ intsrc.mpc_irqflag = 0; /* conforming */
++ intsrc.mpc_srcbus = 0;
++ intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
++
++ intsrc.mpc_irqtype = mp_INT;
++
++ /*
++ * If true, we have an ISA/PCI system with no IRQ entries
++ * in the MP table. To prevent the PCI interrupts from being set up
++ * incorrectly, we try to use the ELCR. The sanity check to see if
++ * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can
++ * never be level sensitive, so we simply see if the ELCR agrees.
++ * If it does, we assume it's valid.
++ */
++ if (mpc_default_type == 5) {
++ printk(KERN_INFO "ISA/PCI bus type with no IRQ information... "
++ "falling back to ELCR\n");
++
++ if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) ||
++ ELCR_trigger(13))
++ printk(KERN_ERR "ELCR contains invalid data... "
++ "not using ELCR\n");
++ else {
++ printk(KERN_INFO
++ "Using ELCR to identify PCI interrupts\n");
++ ELCR_fallback = 1;
++ }
++ }
++
++ for (i = 0; i < 16; i++) {
++ switch (mpc_default_type) {
++ case 2:
++ if (i == 0 || i == 13)
++ continue; /* IRQ0 & IRQ13 not connected */
++ /* fall through */
++ default:
++ if (i == 2)
++ continue; /* IRQ2 is never connected */
++ }
++
++ if (ELCR_fallback) {
++ /*
++ * If the ELCR indicates a level-sensitive interrupt, we
++ * copy that information over to the MP table in the
++ * irqflag field (level sensitive, active high polarity).
++ */
++ if (ELCR_trigger(i))
++ intsrc.mpc_irqflag = 13;
++ else
++ intsrc.mpc_irqflag = 0;
++ }
++
++ intsrc.mpc_srcbusirq = i;
++ intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */
++ MP_intsrc_info(&intsrc);
++ }
++
++ intsrc.mpc_irqtype = mp_ExtINT;
++ intsrc.mpc_srcbusirq = 0;
++ intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */
++ MP_intsrc_info(&intsrc);
++}
++
++#endif
++
++static inline void __init construct_default_ISA_mptable(int mpc_default_type)
++{
++ struct mpc_config_processor processor;
++ struct mpc_config_bus bus;
++#ifdef CONFIG_X86_IO_APIC
++ struct mpc_config_ioapic ioapic;
++#endif
++ struct mpc_config_lintsrc lintsrc;
++ int linttypes[2] = { mp_ExtINT, mp_NMI };
++ int i;
++
++ /*
++ * local APIC has default address
++ */
++ mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
++
++ /*
++ * 2 CPUs, numbered 0 & 1.
++ */
++ processor.mpc_type = MP_PROCESSOR;
++ /* Either an integrated APIC or a discrete 82489DX. */
++ processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
++ processor.mpc_cpuflag = CPU_ENABLED;
++ processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
++ (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
++ processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
++ processor.mpc_reserved[0] = 0;
++ processor.mpc_reserved[1] = 0;
++ for (i = 0; i < 2; i++) {
++ processor.mpc_apicid = i;
++ MP_processor_info(&processor);
++ }
++
++ bus.mpc_type = MP_BUS;
++ bus.mpc_busid = 0;
++ switch (mpc_default_type) {
++ default:
++ printk(KERN_ERR "???\nUnknown standard configuration %d\n",
++ mpc_default_type);
++ /* fall through */
++ case 1:
++ case 5:
++ memcpy(bus.mpc_bustype, "ISA ", 6);
++ break;
++ case 2:
++ case 6:
++ case 3:
++ memcpy(bus.mpc_bustype, "EISA ", 6);
++ break;
++ case 4:
++ case 7:
++ memcpy(bus.mpc_bustype, "MCA ", 6);
++ }
++ MP_bus_info(&bus);
++ if (mpc_default_type > 4) {
++ bus.mpc_busid = 1;
++ memcpy(bus.mpc_bustype, "PCI ", 6);
++ MP_bus_info(&bus);
++ }
++
++#ifdef CONFIG_X86_IO_APIC
++ ioapic.mpc_type = MP_IOAPIC;
++ ioapic.mpc_apicid = 2;
++ ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
++ ioapic.mpc_flags = MPC_APIC_USABLE;
++ ioapic.mpc_apicaddr = 0xFEC00000;
++ MP_ioapic_info(&ioapic);
++
++ /*
++ * We set up most of the low 16 IO-APIC pins according to MPS rules.
++ */
++ construct_default_ioirq_mptable(mpc_default_type);
++#endif
++ lintsrc.mpc_type = MP_LINTSRC;
++ lintsrc.mpc_irqflag = 0; /* conforming */
++ lintsrc.mpc_srcbusid = 0;
++ lintsrc.mpc_srcbusirq = 0;
++ lintsrc.mpc_destapic = MP_APIC_ALL;
++ for (i = 0; i < 2; i++) {
++ lintsrc.mpc_irqtype = linttypes[i];
++ lintsrc.mpc_destapiclint = i;
++ MP_lintsrc_info(&lintsrc);
++ }
++}
++
++static struct intel_mp_floating *mpf_found;
++
+/*
-+ * Intel SMP support routines.
-+ *
-+ * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
-+ * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
-+ * (c) 2002,2003 Andi Kleen, SuSE Labs.
-+ *
-+ * i386 and x86_64 integration by Glauber Costa <gcosta@redhat.com>
-+ *
-+ * This code is released under the GNU General Public License version 2 or
-+ * later.
++ * Read the SMP configuration described by mpf_found.
+ */
++static void __init __get_smp_config(unsigned early)
++{
++ struct intel_mp_floating *mpf = mpf_found;
++
++ if (acpi_lapic && early)
++ return;
++ /*
++ * ACPI supports both logical (e.g. Hyper-Threading) and physical
++ * processors, where MPS only supports physical.
++ */
++ if (acpi_lapic && acpi_ioapic) {
++ printk(KERN_INFO "Using ACPI (MADT) for SMP configuration "
++ "information\n");
++ return;
++ } else if (acpi_lapic)
++ printk(KERN_INFO "Using ACPI for processor (LAPIC) "
++ "configuration information\n");
++
++ printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n",
++ mpf->mpf_specification);
++#ifdef CONFIG_X86_32
++ if (mpf->mpf_feature2 & (1 << 7)) {
++ printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
++ pic_mode = 1;
++ } else {
++ printk(KERN_INFO " Virtual Wire compatibility mode.\n");
++ pic_mode = 0;
++ }
++#endif
++ /*
++ * Now see if we need to read further.
++ */
++ if (mpf->mpf_feature1 != 0) {
++ if (early) {
++ /*
++ * local APIC has default address
++ */
++ mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
++ return;
++ }
++
++ printk(KERN_INFO "Default MP configuration #%d\n",
++ mpf->mpf_feature1);
++ construct_default_ISA_mptable(mpf->mpf_feature1);
++
++ } else if (mpf->mpf_physptr) {
++
++ /*
++ * Read the physical hardware table. Anything here will
++ * override the defaults.
++ */
++ if (!smp_read_mpc(isa_bus_to_virt(mpf->mpf_physptr), early)) {
++ smp_found_config = 0;
++ printk(KERN_ERR
++ "BIOS bug, MP table errors detected!...\n");
++ printk(KERN_ERR "... disabling SMP support. "
++ "(tell your hw vendor)\n");
++ return;
++ }
+
-+#include <linux/init.h>
++ if (early)
++ return;
++#ifdef CONFIG_X86_IO_APIC
++ /*
++ * If there are no explicit MP IRQ entries, then we are
++ * broken. We set up most of the low 16 IO-APIC pins to
++ * ISA defaults and hope it will work.
++ */
++ if (!mp_irq_entries) {
++ struct mpc_config_bus bus;
+
-+#include <linux/mm.h>
-+#include <linux/delay.h>
-+#include <linux/spinlock.h>
-+#include <linux/kernel_stat.h>
-+#include <linux/mc146818rtc.h>
-+#include <linux/cache.h>
-+#include <linux/interrupt.h>
-+#include <linux/cpu.h>
++ printk(KERN_ERR "BIOS bug, no explicit IRQ entries, "
++ "using default mptable. "
++ "(tell your hw vendor)\n");
+
-+#include <asm/mtrr.h>
-+#include <asm/tlbflush.h>
-+#include <asm/mmu_context.h>
-+#include <asm/proto.h>
-+#include <mach_ipi.h>
-+#include <xen/evtchn.h>
-+/*
-+ * Some notes on x86 processor bugs affecting SMP operation:
-+ *
-+ * Pentium, Pentium Pro, II, III (and all CPUs) have bugs.
-+ * The Linux implications for SMP are handled as follows:
-+ *
-+ * Pentium III / [Xeon]
-+ * None of the E1AP-E3AP errata are visible to the user.
-+ *
-+ * E1AP. see PII A1AP
-+ * E2AP. see PII A2AP
-+ * E3AP. see PII A3AP
-+ *
-+ * Pentium II / [Xeon]
-+ * None of the A1AP-A3AP errata are visible to the user.
-+ *
-+ * A1AP. see PPro 1AP
-+ * A2AP. see PPro 2AP
-+ * A3AP. see PPro 7AP
-+ *
-+ * Pentium Pro
-+ * None of 1AP-9AP errata are visible to the normal user,
-+ * except occasional delivery of 'spurious interrupt' as trap #15.
-+ * This is very rare and a non-problem.
-+ *
-+ * 1AP. Linux maps APIC as non-cacheable
-+ * 2AP. worked around in hardware
-+ * 3AP. fixed in C0 and above steppings microcode update.
-+ * Linux does not use excessive STARTUP_IPIs.
-+ * 4AP. worked around in hardware
-+ * 5AP. symmetric IO mode (normal Linux operation) not affected.
-+ * 'noapic' mode has vector 0xf filled out properly.
-+ * 6AP. 'noapic' mode might be affected - fixed in later steppings
-+ * 7AP. We do not assume writes to the LVT deassering IRQs
-+ * 8AP. We do not enable low power mode (deep sleep) during MP bootup
-+ * 9AP. We do not use mixed mode
-+ *
-+ * Pentium
-+ * There is a marginal case where REP MOVS on 100MHz SMP
-+ * machines with B stepping processors can fail. XXX should provide
-+ * an L1cache=Writethrough or L1cache=off option.
-+ *
-+ * B stepping CPUs may hang. There are hardware work arounds
-+ * for this. We warn about it in case your board doesn't have the work
-+ * arounds. Basically that's so I can tell anyone with a B stepping
-+ * CPU and SMP problems "tough".
-+ *
-+ * Specific items [From Pentium Processor Specification Update]
-+ *
-+ * 1AP. Linux doesn't use remote read
-+ * 2AP. Linux doesn't trust APIC errors
-+ * 3AP. We work around this
-+ * 4AP. Linux never generated 3 interrupts of the same priority
-+ * to cause a lost local interrupt.
-+ * 5AP. Remote read is never used
-+ * 6AP. not affected - worked around in hardware
-+ * 7AP. not affected - worked around in hardware
-+ * 8AP. worked around in hardware - we get explicit CS errors if not
-+ * 9AP. only 'noapic' mode affected. Might generate spurious
-+ * interrupts, we log only the first one and count the
-+ * rest silently.
-+ * 10AP. not affected - worked around in hardware
-+ * 11AP. Linux reads the APIC between writes to avoid this, as per
-+ * the documentation. Make sure you preserve this as it affects
-+ * the C stepping chips too.
-+ * 12AP. not affected - worked around in hardware
-+ * 13AP. not affected - worked around in hardware
-+ * 14AP. we always deassert INIT during bootup
-+ * 15AP. not affected - worked around in hardware
-+ * 16AP. not affected - worked around in hardware
-+ * 17AP. not affected - worked around in hardware
-+ * 18AP. not affected - worked around in hardware
-+ * 19AP. not affected - worked around in BIOS
-+ *
-+ * If this sounds worrying believe me these bugs are either ___RARE___,
-+ * or are signal timing bugs worked around in hardware and there's
-+ * about nothing of note with C stepping upwards.
-+ */
++ bus.mpc_type = MP_BUS;
++ bus.mpc_busid = 0;
++ memcpy(bus.mpc_bustype, "ISA ", 6);
++ MP_bus_info(&bus);
+
-+/*
-+ * this function sends a 'reschedule' IPI to another CPU.
-+ * it goes straight through and wastes no time serializing
-+ * anything. Worst case is that we lose a reschedule ...
-+ */
-+void xen_smp_send_reschedule(int cpu)
++ construct_default_ioirq_mptable(0);
++ }
++#endif
++ } else
++ BUG();
++
++ if (!early)
++ printk(KERN_INFO "Processors: %d\n", num_processors);
++ /*
++ * Only use the first configuration found.
++ */
++}
++
++void __init early_get_smp_config(void)
++{
++ __get_smp_config(1);
++}
++
++void __init get_smp_config(void)
++{
++ __get_smp_config(0);
++}
++
++static int __init smp_scan_config(unsigned long base, unsigned long length,
++ unsigned reserve)
++{
++ unsigned int *bp = isa_bus_to_virt(base);
++ struct intel_mp_floating *mpf;
++
++ Dprintk("Scan SMP from %p for %ld bytes.\n", bp, length);
++ BUILD_BUG_ON(sizeof(*mpf) != 16);
++
++ while (length > 0) {
++ mpf = (struct intel_mp_floating *)bp;
++ if ((*bp == SMP_MAGIC_IDENT) &&
++ (mpf->mpf_length == 1) &&
++ !mpf_checksum((unsigned char *)bp, 16) &&
++ ((mpf->mpf_specification == 1)
++ || (mpf->mpf_specification == 4))) {
++
++ smp_found_config = 1;
++ mpf_found = mpf;
++#ifdef CONFIG_X86_32
++#ifndef CONFIG_XEN
++ printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
++ mpf, virt_to_phys(mpf));
++ reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE,
++ BOOTMEM_DEFAULT);
++ if (mpf->mpf_physptr) {
++ /*
++ * We cannot access the MPC table to compute
++ * its size yet, as only a few megabytes from
++ * the bottom are mapped now.
++ * PC-9800's MPC table sits at the very end
++ * of physical memory, so simply reserving
++ * PAGE_SIZE from mpf->mpf_physptr yields BUG()
++ * in reserve_bootmem.
++ */
++ unsigned long size = PAGE_SIZE;
++ unsigned long end = max_low_pfn * PAGE_SIZE;
++ if (mpf->mpf_physptr + size > end)
++ size = end - mpf->mpf_physptr;
++ reserve_bootmem(mpf->mpf_physptr, size,
++ BOOTMEM_DEFAULT);
++ }
++#else
++ printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
++ mpf, ((void *)bp - isa_bus_to_virt(base)) + base);
++#endif
++#elif !defined(CONFIG_XEN)
++ if (!reserve)
++ return 1;
++
++ reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE);
++ if (mpf->mpf_physptr)
++ reserve_bootmem_generic(mpf->mpf_physptr,
++ PAGE_SIZE);
++#endif
++ return 1;
++ }
++ bp += 4;
++ length -= 16;
++ }
++ return 0;
++}
++
++static void __init __find_smp_config(unsigned reserve)
++{
++#ifndef CONFIG_XEN
++ unsigned int address;
++#endif
++
++ /*
++ * FIXME: Linux assumes you have 640K of base ram..
++ * this continues the error...
++ *
++ * 1) Scan the bottom 1K for a signature
++ * 2) Scan the top 1K of base RAM
++ * 3) Scan the 64K of bios
++ */
++ if (smp_scan_config(0x0, 0x400, reserve) ||
++ smp_scan_config(639 * 0x400, 0x400, reserve) ||
++ smp_scan_config(0xF0000, 0x10000, reserve))
++ return;
++ /*
++ * If it is an SMP machine we should know now, unless the
++ * configuration is in an EISA/MCA bus machine with an
++ * extended bios data area.
++ *
++ * there is a real-mode segmented pointer pointing to the
++ * 4K EBDA area at 0x40E, calculate and scan it here.
++ *
++ * NOTE! There are Linux loaders that will corrupt the EBDA
++ * area, and as such this kind of SMP config may be less
++ * trustworthy, simply because the SMP table may have been
++ * stomped on during early boot. These loaders are buggy and
++ * should be fixed.
++ *
++ * MP1.4 SPEC states to only scan first 1K of 4K EBDA.
++ */
++
++#ifndef CONFIG_XEN
++ address = get_bios_ebda();
++ if (address)
++ smp_scan_config(address, 0x400, reserve);
++#endif
++}
++
++void __init early_find_smp_config(void)
+{
-+ if (unlikely(cpu_is_offline(cpu))) {
-+ WARN_ON(1);
-+ return;
-+ }
-+ send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
++ __find_smp_config(0);
++}
++
++void __init find_smp_config(void)
++{
++ __find_smp_config(1);
+}
+
++/* --------------------------------------------------------------------------
++ ACPI-based MP Configuration
++ -------------------------------------------------------------------------- */
++
+/*
-+ * Structure and data for smp_call_function(). This is designed to minimise
-+ * static memory requirements. It also looks cleaner.
++ * Keep this outside and initialized to 0, for !CONFIG_ACPI builds:
+ */
-+static DEFINE_SPINLOCK(call_lock);
++int es7000_plat;
+
-+struct call_data_struct {
-+ void (*func) (void *info);
-+ void *info;
-+ atomic_t started;
-+ atomic_t finished;
-+ int wait;
-+};
++#ifdef CONFIG_ACPI
+
-+void lock_ipi_call_lock(void)
++#ifdef CONFIG_X86_IO_APIC
++
++#define MP_ISA_BUS 0
++
++extern struct mp_ioapic_routing mp_ioapic_routing[MAX_IO_APICS];
++
++static int mp_find_ioapic(int gsi)
+{
-+ spin_lock_irq(&call_lock);
++ int i = 0;
++
++ /* Find the IOAPIC that manages this GSI. */
++ for (i = 0; i < nr_ioapics; i++) {
++ if ((gsi >= mp_ioapic_routing[i].gsi_base)
++ && (gsi <= mp_ioapic_routing[i].gsi_end))
++ return i;
++ }
++
++ printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
++ return -1;
+}
+
-+void unlock_ipi_call_lock(void)
++static u8 __init uniq_ioapic_id(u8 id)
+{
-+ spin_unlock_irq(&call_lock);
++#ifdef CONFIG_X86_32
++ if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
++ !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
++ return io_apic_get_unique_id(nr_ioapics, id);
++ else
++ return id;
++#else
++ int i;
++ DECLARE_BITMAP(used, 256);
++ bitmap_zero(used, 256);
++ for (i = 0; i < nr_ioapics; i++) {
++ struct mpc_config_ioapic *ia = &mp_ioapics[i];
++ __set_bit(ia->mpc_apicid, used);
++ }
++ if (!test_bit(id, used))
++ return id;
++ return find_first_zero_bit(used, 256);
++#endif
+}
+
-+static struct call_data_struct *call_data;
-+
-+static void __smp_call_function(void (*func) (void *info), void *info,
-+ int nonatomic, int wait)
++void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
+{
-+ struct call_data_struct data;
-+ int cpus = num_online_cpus() - 1;
++ int idx = 0;
+
-+ if (!cpus)
++ if (bad_ioapic(address))
+ return;
+
-+ data.func = func;
-+ data.info = info;
-+ atomic_set(&data.started, 0);
-+ data.wait = wait;
-+ if (wait)
-+ atomic_set(&data.finished, 0);
++ idx = nr_ioapics;
+
-+ call_data = &data;
-+ mb();
++ mp_ioapics[idx].mpc_type = MP_IOAPIC;
++ mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
++ mp_ioapics[idx].mpc_apicaddr = address;
+
-+ /* Send a message to all other CPUs and wait for them to respond */
-+ send_IPI_allbutself(CALL_FUNCTION_VECTOR);
++#ifndef CONFIG_XEN
++ set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
++#endif
++ mp_ioapics[idx].mpc_apicid = uniq_ioapic_id(id);
++#ifdef CONFIG_X86_32
++ mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
++#else
++ mp_ioapics[idx].mpc_apicver = 0;
++#endif
++ /*
++ * Build basic GSI lookup table to facilitate gsi->io_apic lookups
++ * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
++ */
++ mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
++ mp_ioapic_routing[idx].gsi_base = gsi_base;
++ mp_ioapic_routing[idx].gsi_end = gsi_base +
++ io_apic_get_redir_entries(idx);
+
-+ /* Wait for response */
-+ while (atomic_read(&data.started) != cpus)
-+ cpu_relax();
++ printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
++ "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
++ mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
++ mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end);
+
-+ if (wait)
-+ while (atomic_read(&data.finished) != cpus)
-+ cpu_relax();
++ nr_ioapics++;
+}
+
-+
-+/**
-+ * smp_call_function_mask(): Run a function on a set of other CPUs.
-+ * @mask: The set of cpus to run on. Must not include the current cpu.
-+ * @func: The function to run. This must be fast and non-blocking.
-+ * @info: An arbitrary pointer to pass to the function.
-+ * @wait: If true, wait (atomically) until function has completed on other CPUs.
-+ *
-+ * Returns 0 on success, else a negative status code.
-+ *
-+ * If @wait is true, then returns once @func has returned; otherwise
-+ * it returns just before the target cpu calls @func.
-+ *
-+ * You must not call this function with disabled interrupts or from a
-+ * hardware interrupt handler or from a bottom half handler.
-+ */
-+int
-+xen_smp_call_function_mask(cpumask_t mask,
-+ void (*func)(void *), void *info,
-+ int wait)
++void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
+{
-+ struct call_data_struct data;
-+ cpumask_t allbutself;
-+ int cpus;
++ struct mpc_config_intsrc intsrc;
++ int ioapic = -1;
++ int pin = -1;
+
-+ /* Can deadlock when called with interrupts disabled */
-+ WARN_ON(irqs_disabled());
++ /*
++ * Convert 'gsi' to 'ioapic.pin'.
++ */
++ ioapic = mp_find_ioapic(gsi);
++ if (ioapic < 0)
++ return;
++ pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
+
-+ /* Holding any lock stops cpus from going down. */
-+ spin_lock(&call_lock);
++ /*
++ * TBD: This check is for faulty timer entries, where the override
++ * erroneously sets the trigger to level, resulting in a HUGE
++ * increase of timer interrupts!
++ */
++ if ((bus_irq == 0) && (trigger == 3))
++ trigger = 1;
+
-+ allbutself = cpu_online_map;
-+ cpu_clear(smp_processor_id(), allbutself);
++ intsrc.mpc_type = MP_INTSRC;
++ intsrc.mpc_irqtype = mp_INT;
++ intsrc.mpc_irqflag = (trigger << 2) | polarity;
++ intsrc.mpc_srcbus = MP_ISA_BUS;
++ intsrc.mpc_srcbusirq = bus_irq; /* IRQ */
++ intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */
++ intsrc.mpc_dstirq = pin; /* INTIN# */
+
-+ cpus_and(mask, mask, allbutself);
-+ cpus = cpus_weight(mask);
++ MP_intsrc_info(&intsrc);
++}
+
-+ if (!cpus) {
-+ spin_unlock(&call_lock);
-+ return 0;
-+ }
++void __init mp_config_acpi_legacy_irqs(void)
++{
++ struct mpc_config_intsrc intsrc;
++ int i = 0;
++ int ioapic = -1;
+
-+ data.func = func;
-+ data.info = info;
-+ atomic_set(&data.started, 0);
-+ data.wait = wait;
-+ if (wait)
-+ atomic_set(&data.finished, 0);
++#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
++ /*
++ * Fabricate the legacy ISA bus (bus #0, MP_ISA_BUS).
++ */
++ mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
++#endif
++ set_bit(MP_ISA_BUS, mp_bus_not_pci);
++ Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
+
-+ call_data = &data;
-+ wmb();
++ /*
++ * Older generations of ES7000 have no legacy identity mappings
++ */
++ if (es7000_plat == 1)
++ return;
+
-+ /* Send a message to other CPUs */
-+ if (cpus_equal(mask, allbutself) &&
-+ cpus_equal(cpu_online_map, cpu_callout_map))
-+ send_IPI_allbutself(CALL_FUNCTION_VECTOR);
-+ else
-+ send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
++ /*
++ * Locate the IOAPIC that manages the ISA IRQs (0-15).
++ */
++ ioapic = mp_find_ioapic(0);
++ if (ioapic < 0)
++ return;
+
-+ /* Wait for response */
-+ while (atomic_read(&data.started) != cpus)
-+ cpu_relax();
++ intsrc.mpc_type = MP_INTSRC;
++ intsrc.mpc_irqflag = 0; /* Conforming */
++ intsrc.mpc_srcbus = MP_ISA_BUS;
++#ifdef CONFIG_X86_IO_APIC
++ intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
++#endif
++ /*
++ * Use the default configuration for the IRQs 0-15. Unless
++ * overridden by (MADT) interrupt source override entries.
++ */
++ for (i = 0; i < 16; i++) {
++ int idx;
+
-+ if (wait)
-+ while (atomic_read(&data.finished) != cpus)
-+ cpu_relax();
-+ spin_unlock(&call_lock);
++ for (idx = 0; idx < mp_irq_entries; idx++) {
++ struct mpc_config_intsrc *irq = mp_irqs + idx;
++
++ /* Do we already have a mapping for this ISA IRQ? */
++ if (irq->mpc_srcbus == MP_ISA_BUS
++ && irq->mpc_srcbusirq == i)
++ break;
++
++ /* Do we already have a mapping for this IOAPIC pin */
++ if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
++ (irq->mpc_dstirq == i))
++ break;
++ }
++
++ if (idx != mp_irq_entries) {
++ printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
++ continue; /* IRQ already used */
++ }
+
-+ return 0;
++ intsrc.mpc_irqtype = mp_INT;
++ intsrc.mpc_srcbusirq = i; /* Identity mapped */
++ intsrc.mpc_dstirq = i;
++
++ MP_intsrc_info(&intsrc);
++ }
+}
+
-+static void stop_this_cpu(void *dummy)
++int mp_register_gsi(u32 gsi, int triggering, int polarity)
+{
-+ local_irq_disable();
++ int ioapic;
++ int ioapic_pin;
++#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
++#define MAX_GSI_NUM 4096
++#define IRQ_COMPRESSION_START 64
++
++ static int pci_irq = IRQ_COMPRESSION_START;
+ /*
-+ * Remove this CPU:
++ * Mapping between Global System Interrupts, which
++ * represent all possible interrupts, and IRQs
++ * assigned to actual devices.
+ */
-+ cpu_clear(smp_processor_id(), cpu_online_map);
-+ disable_all_local_evtchn();
-+ if (hlt_works(smp_processor_id()))
-+ for (;;) halt();
-+ for (;;);
-+}
++ static int gsi_to_irq[MAX_GSI_NUM];
++#else
+
-+/*
-+ * this function calls the 'stop' function on all other CPUs in the system.
-+ */
++ if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
++ return gsi;
++#endif
+
-+void xen_smp_send_stop(void)
-+{
-+ int nolock;
-+ unsigned long flags;
++ /* Don't set up the ACPI SCI because it's already set up */
++ if (acpi_gbl_FADT.sci_interrupt == gsi)
++ return gsi;
+
-+ /* Don't deadlock on the call lock in panic */
-+ nolock = !spin_trylock(&call_lock);
-+ local_irq_save(flags);
-+ __smp_call_function(stop_this_cpu, NULL, 0, 0);
-+ if (!nolock)
-+ spin_unlock(&call_lock);
-+ disable_all_local_evtchn();
-+ local_irq_restore(flags);
-+}
++ ioapic = mp_find_ioapic(gsi);
++ if (ioapic < 0) {
++ printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
++ return gsi;
++ }
+
-+/*
-+ * Reschedule call back. Nothing to do,
-+ * all the work is done automatically when
-+ * we return from the interrupt.
-+ */
-+irqreturn_t smp_reschedule_interrupt(int irq, void *dev_id)
-+{
-+#ifdef CONFIG_X86_32
-+ __get_cpu_var(irq_stat).irq_resched_count++;
-+#else
-+ add_pda(irq_resched_count, 1);
-+#endif
-+ return IRQ_HANDLED;
-+}
++ ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
+
-+irqreturn_t smp_call_function_interrupt(int irq, void *dev_id)
-+{
-+ void (*func) (void *info) = call_data->func;
-+ void *info = call_data->info;
-+ int wait = call_data->wait;
++#ifndef CONFIG_X86_32
++ if (ioapic_renumber_irq)
++ gsi = ioapic_renumber_irq(ioapic, gsi);
++#endif
+
+ /*
-+ * Notify initiating CPU that I've grabbed the data and am
-+ * about to execute the function
-+ */
-+ mb();
-+ atomic_inc(&call_data->started);
-+ /*
-+ * At this point the info structure may be out of scope unless wait==1
++ * Avoid pin reprogramming. PRTs typically include entries
++ * with redundant pin->gsi mappings (but unique PCI devices);
++ * we only program the IOAPIC on the first.
+ */
-+ irq_enter();
-+ (*func)(info);
-+#ifdef CONFIG_X86_32
-+ __get_cpu_var(irq_stat).irq_call_count++;
++ if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
++ printk(KERN_ERR "Invalid reference to IOAPIC pin "
++ "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
++ ioapic_pin);
++ return gsi;
++ }
++ if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) {
++ Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
++ mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
++#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
++ return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
+#else
-+ add_pda(irq_call_count, 1);
++ return gsi;
+#endif
-+ irq_exit();
-+
-+ if (wait) {
-+ mb();
-+ atomic_inc(&call_data->finished);
+ }
+
-+ return IRQ_HANDLED;
++ set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed);
++#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
++ /*
++ * For GSI >= 64, use IRQ compression
++ */
++ if ((gsi >= IRQ_COMPRESSION_START)
++ && (triggering == ACPI_LEVEL_SENSITIVE)) {
++ /*
++ * For PCI devices assign IRQs in order, avoiding gaps
++ * due to unused I/O APIC pins.
++ */
++ int irq = gsi;
++ if (gsi < MAX_GSI_NUM) {
++ /*
++ * Retain the VIA chipset work-around (gsi > 15), but
++ * avoid a problem where the 8254 timer (IRQ0) is setup
++ * via an override (so it's not on pin 0 of the ioapic),
++ * and at the same time, the pin 0 interrupt is a PCI
++ * type. The gsi > 15 test could cause these two pins
++ * to be shared as IRQ0, and they are not shareable.
++ * So test for this condition, and if necessary, avoid
++ * the pin collision.
++ */
++ gsi = pci_irq++;
++ /*
++ * Don't assign IRQ used by ACPI SCI
++ */
++ if (gsi == acpi_gbl_FADT.sci_interrupt)
++ gsi = pci_irq++;
++ gsi_to_irq[irq] = gsi;
++ } else {
++ printk(KERN_ERR "GSI %u is too high\n", gsi);
++ return gsi;
++ }
++ }
++#endif
++ io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
++ triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
++ polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
++ return gsi;
+}
---- a/arch/x86/kernel/time_32-xen.c
-+++ b/arch/x86/kernel/time_32-xen.c
-@@ -701,8 +701,6 @@ int xen_update_persistent_clock(void)
- return 0;
- }
-
--extern void (*late_time_init)(void);
++
++#endif /* CONFIG_X86_IO_APIC */
++#endif /* CONFIG_ACPI */
+--- sle11-2009-05-14.orig/arch/x86/kernel/mpparse_32-xen.c 2009-03-16 16:33:40.000000000 +0100
++++ /dev/null 1970-01-01 00:00:00.000000000 +0000
+@@ -1,1161 +0,0 @@
+-/*
+- * Intel Multiprocessor Specification 1.1 and 1.4
+- * compliant MP-table parsing routines.
+- *
+- * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
+- * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
+- *
+- * Fixes
+- * Erich Boleyn : MP v1.4 and additional changes.
+- * Alan Cox : Added EBDA scanning
+- * Ingo Molnar : various cleanups and rewrites
+- * Maciej W. Rozycki: Bits for default MP configurations
+- * Paul Diefenbaugh: Added full ACPI support
+- */
+-
+-#include <linux/mm.h>
+-#include <linux/init.h>
+-#include <linux/acpi.h>
+-#include <linux/delay.h>
+-#include <linux/bootmem.h>
+-#include <linux/kernel_stat.h>
+-#include <linux/mc146818rtc.h>
+-#include <linux/bitops.h>
+-
+-#include <asm/smp.h>
+-#include <asm/acpi.h>
+-#include <asm/mtrr.h>
+-#include <asm/mpspec.h>
+-#include <asm/io_apic.h>
+-
+-#include <mach_apic.h>
+-#include <mach_apicdef.h>
+-#include <mach_mpparse.h>
+-#include <bios_ebda.h>
+-
+-/* Have we found an MP table */
+-int smp_found_config;
+-unsigned int __cpuinitdata maxcpus = NR_CPUS;
+-
+-/*
+- * Various Linux-internal data structures created from the
+- * MP-table.
+- */
+-int apic_version [MAX_APICS];
+-int mp_bus_id_to_type [MAX_MP_BUSSES];
+-int mp_bus_id_to_node [MAX_MP_BUSSES];
+-int mp_bus_id_to_local [MAX_MP_BUSSES];
+-int quad_local_to_mp_bus_id [NR_CPUS/4][4];
+-int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
+-static int mp_current_pci_id;
+-
+-/* I/O APIC entries */
+-struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
+-
+-/* # of MP IRQ source entries */
+-struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
+-
+-/* MP IRQ source entries */
+-int mp_irq_entries;
+-
+-int nr_ioapics;
+-
+-int pic_mode;
+-unsigned long mp_lapic_addr;
+-
+-unsigned int def_to_bigsmp = 0;
+-
+-/* Processor that is doing the boot up */
+-unsigned int boot_cpu_physical_apicid = -1U;
+-/* Internal processor count */
+-unsigned int num_processors;
+-
+-/* Bitmask of physically existing CPUs */
+-physid_mask_t phys_cpu_present_map;
+-
+-u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
+-
+-/*
+- * Intel MP BIOS table parsing routines:
+- */
+-
+-
+-/*
+- * Checksum an MP configuration block.
+- */
+-
+-static int __init mpf_checksum(unsigned char *mp, int len)
+-{
+- int sum = 0;
+-
+- while (len--)
+- sum += *mp++;
+-
+- return sum & 0xFF;
+-}
+-
+-/*
+- * Have to match translation table entries to main table entries by counter
+- * hence the mpc_record variable .... can't see a less disgusting way of
+- * doing this ....
+- */
+-
+-static int mpc_record;
+-static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY] __cpuinitdata;
+-
+-#ifndef CONFIG_XEN
+-static void __cpuinit MP_processor_info (struct mpc_config_processor *m)
+-{
+- int ver, apicid;
+- physid_mask_t phys_cpu;
+-
+- if (!(m->mpc_cpuflag & CPU_ENABLED))
+- return;
+-
+- apicid = mpc_apic_id(m, translation_table[mpc_record]);
+-
+- if (m->mpc_featureflag&(1<<0))
+- Dprintk(" Floating point unit present.\n");
+- if (m->mpc_featureflag&(1<<7))
+- Dprintk(" Machine Exception supported.\n");
+- if (m->mpc_featureflag&(1<<8))
+- Dprintk(" 64 bit compare & exchange supported.\n");
+- if (m->mpc_featureflag&(1<<9))
+- Dprintk(" Internal APIC present.\n");
+- if (m->mpc_featureflag&(1<<11))
+- Dprintk(" SEP present.\n");
+- if (m->mpc_featureflag&(1<<12))
+- Dprintk(" MTRR present.\n");
+- if (m->mpc_featureflag&(1<<13))
+- Dprintk(" PGE present.\n");
+- if (m->mpc_featureflag&(1<<14))
+- Dprintk(" MCA present.\n");
+- if (m->mpc_featureflag&(1<<15))
+- Dprintk(" CMOV present.\n");
+- if (m->mpc_featureflag&(1<<16))
+- Dprintk(" PAT present.\n");
+- if (m->mpc_featureflag&(1<<17))
+- Dprintk(" PSE present.\n");
+- if (m->mpc_featureflag&(1<<18))
+- Dprintk(" PSN present.\n");
+- if (m->mpc_featureflag&(1<<19))
+- Dprintk(" Cache Line Flush Instruction present.\n");
+- /* 20 Reserved */
+- if (m->mpc_featureflag&(1<<21))
+- Dprintk(" Debug Trace and EMON Store present.\n");
+- if (m->mpc_featureflag&(1<<22))
+- Dprintk(" ACPI Thermal Throttle Registers present.\n");
+- if (m->mpc_featureflag&(1<<23))
+- Dprintk(" MMX present.\n");
+- if (m->mpc_featureflag&(1<<24))
+- Dprintk(" FXSR present.\n");
+- if (m->mpc_featureflag&(1<<25))
+- Dprintk(" XMM present.\n");
+- if (m->mpc_featureflag&(1<<26))
+- Dprintk(" Willamette New Instructions present.\n");
+- if (m->mpc_featureflag&(1<<27))
+- Dprintk(" Self Snoop present.\n");
+- if (m->mpc_featureflag&(1<<28))
+- Dprintk(" HT present.\n");
+- if (m->mpc_featureflag&(1<<29))
+- Dprintk(" Thermal Monitor present.\n");
+- /* 30, 31 Reserved */
+-
+-
+- if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
+- Dprintk(" Bootup CPU\n");
+- boot_cpu_physical_apicid = m->mpc_apicid;
+- }
+-
+- ver = m->mpc_apicver;
+-
+- /*
+- * Validate version
+- */
+- if (ver == 0x0) {
+- printk(KERN_WARNING "BIOS bug, APIC version is 0 for CPU#%d! "
+- "fixing up to 0x10. (tell your hw vendor)\n",
+- m->mpc_apicid);
+- ver = 0x10;
+- }
+- apic_version[m->mpc_apicid] = ver;
+-
+- phys_cpu = apicid_to_cpu_present(apicid);
+- physids_or(phys_cpu_present_map, phys_cpu_present_map, phys_cpu);
+-
+- if (num_processors >= NR_CPUS) {
+- printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
+- " Processor ignored.\n", NR_CPUS);
+- return;
+- }
+-
+- if (num_processors >= maxcpus) {
+- printk(KERN_WARNING "WARNING: maxcpus limit of %i reached."
+- " Processor ignored.\n", maxcpus);
+- return;
+- }
+-
+- cpu_set(num_processors, cpu_possible_map);
+- num_processors++;
+-
+- /*
+- * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y
+- * but we need to work other dependencies like SMP_SUSPEND etc
+- * before this can be done without some confusion.
+- * if (CPU_HOTPLUG_ENABLED || num_processors > 8)
+- * - Ashok Raj <ashok.raj@intel.com>
+- */
+- if (num_processors > 8) {
+- switch (boot_cpu_data.x86_vendor) {
+- case X86_VENDOR_INTEL:
+- if (!APIC_XAPIC(ver)) {
+- def_to_bigsmp = 0;
+- break;
+- }
+- /* If P4 and above fall through */
+- case X86_VENDOR_AMD:
+- def_to_bigsmp = 1;
+- }
+- }
+- bios_cpu_apicid[num_processors - 1] = m->mpc_apicid;
+-}
+-#else
+-static void __cpuinit MP_processor_info (struct mpc_config_processor *m)
+-{
+- num_processors++;
+-}
+-#endif /* CONFIG_XEN */
+-
+-static void __init MP_bus_info (struct mpc_config_bus *m)
+-{
+- char str[7];
+-
+- memcpy(str, m->mpc_bustype, 6);
+- str[6] = 0;
+-
+- mpc_oem_bus_info(m, str, translation_table[mpc_record]);
+-
+-#if MAX_MP_BUSSES < 256
+- if (m->mpc_busid >= MAX_MP_BUSSES) {
+- printk(KERN_WARNING "MP table busid value (%d) for bustype %s "
+- " is too large, max. supported is %d\n",
+- m->mpc_busid, str, MAX_MP_BUSSES - 1);
+- return;
+- }
+-#endif
+-
+- if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA)-1) == 0) {
+- mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
+- } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA)-1) == 0) {
+- mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA;
+- } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI)-1) == 0) {
+- mpc_oem_pci_bus(m, translation_table[mpc_record]);
+- mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
+- mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
+- mp_current_pci_id++;
+- } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA)-1) == 0) {
+- mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA;
+- } else {
+- printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
+- }
+-}
+-
+-static void __init MP_ioapic_info (struct mpc_config_ioapic *m)
+-{
+- if (!(m->mpc_flags & MPC_APIC_USABLE))
+- return;
+-
+- printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n",
+- m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
+- if (nr_ioapics >= MAX_IO_APICS) {
+- printk(KERN_CRIT "Max # of I/O APICs (%d) exceeded (found %d).\n",
+- MAX_IO_APICS, nr_ioapics);
+- panic("Recompile kernel with bigger MAX_IO_APICS!.\n");
+- }
+- if (!m->mpc_apicaddr) {
+- printk(KERN_ERR "WARNING: bogus zero I/O APIC address"
+- " found in MP table, skipping!\n");
+- return;
+- }
+- mp_ioapics[nr_ioapics] = *m;
+- nr_ioapics++;
+-}
+-
+-static void __init MP_intsrc_info (struct mpc_config_intsrc *m)
+-{
+- mp_irqs [mp_irq_entries] = *m;
+- Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
+- " IRQ %02x, APIC ID %x, APIC INT %02x\n",
+- m->mpc_irqtype, m->mpc_irqflag & 3,
+- (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
+- m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
+- if (++mp_irq_entries == MAX_IRQ_SOURCES)
+- panic("Max # of irq sources exceeded!!\n");
+-}
+-
+-static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m)
+-{
+- Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
+- " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
+- m->mpc_irqtype, m->mpc_irqflag & 3,
+- (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid,
+- m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
+-}
+-
+-#ifdef CONFIG_X86_NUMAQ
+-static void __init MP_translation_info (struct mpc_config_translation *m)
+-{
+- printk(KERN_INFO "Translation: record %d, type %d, quad %d, global %d, local %d\n", mpc_record, m->trans_type, m->trans_quad, m->trans_global, m->trans_local);
+-
+- if (mpc_record >= MAX_MPC_ENTRY)
+- printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
+- else
+- translation_table[mpc_record] = m; /* stash this for later */
+- if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
+- node_set_online(m->trans_quad);
+-}
+-
+-/*
+- * Read/parse the MPC oem tables
+- */
+-
+-static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable, \
+- unsigned short oemsize)
+-{
+- int count = sizeof (*oemtable); /* the header size */
+- unsigned char *oemptr = ((unsigned char *)oemtable)+count;
+-
+- mpc_record = 0;
+- printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n", oemtable);
+- if (memcmp(oemtable->oem_signature,MPC_OEM_SIGNATURE,4))
+- {
+- printk(KERN_WARNING "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
+- oemtable->oem_signature[0],
+- oemtable->oem_signature[1],
+- oemtable->oem_signature[2],
+- oemtable->oem_signature[3]);
+- return;
+- }
+- if (mpf_checksum((unsigned char *)oemtable,oemtable->oem_length))
+- {
+- printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
+- return;
+- }
+- while (count < oemtable->oem_length) {
+- switch (*oemptr) {
+- case MP_TRANSLATION:
+- {
+- struct mpc_config_translation *m=
+- (struct mpc_config_translation *)oemptr;
+- MP_translation_info(m);
+- oemptr += sizeof(*m);
+- count += sizeof(*m);
+- ++mpc_record;
+- break;
+- }
+- default:
+- {
+- printk(KERN_WARNING "Unrecognised OEM table entry type! - %d\n", (int) *oemptr);
+- return;
+- }
+- }
+- }
+-}
-
- /* Dynamically-mapped IRQ. */
- DEFINE_PER_CPU(int, timer_irq);
-
---- a/arch/x86/kernel/traps_32-xen.c
-+++ b/arch/x86/kernel/traps_32-xen.c
-@@ -9,26 +9,28 @@
- * 'Traps.c' handles hardware traps and faults after we have saved some
- * state in 'asm.s'.
- */
--#include <linux/sched.h>
-+#include <linux/interrupt.h>
-+#include <linux/kallsyms.h>
-+#include <linux/spinlock.h>
-+#include <linux/highmem.h>
-+#include <linux/kprobes.h>
-+#include <linux/uaccess.h>
-+#include <linux/utsname.h>
-+#include <linux/kdebug.h>
- #include <linux/kernel.h>
-+#include <linux/module.h>
-+#include <linux/ptrace.h>
- #include <linux/string.h>
-+#include <linux/unwind.h>
-+#include <linux/delay.h>
- #include <linux/errno.h>
-+#include <linux/kexec.h>
-+#include <linux/sched.h>
- #include <linux/timer.h>
--#include <linux/mm.h>
- #include <linux/init.h>
--#include <linux/delay.h>
--#include <linux/spinlock.h>
--#include <linux/interrupt.h>
--#include <linux/highmem.h>
--#include <linux/kallsyms.h>
--#include <linux/ptrace.h>
--#include <linux/utsname.h>
--#include <linux/kprobes.h>
--#include <linux/kexec.h>
--#include <linux/unwind.h>
--#include <linux/uaccess.h>
--#include <linux/nmi.h>
- #include <linux/bug.h>
-+#include <linux/nmi.h>
-+#include <linux/mm.h>
-
- #ifdef CONFIG_EISA
- #include <linux/ioport.h>
-@@ -43,21 +45,18 @@
- #include <linux/edac.h>
- #endif
-
-+#include <asm/arch_hooks.h>
-+#include <asm/stacktrace.h>
- #include <asm/processor.h>
--#include <asm/system.h>
--#include <asm/io.h>
--#include <asm/atomic.h>
- #include <asm/debugreg.h>
-+#include <asm/atomic.h>
-+#include <asm/system.h>
-+#include <asm/unwind.h>
- #include <asm/desc.h>
- #include <asm/i387.h>
- #include <asm/nmi.h>
--#include <asm/unwind.h>
- #include <asm/smp.h>
--#include <asm/arch_hooks.h>
--#include <linux/kdebug.h>
--#include <asm/stacktrace.h>
+-static inline void mps_oem_check(struct mp_config_table *mpc, char *oem,
+- char *productid)
+-{
+- if (strncmp(oem, "IBM NUMA", 8))
+- printk("Warning! May not be a NUMA-Q system!\n");
+- if (mpc->mpc_oemptr)
+- smp_read_mpc_oem((struct mp_config_oemtable *) mpc->mpc_oemptr,
+- mpc->mpc_oemsize);
+-}
+-#endif /* CONFIG_X86_NUMAQ */
+-
+-/*
+- * Read/parse the MPC
+- */
+-
+-static int __init smp_read_mpc(struct mp_config_table *mpc)
+-{
+- char str[16];
+- char oem[10];
+- int count=sizeof(*mpc);
+- unsigned char *mpt=((unsigned char *)mpc)+count;
+-
+- if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) {
+- printk(KERN_ERR "SMP mptable: bad signature [0x%x]!\n",
+- *(u32 *)mpc->mpc_signature);
+- return 0;
+- }
+- if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) {
+- printk(KERN_ERR "SMP mptable: checksum error!\n");
+- return 0;
+- }
+- if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) {
+- printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n",
+- mpc->mpc_spec);
+- return 0;
+- }
+- if (!mpc->mpc_lapic) {
+- printk(KERN_ERR "SMP mptable: null local APIC address!\n");
+- return 0;
+- }
+- memcpy(oem,mpc->mpc_oem,8);
+- oem[8]=0;
+- printk(KERN_INFO "OEM ID: %s ",oem);
+-
+- memcpy(str,mpc->mpc_productid,12);
+- str[12]=0;
+- printk("Product ID: %s ",str);
+-
+- mps_oem_check(mpc, oem, str);
+-
+- printk("APIC at: 0x%X\n", mpc->mpc_lapic);
+-
+- /*
+- * Save the local APIC address (it might be non-default) -- but only
+- * if we're not using ACPI.
+- */
+- if (!acpi_lapic)
+- mp_lapic_addr = mpc->mpc_lapic;
+-
+- /*
+- * Now process the configuration blocks.
+- */
+- mpc_record = 0;
+- while (count < mpc->mpc_length) {
+- switch(*mpt) {
+- case MP_PROCESSOR:
+- {
+- struct mpc_config_processor *m=
+- (struct mpc_config_processor *)mpt;
+- /* ACPI may have already provided this data */
+- if (!acpi_lapic)
+- MP_processor_info(m);
+- mpt += sizeof(*m);
+- count += sizeof(*m);
+- break;
+- }
+- case MP_BUS:
+- {
+- struct mpc_config_bus *m=
+- (struct mpc_config_bus *)mpt;
+- MP_bus_info(m);
+- mpt += sizeof(*m);
+- count += sizeof(*m);
+- break;
+- }
+- case MP_IOAPIC:
+- {
+- struct mpc_config_ioapic *m=
+- (struct mpc_config_ioapic *)mpt;
+- MP_ioapic_info(m);
+- mpt+=sizeof(*m);
+- count+=sizeof(*m);
+- break;
+- }
+- case MP_INTSRC:
+- {
+- struct mpc_config_intsrc *m=
+- (struct mpc_config_intsrc *)mpt;
+-
+- MP_intsrc_info(m);
+- mpt+=sizeof(*m);
+- count+=sizeof(*m);
+- break;
+- }
+- case MP_LINTSRC:
+- {
+- struct mpc_config_lintsrc *m=
+- (struct mpc_config_lintsrc *)mpt;
+- MP_lintsrc_info(m);
+- mpt+=sizeof(*m);
+- count+=sizeof(*m);
+- break;
+- }
+- default:
+- {
+- count = mpc->mpc_length;
+- break;
+- }
+- }
+- ++mpc_record;
+- }
+- setup_apic_routing();
+- if (!num_processors)
+- printk(KERN_ERR "SMP mptable: no processors registered!\n");
+- return num_processors;
+-}
+-
+-static int __init ELCR_trigger(unsigned int irq)
+-{
+- unsigned int port;
+-
+- port = 0x4d0 + (irq >> 3);
+- return (inb(port) >> (irq & 7)) & 1;
+-}
+-
+-static void __init construct_default_ioirq_mptable(int mpc_default_type)
+-{
+- struct mpc_config_intsrc intsrc;
+- int i;
+- int ELCR_fallback = 0;
+-
+- intsrc.mpc_type = MP_INTSRC;
+- intsrc.mpc_irqflag = 0; /* conforming */
+- intsrc.mpc_srcbus = 0;
+- intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
+-
+- intsrc.mpc_irqtype = mp_INT;
+-
+- /*
+- * If true, we have an ISA/PCI system with no IRQ entries
+- * in the MP table. To prevent the PCI interrupts from being set up
+- * incorrectly, we try to use the ELCR. The sanity check to see if
+- * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can
+- * never be level sensitive, so we simply see if the ELCR agrees.
+- * If it does, we assume it's valid.
+- */
+- if (mpc_default_type == 5) {
+- printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n");
+-
+- if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13))
+- printk(KERN_WARNING "ELCR contains invalid data... not using ELCR\n");
+- else {
+- printk(KERN_INFO "Using ELCR to identify PCI interrupts\n");
+- ELCR_fallback = 1;
+- }
+- }
+-
+- for (i = 0; i < 16; i++) {
+- switch (mpc_default_type) {
+- case 2:
+- if (i == 0 || i == 13)
+- continue; /* IRQ0 & IRQ13 not connected */
+- /* fall through */
+- default:
+- if (i == 2)
+- continue; /* IRQ2 is never connected */
+- }
-
--#include <linux/module.h>
-+#include <asm/io.h>
-
- #include "mach_traps.h"
-
-@@ -71,7 +70,7 @@ EXPORT_SYMBOL_GPL(used_vectors);
- asmlinkage int system_call(void);
-
- /* Do we ignore FPU interrupts ? */
--char ignore_fpu_irq = 0;
-+char ignore_fpu_irq;
-
- #ifndef CONFIG_X86_NO_IDT
- /*
-@@ -113,12 +112,13 @@ static unsigned int code_bytes = 64;
- void printk_address(unsigned long address, int reliable)
- {
- #ifdef CONFIG_KALLSYMS
-- unsigned long offset = 0, symsize;
-+ char namebuf[KSYM_NAME_LEN];
-+ unsigned long offset = 0;
-+ unsigned long symsize;
- const char *symname;
-- char *modname;
-- char *delim = ":";
-- char namebuf[128];
- char reliab[4] = "";
-+ char *delim = ":";
-+ char *modname;
-
- symname = kallsyms_lookup(address, &symsize, &offset,
- &modname, namebuf);
-@@ -146,13 +146,14 @@ static inline int valid_stack_ptr(struct
-
- /* The form of the top of the frame on the stack */
- struct stack_frame {
-- struct stack_frame *next_frame;
-- unsigned long return_address;
-+ struct stack_frame *next_frame;
-+ unsigned long return_address;
- };
-
--static inline unsigned long print_context_stack(struct thread_info *tinfo,
-- unsigned long *stack, unsigned long bp,
-- const struct stacktrace_ops *ops, void *data)
-+static inline unsigned long
-+print_context_stack(struct thread_info *tinfo,
-+ unsigned long *stack, unsigned long bp,
-+ const struct stacktrace_ops *ops, void *data)
- {
- struct stack_frame *frame = (struct stack_frame *)bp;
-
-@@ -174,7 +175,7 @@ static inline unsigned long print_contex
- return bp;
- }
-
--#define MSG(msg) ops->warning(data, msg)
-+#define MSG(msg) ops->warning(data, msg)
-
- void dump_trace(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp,
-@@ -185,6 +186,7 @@ void dump_trace(struct task_struct *task
-
- if (!stack) {
- unsigned long dummy;
-+
- stack = &dummy;
- if (task != current)
- stack = (unsigned long *)task->thread.sp;
-@@ -194,7 +196,7 @@ void dump_trace(struct task_struct *task
- if (!bp) {
- if (task == current) {
- /* Grab bp right from our regs */
-- asm ("movl %%ebp, %0" : "=r" (bp) : );
-+ asm("movl %%ebp, %0" : "=r" (bp) :);
- } else {
- /* bp is the last reg pushed by switch_to */
- bp = *(unsigned long *) task->thread.sp;
-@@ -204,15 +206,18 @@ void dump_trace(struct task_struct *task
-
- while (1) {
- struct thread_info *context;
-+
- context = (struct thread_info *)
- ((unsigned long)stack & (~(THREAD_SIZE - 1)));
- bp = print_context_stack(context, stack, bp, ops, data);
-- /* Should be after the line below, but somewhere
-- in early boot context comes out corrupted and we
-- can't reference it -AK */
-+ /*
-+ * Should be after the line below, but somewhere
-+ * in early boot context comes out corrupted and we
-+ * can't reference it:
-+ */
- if (ops->stack(data, "IRQ") < 0)
- break;
-- stack = (unsigned long*)context->previous_esp;
-+ stack = (unsigned long *)context->previous_esp;
- if (!stack)
- break;
- touch_nmi_watchdog();
-@@ -251,15 +256,15 @@ static void print_trace_address(void *da
- }
-
- static const struct stacktrace_ops print_trace_ops = {
-- .warning = print_trace_warning,
-- .warning_symbol = print_trace_warning_symbol,
-- .stack = print_trace_stack,
-- .address = print_trace_address,
-+ .warning = print_trace_warning,
-+ .warning_symbol = print_trace_warning_symbol,
-+ .stack = print_trace_stack,
-+ .address = print_trace_address,
- };
-
- static void
- show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
-- unsigned long *stack, unsigned long bp, char *log_lvl)
-+ unsigned long *stack, unsigned long bp, char *log_lvl)
- {
- dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
- printk("%s =======================\n", log_lvl);
-@@ -271,21 +276,22 @@ void show_trace(struct task_struct *task
- show_trace_log_lvl(task, regs, stack, bp, "");
- }
-
--static void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
-- unsigned long *sp, unsigned long bp, char *log_lvl)
-+static void
-+show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
-+ unsigned long *sp, unsigned long bp, char *log_lvl)
- {
- unsigned long *stack;
- int i;
-
- if (sp == NULL) {
- if (task)
-- sp = (unsigned long*)task->thread.sp;
-+ sp = (unsigned long *)task->thread.sp;
- else
- sp = (unsigned long *)&sp;
- }
-
- stack = sp;
-- for(i = 0; i < kstack_depth_to_print; i++) {
-+ for (i = 0; i < kstack_depth_to_print; i++) {
- if (kstack_end(stack))
- break;
- if (i && ((i % 8) == 0))
-@@ -293,6 +299,7 @@ static void show_stack_log_lvl(struct ta
- printk("%08lx ", *stack++);
- }
- printk("\n%sCall Trace:\n", log_lvl);
-+
- show_trace_log_lvl(task, regs, sp, bp, log_lvl);
- }
-
-@@ -307,8 +314,8 @@ void show_stack(struct task_struct *task
- */
- void dump_stack(void)
- {
-- unsigned long stack;
- unsigned long bp = 0;
-+ unsigned long stack;
-
- #ifdef CONFIG_FRAME_POINTER
- if (!bp)
-@@ -320,6 +327,7 @@ void dump_stack(void)
- init_utsname()->release,
- (int)strcspn(init_utsname()->version, " "),
- init_utsname()->version);
-+
- show_trace(current, NULL, &stack, bp);
- }
-
-@@ -331,6 +339,7 @@ void show_registers(struct pt_regs *regs
-
- print_modules();
- __show_registers(regs, 0);
-+
- printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)",
- TASK_COMM_LEN, current->comm, task_pid_nr(current),
- current_thread_info(), current, task_thread_info(current));
-@@ -339,10 +348,10 @@ void show_registers(struct pt_regs *regs
- * time of the fault..
- */
- if (!user_mode_vm(regs)) {
-- u8 *ip;
- unsigned int code_prologue = code_bytes * 43 / 64;
- unsigned int code_len = code_bytes;
- unsigned char c;
-+ u8 *ip;
-
- printk("\n" KERN_EMERG "Stack: ");
- show_stack_log_lvl(NULL, regs, ®s->sp, 0, KERN_EMERG);
-@@ -369,7 +378,7 @@ void show_registers(struct pt_regs *regs
- }
- }
- printk("\n");
--}
-+}
-
- int is_valid_bugaddr(unsigned long ip)
- {
-@@ -385,10 +394,10 @@ int is_valid_bugaddr(unsigned long ip)
-
- static int die_counter;
-
--int __kprobes __die(const char * str, struct pt_regs * regs, long err)
-+int __kprobes __die(const char *str, struct pt_regs *regs, long err)
- {
-- unsigned long sp;
- unsigned short ss;
-+ unsigned long sp;
-
- printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
- #ifdef CONFIG_PREEMPT
-@@ -403,8 +412,8 @@ int __kprobes __die(const char * str, st
- printk("\n");
-
- if (notify_die(DIE_OOPS, str, regs, err,
-- current->thread.trap_no, SIGSEGV) !=
-- NOTIFY_STOP) {
-+ current->thread.trap_no, SIGSEGV) != NOTIFY_STOP) {
-+
- show_registers(regs);
- /* Executive summary in case the oops scrolled away */
- sp = (unsigned long) (®s->sp);
-@@ -416,17 +425,18 @@ int __kprobes __die(const char * str, st
- printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
- print_symbol("%s", regs->ip);
- printk(" SS:ESP %04x:%08lx\n", ss, sp);
-+
- return 0;
+- if (ELCR_fallback) {
+- /*
+- * If the ELCR indicates a level-sensitive interrupt, we
+- * copy that information over to the MP table in the
+- * irqflag field (level sensitive, active high polarity).
+- */
+- if (ELCR_trigger(i))
+- intsrc.mpc_irqflag = 13;
+- else
+- intsrc.mpc_irqflag = 0;
+- }
+-
+- intsrc.mpc_srcbusirq = i;
+- intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */
+- MP_intsrc_info(&intsrc);
+- }
+-
+- intsrc.mpc_irqtype = mp_ExtINT;
+- intsrc.mpc_srcbusirq = 0;
+- intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */
+- MP_intsrc_info(&intsrc);
+-}
+-
+-static inline void __init construct_default_ISA_mptable(int mpc_default_type)
+-{
+- struct mpc_config_processor processor;
+- struct mpc_config_bus bus;
+- struct mpc_config_ioapic ioapic;
+- struct mpc_config_lintsrc lintsrc;
+- int linttypes[2] = { mp_ExtINT, mp_NMI };
+- int i;
+-
+- /*
+- * local APIC has default address
+- */
+- mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
+-
+- /*
+- * 2 CPUs, numbered 0 & 1.
+- */
+- processor.mpc_type = MP_PROCESSOR;
+- /* Either an integrated APIC or a discrete 82489DX. */
+- processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
+- processor.mpc_cpuflag = CPU_ENABLED;
+- processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
+- (boot_cpu_data.x86_model << 4) |
+- boot_cpu_data.x86_mask;
+- processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
+- processor.mpc_reserved[0] = 0;
+- processor.mpc_reserved[1] = 0;
+- for (i = 0; i < 2; i++) {
+- processor.mpc_apicid = i;
+- MP_processor_info(&processor);
+- }
+-
+- bus.mpc_type = MP_BUS;
+- bus.mpc_busid = 0;
+- switch (mpc_default_type) {
+- default:
+- printk("???\n");
+- printk(KERN_ERR "Unknown standard configuration %d\n",
+- mpc_default_type);
+- /* fall through */
+- case 1:
+- case 5:
+- memcpy(bus.mpc_bustype, "ISA ", 6);
+- break;
+- case 2:
+- case 6:
+- case 3:
+- memcpy(bus.mpc_bustype, "EISA ", 6);
+- break;
+- case 4:
+- case 7:
+- memcpy(bus.mpc_bustype, "MCA ", 6);
+- }
+- MP_bus_info(&bus);
+- if (mpc_default_type > 4) {
+- bus.mpc_busid = 1;
+- memcpy(bus.mpc_bustype, "PCI ", 6);
+- MP_bus_info(&bus);
+- }
+-
+- ioapic.mpc_type = MP_IOAPIC;
+- ioapic.mpc_apicid = 2;
+- ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
+- ioapic.mpc_flags = MPC_APIC_USABLE;
+- ioapic.mpc_apicaddr = 0xFEC00000;
+- MP_ioapic_info(&ioapic);
+-
+- /*
+- * We set up most of the low 16 IO-APIC pins according to MPS rules.
+- */
+- construct_default_ioirq_mptable(mpc_default_type);
+-
+- lintsrc.mpc_type = MP_LINTSRC;
+- lintsrc.mpc_irqflag = 0; /* conforming */
+- lintsrc.mpc_srcbusid = 0;
+- lintsrc.mpc_srcbusirq = 0;
+- lintsrc.mpc_destapic = MP_APIC_ALL;
+- for (i = 0; i < 2; i++) {
+- lintsrc.mpc_irqtype = linttypes[i];
+- lintsrc.mpc_destapiclint = i;
+- MP_lintsrc_info(&lintsrc);
+- }
+-}
+-
+-static struct intel_mp_floating *mpf_found;
+-
+-/*
+- * Scan the memory blocks for an SMP configuration block.
+- */
+-void __init get_smp_config (void)
+-{
+- struct intel_mp_floating *mpf = mpf_found;
+-
+- /*
+- * ACPI supports both logical (e.g. Hyper-Threading) and physical
+- * processors, where MPS only supports physical.
+- */
+- if (acpi_lapic && acpi_ioapic) {
+- printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n");
+- return;
+- }
+- else if (acpi_lapic)
+- printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n");
+-
+- printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification);
+- if (mpf->mpf_feature2 & (1<<7)) {
+- printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
+- pic_mode = 1;
- } else {
-- return 1;
- }
-+
-+ return 1;
- }
-
- /*
-- * This is gone through when something in the kernel has done something bad and
-- * is about to be terminated.
-+ * This is gone through when something in the kernel has done something bad
-+ * and is about to be terminated:
- */
--void die(const char * str, struct pt_regs * regs, long err)
-+void die(const char *str, struct pt_regs *regs, long err)
- {
- static struct {
- raw_spinlock_t lock;
-@@ -448,8 +458,9 @@ void die(const char * str, struct pt_reg
- die.lock_owner = smp_processor_id();
- die.lock_owner_depth = 0;
- bust_spinlocks(1);
-- } else
-+ } else {
- raw_local_irq_save(flags);
-+ }
-
- if (++die.lock_owner_depth < 3) {
- report_bug(regs->ip, regs);
-@@ -482,19 +493,20 @@ void die(const char * str, struct pt_reg
- do_exit(SIGSEGV);
- }
-
--static inline void die_if_kernel(const char * str, struct pt_regs * regs, long err)
-+static inline void
-+die_if_kernel(const char *str, struct pt_regs *regs, long err)
- {
- if (!user_mode_vm(regs))
- die(str, regs, err);
- }
-
--static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86,
-- struct pt_regs * regs, long error_code,
-- siginfo_t *info)
-+static void __kprobes
-+do_trap(int trapnr, int signr, char *str, int vm86, struct pt_regs *regs,
-+ long error_code, siginfo_t *info)
- {
- struct task_struct *tsk = current;
-
-- if (regs->flags & VM_MASK) {
-+ if (regs->flags & X86_VM_MASK) {
- if (vm86)
- goto vm86_trap;
- goto trap_signal;
-@@ -503,109 +515,112 @@ static void __kprobes do_trap(int trapnr
- if (!user_mode(regs))
- goto kernel_trap;
-
-- trap_signal: {
+- printk(KERN_INFO " Virtual Wire compatibility mode.\n");
+- pic_mode = 0;
+- }
+-
+- /*
+- * Now see if we need to read further.
+- */
+- if (mpf->mpf_feature1 != 0) {
+-
+- printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1);
+- construct_default_ISA_mptable(mpf->mpf_feature1);
+-
+- } else if (mpf->mpf_physptr) {
+-
- /*
-- * We want error_code and trap_no set for userspace faults and
-- * kernelspace faults which result in die(), but not
-- * kernelspace faults which are fixed up. die() gives the
-- * process no chance to handle the signal and notice the
-- * kernel fault information, so that won't result in polluting
-- * the information about previously queued, but not yet
-- * delivered, faults. See also do_general_protection below.
+- * Read the physical hardware table. Anything here will
+- * override the defaults.
- */
-- tsk->thread.error_code = error_code;
-- tsk->thread.trap_no = trapnr;
-+trap_signal:
-+ /*
-+ * We want error_code and trap_no set for userspace faults and
-+ * kernelspace faults which result in die(), but not
-+ * kernelspace faults which are fixed up. die() gives the
-+ * process no chance to handle the signal and notice the
-+ * kernel fault information, so that won't result in polluting
-+ * the information about previously queued, but not yet
-+ * delivered, faults. See also do_general_protection below.
-+ */
-+ tsk->thread.error_code = error_code;
-+ tsk->thread.trap_no = trapnr;
-
-- if (info)
-- force_sig_info(signr, info, tsk);
-- else
-- force_sig(signr, tsk);
+- if (!smp_read_mpc(isa_bus_to_virt(mpf->mpf_physptr))) {
+- smp_found_config = 0;
+- printk(KERN_ERR "BIOS bug, MP table errors detected!...\n");
+- printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n");
+- return;
+- }
+- /*
+- * If there are no explicit MP IRQ entries, then we are
+- * broken. We set up most of the low 16 IO-APIC pins to
+- * ISA defaults and hope it will work.
+- */
+- if (!mp_irq_entries) {
+- struct mpc_config_bus bus;
+-
+- printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n");
+-
+- bus.mpc_type = MP_BUS;
+- bus.mpc_busid = 0;
+- memcpy(bus.mpc_bustype, "ISA ", 6);
+- MP_bus_info(&bus);
+-
+- construct_default_ioirq_mptable(0);
+- }
+-
+- } else
+- BUG();
+-
+- printk(KERN_INFO "Processors: %d\n", num_processors);
+- /*
+- * Only use the first configuration found.
+- */
+-}
+-
+-static int __init smp_scan_config (unsigned long base, unsigned long length)
+-{
+- unsigned long *bp = isa_bus_to_virt(base);
+- struct intel_mp_floating *mpf;
+-
+- printk(KERN_INFO "Scan SMP from %p for %ld bytes.\n", bp,length);
+- if (sizeof(*mpf) != 16)
+- printk("Error: MPF size\n");
+-
+- while (length > 0) {
+- mpf = (struct intel_mp_floating *)bp;
+- if ((*bp == SMP_MAGIC_IDENT) &&
+- (mpf->mpf_length == 1) &&
+- !mpf_checksum((unsigned char *)bp, 16) &&
+- ((mpf->mpf_specification == 1)
+- || (mpf->mpf_specification == 4)) ) {
+-
+- smp_found_config = 1;
+-#ifndef CONFIG_XEN
+- printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
+- mpf, virt_to_phys(mpf));
+- reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE,
+- BOOTMEM_DEFAULT);
+- if (mpf->mpf_physptr) {
+- /*
+- * We cannot access to MPC table to compute
+- * table size yet, as only few megabytes from
+- * the bottom is mapped now.
+- * PC-9800's MPC table places on the very last
+- * of physical memory; so that simply reserving
+- * PAGE_SIZE from mpg->mpf_physptr yields BUG()
+- * in reserve_bootmem.
+- */
+- unsigned long size = PAGE_SIZE;
+- unsigned long end = max_low_pfn * PAGE_SIZE;
+- if (mpf->mpf_physptr + size > end)
+- size = end - mpf->mpf_physptr;
+- reserve_bootmem(mpf->mpf_physptr, size,
+- BOOTMEM_DEFAULT);
+- }
+-#else
+- printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
+- mpf, ((void *)bp - isa_bus_to_virt(base)) + base);
+-#endif
+-
+- mpf_found = mpf;
+- return 1;
+- }
+- bp += 4;
+- length -= 16;
+- }
+- return 0;
+-}
+-
+-void __init find_smp_config (void)
+-{
+-#ifndef CONFIG_XEN
+- unsigned int address;
+-#endif
+-
+- /*
+- * FIXME: Linux assumes you have 640K of base ram..
+- * this continues the error...
+- *
+- * 1) Scan the bottom 1K for a signature
+- * 2) Scan the top 1K of base RAM
+- * 3) Scan the 64K of bios
+- */
+- if (smp_scan_config(0x0,0x400) ||
+- smp_scan_config(639*0x400,0x400) ||
+- smp_scan_config(0xF0000,0x10000))
+- return;
+- /*
+- * If it is an SMP machine we should know now, unless the
+- * configuration is in an EISA/MCA bus machine with an
+- * extended bios data area.
+- *
+- * there is a real-mode segmented pointer pointing to the
+- * 4K EBDA area at 0x40E, calculate and scan it here.
+- *
+- * NOTE! There are Linux loaders that will corrupt the EBDA
+- * area, and as such this kind of SMP config may be less
+- * trustworthy, simply because the SMP table may have been
+- * stomped on during early boot. These loaders are buggy and
+- * should be fixed.
+- *
+- * MP1.4 SPEC states to only scan first 1K of 4K EBDA.
+- */
+-
+-#ifndef CONFIG_XEN
+- address = get_bios_ebda();
+- if (address)
+- smp_scan_config(address, 0x400);
+-#endif
+-}
+-
+-int es7000_plat;
+-
+-/* --------------------------------------------------------------------------
+- ACPI-based MP Configuration
+- -------------------------------------------------------------------------- */
+-
+-#ifdef CONFIG_ACPI
+-
+-void __init mp_register_lapic_address(u64 address)
+-{
+-#ifndef CONFIG_XEN
+- mp_lapic_addr = (unsigned long) address;
+-
+- set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
+-
+- if (boot_cpu_physical_apicid == -1U)
+- boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
+-
+- Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid);
+-#endif
+-}
+-
+-void __cpuinit mp_register_lapic (u8 id, u8 enabled)
+-{
+- struct mpc_config_processor processor;
+- int boot_cpu = 0;
+-
+- if (MAX_APICS - id <= 0) {
+- printk(KERN_WARNING "Processor #%d invalid (max %d)\n",
+- id, MAX_APICS);
- return;
- }
-+ if (info)
-+ force_sig_info(signr, info, tsk);
-+ else
-+ force_sig(signr, tsk);
-+ return;
-
-- kernel_trap: {
-- if (!fixup_exception(regs)) {
-- tsk->thread.error_code = error_code;
-- tsk->thread.trap_no = trapnr;
-- die(str, regs, error_code);
-- }
+-
+- if (id == boot_cpu_physical_apicid)
+- boot_cpu = 1;
+-
+-#ifndef CONFIG_XEN
+- processor.mpc_type = MP_PROCESSOR;
+- processor.mpc_apicid = id;
+- processor.mpc_apicver = GET_APIC_VERSION(apic_read(APIC_LVR));
+- processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0);
+- processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0);
+- processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
+- (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
+- processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
+- processor.mpc_reserved[0] = 0;
+- processor.mpc_reserved[1] = 0;
+-#endif
+-
+- MP_processor_info(&processor);
+-}
+-
+-#ifdef CONFIG_X86_IO_APIC
+-
+-#define MP_ISA_BUS 0
+-#define MP_MAX_IOAPIC_PIN 127
+-
+-static struct mp_ioapic_routing {
+- int apic_id;
+- int gsi_base;
+- int gsi_end;
+- u32 pin_programmed[4];
+-} mp_ioapic_routing[MAX_IO_APICS];
+-
+-static int mp_find_ioapic (int gsi)
+-{
+- int i = 0;
+-
+- /* Find the IOAPIC that manages this GSI. */
+- for (i = 0; i < nr_ioapics; i++) {
+- if ((gsi >= mp_ioapic_routing[i].gsi_base)
+- && (gsi <= mp_ioapic_routing[i].gsi_end))
+- return i;
+- }
+-
+- printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
+-
+- return -1;
+-}
+-
+-void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base)
+-{
+- int idx = 0;
+- int tmpid;
+-
+- if (nr_ioapics >= MAX_IO_APICS) {
+- printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
+- "(found %d)\n", MAX_IO_APICS, nr_ioapics);
+- panic("Recompile kernel with bigger MAX_IO_APICS!\n");
+- }
+- if (!address) {
+- printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
+- " found in MADT table, skipping!\n");
+- return;
+- }
+-
+- idx = nr_ioapics++;
+-
+- mp_ioapics[idx].mpc_type = MP_IOAPIC;
+- mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
+- mp_ioapics[idx].mpc_apicaddr = address;
+-
+-#ifndef CONFIG_XEN
+- set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
+-#endif
+- if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+- && !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
+- tmpid = io_apic_get_unique_id(idx, id);
+- else
+- tmpid = id;
+- if (tmpid == -1) {
+- nr_ioapics--;
+- return;
+- }
+- mp_ioapics[idx].mpc_apicid = tmpid;
+- mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
+-
+- /*
+- * Build basic GSI lookup table to facilitate gsi->io_apic lookups
+- * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
+- */
+- mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
+- mp_ioapic_routing[idx].gsi_base = gsi_base;
+- mp_ioapic_routing[idx].gsi_end = gsi_base +
+- io_apic_get_redir_entries(idx);
+-
+- printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
+- "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
+- mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
+- mp_ioapic_routing[idx].gsi_base,
+- mp_ioapic_routing[idx].gsi_end);
+-}
+-
+-void __init
+-mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
+-{
+- struct mpc_config_intsrc intsrc;
+- int ioapic = -1;
+- int pin = -1;
+-
+- /*
+- * Convert 'gsi' to 'ioapic.pin'.
+- */
+- ioapic = mp_find_ioapic(gsi);
+- if (ioapic < 0)
+- return;
+- pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
+-
+- /*
+- * TBD: This check is for faulty timer entries, where the override
+- * erroneously sets the trigger to level, resulting in a HUGE
+- * increase of timer interrupts!
+- */
+- if ((bus_irq == 0) && (trigger == 3))
+- trigger = 1;
+-
+- intsrc.mpc_type = MP_INTSRC;
+- intsrc.mpc_irqtype = mp_INT;
+- intsrc.mpc_irqflag = (trigger << 2) | polarity;
+- intsrc.mpc_srcbus = MP_ISA_BUS;
+- intsrc.mpc_srcbusirq = bus_irq; /* IRQ */
+- intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */
+- intsrc.mpc_dstirq = pin; /* INTIN# */
+-
+- Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n",
+- intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
+- (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
+- intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq);
+-
+- mp_irqs[mp_irq_entries] = intsrc;
+- if (++mp_irq_entries == MAX_IRQ_SOURCES)
+- panic("Max # of irq sources exceeded!\n");
+-}
+-
+-void __init mp_config_acpi_legacy_irqs (void)
+-{
+- struct mpc_config_intsrc intsrc;
+- int i = 0;
+- int ioapic = -1;
+-
+- /*
+- * Fabricate the legacy ISA bus (bus #31).
+- */
+- mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
+- Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
+-
+- /*
+- * Older generations of ES7000 have no legacy identity mappings
+- */
+- if (es7000_plat == 1)
- return;
-+kernel_trap:
-+ if (!fixup_exception(regs)) {
-+ tsk->thread.error_code = error_code;
-+ tsk->thread.trap_no = trapnr;
-+ die(str, regs, error_code);
- }
-+ return;
-
-- vm86_trap: {
-- int ret = handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, trapnr);
-- if (ret) goto trap_signal;
+-
+- /*
+- * Locate the IOAPIC that manages the ISA IRQs (0-15).
+- */
+- ioapic = mp_find_ioapic(0);
+- if (ioapic < 0)
- return;
+-
+- intsrc.mpc_type = MP_INTSRC;
+- intsrc.mpc_irqflag = 0; /* Conforming */
+- intsrc.mpc_srcbus = MP_ISA_BUS;
+- intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
+-
+- /*
+- * Use the default configuration for the IRQs 0-15. Unless
+- * overridden by (MADT) interrupt source override entries.
+- */
+- for (i = 0; i < 16; i++) {
+- int idx;
+-
+- for (idx = 0; idx < mp_irq_entries; idx++) {
+- struct mpc_config_intsrc *irq = mp_irqs + idx;
+-
+- /* Do we already have a mapping for this ISA IRQ? */
+- if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i)
+- break;
+-
+- /* Do we already have a mapping for this IOAPIC pin */
+- if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
+- (irq->mpc_dstirq == i))
+- break;
+- }
+-
+- if (idx != mp_irq_entries) {
+- printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
+- continue; /* IRQ already used */
+- }
+-
+- intsrc.mpc_irqtype = mp_INT;
+- intsrc.mpc_srcbusirq = i; /* Identity mapped */
+- intsrc.mpc_dstirq = i;
+-
+- Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, "
+- "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
+- (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
+- intsrc.mpc_srcbusirq, intsrc.mpc_dstapic,
+- intsrc.mpc_dstirq);
+-
+- mp_irqs[mp_irq_entries] = intsrc;
+- if (++mp_irq_entries == MAX_IRQ_SOURCES)
+- panic("Max # of irq sources exceeded!\n");
- }
-+vm86_trap:
-+ if (handle_vm86_trap((struct kernel_vm86_regs *) regs,
-+ error_code, trapnr))
-+ goto trap_signal;
-+ return;
- }
-
--#define DO_ERROR(trapnr, signr, str, name) \
--void do_##name(struct pt_regs * regs, long error_code) \
--{ \
-- if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
-- == NOTIFY_STOP) \
-- return; \
-- do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \
-}
-
--#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr, irq) \
--void do_##name(struct pt_regs * regs, long error_code) \
--{ \
-- siginfo_t info; \
-- if (irq) \
-- local_irq_enable(); \
-- info.si_signo = signr; \
-- info.si_errno = 0; \
-- info.si_code = sicode; \
-- info.si_addr = (void __user *)siaddr; \
-- if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
-- == NOTIFY_STOP) \
-- return; \
-- do_trap(trapnr, signr, str, 0, regs, error_code, &info); \
+-#define MAX_GSI_NUM 4096
+-#define IRQ_COMPRESSION_START 64
+-
+-int mp_register_gsi(u32 gsi, int triggering, int polarity)
+-{
+- int ioapic = -1;
+- int ioapic_pin = 0;
+- int idx, bit = 0;
+- static int pci_irq = IRQ_COMPRESSION_START;
+- /*
+- * Mapping between Global System Interrupts, which
+- * represent all possible interrupts, and IRQs
+- * assigned to actual devices.
+- */
+- static int gsi_to_irq[MAX_GSI_NUM];
+-
+- /* Don't set up the ACPI SCI because it's already set up */
+- if (acpi_gbl_FADT.sci_interrupt == gsi)
+- return gsi;
+-
+- ioapic = mp_find_ioapic(gsi);
+- if (ioapic < 0) {
+- printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
+- return gsi;
+- }
+-
+- ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
+-
+- if (ioapic_renumber_irq)
+- gsi = ioapic_renumber_irq(ioapic, gsi);
+-
+- /*
+- * Avoid pin reprogramming. PRTs typically include entries
+- * with redundant pin->gsi mappings (but unique PCI devices);
+- * we only program the IOAPIC on the first.
+- */
+- bit = ioapic_pin % 32;
+- idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32);
+- if (idx > 3) {
+- printk(KERN_ERR "Invalid reference to IOAPIC pin "
+- "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
+- ioapic_pin);
+- return gsi;
+- }
+- if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
+- Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
+- mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
+- return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
+- }
+-
+- mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
+-
+- /*
+- * For GSI >= 64, use IRQ compression
+- */
+- if ((gsi >= IRQ_COMPRESSION_START)
+- && (triggering == ACPI_LEVEL_SENSITIVE)) {
+- /*
+- * For PCI devices assign IRQs in order, avoiding gaps
+- * due to unused I/O APIC pins.
+- */
+- int irq = gsi;
+- if (gsi < MAX_GSI_NUM) {
+- /*
+- * Retain the VIA chipset work-around (gsi > 15), but
+- * avoid a problem where the 8254 timer (IRQ0) is setup
+- * via an override (so it's not on pin 0 of the ioapic),
+- * and at the same time, the pin 0 interrupt is a PCI
+- * type. The gsi > 15 test could cause these two pins
+- * to be shared as IRQ0, and they are not shareable.
+- * So test for this condition, and if necessary, avoid
+- * the pin collision.
+- */
+- if (gsi > 15 || (gsi == 0 && !timer_uses_ioapic_pin_0))
+- gsi = pci_irq++;
+- /*
+- * Don't assign IRQ used by ACPI SCI
+- */
+- if (gsi == acpi_gbl_FADT.sci_interrupt)
+- gsi = pci_irq++;
+- gsi_to_irq[irq] = gsi;
+- } else {
+- printk(KERN_ERR "GSI %u is too high\n", gsi);
+- return gsi;
+- }
+- }
+-
+- io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
+- triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
+- polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
+- return gsi;
-}
-
--#define DO_VM86_ERROR(trapnr, signr, str, name) \
--void do_##name(struct pt_regs * regs, long error_code) \
--{ \
-- if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
-- == NOTIFY_STOP) \
-- return; \
-- do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \
--}
+-#endif /* CONFIG_X86_IO_APIC */
+-#endif /* CONFIG_ACPI */
+--- sle11-2009-05-14.orig/arch/x86/kernel/mpparse_64-xen.c 2009-03-16 16:33:40.000000000 +0100
++++ /dev/null 1970-01-01 00:00:00.000000000 +0000
+@@ -1,879 +0,0 @@
+-/*
+- * Intel Multiprocessor Specification 1.1 and 1.4
+- * compliant MP-table parsing routines.
+- *
+- * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
+- * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
+- *
+- * Fixes
+- * Erich Boleyn : MP v1.4 and additional changes.
+- * Alan Cox : Added EBDA scanning
+- * Ingo Molnar : various cleanups and rewrites
+- * Maciej W. Rozycki: Bits for default MP configurations
+- * Paul Diefenbaugh: Added full ACPI support
+- */
+-
+-#include <linux/mm.h>
+-#include <linux/init.h>
+-#include <linux/delay.h>
+-#include <linux/bootmem.h>
+-#include <linux/kernel_stat.h>
+-#include <linux/mc146818rtc.h>
+-#include <linux/acpi.h>
+-#include <linux/module.h>
+-
+-#include <asm/smp.h>
+-#include <asm/mtrr.h>
+-#include <asm/mpspec.h>
+-#include <asm/pgalloc.h>
+-#include <asm/io_apic.h>
+-#include <asm/proto.h>
+-#include <asm/acpi.h>
+-
+-/* Have we found an MP table */
+-int smp_found_config;
+-
+-/*
+- * Various Linux-internal data structures created from the
+- * MP-table.
+- */
+-DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
+-int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
+-
+-static int mp_current_pci_id = 0;
+-/* I/O APIC entries */
+-struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
+-
+-/* # of MP IRQ source entries */
+-struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
-
--#define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
--void do_##name(struct pt_regs * regs, long error_code) \
--{ \
-- siginfo_t info; \
-- info.si_signo = signr; \
-- info.si_errno = 0; \
-- info.si_code = sicode; \
-- info.si_addr = (void __user *)siaddr; \
-- trace_hardirqs_fixup(); \
-- if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
-- == NOTIFY_STOP) \
-- return; \
-- do_trap(trapnr, signr, str, 1, regs, error_code, &info); \
-+#define DO_ERROR(trapnr, signr, str, name) \
-+void do_##name(struct pt_regs *regs, long error_code) \
-+{ \
-+ trace_hardirqs_fixup(); \
-+ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
-+ == NOTIFY_STOP) \
-+ return; \
-+ do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \
-+}
-+
-+#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr, irq) \
-+void do_##name(struct pt_regs *regs, long error_code) \
-+{ \
-+ siginfo_t info; \
-+ if (irq) \
-+ local_irq_enable(); \
-+ info.si_signo = signr; \
-+ info.si_errno = 0; \
-+ info.si_code = sicode; \
-+ info.si_addr = (void __user *)siaddr; \
-+ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
-+ == NOTIFY_STOP) \
-+ return; \
-+ do_trap(trapnr, signr, str, 0, regs, error_code, &info); \
-+}
-+
-+#define DO_VM86_ERROR(trapnr, signr, str, name) \
-+void do_##name(struct pt_regs *regs, long error_code) \
-+{ \
-+ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
-+ == NOTIFY_STOP) \
-+ return; \
-+ do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \
-+}
-+
-+#define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
-+void do_##name(struct pt_regs *regs, long error_code) \
-+{ \
-+ siginfo_t info; \
-+ info.si_signo = signr; \
-+ info.si_errno = 0; \
-+ info.si_code = sicode; \
-+ info.si_addr = (void __user *)siaddr; \
-+ trace_hardirqs_fixup(); \
-+ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
-+ == NOTIFY_STOP) \
-+ return; \
-+ do_trap(trapnr, signr, str, 1, regs, error_code, &info); \
- }
-
--DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
-+DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
- #ifndef CONFIG_KPROBES
--DO_VM86_ERROR( 3, SIGTRAP, "int3", int3)
-+DO_VM86_ERROR(3, SIGTRAP, "int3", int3)
- #endif
--DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow)
--DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds)
--DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0)
--DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
-+DO_VM86_ERROR(4, SIGSEGV, "overflow", overflow)
-+DO_VM86_ERROR(5, SIGSEGV, "bounds", bounds)
-+DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0)
-+DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
- DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
- DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
- DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
- DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0)
--DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0, 1)
-+DO_ERROR_INFO(32, SIGILL, "iret exception", iret_error, ILL_BADSTK, 0, 1)
-
- void __kprobes do_general_protection(struct pt_regs * regs,
- long error_code)
- {
-- if (regs->flags & VM_MASK)
-+ struct thread_struct *thread;
-+
-+ thread = ¤t->thread;
-+
-+ if (regs->flags & X86_VM_MASK)
- goto gp_in_vm86;
-
- if (!user_mode(regs))
-@@ -613,6 +628,7 @@ void __kprobes do_general_protection(str
-
- current->thread.error_code = error_code;
- current->thread.trap_no = 13;
-+
- if (show_unhandled_signals && unhandled_signal(current, SIGSEGV) &&
- printk_ratelimit()) {
- printk(KERN_INFO
-@@ -642,22 +658,25 @@ gp_in_kernel:
- }
- }
-
--static __kprobes void
--mem_parity_error(unsigned char reason, struct pt_regs * regs)
-+static notrace __kprobes void
-+mem_parity_error(unsigned char reason, struct pt_regs *regs)
- {
-- printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on "
-- "CPU %d.\n", reason, smp_processor_id());
-- printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
-+ printk(KERN_EMERG
-+ "Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
-+ reason, smp_processor_id());
-+
-+ printk(KERN_EMERG
-+ "You have some hardware problem, likely on the PCI bus.\n");
-
- #if defined(CONFIG_EDAC)
-- if(edac_handler_set()) {
-+ if (edac_handler_set()) {
- edac_atomic_assert_error();
- return;
- }
- #endif
-
- if (panic_on_unrecovered_nmi)
-- panic("NMI: Not continuing");
-+ panic("NMI: Not continuing");
-
- printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
-
-@@ -665,8 +684,8 @@ mem_parity_error(unsigned char reason, s
- clear_mem_error(reason);
- }
-
--static __kprobes void
--io_check_error(unsigned char reason, struct pt_regs * regs)
-+static notrace __kprobes void
-+io_check_error(unsigned char reason, struct pt_regs *regs)
- {
- printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n");
- show_registers(regs);
-@@ -675,38 +694,43 @@ io_check_error(unsigned char reason, str
- clear_io_check_error(reason);
- }
-
--static __kprobes void
--unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
-+static notrace __kprobes void
-+unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
- {
-+ if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
-+ return;
- #ifdef CONFIG_MCA
-- /* Might actually be able to figure out what the guilty party
-- * is. */
-- if( MCA_bus ) {
-+ /*
-+ * Might actually be able to figure out what the guilty party
-+ * is:
-+ */
-+ if (MCA_bus) {
- mca_handle_nmi();
- return;
- }
- #endif
-- printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on "
-- "CPU %d.\n", reason, smp_processor_id());
-+ printk(KERN_EMERG
-+ "Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
-+ reason, smp_processor_id());
-+
- printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n");
- if (panic_on_unrecovered_nmi)
-- panic("NMI: Not continuing");
-+ panic("NMI: Not continuing");
-
- printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
- }
-
- static DEFINE_SPINLOCK(nmi_print_lock);
-
--void __kprobes die_nmi(struct pt_regs *regs, const char *msg)
-+void notrace __kprobes die_nmi(struct pt_regs *regs, const char *msg)
- {
-- if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 2, SIGINT) ==
-- NOTIFY_STOP)
-+ if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 2, SIGINT) == NOTIFY_STOP)
- return;
-
- spin_lock(&nmi_print_lock);
- /*
- * We are in trouble anyway, lets at least try
-- * to get a message out.
-+ * to get a message out:
- */
- bust_spinlocks(1);
- printk(KERN_EMERG "%s", msg);
-@@ -717,9 +741,10 @@ void __kprobes die_nmi(struct pt_regs *r
- spin_unlock(&nmi_print_lock);
- bust_spinlocks(0);
-
-- /* If we are in kernel we are probably nested up pretty bad
-- * and might aswell get out now while we still can.
-- */
-+ /*
-+ * If we are in kernel we are probably nested up pretty bad
-+ * and might aswell get out now while we still can:
-+ */
- if (!user_mode_vm(regs)) {
- current->thread.trap_no = 2;
- crash_kexec(regs);
-@@ -728,14 +753,14 @@ void __kprobes die_nmi(struct pt_regs *r
- do_exit(SIGSEGV);
- }
-
--static __kprobes void default_do_nmi(struct pt_regs * regs)
-+static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
- {
- unsigned char reason = 0;
-
-- /* Only the BSP gets external NMIs from the system. */
-+ /* Only the BSP gets external NMIs from the system: */
- if (!smp_processor_id())
- reason = get_nmi_reason();
--
-+
- if (!(reason & 0xc0)) {
- if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
- == NOTIFY_STOP)
-@@ -748,8 +773,10 @@ static __kprobes void default_do_nmi(str
- if (nmi_watchdog_tick(regs, reason))
- return;
- if (!do_nmi_callback(regs, smp_processor_id()))
+-/* MP IRQ source entries */
+-int mp_irq_entries;
+-
+-int nr_ioapics;
+-unsigned long mp_lapic_addr = 0;
+-
+-
+-
+-/* Processor that is doing the boot up */
+-unsigned int boot_cpu_id = -1U;
+-EXPORT_SYMBOL(boot_cpu_id);
+-
+-/* Internal processor count */
+-unsigned int num_processors;
+-
+-unsigned disabled_cpus __cpuinitdata;
+-
+-/* Bitmask of physically existing CPUs */
+-physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE;
+-
+-#ifndef CONFIG_XEN
+-u16 x86_bios_cpu_apicid_init[NR_CPUS] __initdata
+- = { [0 ... NR_CPUS-1] = BAD_APICID };
+-void *x86_bios_cpu_apicid_early_ptr;
-#endif
- unknown_nmi_error(reason, regs);
-+#else
-+ unknown_nmi_error(reason, regs);
-+#endif
-
- return;
- }
-@@ -761,14 +788,14 @@ static __kprobes void default_do_nmi(str
- io_check_error(reason, regs);
- /*
- * Reassert NMI in case it became active meanwhile
-- * as it's edge-triggered.
-+ * as it's edge-triggered:
- */
- reassert_nmi();
- }
-
- static int ignore_nmis;
-
--__kprobes void do_nmi(struct pt_regs * regs, long error_code)
-+notrace __kprobes void do_nmi(struct pt_regs *regs, long error_code)
- {
- int cpu;
-
-@@ -804,9 +831,12 @@ void __kprobes do_int3(struct pt_regs *r
- if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
- == NOTIFY_STOP)
- return;
-- /* This is an interrupt gate, because kprobes wants interrupts
-- disabled. Normal trap handlers don't. */
-+ /*
-+ * This is an interrupt gate, because kprobes wants interrupts
-+ * disabled. Normal trap handlers don't.
-+ */
- restore_interrupts(regs);
-+
- do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL);
- }
- #endif
-@@ -821,7 +851,7 @@ void __kprobes do_int3(struct pt_regs *r
- * from user space. Such code must not hold kernel locks (since it
- * can equally take a page fault), therefore it is safe to call
- * force_sig_info even though that claims and releases locks.
-- *
-+ *
- * Code in ./signal.c ensures that the debug control register
- * is restored before we deliver any signal, and therefore that
- * user code runs with the correct debug control register even though
-@@ -833,10 +863,10 @@ void __kprobes do_int3(struct pt_regs *r
- * find every occurrence of the TF bit that could be saved away even
- * by user code)
- */
--void __kprobes do_debug(struct pt_regs * regs, long error_code)
-+void __kprobes do_debug(struct pt_regs *regs, long error_code)
- {
-- unsigned int condition;
- struct task_struct *tsk = current;
-+ unsigned int condition;
-
- trace_hardirqs_fixup();
-
-@@ -861,7 +891,7 @@ void __kprobes do_debug(struct pt_regs *
- goto clear_dr7;
- }
-
-- if (regs->flags & VM_MASK)
-+ if (regs->flags & X86_VM_MASK)
- goto debug_vm86;
-
- /* Save debug status register where ptrace can see it */
-@@ -884,7 +914,8 @@ void __kprobes do_debug(struct pt_regs *
- /* Ok, finally something we can handle */
- send_sigtrap(tsk, regs, error_code);
-
-- /* Disable additional traps. They'll be re-enabled when
-+ /*
-+ * Disable additional traps. They'll be re-enabled when
- * the signal is delivered.
- */
- clear_dr7:
-@@ -897,7 +928,7 @@ debug_vm86:
-
- clear_TF_reenable:
- set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
-- regs->flags &= ~TF_MASK;
-+ regs->flags &= ~X86_EFLAGS_TF;
- return;
- }
-
-@@ -908,9 +939,10 @@ clear_TF_reenable:
- */
- void math_error(void __user *ip)
- {
-- struct task_struct * task;
-+ struct task_struct *task;
-+ unsigned short cwd;
-+ unsigned short swd;
- siginfo_t info;
-- unsigned short cwd, swd;
-
- /*
- * Save the info for the exception handler and clear the error.
-@@ -936,36 +968,36 @@ void math_error(void __user *ip)
- cwd = get_fpu_cwd(task);
- swd = get_fpu_swd(task);
- switch (swd & ~cwd & 0x3f) {
-- case 0x000: /* No unmasked exception */
-- return;
-- default: /* Multiple exceptions */
-- break;
-- case 0x001: /* Invalid Op */
+-DEFINE_PER_CPU(u16, x86_bios_cpu_apicid) = BAD_APICID;
+-EXPORT_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
+-
+-
+-/*
+- * Intel MP BIOS table parsing routines:
+- */
+-
+-/*
+- * Checksum an MP configuration block.
+- */
+-
+-static int __init mpf_checksum(unsigned char *mp, int len)
+-{
+- int sum = 0;
+-
+- while (len--)
+- sum += *mp++;
+-
+- return sum & 0xFF;
+-}
+-
+-#ifndef CONFIG_XEN
+-static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
+-{
+- int cpu;
+- cpumask_t tmp_map;
+- char *bootup_cpu = "";
+-
+- if (!(m->mpc_cpuflag & CPU_ENABLED)) {
+- disabled_cpus++;
+- return;
+- }
+- if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
+- bootup_cpu = " (Bootup-CPU)";
+- boot_cpu_id = m->mpc_apicid;
+- }
+-
+- printk(KERN_INFO "Processor #%d%s\n", m->mpc_apicid, bootup_cpu);
+-
+- if (num_processors >= NR_CPUS) {
+- printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
+- " Processor ignored.\n", NR_CPUS);
+- return;
+- }
+-
+- num_processors++;
+- cpus_complement(tmp_map, cpu_present_map);
+- cpu = first_cpu(tmp_map);
+-
+- physid_set(m->mpc_apicid, phys_cpu_present_map);
+- if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
+- /*
+- * x86_bios_cpu_apicid is required to have processors listed
+- * in same order as logical cpu numbers. Hence the first
+- * entry is BSP, and so on.
+- */
+- cpu = 0;
+- }
+- /* are we being called early in kernel startup? */
+- if (x86_cpu_to_apicid_early_ptr) {
+- u16 *cpu_to_apicid = x86_cpu_to_apicid_early_ptr;
+- u16 *bios_cpu_apicid = x86_bios_cpu_apicid_early_ptr;
+-
+- cpu_to_apicid[cpu] = m->mpc_apicid;
+- bios_cpu_apicid[cpu] = m->mpc_apicid;
+- } else {
+- per_cpu(x86_cpu_to_apicid, cpu) = m->mpc_apicid;
+- per_cpu(x86_bios_cpu_apicid, cpu) = m->mpc_apicid;
+- }
+-
+- cpu_set(cpu, cpu_possible_map);
+- cpu_set(cpu, cpu_present_map);
+-}
+-#else
+-static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
+-{
+- num_processors++;
+-}
+-#endif /* CONFIG_XEN */
+-
+-static void __init MP_bus_info (struct mpc_config_bus *m)
+-{
+- char str[7];
+-
+- memcpy(str, m->mpc_bustype, 6);
+- str[6] = 0;
+- Dprintk("Bus #%d is %s\n", m->mpc_busid, str);
+-
+- if (strncmp(str, "ISA", 3) == 0) {
+- set_bit(m->mpc_busid, mp_bus_not_pci);
+- } else if (strncmp(str, "PCI", 3) == 0) {
+- clear_bit(m->mpc_busid, mp_bus_not_pci);
+- mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
+- mp_current_pci_id++;
+- } else {
+- printk(KERN_ERR "Unknown bustype %s\n", str);
+- }
+-}
+-
+-static int bad_ioapic(unsigned long address)
+-{
+- if (nr_ioapics >= MAX_IO_APICS) {
+- printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
+- "(found %d)\n", MAX_IO_APICS, nr_ioapics);
+- panic("Recompile kernel with bigger MAX_IO_APICS!\n");
+- }
+- if (!address) {
+- printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
+- " found in table, skipping!\n");
+- return 1;
+- }
+- return 0;
+-}
+-
+-static void __init MP_ioapic_info (struct mpc_config_ioapic *m)
+-{
+- if (!(m->mpc_flags & MPC_APIC_USABLE))
+- return;
+-
+- printk("I/O APIC #%d at 0x%X.\n",
+- m->mpc_apicid, m->mpc_apicaddr);
+-
+- if (bad_ioapic(m->mpc_apicaddr))
+- return;
+-
+- mp_ioapics[nr_ioapics] = *m;
+- nr_ioapics++;
+-}
+-
+-static void __init MP_intsrc_info (struct mpc_config_intsrc *m)
+-{
+- mp_irqs [mp_irq_entries] = *m;
+- Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
+- " IRQ %02x, APIC ID %x, APIC INT %02x\n",
+- m->mpc_irqtype, m->mpc_irqflag & 3,
+- (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
+- m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
+- if (++mp_irq_entries >= MAX_IRQ_SOURCES)
+- panic("Max # of irq sources exceeded!!\n");
+-}
+-
+-static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m)
+-{
+- Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
+- " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
+- m->mpc_irqtype, m->mpc_irqflag & 3,
+- (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid,
+- m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
+-}
+-
+-/*
+- * Read/parse the MPC
+- */
+-
+-static int __init smp_read_mpc(struct mp_config_table *mpc)
+-{
+- char str[16];
+- int count=sizeof(*mpc);
+- unsigned char *mpt=((unsigned char *)mpc)+count;
+-
+- if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) {
+- printk("MPTABLE: bad signature [%c%c%c%c]!\n",
+- mpc->mpc_signature[0],
+- mpc->mpc_signature[1],
+- mpc->mpc_signature[2],
+- mpc->mpc_signature[3]);
+- return 0;
+- }
+- if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) {
+- printk("MPTABLE: checksum error!\n");
+- return 0;
+- }
+- if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) {
+- printk(KERN_ERR "MPTABLE: bad table version (%d)!!\n",
+- mpc->mpc_spec);
+- return 0;
+- }
+- if (!mpc->mpc_lapic) {
+- printk(KERN_ERR "MPTABLE: null local APIC address!\n");
+- return 0;
+- }
+- memcpy(str,mpc->mpc_oem,8);
+- str[8] = 0;
+- printk(KERN_INFO "MPTABLE: OEM ID: %s ",str);
+-
+- memcpy(str,mpc->mpc_productid,12);
+- str[12] = 0;
+- printk("MPTABLE: Product ID: %s ",str);
+-
+- printk("MPTABLE: APIC at: 0x%X\n",mpc->mpc_lapic);
+-
+- /* save the local APIC address, it might be non-default */
+- if (!acpi_lapic)
+- mp_lapic_addr = mpc->mpc_lapic;
+-
+- /*
+- * Now process the configuration blocks.
+- */
+- while (count < mpc->mpc_length) {
+- switch(*mpt) {
+- case MP_PROCESSOR:
+- {
+- struct mpc_config_processor *m=
+- (struct mpc_config_processor *)mpt;
+- if (!acpi_lapic)
+- MP_processor_info(m);
+- mpt += sizeof(*m);
+- count += sizeof(*m);
+- break;
+- }
+- case MP_BUS:
+- {
+- struct mpc_config_bus *m=
+- (struct mpc_config_bus *)mpt;
+- MP_bus_info(m);
+- mpt += sizeof(*m);
+- count += sizeof(*m);
+- break;
+- }
+- case MP_IOAPIC:
+- {
+- struct mpc_config_ioapic *m=
+- (struct mpc_config_ioapic *)mpt;
+- MP_ioapic_info(m);
+- mpt += sizeof(*m);
+- count += sizeof(*m);
+- break;
+- }
+- case MP_INTSRC:
+- {
+- struct mpc_config_intsrc *m=
+- (struct mpc_config_intsrc *)mpt;
+-
+- MP_intsrc_info(m);
+- mpt += sizeof(*m);
+- count += sizeof(*m);
+- break;
+- }
+- case MP_LINTSRC:
+- {
+- struct mpc_config_lintsrc *m=
+- (struct mpc_config_lintsrc *)mpt;
+- MP_lintsrc_info(m);
+- mpt += sizeof(*m);
+- count += sizeof(*m);
+- break;
+- }
+- }
+- }
+- setup_apic_routing();
+- if (!num_processors)
+- printk(KERN_ERR "MPTABLE: no processors registered!\n");
+- return num_processors;
+-}
+-
+-static int __init ELCR_trigger(unsigned int irq)
+-{
+- unsigned int port;
+-
+- port = 0x4d0 + (irq >> 3);
+- return (inb(port) >> (irq & 7)) & 1;
+-}
+-
+-static void __init construct_default_ioirq_mptable(int mpc_default_type)
+-{
+- struct mpc_config_intsrc intsrc;
+- int i;
+- int ELCR_fallback = 0;
+-
+- intsrc.mpc_type = MP_INTSRC;
+- intsrc.mpc_irqflag = 0; /* conforming */
+- intsrc.mpc_srcbus = 0;
+- intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
+-
+- intsrc.mpc_irqtype = mp_INT;
+-
+- /*
+- * If true, we have an ISA/PCI system with no IRQ entries
+- * in the MP table. To prevent the PCI interrupts from being set up
+- * incorrectly, we try to use the ELCR. The sanity check to see if
+- * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can
+- * never be level sensitive, so we simply see if the ELCR agrees.
+- * If it does, we assume it's valid.
+- */
+- if (mpc_default_type == 5) {
+- printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n");
+-
+- if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13))
+- printk(KERN_ERR "ELCR contains invalid data... not using ELCR\n");
+- else {
+- printk(KERN_INFO "Using ELCR to identify PCI interrupts\n");
+- ELCR_fallback = 1;
+- }
+- }
+-
+- for (i = 0; i < 16; i++) {
+- switch (mpc_default_type) {
+- case 2:
+- if (i == 0 || i == 13)
+- continue; /* IRQ0 & IRQ13 not connected */
+- /* fall through */
+- default:
+- if (i == 2)
+- continue; /* IRQ2 is never connected */
+- }
+-
+- if (ELCR_fallback) {
- /*
-- * swd & 0x240 == 0x040: Stack Underflow
-- * swd & 0x240 == 0x240: Stack Overflow
-- * User must clear the SF bit (0x40) if set
+- * If the ELCR indicates a level-sensitive interrupt, we
+- * copy that information over to the MP table in the
+- * irqflag field (level sensitive, active high polarity).
- */
-- info.si_code = FPE_FLTINV;
-- break;
-- case 0x002: /* Denormalize */
-- case 0x010: /* Underflow */
-- info.si_code = FPE_FLTUND;
-- break;
-- case 0x004: /* Zero Divide */
-- info.si_code = FPE_FLTDIV;
-- break;
-- case 0x008: /* Overflow */
-- info.si_code = FPE_FLTOVF;
-- break;
-- case 0x020: /* Precision */
-- info.si_code = FPE_FLTRES;
-- break;
-+ case 0x000: /* No unmasked exception */
-+ return;
-+ default: /* Multiple exceptions */
-+ break;
-+ case 0x001: /* Invalid Op */
-+ /*
-+ * swd & 0x240 == 0x040: Stack Underflow
-+ * swd & 0x240 == 0x240: Stack Overflow
-+ * User must clear the SF bit (0x40) if set
-+ */
-+ info.si_code = FPE_FLTINV;
-+ break;
-+ case 0x002: /* Denormalize */
-+ case 0x010: /* Underflow */
-+ info.si_code = FPE_FLTUND;
-+ break;
-+ case 0x004: /* Zero Divide */
-+ info.si_code = FPE_FLTDIV;
-+ break;
-+ case 0x008: /* Overflow */
-+ info.si_code = FPE_FLTOVF;
-+ break;
-+ case 0x020: /* Precision */
-+ info.si_code = FPE_FLTRES;
-+ break;
- }
- force_sig_info(SIGFPE, &info, task);
- }
-
--void do_coprocessor_error(struct pt_regs * regs, long error_code)
-+void do_coprocessor_error(struct pt_regs *regs, long error_code)
- {
- ignore_fpu_irq = 1;
- math_error((void __user *)regs->ip);
-@@ -973,9 +1005,9 @@ void do_coprocessor_error(struct pt_regs
-
- static void simd_math_error(void __user *ip)
- {
-- struct task_struct * task;
-- siginfo_t info;
-+ struct task_struct *task;
- unsigned short mxcsr;
-+ siginfo_t info;
-
- /*
- * Save the info for the exception handler and clear the error.
-@@ -996,84 +1028,82 @@ static void simd_math_error(void __user
- */
- mxcsr = get_fpu_mxcsr(task);
- switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
-- case 0x000:
+- if (ELCR_trigger(i))
+- intsrc.mpc_irqflag = 13;
+- else
+- intsrc.mpc_irqflag = 0;
+- }
+-
+- intsrc.mpc_srcbusirq = i;
+- intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */
+- MP_intsrc_info(&intsrc);
+- }
+-
+- intsrc.mpc_irqtype = mp_ExtINT;
+- intsrc.mpc_srcbusirq = 0;
+- intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */
+- MP_intsrc_info(&intsrc);
+-}
+-
+-static inline void __init construct_default_ISA_mptable(int mpc_default_type)
+-{
+- struct mpc_config_processor processor;
+- struct mpc_config_bus bus;
+- struct mpc_config_ioapic ioapic;
+- struct mpc_config_lintsrc lintsrc;
+- int linttypes[2] = { mp_ExtINT, mp_NMI };
+- int i;
+-
+- /*
+- * local APIC has default address
+- */
+- mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
+-
+- /*
+- * 2 CPUs, numbered 0 & 1.
+- */
+- processor.mpc_type = MP_PROCESSOR;
+- processor.mpc_apicver = 0;
+- processor.mpc_cpuflag = CPU_ENABLED;
+- processor.mpc_cpufeature = 0;
+- processor.mpc_featureflag = 0;
+- processor.mpc_reserved[0] = 0;
+- processor.mpc_reserved[1] = 0;
+- for (i = 0; i < 2; i++) {
+- processor.mpc_apicid = i;
+- MP_processor_info(&processor);
+- }
+-
+- bus.mpc_type = MP_BUS;
+- bus.mpc_busid = 0;
+- switch (mpc_default_type) {
- default:
+- printk(KERN_ERR "???\nUnknown standard configuration %d\n",
+- mpc_default_type);
+- /* fall through */
+- case 1:
+- case 5:
+- memcpy(bus.mpc_bustype, "ISA ", 6);
- break;
-- case 0x001: /* Invalid Op */
-- info.si_code = FPE_FLTINV;
-- break;
-- case 0x002: /* Denormalize */
-- case 0x010: /* Underflow */
-- info.si_code = FPE_FLTUND;
-- break;
-- case 0x004: /* Zero Divide */
-- info.si_code = FPE_FLTDIV;
-- break;
-- case 0x008: /* Overflow */
-- info.si_code = FPE_FLTOVF;
-- break;
-- case 0x020: /* Precision */
-- info.si_code = FPE_FLTRES;
-- break;
-+ case 0x000:
-+ default:
-+ break;
-+ case 0x001: /* Invalid Op */
-+ info.si_code = FPE_FLTINV;
-+ break;
-+ case 0x002: /* Denormalize */
-+ case 0x010: /* Underflow */
-+ info.si_code = FPE_FLTUND;
-+ break;
-+ case 0x004: /* Zero Divide */
-+ info.si_code = FPE_FLTDIV;
-+ break;
-+ case 0x008: /* Overflow */
-+ info.si_code = FPE_FLTOVF;
-+ break;
-+ case 0x020: /* Precision */
-+ info.si_code = FPE_FLTRES;
-+ break;
- }
- force_sig_info(SIGFPE, &info, task);
- }
-
--void do_simd_coprocessor_error(struct pt_regs * regs,
-- long error_code)
-+void do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
- {
- if (cpu_has_xmm) {
- /* Handle SIMD FPU exceptions on PIII+ processors. */
- ignore_fpu_irq = 1;
- simd_math_error((void __user *)regs->ip);
-- } else {
-- /*
-- * Handle strange cache flush from user space exception
-- * in all other cases. This is undocumented behaviour.
-- */
-- if (regs->flags & VM_MASK) {
-- handle_vm86_fault((struct kernel_vm86_regs *)regs,
-- error_code);
-- return;
-- }
-- current->thread.trap_no = 19;
-- current->thread.error_code = error_code;
-- die_if_kernel("cache flush denied", regs, error_code);
-- force_sig(SIGSEGV, current);
-+ return;
-+ }
-+ /*
-+ * Handle strange cache flush from user space exception
-+ * in all other cases. This is undocumented behaviour.
-+ */
-+ if (regs->flags & X86_VM_MASK) {
-+ handle_vm86_fault((struct kernel_vm86_regs *)regs, error_code);
-+ return;
- }
-+ current->thread.trap_no = 19;
-+ current->thread.error_code = error_code;
-+ die_if_kernel("cache flush denied", regs, error_code);
-+ force_sig(SIGSEGV, current);
- }
-
- #ifndef CONFIG_XEN
--void do_spurious_interrupt_bug(struct pt_regs * regs,
-- long error_code)
-+void do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
- {
- #if 0
- /* No need to warn about this any longer. */
-- printk("Ignoring P6 Local APIC Spurious Interrupt Bug...\n");
-+ printk(KERN_INFO "Ignoring P6 Local APIC Spurious Interrupt Bug...\n");
- #endif
- }
-
--unsigned long patch_espfix_desc(unsigned long uesp,
-- unsigned long kesp)
-+unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp)
- {
- struct desc_struct *gdt = __get_cpu_var(gdt_page).gdt;
- unsigned long base = (kesp - uesp) & -THREAD_SIZE;
- unsigned long new_kesp = kesp - base;
- unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT;
- __u64 desc = *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS];
-+
- /* Set up base for espfix segment */
-- desc &= 0x00f0ff0000000000ULL;
-- desc |= ((((__u64)base) << 16) & 0x000000ffffff0000ULL) |
-+ desc &= 0x00f0ff0000000000ULL;
-+ desc |= ((((__u64)base) << 16) & 0x000000ffffff0000ULL) |
- ((((__u64)base) << 32) & 0xff00000000000000ULL) |
- ((((__u64)lim_pages) << 32) & 0x000f000000000000ULL) |
- (lim_pages & 0xffff);
- *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS] = desc;
-+
- return new_kesp;
- }
- #endif
-
- /*
-- * 'math_state_restore()' saves the current math information in the
-+ * 'math_state_restore()' saves the current math information in the
- * old math state array, and gets the new ones from the current task
- *
- * Careful.. There are problems with IBM-designed IRQ13 behaviour.
-@@ -1087,9 +1117,22 @@ asmlinkage void math_state_restore(void)
- struct thread_info *thread = current_thread_info();
- struct task_struct *tsk = thread->task;
-
-+ if (!tsk_used_math(tsk)) {
-+ local_irq_enable();
-+ /*
-+ * does a slab alloc which can sleep
-+ */
-+ if (init_fpu(tsk)) {
-+ /*
-+ * ran out of memory!
-+ */
-+ do_group_exit(SIGKILL);
-+ return;
-+ }
-+ local_irq_disable();
-+ }
-+
- /* NB. 'clts' is done for us by Xen during virtual trap. */
-- if (!tsk_used_math(tsk))
-- init_fpu(tsk);
- restore_fpu(tsk);
- thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */
- tsk->fpu_counter++;
-@@ -1100,15 +1143,15 @@ EXPORT_SYMBOL_GPL(math_state_restore);
-
- asmlinkage void math_emulate(long arg)
- {
-- printk(KERN_EMERG "math-emulation not enabled and no coprocessor found.\n");
-- printk(KERN_EMERG "killing %s.\n",current->comm);
-- force_sig(SIGFPE,current);
-+ printk(KERN_EMERG
-+ "math-emulation not enabled and no coprocessor found.\n");
-+ printk(KERN_EMERG "killing %s.\n", current->comm);
-+ force_sig(SIGFPE, current);
- schedule();
- }
-
- #endif /* CONFIG_MATH_EMULATION */
-
+- }
+- MP_bus_info(&bus);
+- if (mpc_default_type > 4) {
+- bus.mpc_busid = 1;
+- memcpy(bus.mpc_bustype, "PCI ", 6);
+- MP_bus_info(&bus);
+- }
+-
+- ioapic.mpc_type = MP_IOAPIC;
+- ioapic.mpc_apicid = 2;
+- ioapic.mpc_apicver = 0;
+- ioapic.mpc_flags = MPC_APIC_USABLE;
+- ioapic.mpc_apicaddr = 0xFEC00000;
+- MP_ioapic_info(&ioapic);
+-
+- /*
+- * We set up most of the low 16 IO-APIC pins according to MPS rules.
+- */
+- construct_default_ioirq_mptable(mpc_default_type);
+-
+- lintsrc.mpc_type = MP_LINTSRC;
+- lintsrc.mpc_irqflag = 0; /* conforming */
+- lintsrc.mpc_srcbusid = 0;
+- lintsrc.mpc_srcbusirq = 0;
+- lintsrc.mpc_destapic = MP_APIC_ALL;
+- for (i = 0; i < 2; i++) {
+- lintsrc.mpc_irqtype = linttypes[i];
+- lintsrc.mpc_destapiclint = i;
+- MP_lintsrc_info(&lintsrc);
+- }
+-}
+-
+-static struct intel_mp_floating *mpf_found;
+-
+-/*
+- * Scan the memory blocks for an SMP configuration block.
+- */
+-void __init get_smp_config (void)
+-{
+- struct intel_mp_floating *mpf = mpf_found;
+-
+- /*
+- * ACPI supports both logical (e.g. Hyper-Threading) and physical
+- * processors, where MPS only supports physical.
+- */
+- if (acpi_lapic && acpi_ioapic) {
+- printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n");
+- return;
+- }
+- else if (acpi_lapic)
+- printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n");
+-
+- printk("Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification);
+-
+- /*
+- * Now see if we need to read further.
+- */
+- if (mpf->mpf_feature1 != 0) {
+-
+- printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1);
+- construct_default_ISA_mptable(mpf->mpf_feature1);
+-
+- } else if (mpf->mpf_physptr) {
+-
+- /*
+- * Read the physical hardware table. Anything here will
+- * override the defaults.
+- */
+- if (!smp_read_mpc(isa_bus_to_virt(mpf->mpf_physptr))) {
+- smp_found_config = 0;
+- printk(KERN_ERR "BIOS bug, MP table errors detected!...\n");
+- printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n");
+- return;
+- }
+- /*
+- * If there are no explicit MP IRQ entries, then we are
+- * broken. We set up most of the low 16 IO-APIC pins to
+- * ISA defaults and hope it will work.
+- */
+- if (!mp_irq_entries) {
+- struct mpc_config_bus bus;
+-
+- printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n");
+-
+- bus.mpc_type = MP_BUS;
+- bus.mpc_busid = 0;
+- memcpy(bus.mpc_bustype, "ISA ", 6);
+- MP_bus_info(&bus);
+-
+- construct_default_ioirq_mptable(0);
+- }
+-
+- } else
+- BUG();
+-
+- printk(KERN_INFO "Processors: %d\n", num_processors);
+- /*
+- * Only use the first configuration found.
+- */
+-}
+-
+-static int __init smp_scan_config (unsigned long base, unsigned long length)
+-{
+- extern void __bad_mpf_size(void);
+- unsigned int *bp = isa_bus_to_virt(base);
+- struct intel_mp_floating *mpf;
+-
+- Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length);
+- if (sizeof(*mpf) != 16)
+- __bad_mpf_size();
+-
+- while (length > 0) {
+- mpf = (struct intel_mp_floating *)bp;
+- if ((*bp == SMP_MAGIC_IDENT) &&
+- (mpf->mpf_length == 1) &&
+- !mpf_checksum((unsigned char *)bp, 16) &&
+- ((mpf->mpf_specification == 1)
+- || (mpf->mpf_specification == 4)) ) {
+-
+- smp_found_config = 1;
+- mpf_found = mpf;
+- return 1;
+- }
+- bp += 4;
+- length -= 16;
+- }
+- return 0;
+-}
+-
+-void __init find_smp_config(void)
+-{
+- unsigned int address;
+-
+- /*
+- * FIXME: Linux assumes you have 640K of base ram..
+- * this continues the error...
+- *
+- * 1) Scan the bottom 1K for a signature
+- * 2) Scan the top 1K of base RAM
+- * 3) Scan the 64K of bios
+- */
+- if (smp_scan_config(0x0,0x400) ||
+- smp_scan_config(639*0x400,0x400) ||
+- smp_scan_config(0xF0000,0x10000))
+- return;
+- /*
+- * If it is an SMP machine we should know now.
+- *
+- * there is a real-mode segmented pointer pointing to the
+- * 4K EBDA area at 0x40E, calculate and scan it here.
+- *
+- * NOTE! There are Linux loaders that will corrupt the EBDA
+- * area, and as such this kind of SMP config may be less
+- * trustworthy, simply because the SMP table may have been
+- * stomped on during early boot. These loaders are buggy and
+- * should be fixed.
+- */
+-
+- address = *(unsigned short *)phys_to_virt(0x40E);
+- address <<= 4;
+- if (smp_scan_config(address, 0x1000))
+- return;
+-
+- /* If we have come this far, we did not find an MP table */
+- printk(KERN_INFO "No mptable found.\n");
+-}
+-
+-/* --------------------------------------------------------------------------
+- ACPI-based MP Configuration
+- -------------------------------------------------------------------------- */
+-
+-#ifdef CONFIG_ACPI
+-
+-void __init mp_register_lapic_address(u64 address)
+-{
+-#ifndef CONFIG_XEN
+- mp_lapic_addr = (unsigned long) address;
+- set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
+- if (boot_cpu_id == -1U)
+- boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID));
+-#endif
+-}
+-
+-void __cpuinit mp_register_lapic (u8 id, u8 enabled)
+-{
+- struct mpc_config_processor processor;
+- int boot_cpu = 0;
+-
+- if (id == boot_cpu_id)
+- boot_cpu = 1;
+-
+-#ifndef CONFIG_XEN
+- processor.mpc_type = MP_PROCESSOR;
+- processor.mpc_apicid = id;
+- processor.mpc_apicver = 0;
+- processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0);
+- processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0);
+- processor.mpc_cpufeature = 0;
+- processor.mpc_featureflag = 0;
+- processor.mpc_reserved[0] = 0;
+- processor.mpc_reserved[1] = 0;
+-#endif
+-
+- MP_processor_info(&processor);
+-}
+-
+-#define MP_ISA_BUS 0
+-#define MP_MAX_IOAPIC_PIN 127
+-
+-static struct mp_ioapic_routing {
+- int apic_id;
+- int gsi_start;
+- int gsi_end;
+- u32 pin_programmed[4];
+-} mp_ioapic_routing[MAX_IO_APICS];
+-
+-static int mp_find_ioapic(int gsi)
+-{
+- int i = 0;
+-
+- /* Find the IOAPIC that manages this GSI. */
+- for (i = 0; i < nr_ioapics; i++) {
+- if ((gsi >= mp_ioapic_routing[i].gsi_start)
+- && (gsi <= mp_ioapic_routing[i].gsi_end))
+- return i;
+- }
+-
+- printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
+- return -1;
+-}
+-
+-static u8 uniq_ioapic_id(u8 id)
+-{
+- int i;
+- DECLARE_BITMAP(used, 256);
+- bitmap_zero(used, 256);
+- for (i = 0; i < nr_ioapics; i++) {
+- struct mpc_config_ioapic *ia = &mp_ioapics[i];
+- __set_bit(ia->mpc_apicid, used);
+- }
+- if (!test_bit(id, used))
+- return id;
+- return find_first_zero_bit(used, 256);
+-}
+-
+-void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base)
+-{
+- int idx = 0;
+-
+- if (bad_ioapic(address))
+- return;
+-
+- idx = nr_ioapics;
+-
+- mp_ioapics[idx].mpc_type = MP_IOAPIC;
+- mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
+- mp_ioapics[idx].mpc_apicaddr = address;
+-
+-#ifndef CONFIG_XEN
+- set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
+-#endif
+- mp_ioapics[idx].mpc_apicid = uniq_ioapic_id(id);
+- mp_ioapics[idx].mpc_apicver = 0;
+-
+- /*
+- * Build basic IRQ lookup table to facilitate gsi->io_apic lookups
+- * and to prevent reprogramming of IOAPIC pins (PCI IRQs).
+- */
+- mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
+- mp_ioapic_routing[idx].gsi_start = gsi_base;
+- mp_ioapic_routing[idx].gsi_end = gsi_base +
+- io_apic_get_redir_entries(idx);
+-
+- printk(KERN_INFO "IOAPIC[%d]: apic_id %d, address 0x%x, "
+- "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
+- mp_ioapics[idx].mpc_apicaddr,
+- mp_ioapic_routing[idx].gsi_start,
+- mp_ioapic_routing[idx].gsi_end);
+-
+- nr_ioapics++;
+-}
+-
+-void __init
+-mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
+-{
+- struct mpc_config_intsrc intsrc;
+- int ioapic = -1;
+- int pin = -1;
+-
+- /*
+- * Convert 'gsi' to 'ioapic.pin'.
+- */
+- ioapic = mp_find_ioapic(gsi);
+- if (ioapic < 0)
+- return;
+- pin = gsi - mp_ioapic_routing[ioapic].gsi_start;
-
- /*
- * NB. All these are "trap gates" (i.e. events_mask isn't set) except
- * for those that specify <dpl>|4 in the second field.
-@@ -1146,25 +1189,21 @@ void __init trap_init(void)
- if (ret)
- printk("HYPERVISOR_set_trap_table failed: error %d\n", ret);
-
- /*
-- * Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned.
-- * Generate a build-time error if the alignment is wrong.
+- * TBD: This check is for faulty timer entries, where the override
+- * erroneously sets the trigger to level, resulting in a HUGE
+- * increase of timer interrupts!
- */
-- BUILD_BUG_ON(offsetof(struct task_struct, thread.i387.fxsave) & 15);
- if (cpu_has_fxsr) {
- printk(KERN_INFO "Enabling fast FPU save and restore... ");
- set_in_cr4(X86_CR4_OSFXSR);
- printk("done.\n");
- }
- if (cpu_has_xmm) {
-- printk(KERN_INFO "Enabling unmasked SIMD FPU exception "
-- "support... ");
-+ printk(KERN_INFO
-+ "Enabling unmasked SIMD FPU exception support... ");
- set_in_cr4(X86_CR4_OSXMMEXCPT);
- printk("done.\n");
- }
-
-+ init_thread_xstate();
- /*
-- * Should be a barrier for any external CPU state.
-+ * Should be a barrier for any external CPU state:
- */
- cpu_init();
- }
-@@ -1183,6 +1222,7 @@ void __cpuinit smp_trap_init(trap_info_t
- static int __init kstack_setup(char *s)
- {
- kstack_depth_to_print = simple_strtoul(s, NULL, 0);
-+
- return 1;
- }
- __setup("kstack=", kstack_setup);
---- a/arch/x86/kernel/traps_64-xen.c
-+++ b/arch/x86/kernel/traps_64-xen.c
-@@ -33,6 +33,8 @@
- #include <linux/kdebug.h>
- #include <linux/utsname.h>
-
-+#include <mach_traps.h>
-+
- #if defined(CONFIG_EDAC)
- #include <linux/edac.h>
- #endif
-@@ -601,10 +603,16 @@ void die(const char * str, struct pt_reg
- }
-
- #if defined(CONFIG_X86_LOCAL_APIC) || defined(CONFIG_SYSCTL)
--void __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic)
-+notrace __kprobes void
-+die_nmi(char *str, struct pt_regs *regs, int do_panic)
- {
-- unsigned long flags = oops_begin();
-+ unsigned long flags;
-+
-+ if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) ==
-+ NOTIFY_STOP)
-+ return;
-
-+ flags = oops_begin();
- /*
- * We are in trouble anyway, lets at least try
- * to get a message out.
-@@ -769,7 +777,7 @@ asmlinkage void __kprobes do_general_pro
- die("general protection fault", regs, error_code);
- }
-
--static __kprobes void
-+static notrace __kprobes void
- mem_parity_error(unsigned char reason, struct pt_regs * regs)
- {
- printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
-@@ -792,7 +800,7 @@ mem_parity_error(unsigned char reason, s
- clear_mem_error(reason);
- }
-
--static __kprobes void
-+static notrace __kprobes void
- io_check_error(unsigned char reason, struct pt_regs * regs)
- {
- printk("NMI: IOCK error (debug interrupt?)\n");
-@@ -802,9 +810,11 @@ io_check_error(unsigned char reason, str
- clear_io_check_error(reason);
- }
-
--static __kprobes void
-+static notrace __kprobes void
- unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
- {
-+ if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
-+ return;
- printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
- reason);
- printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n");
-@@ -817,7 +827,7 @@ unknown_nmi_error(unsigned char reason,
+- if ((bus_irq == 0) && (trigger == 3))
+- trigger = 1;
+-
+- intsrc.mpc_type = MP_INTSRC;
+- intsrc.mpc_irqtype = mp_INT;
+- intsrc.mpc_irqflag = (trigger << 2) | polarity;
+- intsrc.mpc_srcbus = MP_ISA_BUS;
+- intsrc.mpc_srcbusirq = bus_irq; /* IRQ */
+- intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */
+- intsrc.mpc_dstirq = pin; /* INTIN# */
+-
+- Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n",
+- intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
+- (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
+- intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq);
+-
+- mp_irqs[mp_irq_entries] = intsrc;
+- if (++mp_irq_entries == MAX_IRQ_SOURCES)
+- panic("Max # of irq sources exceeded!\n");
+-}
+-
+-void __init mp_config_acpi_legacy_irqs(void)
+-{
+- struct mpc_config_intsrc intsrc;
+- int i = 0;
+- int ioapic = -1;
+-
+- /*
+- * Fabricate the legacy ISA bus (bus #31).
+- */
+- set_bit(MP_ISA_BUS, mp_bus_not_pci);
+-
+- /*
+- * Locate the IOAPIC that manages the ISA IRQs (0-15).
+- */
+- ioapic = mp_find_ioapic(0);
+- if (ioapic < 0)
+- return;
+-
+- intsrc.mpc_type = MP_INTSRC;
+- intsrc.mpc_irqflag = 0; /* Conforming */
+- intsrc.mpc_srcbus = MP_ISA_BUS;
+- intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
+-
+- /*
+- * Use the default configuration for the IRQs 0-15. Unless
+- * overridden by (MADT) interrupt source override entries.
+- */
+- for (i = 0; i < 16; i++) {
+- int idx;
+-
+- for (idx = 0; idx < mp_irq_entries; idx++) {
+- struct mpc_config_intsrc *irq = mp_irqs + idx;
+-
+- /* Do we already have a mapping for this ISA IRQ? */
+- if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i)
+- break;
+-
+- /* Do we already have a mapping for this IOAPIC pin */
+- if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
+- (irq->mpc_dstirq == i))
+- break;
+- }
+-
+- if (idx != mp_irq_entries) {
+- printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
+- continue; /* IRQ already used */
+- }
+-
+- intsrc.mpc_irqtype = mp_INT;
+- intsrc.mpc_srcbusirq = i; /* Identity mapped */
+- intsrc.mpc_dstirq = i;
+-
+- Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, "
+- "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
+- (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
+- intsrc.mpc_srcbusirq, intsrc.mpc_dstapic,
+- intsrc.mpc_dstirq);
+-
+- mp_irqs[mp_irq_entries] = intsrc;
+- if (++mp_irq_entries == MAX_IRQ_SOURCES)
+- panic("Max # of irq sources exceeded!\n");
+- }
+-}
+-
+-int mp_register_gsi(u32 gsi, int triggering, int polarity)
+-{
+- int ioapic = -1;
+- int ioapic_pin = 0;
+- int idx, bit = 0;
+-
+- if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
+- return gsi;
+-
+- /* Don't set up the ACPI SCI because it's already set up */
+- if (acpi_gbl_FADT.sci_interrupt == gsi)
+- return gsi;
+-
+- ioapic = mp_find_ioapic(gsi);
+- if (ioapic < 0) {
+- printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
+- return gsi;
+- }
+-
+- ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_start;
+-
+- /*
+- * Avoid pin reprogramming. PRTs typically include entries
+- * with redundant pin->gsi mappings (but unique PCI devices);
+- * we only program the IOAPIC on the first.
+- */
+- bit = ioapic_pin % 32;
+- idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32);
+- if (idx > 3) {
+- printk(KERN_ERR "Invalid reference to IOAPIC pin "
+- "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
+- ioapic_pin);
+- return gsi;
+- }
+- if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
+- Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
+- mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
+- return gsi;
+- }
+-
+- mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
+-
+- io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
+- triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
+- polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
+- return gsi;
+-}
+-#endif /*CONFIG_ACPI*/
+--- sle11-2009-05-14.orig/arch/x86/kernel/pci-dma-xen.c 2009-03-16 16:33:40.000000000 +0100
++++ sle11-2009-05-14/arch/x86/kernel/pci-dma-xen.c 2009-03-16 16:38:05.000000000 +0100
+@@ -1,283 +1,251 @@
+-/*
+- * Dynamic DMA mapping support.
+- *
+- * On i386 there is no hardware dynamic DMA address translation,
+- * so consistent alloc/free are merely page allocation/freeing.
+- * The rest of the dynamic DMA mapping interface is implemented
+- * in asm/pci.h.
+- */
+-
+-#include <linux/types.h>
+-#include <linux/mm.h>
+-#include <linux/string.h>
++#include <linux/dma-mapping.h>
++#include <linux/dmar.h>
++#include <linux/bootmem.h>
+ #include <linux/pci.h>
+-#include <linux/module.h>
+-#include <linux/version.h>
+-#include <asm/io.h>
+-#include <xen/balloon.h>
+-#include <xen/gnttab.h>
+-#include <asm/swiotlb.h>
+-#include <asm/tlbflush.h>
+-#include <asm/swiotlb_32.h>
+-#include <asm/gnttab_dma.h>
+-#include <asm/bug.h>
- /* Runs on IST stack. This code must keep interrupts off all the time.
- Nested NMIs are prevented by the CPU. */
--asmlinkage __kprobes void default_do_nmi(struct pt_regs *regs)
-+asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs)
- {
- unsigned char reason = 0;
- int cpu;
-@@ -1117,11 +1127,25 @@ asmlinkage void __attribute__((weak)) mc
- asmlinkage void math_state_restore(void)
- {
- struct task_struct *me = current;
-+
-+ if (!used_math()) {
-+ local_irq_enable();
-+ /*
-+ * does a slab alloc which can sleep
-+ */
-+ if (init_fpu(me)) {
-+ /*
-+ * ran out of memory!
-+ */
-+ do_group_exit(SIGKILL);
-+ return;
-+ }
-+ local_irq_disable();
-+ }
+-#ifdef __x86_64__
+-#include <asm/iommu.h>
++#include <asm/proto.h>
++#include <asm/dma.h>
++#include <asm/gart.h>
++#include <asm/calgary.h>
+
- /* clts(); */ /* 'clts' is done for us by Xen during virtual trap. */
-
-- if (!used_math())
-- init_fpu(me);
-- restore_fpu_checking(&me->thread.i387.fxsave);
-+ restore_fpu_checking(&me->thread.xstate->fxsave);
- task_thread_info(me)->status |= TS_USEDFPU;
- me->fpu_counter++;
- }
-@@ -1168,6 +1192,10 @@ void __init trap_init(void)
- printk("HYPERVISOR_set_trap_table failed: error %d\n", ret);
-
- /*
-+ * initialize the per thread extended state:
-+ */
-+ init_thread_xstate();
-+ /*
- * Should be a barrier for any external CPU state.
- */
- cpu_init();
---- a/arch/x86/kernel/vsyscall_64-xen.c
-+++ b/arch/x86/kernel/vsyscall_64-xen.c
-@@ -216,7 +216,7 @@ vgetcpu(unsigned *cpu, unsigned *node, s
- return 0;
- }
-
--long __vsyscall(3) venosys_1(void)
-+static long __vsyscall(3) venosys_1(void)
- {
- return -ENOSYS;
- }
---- a/arch/x86/mm/fault-xen.c
-+++ b/arch/x86/mm/fault-xen.c
-@@ -510,6 +510,11 @@ static int vmalloc_fault(unsigned long a
- unsigned long pgd_paddr;
- pmd_t *pmd_k;
- pte_t *pte_k;
++int forbid_dac __read_mostly;
++EXPORT_SYMBOL(forbid_dac);
+
-+ /* Make sure we are in vmalloc area */
-+ if (!(address >= VMALLOC_START && address < VMALLOC_END))
-+ return -1;
++const struct dma_mapping_ops *dma_ops;
++EXPORT_SYMBOL(dma_ops);
+
- /*
- * Synchronize this task's top level page-table
- * with the 'reference' page table.
-@@ -671,7 +676,7 @@ void __kprobes do_page_fault(struct pt_r
- #ifdef CONFIG_X86_32
- /* It's safe to allow irq's after cr2 has been saved and the vmalloc
- fault has been handled. */
-- if (regs->flags & (X86_EFLAGS_IF|VM_MASK))
-+ if (regs->flags & (X86_EFLAGS_IF | X86_VM_MASK))
- local_irq_enable();
-
- /*
-@@ -1018,9 +1023,5 @@ void vmalloc_sync_all(void)
- if (address == start)
- start = address + PGDIR_SIZE;
- }
-- /* Check that there is no need to do the same for the modules area. */
-- BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
-- BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
-- (__START_KERNEL & PGDIR_MASK)));
- #endif
- }
---- a/arch/x86/mm/highmem_32-xen.c
-+++ b/arch/x86/mm/highmem_32-xen.c
-@@ -200,6 +200,5 @@ EXPORT_SYMBOL(kmap);
- EXPORT_SYMBOL(kunmap);
- EXPORT_SYMBOL(kmap_atomic);
- EXPORT_SYMBOL(kunmap_atomic);
--EXPORT_SYMBOL(kmap_atomic_to_page);
- EXPORT_SYMBOL(clear_highpage);
- EXPORT_SYMBOL(copy_highpage);
---- a/arch/x86/mm/init_32-xen.c
-+++ b/arch/x86/mm/init_32-xen.c
-@@ -1,5 +1,4 @@
- /*
-- * linux/arch/i386/mm/init.c
- *
- * Copyright (C) 1995 Linus Torvalds
- *
-@@ -22,6 +21,7 @@
- #include <linux/init.h>
- #include <linux/highmem.h>
- #include <linux/pagemap.h>
-+#include <linux/pci.h>
- #include <linux/pfn.h>
- #include <linux/poison.h>
- #include <linux/bootmem.h>
-@@ -54,6 +54,8 @@
-
- unsigned int __VMALLOC_RESERVE = 128 << 20;
-
-+unsigned long max_pfn_mapped;
++static int iommu_sac_force __read_mostly;
+
- DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
- unsigned long highstart_pfn, highend_pfn;
-
-@@ -73,7 +75,7 @@ static pmd_t * __init one_md_table_init(
- if (!(__pgd_val(*pgd) & _PAGE_PRESENT)) {
- pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
-
-- paravirt_alloc_pd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
-+ paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
- make_lowmem_page_readonly(pmd_table, XENFEAT_writable_page_tables);
- set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
- pud = pud_offset(pgd, 0);
-@@ -107,7 +109,7 @@ static pte_t * __init one_page_table_ini
- (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE);
- }
-
-- paravirt_alloc_pt(&init_mm, __pa(page_table) >> PAGE_SHIFT);
-+ paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT);
- make_lowmem_page_readonly(page_table,
- XENFEAT_writable_page_tables);
- set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
-@@ -209,8 +211,13 @@ static void __init kernel_physical_mappi
- /*
- * Map with big pages if possible, otherwise
- * create normal page tables:
-+ *
-+ * Don't use a large page for the first 2/4MB of memory
-+ * because there are often fixed size MTRRs in there
-+ * and overlapping MTRRs into large pages can cause
-+ * slowdowns.
- */
-- if (cpu_has_pse) {
-+ if (cpu_has_pse && !(pgd_idx == 0 && pmd_idx == 0)) {
- unsigned int addr2;
- pgprot_t prot = PAGE_KERNEL_LARGE;
-
-@@ -224,6 +231,7 @@ static void __init kernel_physical_mappi
- set_pmd(pmd, pfn_pmd(pfn, prot));
++#ifdef CONFIG_IOMMU_DEBUG
++int panic_on_overflow __read_mostly = 1;
++int force_iommu __read_mostly = 1;
++#else
++int panic_on_overflow __read_mostly = 0;
++int force_iommu __read_mostly = 0;
++#endif
- pfn += PTRS_PER_PTE;
-+ max_pfn_mapped = pfn;
- continue;
- }
- pte = one_page_table_init(pmd);
-@@ -241,6 +249,7 @@ static void __init kernel_physical_mappi
+ int iommu_merge __read_mostly = 0;
+-EXPORT_SYMBOL(iommu_merge);
- set_pte(pte, pfn_pte(pfn, prot));
- }
-+ max_pfn_mapped = pfn;
- pte_ofs = 0;
- }
- pmd_idx = 0;
-@@ -262,6 +271,25 @@ static inline int page_kills_ppro(unsign
+-dma_addr_t bad_dma_address __read_mostly;
+-EXPORT_SYMBOL(bad_dma_address);
++int no_iommu __read_mostly;
++/* Set this to 1 if there is a HW IOMMU in the system */
++int iommu_detected __read_mostly = 0;
- #endif
+ /* This tells the BIO block layer to assume merging. Default to off
+ because we cannot guarantee merging later. */
+ int iommu_bio_merge __read_mostly = 0;
+ EXPORT_SYMBOL(iommu_bio_merge);
-+/*
-+ * devmem_is_allowed() checks to see if /dev/mem access to a certain address
-+ * is valid. The argument is a physical page number.
-+ *
-+ *
-+ * On x86, access has to be given to the first megabyte of ram because that area
-+ * contains bios code and data regions used by X and dosemu and similar apps.
-+ * Access has to be given to non-kernel-ram areas as well, these contain the PCI
-+ * mmio resources as well as potential bios/acpi data regions.
-+ */
-+int devmem_is_allowed(unsigned long pagenr)
-+{
-+ if (pagenr <= 256)
-+ return 1;
-+ if (mfn_to_local_pfn(pagenr) >= max_pfn)
-+ return 1;
-+ return 0;
-+}
-+
- #ifdef CONFIG_HIGHMEM
- pte_t *kmap_pte;
- pgprot_t kmap_prot;
-@@ -303,48 +331,18 @@ static void __init permanent_kmaps_init(
- pkmap_page_table = pte;
- }
+-int force_iommu __read_mostly= 0;
++dma_addr_t bad_dma_address __read_mostly = 0;
++EXPORT_SYMBOL(bad_dma_address);
--static void __meminit free_new_highpage(struct page *page, int pfn)
+-__init int iommu_setup(char *p)
-{
-- init_page_count(page);
-- if (pfn < xen_start_info->nr_pages)
-- __free_page(page);
-- totalhigh_pages++;
+- return 1;
-}
--
- void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro)
++/* Dummy device used for NULL arguments (normally ISA). Better would
++ be probably a smaller DMA mask, but this is bug-to-bug compatible
++ to older i386. */
++struct device fallback_dev = {
++ .bus_id = "fallback device",
++ .coherent_dma_mask = DMA_32BIT_MASK,
++ .dma_mask = &fallback_dev.coherent_dma_mask,
++};
+
+-void __init pci_iommu_alloc(void)
++int dma_set_mask(struct device *dev, u64 mask)
{
- if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) {
- ClearPageReserved(page);
-- free_new_highpage(page, pfn);
-+ init_page_count(page);
-+ if (pfn < xen_start_info->nr_pages)
-+ __free_page(page);
-+ totalhigh_pages++;
- } else
- SetPageReserved(page);
- }
+-#ifdef CONFIG_SWIOTLB
+- pci_swiotlb_init();
+-#endif
+-}
++ if (!dev->dma_mask || !dma_supported(dev, mask))
++ return -EIO;
++
++ *dev->dma_mask = mask;
--static int __meminit
--add_one_highpage_hotplug(struct page *page, unsigned long pfn)
+-static int __init pci_iommu_init(void)
-{
-- free_new_highpage(page, pfn);
-- totalram_pages++;
--#ifdef CONFIG_FLATMEM
-- max_mapnr = max(pfn, max_mapnr);
+- no_iommu_init();
+ return 0;
+ }
++EXPORT_SYMBOL(dma_set_mask);
+
+-/* Must execute after PCI subsystem */
+-fs_initcall(pci_iommu_init);
-#endif
-- num_physpages++;
--
-- return 0;
--}
-
--/*
-- * Not currently handling the NUMA case.
-- * Assuming single node and all memory that
-- * has been added dynamically that would be
-- * onlined here is in HIGHMEM.
-- */
--void __meminit online_page(struct page *page)
--{
-- ClearPageReserved(page);
-- add_one_highpage_hotplug(page, page_to_pfn(page));
--}
+-struct dma_coherent_mem {
+- void *virt_base;
+- u32 device_base;
+- int size;
+- int flags;
+- unsigned long *bitmap;
+-};
-
- #ifndef CONFIG_NUMA
- static void __init set_highmem_pages_init(int bad_ppro)
+-#define IOMMU_BUG_ON(test) \
+-do { \
+- if (unlikely(test)) { \
+- printk(KERN_ALERT "Fatal DMA error! " \
+- "Please use 'swiotlb=force'\n"); \
+- BUG(); \
+- } \
+-} while (0)
++#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
++static __initdata void *dma32_bootmem_ptr;
++static unsigned long dma32_bootmem_size __initdata = (128ULL<<20);
+
+-static int check_pages_physically_contiguous(unsigned long pfn,
+- unsigned int offset,
+- size_t length)
++static int __init parse_dma32_size_opt(char *p)
{
-@@ -459,15 +457,13 @@ void zap_low_mappings(void)
+- unsigned long next_mfn;
+- int i;
+- int nr_pages;
+-
+- next_mfn = pfn_to_mfn(pfn);
+- nr_pages = (offset + length + PAGE_SIZE-1) >> PAGE_SHIFT;
+-
+- for (i = 1; i < nr_pages; i++) {
+- if (pfn_to_mfn(++pfn) != ++next_mfn)
+- return 0;
+- }
+- return 1;
++ if (!p)
++ return -EINVAL;
++ dma32_bootmem_size = memparse(p, &p);
++ return 0;
+ }
++early_param("dma32_size", parse_dma32_size_opt);
+
+-int range_straddles_page_boundary(paddr_t p, size_t size)
++void __init dma32_reserve_bootmem(void)
{
- int i;
+- unsigned long pfn = p >> PAGE_SHIFT;
+- unsigned int offset = p & ~PAGE_MASK;
++ unsigned long size, align;
++ if (end_pfn <= MAX_DMA32_PFN)
++ return;
-- save_pg_dir();
+- return ((offset + size > PAGE_SIZE) &&
+- !check_pages_physically_contiguous(pfn, offset, size));
++ align = 64ULL<<20;
++ size = round_up(dma32_bootmem_size, align);
++ dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align,
++ __pa(MAX_DMA_ADDRESS));
++ if (dma32_bootmem_ptr)
++ dma32_bootmem_size = size;
++ else
++ dma32_bootmem_size = 0;
+ }
-
- /*
- * Zap initial low-memory mappings.
- *
- * Note that "pgd_clear()" doesn't do it for
- * us, because pgd_clear() is a no-op on i386.
- */
-- for (i = 0; i < USER_PTRS_PER_PGD; i++) {
-+ for (i = 0; i < KERNEL_PGD_BOUNDARY; i++) {
- #if defined(CONFIG_X86_PAE) && !defined(CONFIG_XEN)
- set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page)));
- #else
-@@ -572,9 +568,9 @@ void __init paging_init(void)
-
- /*
- * Test if the WP bit works in supervisor mode. It isn't supported on 386's
-- * and also on some strange 486's (NexGen etc.). All 586+'s are OK. This
-- * used to involve black magic jumps to work around some nasty CPU bugs,
-- * but fortunately the switch to using exceptions got rid of all that.
-+ * and also on some strange 486's. All 586+'s are OK. This used to involve
-+ * black magic jumps to work around some nasty CPU bugs, but fortunately the
-+ * switch to using exceptions got rid of all that.
- */
- static void __init test_wp_bit(void)
+-int
+-dma_map_sg(struct device *hwdev, struct scatterlist *sgl, int nents,
+- enum dma_data_direction direction)
++static void __init dma32_free_bootmem(void)
{
-@@ -605,9 +601,7 @@ void __init mem_init(void)
- int tmp, bad_ppro;
- unsigned long pfn;
-
--#if defined(CONFIG_SWIOTLB)
-- swiotlb_init();
--#endif
-+ pci_iommu_alloc();
+- int i, rc;
++ int node;
++
++ if (end_pfn <= MAX_DMA32_PFN)
++ return;
- #ifdef CONFIG_FLATMEM
- BUG_ON(!mem_map);
-@@ -710,16 +704,8 @@ void __init mem_init(void)
- test_wp_bit();
+- BUG_ON(!valid_dma_direction(direction));
+- WARN_ON(nents == 0 || sgl->length == 0);
++ if (!dma32_bootmem_ptr)
++ return;
- cpa_init();
+- if (swiotlb) {
+- rc = swiotlb_map_sg(hwdev, sgl, nents, direction);
+- } else {
+- struct scatterlist *sg;
-
-- /*
-- * Subtle. SMP is doing it's boot stuff late (because it has to
-- * fork idle threads) - but it also needs low mappings for the
-- * protected-mode entry to work. We zap these entries only after
-- * the WP-bit has been tested.
-- */
--#ifndef CONFIG_SMP
-+ save_pg_dir();
- zap_low_mappings();
--#endif
+- for_each_sg(sgl, sg, nents, i) {
+- BUG_ON(!sg_page(sg));
+- sg->dma_address =
+- gnttab_dma_map_page(sg_page(sg)) + sg->offset;
+- sg->dma_length = sg->length;
+- IOMMU_BUG_ON(address_needs_mapping(
+- hwdev, sg->dma_address));
+- IOMMU_BUG_ON(range_straddles_page_boundary(
+- page_to_pseudophys(sg_page(sg)) + sg->offset,
+- sg->length));
+- }
+- rc = nents;
+- }
++ for_each_online_node(node)
++ free_bootmem_node(NODE_DATA(node), __pa(dma32_bootmem_ptr),
++ dma32_bootmem_size);
- SetPagePinned(virt_to_page(init_mm.pgd));
+- flush_write_buffers();
+- return rc;
++ dma32_bootmem_ptr = NULL;
++ dma32_bootmem_size = 0;
}
-@@ -769,25 +755,17 @@ void mark_rodata_ro(void)
- unsigned long start = PFN_ALIGN(_text);
- unsigned long size = PFN_ALIGN(_etext) - start;
+-EXPORT_SYMBOL(dma_map_sg);
++#else
++#define dma32_free_bootmem() ((void)0)
++#endif
--#ifndef CONFIG_KPROBES
--#ifdef CONFIG_HOTPLUG_CPU
-- /* It must still be possible to apply SMP alternatives. */
-- if (num_possible_cpus() <= 1)
--#endif
-- {
-- set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
-- printk(KERN_INFO "Write protecting the kernel text: %luk\n",
-- size >> 10);
-+ set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
-+ printk(KERN_INFO "Write protecting the kernel text: %luk\n",
-+ size >> 10);
+-void
+-dma_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nents,
+- enum dma_data_direction direction)
+-{
+- int i;
++static const struct dma_mapping_ops swiotlb_dma_ops = {
++ .mapping_error = swiotlb_dma_mapping_error,
++ .map_single = swiotlb_map_single_phys,
++ .unmap_single = swiotlb_unmap_single,
++ .sync_single_for_cpu = swiotlb_sync_single_for_cpu,
++ .sync_single_for_device = swiotlb_sync_single_for_device,
++ .sync_single_range_for_cpu = swiotlb_sync_single_range_for_cpu,
++ .sync_single_range_for_device = swiotlb_sync_single_range_for_device,
++ .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu,
++ .sync_sg_for_device = swiotlb_sync_sg_for_device,
++ .map_sg = swiotlb_map_sg,
++ .unmap_sg = swiotlb_unmap_sg,
++ .dma_supported = swiotlb_dma_supported
++};
+
+- BUG_ON(!valid_dma_direction(direction));
+- if (swiotlb)
+- swiotlb_unmap_sg(hwdev, sgl, nents, direction);
+- else {
+- struct scatterlist *sg;
++void __init pci_iommu_alloc(void)
++{
++ /* free the range so iommu could get some range less than 4G */
++ dma32_free_bootmem();
++ /*
++ * The order of these functions is important for
++ * fall-back/fail-over reasons
++ */
++#ifdef CONFIG_GART_IOMMU
++ gart_iommu_hole_init();
++#endif
+
+- for_each_sg(sgl, sg, nents, i)
+- gnttab_dma_unmap_page(sg->dma_address);
+- }
+-}
+-EXPORT_SYMBOL(dma_unmap_sg);
++#ifdef CONFIG_CALGARY_IOMMU
++ detect_calgary();
++#endif
+
+-#ifdef CONFIG_HIGHMEM
+-dma_addr_t
+-dma_map_page(struct device *dev, struct page *page, unsigned long offset,
+- size_t size, enum dma_data_direction direction)
+-{
+- dma_addr_t dma_addr;
++ detect_intel_iommu();
+
+- BUG_ON(!valid_dma_direction(direction));
++#ifdef CONFIG_SWIOTLB
++ swiotlb_init();
+ if (swiotlb) {
+- dma_addr = swiotlb_map_page(
+- dev, page, offset, size, direction);
+- } else {
+- dma_addr = gnttab_dma_map_page(page) + offset;
+- IOMMU_BUG_ON(address_needs_mapping(dev, dma_addr));
++ printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n");
++ dma_ops = &swiotlb_dma_ops;
+ }
+-
+- return dma_addr;
++#endif
+ }
+-EXPORT_SYMBOL(dma_map_page);
- #ifdef CONFIG_CPA_DEBUG
-- printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n",
-- start, start+size);
-- set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT);
-+ printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n",
-+ start, start+size);
-+ set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT);
+-void
+-dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size,
+- enum dma_data_direction direction)
++/*
++ * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter
++ * documentation.
++ */
++static __init int iommu_setup(char *p)
+ {
+- BUG_ON(!valid_dma_direction(direction));
+- if (swiotlb)
+- swiotlb_unmap_page(dev, dma_address, size, direction);
+- else
+- gnttab_dma_unmap_page(dma_address);
+-}
+-EXPORT_SYMBOL(dma_unmap_page);
+-#endif /* CONFIG_HIGHMEM */
++ iommu_merge = 1;
-- printk(KERN_INFO "Testing CPA: write protecting again\n");
-- set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
--#endif
-- }
-+ printk(KERN_INFO "Testing CPA: write protecting again\n");
-+ set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
- #endif
- start += size;
- size = (unsigned long)__end_rodata - start;
---- a/arch/x86/mm/init_64-xen.c
-+++ b/arch/x86/mm/init_64-xen.c
-@@ -52,9 +52,6 @@
+-int
+-dma_mapping_error(dma_addr_t dma_addr)
+-{
+- if (swiotlb)
+- return swiotlb_dma_mapping_error(dma_addr);
+- return 0;
+-}
+-EXPORT_SYMBOL(dma_mapping_error);
++ if (!p)
++ return -EINVAL;
- #include <xen/features.h>
+-int
+-dma_supported(struct device *dev, u64 mask)
+-{
+- if (swiotlb)
+- return swiotlb_dma_supported(dev, mask);
+- /*
+- * By default we'll BUG when an infeasible DMA is requested, and
+- * request swiotlb=force (see IOMMU_BUG_ON).
+- */
+- return 1;
+-}
+-EXPORT_SYMBOL(dma_supported);
++ while (*p) {
++ if (!strncmp(p, "off", 3))
++ no_iommu = 1;
++ /* gart_parse_options has more force support */
++ if (!strncmp(p, "force", 5))
++ force_iommu = 1;
++ if (!strncmp(p, "noforce", 7)) {
++ iommu_merge = 0;
++ force_iommu = 0;
++ }
--const struct dma_mapping_ops *dma_ops;
--EXPORT_SYMBOL(dma_ops);
--
- #if CONFIG_XEN_COMPAT <= 0x030002
- unsigned int __kernel_page_user;
- EXPORT_SYMBOL(__kernel_page_user);
-@@ -68,6 +65,28 @@ extern unsigned long start_pfn;
- extern pmd_t level2_fixmap_pgt[PTRS_PER_PMD];
- extern pte_t level1_fixmap_pgt[PTRS_PER_PTE];
+-void *dma_alloc_coherent(struct device *dev, size_t size,
+- dma_addr_t *dma_handle, gfp_t gfp)
+-{
+- void *ret;
+- struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
+- unsigned int order = get_order(size);
+- unsigned long vstart;
+- u64 mask;
++ if (!strncmp(p, "biomerge", 8)) {
++ iommu_bio_merge = 4096;
++ iommu_merge = 1;
++ force_iommu = 1;
++ }
++ if (!strncmp(p, "panic", 5))
++ panic_on_overflow = 1;
++ if (!strncmp(p, "nopanic", 7))
++ panic_on_overflow = 0;
++ if (!strncmp(p, "merge", 5)) {
++ iommu_merge = 1;
++ force_iommu = 1;
++ }
++ if (!strncmp(p, "nomerge", 7))
++ iommu_merge = 0;
++ if (!strncmp(p, "forcesac", 8))
++ iommu_sac_force = 1;
++ if (!strncmp(p, "allowdac", 8))
++ forbid_dac = 0;
++ if (!strncmp(p, "nodac", 5))
++ forbid_dac = -1;
++ if (!strncmp(p, "usedac", 6)) {
++ forbid_dac = -1;
++ return 1;
++ }
++#ifdef CONFIG_SWIOTLB
++ if (!strncmp(p, "soft", 4))
++ swiotlb = 1;
++#endif
-+int direct_gbpages __meminitdata
-+#ifdef CONFIG_DIRECT_GBPAGES
-+ = 1
+- /* ignore region specifiers */
+- gfp &= ~(__GFP_DMA | __GFP_HIGHMEM);
++#ifdef CONFIG_GART_IOMMU
++ gart_parse_options(p);
+#endif
-+;
-+
-+#ifndef CONFIG_XEN
-+static int __init parse_direct_gbpages_off(char *arg)
-+{
-+ direct_gbpages = 0;
-+ return 0;
-+}
-+early_param("nogbpages", parse_direct_gbpages_off);
+
+- if (mem) {
+- int page = bitmap_find_free_region(mem->bitmap, mem->size,
+- order);
+- if (page >= 0) {
+- *dma_handle = mem->device_base + (page << PAGE_SHIFT);
+- ret = mem->virt_base + (page << PAGE_SHIFT);
+- memset(ret, 0, size);
+- return ret;
+- }
+- if (mem->flags & DMA_MEMORY_EXCLUSIVE)
+- return NULL;
++#ifdef CONFIG_CALGARY_IOMMU
++ if (!strncmp(p, "calgary", 7))
++ use_calgary = 1;
++#endif /* CONFIG_CALGARY_IOMMU */
+
-+static int __init parse_direct_gbpages_on(char *arg)
-+{
-+ direct_gbpages = 1;
++ p += strcspn(p, ",");
++ if (*p == ',')
++ ++p;
+ }
+ return 0;
+}
-+early_param("gbpages", parse_direct_gbpages_on);
-+#endif
-+
- /*
- * Use this until direct mapping is established, i.e. before __va() is
- * available in init_memory_mapping().
-@@ -135,9 +154,6 @@ void show_mem(void)
++early_param("iommu", iommu_setup);
- printk(KERN_INFO "Mem-info:\n");
- show_free_areas();
-- printk(KERN_INFO "Free swap: %6ldkB\n",
-- nr_swap_pages << (PAGE_SHIFT-10));
+- if (dev == NULL || (dev->coherent_dma_mask < 0xffffffff))
+- gfp |= GFP_DMA;
-
- for_each_online_pgdat(pgdat) {
- for (i = 0; i < pgdat->node_spanned_pages; ++i) {
- /*
-@@ -328,7 +344,7 @@ void __init cleanup_highmap(void)
- pmd_t *last_pmd = pmd + PTRS_PER_PMD;
-
- for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) {
-- if (!pmd_present(*pmd))
-+ if (pmd_none(*pmd))
- continue;
- if (vaddr < (unsigned long) _text || vaddr > end)
- set_pmd(pmd, __pmd(0));
-@@ -337,8 +353,7 @@ void __init cleanup_highmap(void)
- #endif
-
- /* NOTE: this is meant to be run only at boot */
--void __init
--__set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
-+void __init __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
- {
- unsigned long address = __fix_to_virt(idx);
+- vstart = __get_free_pages(gfp, order);
+- ret = (void *)vstart;
++static int check_pages_physically_contiguous(unsigned long pfn,
++ unsigned int offset,
++ size_t length)
++{
++ unsigned long next_mfn;
++ int i;
++ int nr_pages;
-@@ -463,7 +478,7 @@ __meminit void early_iounmap(void *addr,
- }
- #endif
+- if (dev != NULL && dev->coherent_dma_mask)
+- mask = dev->coherent_dma_mask;
+- else
+- mask = 0xffffffff;
++ next_mfn = pfn_to_mfn(pfn);
++ nr_pages = (offset + length + PAGE_SIZE-1) >> PAGE_SHIFT;
--static void __meminit
-+static unsigned long __meminit
- phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
- {
- int i = pmd_index(address);
-@@ -503,21 +518,26 @@ phys_pmd_init(pmd_t *pmd_page, unsigned
- set_pmd(pmd, __pmd(pte_phys | _KERNPG_TABLE));
- }
+- if (ret != NULL) {
+- if (xen_create_contiguous_region(vstart, order,
+- fls64(mask)) != 0) {
+- free_pages(vstart, order);
+- return NULL;
+- }
+- memset(ret, 0, size);
+- *dma_handle = virt_to_bus(ret);
++ for (i = 1; i < nr_pages; i++) {
++ if (pfn_to_mfn(++pfn) != ++next_mfn)
++ return 0;
}
-+ return address;
- }
-
--static void __meminit
-+static unsigned long __meminit
- phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
- {
- pmd_t *pmd = pmd_offset(pud, 0);
-+ unsigned long last_map_addr;
-+
- spin_lock(&init_mm.page_table_lock);
-- phys_pmd_init(pmd, address, end);
-+ last_map_addr = phys_pmd_init(pmd, address, end);
- spin_unlock(&init_mm.page_table_lock);
- __flush_tlb_all();
-+ return last_map_addr;
+- return ret;
++ return 1;
}
+-EXPORT_SYMBOL(dma_alloc_coherent);
--static void __meminit
-+static unsigned long __meminit
- phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
+-void dma_free_coherent(struct device *dev, size_t size,
+- void *vaddr, dma_addr_t dma_handle)
++int range_straddles_page_boundary(paddr_t p, size_t size)
{
-+ unsigned long last_map_addr = end;
- int i = pud_index(addr);
+- struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
+- int order = get_order(size);
+-
+- WARN_ON(irqs_disabled()); /* for portability */
+- if (mem && vaddr >= mem->virt_base && vaddr < (mem->virt_base + (mem->size << PAGE_SHIFT))) {
+- int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
++ unsigned long pfn = p >> PAGE_SHIFT;
++ unsigned int offset = p & ~PAGE_MASK;
- for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) {
-@@ -529,7 +549,15 @@ phys_pud_init(pud_t *pud_page, unsigned
- break;
+- bitmap_release_region(mem->bitmap, page, order);
+- } else {
+- xen_destroy_contiguous_region((unsigned long)vaddr, order);
+- free_pages((unsigned long)vaddr, order);
+- }
++ return ((offset + size > PAGE_SIZE) &&
++ !check_pages_physically_contiguous(pfn, offset, size));
+ }
+-EXPORT_SYMBOL(dma_free_coherent);
- if (__pud_val(*pud)) {
-- phys_pmd_update(pud, addr, end);
-+ if (!pud_large(*pud))
-+ last_map_addr = phys_pmd_update(pud, addr, end);
-+ continue;
-+ }
+-#ifdef ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY
++#ifdef CONFIG_X86_32
+ int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
+ dma_addr_t device_addr, size_t size, int flags)
+ {
+@@ -327,8 +295,8 @@ EXPORT_SYMBOL(dma_declare_coherent_memor
+ void dma_release_declared_memory(struct device *dev)
+ {
+ struct dma_coherent_mem *mem = dev->dma_mem;
+-
+- if(!mem)
+
-+ if (direct_gbpages) {
-+ set_pte((pte_t *)pud,
-+ pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
-+ last_map_addr = (addr & PUD_MASK) + PUD_SIZE;
- continue;
- }
-
-@@ -537,12 +565,14 @@ phys_pud_init(pud_t *pud_page, unsigned
-
- spin_lock(&init_mm.page_table_lock);
- *pud = __pud(pmd_phys | _KERNPG_TABLE);
-- phys_pmd_init(pmd, addr, end);
-+ last_map_addr = phys_pmd_init(pmd, addr, end);
- spin_unlock(&init_mm.page_table_lock);
-
- early_make_page_readonly(pmd, XENFEAT_writable_page_tables);
- }
- __flush_tlb_all();
++ if (!mem)
+ return;
+ dev->dma_mem = NULL;
+ iounmap(mem->virt_base);
+@@ -341,8 +309,10 @@ void *dma_mark_declared_memory_occupied(
+ dma_addr_t device_addr, size_t size)
+ {
+ struct dma_coherent_mem *mem = dev->dma_mem;
+- int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ int pos, err;
++ int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1);
+
-+ return last_map_addr >> PAGE_SHIFT;
- }
++ pages >>= PAGE_SHIFT;
- void __init xen_init_pt(void)
-@@ -763,16 +793,136 @@ static void __init xen_finish_init_mappi
- table_end = start_pfn;
+ if (!mem)
+ return ERR_PTR(-EINVAL);
+@@ -354,103 +324,270 @@ void *dma_mark_declared_memory_occupied(
+ return mem->virt_base + (pos << PAGE_SHIFT);
}
+ EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
+-#endif /* ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY */
+-
+-#if defined(CONFIG_PCI) && !defined(CONFIG_XEN)
+-/* Many VIA bridges seem to corrupt data for DAC. Disable it here */
-+static void __init init_gbpages(void)
-+{
-+ if (direct_gbpages && cpu_has_gbpages)
-+ printk(KERN_INFO "Using GB pages for direct mapping\n");
-+ else
-+ direct_gbpages = 0;
-+}
-+
-+#ifdef CONFIG_MEMTEST_BOOTPARAM
-+
-+static void __init memtest(unsigned long start_phys, unsigned long size,
-+ unsigned pattern)
-+{
-+ unsigned long i;
-+ unsigned long *start;
-+ unsigned long start_bad;
-+ unsigned long last_bad;
-+ unsigned long val;
-+ unsigned long start_phys_aligned;
-+ unsigned long count;
-+ unsigned long incr;
-+
-+ switch (pattern) {
-+ case 0:
-+ val = 0UL;
-+ break;
-+ case 1:
-+ val = -1UL;
-+ break;
-+ case 2:
-+ val = 0x5555555555555555UL;
-+ break;
-+ case 3:
-+ val = 0xaaaaaaaaaaaaaaaaUL;
-+ break;
-+ default:
-+ return;
-+ }
-+
-+ incr = sizeof(unsigned long);
-+ start_phys_aligned = ALIGN(start_phys, incr);
-+ count = (size - (start_phys_aligned - start_phys))/incr;
-+ start = __va(start_phys_aligned);
-+ start_bad = 0;
-+ last_bad = 0;
+-int forbid_dac;
+-EXPORT_SYMBOL(forbid_dac);
+-
+-static __devinit void via_no_dac(struct pci_dev *dev)
++static int dma_alloc_from_coherent_mem(struct device *dev, ssize_t size,
++ dma_addr_t *dma_handle, void **ret)
+ {
+- if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) {
+- printk(KERN_INFO "PCI: VIA PCI bridge detected. Disabling DAC.\n");
+- forbid_dac = 1;
++ struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
++ int order = get_order(size);
+
-+ for (i = 0; i < count; i++)
-+ start[i] = val;
-+ for (i = 0; i < count; i++, start++, start_phys_aligned += incr) {
-+ if (*start != val) {
-+ if (start_phys_aligned == last_bad + incr) {
-+ last_bad += incr;
-+ } else {
-+ if (start_bad) {
-+ printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved",
-+ val, start_bad, last_bad + incr);
-+ reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
-+ }
-+ start_bad = last_bad = start_phys_aligned;
-+ }
++ if (mem) {
++ int page = bitmap_find_free_region(mem->bitmap, mem->size,
++ order);
++ if (page >= 0) {
++ *dma_handle = mem->device_base + (page << PAGE_SHIFT);
++ *ret = mem->virt_base + (page << PAGE_SHIFT);
++ memset(*ret, 0, size);
+ }
-+ }
-+ if (start_bad) {
-+ printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved",
-+ val, start_bad, last_bad + incr);
-+ reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
-+ }
-+
-+}
-+
-+static int memtest_pattern __initdata = CONFIG_MEMTEST_BOOTPARAM_VALUE;
++ if (mem->flags & DMA_MEMORY_EXCLUSIVE)
++ *ret = NULL;
+ }
++ return (mem != NULL);
+ }
+-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac);
+
+-static int check_iommu(char *s)
++static int dma_release_coherent(struct device *dev, int order, void *vaddr)
+ {
+- if (!strcmp(s, "usedac")) {
+- forbid_dac = -1;
++ struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
+
-+static int __init parse_memtest(char *arg)
-+{
-+ if (arg)
-+ memtest_pattern = simple_strtoul(arg, NULL, 0);
-+ return 0;
-+}
++ if (mem && vaddr >= mem->virt_base && vaddr <
++ (mem->virt_base + (mem->size << PAGE_SHIFT))) {
++ int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
+
-+early_param("memtest", parse_memtest);
++ bitmap_release_region(mem->bitmap, page, order);
+ return 1;
+ }
+ return 0;
+ }
+-__setup("iommu=", check_iommu);
++#else
++#define dma_alloc_from_coherent_mem(dev, size, handle, ret) (0)
++#define dma_release_coherent(dev, order, vaddr) (0)
++#endif /* CONFIG_X86_32 */
+
-+static void __init early_memtest(unsigned long start, unsigned long end)
++int dma_supported(struct device *dev, u64 mask)
+{
-+ u64 t_start, t_size;
-+ unsigned pattern;
-+
-+ if (!memtest_pattern)
-+ return;
-+
-+ printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern);
-+ for (pattern = 0; pattern < memtest_pattern; pattern++) {
-+ t_start = start;
-+ t_size = 0;
-+ while (t_start < end) {
-+ t_start = find_e820_area_size(t_start, &t_size, 1);
++#ifdef CONFIG_PCI
++ if (mask > 0xffffffff && forbid_dac > 0) {
++ printk(KERN_INFO "PCI: Disallowing DAC for device %s\n",
++ dev->bus_id);
++ return 0;
++ }
+ #endif
+
+-dma_addr_t
+-dma_map_single(struct device *dev, void *ptr, size_t size,
+- enum dma_data_direction direction)
++ if (dma_ops->dma_supported)
++ return dma_ops->dma_supported(dev, mask);
+
-+ /* done ? */
-+ if (t_start >= end)
-+ break;
-+ if (t_start + t_size > end)
-+ t_size = end - t_start;
++ /* Copied from i386. Doesn't make much sense, because it will
++ only work for pci_alloc_coherent.
++ The caller just has to use GFP_DMA in this case. */
++ if (mask < DMA_24BIT_MASK)
++ return 0;
+
-+ printk(KERN_CONT "\n %016llx - %016llx pattern %d",
-+ (unsigned long long)t_start,
-+ (unsigned long long)t_start + t_size, pattern);
++ /* Tell the device to use SAC when IOMMU force is on. This
++ allows the driver to use cheaper accesses in some cases.
+
-+ memtest(t_start, t_size, pattern);
++ Problem with this is that if we overflow the IOMMU area and
++ return DAC as fallback address the device may not handle it
++ correctly.
+
-+ t_start += t_size;
-+ }
++ As a special case some controllers have a 39bit address
++ mode that is as efficient as 32bit (aic79xx). Don't force
++ SAC for these. Assume all masks <= 40 bits are of this
++ type. Normally this doesn't make any difference, but gives
++ more gentle handling of IOMMU overflow. */
++ if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) {
++ printk(KERN_INFO "%s: Force SAC with mask %Lx\n",
++ dev->bus_id, mask);
++ return 0;
+ }
-+ printk(KERN_CONT "\n");
-+}
-+#else
-+static void __init early_memtest(unsigned long start, unsigned long end)
-+{
++
++ return 1;
+}
-+#endif
++EXPORT_SYMBOL(dma_supported);
+
- /*
- * Setup the direct mapping of the physical memory at PAGE_OFFSET.
- * This runs before bootmem is initialized and gets pages directly from
- * the physical memory. To access them they are temporarily mapped.
- */
--void __init_refok init_memory_mapping(unsigned long start, unsigned long end)
-+unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned long end)
++/* Allocate DMA memory on node near device */
++static struct page *
++dma_alloc_pages(struct device *dev, gfp_t gfp, unsigned order)
{
-- unsigned long next;
-+ unsigned long next, last_map_addr = end;
-+ unsigned long start_phys = start, end_phys = end;
-
-- pr_debug("init_memory_mapping\n");
-+ printk(KERN_INFO "init_memory_mapping\n");
-
- /*
- * Find space for the kernel direct mapping tables.
-@@ -781,8 +931,10 @@ void __init_refok init_memory_mapping(un
- * memory mapped. Unfortunately this is done currently before the
- * nodes are discovered.
- */
-- if (!after_bootmem)
-+ if (!after_bootmem) {
-+ init_gbpages();
- find_early_table_space(end);
-+ }
+- dma_addr_t dma;
++ int node;
- start = (unsigned long)__va(start);
- end = (unsigned long)__va(end);
-@@ -799,7 +951,7 @@ void __init_refok init_memory_mapping(un
- next = start + PGDIR_SIZE;
- if (next > end)
- next = end;
-- phys_pud_init(pud, __pa(start), __pa(next));
-+ last_map_addr = phys_pud_init(pud, __pa(start), __pa(next));
- if (!after_bootmem) {
- early_make_page_readonly(pud, XENFEAT_writable_page_tables);
- set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
-@@ -816,6 +968,11 @@ void __init_refok init_memory_mapping(un
- if (!after_bootmem)
- reserve_early(table_start << PAGE_SHIFT,
- table_end << PAGE_SHIFT, "PGTABLE");
-+
-+ if (!after_bootmem)
-+ early_memtest(start_phys, end_phys);
-+
-+ return last_map_addr;
- }
+- BUG_ON(!valid_dma_direction(direction));
+- WARN_ON(size == 0);
++ node = dev_to_node(dev);
- #ifndef CONFIG_NUMA
-@@ -839,15 +996,6 @@ void __init paging_init(void)
- /*
- * Memory hotplug specific functions
- */
--void online_page(struct page *page)
--{
-- ClearPageReserved(page);
-- init_page_count(page);
-- __free_page(page);
-- totalram_pages++;
-- num_physpages++;
+- if (swiotlb) {
+- dma = swiotlb_map_single(dev, ptr, size, direction);
+- } else {
+- dma = gnttab_dma_map_page(virt_to_page(ptr)) +
+- offset_in_page(ptr);
+- IOMMU_BUG_ON(range_straddles_page_boundary(__pa(ptr), size));
+- IOMMU_BUG_ON(address_needs_mapping(dev, dma));
+- }
+-
+- flush_write_buffers();
+- return dma;
-}
+-EXPORT_SYMBOL(dma_map_single);
-
- #ifdef CONFIG_MEMORY_HOTPLUG
- /*
- * Memory is added always to NORMAL zone. This means you will never get
-@@ -857,11 +1005,13 @@ int arch_add_memory(int nid, u64 start,
- {
- struct pglist_data *pgdat = NODE_DATA(nid);
- struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
-- unsigned long start_pfn = start >> PAGE_SHIFT;
-+ unsigned long last_mapped_pfn, start_pfn = start >> PAGE_SHIFT;
- unsigned long nr_pages = size >> PAGE_SHIFT;
- int ret;
-
-- init_memory_mapping(start, start + size-1);
-+ last_mapped_pfn = init_memory_mapping(start, start + size-1);
-+ if (last_mapped_pfn > max_pfn_mapped)
-+ max_pfn_mapped = last_mapped_pfn;
-
- ret = __add_pages(zone, start_pfn, nr_pages);
- WARN_ON(1);
-@@ -880,6 +1030,26 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to
-
- #endif /* CONFIG_MEMORY_HOTPLUG */
-
+-void
+-dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
+- enum dma_data_direction direction)
+-{
+- BUG_ON(!valid_dma_direction(direction));
+- if (swiotlb)
+- swiotlb_unmap_single(dev, dma_addr, size, direction);
+- else
+- gnttab_dma_unmap_page(dma_addr);
++ return alloc_pages_node(node, gfp, order);
++}
++
+/*
-+ * devmem_is_allowed() checks to see if /dev/mem access to a certain address
-+ * is valid. The argument is a physical page number.
-+ *
-+ *
-+ * On x86, access has to be given to the first megabyte of ram because that area
-+ * contains bios code and data regions used by X and dosemu and similar apps.
-+ * Access has to be given to non-kernel-ram areas as well, these contain the PCI
-+ * mmio resources as well as potential bios/acpi data regions.
++ * Allocate memory for a coherent mapping.
+ */
-+int devmem_is_allowed(unsigned long pagenr)
++void *
++dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
++ gfp_t gfp)
+{
-+ if (pagenr <= 256)
-+ return 1;
-+ if (mfn_to_local_pfn(pagenr) >= max_pfn)
-+ return 1;
-+ return 0;
-+}
++ void *memory = NULL;
++ struct page *page;
++ unsigned long dma_mask = 0;
++ int noretry = 0;
++ unsigned int order = get_order(size);
+
++ /* ignore region specifiers */
++ gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
+
- static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel,
- kcore_modules, kcore_vsyscall;
-
-@@ -988,24 +1158,7 @@ EXPORT_SYMBOL_GPL(rodata_test_data);
-
- void mark_rodata_ro(void)
- {
-- unsigned long start = (unsigned long)_stext, end;
--
--#ifdef CONFIG_HOTPLUG_CPU
-- /* It must still be possible to apply SMP alternatives. */
-- if (num_possible_cpus() > 1)
-- start = (unsigned long)_etext;
--#endif
--
--#ifdef CONFIG_KPROBES
-- start = (unsigned long)__start_rodata;
--#endif
--
-- end = (unsigned long)__end_rodata;
-- start = (start + PAGE_SIZE - 1) & PAGE_MASK;
-- end &= PAGE_MASK;
-- if (end <= start)
-- return;
--
-+ unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata);
-
- printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
- (end - start) >> 10);
-@@ -1028,6 +1181,7 @@ void mark_rodata_ro(void)
- set_memory_ro(start, (end-start) >> PAGE_SHIFT);
- #endif
- }
++ if (dma_alloc_from_coherent_mem(dev, size, dma_handle, &memory))
++ return memory;
+
- #endif
-
- #ifdef CONFIG_BLK_DEV_INITRD
-@@ -1040,7 +1194,7 @@ void free_initrd_mem(unsigned long start
- void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
- {
- #ifdef CONFIG_NUMA
-- int nid = phys_to_nid(phys);
-+ int nid, next_nid;
- #endif
- unsigned long pfn = phys >> PAGE_SHIFT;
-
-@@ -1049,7 +1203,7 @@ void __init reserve_bootmem_generic(unsi
- * This can happen with kdump kernels when accessing
- * firmware tables:
- */
-- if (pfn < end_pfn_map)
-+ if (pfn < max_pfn_mapped)
- return;
-
- printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
-@@ -1059,10 +1213,16 @@ void __init reserve_bootmem_generic(unsi
-
- /* Should check here against the e820 map to avoid double free */
- #ifdef CONFIG_NUMA
-- reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT);
-+ nid = phys_to_nid(phys);
-+ next_nid = phys_to_nid(phys + len - 1);
-+ if (nid == next_nid)
-+ reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT);
-+ else
-+ reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
- #else
- reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
- #endif
++ if (!dev) {
++ dev = &fallback_dev;
++ gfp |= GFP_DMA;
++ }
++ dma_mask = dev->coherent_dma_mask;
++ if (dma_mask == 0)
++ dma_mask = (gfp & GFP_DMA) ? DMA_24BIT_MASK : DMA_32BIT_MASK;
+
- #ifndef CONFIG_XEN
- if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
- static unsigned long dma_reserve __initdata;
-@@ -1160,6 +1320,10 @@ const char *arch_vma_name(struct vm_area
- /*
- * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
- */
-+static long __meminitdata addr_start, addr_end;
-+static void __meminitdata *p_start, *p_end;
-+static int __meminitdata node_start;
++ /* Device not DMA able */
++ if (dev->dma_mask == NULL)
++ return NULL;
++
++ /* Don't invoke OOM killer or retry in lower 16MB DMA zone */
++ if (gfp & __GFP_DMA)
++ noretry = 1;
++
++#ifdef CONFIG_XEN
++ gfp &= ~(__GFP_DMA | __GFP_DMA32);
++#else
++#ifdef CONFIG_X86_64
++ /* Why <=? Even when the mask is smaller than 4GB it is often
++ larger than 16MB and in this case we have a chance of
++ finding fitting memory in the next higher zone first. If
++ not retry with true GFP_DMA. -AK */
++ if (dma_mask <= DMA_32BIT_MASK && !(gfp & GFP_DMA)) {
++ gfp |= GFP_DMA32;
++#endif
++
++ again:
++#endif
++ page = dma_alloc_pages(dev,
++ noretry ? gfp | __GFP_NORETRY : gfp, order);
++ if (page == NULL)
++ return NULL;
+
- int __meminit
- vmemmap_populate(struct page *start_page, unsigned long size, int node)
- {
-@@ -1194,12 +1358,32 @@ vmemmap_populate(struct page *start_page
- PAGE_KERNEL_LARGE);
- set_pmd(pmd, __pmd_ma(__pte_val(entry)));
-
-- printk(KERN_DEBUG " [%lx-%lx] PMD ->%p on node %d\n",
-- addr, addr + PMD_SIZE - 1, p, node);
-+ /* check to see if we have contiguous blocks */
-+ if (p_end != p || node_start != node) {
-+ if (p_start)
-+ printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
-+ addr_start, addr_end-1, p_start, p_end-1, node_start);
-+ addr_start = addr;
-+ node_start = node;
-+ p_start = p;
++#ifndef CONFIG_XEN
++ {
++ int high, mmu;
++ dma_addr_t bus = page_to_phys(page);
++ memory = page_address(page);
++ high = (bus + size) >= dma_mask;
++ mmu = high;
++ if (force_iommu && !(gfp & GFP_DMA))
++ mmu = 1;
++ else if (high) {
++ free_pages((unsigned long)memory, order);
++
++ /* Don't use the 16MB ZONE_DMA unless absolutely
++ needed. It's better to use remapping first. */
++ if (dma_mask < DMA_32BIT_MASK && !(gfp & GFP_DMA)) {
++ gfp = (gfp & ~GFP_DMA32) | GFP_DMA;
++ goto again;
+ }
-+ addr_end = addr + PMD_SIZE;
-+ p_end = p + PMD_SIZE;
- } else {
- vmemmap_verify((pte_t *)pmd, node, addr, next);
- }
- }
- return 0;
- }
+
-+void __meminit vmemmap_populate_print_last(void)
-+{
-+ if (p_start) {
-+ printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
-+ addr_start, addr_end-1, p_start, p_end-1, node_start);
-+ p_start = NULL;
-+ p_end = NULL;
-+ node_start = 0;
++ /* Let low level make its own zone decisions */
++ gfp &= ~(GFP_DMA32|GFP_DMA);
++
++ if (dma_ops->alloc_coherent)
++ return dma_ops->alloc_coherent(dev, size,
++ dma_handle, gfp);
++ return NULL;
++ }
++
++ memset(memory, 0, size);
++ if (!mmu) {
++ *dma_handle = bus;
++ return memory;
++ }
+ }
-+}
- #endif
---- a/arch/x86/mm/ioremap-xen.c
-+++ b/arch/x86/mm/ioremap-xen.c
-@@ -20,14 +20,11 @@
- #include <asm/pgtable.h>
- #include <asm/tlbflush.h>
- #include <asm/pgalloc.h>
-+#include <asm/pat.h>
-
--enum ioremap_mode {
-- IOR_MODE_UNCACHED,
-- IOR_MODE_CACHED,
--};
--
--#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
-+#ifdef CONFIG_X86_64
++
++ if (dma_ops->alloc_coherent) {
++ free_pages((unsigned long)memory, order);
++ gfp &= ~(GFP_DMA|GFP_DMA32);
++ return dma_ops->alloc_coherent(dev, size, dma_handle, gfp);
++ }
++
++ if (dma_ops->map_simple) {
++ *dma_handle = dma_ops->map_simple(dev, virt_to_bus(memory),
++ size,
++ PCI_DMA_BIDIRECTIONAL);
++ if (*dma_handle != bad_dma_address)
++ return memory;
++ }
++#else
++ memory = page_address(page);
++ if (xen_create_contiguous_region((unsigned long)memory, order,
++ fls64(dma_mask)) == 0) {
++ memset(memory, 0, size);
++ *dma_handle = virt_to_bus(memory);
++ return memory;
++ }
++#endif
++
++ if (panic_on_overflow)
++ panic("dma_alloc_coherent: IOMMU overflow by %lu bytes\n",
++ (unsigned long)size);
++ free_pages((unsigned long)memory, order);
++ return NULL;
+ }
+-EXPORT_SYMBOL(dma_unmap_single);
++EXPORT_SYMBOL(dma_alloc_coherent);
-+#ifndef CONFIG_XEN
- unsigned long __phys_addr(unsigned long x)
+-void
+-dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size,
+- enum dma_data_direction direction)
++/*
++ * Unmap coherent memory.
++ * The caller must ensure that the device has finished accessing the mapping.
++ */
++void dma_free_coherent(struct device *dev, size_t size,
++ void *vaddr, dma_addr_t bus)
{
- if (x >= __START_KERNEL_map)
-@@ -35,6 +32,19 @@ unsigned long __phys_addr(unsigned long
- return x - PAGE_OFFSET;
+- if (swiotlb)
+- swiotlb_sync_single_for_cpu(dev, dma_handle, size, direction);
++ int order = get_order(size);
++ WARN_ON(irqs_disabled()); /* for portability */
++ if (dma_release_coherent(dev, order, vaddr))
++ return;
++#ifndef CONFIG_XEN
++ if (dma_ops->unmap_single)
++ dma_ops->unmap_single(dev, bus, size, 0);
++#endif
++ xen_destroy_contiguous_region((unsigned long)vaddr, order);
++ free_pages((unsigned long)vaddr, order);
}
- EXPORT_SYMBOL(__phys_addr);
+-EXPORT_SYMBOL(dma_sync_single_for_cpu);
++EXPORT_SYMBOL(dma_free_coherent);
+
+-void
+-dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size,
+- enum dma_data_direction direction)
++static int __init pci_iommu_init(void)
+ {
+- if (swiotlb)
+- swiotlb_sync_single_for_device(dev, dma_handle, size, direction);
++#ifdef CONFIG_CALGARY_IOMMU
++ calgary_iommu_init();
+#endif
+
-+static inline int phys_addr_valid(unsigned long addr)
-+{
-+ return addr < (1UL << boot_cpu_data.x86_phys_bits);
-+}
++ intel_iommu_init();
+
-+#else
++#ifdef CONFIG_GART_IOMMU
++ gart_iommu_init();
++#endif
+
-+static inline int phys_addr_valid(unsigned long addr)
-+{
-+ return 1;
-+}
-
- #endif
-
-@@ -92,7 +102,8 @@ static int __direct_remap_pfn_range(stru
- * Fill in the machine address: PTE ptr is done later by
- * apply_to_page_range().
- */
-- v->val = __pte_val(pfn_pte_ma(mfn, prot)) | _PAGE_IO;
-+ v->val = __pte_val(pte_mkspecial(pfn_pte_ma(mfn, prot)))
-+ | _PAGE_IO;
-
- mfn++;
- address += PAGE_SIZE;
-@@ -189,10 +200,9 @@ int touch_pte_range(struct mm_struct *mm
-
- EXPORT_SYMBOL(touch_pte_range);
++ no_iommu_init();
++ return 0;
+ }
+-EXPORT_SYMBOL(dma_sync_single_for_device);
--#ifdef CONFIG_X86_32
- int page_is_ram(unsigned long pagenr)
+-void
+-dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems,
+- enum dma_data_direction direction)
++void pci_iommu_shutdown(void)
{
-- unsigned long addr, end;
-+ resource_size_t addr, end;
- int i;
-
- #ifndef CONFIG_XEN
-@@ -228,31 +238,51 @@ int page_is_ram(unsigned long pagenr)
- }
- return 0;
+- if (swiotlb)
+- swiotlb_sync_sg_for_cpu(dev,sg,nelems,direction);
+- flush_write_buffers();
++ gart_iommu_shutdown();
}
--#endif
+-EXPORT_SYMBOL(dma_sync_sg_for_cpu);
++/* Must execute after PCI subsystem */
++fs_initcall(pci_iommu_init);
++
++#ifdef CONFIG_PCI
++/* Many VIA bridges seem to corrupt data for DAC. Disable it here */
- /*
- * Fix up the linear direct mapping of the kernel to avoid cache attribute
- * conflicts.
- */
- static int ioremap_change_attr(unsigned long vaddr, unsigned long size,
-- enum ioremap_mode mode)
-+ unsigned long prot_val)
+-void
+-dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems,
+- enum dma_data_direction direction)
++static __devinit void via_no_dac(struct pci_dev *dev)
{
- unsigned long nrpages = size >> PAGE_SHIFT;
- int err;
-
-- switch (mode) {
-- case IOR_MODE_UNCACHED:
-+ switch (prot_val) {
-+ case _PAGE_CACHE_UC:
- default:
-- err = set_memory_uc(vaddr, nrpages);
-+ err = _set_memory_uc(vaddr, nrpages);
-+ break;
-+ case _PAGE_CACHE_WC:
-+ err = _set_memory_wc(vaddr, nrpages);
- break;
-- case IOR_MODE_CACHED:
-- err = set_memory_wb(vaddr, nrpages);
-+ case _PAGE_CACHE_WB:
-+ err = _set_memory_wb(vaddr, nrpages);
- break;
- }
-
- return err;
+- if (swiotlb)
+- swiotlb_sync_sg_for_device(dev,sg,nelems,direction);
+- flush_write_buffers();
++ if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) {
++ printk(KERN_INFO "PCI: VIA PCI bridge detected."
++ "Disabling DAC.\n");
++ forbid_dac = 1;
++ }
}
-
-+int ioremap_check_change_attr(unsigned long mfn, unsigned long size,
-+ unsigned long prot_val)
+-EXPORT_SYMBOL(dma_sync_sg_for_device);
++DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac);
++#endif
+--- /dev/null 1970-01-01 00:00:00.000000000 +0000
++++ sle11-2009-05-14/arch/x86/kernel/pci-nommu-xen.c 2009-03-16 16:38:05.000000000 +0100
+@@ -0,0 +1,103 @@
++#include <linux/dma-mapping.h>
++#include <linux/dmar.h>
++#include <linux/bootmem.h>
++#include <linux/pci.h>
++
++#include <xen/gnttab.h>
++
++#include <asm/proto.h>
++#include <asm/dma.h>
++#include <asm/swiotlb.h>
++#include <asm/tlbflush.h>
++#include <asm/gnttab_dma.h>
++#include <asm/bug.h>
++
++#define IOMMU_BUG_ON(test) \
++do { \
++ if (unlikely(test)) { \
++ printk(KERN_ALERT "Fatal DMA error! " \
++ "Please use 'swiotlb=force'\n"); \
++ BUG(); \
++ } \
++} while (0)
++
++static int
++gnttab_map_sg(struct device *hwdev, struct scatterlist *sgl, int nents,
++ int direction)
+{
-+ unsigned long sz;
-+ int rc;
++ unsigned int i;
++ struct scatterlist *sg;
+
-+ for (sz = rc = 0; sz < size && !rc; ++mfn, sz += PAGE_SIZE) {
-+ unsigned long pfn = mfn_to_local_pfn(mfn);
++ WARN_ON(nents == 0 || sgl->length == 0);
+
-+ if (pfn >= max_pfn_mapped)
-+ continue;
-+ rc = ioremap_change_attr((unsigned long)__va(pfn << PAGE_SHIFT),
-+ PAGE_SIZE, prot_val);
++ for_each_sg(sgl, sg, nents, i) {
++ BUG_ON(!sg_page(sg));
++ sg->dma_address =
++ gnttab_dma_map_page(sg_page(sg)) + sg->offset;
++ sg->dma_length = sg->length;
++ IOMMU_BUG_ON(address_needs_mapping(
++ hwdev, sg->dma_address));
++ IOMMU_BUG_ON(range_straddles_page_boundary(
++ page_to_pseudophys(sg_page(sg)) + sg->offset,
++ sg->length));
+ }
+
-+ return rc;
++ return nents;
+}
+
- /*
- * Remap an arbitrary physical address space into the kernel virtual
- * address space. Needed when the kernel wants to access high addresses
-@@ -262,12 +292,15 @@ static int ioremap_change_attr(unsigned
- * have to convert them into an offset in a page-aligned mapping, but the
- * caller shouldn't need to know that small detail.
- */
--static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size,
-- enum ioremap_mode mode)
-+static void __iomem *__ioremap_caller(resource_size_t phys_addr,
-+ unsigned long size, unsigned long prot_val, void *caller)
- {
-- unsigned long mfn, offset, last_addr, vaddr;
-+ unsigned long mfn, offset, vaddr;
-+ resource_size_t last_addr;
- struct vm_struct *area;
-+ unsigned long new_prot_val;
- pgprot_t prot;
-+ int retval;
- domid_t domid = DOMID_IO;
-
- /* Don't allow wraparound or zero size */
-@@ -275,6 +308,13 @@ static void __iomem *__ioremap(resource_
- if (!size || last_addr < phys_addr)
- return NULL;
-
-+ if (!phys_addr_valid(phys_addr)) {
-+ printk(KERN_WARNING "ioremap: invalid physical address %llx\n",
-+ (unsigned long long)phys_addr);
-+ WARN_ON_ONCE(1);
-+ return NULL;
-+ }
++static void
++gnttab_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nents,
++ int direction)
++{
++ unsigned int i;
++ struct scatterlist *sg;
+
- /*
- * Don't remap the low PCI/ISA area, it's always mapped..
- */
-@@ -287,55 +327,86 @@ static void __iomem *__ioremap(resource_
- for (mfn = PFN_DOWN(phys_addr); mfn < PFN_UP(last_addr); mfn++) {
- unsigned long pfn = mfn_to_local_pfn(mfn);
-
-- if (pfn >= max_pfn)
-- continue;
-+ if (pfn_valid(pfn)) {
-+ if (!PageReserved(pfn_to_page(pfn)))
-+ return NULL;
-+ domid = DOMID_SELF;
-+ }
-+ }
-+ WARN_ON_ONCE(domid == DOMID_SELF);
-
-- domid = DOMID_SELF;
-+ /*
-+ * Mappings have to be page-aligned
-+ */
-+ offset = phys_addr & ~PAGE_MASK;
-+ phys_addr &= PAGE_MASK;
-+ size = PAGE_ALIGN(last_addr+1) - phys_addr;
-
-- if (pfn >= max_pfn_mapped) /* bogus */
-- continue;
-+ retval = reserve_memtype(phys_addr, phys_addr + size,
-+ prot_val, &new_prot_val);
-+ if (retval) {
-+ pr_debug("Warning: reserve_memtype returned %d\n", retval);
-+ return NULL;
-+ }
-
-- if (pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn)))
-+ if (prot_val != new_prot_val) {
-+ /*
-+ * Do not fallback to certain memory types with certain
-+ * requested type:
-+ * - request is uc-, return cannot be write-back
-+ * - request is uc-, return cannot be write-combine
-+ * - request is write-combine, return cannot be write-back
-+ */
-+ if ((prot_val == _PAGE_CACHE_UC_MINUS &&
-+ (new_prot_val == _PAGE_CACHE_WB ||
-+ new_prot_val == _PAGE_CACHE_WC)) ||
-+ (prot_val == _PAGE_CACHE_WC &&
-+ new_prot_val == _PAGE_CACHE_WB)) {
-+ pr_debug(
-+ "ioremap error for 0x%llx-0x%llx, requested 0x%lx, got 0x%lx\n",
-+ (unsigned long long)phys_addr,
-+ (unsigned long long)(phys_addr + size),
-+ prot_val, new_prot_val);
-+ free_memtype(phys_addr, phys_addr + size);
- return NULL;
-+ }
-+ prot_val = new_prot_val;
- }
-
-- switch (mode) {
-- case IOR_MODE_UNCACHED:
-+ switch (prot_val) {
-+ case _PAGE_CACHE_UC:
- default:
-- /*
-- * FIXME: we will use UC MINUS for now, as video fb drivers
-- * depend on it. Upcoming ioremap_wc() will fix this behavior.
-- */
-+ prot = PAGE_KERNEL_NOCACHE;
-+ break;
-+ case _PAGE_CACHE_UC_MINUS:
- prot = PAGE_KERNEL_UC_MINUS;
- break;
-- case IOR_MODE_CACHED:
-+ case _PAGE_CACHE_WC:
-+ prot = PAGE_KERNEL_WC;
-+ break;
-+ case _PAGE_CACHE_WB:
- prot = PAGE_KERNEL;
- break;
- }
-
- /*
-- * Mappings have to be page-aligned
-- */
-- offset = phys_addr & ~PAGE_MASK;
-- phys_addr &= PAGE_MASK;
-- size = PAGE_ALIGN(last_addr+1) - phys_addr;
--
-- /*
- * Ok, go for it..
- */
-- area = get_vm_area(size, VM_IOREMAP | (mode << 20));
-+ area = get_vm_area_caller(size, VM_IOREMAP, caller);
- if (!area)
- return NULL;
- area->phys_addr = phys_addr;
- vaddr = (unsigned long) area->addr;
- if (__direct_remap_pfn_range(&init_mm, vaddr, PFN_DOWN(phys_addr),
- size, prot, domid)) {
-+ free_memtype(phys_addr, phys_addr + size);
- free_vm_area(area);
- return NULL;
- }
-
-- if (ioremap_change_attr(vaddr, size, mode) < 0) {
-- iounmap((void __iomem *) vaddr);
-+ if (ioremap_change_attr(vaddr, size, prot_val) < 0) {
-+ free_memtype(phys_addr, phys_addr + size);
-+ vunmap(area->addr);
- return NULL;
- }
-
-@@ -365,16 +436,72 @@ static void __iomem *__ioremap(resource_
- */
- void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size)
- {
-- return __ioremap(phys_addr, size, IOR_MODE_UNCACHED);
-+ /*
-+ * Ideally, this should be:
-+ * pat_wc_enabled ? _PAGE_CACHE_UC : _PAGE_CACHE_UC_MINUS;
-+ *
-+ * Till we fix all X drivers to use ioremap_wc(), we will use
-+ * UC MINUS.
-+ */
-+ unsigned long val = _PAGE_CACHE_UC_MINUS;
++ for_each_sg(sgl, sg, nents, i)
++ gnttab_dma_unmap_page(sg->dma_address);
++}
++
++static dma_addr_t
++gnttab_map_single(struct device *dev, phys_addr_t paddr, size_t size,
++ int direction)
++{
++ dma_addr_t dma;
++
++ WARN_ON(size == 0);
++
++ dma = gnttab_dma_map_page(pfn_to_page(paddr >> PAGE_SHIFT)) +
++ offset_in_page(paddr);
++ IOMMU_BUG_ON(range_straddles_page_boundary(paddr, size));
++ IOMMU_BUG_ON(address_needs_mapping(dev, dma));
++
++ return dma;
++}
++
++static void
++gnttab_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
++ int direction)
++{
++ gnttab_dma_unmap_page(dma_addr);
++}
++
++static int nommu_mapping_error(dma_addr_t dma_addr)
++{
++ return (dma_addr == bad_dma_address);
++}
++
++static const struct dma_mapping_ops nommu_dma_ops = {
++ .map_single = gnttab_map_single,
++ .unmap_single = gnttab_unmap_single,
++ .map_sg = gnttab_map_sg,
++ .unmap_sg = gnttab_unmap_sg,
++ .dma_supported = swiotlb_dma_supported,
++ .mapping_error = nommu_mapping_error
++};
++
++void __init no_iommu_init(void)
++{
++ if (dma_ops)
++ return;
+
-+ return __ioremap_caller(phys_addr, size, val,
-+ __builtin_return_address(0));
- }
- EXPORT_SYMBOL(ioremap_nocache);
-
-+/**
-+ * ioremap_wc - map memory into CPU space write combined
-+ * @offset: bus address of the memory
-+ * @size: size of the resource to map
-+ *
-+ * This version of ioremap ensures that the memory is marked write combining.
-+ * Write combining allows faster writes to some hardware devices.
++ force_iommu = 0; /* no HW IOMMU */
++ dma_ops = &nommu_dma_ops;
++}
+--- /dev/null 1970-01-01 00:00:00.000000000 +0000
++++ sle11-2009-05-14/arch/x86/kernel/process-xen.c 2009-03-16 16:38:05.000000000 +0100
+@@ -0,0 +1,188 @@
++#include <linux/errno.h>
++#include <linux/kernel.h>
++#include <linux/mm.h>
++#include <linux/smp.h>
++#include <linux/slab.h>
++#include <linux/sched.h>
++#include <linux/module.h>
++#include <linux/pm.h>
++
++struct kmem_cache *task_xstate_cachep;
++
++int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
++{
++ *dst = *src;
++ if (src->thread.xstate) {
++ dst->thread.xstate = kmem_cache_alloc(task_xstate_cachep,
++ GFP_KERNEL);
++ if (!dst->thread.xstate)
++ return -ENOMEM;
++ WARN_ON((unsigned long)dst->thread.xstate & 15);
++ memcpy(dst->thread.xstate, src->thread.xstate, xstate_size);
++ }
++ return 0;
++}
++
++void free_thread_xstate(struct task_struct *tsk)
++{
++ if (tsk->thread.xstate) {
++ kmem_cache_free(task_xstate_cachep, tsk->thread.xstate);
++ tsk->thread.xstate = NULL;
++ }
++}
++
++void free_thread_info(struct thread_info *ti)
++{
++ free_thread_xstate(ti->task);
++ free_pages((unsigned long)ti, get_order(THREAD_SIZE));
++}
++
++void arch_task_cache_init(void)
++{
++ task_xstate_cachep =
++ kmem_cache_create("task_xstate", xstate_size,
++ __alignof__(union thread_xstate),
++ SLAB_PANIC, NULL);
++}
++
++static void do_nothing(void *unused)
++{
++}
++
++/*
++ * cpu_idle_wait - Used to ensure that all the CPUs discard old value of
++ * pm_idle and update to new pm_idle value. Required while changing pm_idle
++ * handler on SMP systems.
+ *
-+ * Must be freed with iounmap.
++ * Caller must have changed pm_idle to the new value before the call. Old
++ * pm_idle value will not be used by any CPU after the return of this function.
+ */
-+void __iomem *ioremap_wc(unsigned long phys_addr, unsigned long size)
++void cpu_idle_wait(void)
+{
-+ if (pat_wc_enabled)
-+ return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC,
-+ __builtin_return_address(0));
-+ else
-+ return ioremap_nocache(phys_addr, size);
++ smp_mb();
++ /* kick all the CPUs so that they exit out of pm_idle */
++ smp_call_function(do_nothing, NULL, 0, 1);
+}
-+EXPORT_SYMBOL(ioremap_wc);
++EXPORT_SYMBOL_GPL(cpu_idle_wait);
+
- void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
- {
-- return __ioremap(phys_addr, size, IOR_MODE_CACHED);
-+ return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WB,
-+ __builtin_return_address(0));
- }
- EXPORT_SYMBOL(ioremap_cache);
-
+#ifndef CONFIG_XEN
-+static void __iomem *ioremap_default(resource_size_t phys_addr,
-+ unsigned long size)
++/*
++ * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
++ * which can obviate IPI to trigger checking of need_resched.
++ * We execute MONITOR against need_resched and enter optimized wait state
++ * through MWAIT. Whenever someone changes need_resched, we would be woken
++ * up from MWAIT (without an IPI).
++ *
++ * New with Core Duo processors, MWAIT can take some hints based on CPU
++ * capability.
++ */
++void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
+{
-+ unsigned long flags;
-+ void *ret;
-+ int err;
-+
-+ /*
-+ * - WB for WB-able memory and no other conflicting mappings
-+ * - UC_MINUS for non-WB-able memory with no other conflicting mappings
-+ * - Inherit from confliting mappings otherwise
-+ */
-+ err = reserve_memtype(phys_addr, phys_addr + size, -1, &flags);
-+ if (err < 0)
-+ return NULL;
-+
-+ ret = (void *) __ioremap_caller(phys_addr, size, flags,
-+ __builtin_return_address(0));
++ if (!need_resched()) {
++ __monitor((void *)¤t_thread_info()->flags, 0, 0);
++ smp_mb();
++ if (!need_resched())
++ __mwait(ax, cx);
++ }
++}
+
-+ free_memtype(phys_addr, phys_addr + size);
-+ return (void __iomem *)ret;
++/* Default MONITOR/MWAIT with no hints, used for default C1 state */
++static void mwait_idle(void)
++{
++ if (!need_resched()) {
++ __monitor((void *)¤t_thread_info()->flags, 0, 0);
++ smp_mb();
++ if (!need_resched())
++ __sti_mwait(0, 0);
++ else
++ local_irq_enable();
++ } else
++ local_irq_enable();
+}
+#endif
+
- /**
- * iounmap - Free a IO remapping
- * @addr: virtual address from ioremap_*
-@@ -417,15 +544,7 @@ void iounmap(volatile void __iomem *addr
- return;
- }
-
-- if ((p->flags >> 20) != IOR_MODE_CACHED) {
-- unsigned long n = get_vm_area_size(p) >> PAGE_SHIFT;
-- unsigned long mfn = p->phys_addr;
-- unsigned long va = (unsigned long)addr;
--
-- for (; n > 0; n--, mfn++, va += PAGE_SIZE)
-- if (mfn_to_local_pfn(mfn) < max_pfn)
-- set_memory_wb(va, 1);
-- }
-+ free_memtype(p->phys_addr, p->phys_addr + get_vm_area_size(p));
-
- /* Finally remove it */
- o = remove_vm_area((void *)addr);
-@@ -434,6 +553,37 @@ void iounmap(volatile void __iomem *addr
- }
- EXPORT_SYMBOL(iounmap);
-
++/*
++ * On SMP it's slightly faster (but much more power-consuming!)
++ * to poll the ->work.need_resched flag instead of waiting for the
++ * cross-CPU IPI to arrive. Use this option with caution.
++ */
++static void poll_idle(void)
++{
++ local_irq_enable();
++ cpu_relax();
++}
++
+#ifndef CONFIG_XEN
+/*
-+ * Convert a physical pointer to a virtual kernel pointer for /dev/mem
-+ * access
++ * mwait selection logic:
++ *
++ * It depends on the CPU. For AMD CPUs that support MWAIT this is
++ * wrong. Family 0x10 and 0x11 CPUs will enter C1 on HLT. Powersavings
++ * then depend on a clock divisor and current Pstate of the core. If
++ * all cores of a processor are in halt state (C1) the processor can
++ * enter the C1E (C1 enhanced) state. If mwait is used this will never
++ * happen.
++ *
++ * idle=mwait overrides this decision and forces the usage of mwait.
+ */
-+void *xlate_dev_mem_ptr(unsigned long phys)
++static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
+{
-+ void *addr;
-+ unsigned long start = phys & PAGE_MASK;
++ if (force_mwait)
++ return 1;
+
-+ /* If page is RAM, we can use __va. Otherwise ioremap and unmap. */
-+ if (page_is_ram(start >> PAGE_SHIFT))
-+ return __va(phys);
++ if (c->x86_vendor == X86_VENDOR_AMD) {
++ switch(c->x86) {
++ case 0x10:
++ case 0x11:
++ return 0;
++ }
++ }
++ return 1;
++}
++#endif
+
-+ addr = (void *)ioremap_default(start, PAGE_SIZE);
-+ if (addr)
-+ addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK));
++void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
++{
++#ifndef CONFIG_XEN
++ static int selected;
+
-+ return addr;
++ if (selected)
++ return;
++#ifdef CONFIG_X86_SMP
++ if (pm_idle == poll_idle && smp_num_siblings > 1) {
++ printk(KERN_WARNING "WARNING: polling idle and HT enabled,"
++ " performance may degrade.\n");
++ }
++#endif
++ if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) {
++ /*
++ * Skip, if setup has overridden idle.
++ * One CPU supports mwait => All CPUs supports mwait
++ */
++ if (!pm_idle) {
++ printk(KERN_INFO "using mwait in idle threads.\n");
++ pm_idle = mwait_idle;
++ }
++ }
++ selected = 1;
++#endif
+}
+
-+void unxlate_dev_mem_ptr(unsigned long phys, void *addr)
++static int __init idle_setup(char *str)
+{
-+ if (page_is_ram(phys >> PAGE_SHIFT))
-+ return;
++ if (!strcmp(str, "poll")) {
++ printk("using polling idle threads.\n");
++ pm_idle = poll_idle;
++ }
++#ifndef CONFIG_XEN
++ else if (!strcmp(str, "mwait"))
++ force_mwait = 1;
++#endif
++ else
++ return -1;
+
-+ iounmap((void __iomem *)((unsigned long)addr & PAGE_MASK));
-+ return;
++ boot_option_idle_override = 1;
++ return 0;
+}
-+#endif
++early_param("idle", idle_setup);
+
- int __initdata early_ioremap_debug;
-
- static int __init early_ioremap_debug_setup(char *str)
-@@ -445,8 +595,8 @@ static int __init early_ioremap_debug_se
- early_param("early_ioremap_debug", early_ioremap_debug_setup);
-
- static __initdata int after_paging_init;
--static __initdata pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)]
-- __attribute__((aligned(PAGE_SIZE)));
-+static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)]
-+ __section(.bss.page_aligned);
+--- sle11-2009-05-14.orig/arch/x86/kernel/process_32-xen.c 2009-03-16 16:33:40.000000000 +0100
++++ sle11-2009-05-14/arch/x86/kernel/process_32-xen.c 2009-03-16 16:38:05.000000000 +0100
+@@ -36,6 +36,7 @@
+ #include <linux/personality.h>
+ #include <linux/tick.h>
+ #include <linux/percpu.h>
++#include <linux/prctl.h>
- #ifdef CONFIG_X86_32
- static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
-@@ -461,8 +611,8 @@ static inline pmd_t * __init early_iorem
- }
- #else
- #define early_ioremap_pmd early_get_pmd
-+#undef make_lowmem_page_readonly
- #define make_lowmem_page_readonly early_make_page_readonly
--#define make_lowmem_page_writable make_page_writable
+ #include <asm/uaccess.h>
+ #include <asm/pgtable.h>
+@@ -45,7 +46,6 @@
+ #include <asm/processor.h>
+ #include <asm/i387.h>
+ #include <asm/desc.h>
+-#include <asm/vm86.h>
+ #ifdef CONFIG_MATH_EMULATION
+ #include <asm/math_emu.h>
#endif
+@@ -102,16 +102,6 @@ void enable_hlt(void)
- static inline pte_t * __init early_ioremap_pte(unsigned long addr)
-@@ -511,7 +661,7 @@ void __init early_ioremap_clear(void)
- pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
- pmd_clear(pmd);
- make_lowmem_page_writable(bm_pte, XENFEAT_writable_page_tables);
-- /* paravirt_release_pt(__pa(bm_pte) >> PAGE_SHIFT); */
-+ /* paravirt_release_pte(__pa(bm_pte) >> PAGE_SHIFT); */
- __flush_tlb_all();
- }
-
-@@ -652,10 +802,11 @@ void __init early_iounmap(void *addr, un
- unsigned long offset;
- unsigned int nrpages;
- enum fixed_addresses idx;
-- unsigned int nesting;
-+ int nesting;
-
- nesting = --early_ioremap_nested;
-- WARN_ON(nesting < 0);
-+ if (WARN_ON(nesting < 0))
-+ return;
-
- if (early_ioremap_debug) {
- printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr,
---- a/arch/x86/mm/pageattr-xen.c
-+++ b/arch/x86/mm/pageattr-xen.c
-@@ -9,6 +9,8 @@
- #include <linux/slab.h>
- #include <linux/mm.h>
- #include <linux/interrupt.h>
-+#include <linux/seq_file.h>
-+#include <linux/debugfs.h>
+ EXPORT_SYMBOL(enable_hlt);
- #include <asm/e820.h>
- #include <asm/processor.h>
-@@ -17,370 +19,7 @@
- #include <asm/uaccess.h>
- #include <asm/pgalloc.h>
- #include <asm/proto.h>
--#include <asm/mmu_context.h>
--
--#ifndef CONFIG_X86_64
--#define TASK_SIZE64 TASK_SIZE
--#endif
--
--static void _pin_lock(struct mm_struct *mm, int lock) {
-- if (lock)
-- spin_lock(&mm->page_table_lock);
--#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
-- /* While mm->page_table_lock protects us against insertions and
-- * removals of higher level page table pages, it doesn't protect
-- * against updates of pte-s. Such updates, however, require the
-- * pte pages to be in consistent state (unpinned+writable or
-- * pinned+readonly). The pinning and attribute changes, however
-- * cannot be done atomically, which is why such updates must be
-- * prevented from happening concurrently.
-- * Note that no pte lock can ever elsewhere be acquired nesting
-- * with an already acquired one in the same mm, or with the mm's
-- * page_table_lock already acquired, as that would break in the
-- * non-split case (where all these are actually resolving to the
-- * one page_table_lock). Thus acquiring all of them here is not
-- * going to result in dead locks, and the order of acquires
-- * doesn't matter.
-- */
-- {
-- pgd_t *pgd = mm->pgd;
-- unsigned g;
--
-- for (g = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
-- pud_t *pud;
-- unsigned u;
--
-- if (pgd_none(*pgd))
-- continue;
-- pud = pud_offset(pgd, 0);
-- for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
-- pmd_t *pmd;
-- unsigned m;
--
-- if (pud_none(*pud))
-- continue;
-- pmd = pmd_offset(pud, 0);
-- for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
-- spinlock_t *ptl;
--
-- if (pmd_none(*pmd))
-- continue;
-- ptl = pte_lockptr(0, pmd);
-- if (lock)
-- spin_lock(ptl);
-- else
-- spin_unlock(ptl);
-- }
-- }
-- }
-- }
--#endif
-- if (!lock)
-- spin_unlock(&mm->page_table_lock);
--}
--#define pin_lock(mm) _pin_lock(mm, 1)
--#define pin_unlock(mm) _pin_lock(mm, 0)
--
--#define PIN_BATCH sizeof(void *)
--static DEFINE_PER_CPU(multicall_entry_t[PIN_BATCH], pb_mcl);
--
--static inline unsigned int pgd_walk_set_prot(struct page *page, pgprot_t flags,
-- unsigned int cpu, unsigned int seq)
+-/*
+- * On SMP it's slightly faster (but much more power-consuming!)
+- * to poll the ->work.need_resched flag instead of waiting for the
+- * cross-CPU IPI to arrive. Use this option with caution.
+- */
+-static void poll_idle(void)
-{
-- unsigned long pfn = page_to_pfn(page);
--
-- if (PageHighMem(page)) {
-- if (pgprot_val(flags) & _PAGE_RW)
-- ClearPagePinned(page);
-- else
-- SetPagePinned(page);
-- } else {
-- MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
-- (unsigned long)__va(pfn << PAGE_SHIFT),
-- pfn_pte(pfn, flags), 0);
-- if (unlikely(++seq == PIN_BATCH)) {
-- if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
-- PIN_BATCH, NULL)))
-- BUG();
-- seq = 0;
-- }
-- }
--
-- return seq;
+- cpu_relax();
-}
-
--static void pgd_walk(pgd_t *pgd_base, pgprot_t flags)
--{
-- pgd_t *pgd = pgd_base;
-- pud_t *pud;
-- pmd_t *pmd;
-- int g,u,m;
-- unsigned int cpu, seq;
-- multicall_entry_t *mcl;
--
-- if (xen_feature(XENFEAT_auto_translated_physmap))
-- return;
--
-- cpu = get_cpu();
--
-- /*
-- * Cannot iterate up to USER_PTRS_PER_PGD on x86-64 as these pagetables
-- * may not be the 'current' task's pagetables (e.g., current may be
-- * 32-bit, but the pagetables may be for a 64-bit task).
-- * Subtracting 1 from TASK_SIZE64 means the loop limit is correct
-- * regardless of whether TASK_SIZE64 is a multiple of PGDIR_SIZE.
-- */
-- for (g = 0, seq = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
-- if (pgd_none(*pgd))
-- continue;
-- pud = pud_offset(pgd, 0);
-- if (PTRS_PER_PUD > 1) /* not folded */
-- seq = pgd_walk_set_prot(virt_to_page(pud),flags,cpu,seq);
-- for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
-- if (pud_none(*pud))
-- continue;
-- pmd = pmd_offset(pud, 0);
-- if (PTRS_PER_PMD > 1) /* not folded */
-- seq = pgd_walk_set_prot(virt_to_page(pmd),flags,cpu,seq);
-- for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
-- if (pmd_none(*pmd))
-- continue;
-- seq = pgd_walk_set_prot(pmd_page(*pmd),flags,cpu,seq);
-- }
-- }
-- }
+ static void xen_idle(void)
+ {
+ current_thread_info()->status &= ~TS_POLLING;
+@@ -121,20 +111,10 @@ static void xen_idle(void)
+ */
+ smp_mb();
+
+- local_irq_disable();
+- if (!need_resched()) {
+- ktime_t t0, t1;
+- u64 t0n, t1n;
-
-- mcl = per_cpu(pb_mcl, cpu);
--#ifdef CONFIG_X86_64
-- if (unlikely(seq > PIN_BATCH - 2)) {
-- if (unlikely(HYPERVISOR_multicall_check(mcl, seq, NULL)))
-- BUG();
-- seq = 0;
-- }
-- MULTI_update_va_mapping(mcl + seq,
-- (unsigned long)__user_pgd(pgd_base),
-- pfn_pte(virt_to_phys(__user_pgd(pgd_base))>>PAGE_SHIFT, flags),
-- 0);
-- MULTI_update_va_mapping(mcl + seq + 1,
-- (unsigned long)pgd_base,
-- pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
-- UVMF_TLB_FLUSH);
-- if (unlikely(HYPERVISOR_multicall_check(mcl, seq + 2, NULL)))
-- BUG();
--#else
-- if (likely(seq != 0)) {
-- MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
-- (unsigned long)pgd_base,
-- pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
-- UVMF_TLB_FLUSH);
-- if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
-- seq + 1, NULL)))
-- BUG();
-- } else if(HYPERVISOR_update_va_mapping((unsigned long)pgd_base,
-- pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
-- UVMF_TLB_FLUSH))
-- BUG();
--#endif
+- t0 = ktime_get();
+- t0n = ktime_to_ns(t0);
++ if (!need_resched())
+ safe_halt(); /* enables interrupts racelessly */
+- local_irq_disable();
+- t1 = ktime_get();
+- t1n = ktime_to_ns(t1);
+- sched_clock_idle_wakeup_event(t1n - t0n);
+- }
+- local_irq_enable();
++ else
++ local_irq_enable();
+ current_thread_info()->status |= TS_POLLING;
+ }
+ #ifdef CONFIG_APM_MODULE
+@@ -142,7 +122,6 @@ EXPORT_SYMBOL(default_idle);
+ #endif
+
+ #ifdef CONFIG_HOTPLUG_CPU
+-extern cpumask_t cpu_initialized;
+ static inline void play_dead(void)
+ {
+ idle_task_exit();
+@@ -187,6 +166,7 @@ void cpu_idle(void)
+ if (cpu_is_offline(cpu))
+ play_dead();
+
++ local_irq_disable();
+ __get_cpu_var(irq_stat).idle_timestamp = jiffies;
+ idle();
+ }
+@@ -197,44 +177,6 @@ void cpu_idle(void)
+ }
+ }
+
+-static void do_nothing(void *unused)
+-{
+-}
-
-- put_cpu();
+-/*
+- * cpu_idle_wait - Used to ensure that all the CPUs discard old value of
+- * pm_idle and update to new pm_idle value. Required while changing pm_idle
+- * handler on SMP systems.
+- *
+- * Caller must have changed pm_idle to the new value before the call. Old
+- * pm_idle value will not be used by any CPU after the return of this function.
+- */
+-void cpu_idle_wait(void)
+-{
+- smp_mb();
+- /* kick all the CPUs so that they exit out of pm_idle */
+- smp_call_function(do_nothing, NULL, 0, 1);
-}
+-EXPORT_SYMBOL_GPL(cpu_idle_wait);
-
--static void __pgd_pin(pgd_t *pgd)
+-void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
-{
-- pgd_walk(pgd, PAGE_KERNEL_RO);
-- kmap_flush_unused();
-- xen_pgd_pin(__pa(pgd)); /* kernel */
--#ifdef CONFIG_X86_64
-- xen_pgd_pin(__pa(__user_pgd(pgd))); /* user */
--#endif
-- SetPagePinned(virt_to_page(pgd));
-}
-
--static void __pgd_unpin(pgd_t *pgd)
+-static int __init idle_setup(char *str)
-{
-- xen_pgd_unpin(__pa(pgd));
--#ifdef CONFIG_X86_64
-- xen_pgd_unpin(__pa(__user_pgd(pgd)));
--#endif
-- pgd_walk(pgd, PAGE_KERNEL);
-- ClearPagePinned(virt_to_page(pgd));
+- if (!strcmp(str, "poll")) {
+- printk("using polling idle threads.\n");
+- pm_idle = poll_idle;
+- }
+- else
+- return -1;
+-
+- boot_option_idle_override = 1;
+- return 0;
-}
+-early_param("idle", idle_setup);
-
--void pgd_test_and_unpin(pgd_t *pgd)
+ void __show_registers(struct pt_regs *regs, int all)
+ {
+ unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
+@@ -260,7 +202,7 @@ void __show_registers(struct pt_regs *re
+ init_utsname()->version);
+
+ printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n",
+- 0xffff & regs->cs, regs->ip, regs->flags,
++ (u16)regs->cs, regs->ip, regs->flags,
+ smp_processor_id());
+ print_symbol("EIP is at %s\n", regs->ip);
+
+@@ -269,8 +211,7 @@ void __show_registers(struct pt_regs *re
+ printk("ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n",
+ regs->si, regs->di, regs->bp, sp);
+ printk(" DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n",
+- regs->ds & 0xffff, regs->es & 0xffff,
+- regs->fs & 0xffff, gs, ss);
++ (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss);
+
+ if (!all)
+ return;
+@@ -367,6 +308,7 @@ void flush_thread(void)
+ /*
+ * Forget coprocessor state..
+ */
++ tsk->fpu_counter = 0;
+ clear_fpu(tsk);
+ clear_used_math();
+ }
+@@ -437,11 +379,30 @@ int copy_thread(int nr, unsigned long cl
+ return err;
+ }
+
+-#ifdef CONFIG_SECCOMP
++void
++start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
++{
++ __asm__("movl %0, %%gs" :: "r"(0));
++ regs->fs = 0;
++ set_fs(USER_DS);
++ regs->ds = __USER_DS;
++ regs->es = __USER_DS;
++ regs->ss = __USER_DS;
++ regs->cs = __USER_CS;
++ regs->ip = new_ip;
++ regs->sp = new_sp;
++ /*
++ * Free the old FP and other extended state
++ */
++ free_thread_xstate(current);
++}
++EXPORT_SYMBOL_GPL(start_thread);
++
+ static void hard_disable_TSC(void)
+ {
+ write_cr4(read_cr4() | X86_CR4_TSD);
+ }
++
+ void disable_TSC(void)
+ {
+ preempt_disable();
+@@ -453,11 +414,47 @@ void disable_TSC(void)
+ hard_disable_TSC();
+ preempt_enable();
+ }
++
+ static void hard_enable_TSC(void)
+ {
+ write_cr4(read_cr4() & ~X86_CR4_TSD);
+ }
+-#endif /* CONFIG_SECCOMP */
++
++static void enable_TSC(void)
++{
++ preempt_disable();
++ if (test_and_clear_thread_flag(TIF_NOTSC))
++ /*
++ * Must flip the CPU state synchronously with
++ * TIF_NOTSC in the current running context.
++ */
++ hard_enable_TSC();
++ preempt_enable();
++}
++
++int get_tsc_mode(unsigned long adr)
++{
++ unsigned int val;
++
++ if (test_thread_flag(TIF_NOTSC))
++ val = PR_TSC_SIGSEGV;
++ else
++ val = PR_TSC_ENABLE;
++
++ return put_user(val, (unsigned int __user *)adr);
++}
++
++int set_tsc_mode(unsigned int val)
++{
++ if (val == PR_TSC_SIGSEGV)
++ disable_TSC();
++ else if (val == PR_TSC_ENABLE)
++ enable_TSC();
++ else
++ return -EINVAL;
++
++ return 0;
++}
+
+ static noinline void
+ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p)
+@@ -473,12 +470,12 @@ __switch_to_xtra(struct task_struct *pre
+ /* we clear debugctl to make sure DS
+ * is not in use when we change it */
+ debugctl = 0;
+- wrmsrl(MSR_IA32_DEBUGCTLMSR, 0);
++ update_debugctlmsr(0);
+ wrmsr(MSR_IA32_DS_AREA, next->ds_area_msr, 0);
+ }
+
+ if (next->debugctlmsr != debugctl)
+- wrmsr(MSR_IA32_DEBUGCTLMSR, next->debugctlmsr, 0);
++ update_debugctlmsr(next->debugctlmsr);
+
+ if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
+ set_debugreg(next->debugreg0, 0);
+@@ -490,7 +487,6 @@ __switch_to_xtra(struct task_struct *pre
+ set_debugreg(next->debugreg7, 7);
+ }
+
+-#ifdef CONFIG_SECCOMP
+ if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
+ test_tsk_thread_flag(next_p, TIF_NOTSC)) {
+ /* prev and next are different */
+@@ -499,7 +495,6 @@ __switch_to_xtra(struct task_struct *pre
+ else
+ hard_enable_TSC();
+ }
+-#endif
+
+ #ifdef X86_BTS
+ if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
+@@ -637,7 +632,7 @@ struct task_struct * __switch_to(struct
+
+ /* we're going to use this soon, after a few expensive things */
+ if (next_p->fpu_counter > 5)
+- prefetch(&next->i387.fxsave);
++ prefetch(next->xstate);
+
+ /*
+ * Now maybe handle debug registers
+@@ -658,8 +653,11 @@ struct task_struct * __switch_to(struct
+ /* If the task has used fpu the last 5 timeslices, just do a full
+ * restore of the math state immediately to avoid the trap; the
+ * chances of needing FPU soon are obviously high now
++ *
++ * tsk_used_math() checks prevent calling math_state_restore(),
++ * which can sleep in the case of !tsk_used_math()
+ */
+- if (next_p->fpu_counter > 5)
++ if (tsk_used_math(next_p) && next_p->fpu_counter > 5)
+ math_state_restore();
+
+ /*
+--- sle11-2009-05-14.orig/arch/x86/kernel/process_64-xen.c 2009-03-16 16:33:40.000000000 +0100
++++ sle11-2009-05-14/arch/x86/kernel/process_64-xen.c 2009-03-16 16:38:05.000000000 +0100
+@@ -39,6 +39,7 @@
+ #include <linux/kprobes.h>
+ #include <linux/kdebug.h>
+ #include <linux/tick.h>
++#include <linux/prctl.h>
+
+ #include <asm/uaccess.h>
+ #include <asm/pgtable.h>
+@@ -102,17 +103,6 @@ void exit_idle(void)
+ __exit_idle();
+ }
+
+-/*
+- * On SMP it's slightly faster (but much more power-consuming!)
+- * to poll the ->need_resched flag instead of waiting for the
+- * cross-CPU IPI to arrive. Use this option with caution.
+- */
+-static void poll_idle(void)
-{
-- if (PagePinned(virt_to_page(pgd)))
-- __pgd_unpin(pgd);
+- local_irq_enable();
+- cpu_relax();
-}
-
--void mm_pin(struct mm_struct *mm)
--{
-- if (xen_feature(XENFEAT_writable_page_tables))
-- return;
--
-- pin_lock(mm);
-- __pgd_pin(mm->pgd);
-- pin_unlock(mm);
--}
+ static void xen_idle(void)
+ {
+ current_thread_info()->status &= ~TS_POLLING;
+@@ -121,20 +111,10 @@ static void xen_idle(void)
+ * test NEED_RESCHED:
+ */
+ smp_mb();
+- local_irq_disable();
+- if (!need_resched()) {
+- ktime_t t0, t1;
+- u64 t0n, t1n;
-
--void mm_unpin(struct mm_struct *mm)
+- t0 = ktime_get();
+- t0n = ktime_to_ns(t0);
++ if (!need_resched())
+ safe_halt(); /* enables interrupts racelessly */
+- local_irq_disable();
+- t1 = ktime_get();
+- t1n = ktime_to_ns(t1);
+- sched_clock_idle_wakeup_event(t1n - t0n);
+- }
+- local_irq_enable();
++ else
++ local_irq_enable();
+ current_thread_info()->status |= TS_POLLING;
+ }
+
+@@ -195,45 +175,6 @@ void cpu_idle(void)
+ }
+ }
+
+-static void do_nothing(void *unused)
-{
-- if (xen_feature(XENFEAT_writable_page_tables))
-- return;
--
-- pin_lock(mm);
-- __pgd_unpin(mm->pgd);
-- pin_unlock(mm);
-}
-
--void mm_pin_all(void)
+-/*
+- * cpu_idle_wait - Used to ensure that all the CPUs discard old value of
+- * pm_idle and update to new pm_idle value. Required while changing pm_idle
+- * handler on SMP systems.
+- *
+- * Caller must have changed pm_idle to the new value before the call. Old
+- * pm_idle value will not be used by any CPU after the return of this function.
+- */
+-void cpu_idle_wait(void)
-{
-- struct page *page;
-- unsigned long flags;
--
-- if (xen_feature(XENFEAT_writable_page_tables))
-- return;
--
-- /*
-- * Allow uninterrupted access to the pgd_list. Also protects
-- * __pgd_pin() by disabling preemption.
-- * All other CPUs must be at a safe point (e.g., in stop_machine
-- * or offlined entirely).
-- */
-- spin_lock_irqsave(&pgd_lock, flags);
-- list_for_each_entry(page, &pgd_list, lru) {
-- if (!PagePinned(page))
-- __pgd_pin((pgd_t *)page_address(page));
-- }
-- spin_unlock_irqrestore(&pgd_lock, flags);
+- smp_mb();
+- /* kick all the CPUs so that they exit out of pm_idle */
+- smp_call_function(do_nothing, NULL, 0, 1);
-}
+-EXPORT_SYMBOL_GPL(cpu_idle_wait);
-
--void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
+-void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
-{
-- if (!PagePinned(virt_to_page(mm->pgd)))
-- mm_pin(mm);
-}
-
--void arch_exit_mmap(struct mm_struct *mm)
+-static int __init idle_setup(char *str)
-{
-- struct task_struct *tsk = current;
--
-- task_lock(tsk);
--
-- /*
-- * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
-- * *much* faster this way, as no tlb flushes means bigger wrpt batches.
-- */
-- if (tsk->active_mm == mm) {
-- tsk->active_mm = &init_mm;
-- atomic_inc(&init_mm.mm_count);
--
-- switch_mm(mm, &init_mm, tsk);
--
-- atomic_dec(&mm->mm_count);
-- BUG_ON(atomic_read(&mm->mm_count) == 0);
-- }
--
-- task_unlock(tsk);
--
-- if (PagePinned(virt_to_page(mm->pgd))
-- && atomic_read(&mm->mm_count) == 1
-- && !mm->context.has_foreign_mappings)
-- mm_unpin(mm);
--}
+- if (!strcmp(str, "poll")) {
+- printk("using polling idle threads.\n");
+- pm_idle = poll_idle;
+- } else if (!strcmp(str, "mwait"))
+- force_mwait = 1;
+- else
+- return -1;
-
--static void _pte_free(struct page *page, unsigned int order)
--{
-- BUG_ON(order);
-- __pte_free(page);
+- boot_option_idle_override = 1;
+- return 0;
-}
+-early_param("idle", idle_setup);
-
--pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
+ /* Prints also some state that isn't saved in the pt_regs */
+ void __show_regs(struct pt_regs * regs)
+ {
+@@ -360,6 +301,7 @@ void flush_thread(void)
+ /*
+ * Forget coprocessor state..
+ */
++ tsk->fpu_counter = 0;
+ clear_fpu(tsk);
+ clear_used_math();
+ }
+@@ -472,6 +414,83 @@ out:
+ return err;
+ }
+
++void
++start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
++{
++ asm volatile("movl %0, %%fs; movl %0, %%es; movl %0, %%ds" :: "r"(0));
++ load_gs_index(0);
++ regs->ip = new_ip;
++ regs->sp = new_sp;
++ write_pda(oldrsp, new_sp);
++ regs->cs = __USER_CS;
++ regs->ss = __USER_DS;
++ regs->flags = 0x200;
++ set_fs(USER_DS);
++ /*
++ * Free the old FP and other extended state
++ */
++ free_thread_xstate(current);
++}
++EXPORT_SYMBOL_GPL(start_thread);
++
++static void hard_disable_TSC(void)
++{
++ write_cr4(read_cr4() | X86_CR4_TSD);
++}
++
++void disable_TSC(void)
++{
++ preempt_disable();
++ if (!test_and_set_thread_flag(TIF_NOTSC))
++ /*
++ * Must flip the CPU state synchronously with
++ * TIF_NOTSC in the current running context.
++ */
++ hard_disable_TSC();
++ preempt_enable();
++}
++
++static void hard_enable_TSC(void)
++{
++ write_cr4(read_cr4() & ~X86_CR4_TSD);
++}
++
++static void enable_TSC(void)
++{
++ preempt_disable();
++ if (test_and_clear_thread_flag(TIF_NOTSC))
++ /*
++ * Must flip the CPU state synchronously with
++ * TIF_NOTSC in the current running context.
++ */
++ hard_enable_TSC();
++ preempt_enable();
++}
++
++int get_tsc_mode(unsigned long adr)
++{
++ unsigned int val;
++
++ if (test_thread_flag(TIF_NOTSC))
++ val = PR_TSC_SIGSEGV;
++ else
++ val = PR_TSC_ENABLE;
++
++ return put_user(val, (unsigned int __user *)adr);
++}
++
++int set_tsc_mode(unsigned int val)
++{
++ if (val == PR_TSC_SIGSEGV)
++ disable_TSC();
++ else if (val == PR_TSC_ENABLE)
++ enable_TSC();
++ else
++ return -EINVAL;
++
++ return 0;
++}
++
+ /*
+ * This special macro can be used to load a debugging register
+ */
+@@ -491,12 +510,12 @@ static inline void __switch_to_xtra(stru
+ /* we clear debugctl to make sure DS
+ * is not in use when we change it */
+ debugctl = 0;
+- wrmsrl(MSR_IA32_DEBUGCTLMSR, 0);
++ update_debugctlmsr(0);
+ wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr);
+ }
+
+ if (next->debugctlmsr != debugctl)
+- wrmsrl(MSR_IA32_DEBUGCTLMSR, next->debugctlmsr);
++ update_debugctlmsr(next->debugctlmsr);
+
+ if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
+ loaddebug(next, 0);
+@@ -508,6 +527,15 @@ static inline void __switch_to_xtra(stru
+ loaddebug(next, 7);
+ }
+
++ if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
++ test_tsk_thread_flag(next_p, TIF_NOTSC)) {
++ /* prev and next are different */
++ if (test_tsk_thread_flag(next_p, TIF_NOTSC))
++ hard_disable_TSC();
++ else
++ hard_enable_TSC();
++ }
++
+ #ifdef X86_BTS
+ if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
+ ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
+@@ -547,7 +575,7 @@ __switch_to(struct task_struct *prev_p,
+
+ /* we're going to use this soon, after a few expensive things */
+ if (next_p->fpu_counter>5)
+- prefetch(&next->i387.fxsave);
++ prefetch(next->xstate);
+
+ /*
+ * This is basically '__unlazy_fpu', except that we queue a
+@@ -680,8 +708,11 @@ __switch_to(struct task_struct *prev_p,
+ /* If the task has used fpu the last 5 timeslices, just do a full
+ * restore of the math state immediately to avoid the trap; the
+ * chances of needing FPU soon are obviously high now
++ *
++ * tsk_used_math() checks prevent calling math_state_restore(),
++ * which can sleep in the case of !tsk_used_math()
+ */
+- if (next_p->fpu_counter>5)
++ if (tsk_used_math(next_p) && next_p->fpu_counter > 5)
+ math_state_restore();
+ return prev_p;
+ }
+--- /dev/null 1970-01-01 00:00:00.000000000 +0000
++++ sle11-2009-05-14/arch/x86/kernel/setup-xen.c 2009-03-16 16:38:05.000000000 +0100
+@@ -0,0 +1,141 @@
++#include <linux/kernel.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/bootmem.h>
++#include <linux/percpu.h>
++#include <asm/smp.h>
++#include <asm/percpu.h>
++#include <asm/sections.h>
++#include <asm/processor.h>
++#include <asm/setup.h>
++#include <asm/topology.h>
++#include <asm/mpspec.h>
++#include <asm/apicdef.h>
++
++#ifdef CONFIG_X86_LOCAL_APIC
++unsigned int num_processors;
++unsigned disabled_cpus __cpuinitdata;
++/* Processor that is doing the boot up */
++unsigned int boot_cpu_physical_apicid = -1U;
++EXPORT_SYMBOL(boot_cpu_physical_apicid);
++
++DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID;
++EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid);
++
++/* Bitmask of physically existing CPUs */
++physid_mask_t phys_cpu_present_map;
++#endif
++
++#if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_X86_SMP)
++/*
++ * Copy data used in early init routines from the initial arrays to the
++ * per cpu data areas. These arrays then become expendable and the
++ * *_early_ptr's are zeroed indicating that the static arrays are gone.
++ */
++static void __init setup_per_cpu_maps(void)
++{
++#ifndef CONFIG_XEN
++ int cpu;
++
++ for_each_possible_cpu(cpu) {
++ per_cpu(x86_cpu_to_apicid, cpu) = x86_cpu_to_apicid_init[cpu];
++ per_cpu(x86_bios_cpu_apicid, cpu) =
++ x86_bios_cpu_apicid_init[cpu];
++#ifdef CONFIG_NUMA
++ per_cpu(x86_cpu_to_node_map, cpu) =
++ x86_cpu_to_node_map_init[cpu];
++#endif
++ }
++
++ /* indicate the early static arrays will soon be gone */
++ x86_cpu_to_apicid_early_ptr = NULL;
++ x86_bios_cpu_apicid_early_ptr = NULL;
++#ifdef CONFIG_NUMA
++ x86_cpu_to_node_map_early_ptr = NULL;
++#endif
++#endif
++}
++
++#ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP
++cpumask_t *cpumask_of_cpu_map __read_mostly;
++EXPORT_SYMBOL(cpumask_of_cpu_map);
++
++/* requires nr_cpu_ids to be initialized */
++static void __init setup_cpumask_of_cpu(void)
++{
++ int i;
++
++ /* alloc_bootmem zeroes memory */
++ cpumask_of_cpu_map = alloc_bootmem_low(sizeof(cpumask_t) * nr_cpu_ids);
++ for (i = 0; i < nr_cpu_ids; i++)
++ cpu_set(i, cpumask_of_cpu_map[i]);
++}
++#else
++static inline void setup_cpumask_of_cpu(void) { }
++#endif
++
++#ifdef CONFIG_X86_32
++/*
++ * Great future not-so-futuristic plan: make i386 and x86_64 do it
++ * the same way
++ */
++unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
++EXPORT_SYMBOL(__per_cpu_offset);
++#endif
++
++/*
++ * Great future plan:
++ * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
++ * Always point %gs to its beginning
++ */
++void __init setup_per_cpu_areas(void)
++{
++ int i, highest_cpu = 0;
++ unsigned long size;
++
++#ifdef CONFIG_HOTPLUG_CPU
++ prefill_possible_map();
++#endif
++
++ /* Copy section for each CPU (we discard the original) */
++ size = PERCPU_ENOUGH_ROOM;
++ printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n",
++ size);
++
++ for_each_possible_cpu(i) {
++ char *ptr;
++#ifndef CONFIG_NEED_MULTIPLE_NODES
++ ptr = alloc_bootmem_pages(size);
++#else
++ int node = early_cpu_to_node(i);
++ if (!node_online(node) || !NODE_DATA(node)) {
++ ptr = alloc_bootmem_pages(size);
++ printk(KERN_INFO
++ "cpu %d has no node or node-local memory\n", i);
++ }
++ else
++ ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
++#endif
++ if (!ptr)
++ panic("Cannot allocate cpu data for CPU %d\n", i);
++#ifdef CONFIG_X86_64
++ cpu_pda(i)->data_offset = ptr - __per_cpu_start;
++#else
++ __per_cpu_offset[i] = ptr - __per_cpu_start;
++#endif
++ memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
++
++ highest_cpu = i;
++ }
++
++ nr_cpu_ids = highest_cpu + 1;
++ printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d\n", NR_CPUS, nr_cpu_ids);
++
++ /* Setup percpu data maps */
++ setup_per_cpu_maps();
++
++ /* Setup cpumask_of_cpu map */
++ setup_cpumask_of_cpu();
++}
++
++#endif
+--- sle11-2009-05-14.orig/arch/x86/kernel/setup64-xen.c 2009-03-16 16:33:40.000000000 +0100
++++ sle11-2009-05-14/arch/x86/kernel/setup64-xen.c 2009-03-16 16:38:05.000000000 +0100
+@@ -15,6 +15,7 @@
+ #include <linux/bootmem.h>
+ #include <linux/bitops.h>
+ #include <linux/module.h>
++#include <linux/kgdb.h>
+ #include <asm/pda.h>
+ #include <asm/pgtable.h>
+ #include <asm/processor.h>
+@@ -27,6 +28,7 @@
+ #include <asm/proto.h>
+ #include <asm/sections.h>
+ #include <asm/setup.h>
++#include <asm/genapic.h>
+ #ifdef CONFIG_XEN
+ #include <asm/hypervisor.h>
+ #endif
+@@ -81,8 +83,8 @@ int force_personality32 = 0;
+ Control non executable heap for 32bit processes.
+ To control the stack too use noexec=off
+
+-on PROT_READ does not imply PROT_EXEC for 32bit processes
+-off PROT_READ implies PROT_EXEC (default)
++on PROT_READ does not imply PROT_EXEC for 32bit processes (default)
++off PROT_READ implies PROT_EXEC
+ */
+ static int __init nonx32_setup(char *str)
+ {
+@@ -94,85 +96,6 @@ static int __init nonx32_setup(char *str
+ }
+ __setup("noexec32=", nonx32_setup);
+
+-/*
+- * Copy data used in early init routines from the initial arrays to the
+- * per cpu data areas. These arrays then become expendable and the
+- * *_early_ptr's are zeroed indicating that the static arrays are gone.
+- */
+-static void __init setup_per_cpu_maps(void)
-{
-- struct page *pte;
+-#ifndef CONFIG_XEN
+- int cpu;
-
--#ifdef CONFIG_HIGHPTE
-- pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
--#else
-- pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
+- for_each_possible_cpu(cpu) {
+-#ifdef CONFIG_SMP
+- if (per_cpu_offset(cpu)) {
-#endif
-- if (pte) {
-- pgtable_page_ctor(pte);
-- SetPageForeign(pte, _pte_free);
-- init_page_count(pte);
-- }
-- return pte;
--}
--
--void __pte_free(pgtable_t pte)
--{
-- if (!PageHighMem(pte)) {
-- unsigned long va = (unsigned long)page_address(pte);
-- unsigned int level;
-- pte_t *ptep = lookup_address(va, &level);
--
-- BUG_ON(!ptep || level != PG_LEVEL_4K || !pte_present(*ptep));
-- if (!pte_write(*ptep)
-- && HYPERVISOR_update_va_mapping(va,
-- mk_pte(pte, PAGE_KERNEL),
-- 0))
-- BUG();
-- } else
--#ifdef CONFIG_HIGHPTE
-- ClearPagePinned(pte);
--#else
-- BUG();
+- per_cpu(x86_cpu_to_apicid, cpu) =
+- x86_cpu_to_apicid_init[cpu];
+- per_cpu(x86_bios_cpu_apicid, cpu) =
+- x86_bios_cpu_apicid_init[cpu];
+-#ifdef CONFIG_NUMA
+- per_cpu(x86_cpu_to_node_map, cpu) =
+- x86_cpu_to_node_map_init[cpu];
-#endif
+-#ifdef CONFIG_SMP
+- }
+- else
+- printk(KERN_NOTICE "per_cpu_offset zero for cpu %d\n",
+- cpu);
+-#endif
+- }
-
-- ClearPageForeign(pte);
-- init_page_count(pte);
-- pgtable_page_dtor(pte);
-- __free_page(pte);
--}
--
--#if PAGETABLE_LEVELS >= 3
--static void _pmd_free(struct page *page, unsigned int order)
--{
-- BUG_ON(order);
-- __pmd_free(page);
+- /* indicate the early static arrays will soon be gone */
+- x86_cpu_to_apicid_early_ptr = NULL;
+- x86_bios_cpu_apicid_early_ptr = NULL;
+-#ifdef CONFIG_NUMA
+- x86_cpu_to_node_map_early_ptr = NULL;
+-#endif
+-#endif
-}
-
--pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address)
--{
-- struct page *pmd;
+-/*
+- * Great future plan:
+- * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
+- * Always point %gs to its beginning
+- */
+-void __init setup_per_cpu_areas(void)
+-{
+- int i;
+- unsigned long size;
-
-- pmd = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
-- if (!pmd)
-- return NULL;
-- SetPageForeign(pmd, _pmd_free);
-- init_page_count(pmd);
-- return page_address(pmd);
--}
+-#ifdef CONFIG_HOTPLUG_CPU
+- prefill_possible_map();
+-#endif
-
--void __pmd_free(pgtable_t pmd)
--{
-- unsigned long va = (unsigned long)page_address(pmd);
-- unsigned int level;
-- pte_t *ptep = lookup_address(va, &level);
+- /* Copy section for each CPU (we discard the original) */
+- size = PERCPU_ENOUGH_ROOM;
-
-- BUG_ON(!ptep || level != PG_LEVEL_4K || !pte_present(*ptep));
-- if (!pte_write(*ptep)
-- && HYPERVISOR_update_va_mapping(va, mk_pte(pmd, PAGE_KERNEL), 0))
-- BUG();
+- printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n", size);
+- for_each_cpu_mask (i, cpu_possible_map) {
+- char *ptr;
+-#ifndef CONFIG_NEED_MULTIPLE_NODES
+- ptr = alloc_bootmem_pages(size);
+-#else
+- int node = early_cpu_to_node(i);
-
-- ClearPageForeign(pmd);
-- init_page_count(pmd);
-- __free_page(pmd);
--}
+- if (!node_online(node) || !NODE_DATA(node))
+- ptr = alloc_bootmem_pages(size);
+- else
+- ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
-#endif
+- if (!ptr)
+- panic("Cannot allocate cpu data for CPU %d\n", i);
+- cpu_pda(i)->data_offset = ptr - __per_cpu_start;
+- memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
+- }
+-
+- /* setup percpu data maps early */
+- setup_per_cpu_maps();
+-}
+-
+ #ifdef CONFIG_XEN
+ static void __init_refok switch_pt(int cpu)
+ {
+@@ -410,6 +333,17 @@ void __cpuinit cpu_init (void)
+ #endif
+ load_LDT(&init_mm.context);
+
++#ifdef CONFIG_KGDB
++ /*
++ * If the kgdb is connected no debug regs should be altered. This
++ * is only applicable when KGDB and a KGDB I/O module are built
++ * into the kernel and you are using early debugging with
++ * kgdbwait. KGDB will control the kernel HW breakpoint registers.
++ */
++ if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
++ arch_kgdb_ops.correct_hw_break();
++ else {
++#endif
+ /*
+ * Clear all 6 debug registers:
+ */
+@@ -420,10 +354,17 @@ void __cpuinit cpu_init (void)
+ set_debugreg(0UL, 3);
+ set_debugreg(0UL, 6);
+ set_debugreg(0UL, 7);
++#ifdef CONFIG_KGDB
++ /* If the kgdb is connected no debug regs should be altered. */
++ }
++#endif
+
+ fpu_init();
+
+ asm ("pushfq; popq %0" : "=rm" (kernel_eflags));
+ if (raw_irqs_disabled())
+ kernel_eflags &= ~X86_EFLAGS_IF;
++
++ if (is_uv_system())
++ uv_cpu_init();
+ }
+--- sle11-2009-05-14.orig/arch/x86/kernel/setup_32-xen.c 2009-03-16 16:33:40.000000000 +0100
++++ sle11-2009-05-14/arch/x86/kernel/setup_32-xen.c 2009-03-16 16:38:05.000000000 +0100
+@@ -39,6 +39,7 @@
+ #include <linux/efi.h>
+ #include <linux/init.h>
+ #include <linux/edd.h>
++#include <linux/iscsi_ibft.h>
+ #include <linux/nodemask.h>
+ #include <linux/kernel.h>
+ #include <linux/percpu.h>
+@@ -49,6 +50,7 @@
+ #include <linux/pfn.h>
+ #include <linux/pci.h>
+ #include <linux/init_ohci1394_dma.h>
++#include <linux/kvm_para.h>
+
+ #include <video/edid.h>
+
+@@ -70,8 +72,9 @@
+ #include <xen/firmware.h>
+ #include <xen/xencons.h>
+ #include <setup_arch.h>
+-#include <bios_ebda.h>
++#include <asm/bios_ebda.h>
+ #include <asm/cacheflush.h>
++#include <asm/processor.h>
+
+ #ifdef CONFIG_XEN
+ #include <xen/interface/kexec.h>
+@@ -136,7 +139,12 @@ static struct resource standard_io_resou
+ }, {
+ .name = "keyboard",
+ .start = 0x0060,
+- .end = 0x006f,
++ .end = 0x0060,
++ .flags = IORESOURCE_BUSY | IORESOURCE_IO
++}, {
++ .name = "keyboard",
++ .start = 0x0064,
++ .end = 0x0064,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
+ }, {
+ .name = "dma page reg",
+@@ -166,6 +174,8 @@ struct cpuinfo_x86 new_cpu_data __cpuini
+ struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
+ EXPORT_SYMBOL(boot_cpu_data);
+
++unsigned int def_to_bigsmp;
++
+ #ifndef CONFIG_X86_PAE
+ unsigned long mmu_cr4_features;
+ #else
+@@ -204,7 +214,7 @@ EXPORT_SYMBOL(ist_info);
+ extern void early_cpu_init(void);
+ extern int root_mountflags;
+
+-unsigned long saved_videomode;
++unsigned long saved_video_mode;
+
+ #define RAMDISK_IMAGE_START_MASK 0x07FF
+ #define RAMDISK_PROMPT_FLAG 0x8000
+@@ -259,7 +269,7 @@ static inline void copy_edd(void)
+ }
+ #endif
+
+-int __initdata user_defined_memmap = 0;
++int __initdata user_defined_memmap;
+
+ /*
+ * "mem=nopentium" disables the 4MB page tables.
+@@ -420,20 +430,59 @@ unsigned long __init find_max_low_pfn(vo
+ }
+
+ #ifndef CONFIG_XEN
++#define BIOS_LOWMEM_KILOBYTES 0x413
++
+ /*
+- * workaround for Dell systems that neglect to reserve EBDA
++ * The BIOS places the EBDA/XBDA at the top of conventional
++ * memory, and usually decreases the reported amount of
++ * conventional memory (int 0x12) too. This also contains a
++ * workaround for Dell systems that neglect to reserve EBDA.
++ * The same workaround also avoids a problem with the AMD768MPX
++ * chipset: reserve a page before VGA to prevent PCI prefetch
++ * into it (errata #56). Usually the page is reserved anyways,
++ * unless you have no PS/2 mouse plugged in.
+ */
+ static void __init reserve_ebda_region(void)
+ {
+- unsigned int addr;
+- addr = get_bios_ebda();
+- if (addr)
+- reserve_bootmem(addr, PAGE_SIZE, BOOTMEM_DEFAULT);
++ unsigned int lowmem, ebda_addr;
++
++ /* To determine the position of the EBDA and the */
++ /* end of conventional memory, we need to look at */
++ /* the BIOS data area. In a paravirtual environment */
++ /* that area is absent. We'll just have to assume */
++ /* that the paravirt case can handle memory setup */
++ /* correctly, without our help. */
++ if (paravirt_enabled())
++ return;
++
++ /* end of low (conventional) memory */
++ lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
++ lowmem <<= 10;
++
++ /* start of EBDA area */
++ ebda_addr = get_bios_ebda();
++
++ /* Fixup: bios puts an EBDA in the top 64K segment */
++ /* of conventional memory, but does not adjust lowmem. */
++ if ((lowmem - ebda_addr) <= 0x10000)
++ lowmem = ebda_addr;
++
++ /* Fixup: bios does not report an EBDA at all. */
++ /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
++ if ((ebda_addr == 0) && (lowmem >= 0x9f000))
++ lowmem = 0x9f000;
++
++ /* Paranoia: should never happen, but... */
++ if ((lowmem == 0) || (lowmem >= 0x100000))
++ lowmem = 0x9f000;
++
++ /* reserve all memory between lowmem and the 1MB mark */
++ reserve_bootmem(lowmem, 0x100000 - lowmem, BOOTMEM_DEFAULT);
+ }
+ #endif
+
+ #ifndef CONFIG_NEED_MULTIPLE_NODES
+-void __init setup_bootmem_allocator(void);
++static void __init setup_bootmem_allocator(void);
+ static unsigned long __init setup_memory(void)
+ {
+ /*
+@@ -469,7 +518,7 @@ static unsigned long __init setup_memory
+ return max_low_pfn;
+ }
+
+-void __init zone_sizes_init(void)
++static void __init zone_sizes_init(void)
+ {
+ unsigned long max_zone_pfns[MAX_NR_ZONES];
+ memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
+@@ -521,10 +570,16 @@ static void __init reserve_crashkernel(v
+ (unsigned long)(crash_size >> 20),
+ (unsigned long)(crash_base >> 20),
+ (unsigned long)(total_mem >> 20));
++
++ if (reserve_bootmem(crash_base, crash_size,
++ BOOTMEM_EXCLUSIVE) < 0) {
++ printk(KERN_INFO "crashkernel reservation "
++ "failed - memory is in use\n");
++ return;
++ }
++
+ crashk_res.start = crash_base;
+ crashk_res.end = crash_base + crash_size - 1;
+- reserve_bootmem(crash_base, crash_size,
+- BOOTMEM_DEFAULT);
+ } else
+ printk(KERN_INFO "crashkernel reservation failed - "
+ "you have to specify a base address\n");
+@@ -658,16 +713,9 @@ void __init setup_bootmem_allocator(void
+ */
+ reserve_bootmem(0, PAGE_SIZE, BOOTMEM_DEFAULT);
+
+- /* reserve EBDA region, it's a 4K region */
++ /* reserve EBDA region */
+ reserve_ebda_region();
+
+- /* could be an AMD 768MPX chipset. Reserve a page before VGA to prevent
+- PCI prefetch into it (errata #56). Usually the page is reserved anyways,
+- unless you have no PS/2 mouse plugged in. */
+- if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
+- boot_cpu_data.x86 == 6)
+- reserve_bootmem(0xa0000 - 4096, 4096, BOOTMEM_DEFAULT);
-
--/* blktap and gntdev need this, as otherwise they would implicitly (and
-- * needlessly, as they never use it) reference init_mm. */
--pte_t xen_ptep_get_and_clear_full(struct vm_area_struct *vma,
-- unsigned long addr, pte_t *ptep, int full)
--{
-- return ptep_get_and_clear_full(vma->vm_mm, addr, ptep, full);
--}
--EXPORT_SYMBOL_GPL(xen_ptep_get_and_clear_full);
+ #ifdef CONFIG_SMP
+ /*
+ * But first pinch a few for the stack/trampoline stuff
+@@ -689,6 +737,8 @@ void __init setup_bootmem_allocator(void
+ #endif
+ numa_kva_reserve();
+ reserve_crashkernel();
++
++ reserve_ibft_region();
+ }
+
+ /*
+@@ -724,6 +774,18 @@ char * __init __attribute__((weak)) memo
+ return machine_specific_memory_setup();
+ }
+
++#ifdef CONFIG_NUMA
++/*
++ * In the golden day, when everything among i386 and x86_64 will be
++ * integrated, this will not live here
++ */
++void *x86_cpu_to_node_map_early_ptr;
++int x86_cpu_to_node_map_init[NR_CPUS] = {
++ [0 ... NR_CPUS-1] = NUMA_NO_NODE
++};
++DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE;
++#endif
++
+ /*
+ * Determine if we were loaded by an EFI loader. If so, then we have also been
+ * passed the efi memmap, systab, etc., so we should use these data structures
+@@ -773,7 +835,7 @@ void __init setup_arch(char **cmdline_p)
+ copy_edid();
+ apm_info.bios = boot_params.apm_bios_info;
+ ist_info = boot_params.ist_info;
+- saved_videomode = boot_params.hdr.vid_mode;
++ saved_video_mode = boot_params.hdr.vid_mode;
+ if( boot_params.sys_desc_table.length != 0 ) {
+ set_mca_bus(boot_params.sys_desc_table.table[3] & 0x2);
+ machine_id = boot_params.sys_desc_table.table[0];
+@@ -840,15 +902,19 @@ void __init setup_arch(char **cmdline_p)
+ efi_init();
+
+ /* update e820 for memory not covered by WB MTRRs */
+- find_max_pfn();
++ propagate_e820_map();
+ mtrr_bp_init();
+ #ifndef CONFIG_XEN
+ if (mtrr_trim_uncached_memory(max_pfn))
+- find_max_pfn();
++ propagate_e820_map();
+ #endif
+
+ max_low_pfn = setup_memory();
+
++#ifdef CONFIG_KVM_CLOCK
++ kvmclock_init();
++#endif
++
+ #ifdef CONFIG_VMI
+ /*
+ * Must be after max_low_pfn is determined, and before kernel
+@@ -856,6 +922,7 @@ void __init setup_arch(char **cmdline_p)
+ */
+ vmi_init();
+ #endif
++ kvm_guest_init();
+
+ /*
+ * NOTE: before this point _nobody_ is allowed to allocate
+@@ -977,6 +1044,18 @@ void __init setup_arch(char **cmdline_p)
+
+ io_delay_init();
+
++#if defined(CONFIG_X86_SMP) && !defined(CONFIG_XEN)
++ /*
++ * setup to use the early static init tables during kernel startup
++ * X86_SMP will exclude sub-arches that don't deal well with it.
++ */
++ x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init;
++ x86_bios_cpu_apicid_early_ptr = (void *)x86_bios_cpu_apicid_init;
++#ifdef CONFIG_NUMA
++ x86_cpu_to_node_map_early_ptr = (void *)x86_cpu_to_node_map_init;
++#endif
++#endif
++
+ #ifdef CONFIG_X86_GENERICARCH
+ generic_apic_probe();
+ #endif
+--- sle11-2009-05-14.orig/arch/x86/kernel/setup_64-xen.c 2009-03-16 16:33:40.000000000 +0100
++++ sle11-2009-05-14/arch/x86/kernel/setup_64-xen.c 2009-03-16 16:38:05.000000000 +0100
+@@ -29,18 +29,22 @@
+ #include <linux/crash_dump.h>
+ #include <linux/root_dev.h>
+ #include <linux/pci.h>
++#include <asm/pci-direct.h>
+ #include <linux/efi.h>
+ #include <linux/acpi.h>
+ #include <linux/kallsyms.h>
+ #include <linux/edd.h>
++#include <linux/iscsi_ibft.h>
+ #include <linux/mmzone.h>
+ #include <linux/kexec.h>
+ #include <linux/cpufreq.h>
+ #include <linux/dmi.h>
+ #include <linux/dma-mapping.h>
+ #include <linux/ctype.h>
++#include <linux/sort.h>
+ #include <linux/uaccess.h>
+ #include <linux/init_ohci1394_dma.h>
++#include <linux/kvm_para.h>
+
+ #include <asm/mtrr.h>
+ #include <asm/uaccess.h>
+@@ -58,7 +62,6 @@
+ #include <asm/mmu_context.h>
+ #include <asm/proto.h>
+ #include <asm/setup.h>
+-#include <asm/mach_apic.h>
+ #include <asm/numa.h>
+ #include <asm/sections.h>
+ #include <asm/dmi.h>
+@@ -66,6 +69,9 @@
+ #include <asm/mce.h>
+ #include <asm/ds.h>
+ #include <asm/topology.h>
+#include <asm/pat.h>
++
++#include <mach_apic.h>
+ #ifdef CONFIG_XEN
+ #include <linux/percpu.h>
+ #include <xen/interface/physdev.h>
+@@ -149,7 +155,7 @@ extern int root_mountflags;
+
+ char __initdata command_line[COMMAND_LINE_SIZE];
+
+-struct resource standard_io_resources[] = {
++static struct resource standard_io_resources[] = {
+ { .name = "dma1", .start = 0x00, .end = 0x1f,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+ { .name = "pic1", .start = 0x20, .end = 0x21,
+@@ -158,7 +164,9 @@ struct resource standard_io_resources[]
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+ { .name = "timer1", .start = 0x50, .end = 0x53,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+- { .name = "keyboard", .start = 0x60, .end = 0x6f,
++ { .name = "keyboard", .start = 0x60, .end = 0x60,
++ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
++ { .name = "keyboard", .start = 0x64, .end = 0x64,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+ { .name = "dma page reg", .start = 0x80, .end = 0x8f,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+@@ -224,8 +232,10 @@ contig_initmem_init(unsigned long start_
+ e820_register_active_regions(0, start_pfn, end_pfn);
+ #ifdef CONFIG_XEN
+ free_bootmem_with_active_regions(0, xen_start_info->nr_pages);
++ early_res_to_bootmem(0, xen_start_info->nr_pages<<PAGE_SHIFT);
+ #else
+ free_bootmem_with_active_regions(0, end_pfn);
++ early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
+ #endif
+ reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
+ }
+@@ -290,6 +300,7 @@ static void __init reserve_crashkernel(v
+ (unsigned long)(total_mem >> 20));
+ crashk_res.start = crash_base;
+ crashk_res.end = crash_base + crash_size - 1;
++ insert_resource(&iomem_resource, &crashk_res);
+ }
+ }
+ #else
+@@ -306,6 +317,40 @@ void __attribute__((weak)) __init memory
+ machine_specific_memory_setup();
+ }
+
++static void __init parse_setup_data(void)
++{
++ struct setup_data *data;
++ unsigned long pa_data;
++
++ if (boot_params.hdr.version < 0x0209)
++ return;
++ pa_data = boot_params.hdr.setup_data;
++ while (pa_data) {
++ data = early_ioremap(pa_data, PAGE_SIZE);
++ switch (data->type) {
++ default:
++ break;
++ }
++#ifndef CONFIG_DEBUG_BOOT_PARAMS
++ free_early(pa_data, pa_data+sizeof(*data)+data->len);
++#endif
++ pa_data = data->next;
++ early_iounmap(data, PAGE_SIZE);
++ }
++}
++
++#ifdef CONFIG_PCI_MMCONFIG
++extern void __cpuinit fam10h_check_enable_mmcfg(void);
++extern void __init check_enable_amd_mmconf_dmi(void);
++#else
++void __cpuinit fam10h_check_enable_mmcfg(void)
++{
++}
++void __init check_enable_amd_mmconf_dmi(void)
++{
++}
++#endif
++
+ /*
+ * setup_arch - architecture-specific boot-time initializations
+ *
+@@ -389,6 +434,8 @@ void __init setup_arch(char **cmdline_p)
+ strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
+ *cmdline_p = command_line;
+
++ parse_setup_data();
++
+ parse_early_param();
- /*
- * The current flushing context - we pass it instead of 5 arguments:
-@@ -392,6 +31,7 @@ struct cpa_data {
- int numpages;
- int flushtlb;
- unsigned long pfn;
-+ unsigned force_split : 1;
- };
+ #ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
+@@ -398,6 +445,13 @@ void __init setup_arch(char **cmdline_p)
- #ifdef CONFIG_X86_64
-@@ -637,6 +277,9 @@ try_preserve_large_page(pte_t *kpte, uns
- int i, do_split = 1;
- unsigned int level;
+ finish_e820_parsing();
-+ if (cpa->force_split)
-+ return 1;
++#ifndef CONFIG_XEN
++ /* after parse_early_param, so could debug it */
++ insert_resource(&iomem_resource, &code_resource);
++ insert_resource(&iomem_resource, &data_resource);
++ insert_resource(&iomem_resource, &bss_resource);
++#endif
+
- spin_lock_irqsave(&pgd_lock, flags);
+ early_gart_iommu_check();
+
+ e820_register_active_regions(0, 0, -1UL);
+@@ -420,15 +474,23 @@ void __init setup_arch(char **cmdline_p)
+
+ check_efer();
+
+- init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT));
++ max_pfn_mapped = init_memory_mapping(0, (max_pfn_mapped << PAGE_SHIFT));
+ if (efi_enabled)
+ efi_init();
+
++#ifndef CONFIG_XEN
++ vsmp_init();
++#endif
++
+ if (is_initial_xendomain())
+ dmi_scan_machine();
+
+ io_delay_init();
+
++#ifdef CONFIG_KVM_CLOCK
++ kvmclock_init();
++#endif
++
+ #if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
+ /* setup to use the early static init tables during kernel startup */
+ x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init;
+@@ -459,9 +521,9 @@ void __init setup_arch(char **cmdline_p)
+ contig_initmem_init(0, end_pfn);
+ #endif
+
+- early_res_to_bootmem();
+-
+ #ifndef CONFIG_XEN
++ dma32_reserve_bootmem();
++
+ #ifdef CONFIG_ACPI_SLEEP
/*
- * Check for races, another CPU might have split this page
-@@ -856,9 +499,7 @@ static int split_large_page(pte_t *kpte,
- goto out_unlock;
+ * Reserve low memory region for sleep support.
+@@ -487,16 +549,17 @@ void __init setup_arch(char **cmdline_p)
+ unsigned long end_of_mem = end_pfn << PAGE_SHIFT;
- pbase = (pte_t *)page_address(base);
--#ifdef CONFIG_X86_32
-- paravirt_alloc_pt(&init_mm, page_to_pfn(base));
+ if (ramdisk_end <= end_of_mem) {
+-#ifndef CONFIG_XEN
+- reserve_bootmem_generic(ramdisk_image, ramdisk_size);
-#endif
-+ paravirt_alloc_pte(&init_mm, page_to_pfn(base));
- ref_prot = pte_pgprot(pte_clrhuge(*kpte));
++ /*
++ * don't need to reserve again, already reserved early
++ * in x86_64_start_kernel, and early_res_to_bootmem
++ * converts that to reserved in bootmem
++ */
+ initrd_start = ramdisk_image + PAGE_OFFSET;
+ initrd_end = initrd_start+ramdisk_size;
+ #ifdef CONFIG_XEN
+ initrd_below_start_ok = 1;
+ #endif
+ } else {
+- /* Assumes everything on node 0 */
+ free_bootmem(ramdisk_image, ramdisk_size);
+ printk(KERN_ERR "initrd extends beyond end of memory "
+ "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
+@@ -506,6 +569,9 @@ void __init setup_arch(char **cmdline_p)
+ }
+ #endif
+ reserve_crashkernel();
++
++ reserve_ibft_region();
++
+ paging_init();
+ map_vsyscall();
+ #ifdef CONFIG_X86_LOCAL_APIC
+@@ -633,16 +699,16 @@ void __init setup_arch(char **cmdline_p)
+ prefill_possible_map();
+ #endif
- #ifdef CONFIG_X86_64
-@@ -918,7 +559,7 @@ static int __change_page_attr(struct cpa
- repeat:
- kpte = lookup_address(address, &level);
- if (!kpte)
-- return primary ? -EINVAL : 0;
-+ return 0;
++ kvm_guest_init();
++
+ /*
+ * We trust e820 completely. No explicit ROM probing in memory.
+ */
+ #ifdef CONFIG_XEN
+ if (is_initial_xendomain())
+- e820_reserve_resources(machine_e820.map, machine_e820.nr_map,
+- &code_resource, &data_resource, &bss_resource);
++ e820_reserve_resources(machine_e820.map, machine_e820.nr_map);
+ #else
+- e820_reserve_resources(e820.map, e820.nr_map,
+- &code_resource, &data_resource, &bss_resource);
++ e820_reserve_resources(e820.map, e820.nr_map);
+ e820_mark_nosave_regions();
+ #endif
- old_pte = *kpte;
- if (!__pte_val(old_pte)) {
-@@ -1077,7 +718,8 @@ static inline int cache_attr(pgprot_t at
+@@ -690,6 +756,9 @@ void __init setup_arch(char **cmdline_p)
+ #endif
+
+ #endif /* !CONFIG_XEN */
++
++ /* do this before identify_cpu for boot cpu */
++ check_enable_amd_mmconf_dmi();
}
- static int change_page_attr_set_clr(unsigned long addr, int numpages,
-- pgprot_t mask_set, pgprot_t mask_clr)
-+ pgprot_t mask_set, pgprot_t mask_clr,
-+ int force_split)
- {
- struct cpa_data cpa;
- int ret, cache, checkalias;
-@@ -1088,7 +730,7 @@ static int change_page_attr_set_clr(unsi
- */
- mask_set = canon_pgprot(mask_set);
- mask_clr = canon_pgprot(mask_clr);
-- if (!pgprot_val(mask_set) && !pgprot_val(mask_clr))
-+ if (!pgprot_val(mask_set) && !pgprot_val(mask_clr) && !force_split)
- return 0;
+ #ifdef CONFIG_XEN
+@@ -786,9 +855,9 @@ static void __cpuinit amd_detect_cmp(str
+ bits = c->x86_coreid_bits;
- /* Ensure we are PAGE_SIZE aligned */
-@@ -1105,6 +747,7 @@ static int change_page_attr_set_clr(unsi
- cpa.mask_set = mask_set;
- cpa.mask_clr = mask_clr;
- cpa.flushtlb = 0;
-+ cpa.force_split = force_split;
+ /* Low order bits define the core id (index of core in socket) */
+- c->cpu_core_id = c->phys_proc_id & ((1 << bits)-1);
+- /* Convert the APIC ID into the socket ID */
+- c->phys_proc_id = phys_pkg_id(bits);
++ c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
++ /* Convert the initial APIC ID into the socket ID */
++ c->phys_proc_id = c->initial_apicid >> bits;
- /* No alias checking for _NX bit modifications */
- checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
-@@ -1143,26 +786,67 @@ out:
- static inline int change_page_attr_set(unsigned long addr, int numpages,
- pgprot_t mask)
- {
-- return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0));
-+ return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0);
+ #ifdef CONFIG_NUMA
+ node = c->phys_proc_id;
+@@ -805,7 +874,7 @@ static void __cpuinit amd_detect_cmp(str
+ If that doesn't result in a usable node fall back to the
+ path for the previous case. */
+
+- int ht_nodeid = apicid - (cpu_data(0).phys_proc_id << bits);
++ int ht_nodeid = c->initial_apicid;
+
+ if (ht_nodeid >= 0 &&
+ apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
+@@ -913,7 +982,7 @@ static void __cpuinit init_amd(struct cp
+
+ /* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
+ 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
+- clear_bit(0*32+31, (unsigned long *)&c->x86_capability);
++ clear_cpu_cap(c, 0*32+31);
+
+ /* On C+ stepping K8 rep microcode works well for copy/memset */
+ level = cpuid_eax(1);
+@@ -955,9 +1024,25 @@ static void __cpuinit init_amd(struct cp
+ /* MFENCE stops RDTSC speculation */
+ set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
+
++ if (c->x86 == 0x10)
++ fam10h_check_enable_mmcfg();
++
+ #ifndef CONFIG_XEN
+ if (amd_apic_timer_broken())
+ disable_apic_timer = 1;
++
++ if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
++ unsigned long long tseg;
++
++ /*
++ * Split up direct mapping around the TSEG SMM area.
++ * Don't do it for gbpages because there seems very little
++ * benefit in doing so.
++ */
++ if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg) &&
++ (tseg >> PMD_SHIFT) < (max_pfn_mapped >> (PMD_SHIFT-PAGE_SHIFT)))
++ set_memory_4k((unsigned long)__va(tseg), 1);
++ }
+ #endif
}
- static inline int change_page_attr_clear(unsigned long addr, int numpages,
- pgprot_t mask)
+@@ -1051,7 +1136,7 @@ static void __cpuinit early_init_intel(s
{
-- return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask);
-+ return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0);
+ if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
+ (c->x86 == 0x6 && c->x86_model >= 0x0e))
+- set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
++ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
}
--int set_memory_uc(unsigned long addr, int numpages)
-+int _set_memory_uc(unsigned long addr, int numpages)
- {
-+ /*
-+ * for now UC MINUS. see comments in ioremap_nocache()
-+ */
- return change_page_attr_set(addr, numpages,
-- __pgprot(_PAGE_PCD));
-+ __pgprot(_PAGE_CACHE_UC_MINUS));
-+}
-+
-+int set_memory_uc(unsigned long addr, int numpages)
-+{
-+ /*
-+ * for now UC MINUS. see comments in ioremap_nocache()
-+ */
-+ if (reserve_memtype(addr, addr + numpages * PAGE_SIZE,
-+ _PAGE_CACHE_UC_MINUS, NULL))
-+ return -EINVAL;
-+
-+ return _set_memory_uc(addr, numpages);
+ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
+@@ -1094,9 +1179,6 @@ static void __cpuinit init_intel(struct
+
+ if (c->x86 == 15)
+ c->x86_cache_alignment = c->x86_clflush_size * 2;
+- if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
+- (c->x86 == 0x6 && c->x86_model >= 0x0e))
+- set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+ if (c->x86 == 6)
+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+ set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
+@@ -1105,6 +1187,32 @@ static void __cpuinit init_intel(struct
+ srat_detect_node();
}
- EXPORT_SYMBOL(set_memory_uc);
--int set_memory_wb(unsigned long addr, int numpages)
-+int _set_memory_wc(unsigned long addr, int numpages)
++static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
+{
-+ return change_page_attr_set(addr, numpages,
-+ __pgprot(_PAGE_CACHE_WC));
++ if (c->x86 == 0x6 && c->x86_model >= 0xf)
++ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+}
+
-+int set_memory_wc(unsigned long addr, int numpages)
++static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
+{
-+ if (!pat_wc_enabled)
-+ return set_memory_uc(addr, numpages);
++ /* Cache sizes */
++ unsigned n;
+
-+ if (reserve_memtype(addr, addr + numpages * PAGE_SIZE,
-+ _PAGE_CACHE_WC, NULL))
-+ return -EINVAL;
++ n = c->extended_cpuid_level;
++ if (n >= 0x80000008) {
++ unsigned eax = cpuid_eax(0x80000008);
++ c->x86_virt_bits = (eax >> 8) & 0xff;
++ c->x86_phys_bits = eax & 0xff;
++ }
+
-+ return _set_memory_wc(addr, numpages);
++ if (c->x86 == 0x6 && c->x86_model >= 0xf) {
++ c->x86_cache_alignment = c->x86_clflush_size * 2;
++ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
++ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
++ }
++ set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
+}
-+EXPORT_SYMBOL(set_memory_wc);
+
-+int _set_memory_wb(unsigned long addr, int numpages)
+ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
{
- return change_page_attr_clear(addr, numpages,
-- __pgprot(_PAGE_PCD | _PAGE_PWT));
-+ __pgprot(_PAGE_CACHE_MASK));
-+}
-+
-+int set_memory_wb(unsigned long addr, int numpages)
-+{
-+ free_memtype(addr, addr + numpages * PAGE_SIZE);
-+
-+ return _set_memory_wb(addr, numpages);
+ char *v = c->x86_vendor_id;
+@@ -1113,6 +1221,8 @@ static void __cpuinit get_cpu_vendor(str
+ c->x86_vendor = X86_VENDOR_AMD;
+ else if (!strcmp(v, "GenuineIntel"))
+ c->x86_vendor = X86_VENDOR_INTEL;
++ else if (!strcmp(v, "CentaurHauls"))
++ c->x86_vendor = X86_VENDOR_CENTAUR;
+ else
+ c->x86_vendor = X86_VENDOR_UNKNOWN;
}
- EXPORT_SYMBOL(set_memory_wb);
+@@ -1160,15 +1270,16 @@ static void __cpuinit early_identify_cpu
+ c->x86 += (tfms >> 20) & 0xff;
+ if (c->x86 >= 0x6)
+ c->x86_model += ((tfms >> 16) & 0xF) << 4;
+- if (c->x86_capability[0] & (1<<19))
++ if (test_cpu_cap(c, X86_FEATURE_CLFLSH))
+ c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
+ } else {
+ /* Have CPUID level 0 only - unheard of */
+ c->x86 = 4;
+ }
-@@ -1193,6 +877,12 @@ int set_memory_np(unsigned long addr, in
- return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_PRESENT));
++ c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xff;
+ #ifdef CONFIG_SMP
+- c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff;
++ c->phys_proc_id = c->initial_apicid;
+ #endif
+ /* AMD-defined flags: level 0x80000001 */
+ xlvl = cpuid_eax(0x80000000);
+@@ -1201,8 +1312,12 @@ static void __cpuinit early_identify_cpu
+ case X86_VENDOR_INTEL:
+ early_init_intel(c);
+ break;
++ case X86_VENDOR_CENTAUR:
++ early_init_centaur(c);
++ break;
+ }
+
++ validate_pat_support(c);
}
-+int set_memory_4k(unsigned long addr, int numpages)
-+{
-+ return change_page_attr_set_clr(addr, numpages, __pgprot(0),
-+ __pgprot(0), 1);
-+}
+ /*
+@@ -1237,6 +1352,10 @@ void __cpuinit identify_cpu(struct cpuin
+ init_intel(c);
+ break;
+
++ case X86_VENDOR_CENTAUR:
++ init_centaur(c);
++ break;
+
- int set_pages_uc(struct page *page, int numpages)
- {
- unsigned long addr = (unsigned long)page_address(page);
-@@ -1302,6 +992,45 @@ void kernel_map_pages(struct page *page,
- cpa_fill_pool(NULL);
+ case X86_VENDOR_UNKNOWN:
+ default:
+ display_cacheinfo(c);
+@@ -1266,14 +1385,24 @@ void __cpuinit identify_cpu(struct cpuin
+ #endif
+ select_idle_routine(c);
+
+- if (c != &boot_cpu_data)
+- mtrr_ap_init();
+ #ifdef CONFIG_NUMA
+ numa_add_cpu(smp_processor_id());
+ #endif
+
}
-+#ifdef CONFIG_DEBUG_FS
-+static int dpa_show(struct seq_file *m, void *v)
-+{
-+ seq_puts(m, "DEBUG_PAGEALLOC\n");
-+ seq_printf(m, "pool_size : %lu\n", pool_size);
-+ seq_printf(m, "pool_pages : %lu\n", pool_pages);
-+ seq_printf(m, "pool_low : %lu\n", pool_low);
-+ seq_printf(m, "pool_used : %lu\n", pool_used);
-+ seq_printf(m, "pool_failed : %lu\n", pool_failed);
-+
-+ return 0;
-+}
-+
-+static int dpa_open(struct inode *inode, struct file *filp)
++void __cpuinit identify_boot_cpu(void)
+{
-+ return single_open(filp, dpa_show, NULL);
++ identify_cpu(&boot_cpu_data);
+}
+
-+static const struct file_operations dpa_fops = {
-+ .open = dpa_open,
-+ .read = seq_read,
-+ .llseek = seq_lseek,
-+ .release = single_release,
-+};
-+
-+static int __init debug_pagealloc_proc_init(void)
++void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
+{
-+ struct dentry *de;
-+
-+ de = debugfs_create_file("debug_pagealloc", 0600, NULL, NULL,
-+ &dpa_fops);
-+ if (!de)
-+ return -ENOMEM;
-+
-+ return 0;
++ BUG_ON(c == &boot_cpu_data);
++ identify_cpu(c);
++ mtrr_ap_init();
+}
-+__initcall(debug_pagealloc_proc_init);
-+#endif
+
- #ifdef CONFIG_HIBERNATION
-
- bool kernel_page_present(struct page *page)
---- /dev/null
-+++ b/arch/x86/mm/pat-xen.c
-@@ -0,0 +1,602 @@
+ static __init int setup_noclflush(char *arg)
+ {
+ setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
+@@ -1302,123 +1431,3 @@ static __init int setup_disablecpuid(cha
+ return 1;
+ }
+ __setup("clearcpuid=", setup_disablecpuid);
+-
+-/*
+- * Get CPU information for use by the procfs.
+- */
+-
+-static int show_cpuinfo(struct seq_file *m, void *v)
+-{
+- struct cpuinfo_x86 *c = v;
+- int cpu = 0, i;
+-
+-#ifdef CONFIG_SMP
+- cpu = c->cpu_index;
+-#endif
+-
+- seq_printf(m, "processor\t: %u\n"
+- "vendor_id\t: %s\n"
+- "cpu family\t: %d\n"
+- "model\t\t: %d\n"
+- "model name\t: %s\n",
+- (unsigned)cpu,
+- c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown",
+- c->x86,
+- (int)c->x86_model,
+- c->x86_model_id[0] ? c->x86_model_id : "unknown");
+-
+- if (c->x86_mask || c->cpuid_level >= 0)
+- seq_printf(m, "stepping\t: %d\n", c->x86_mask);
+- else
+- seq_printf(m, "stepping\t: unknown\n");
+-
+- if (cpu_has(c, X86_FEATURE_TSC)) {
+- unsigned int freq = cpufreq_quick_get((unsigned)cpu);
+-
+- if (!freq)
+- freq = cpu_khz;
+- seq_printf(m, "cpu MHz\t\t: %u.%03u\n",
+- freq / 1000, (freq % 1000));
+- }
+-
+- /* Cache size */
+- if (c->x86_cache_size >= 0)
+- seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size);
+-
+-#ifdef CONFIG_SMP
+- if (smp_num_siblings * c->x86_max_cores > 1) {
+- seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
+- seq_printf(m, "siblings\t: %d\n",
+- cpus_weight(per_cpu(cpu_core_map, cpu)));
+- seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
+- seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
+- }
+-#endif
+-
+- seq_printf(m,
+- "fpu\t\t: yes\n"
+- "fpu_exception\t: yes\n"
+- "cpuid level\t: %d\n"
+- "wp\t\t: yes\n"
+- "flags\t\t:",
+- c->cpuid_level);
+-
+- for (i = 0; i < 32*NCAPINTS; i++)
+- if (cpu_has(c, i) && x86_cap_flags[i] != NULL)
+- seq_printf(m, " %s", x86_cap_flags[i]);
+-
+- seq_printf(m, "\nbogomips\t: %lu.%02lu\n",
+- c->loops_per_jiffy/(500000/HZ),
+- (c->loops_per_jiffy/(5000/HZ)) % 100);
+-
+- if (c->x86_tlbsize > 0)
+- seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize);
+- seq_printf(m, "clflush size\t: %d\n", c->x86_clflush_size);
+- seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment);
+-
+- seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n",
+- c->x86_phys_bits, c->x86_virt_bits);
+-
+- seq_printf(m, "power management:");
+- for (i = 0; i < 32; i++) {
+- if (c->x86_power & (1 << i)) {
+- if (i < ARRAY_SIZE(x86_power_flags) &&
+- x86_power_flags[i])
+- seq_printf(m, "%s%s",
+- x86_power_flags[i][0]?" ":"",
+- x86_power_flags[i]);
+- else
+- seq_printf(m, " [%d]", i);
+- }
+- }
+-
+- seq_printf(m, "\n\n");
+-
+- return 0;
+-}
+-
+-static void *c_start(struct seq_file *m, loff_t *pos)
+-{
+- if (*pos == 0) /* just in case, cpu 0 is not the first */
+- *pos = first_cpu(cpu_online_map);
+- if ((*pos) < NR_CPUS && cpu_online(*pos))
+- return &cpu_data(*pos);
+- return NULL;
+-}
+-
+-static void *c_next(struct seq_file *m, void *v, loff_t *pos)
+-{
+- *pos = next_cpu(*pos, cpu_online_map);
+- return c_start(m, pos);
+-}
+-
+-static void c_stop(struct seq_file *m, void *v)
+-{
+-}
+-
+-const struct seq_operations cpuinfo_op = {
+- .start = c_start,
+- .next = c_next,
+- .stop = c_stop,
+- .show = show_cpuinfo,
+-};
+--- /dev/null 1970-01-01 00:00:00.000000000 +0000
++++ sle11-2009-05-14/arch/x86/kernel/smp-xen.c 2009-03-16 16:38:05.000000000 +0100
+@@ -0,0 +1,329 @@
+/*
-+ * Handle caching attributes in page tables (PAT)
++ * Intel SMP support routines.
+ *
-+ * Authors: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
-+ * Suresh B Siddha <suresh.b.siddha@intel.com>
++ * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
++ * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
++ * (c) 2002,2003 Andi Kleen, SuSE Labs.
+ *
-+ * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen.
++ * i386 and x86_64 integration by Glauber Costa <gcosta@redhat.com>
++ *
++ * This code is released under the GNU General Public License version 2 or
++ * later.
+ */
+
++#include <linux/init.h>
++
+#include <linux/mm.h>
-+#include <linux/kernel.h>
-+#include <linux/gfp.h>
-+#include <linux/fs.h>
-+#include <linux/bootmem.h>
++#include <linux/delay.h>
++#include <linux/spinlock.h>
++#include <linux/kernel_stat.h>
++#include <linux/mc146818rtc.h>
++#include <linux/cache.h>
++#include <linux/interrupt.h>
++#include <linux/cpu.h>
+
-+#include <asm/msr.h>
-+#include <asm/tlbflush.h>
-+#include <asm/processor.h>
-+#include <asm/page.h>
-+#include <asm/pgtable.h>
-+#include <asm/pat.h>
-+#include <asm/e820.h>
-+#include <asm/cacheflush.h>
-+#include <asm/fcntl.h>
+#include <asm/mtrr.h>
-+#include <asm/io.h>
-+
-+#ifdef CONFIG_X86_PAT
-+int __read_mostly pat_wc_enabled = 1;
-+
-+void __cpuinit pat_disable(char *reason)
-+{
-+ pat_wc_enabled = 0;
-+ printk(KERN_INFO "%s\n", reason);
-+}
-+
-+static int __init nopat(char *str)
-+{
-+ pat_disable("PAT support disabled.");
-+ return 0;
-+}
-+early_param("nopat", nopat);
-+#endif
-+
-+static u64 __read_mostly boot_pat_state;
-+
-+enum {
-+ PAT_UC = 0, /* uncached */
-+ PAT_WC = 1, /* Write combining */
-+ PAT_WT = 4, /* Write Through */
-+ PAT_WP = 5, /* Write Protected */
-+ PAT_WB = 6, /* Write Back (default) */
-+ PAT_UC_MINUS = 7, /* UC, but can be overriden by MTRR */
-+};
-+
-+#define PAT(x,y) ((u64)PAT_ ## y << ((x)*8))
-+
-+void pat_init(void)
-+{
-+ u64 pat;
-+
-+ if (!pat_wc_enabled)
-+ return;
-+
-+ /* Paranoia check. */
-+ if (!cpu_has_pat) {
-+ printk(KERN_ERR "PAT enabled, but CPU feature cleared\n");
-+ /*
-+ * Panic if this happens on the secondary CPU, and we
-+ * switched to PAT on the boot CPU. We have no way to
-+ * undo PAT.
-+ */
-+ BUG_ON(boot_pat_state);
-+ }
-+
-+#ifndef CONFIG_XEN
-+ /* Set PWT to Write-Combining. All other bits stay the same */
-+ /*
-+ * PTE encoding used in Linux:
-+ * PAT
-+ * |PCD
-+ * ||PWT
-+ * |||
-+ * 000 WB _PAGE_CACHE_WB
-+ * 001 WC _PAGE_CACHE_WC
-+ * 010 UC- _PAGE_CACHE_UC_MINUS
-+ * 011 UC _PAGE_CACHE_UC
-+ * PAT bit unused
-+ */
-+ pat = PAT(0,WB) | PAT(1,WC) | PAT(2,UC_MINUS) | PAT(3,UC) |
-+ PAT(4,WB) | PAT(5,WC) | PAT(6,UC_MINUS) | PAT(7,UC);
-+
-+ /* Boot CPU check */
-+ if (!boot_pat_state)
-+ rdmsrl(MSR_IA32_CR_PAT, boot_pat_state);
-+
-+ wrmsrl(MSR_IA32_CR_PAT, pat);
-+#else
-+ /*
-+ * PAT settings are part of the hypervisor interface, and their
-+ * assignment cannot be changed.
-+ */
-+ rdmsrl(MSR_IA32_CR_PAT, pat);
-+ if (!boot_pat_state)
-+ boot_pat_state = pat;
-+#endif
-+ printk(KERN_INFO "x86 PAT enabled: cpu %d, old 0x%Lx, new 0x%Lx\n",
-+ smp_processor_id(), boot_pat_state, pat);
-+}
-+
-+#undef PAT
-+
-+static char *cattr_name(unsigned long flags)
-+{
-+ switch (flags & _PAGE_CACHE_MASK) {
-+ case _PAGE_CACHE_UC: return "uncached";
-+ case _PAGE_CACHE_UC_MINUS: return "uncached-minus";
-+ case _PAGE_CACHE_WB: return "write-back";
-+ case _PAGE_CACHE_WC: return "write-combining";
-+ case _PAGE_CACHE_WP: return "write-protected";
-+ case _PAGE_CACHE_WT: return "write-through";
-+ default: return "broken";
-+ }
-+}
-+
++#include <asm/tlbflush.h>
++#include <asm/mmu_context.h>
++#include <asm/proto.h>
++#include <mach_ipi.h>
++#include <xen/evtchn.h>
+/*
-+ * The global memtype list keeps track of memory type for specific
-+ * physical memory areas. Conflicting memory types in different
-+ * mappings can cause CPU cache corruption. To avoid this we keep track.
++ * Some notes on x86 processor bugs affecting SMP operation:
+ *
-+ * The list is sorted based on starting address and can contain multiple
-+ * entries for each address (this allows reference counting for overlapping
-+ * areas). All the aliases have the same cache attributes of course.
-+ * Zero attributes are represented as holes.
++ * Pentium, Pentium Pro, II, III (and all CPUs) have bugs.
++ * The Linux implications for SMP are handled as follows:
+ *
-+ * Currently the data structure is a list because the number of mappings
-+ * are expected to be relatively small. If this should be a problem
-+ * it could be changed to a rbtree or similar.
++ * Pentium III / [Xeon]
++ * None of the E1AP-E3AP errata are visible to the user.
+ *
-+ * memtype_lock protects the whole list.
-+ */
-+
-+struct memtype {
-+ u64 start;
-+ u64 end;
-+ unsigned long type;
-+ struct list_head nd;
-+};
-+
-+static LIST_HEAD(memtype_list);
-+static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */
++ * E1AP. see PII A1AP
++ * E2AP. see PII A2AP
++ * E3AP. see PII A3AP
++ *
++ * Pentium II / [Xeon]
++ * None of the A1AP-A3AP errata are visible to the user.
++ *
++ * A1AP. see PPro 1AP
++ * A2AP. see PPro 2AP
++ * A3AP. see PPro 7AP
++ *
++ * Pentium Pro
++ * None of 1AP-9AP errata are visible to the normal user,
++ * except occasional delivery of 'spurious interrupt' as trap #15.
++ * This is very rare and a non-problem.
++ *
++ * 1AP. Linux maps APIC as non-cacheable
++ * 2AP. worked around in hardware
++ * 3AP. fixed in C0 and above steppings microcode update.
++ * Linux does not use excessive STARTUP_IPIs.
++ * 4AP. worked around in hardware
++ * 5AP. symmetric IO mode (normal Linux operation) not affected.
++ * 'noapic' mode has vector 0xf filled out properly.
++ * 6AP. 'noapic' mode might be affected - fixed in later steppings
++ * 7AP. We do not assume writes to the LVT deasserting IRQs
++ * 8AP. We do not enable low power mode (deep sleep) during MP bootup
++ * 9AP. We do not use mixed mode
++ *
++ * Pentium
++ * There is a marginal case where REP MOVS on 100MHz SMP
++ * machines with B stepping processors can fail. XXX should provide
++ * an L1cache=Writethrough or L1cache=off option.
++ *
++ * B stepping CPUs may hang. There are hardware work arounds
++ * for this. We warn about it in case your board doesn't have the work
++ * arounds. Basically that's so I can tell anyone with a B stepping
++ * CPU and SMP problems "tough".
++ *
++ * Specific items [From Pentium Processor Specification Update]
++ *
++ * 1AP. Linux doesn't use remote read
++ * 2AP. Linux doesn't trust APIC errors
++ * 3AP. We work around this
++ * 4AP. Linux never generated 3 interrupts of the same priority
++ * to cause a lost local interrupt.
++ * 5AP. Remote read is never used
++ * 6AP. not affected - worked around in hardware
++ * 7AP. not affected - worked around in hardware
++ * 8AP. worked around in hardware - we get explicit CS errors if not
++ * 9AP. only 'noapic' mode affected. Might generate spurious
++ * interrupts, we log only the first one and count the
++ * rest silently.
++ * 10AP. not affected - worked around in hardware
++ * 11AP. Linux reads the APIC between writes to avoid this, as per
++ * the documentation. Make sure you preserve this as it affects
++ * the C stepping chips too.
++ * 12AP. not affected - worked around in hardware
++ * 13AP. not affected - worked around in hardware
++ * 14AP. we always deassert INIT during bootup
++ * 15AP. not affected - worked around in hardware
++ * 16AP. not affected - worked around in hardware
++ * 17AP. not affected - worked around in hardware
++ * 18AP. not affected - worked around in hardware
++ * 19AP. not affected - worked around in BIOS
++ *
++ * If this sounds worrying believe me these bugs are either ___RARE___,
++ * or are signal timing bugs worked around in hardware and there's
++ * about nothing of note with C stepping upwards.
++ */
+
+/*
-+ * Does intersection of PAT memory type and MTRR memory type and returns
-+ * the resulting memory type as PAT understands it.
-+ * (Type in pat and mtrr will not have same value)
-+ * The intersection is based on "Effective Memory Type" tables in IA-32
-+ * SDM vol 3a
++ * this function sends a 'reschedule' IPI to another CPU.
++ * it goes straight through and wastes no time serializing
++ * anything. Worst case is that we lose a reschedule ...
+ */
-+static int pat_x_mtrr_type(u64 start, u64 end, unsigned long prot,
-+ unsigned long *ret_prot)
++void xen_smp_send_reschedule(int cpu)
+{
-+ unsigned long pat_type;
-+ u8 mtrr_type;
-+
-+ pat_type = prot & _PAGE_CACHE_MASK;
-+ prot &= (~_PAGE_CACHE_MASK);
-+
-+ /*
-+ * We return the PAT request directly for types where PAT takes
-+ * precedence with respect to MTRR and for UC_MINUS.
-+ * Consistency checks with other PAT requests is done later
-+ * while going through memtype list.
-+ */
-+ if (pat_type == _PAGE_CACHE_WC) {
-+ *ret_prot = prot | _PAGE_CACHE_WC;
-+ return 0;
-+ } else if (pat_type == _PAGE_CACHE_UC_MINUS) {
-+ *ret_prot = prot | _PAGE_CACHE_UC_MINUS;
-+ return 0;
-+ } else if (pat_type == _PAGE_CACHE_UC) {
-+ *ret_prot = prot | _PAGE_CACHE_UC;
-+ return 0;
-+ }
-+
-+ /*
-+ * Look for MTRR hint to get the effective type in case where PAT
-+ * request is for WB.
-+ */
-+ mtrr_type = mtrr_type_lookup(start, end);
-+
-+ if (mtrr_type == MTRR_TYPE_UNCACHABLE) {
-+ *ret_prot = prot | _PAGE_CACHE_UC;
-+ } else if (mtrr_type == MTRR_TYPE_WRCOMB) {
-+ *ret_prot = prot | _PAGE_CACHE_WC;
-+ } else {
-+ *ret_prot = prot | _PAGE_CACHE_WB;
++ if (unlikely(cpu_is_offline(cpu))) {
++ WARN_ON(1);
++ return;
+ }
-+
-+ return 0;
++ send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
+}
+
+/*
-+ * req_type typically has one of the:
-+ * - _PAGE_CACHE_WB
-+ * - _PAGE_CACHE_WC
-+ * - _PAGE_CACHE_UC_MINUS
-+ * - _PAGE_CACHE_UC
-+ *
-+ * req_type will have a special case value '-1', when requester want to inherit
-+ * the memory type from mtrr (if WB), existing PAT, defaulting to UC_MINUS.
-+ *
-+ * If ret_type is NULL, function will return an error if it cannot reserve the
-+ * region with req_type. If ret_type is non-null, function will return
-+ * available type in ret_type in case of no error. In case of any error
-+ * it will return a negative return value.
++ * Structure and data for smp_call_function(). This is designed to minimise
++ * static memory requirements. It also looks cleaner.
+ */
-+int reserve_memtype(u64 start, u64 end, unsigned long req_type,
-+ unsigned long *ret_type)
-+{
-+ struct memtype *new_entry = NULL;
-+ struct memtype *parse;
-+ unsigned long actual_type;
-+ int err = 0;
-+
-+ /* Only track when pat_wc_enabled */
-+ if (!pat_wc_enabled) {
-+ /* This is identical to page table setting without PAT */
-+ if (ret_type) {
-+ if (req_type == -1) {
-+ *ret_type = _PAGE_CACHE_WB;
-+ } else {
-+ *ret_type = req_type;
-+ }
-+ }
-+ return 0;
-+ }
-+
-+ /* Low ISA region is always mapped WB in page table. No need to track */
-+ if (start >= ISA_START_ADDRESS && (end - 1) <= ISA_END_ADDRESS) {
-+ if (ret_type)
-+ *ret_type = _PAGE_CACHE_WB;
-+
-+ return 0;
-+ }
-+
-+ if (req_type == -1) {
-+ /*
-+ * Call mtrr_lookup to get the type hint. This is an
-+ * optimization for /dev/mem mmap'ers into WB memory (BIOS
-+ * tools and ACPI tools). Use WB request for WB memory and use
-+ * UC_MINUS otherwise.
-+ */
-+ u8 mtrr_type = mtrr_type_lookup(start, end);
-+
-+ if (mtrr_type == MTRR_TYPE_WRBACK) {
-+ req_type = _PAGE_CACHE_WB;
-+ actual_type = _PAGE_CACHE_WB;
-+ } else {
-+ req_type = _PAGE_CACHE_UC_MINUS;
-+ actual_type = _PAGE_CACHE_UC_MINUS;
-+ }
-+ } else {
-+ req_type &= _PAGE_CACHE_MASK;
-+ err = pat_x_mtrr_type(start, end, req_type, &actual_type);
-+ }
-+
-+ if (err) {
-+ if (ret_type)
-+ *ret_type = actual_type;
-+
-+ return -EINVAL;
-+ }
-+
-+ new_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL);
-+ if (!new_entry)
-+ return -ENOMEM;
-+
-+ new_entry->start = start;
-+ new_entry->end = end;
-+ new_entry->type = actual_type;
-+
-+ if (ret_type)
-+ *ret_type = actual_type;
-+
-+ spin_lock(&memtype_lock);
-+
-+ /* Search for existing mapping that overlaps the current range */
-+ list_for_each_entry(parse, &memtype_list, nd) {
-+ struct memtype *saved_ptr;
-+
-+ if (parse->start >= end) {
-+ pr_debug("New Entry\n");
-+ list_add(&new_entry->nd, parse->nd.prev);
-+ new_entry = NULL;
-+ break;
-+ }
-+
-+ if (start <= parse->start && end >= parse->start) {
-+ if (actual_type != parse->type && ret_type) {
-+ actual_type = parse->type;
-+ *ret_type = actual_type;
-+ new_entry->type = actual_type;
-+ }
-+
-+ if (actual_type != parse->type) {
-+ printk(
-+ KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
-+ current->comm, current->pid,
-+ start, end,
-+ cattr_name(actual_type),
-+ cattr_name(parse->type));
-+ err = -EBUSY;
-+ break;
-+ }
-+
-+ saved_ptr = parse;
-+ /*
-+ * Check to see whether the request overlaps more
-+ * than one entry in the list
-+ */
-+ list_for_each_entry_continue(parse, &memtype_list, nd) {
-+ if (end <= parse->start) {
-+ break;
-+ }
-+
-+ if (actual_type != parse->type) {
-+ printk(
-+ KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
-+ current->comm, current->pid,
-+ start, end,
-+ cattr_name(actual_type),
-+ cattr_name(parse->type));
-+ err = -EBUSY;
-+ break;
-+ }
-+ }
-+
-+ if (err) {
-+ break;
-+ }
-+
-+ pr_debug("Overlap at 0x%Lx-0x%Lx\n",
-+ saved_ptr->start, saved_ptr->end);
-+ /* No conflict. Go ahead and add this new entry */
-+ list_add(&new_entry->nd, saved_ptr->nd.prev);
-+ new_entry = NULL;
-+ break;
-+ }
-+
-+ if (start < parse->end) {
-+ if (actual_type != parse->type && ret_type) {
-+ actual_type = parse->type;
-+ *ret_type = actual_type;
-+ new_entry->type = actual_type;
-+ }
++static DEFINE_SPINLOCK(call_lock);
+
-+ if (actual_type != parse->type) {
-+ printk(
-+ KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
-+ current->comm, current->pid,
-+ start, end,
-+ cattr_name(actual_type),
-+ cattr_name(parse->type));
-+ err = -EBUSY;
-+ break;
-+ }
++struct call_data_struct {
++ void (*func) (void *info);
++ void *info;
++ atomic_t started;
++ atomic_t finished;
++ int wait;
++};
+
-+ saved_ptr = parse;
-+ /*
-+ * Check to see whether the request overlaps more
-+ * than one entry in the list
-+ */
-+ list_for_each_entry_continue(parse, &memtype_list, nd) {
-+ if (end <= parse->start) {
-+ break;
-+ }
++void lock_ipi_call_lock(void)
++{
++ spin_lock_irq(&call_lock);
++}
+
-+ if (actual_type != parse->type) {
-+ printk(
-+ KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
-+ current->comm, current->pid,
-+ start, end,
-+ cattr_name(actual_type),
-+ cattr_name(parse->type));
-+ err = -EBUSY;
-+ break;
-+ }
-+ }
++void unlock_ipi_call_lock(void)
++{
++ spin_unlock_irq(&call_lock);
++}
+
-+ if (err) {
-+ break;
-+ }
++static struct call_data_struct *call_data;
+
-+ pr_debug(KERN_INFO "Overlap at 0x%Lx-0x%Lx\n",
-+ saved_ptr->start, saved_ptr->end);
-+ /* No conflict. Go ahead and add this new entry */
-+ list_add(&new_entry->nd, &saved_ptr->nd);
-+ new_entry = NULL;
-+ break;
-+ }
-+ }
++static void __smp_call_function(void (*func) (void *info), void *info,
++ int nonatomic, int wait)
++{
++ struct call_data_struct data;
++ int cpus = num_online_cpus() - 1;
+
-+ if (err) {
-+ printk(KERN_INFO
-+ "reserve_memtype failed 0x%Lx-0x%Lx, track %s, req %s\n",
-+ start, end, cattr_name(new_entry->type),
-+ cattr_name(req_type));
-+ kfree(new_entry);
-+ spin_unlock(&memtype_lock);
-+ return err;
-+ }
++ if (!cpus)
++ return;
+
-+ if (new_entry) {
-+ /* No conflict. Not yet added to the list. Add to the tail */
-+ list_add_tail(&new_entry->nd, &memtype_list);
-+ pr_debug("New Entry\n");
-+ }
++ data.func = func;
++ data.info = info;
++ atomic_set(&data.started, 0);
++ data.wait = wait;
++ if (wait)
++ atomic_set(&data.finished, 0);
+
-+ if (ret_type) {
-+ pr_debug(
-+ "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n",
-+ start, end, cattr_name(actual_type),
-+ cattr_name(req_type), cattr_name(*ret_type));
-+ } else {
-+ pr_debug(
-+ "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s\n",
-+ start, end, cattr_name(actual_type),
-+ cattr_name(req_type));
-+ }
++ call_data = &data;
++ mb();
+
-+ spin_unlock(&memtype_lock);
-+ return err;
++ /* Send a message to all other CPUs and wait for them to respond */
++ send_IPI_allbutself(CALL_FUNCTION_VECTOR);
++
++ /* Wait for response */
++ while (atomic_read(&data.started) != cpus)
++ cpu_relax();
++
++ if (wait)
++ while (atomic_read(&data.finished) != cpus)
++ cpu_relax();
+}
+
-+int free_memtype(u64 start, u64 end)
++
++/**
++ * smp_call_function_mask(): Run a function on a set of other CPUs.
++ * @mask: The set of cpus to run on. Must not include the current cpu.
++ * @func: The function to run. This must be fast and non-blocking.
++ * @info: An arbitrary pointer to pass to the function.
++ * @wait: If true, wait (atomically) until function has completed on other CPUs.
++ *
++ * Returns 0 on success, else a negative status code.
++ *
++ * If @wait is true, then returns once @func has returned; otherwise
++ * it returns just before the target cpu calls @func.
++ *
++ * You must not call this function with disabled interrupts or from a
++ * hardware interrupt handler or from a bottom half handler.
++ */
++int
++xen_smp_call_function_mask(cpumask_t mask,
++ void (*func)(void *), void *info,
++ int wait)
+{
-+ struct memtype *ml;
-+ int err = -EINVAL;
++ struct call_data_struct data;
++ cpumask_t allbutself;
++ int cpus;
+
-+ /* Only track when pat_wc_enabled */
-+ if (!pat_wc_enabled) {
-+ return 0;
-+ }
++ /* Can deadlock when called with interrupts disabled */
++ WARN_ON(irqs_disabled());
+
-+ /* Low ISA region is always mapped WB. No need to track */
-+ if (start >= ISA_START_ADDRESS && end <= ISA_END_ADDRESS) {
++ /* Holding any lock stops cpus from going down. */
++ spin_lock(&call_lock);
++
++ allbutself = cpu_online_map;
++ cpu_clear(smp_processor_id(), allbutself);
++
++ cpus_and(mask, mask, allbutself);
++ cpus = cpus_weight(mask);
++
++ if (!cpus) {
++ spin_unlock(&call_lock);
+ return 0;
+ }
+
-+ spin_lock(&memtype_lock);
-+ list_for_each_entry(ml, &memtype_list, nd) {
-+ if (ml->start == start && ml->end == end) {
-+ list_del(&ml->nd);
-+ kfree(ml);
-+ err = 0;
-+ break;
-+ }
-+ }
-+ spin_unlock(&memtype_lock);
++ data.func = func;
++ data.info = info;
++ atomic_set(&data.started, 0);
++ data.wait = wait;
++ if (wait)
++ atomic_set(&data.finished, 0);
+
-+ if (err) {
-+ printk(KERN_INFO "%s:%d freeing invalid memtype %Lx-%Lx\n",
-+ current->comm, current->pid, start, end);
-+ }
++ call_data = &data;
++ wmb();
+
-+ pr_debug("free_memtype request 0x%Lx-0x%Lx\n", start, end);
-+ return err;
-+}
++ /* Send a message to other CPUs */
++ if (cpus_equal(mask, allbutself) &&
++ cpus_equal(cpu_online_map, cpu_callout_map))
++ send_IPI_allbutself(CALL_FUNCTION_VECTOR);
++ else
++ send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
+
++ /* Wait for response */
++ while (atomic_read(&data.started) != cpus)
++ cpu_relax();
+
-+/*
-+ * /dev/mem mmap interface. The memtype used for mapping varies:
-+ * - Use UC for mappings with O_SYNC flag
-+ * - Without O_SYNC flag, if there is any conflict in reserve_memtype,
-+ * inherit the memtype from existing mapping.
-+ * - Else use UC_MINUS memtype (for backward compatibility with existing
-+ * X drivers.
-+ */
-+pgprot_t phys_mem_access_prot(struct file *file, unsigned long mfn,
-+ unsigned long size, pgprot_t vma_prot)
-+{
-+ return vma_prot;
++ if (wait)
++ while (atomic_read(&data.finished) != cpus)
++ cpu_relax();
++ spin_unlock(&call_lock);
++
++ return 0;
+}
+
-+#ifdef CONFIG_NONPROMISC_DEVMEM
-+/* This check is done in drivers/char/mem.c in case of NONPROMISC_DEVMEM*/
-+static inline int range_is_allowed(unsigned long mfn, unsigned long size)
++static void stop_this_cpu(void *dummy)
+{
-+ return 1;
++ local_irq_disable();
++ /*
++ * Remove this CPU:
++ */
++ cpu_clear(smp_processor_id(), cpu_online_map);
++ disable_all_local_evtchn();
++ if (hlt_works(smp_processor_id()))
++ for (;;) halt();
++ for (;;);
+}
-+#else
-+static inline int range_is_allowed(unsigned long mfn, unsigned long size)
-+{
-+ u64 from = ((u64)mfn) << PAGE_SHIFT;
-+ u64 to = from + size;
-+ u64 cursor = from;
+
-+ while (cursor < to) {
-+ if (!devmem_is_allowed(mfn)) {
-+ printk(KERN_INFO
-+ "Program %s tried to access /dev/mem between %Lx->%Lx.\n",
-+ current->comm, from, to);
-+ return 0;
-+ }
-+ cursor += PAGE_SIZE;
-+ mfn++;
-+ }
-+ return 1;
-+}
-+#endif /* CONFIG_NONPROMISC_DEVMEM */
++/*
++ * this function calls the 'stop' function on all other CPUs in the system.
++ */
+
-+int phys_mem_access_prot_allowed(struct file *file, unsigned long mfn,
-+ unsigned long size, pgprot_t *vma_prot)
++void xen_smp_send_stop(void)
+{
-+ u64 addr = (u64)mfn << PAGE_SHIFT;
-+ unsigned long flags = _PAGE_CACHE_UC_MINUS;
-+ int retval;
-+
-+ if (!range_is_allowed(mfn, size))
-+ return 0;
++ int nolock;
++ unsigned long flags;
+
-+ if (file->f_flags & O_SYNC) {
-+ flags = _PAGE_CACHE_UC;
-+ }
++ /* Don't deadlock on the call lock in panic */
++ nolock = !spin_trylock(&call_lock);
++ local_irq_save(flags);
++ __smp_call_function(stop_this_cpu, NULL, 0, 0);
++ if (!nolock)
++ spin_unlock(&call_lock);
++ disable_all_local_evtchn();
++ local_irq_restore(flags);
++}
+
-+#ifndef CONFIG_X86_32
-+#ifndef CONFIG_XEN /* Xen sets correct MTRR type on non-RAM for us. */
-+ /*
-+ * On the PPro and successors, the MTRRs are used to set
-+ * memory types for physical addresses outside main memory,
-+ * so blindly setting UC or PWT on those pages is wrong.
-+ * For Pentiums and earlier, the surround logic should disable
-+ * caching for the high addresses through the KEN pin, but
-+ * we maintain the tradition of paranoia in this code.
-+ */
-+ if (!pat_wc_enabled &&
-+ ! ( test_bit(X86_FEATURE_MTRR, boot_cpu_data.x86_capability) ||
-+ test_bit(X86_FEATURE_K6_MTRR, boot_cpu_data.x86_capability) ||
-+ test_bit(X86_FEATURE_CYRIX_ARR, boot_cpu_data.x86_capability) ||
-+ test_bit(X86_FEATURE_CENTAUR_MCR, boot_cpu_data.x86_capability)) &&
-+ (pfn << PAGE_SHIFT) >= __pa(high_memory)) {
-+ flags = _PAGE_CACHE_UC;
-+ }
-+#endif
++/*
++ * Reschedule call back. Nothing to do,
++ * all the work is done automatically when
++ * we return from the interrupt.
++ */
++irqreturn_t smp_reschedule_interrupt(int irq, void *dev_id)
++{
++#ifdef CONFIG_X86_32
++ __get_cpu_var(irq_stat).irq_resched_count++;
++#else
++ add_pda(irq_resched_count, 1);
+#endif
++ return IRQ_HANDLED;
++}
++
++irqreturn_t smp_call_function_interrupt(int irq, void *dev_id)
++{
++ void (*func) (void *info) = call_data->func;
++ void *info = call_data->info;
++ int wait = call_data->wait;
+
+ /*
-+ * With O_SYNC, we can only take UC mapping. Fail if we cannot.
-+ * Without O_SYNC, we want to get
-+ * - WB for WB-able memory and no other conflicting mappings
-+ * - UC_MINUS for non-WB-able memory with no other conflicting mappings
-+ * - Inherit from confliting mappings otherwise
++ * Notify initiating CPU that I've grabbed the data and am
++ * about to execute the function
+ */
-+ if (flags != _PAGE_CACHE_UC_MINUS) {
-+ retval = reserve_memtype(addr, addr + size, flags, NULL);
-+ } else {
-+ retval = reserve_memtype(addr, addr + size, -1, &flags);
++ mb();
++ atomic_inc(&call_data->started);
++ /*
++ * At this point the info structure may be out of scope unless wait==1
++ */
++ irq_enter();
++ (*func)(info);
++#ifdef CONFIG_X86_32
++ __get_cpu_var(irq_stat).irq_call_count++;
++#else
++ add_pda(irq_call_count, 1);
++#endif
++ irq_exit();
++
++ if (wait) {
++ mb();
++ atomic_inc(&call_data->finished);
+ }
+
-+ if (retval < 0)
-+ return 0;
++ return IRQ_HANDLED;
++}
+--- sle11-2009-05-14.orig/arch/x86/kernel/smp_32-xen.c 2009-03-16 16:33:40.000000000 +0100
++++ /dev/null 1970-01-01 00:00:00.000000000 +0000
+@@ -1,647 +0,0 @@
+-/*
+- * Intel SMP support routines.
+- *
+- * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
+- * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
+- *
+- * This code is released under the GNU General Public License version 2 or
+- * later.
+- */
+-
+-#include <linux/init.h>
+-
+-#include <linux/mm.h>
+-#include <linux/delay.h>
+-#include <linux/spinlock.h>
+-#include <linux/kernel_stat.h>
+-#include <linux/mc146818rtc.h>
+-#include <linux/cache.h>
+-#include <linux/interrupt.h>
+-#include <linux/cpu.h>
+-#include <linux/module.h>
+-
+-#include <asm/mtrr.h>
+-#include <asm/tlbflush.h>
+-#include <asm/mmu_context.h>
+-#if 0
+-#include <mach_apic.h>
+-#endif
+-#include <xen/evtchn.h>
+-
+-/*
+- * Some notes on x86 processor bugs affecting SMP operation:
+- *
+- * Pentium, Pentium Pro, II, III (and all CPUs) have bugs.
+- * The Linux implications for SMP are handled as follows:
+- *
+- * Pentium III / [Xeon]
+- * None of the E1AP-E3AP errata are visible to the user.
+- *
+- * E1AP. see PII A1AP
+- * E2AP. see PII A2AP
+- * E3AP. see PII A3AP
+- *
+- * Pentium II / [Xeon]
+- * None of the A1AP-A3AP errata are visible to the user.
+- *
+- * A1AP. see PPro 1AP
+- * A2AP. see PPro 2AP
+- * A3AP. see PPro 7AP
+- *
+- * Pentium Pro
+- * None of 1AP-9AP errata are visible to the normal user,
+- * except occasional delivery of 'spurious interrupt' as trap #15.
+- * This is very rare and a non-problem.
+- *
+- * 1AP. Linux maps APIC as non-cacheable
+- * 2AP. worked around in hardware
+- * 3AP. fixed in C0 and above steppings microcode update.
+- * Linux does not use excessive STARTUP_IPIs.
+- * 4AP. worked around in hardware
+- * 5AP. symmetric IO mode (normal Linux operation) not affected.
+- * 'noapic' mode has vector 0xf filled out properly.
+- * 6AP. 'noapic' mode might be affected - fixed in later steppings
+- * 7AP. We do not assume writes to the LVT deassering IRQs
+- * 8AP. We do not enable low power mode (deep sleep) during MP bootup
+- * 9AP. We do not use mixed mode
+- *
+- * Pentium
+- * There is a marginal case where REP MOVS on 100MHz SMP
+- * machines with B stepping processors can fail. XXX should provide
+- * an L1cache=Writethrough or L1cache=off option.
+- *
+- * B stepping CPUs may hang. There are hardware work arounds
+- * for this. We warn about it in case your board doesn't have the work
+- * arounds. Basically that's so I can tell anyone with a B stepping
+- * CPU and SMP problems "tough".
+- *
+- * Specific items [From Pentium Processor Specification Update]
+- *
+- * 1AP. Linux doesn't use remote read
+- * 2AP. Linux doesn't trust APIC errors
+- * 3AP. We work around this
+- * 4AP. Linux never generated 3 interrupts of the same priority
+- * to cause a lost local interrupt.
+- * 5AP. Remote read is never used
+- * 6AP. not affected - worked around in hardware
+- * 7AP. not affected - worked around in hardware
+- * 8AP. worked around in hardware - we get explicit CS errors if not
+- * 9AP. only 'noapic' mode affected. Might generate spurious
+- * interrupts, we log only the first one and count the
+- * rest silently.
+- * 10AP. not affected - worked around in hardware
+- * 11AP. Linux reads the APIC between writes to avoid this, as per
+- * the documentation. Make sure you preserve this as it affects
+- * the C stepping chips too.
+- * 12AP. not affected - worked around in hardware
+- * 13AP. not affected - worked around in hardware
+- * 14AP. we always deassert INIT during bootup
+- * 15AP. not affected - worked around in hardware
+- * 16AP. not affected - worked around in hardware
+- * 17AP. not affected - worked around in hardware
+- * 18AP. not affected - worked around in hardware
+- * 19AP. not affected - worked around in BIOS
+- *
+- * If this sounds worrying believe me these bugs are either ___RARE___,
+- * or are signal timing bugs worked around in hardware and there's
+- * about nothing of note with C stepping upwards.
+- */
+-
+-DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate) ____cacheline_aligned = { &init_mm, 0, };
+-
+-/*
+- * the following functions deal with sending IPIs between CPUs.
+- *
+- * We use 'broadcast', CPU->CPU IPIs and self-IPIs too.
+- */
+-
+-static inline int __prepare_ICR (unsigned int shortcut, int vector)
+-{
+- unsigned int icr = shortcut | APIC_DEST_LOGICAL;
+-
+- switch (vector) {
+- default:
+- icr |= APIC_DM_FIXED | vector;
+- break;
+- case NMI_VECTOR:
+- icr |= APIC_DM_NMI;
+- break;
+- }
+- return icr;
+-}
+-
+-static inline int __prepare_ICR2 (unsigned int mask)
+-{
+- return SET_APIC_DEST_FIELD(mask);
+-}
+-
+-DECLARE_PER_CPU(int, ipi_to_irq[NR_IPIS]);
+-
+-static inline void __send_IPI_one(unsigned int cpu, int vector)
+-{
+- int irq = per_cpu(ipi_to_irq, cpu)[vector];
+- BUG_ON(irq < 0);
+- notify_remote_via_irq(irq);
+-}
+-
+-void __send_IPI_shortcut(unsigned int shortcut, int vector)
+-{
+- int cpu;
+-
+- switch (shortcut) {
+- case APIC_DEST_SELF:
+- __send_IPI_one(smp_processor_id(), vector);
+- break;
+- case APIC_DEST_ALLBUT:
+- for (cpu = 0; cpu < NR_CPUS; ++cpu) {
+- if (cpu == smp_processor_id())
+- continue;
+- if (cpu_isset(cpu, cpu_online_map)) {
+- __send_IPI_one(cpu, vector);
+- }
+- }
+- break;
+- default:
+- printk("XXXXXX __send_IPI_shortcut %08x vector %d\n", shortcut,
+- vector);
+- break;
+- }
+-}
+-
+-void send_IPI_self(int vector)
+-{
+- __send_IPI_shortcut(APIC_DEST_SELF, vector);
+-}
+-
+-/*
+- * This is only used on smaller machines.
+- */
+-void send_IPI_mask_bitmask(cpumask_t mask, int vector)
+-{
+- unsigned long flags;
+- unsigned int cpu;
+-
+- local_irq_save(flags);
+- WARN_ON(cpus_addr(mask)[0] & ~cpus_addr(cpu_online_map)[0]);
+-
+- for (cpu = 0; cpu < NR_CPUS; ++cpu) {
+- if (cpu_isset(cpu, mask)) {
+- __send_IPI_one(cpu, vector);
+- }
+- }
+-
+- local_irq_restore(flags);
+-}
+-
+-void send_IPI_mask_sequence(cpumask_t mask, int vector)
+-{
+-
+- send_IPI_mask_bitmask(mask, vector);
+-}
+-
+-#include <mach_ipi.h> /* must come after the send_IPI functions above for inlining */
+-
+-#if 0 /* XEN */
+-/*
+- * Smarter SMP flushing macros.
+- * c/o Linus Torvalds.
+- *
+- * These mean you can really definitely utterly forget about
+- * writing to user space from interrupts. (Its not allowed anyway).
+- *
+- * Optimizations Manfred Spraul <manfred@colorfullife.com>
+- */
+-
+-static cpumask_t flush_cpumask;
+-static struct mm_struct * flush_mm;
+-static unsigned long flush_va;
+-static DEFINE_SPINLOCK(tlbstate_lock);
+-
+-/*
+- * We cannot call mmdrop() because we are in interrupt context,
+- * instead update mm->cpu_vm_mask.
+- *
+- * We need to reload %cr3 since the page tables may be going
+- * away from under us..
+- */
+-void leave_mm(int cpu)
+-{
+- if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
+- BUG();
+- cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask);
+- load_cr3(swapper_pg_dir);
+-}
+-EXPORT_SYMBOL_GPL(leave_mm);
+-
+-/*
+- *
+- * The flush IPI assumes that a thread switch happens in this order:
+- * [cpu0: the cpu that switches]
+- * 1) switch_mm() either 1a) or 1b)
+- * 1a) thread switch to a different mm
+- * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
+- * Stop ipi delivery for the old mm. This is not synchronized with
+- * the other cpus, but smp_invalidate_interrupt ignore flush ipis
+- * for the wrong mm, and in the worst case we perform a superfluous
+- * tlb flush.
+- * 1a2) set cpu_tlbstate to TLBSTATE_OK
+- * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
+- * was in lazy tlb mode.
+- * 1a3) update cpu_tlbstate[].active_mm
+- * Now cpu0 accepts tlb flushes for the new mm.
+- * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
+- * Now the other cpus will send tlb flush ipis.
+- * 1a4) change cr3.
+- * 1b) thread switch without mm change
+- * cpu_tlbstate[].active_mm is correct, cpu0 already handles
+- * flush ipis.
+- * 1b1) set cpu_tlbstate to TLBSTATE_OK
+- * 1b2) test_and_set the cpu bit in cpu_vm_mask.
+- * Atomically set the bit [other cpus will start sending flush ipis],
+- * and test the bit.
+- * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
+- * 2) switch %%esp, ie current
+- *
+- * The interrupt must handle 2 special cases:
+- * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
+- * - the cpu performs speculative tlb reads, i.e. even if the cpu only
+- * runs in kernel space, the cpu could load tlb entries for user space
+- * pages.
+- *
+- * The good news is that cpu_tlbstate is local to each cpu, no
+- * write/read ordering problems.
+- */
+-
+-/*
+- * TLB flush IPI:
+- *
+- * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
+- * 2) Leave the mm if we are in the lazy tlb mode.
+- */
+-
+-irqreturn_t smp_invalidate_interrupt(int irq, void *dev_id)
+-{
+- unsigned long cpu;
+-
+- cpu = get_cpu();
+-
+- if (!cpu_isset(cpu, flush_cpumask))
+- goto out;
+- /*
+- * This was a BUG() but until someone can quote me the
+- * line from the intel manual that guarantees an IPI to
+- * multiple CPUs is retried _only_ on the erroring CPUs
+- * its staying as a return
+- *
+- * BUG();
+- */
+-
+- if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) {
+- if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) {
+- if (flush_va == TLB_FLUSH_ALL)
+- local_flush_tlb();
+- else
+- __flush_tlb_one(flush_va);
+- } else
+- leave_mm(cpu);
+- }
+- smp_mb__before_clear_bit();
+- cpu_clear(cpu, flush_cpumask);
+- smp_mb__after_clear_bit();
+-out:
+- put_cpu_no_resched();
+- __get_cpu_var(irq_stat).irq_tlb_count++;
+-
+- return IRQ_HANDLED;
+-}
+-
+-void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
+- unsigned long va)
+-{
+- cpumask_t cpumask = *cpumaskp;
+-
+- /*
+- * A couple of (to be removed) sanity checks:
+- *
+- * - current CPU must not be in mask
+- * - mask must exist :)
+- */
+- BUG_ON(cpus_empty(cpumask));
+- BUG_ON(cpu_isset(smp_processor_id(), cpumask));
+- BUG_ON(!mm);
+-
+-#ifdef CONFIG_HOTPLUG_CPU
+- /* If a CPU which we ran on has gone down, OK. */
+- cpus_and(cpumask, cpumask, cpu_online_map);
+- if (unlikely(cpus_empty(cpumask)))
+- return;
+-#endif
+-
+- /*
+- * i'm not happy about this global shared spinlock in the
+- * MM hot path, but we'll see how contended it is.
+- * AK: x86-64 has a faster method that could be ported.
+- */
+- spin_lock(&tlbstate_lock);
+-
+- flush_mm = mm;
+- flush_va = va;
+- cpus_or(flush_cpumask, cpumask, flush_cpumask);
+- /*
+- * We have to send the IPI only to
+- * CPUs affected.
+- */
+- send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR);
+-
+- while (!cpus_empty(flush_cpumask))
+- /* nothing. lockup detection does not belong here */
+- cpu_relax();
+-
+- flush_mm = NULL;
+- flush_va = 0;
+- spin_unlock(&tlbstate_lock);
+-}
+-
+-void flush_tlb_current_task(void)
+-{
+- struct mm_struct *mm = current->mm;
+- cpumask_t cpu_mask;
+-
+- preempt_disable();
+- cpu_mask = mm->cpu_vm_mask;
+- cpu_clear(smp_processor_id(), cpu_mask);
+-
+- local_flush_tlb();
+- if (!cpus_empty(cpu_mask))
+- flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
+- preempt_enable();
+-}
+-
+-void flush_tlb_mm (struct mm_struct * mm)
+-{
+- cpumask_t cpu_mask;
+-
+- preempt_disable();
+- cpu_mask = mm->cpu_vm_mask;
+- cpu_clear(smp_processor_id(), cpu_mask);
+-
+- if (current->active_mm == mm) {
+- if (current->mm)
+- local_flush_tlb();
+- else
+- leave_mm(smp_processor_id());
+- }
+- if (!cpus_empty(cpu_mask))
+- flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
+-
+- preempt_enable();
+-}
+-
+-void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
+-{
+- struct mm_struct *mm = vma->vm_mm;
+- cpumask_t cpu_mask;
+-
+- preempt_disable();
+- cpu_mask = mm->cpu_vm_mask;
+- cpu_clear(smp_processor_id(), cpu_mask);
+-
+- if (current->active_mm == mm) {
+- if(current->mm)
+- __flush_tlb_one(va);
+- else
+- leave_mm(smp_processor_id());
+- }
+-
+- if (!cpus_empty(cpu_mask))
+- flush_tlb_others(cpu_mask, mm, va);
+-
+- preempt_enable();
+-}
+-EXPORT_SYMBOL(flush_tlb_page);
+-
+-static void do_flush_tlb_all(void* info)
+-{
+- unsigned long cpu = smp_processor_id();
+-
+- __flush_tlb_all();
+- if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_LAZY)
+- leave_mm(cpu);
+-}
+-
+-void flush_tlb_all(void)
+-{
+- on_each_cpu(do_flush_tlb_all, NULL, 1, 1);
+-}
+-
+-#endif /* XEN */
+-
+-/*
+- * this function sends a 'reschedule' IPI to another CPU.
+- * it goes straight through and wastes no time serializing
+- * anything. Worst case is that we lose a reschedule ...
+- */
+-void xen_smp_send_reschedule(int cpu)
+-{
+- WARN_ON(cpu_is_offline(cpu));
+- send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
+-}
+-
+-/*
+- * Structure and data for smp_call_function(). This is designed to minimise
+- * static memory requirements. It also looks cleaner.
+- */
+-static DEFINE_SPINLOCK(call_lock);
+-
+-struct call_data_struct {
+- void (*func) (void *info);
+- void *info;
+- atomic_t started;
+- atomic_t finished;
+- int wait;
+-};
+-
+-void lock_ipi_call_lock(void)
+-{
+- spin_lock_irq(&call_lock);
+-}
+-
+-void unlock_ipi_call_lock(void)
+-{
+- spin_unlock_irq(&call_lock);
+-}
+-
+-static struct call_data_struct *call_data;
+-
+-static void __smp_call_function(void (*func) (void *info), void *info,
+- int nonatomic, int wait)
+-{
+- struct call_data_struct data;
+- int cpus = num_online_cpus() - 1;
+-
+- if (!cpus)
+- return;
+-
+- data.func = func;
+- data.info = info;
+- atomic_set(&data.started, 0);
+- data.wait = wait;
+- if (wait)
+- atomic_set(&data.finished, 0);
+-
+- call_data = &data;
+- mb();
+-
+- /* Send a message to all other CPUs and wait for them to respond */
+- send_IPI_allbutself(CALL_FUNCTION_VECTOR);
+-
+- /* Wait for response */
+- while (atomic_read(&data.started) != cpus)
+- cpu_relax();
+-
+- if (wait)
+- while (atomic_read(&data.finished) != cpus)
+- cpu_relax();
+-}
+-
+-
+-/**
+- * smp_call_function_mask(): Run a function on a set of other CPUs.
+- * @mask: The set of cpus to run on. Must not include the current cpu.
+- * @func: The function to run. This must be fast and non-blocking.
+- * @info: An arbitrary pointer to pass to the function.
+- * @wait: If true, wait (atomically) until function has completed on other CPUs.
+- *
+- * Returns 0 on success, else a negative status code.
+- *
+- * If @wait is true, then returns once @func has returned; otherwise
+- * it returns just before the target cpu calls @func.
+- *
+- * You must not call this function with disabled interrupts or from a
+- * hardware interrupt handler or from a bottom half handler.
+- */
+-int
+-xen_smp_call_function_mask(cpumask_t mask,
+- void (*func)(void *), void *info,
+- int wait)
+-{
+- struct call_data_struct data;
+- cpumask_t allbutself;
+- int cpus;
+-
+- /* Can deadlock when called with interrupts disabled */
+- WARN_ON(irqs_disabled());
+-
+- /* Holding any lock stops cpus from going down. */
+- spin_lock(&call_lock);
+-
+- allbutself = cpu_online_map;
+- cpu_clear(smp_processor_id(), allbutself);
+-
+- cpus_and(mask, mask, allbutself);
+- cpus = cpus_weight(mask);
+-
+- if (!cpus) {
+- spin_unlock(&call_lock);
+- return 0;
+- }
+-
+- data.func = func;
+- data.info = info;
+- atomic_set(&data.started, 0);
+- data.wait = wait;
+- if (wait)
+- atomic_set(&data.finished, 0);
+-
+- call_data = &data;
+- mb();
+-
+- /* Send a message to other CPUs */
+- if (cpus_equal(mask, allbutself))
+- send_IPI_allbutself(CALL_FUNCTION_VECTOR);
+- else
+- send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
+-
+- /* Wait for response */
+- while (atomic_read(&data.started) != cpus)
+- cpu_relax();
+-
+- if (wait)
+- while (atomic_read(&data.finished) != cpus)
+- cpu_relax();
+- spin_unlock(&call_lock);
+-
+- return 0;
+-}
+-
+-static void stop_this_cpu (void * dummy)
+-{
+- local_irq_disable();
+- /*
+- * Remove this CPU:
+- */
+- cpu_clear(smp_processor_id(), cpu_online_map);
+- disable_all_local_evtchn();
+- if (cpu_data(smp_processor_id()).hlt_works_ok)
+- for(;;) halt();
+- for (;;);
+-}
+-
+-/*
+- * this function calls the 'stop' function on all other CPUs in the system.
+- */
+-
+-void xen_smp_send_stop(void)
+-{
+- /* Don't deadlock on the call lock in panic */
+- int nolock = !spin_trylock(&call_lock);
+- unsigned long flags;
+-
+- local_irq_save(flags);
+- __smp_call_function(stop_this_cpu, NULL, 0, 0);
+- if (!nolock)
+- spin_unlock(&call_lock);
+- disable_all_local_evtchn();
+- local_irq_restore(flags);
+-}
+-
+-/*
+- * Reschedule call back. Nothing to do,
+- * all the work is done automatically when
+- * we return from the interrupt.
+- */
+-irqreturn_t smp_reschedule_interrupt(int irq, void *dev_id)
+-{
+- __get_cpu_var(irq_stat).irq_resched_count++;
+-
+- return IRQ_HANDLED;
+-}
+-
+-#include <linux/kallsyms.h>
+-irqreturn_t smp_call_function_interrupt(int irq, void *dev_id)
+-{
+- void (*func) (void *info) = call_data->func;
+- void *info = call_data->info;
+- int wait = call_data->wait;
+-
+- /*
+- * Notify initiating CPU that I've grabbed the data and am
+- * about to execute the function
+- */
+- mb();
+- atomic_inc(&call_data->started);
+- /*
+- * At this point the info structure may be out of scope unless wait==1
+- */
+- irq_enter();
+- (*func)(info);
+- __get_cpu_var(irq_stat).irq_call_count++;
+- irq_exit();
+-
+- if (wait) {
+- mb();
+- atomic_inc(&call_data->finished);
+- }
+-
+- return IRQ_HANDLED;
+-}
+--- sle11-2009-05-14.orig/arch/x86/kernel/smp_64-xen.c 2009-03-16 16:33:40.000000000 +0100
++++ /dev/null 1970-01-01 00:00:00.000000000 +0000
+@@ -1,554 +0,0 @@
+-/*
+- * Intel SMP support routines.
+- *
+- * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
+- * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
+- * (c) 2002,2003 Andi Kleen, SuSE Labs.
+- *
+- * This code is released under the GNU General Public License version 2 or
+- * later.
+- */
+-
+-#include <linux/init.h>
+-
+-#include <linux/mm.h>
+-#include <linux/delay.h>
+-#include <linux/spinlock.h>
+-#include <linux/smp.h>
+-#include <linux/kernel_stat.h>
+-#include <linux/mc146818rtc.h>
+-#include <linux/interrupt.h>
+-
+-#include <asm/mtrr.h>
+-#include <asm/pgalloc.h>
+-#include <asm/tlbflush.h>
+-#include <asm/mach_apic.h>
+-#include <asm/mmu_context.h>
+-#include <asm/proto.h>
+-#include <asm/apicdef.h>
+-#include <asm/idle.h>
+-#ifdef CONFIG_XEN
+-#include <xen/evtchn.h>
+-#endif
+-
+-#ifndef CONFIG_XEN
+-/*
+- * Smarter SMP flushing macros.
+- * c/o Linus Torvalds.
+- *
+- * These mean you can really definitely utterly forget about
+- * writing to user space from interrupts. (Its not allowed anyway).
+- *
+- * Optimizations Manfred Spraul <manfred@colorfullife.com>
+- *
+- * More scalable flush, from Andi Kleen
+- *
+- * To avoid global state use 8 different call vectors.
+- * Each CPU uses a specific vector to trigger flushes on other
+- * CPUs. Depending on the received vector the target CPUs look into
+- * the right per cpu variable for the flush data.
+- *
+- * With more than 8 CPUs they are hashed to the 8 available
+- * vectors. The limited global vector space forces us to this right now.
+- * In future when interrupts are split into per CPU domains this could be
+- * fixed, at the cost of triggering multiple IPIs in some cases.
+- */
+-
+-union smp_flush_state {
+- struct {
+- cpumask_t flush_cpumask;
+- struct mm_struct *flush_mm;
+- unsigned long flush_va;
+- spinlock_t tlbstate_lock;
+- };
+- char pad[SMP_CACHE_BYTES];
+-} ____cacheline_aligned;
+-
+-/* State is put into the per CPU data section, but padded
+- to a full cache line because other CPUs can access it and we don't
+- want false sharing in the per cpu data segment. */
+-static DEFINE_PER_CPU(union smp_flush_state, flush_state);
+-
+-/*
+- * We cannot call mmdrop() because we are in interrupt context,
+- * instead update mm->cpu_vm_mask.
+- */
+-void leave_mm(int cpu)
+-{
+- if (read_pda(mmu_state) == TLBSTATE_OK)
+- BUG();
+- cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask);
+- load_cr3(swapper_pg_dir);
+-}
+-EXPORT_SYMBOL_GPL(leave_mm);
+-
+-/*
+- *
+- * The flush IPI assumes that a thread switch happens in this order:
+- * [cpu0: the cpu that switches]
+- * 1) switch_mm() either 1a) or 1b)
+- * 1a) thread switch to a different mm
+- * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
+- * Stop ipi delivery for the old mm. This is not synchronized with
+- * the other cpus, but smp_invalidate_interrupt ignore flush ipis
+- * for the wrong mm, and in the worst case we perform a superfluous
+- * tlb flush.
+- * 1a2) set cpu mmu_state to TLBSTATE_OK
+- * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
+- * was in lazy tlb mode.
+- * 1a3) update cpu active_mm
+- * Now cpu0 accepts tlb flushes for the new mm.
+- * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
+- * Now the other cpus will send tlb flush ipis.
+- * 1a4) change cr3.
+- * 1b) thread switch without mm change
+- * cpu active_mm is correct, cpu0 already handles
+- * flush ipis.
+- * 1b1) set cpu mmu_state to TLBSTATE_OK
+- * 1b2) test_and_set the cpu bit in cpu_vm_mask.
+- * Atomically set the bit [other cpus will start sending flush ipis],
+- * and test the bit.
+- * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
+- * 2) switch %%esp, ie current
+- *
+- * The interrupt must handle 2 special cases:
+- * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
+- * - the cpu performs speculative tlb reads, i.e. even if the cpu only
+- * runs in kernel space, the cpu could load tlb entries for user space
+- * pages.
+- *
+- * The good news is that cpu mmu_state is local to each cpu, no
+- * write/read ordering problems.
+- */
+-
+-/*
+- * TLB flush IPI:
+- *
+- * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
+- * 2) Leave the mm if we are in the lazy tlb mode.
+- *
+- * Interrupts are disabled.
+- */
+-
+-asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs)
+-{
+- int cpu;
+- int sender;
+- union smp_flush_state *f;
+-
+- cpu = smp_processor_id();
+- /*
+- * orig_rax contains the negated interrupt vector.
+- * Use that to determine where the sender put the data.
+- */
+- sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START;
+- f = &per_cpu(flush_state, sender);
+-
+- if (!cpu_isset(cpu, f->flush_cpumask))
+- goto out;
+- /*
+- * This was a BUG() but until someone can quote me the
+- * line from the intel manual that guarantees an IPI to
+- * multiple CPUs is retried _only_ on the erroring CPUs
+- * its staying as a return
+- *
+- * BUG();
+- */
+-
+- if (f->flush_mm == read_pda(active_mm)) {
+- if (read_pda(mmu_state) == TLBSTATE_OK) {
+- if (f->flush_va == TLB_FLUSH_ALL)
+- local_flush_tlb();
+- else
+- __flush_tlb_one(f->flush_va);
+- } else
+- leave_mm(cpu);
+- }
+-out:
+- ack_APIC_irq();
+- cpu_clear(cpu, f->flush_cpumask);
+- add_pda(irq_tlb_count, 1);
+-}
+-
+-void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
+- unsigned long va)
+-{
+- int sender;
+- union smp_flush_state *f;
+- cpumask_t cpumask = *cpumaskp;
+-
+- /* Caller has disabled preemption */
+- sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
+- f = &per_cpu(flush_state, sender);
+-
+- /*
+- * Could avoid this lock when
+- * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
+- * probably not worth checking this for a cache-hot lock.
+- */
+- spin_lock(&f->tlbstate_lock);
+-
+- f->flush_mm = mm;
+- f->flush_va = va;
+- cpus_or(f->flush_cpumask, cpumask, f->flush_cpumask);
+-
+- /*
+- * We have to send the IPI only to
+- * CPUs affected.
+- */
+- send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR_START + sender);
+-
+- while (!cpus_empty(f->flush_cpumask))
+- cpu_relax();
+-
+- f->flush_mm = NULL;
+- f->flush_va = 0;
+- spin_unlock(&f->tlbstate_lock);
+-}
+-
+-int __cpuinit init_smp_flush(void)
+-{
+- int i;
+-
+- for_each_cpu_mask(i, cpu_possible_map) {
+- spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock);
+- }
+- return 0;
+-}
+-core_initcall(init_smp_flush);
+-
+-void flush_tlb_current_task(void)
+-{
+- struct mm_struct *mm = current->mm;
+- cpumask_t cpu_mask;
+-
+- preempt_disable();
+- cpu_mask = mm->cpu_vm_mask;
+- cpu_clear(smp_processor_id(), cpu_mask);
+-
+- local_flush_tlb();
+- if (!cpus_empty(cpu_mask))
+- flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
+- preempt_enable();
+-}
+-
+-void flush_tlb_mm (struct mm_struct * mm)
+-{
+- cpumask_t cpu_mask;
+-
+- preempt_disable();
+- cpu_mask = mm->cpu_vm_mask;
+- cpu_clear(smp_processor_id(), cpu_mask);
+-
+- if (current->active_mm == mm) {
+- if (current->mm)
+- local_flush_tlb();
+- else
+- leave_mm(smp_processor_id());
+- }
+- if (!cpus_empty(cpu_mask))
+- flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
+-
+- preempt_enable();
+-}
+-
+-void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
+-{
+- struct mm_struct *mm = vma->vm_mm;
+- cpumask_t cpu_mask;
+-
+- preempt_disable();
+- cpu_mask = mm->cpu_vm_mask;
+- cpu_clear(smp_processor_id(), cpu_mask);
+-
+- if (current->active_mm == mm) {
+- if(current->mm)
+- __flush_tlb_one(va);
+- else
+- leave_mm(smp_processor_id());
+- }
+-
+- if (!cpus_empty(cpu_mask))
+- flush_tlb_others(cpu_mask, mm, va);
+-
+- preempt_enable();
+-}
+-
+-static void do_flush_tlb_all(void* info)
+-{
+- unsigned long cpu = smp_processor_id();
+-
+- __flush_tlb_all();
+- if (read_pda(mmu_state) == TLBSTATE_LAZY)
+- leave_mm(cpu);
+-}
+-
+-void flush_tlb_all(void)
+-{
+- on_each_cpu(do_flush_tlb_all, NULL, 1, 1);
+-}
+-#endif /* Xen */
+-
+-/*
+- * this function sends a 'reschedule' IPI to another CPU.
+- * it goes straight through and wastes no time serializing
+- * anything. Worst case is that we lose a reschedule ...
+- */
+-
+-void smp_send_reschedule(int cpu)
+-{
+- send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
+-}
+-
+-/*
+- * Structure and data for smp_call_function(). This is designed to minimise
+- * static memory requirements. It also looks cleaner.
+- */
+-static DEFINE_SPINLOCK(call_lock);
+-
+-struct call_data_struct {
+- void (*func) (void *info);
+- void *info;
+- atomic_t started;
+- atomic_t finished;
+- int wait;
+-};
+-
+-static struct call_data_struct * call_data;
+-
+-void lock_ipi_call_lock(void)
+-{
+- spin_lock_irq(&call_lock);
+-}
+-
+-void unlock_ipi_call_lock(void)
+-{
+- spin_unlock_irq(&call_lock);
+-}
+-
+-/*
+- * this function sends a 'generic call function' IPI to all other CPU
+- * of the system defined in the mask.
+- */
+-static int __smp_call_function_mask(cpumask_t mask,
+- void (*func)(void *), void *info,
+- int wait)
+-{
+- struct call_data_struct data;
+- cpumask_t allbutself;
+- int cpus;
+-
+- allbutself = cpu_online_map;
+- cpu_clear(smp_processor_id(), allbutself);
+-
+- cpus_and(mask, mask, allbutself);
+- cpus = cpus_weight(mask);
+-
+- if (!cpus)
+- return 0;
+-
+- data.func = func;
+- data.info = info;
+- atomic_set(&data.started, 0);
+- data.wait = wait;
+- if (wait)
+- atomic_set(&data.finished, 0);
+-
+- call_data = &data;
+- wmb();
+-
+- /* Send a message to other CPUs */
+- if (cpus_equal(mask, allbutself))
+- send_IPI_allbutself(CALL_FUNCTION_VECTOR);
+- else
+- send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
+-
+- /* Wait for response */
+- while (atomic_read(&data.started) != cpus)
+- cpu_relax();
+-
+- if (!wait)
+- return 0;
+-
+- while (atomic_read(&data.finished) != cpus)
+- cpu_relax();
+-
+- return 0;
+-}
+-/**
+- * smp_call_function_mask(): Run a function on a set of other CPUs.
+- * @mask: The set of cpus to run on. Must not include the current cpu.
+- * @func: The function to run. This must be fast and non-blocking.
+- * @info: An arbitrary pointer to pass to the function.
+- * @wait: If true, wait (atomically) until function has completed on other CPUs.
+- *
+- * Returns 0 on success, else a negative status code.
+- *
+- * If @wait is true, then returns once @func has returned; otherwise
+- * it returns just before the target cpu calls @func.
+- *
+- * You must not call this function with disabled interrupts or from a
+- * hardware interrupt handler or from a bottom half handler.
+- */
+-int smp_call_function_mask(cpumask_t mask,
+- void (*func)(void *), void *info,
+- int wait)
+-{
+- int ret;
+-
+- /* Can deadlock when called with interrupts disabled */
+- WARN_ON(irqs_disabled());
+-
+- spin_lock(&call_lock);
+- ret = __smp_call_function_mask(mask, func, info, wait);
+- spin_unlock(&call_lock);
+- return ret;
+-}
+-EXPORT_SYMBOL(smp_call_function_mask);
+-
+-/*
+- * smp_call_function_single - Run a function on a specific CPU
+- * @func: The function to run. This must be fast and non-blocking.
+- * @info: An arbitrary pointer to pass to the function.
+- * @nonatomic: Currently unused.
+- * @wait: If true, wait until function has completed on other CPUs.
+- *
+- * Retrurns 0 on success, else a negative status code.
+- *
+- * Does not return until the remote CPU is nearly ready to execute <func>
+- * or is or has executed.
+- */
+-
+-int smp_call_function_single (int cpu, void (*func) (void *info), void *info,
+- int nonatomic, int wait)
+-{
+- /* prevent preemption and reschedule on another processor */
+- int ret, me = get_cpu();
+-
+- /* Can deadlock when called with interrupts disabled */
+- WARN_ON(irqs_disabled());
+-
+- if (cpu == me) {
+- local_irq_disable();
+- func(info);
+- local_irq_enable();
+- put_cpu();
+- return 0;
+- }
+-
+- ret = smp_call_function_mask(cpumask_of_cpu(cpu), func, info, wait);
+-
+- put_cpu();
+- return ret;
+-}
+-EXPORT_SYMBOL(smp_call_function_single);
+-
+-/*
+- * smp_call_function - run a function on all other CPUs.
+- * @func: The function to run. This must be fast and non-blocking.
+- * @info: An arbitrary pointer to pass to the function.
+- * @nonatomic: currently unused.
+- * @wait: If true, wait (atomically) until function has completed on other
+- * CPUs.
+- *
+- * Returns 0 on success, else a negative status code. Does not return until
+- * remote CPUs are nearly ready to execute func or are or have executed.
+- *
+- * You must not call this function with disabled interrupts or from a
+- * hardware interrupt handler or from a bottom half handler.
+- * Actually there are a few legal cases, like panic.
+- */
+-int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
+- int wait)
+-{
+- return smp_call_function_mask(cpu_online_map, func, info, wait);
+-}
+-EXPORT_SYMBOL(smp_call_function);
+-
+-static void stop_this_cpu(void *dummy)
+-{
+- local_irq_disable();
+- /*
+- * Remove this CPU:
+- */
+- cpu_clear(smp_processor_id(), cpu_online_map);
+- disable_all_local_evtchn();
+- for (;;)
+- halt();
+-}
+-
+-void smp_send_stop(void)
+-{
+- int nolock;
+- unsigned long flags;
+-
+-#ifndef CONFIG_XEN
+- if (reboot_force)
+- return;
+-#endif
+-
+- /* Don't deadlock on the call lock in panic */
+- nolock = !spin_trylock(&call_lock);
+- local_irq_save(flags);
+- __smp_call_function_mask(cpu_online_map, stop_this_cpu, NULL, 0);
+- if (!nolock)
+- spin_unlock(&call_lock);
+- disable_all_local_evtchn();
+- local_irq_restore(flags);
+-}
+-
+-/*
+- * Reschedule call back. Nothing to do,
+- * all the work is done automatically when
+- * we return from the interrupt.
+- */
+-#ifndef CONFIG_XEN
+-asmlinkage void smp_reschedule_interrupt(void)
+-#else
+-asmlinkage irqreturn_t smp_reschedule_interrupt(int irq, void *ctx)
+-#endif
+-{
+-#ifndef CONFIG_XEN
+- ack_APIC_irq();
+-#endif
+- add_pda(irq_resched_count, 1);
+-#ifdef CONFIG_XEN
+- return IRQ_HANDLED;
+-#endif
+-}
+-
+-#ifndef CONFIG_XEN
+-asmlinkage void smp_call_function_interrupt(void)
+-#else
+-asmlinkage irqreturn_t smp_call_function_interrupt(int irq, void *ctx)
+-#endif
+-{
+- void (*func) (void *info) = call_data->func;
+- void *info = call_data->info;
+- int wait = call_data->wait;
+-
+-#ifndef CONFIG_XEN
+- ack_APIC_irq();
+-#endif
+- /*
+- * Notify initiating CPU that I've grabbed the data and am
+- * about to execute the function
+- */
+- mb();
+- atomic_inc(&call_data->started);
+- /*
+- * At this point the info structure may be out of scope unless wait==1
+- */
+- exit_idle();
+- irq_enter();
+- (*func)(info);
+- add_pda(irq_call_count, 1);
+- irq_exit();
+- if (wait) {
+- mb();
+- atomic_inc(&call_data->finished);
+- }
+-#ifdef CONFIG_XEN
+- return IRQ_HANDLED;
+-#endif
+-}
+--- sle11-2009-05-14.orig/arch/x86/kernel/time_32-xen.c 2009-03-24 10:12:48.000000000 +0100
++++ sle11-2009-05-14/arch/x86/kernel/time_32-xen.c 2009-03-24 10:13:09.000000000 +0100
+@@ -699,8 +699,6 @@ int xen_update_persistent_clock(void)
+ return 0;
+ }
+
+-extern void (*late_time_init)(void);
+-
+ /* Dynamically-mapped IRQ. */
+ DEFINE_PER_CPU(int, timer_irq);
+
+--- sle11-2009-05-14.orig/arch/x86/kernel/traps_32-xen.c 2009-03-16 16:33:40.000000000 +0100
++++ sle11-2009-05-14/arch/x86/kernel/traps_32-xen.c 2009-03-16 16:38:05.000000000 +0100
+@@ -9,26 +9,28 @@
+ * 'Traps.c' handles hardware traps and faults after we have saved some
+ * state in 'asm.s'.
+ */
+-#include <linux/sched.h>
++#include <linux/interrupt.h>
++#include <linux/kallsyms.h>
++#include <linux/spinlock.h>
++#include <linux/highmem.h>
++#include <linux/kprobes.h>
++#include <linux/uaccess.h>
++#include <linux/utsname.h>
++#include <linux/kdebug.h>
+ #include <linux/kernel.h>
++#include <linux/module.h>
++#include <linux/ptrace.h>
+ #include <linux/string.h>
++#include <linux/unwind.h>
++#include <linux/delay.h>
+ #include <linux/errno.h>
++#include <linux/kexec.h>
++#include <linux/sched.h>
+ #include <linux/timer.h>
+-#include <linux/mm.h>
+ #include <linux/init.h>
+-#include <linux/delay.h>
+-#include <linux/spinlock.h>
+-#include <linux/interrupt.h>
+-#include <linux/highmem.h>
+-#include <linux/kallsyms.h>
+-#include <linux/ptrace.h>
+-#include <linux/utsname.h>
+-#include <linux/kprobes.h>
+-#include <linux/kexec.h>
+-#include <linux/unwind.h>
+-#include <linux/uaccess.h>
+-#include <linux/nmi.h>
+ #include <linux/bug.h>
++#include <linux/nmi.h>
++#include <linux/mm.h>
+
+ #ifdef CONFIG_EISA
+ #include <linux/ioport.h>
+@@ -43,21 +45,18 @@
+ #include <linux/edac.h>
+ #endif
+
++#include <asm/arch_hooks.h>
++#include <asm/stacktrace.h>
+ #include <asm/processor.h>
+-#include <asm/system.h>
+-#include <asm/io.h>
+-#include <asm/atomic.h>
+ #include <asm/debugreg.h>
++#include <asm/atomic.h>
++#include <asm/system.h>
++#include <asm/unwind.h>
+ #include <asm/desc.h>
+ #include <asm/i387.h>
+ #include <asm/nmi.h>
+-#include <asm/unwind.h>
+ #include <asm/smp.h>
+-#include <asm/arch_hooks.h>
+-#include <linux/kdebug.h>
+-#include <asm/stacktrace.h>
+-
+-#include <linux/module.h>
++#include <asm/io.h>
+
+ #include "mach_traps.h"
+
+@@ -71,7 +70,7 @@ EXPORT_SYMBOL_GPL(used_vectors);
+ asmlinkage int system_call(void);
+
+ /* Do we ignore FPU interrupts ? */
+-char ignore_fpu_irq = 0;
++char ignore_fpu_irq;
+
+ #ifndef CONFIG_X86_NO_IDT
+ /*
+@@ -113,12 +112,13 @@ static unsigned int code_bytes = 64;
+ void printk_address(unsigned long address, int reliable)
+ {
+ #ifdef CONFIG_KALLSYMS
+- unsigned long offset = 0, symsize;
++ char namebuf[KSYM_NAME_LEN];
++ unsigned long offset = 0;
++ unsigned long symsize;
+ const char *symname;
+- char *modname;
+- char *delim = ":";
+- char namebuf[128];
+ char reliab[4] = "";
++ char *delim = ":";
++ char *modname;
+
+ symname = kallsyms_lookup(address, &symsize, &offset,
+ &modname, namebuf);
+@@ -146,13 +146,14 @@ static inline int valid_stack_ptr(struct
+
+ /* The form of the top of the frame on the stack */
+ struct stack_frame {
+- struct stack_frame *next_frame;
+- unsigned long return_address;
++ struct stack_frame *next_frame;
++ unsigned long return_address;
+ };
+
+-static inline unsigned long print_context_stack(struct thread_info *tinfo,
+- unsigned long *stack, unsigned long bp,
+- const struct stacktrace_ops *ops, void *data)
++static inline unsigned long
++print_context_stack(struct thread_info *tinfo,
++ unsigned long *stack, unsigned long bp,
++ const struct stacktrace_ops *ops, void *data)
+ {
+ struct stack_frame *frame = (struct stack_frame *)bp;
+
+@@ -174,7 +175,7 @@ static inline unsigned long print_contex
+ return bp;
+ }
+
+-#define MSG(msg) ops->warning(data, msg)
++#define MSG(msg) ops->warning(data, msg)
+
+ void dump_trace(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *stack, unsigned long bp,
+@@ -185,6 +186,7 @@ void dump_trace(struct task_struct *task
+
+ if (!stack) {
+ unsigned long dummy;
++
+ stack = &dummy;
+ if (task != current)
+ stack = (unsigned long *)task->thread.sp;
+@@ -194,7 +196,7 @@ void dump_trace(struct task_struct *task
+ if (!bp) {
+ if (task == current) {
+ /* Grab bp right from our regs */
+- asm ("movl %%ebp, %0" : "=r" (bp) : );
++ asm("movl %%ebp, %0" : "=r" (bp) :);
+ } else {
+ /* bp is the last reg pushed by switch_to */
+ bp = *(unsigned long *) task->thread.sp;
+@@ -204,15 +206,18 @@ void dump_trace(struct task_struct *task
+
+ while (1) {
+ struct thread_info *context;
+
-+ if (ioremap_check_change_attr(mfn, size, flags) < 0) {
-+ free_memtype(addr, addr + size);
-+ printk(KERN_INFO
-+ "%s:%d /dev/mem ioremap_change_attr failed %s for %Lx-%Lx\n",
-+ current->comm, current->pid,
-+ cattr_name(flags),
-+ addr, addr + size);
-+ return 0;
-+ }
+ context = (struct thread_info *)
+ ((unsigned long)stack & (~(THREAD_SIZE - 1)));
+ bp = print_context_stack(context, stack, bp, ops, data);
+- /* Should be after the line below, but somewhere
+- in early boot context comes out corrupted and we
+- can't reference it -AK */
++ /*
++ * Should be after the line below, but somewhere
++ * in early boot context comes out corrupted and we
++ * can't reference it:
++ */
+ if (ops->stack(data, "IRQ") < 0)
+ break;
+- stack = (unsigned long*)context->previous_esp;
++ stack = (unsigned long *)context->previous_esp;
+ if (!stack)
+ break;
+ touch_nmi_watchdog();
+@@ -251,15 +256,15 @@ static void print_trace_address(void *da
+ }
+
+ static const struct stacktrace_ops print_trace_ops = {
+- .warning = print_trace_warning,
+- .warning_symbol = print_trace_warning_symbol,
+- .stack = print_trace_stack,
+- .address = print_trace_address,
++ .warning = print_trace_warning,
++ .warning_symbol = print_trace_warning_symbol,
++ .stack = print_trace_stack,
++ .address = print_trace_address,
+ };
+
+ static void
+ show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
+- unsigned long *stack, unsigned long bp, char *log_lvl)
++ unsigned long *stack, unsigned long bp, char *log_lvl)
+ {
+ dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
+ printk("%s =======================\n", log_lvl);
+@@ -271,21 +276,22 @@ void show_trace(struct task_struct *task
+ show_trace_log_lvl(task, regs, stack, bp, "");
+ }
+
+-static void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
+- unsigned long *sp, unsigned long bp, char *log_lvl)
++static void
++show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
++ unsigned long *sp, unsigned long bp, char *log_lvl)
+ {
+ unsigned long *stack;
+ int i;
+
+ if (sp == NULL) {
+ if (task)
+- sp = (unsigned long*)task->thread.sp;
++ sp = (unsigned long *)task->thread.sp;
+ else
+ sp = (unsigned long *)&sp;
+ }
+
+ stack = sp;
+- for(i = 0; i < kstack_depth_to_print; i++) {
++ for (i = 0; i < kstack_depth_to_print; i++) {
+ if (kstack_end(stack))
+ break;
+ if (i && ((i % 8) == 0))
+@@ -293,6 +299,7 @@ static void show_stack_log_lvl(struct ta
+ printk("%08lx ", *stack++);
+ }
+ printk("\n%sCall Trace:\n", log_lvl);
+
-+ *vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) |
-+ flags);
-+ return 1;
-+}
+ show_trace_log_lvl(task, regs, sp, bp, log_lvl);
+ }
+
+@@ -307,8 +314,8 @@ void show_stack(struct task_struct *task
+ */
+ void dump_stack(void)
+ {
+- unsigned long stack;
+ unsigned long bp = 0;
++ unsigned long stack;
+
+ #ifdef CONFIG_FRAME_POINTER
+ if (!bp)
+@@ -320,6 +327,7 @@ void dump_stack(void)
+ init_utsname()->release,
+ (int)strcspn(init_utsname()->version, " "),
+ init_utsname()->version);
+
-+void map_devmem(unsigned long mfn, unsigned long size, pgprot_t vma_prot)
-+{
-+ u64 addr = (u64)mfn << PAGE_SHIFT;
-+ unsigned long flags;
-+ unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK);
+ show_trace(current, NULL, &stack, bp);
+ }
+
+@@ -331,6 +339,7 @@ void show_registers(struct pt_regs *regs
+
+ print_modules();
+ __show_registers(regs, 0);
+
-+ reserve_memtype(addr, addr + size, want_flags, &flags);
-+ if (flags != want_flags) {
-+ printk(KERN_INFO
-+ "%s:%d /dev/mem expected mapping type %s for %Lx-%Lx, got %s\n",
-+ current->comm, current->pid,
-+ cattr_name(want_flags),
-+ addr, (unsigned long long)(addr + size),
-+ cattr_name(flags));
-+ }
+ printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)",
+ TASK_COMM_LEN, current->comm, task_pid_nr(current),
+ current_thread_info(), current, task_thread_info(current));
+@@ -339,10 +348,10 @@ void show_registers(struct pt_regs *regs
+ * time of the fault..
+ */
+ if (!user_mode_vm(regs)) {
+- u8 *ip;
+ unsigned int code_prologue = code_bytes * 43 / 64;
+ unsigned int code_len = code_bytes;
+ unsigned char c;
++ u8 *ip;
+
+ printk("\n" KERN_EMERG "Stack: ");
+ show_stack_log_lvl(NULL, regs, ®s->sp, 0, KERN_EMERG);
+@@ -369,7 +378,7 @@ void show_registers(struct pt_regs *regs
+ }
+ }
+ printk("\n");
+-}
+}
+
+ int is_valid_bugaddr(unsigned long ip)
+ {
+@@ -385,10 +394,10 @@ int is_valid_bugaddr(unsigned long ip)
+
+ static int die_counter;
+
+-int __kprobes __die(const char * str, struct pt_regs * regs, long err)
++int __kprobes __die(const char *str, struct pt_regs *regs, long err)
+ {
+- unsigned long sp;
+ unsigned short ss;
++ unsigned long sp;
+
+ printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
+ #ifdef CONFIG_PREEMPT
+@@ -403,8 +412,8 @@ int __kprobes __die(const char * str, st
+ printk("\n");
+
+ if (notify_die(DIE_OOPS, str, regs, err,
+- current->thread.trap_no, SIGSEGV) !=
+- NOTIFY_STOP) {
++ current->thread.trap_no, SIGSEGV) != NOTIFY_STOP) {
+
-+void unmap_devmem(unsigned long mfn, unsigned long size, pgprot_t vma_prot)
-+{
-+ u64 addr = (u64)mfn << PAGE_SHIFT;
+ show_registers(regs);
+ /* Executive summary in case the oops scrolled away */
+ sp = (unsigned long) (®s->sp);
+@@ -416,17 +425,18 @@ int __kprobes __die(const char * str, st
+ printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
+ print_symbol("%s", regs->ip);
+ printk(" SS:ESP %04x:%08lx\n", ss, sp);
+
-+ free_memtype(addr, addr + size);
-+}
+ return 0;
+- } else {
+- return 1;
+ }
+
---- a/arch/x86/mm/pgtable_32-xen.c
-+++ b/arch/x86/mm/pgtable_32-xen.c
-@@ -1,7 +1,3 @@
--/*
-- * linux/arch/i386/mm/pgtable.c
-- */
--
- #include <linux/sched.h>
- #include <linux/kernel.h>
- #include <linux/errno.h>
-@@ -41,7 +37,6 @@ void show_mem(void)
++ return 1;
+ }
- printk(KERN_INFO "Mem-info:\n");
- show_free_areas();
-- printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
- for_each_online_pgdat(pgdat) {
- pgdat_resize_lock(pgdat, &flags);
- for (i = 0; i < pgdat->node_spanned_pages; ++i) {
-@@ -157,243 +152,6 @@ void __init reserve_top_address(unsigned
- __VMALLOC_RESERVE += reserve;
+ /*
+- * This is gone through when something in the kernel has done something bad and
+- * is about to be terminated.
++ * This is gone through when something in the kernel has done something bad
++ * and is about to be terminated:
+ */
+-void die(const char * str, struct pt_regs * regs, long err)
++void die(const char *str, struct pt_regs *regs, long err)
+ {
+ static struct {
+ raw_spinlock_t lock;
+@@ -448,8 +458,9 @@ void die(const char * str, struct pt_reg
+ die.lock_owner = smp_processor_id();
+ die.lock_owner_depth = 0;
+ bust_spinlocks(1);
+- } else
++ } else {
+ raw_local_irq_save(flags);
++ }
+
+ if (++die.lock_owner_depth < 3) {
+ report_bug(regs->ip, regs);
+@@ -482,19 +493,20 @@ void die(const char * str, struct pt_reg
+ do_exit(SIGSEGV);
}
--pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
--{
-- pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
-- if (pte)
-- make_lowmem_page_readonly(pte, XENFEAT_writable_page_tables);
-- return pte;
--}
--
--/*
-- * List of all pgd's needed for non-PAE so it can invalidate entries
-- * in both cached and uncached pgd's; not needed for PAE since the
-- * kernel pmd is shared. If PAE were not to share the pmd a similar
-- * tactic would be needed. This is essentially codepath-based locking
-- * against pageattr.c; it is the unique case in which a valid change
-- * of kernel pagetables can't be lazily synchronized by vmalloc faults.
-- * vmalloc faults work because attached pagetables are never freed.
-- * -- wli
-- */
--static inline void pgd_list_add(pgd_t *pgd)
--{
-- struct page *page = virt_to_page(pgd);
--
-- list_add(&page->lru, &pgd_list);
--}
--
--static inline void pgd_list_del(pgd_t *pgd)
--{
-- struct page *page = virt_to_page(pgd);
--
-- list_del(&page->lru);
--}
--
--#define UNSHARED_PTRS_PER_PGD \
-- (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD)
--
--static void pgd_ctor(void *p)
--{
-- pgd_t *pgd = p;
-- unsigned long flags;
--
-- pgd_test_and_unpin(pgd);
--
-- /* Clear usermode parts of PGD */
-- memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
--
-- spin_lock_irqsave(&pgd_lock, flags);
--
-- /* If the pgd points to a shared pagetable level (either the
-- ptes in non-PAE, or shared PMD in PAE), then just copy the
-- references from swapper_pg_dir. */
-- if (PAGETABLE_LEVELS == 2 ||
-- (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD)) {
-- clone_pgd_range(pgd + USER_PTRS_PER_PGD,
-- swapper_pg_dir + USER_PTRS_PER_PGD,
-- KERNEL_PGD_PTRS);
-- paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
-- __pa(swapper_pg_dir) >> PAGE_SHIFT,
-- USER_PTRS_PER_PGD,
-- KERNEL_PGD_PTRS);
-- }
--
-- /* list required to sync kernel mapping updates */
-- if (PAGETABLE_LEVELS == 2)
-- pgd_list_add(pgd);
--
-- spin_unlock_irqrestore(&pgd_lock, flags);
--}
--
--static void pgd_dtor(void *pgd)
--{
-- unsigned long flags; /* can be called from interrupt context */
--
-- if (!SHARED_KERNEL_PMD) {
-- spin_lock_irqsave(&pgd_lock, flags);
-- pgd_list_del(pgd);
-- spin_unlock_irqrestore(&pgd_lock, flags);
-- }
--
-- pgd_test_and_unpin(pgd);
--}
--
--#ifdef CONFIG_X86_PAE
--/*
-- * Mop up any pmd pages which may still be attached to the pgd.
-- * Normally they will be freed by munmap/exit_mmap, but any pmd we
-- * preallocate which never got a corresponding vma will need to be
-- * freed manually.
-- */
--static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
--{
-- int i;
--
-- for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) {
-- pgd_t pgd = pgdp[i];
--
-- if (__pgd_val(pgd) != 0) {
-- pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
--
-- pgdp[i] = xen_make_pgd(0);
--
-- paravirt_release_pd(pgd_val(pgd) >> PAGE_SHIFT);
-- pmd_free(mm, pmd);
-- }
-- }
--}
--
--/*
-- * In PAE mode, we need to do a cr3 reload (=tlb flush) when
-- * updating the top-level pagetable entries to guarantee the
-- * processor notices the update. Since this is expensive, and
-- * all 4 top-level entries are used almost immediately in a
-- * new process's life, we just pre-populate them here.
-- *
-- * Also, if we're in a paravirt environment where the kernel pmd is
-- * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
-- * and initialize the kernel pmds here.
-- */
--static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
--{
-- pud_t *pud;
-- pmd_t *pmds[UNSHARED_PTRS_PER_PGD];
-- unsigned long addr, flags;
-- int i;
--
-- /*
-- * We can race save/restore (if we sleep during a GFP_KERNEL memory
-- * allocation). We therefore store virtual addresses of pmds as they
-- * do not change across save/restore, and poke the machine addresses
-- * into the pgdir under the pgd_lock.
-- */
-- for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD; i++, addr += PUD_SIZE) {
-- pmds[i] = pmd_alloc_one(mm, addr);
-- if (!pmds[i])
-- goto out_oom;
-- }
--
-- spin_lock_irqsave(&pgd_lock, flags);
--
-- /* Protect against save/restore: move below 4GB under pgd_lock. */
-- if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)
-- && xen_create_contiguous_region((unsigned long)pgd, 0, 32)) {
-- spin_unlock_irqrestore(&pgd_lock, flags);
--out_oom:
-- while (i--)
-- pmd_free(mm, pmds[i]);
-- return 0;
+-static inline void die_if_kernel(const char * str, struct pt_regs * regs, long err)
++static inline void
++die_if_kernel(const char *str, struct pt_regs *regs, long err)
+ {
+ if (!user_mode_vm(regs))
+ die(str, regs, err);
+ }
+
+-static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86,
+- struct pt_regs * regs, long error_code,
+- siginfo_t *info)
++static void __kprobes
++do_trap(int trapnr, int signr, char *str, int vm86, struct pt_regs *regs,
++ long error_code, siginfo_t *info)
+ {
+ struct task_struct *tsk = current;
+
+- if (regs->flags & VM_MASK) {
++ if (regs->flags & X86_VM_MASK) {
+ if (vm86)
+ goto vm86_trap;
+ goto trap_signal;
+@@ -503,109 +515,112 @@ static void __kprobes do_trap(int trapnr
+ if (!user_mode(regs))
+ goto kernel_trap;
+
+- trap_signal: {
+- /*
+- * We want error_code and trap_no set for userspace faults and
+- * kernelspace faults which result in die(), but not
+- * kernelspace faults which are fixed up. die() gives the
+- * process no chance to handle the signal and notice the
+- * kernel fault information, so that won't result in polluting
+- * the information about previously queued, but not yet
+- * delivered, faults. See also do_general_protection below.
+- */
+- tsk->thread.error_code = error_code;
+- tsk->thread.trap_no = trapnr;
++trap_signal:
++ /*
++ * We want error_code and trap_no set for userspace faults and
++ * kernelspace faults which result in die(), but not
++ * kernelspace faults which are fixed up. die() gives the
++ * process no chance to handle the signal and notice the
++ * kernel fault information, so that won't result in polluting
++ * the information about previously queued, but not yet
++ * delivered, faults. See also do_general_protection below.
++ */
++ tsk->thread.error_code = error_code;
++ tsk->thread.trap_no = trapnr;
+
+- if (info)
+- force_sig_info(signr, info, tsk);
+- else
+- force_sig(signr, tsk);
+- return;
- }
--
-- /* Copy kernel pmd contents and write-protect the new pmds. */
-- pud = pud_offset(pgd, 0);
-- for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD;
-- i++, pud++, addr += PUD_SIZE) {
-- if (i >= USER_PTRS_PER_PGD) {
-- memcpy(pmds[i],
-- (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
-- sizeof(pmd_t) * PTRS_PER_PMD);
-- make_lowmem_page_readonly(
-- pmds[i], XENFEAT_writable_page_tables);
++ if (info)
++ force_sig_info(signr, info, tsk);
++ else
++ force_sig(signr, tsk);
++ return;
+
+- kernel_trap: {
+- if (!fixup_exception(regs)) {
+- tsk->thread.error_code = error_code;
+- tsk->thread.trap_no = trapnr;
+- die(str, regs, error_code);
- }
--
-- /* It is safe to poke machine addresses of pmds under the pgd_lock. */
-- pud_populate(mm, pud, pmds[i]);
-- }
--
-- /* List required to sync kernel mapping updates and
-- * to pin/unpin on save/restore. */
-- pgd_list_add(pgd);
--
-- spin_unlock_irqrestore(&pgd_lock, flags);
--
-- return 1;
--}
--#else /* !CONFIG_X86_PAE */
--/* No need to prepopulate any pagetable entries in non-PAE modes. */
--static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
--{
-- return 1;
--}
--
--static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
--{
--}
--#endif /* CONFIG_X86_PAE */
--
--pgd_t *pgd_alloc(struct mm_struct *mm)
--{
-- pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
--
-- /* so that alloc_pd can use it */
-- mm->pgd = pgd;
-- if (pgd)
-- pgd_ctor(pgd);
--
-- if (pgd && !pgd_prepopulate_pmd(mm, pgd)) {
-- free_page((unsigned long)pgd);
-- pgd = NULL;
+- return;
++kernel_trap:
++ if (!fixup_exception(regs)) {
++ tsk->thread.error_code = error_code;
++ tsk->thread.trap_no = trapnr;
++ die(str, regs, error_code);
+ }
++ return;
+
+- vm86_trap: {
+- int ret = handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, trapnr);
+- if (ret) goto trap_signal;
+- return;
- }
--
-- return pgd;
--}
--
--void pgd_free(struct mm_struct *mm, pgd_t *pgd)
--{
-- /*
-- * After this the pgd should not be pinned for the duration of this
-- * function's execution. We should never sleep and thus never race:
-- * 1. User pmds will not become write-protected under our feet due
-- * to a concurrent mm_pin_all().
-- * 2. The machine addresses in PGD entries will not become invalid
-- * due to a concurrent save/restore.
-- */
-- pgd_dtor(pgd);
--
-- if (PTRS_PER_PMD > 1 && !xen_feature(XENFEAT_pae_pgdir_above_4gb))
-- xen_destroy_contiguous_region((unsigned long)pgd, 0);
--
-- pgd_mop_up_pmds(mm, pgd);
-- free_page((unsigned long)pgd);
++vm86_trap:
++ if (handle_vm86_trap((struct kernel_vm86_regs *) regs,
++ error_code, trapnr))
++ goto trap_signal;
++ return;
+ }
+
+-#define DO_ERROR(trapnr, signr, str, name) \
+-void do_##name(struct pt_regs * regs, long error_code) \
+-{ \
+- if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
+- == NOTIFY_STOP) \
+- return; \
+- do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \
-}
-
--void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
--{
-- pgtable_page_dtor(pte);
-- paravirt_release_pt(page_to_pfn(pte));
-- tlb_remove_page(tlb, pte);
+-#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr, irq) \
+-void do_##name(struct pt_regs * regs, long error_code) \
+-{ \
+- siginfo_t info; \
+- if (irq) \
+- local_irq_enable(); \
+- info.si_signo = signr; \
+- info.si_errno = 0; \
+- info.si_code = sicode; \
+- info.si_addr = (void __user *)siaddr; \
+- if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
+- == NOTIFY_STOP) \
+- return; \
+- do_trap(trapnr, signr, str, 0, regs, error_code, &info); \
-}
-
--#ifdef CONFIG_X86_PAE
--
--void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
--{
-- paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
-- tlb_remove_page(tlb, virt_to_page(pmd));
+-#define DO_VM86_ERROR(trapnr, signr, str, name) \
+-void do_##name(struct pt_regs * regs, long error_code) \
+-{ \
+- if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
+- == NOTIFY_STOP) \
+- return; \
+- do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \
-}
-
--#endif
--
- void make_lowmem_page_readonly(void *va, unsigned int feature)
- {
- pte_t *pte;
---- /dev/null
-+++ b/arch/x86/mm/pgtable-xen.c
-@@ -0,0 +1,709 @@
-+#include <linux/mm.h>
-+#include <linux/module.h>
-+#include <xen/features.h>
-+#include <asm/pgalloc.h>
-+#include <asm/pgtable.h>
-+#include <asm/tlb.h>
-+#include <asm/hypervisor.h>
-+#include <asm/mmu_context.h>
-+
-+pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
-+{
-+ pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
-+ if (pte)
-+ make_lowmem_page_readonly(pte, XENFEAT_writable_page_tables);
-+ return pte;
-+}
-+
-+static void _pte_free(struct page *page, unsigned int order)
-+{
-+ BUG_ON(order);
-+ __pte_free(page);
-+}
-+
-+pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
-+{
-+ struct page *pte;
-+
-+#ifdef CONFIG_HIGHPTE
-+ pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
-+#else
-+ pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
-+#endif
-+ if (pte) {
-+ pgtable_page_ctor(pte);
-+ SetPageForeign(pte, _pte_free);
-+ init_page_count(pte);
-+ }
-+ return pte;
-+}
-+
-+void __pte_free(pgtable_t pte)
-+{
-+ if (!PageHighMem(pte)) {
-+ unsigned long va = (unsigned long)page_address(pte);
-+ unsigned int level;
-+ pte_t *ptep = lookup_address(va, &level);
-+
-+ BUG_ON(!ptep || level != PG_LEVEL_4K || !pte_present(*ptep));
-+ if (!pte_write(*ptep)
-+ && HYPERVISOR_update_va_mapping(va,
-+ mk_pte(pte, PAGE_KERNEL),
-+ 0))
-+ BUG();
-+ } else
-+#ifdef CONFIG_HIGHPTE
-+ ClearPagePinned(pte);
-+#else
-+ BUG();
-+#endif
-+
-+ ClearPageForeign(pte);
-+ init_page_count(pte);
-+ pgtable_page_dtor(pte);
-+ __free_page(pte);
-+}
-+
-+void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
-+{
-+ pgtable_page_dtor(pte);
-+ paravirt_release_pte(page_to_pfn(pte));
-+ tlb_remove_page(tlb, pte);
+-#define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
+-void do_##name(struct pt_regs * regs, long error_code) \
+-{ \
+- siginfo_t info; \
+- info.si_signo = signr; \
+- info.si_errno = 0; \
+- info.si_code = sicode; \
+- info.si_addr = (void __user *)siaddr; \
+- trace_hardirqs_fixup(); \
+- if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
+- == NOTIFY_STOP) \
+- return; \
+- do_trap(trapnr, signr, str, 1, regs, error_code, &info); \
++#define DO_ERROR(trapnr, signr, str, name) \
++void do_##name(struct pt_regs *regs, long error_code) \
++{ \
++ trace_hardirqs_fixup(); \
++ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
++ == NOTIFY_STOP) \
++ return; \
++ do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \
+}
+
-+#if PAGETABLE_LEVELS > 2
-+static void _pmd_free(struct page *page, unsigned int order)
-+{
-+ BUG_ON(order);
-+ __pmd_free(page);
++#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr, irq) \
++void do_##name(struct pt_regs *regs, long error_code) \
++{ \
++ siginfo_t info; \
++ if (irq) \
++ local_irq_enable(); \
++ info.si_signo = signr; \
++ info.si_errno = 0; \
++ info.si_code = sicode; \
++ info.si_addr = (void __user *)siaddr; \
++ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
++ == NOTIFY_STOP) \
++ return; \
++ do_trap(trapnr, signr, str, 0, regs, error_code, &info); \
+}
+
-+pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address)
-+{
-+ struct page *pmd;
-+
-+ pmd = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
-+ if (!pmd)
-+ return NULL;
-+ SetPageForeign(pmd, _pmd_free);
-+ init_page_count(pmd);
-+ return page_address(pmd);
++#define DO_VM86_ERROR(trapnr, signr, str, name) \
++void do_##name(struct pt_regs *regs, long error_code) \
++{ \
++ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
++ == NOTIFY_STOP) \
++ return; \
++ do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \
+}
+
-+void __pmd_free(pgtable_t pmd)
-+{
-+ unsigned long va = (unsigned long)page_address(pmd);
-+ unsigned int level;
-+ pte_t *ptep = lookup_address(va, &level);
-+
-+ BUG_ON(!ptep || level != PG_LEVEL_4K || !pte_present(*ptep));
-+ if (!pte_write(*ptep)
-+ && HYPERVISOR_update_va_mapping(va, mk_pte(pmd, PAGE_KERNEL), 0))
-+ BUG();
-+
-+ ClearPageForeign(pmd);
-+ init_page_count(pmd);
-+ __free_page(pmd);
-+}
++#define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
++void do_##name(struct pt_regs *regs, long error_code) \
++{ \
++ siginfo_t info; \
++ info.si_signo = signr; \
++ info.si_errno = 0; \
++ info.si_code = sicode; \
++ info.si_addr = (void __user *)siaddr; \
++ trace_hardirqs_fixup(); \
++ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
++ == NOTIFY_STOP) \
++ return; \
++ do_trap(trapnr, signr, str, 1, regs, error_code, &info); \
+ }
+
+-DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
++DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
+ #ifndef CONFIG_KPROBES
+-DO_VM86_ERROR( 3, SIGTRAP, "int3", int3)
++DO_VM86_ERROR(3, SIGTRAP, "int3", int3)
+ #endif
+-DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow)
+-DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds)
+-DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0)
+-DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
++DO_VM86_ERROR(4, SIGSEGV, "overflow", overflow)
++DO_VM86_ERROR(5, SIGSEGV, "bounds", bounds)
++DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0)
++DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
+ DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
+ DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
+ DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
+ DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0)
+-DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0, 1)
++DO_ERROR_INFO(32, SIGILL, "iret exception", iret_error, ILL_BADSTK, 0, 1)
+
+ void __kprobes do_general_protection(struct pt_regs * regs,
+ long error_code)
+ {
+- if (regs->flags & VM_MASK)
++ struct thread_struct *thread;
+
-+void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
-+{
-+ paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);
-+ tlb_remove_page(tlb, virt_to_page(pmd));
-+}
++ thread = &current->thread;
+
-+#if PAGETABLE_LEVELS > 3
-+void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
-+{
-+ paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
-+ tlb_remove_page(tlb, virt_to_page(pud));
-+}
-+#endif /* PAGETABLE_LEVELS > 3 */
-+#endif /* PAGETABLE_LEVELS > 2 */
++ if (regs->flags & X86_VM_MASK)
+ goto gp_in_vm86;
+
+ if (!user_mode(regs))
+@@ -613,6 +628,7 @@ void __kprobes do_general_protection(str
+
+ current->thread.error_code = error_code;
+ current->thread.trap_no = 13;
+
-+#ifndef CONFIG_X86_64
-+#define TASK_SIZE64 TASK_SIZE
-+#endif
+ if (show_unhandled_signals && unhandled_signal(current, SIGSEGV) &&
+ printk_ratelimit()) {
+ printk(KERN_INFO
+@@ -642,22 +658,25 @@ gp_in_kernel:
+ }
+ }
+
+-static __kprobes void
+-mem_parity_error(unsigned char reason, struct pt_regs * regs)
++static notrace __kprobes void
++mem_parity_error(unsigned char reason, struct pt_regs *regs)
+ {
+- printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on "
+- "CPU %d.\n", reason, smp_processor_id());
+- printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
++ printk(KERN_EMERG
++ "Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
++ reason, smp_processor_id());
+
-+static void _pin_lock(struct mm_struct *mm, int lock) {
-+ if (lock)
-+ spin_lock(&mm->page_table_lock);
-+#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
-+ /* While mm->page_table_lock protects us against insertions and
-+ * removals of higher level page table pages, it doesn't protect
-+ * against updates of pte-s. Such updates, however, require the
-+ * pte pages to be in consistent state (unpinned+writable or
-+ * pinned+readonly). The pinning and attribute changes, however
-+ * cannot be done atomically, which is why such updates must be
-+ * prevented from happening concurrently.
-+ * Note that no pte lock can ever elsewhere be acquired nesting
-+ * with an already acquired one in the same mm, or with the mm's
-+ * page_table_lock already acquired, as that would break in the
-+ * non-split case (where all these are actually resolving to the
-+ * one page_table_lock). Thus acquiring all of them here is not
-+ * going to result in dead locks, and the order of acquires
-+ * doesn't matter.
++ printk(KERN_EMERG
++ "You have some hardware problem, likely on the PCI bus.\n");
+
+ #if defined(CONFIG_EDAC)
+- if(edac_handler_set()) {
++ if (edac_handler_set()) {
+ edac_atomic_assert_error();
+ return;
+ }
+ #endif
+
+ if (panic_on_unrecovered_nmi)
+- panic("NMI: Not continuing");
++ panic("NMI: Not continuing");
+
+ printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
+
+@@ -665,8 +684,8 @@ mem_parity_error(unsigned char reason, s
+ clear_mem_error(reason);
+ }
+
+-static __kprobes void
+-io_check_error(unsigned char reason, struct pt_regs * regs)
++static notrace __kprobes void
++io_check_error(unsigned char reason, struct pt_regs *regs)
+ {
+ printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n");
+ show_registers(regs);
+@@ -675,38 +694,43 @@ io_check_error(unsigned char reason, str
+ clear_io_check_error(reason);
+ }
+
+-static __kprobes void
+-unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
++static notrace __kprobes void
++unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
+ {
++ if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
++ return;
+ #ifdef CONFIG_MCA
+- /* Might actually be able to figure out what the guilty party
+- * is. */
+- if( MCA_bus ) {
++ /*
++ * Might actually be able to figure out what the guilty party
++ * is:
+ */
-+ {
-+ pgd_t *pgd = mm->pgd;
-+ unsigned g;
-+
-+ for (g = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
-+ pud_t *pud;
-+ unsigned u;
-+
-+ if (pgd_none(*pgd))
-+ continue;
-+ pud = pud_offset(pgd, 0);
-+ for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
-+ pmd_t *pmd;
-+ unsigned m;
++ if (MCA_bus) {
+ mca_handle_nmi();
+ return;
+ }
+ #endif
+- printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on "
+- "CPU %d.\n", reason, smp_processor_id());
++ printk(KERN_EMERG
++ "Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
++ reason, smp_processor_id());
+
-+ if (pud_none(*pud))
-+ continue;
-+ pmd = pmd_offset(pud, 0);
-+ for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
-+ spinlock_t *ptl;
+ printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n");
+ if (panic_on_unrecovered_nmi)
+- panic("NMI: Not continuing");
++ panic("NMI: Not continuing");
+
+ printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
+ }
+
+ static DEFINE_SPINLOCK(nmi_print_lock);
+
+-void __kprobes die_nmi(struct pt_regs *regs, const char *msg)
++void notrace __kprobes die_nmi(struct pt_regs *regs, const char *msg)
+ {
+- if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 2, SIGINT) ==
+- NOTIFY_STOP)
++ if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 2, SIGINT) == NOTIFY_STOP)
+ return;
+
+ spin_lock(&nmi_print_lock);
+ /*
+ * We are in trouble anyway, lets at least try
+- * to get a message out.
++ * to get a message out:
+ */
+ bust_spinlocks(1);
+ printk(KERN_EMERG "%s", msg);
+@@ -717,9 +741,10 @@ void __kprobes die_nmi(struct pt_regs *r
+ spin_unlock(&nmi_print_lock);
+ bust_spinlocks(0);
+
+- /* If we are in kernel we are probably nested up pretty bad
+- * and might aswell get out now while we still can.
+- */
++ /*
++ * If we are in kernel we are probably nested up pretty bad
++ * and might aswell get out now while we still can:
++ */
+ if (!user_mode_vm(regs)) {
+ current->thread.trap_no = 2;
+ crash_kexec(regs);
+@@ -728,14 +753,14 @@ void __kprobes die_nmi(struct pt_regs *r
+ do_exit(SIGSEGV);
+ }
+
+-static __kprobes void default_do_nmi(struct pt_regs * regs)
++static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
+ {
+ unsigned char reason = 0;
+
+- /* Only the BSP gets external NMIs from the system. */
++ /* Only the BSP gets external NMIs from the system: */
+ if (!smp_processor_id())
+ reason = get_nmi_reason();
+-
+
-+ if (pmd_none(*pmd))
-+ continue;
-+ ptl = pte_lockptr(0, pmd);
-+ if (lock)
-+ spin_lock(ptl);
-+ else
-+ spin_unlock(ptl);
-+ }
-+ }
-+ }
-+ }
+ if (!(reason & 0xc0)) {
+ if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
+ == NOTIFY_STOP)
+@@ -748,8 +773,10 @@ static __kprobes void default_do_nmi(str
+ if (nmi_watchdog_tick(regs, reason))
+ return;
+ if (!do_nmi_callback(regs, smp_processor_id()))
+-#endif
+ unknown_nmi_error(reason, regs);
++#else
++ unknown_nmi_error(reason, regs);
+#endif
-+ if (!lock)
-+ spin_unlock(&mm->page_table_lock);
-+}
-+#define pin_lock(mm) _pin_lock(mm, 1)
-+#define pin_unlock(mm) _pin_lock(mm, 0)
-+
-+#define PIN_BATCH sizeof(void *)
-+static DEFINE_PER_CPU(multicall_entry_t[PIN_BATCH], pb_mcl);
-+
-+static inline unsigned int pgd_walk_set_prot(struct page *page, pgprot_t flags,
-+ unsigned int cpu, unsigned int seq)
-+{
-+ unsigned long pfn = page_to_pfn(page);
+
+ return;
+ }
+@@ -761,14 +788,14 @@ static __kprobes void default_do_nmi(str
+ io_check_error(reason, regs);
+ /*
+ * Reassert NMI in case it became active meanwhile
+- * as it's edge-triggered.
++ * as it's edge-triggered:
+ */
+ reassert_nmi();
+ }
+
+ static int ignore_nmis;
+
+-__kprobes void do_nmi(struct pt_regs * regs, long error_code)
++notrace __kprobes void do_nmi(struct pt_regs *regs, long error_code)
+ {
+ int cpu;
+
+@@ -804,9 +831,12 @@ void __kprobes do_int3(struct pt_regs *r
+ if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
+ == NOTIFY_STOP)
+ return;
+- /* This is an interrupt gate, because kprobes wants interrupts
+- disabled. Normal trap handlers don't. */
++ /*
++ * This is an interrupt gate, because kprobes wants interrupts
++ * disabled. Normal trap handlers don't.
++ */
+ restore_interrupts(regs);
+
-+ if (PageHighMem(page)) {
-+ if (pgprot_val(flags) & _PAGE_RW)
-+ ClearPagePinned(page);
-+ else
-+ SetPagePinned(page);
-+ } else {
-+ MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
-+ (unsigned long)__va(pfn << PAGE_SHIFT),
-+ pfn_pte(pfn, flags), 0);
-+ if (unlikely(++seq == PIN_BATCH)) {
-+ if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
-+ PIN_BATCH, NULL)))
-+ BUG();
-+ seq = 0;
-+ }
+ do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL);
+ }
+ #endif
+@@ -821,7 +851,7 @@ void __kprobes do_int3(struct pt_regs *r
+ * from user space. Such code must not hold kernel locks (since it
+ * can equally take a page fault), therefore it is safe to call
+ * force_sig_info even though that claims and releases locks.
+- *
++ *
+ * Code in ./signal.c ensures that the debug control register
+ * is restored before we deliver any signal, and therefore that
+ * user code runs with the correct debug control register even though
+@@ -833,10 +863,10 @@ void __kprobes do_int3(struct pt_regs *r
+ * find every occurrence of the TF bit that could be saved away even
+ * by user code)
+ */
+-void __kprobes do_debug(struct pt_regs * regs, long error_code)
++void __kprobes do_debug(struct pt_regs *regs, long error_code)
+ {
+- unsigned int condition;
+ struct task_struct *tsk = current;
++ unsigned int condition;
+
+ trace_hardirqs_fixup();
+
+@@ -861,7 +891,7 @@ void __kprobes do_debug(struct pt_regs *
+ goto clear_dr7;
+ }
+
+- if (regs->flags & VM_MASK)
++ if (regs->flags & X86_VM_MASK)
+ goto debug_vm86;
+
+ /* Save debug status register where ptrace can see it */
+@@ -884,7 +914,8 @@ void __kprobes do_debug(struct pt_regs *
+ /* Ok, finally something we can handle */
+ send_sigtrap(tsk, regs, error_code);
+
+- /* Disable additional traps. They'll be re-enabled when
++ /*
++ * Disable additional traps. They'll be re-enabled when
+ * the signal is delivered.
+ */
+ clear_dr7:
+@@ -897,7 +928,7 @@ debug_vm86:
+
+ clear_TF_reenable:
+ set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
+- regs->flags &= ~TF_MASK;
++ regs->flags &= ~X86_EFLAGS_TF;
+ return;
+ }
+
+@@ -908,9 +939,10 @@ clear_TF_reenable:
+ */
+ void math_error(void __user *ip)
+ {
+- struct task_struct * task;
++ struct task_struct *task;
++ unsigned short cwd;
++ unsigned short swd;
+ siginfo_t info;
+- unsigned short cwd, swd;
+
+ /*
+ * Save the info for the exception handler and clear the error.
+@@ -936,36 +968,36 @@ void math_error(void __user *ip)
+ cwd = get_fpu_cwd(task);
+ swd = get_fpu_swd(task);
+ switch (swd & ~cwd & 0x3f) {
+- case 0x000: /* No unmasked exception */
+- return;
+- default: /* Multiple exceptions */
+- break;
+- case 0x001: /* Invalid Op */
+- /*
+- * swd & 0x240 == 0x040: Stack Underflow
+- * swd & 0x240 == 0x240: Stack Overflow
+- * User must clear the SF bit (0x40) if set
+- */
+- info.si_code = FPE_FLTINV;
+- break;
+- case 0x002: /* Denormalize */
+- case 0x010: /* Underflow */
+- info.si_code = FPE_FLTUND;
+- break;
+- case 0x004: /* Zero Divide */
+- info.si_code = FPE_FLTDIV;
+- break;
+- case 0x008: /* Overflow */
+- info.si_code = FPE_FLTOVF;
+- break;
+- case 0x020: /* Precision */
+- info.si_code = FPE_FLTRES;
+- break;
++ case 0x000: /* No unmasked exception */
++ return;
++ default: /* Multiple exceptions */
++ break;
++ case 0x001: /* Invalid Op */
++ /*
++ * swd & 0x240 == 0x040: Stack Underflow
++ * swd & 0x240 == 0x240: Stack Overflow
++ * User must clear the SF bit (0x40) if set
++ */
++ info.si_code = FPE_FLTINV;
++ break;
++ case 0x002: /* Denormalize */
++ case 0x010: /* Underflow */
++ info.si_code = FPE_FLTUND;
++ break;
++ case 0x004: /* Zero Divide */
++ info.si_code = FPE_FLTDIV;
++ break;
++ case 0x008: /* Overflow */
++ info.si_code = FPE_FLTOVF;
++ break;
++ case 0x020: /* Precision */
++ info.si_code = FPE_FLTRES;
++ break;
+ }
+ force_sig_info(SIGFPE, &info, task);
+ }
+
+-void do_coprocessor_error(struct pt_regs * regs, long error_code)
++void do_coprocessor_error(struct pt_regs *regs, long error_code)
+ {
+ ignore_fpu_irq = 1;
+ math_error((void __user *)regs->ip);
+@@ -973,9 +1005,9 @@ void do_coprocessor_error(struct pt_regs
+
+ static void simd_math_error(void __user *ip)
+ {
+- struct task_struct * task;
+- siginfo_t info;
++ struct task_struct *task;
+ unsigned short mxcsr;
++ siginfo_t info;
+
+ /*
+ * Save the info for the exception handler and clear the error.
+@@ -996,84 +1028,82 @@ static void simd_math_error(void __user
+ */
+ mxcsr = get_fpu_mxcsr(task);
+ switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
+- case 0x000:
+- default:
+- break;
+- case 0x001: /* Invalid Op */
+- info.si_code = FPE_FLTINV;
+- break;
+- case 0x002: /* Denormalize */
+- case 0x010: /* Underflow */
+- info.si_code = FPE_FLTUND;
+- break;
+- case 0x004: /* Zero Divide */
+- info.si_code = FPE_FLTDIV;
+- break;
+- case 0x008: /* Overflow */
+- info.si_code = FPE_FLTOVF;
+- break;
+- case 0x020: /* Precision */
+- info.si_code = FPE_FLTRES;
+- break;
++ case 0x000:
++ default:
++ break;
++ case 0x001: /* Invalid Op */
++ info.si_code = FPE_FLTINV;
++ break;
++ case 0x002: /* Denormalize */
++ case 0x010: /* Underflow */
++ info.si_code = FPE_FLTUND;
++ break;
++ case 0x004: /* Zero Divide */
++ info.si_code = FPE_FLTDIV;
++ break;
++ case 0x008: /* Overflow */
++ info.si_code = FPE_FLTOVF;
++ break;
++ case 0x020: /* Precision */
++ info.si_code = FPE_FLTRES;
++ break;
+ }
+ force_sig_info(SIGFPE, &info, task);
+ }
+
+-void do_simd_coprocessor_error(struct pt_regs * regs,
+- long error_code)
++void do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
+ {
+ if (cpu_has_xmm) {
+ /* Handle SIMD FPU exceptions on PIII+ processors. */
+ ignore_fpu_irq = 1;
+ simd_math_error((void __user *)regs->ip);
+- } else {
+- /*
+- * Handle strange cache flush from user space exception
+- * in all other cases. This is undocumented behaviour.
+- */
+- if (regs->flags & VM_MASK) {
+- handle_vm86_fault((struct kernel_vm86_regs *)regs,
+- error_code);
+- return;
+- }
+- current->thread.trap_no = 19;
+- current->thread.error_code = error_code;
+- die_if_kernel("cache flush denied", regs, error_code);
+- force_sig(SIGSEGV, current);
++ return;
+ }
-+
-+ return seq;
-+}
-+
-+static void pgd_walk(pgd_t *pgd_base, pgprot_t flags)
-+{
-+ pgd_t *pgd = pgd_base;
-+ pud_t *pud;
-+ pmd_t *pmd;
-+ int g,u,m;
-+ unsigned int cpu, seq;
-+ multicall_entry_t *mcl;
-+
-+ if (xen_feature(XENFEAT_auto_translated_physmap))
++ /*
++ * Handle strange cache flush from user space exception
++ * in all other cases. This is undocumented behaviour.
++ */
++ if (regs->flags & X86_VM_MASK) {
++ handle_vm86_fault((struct kernel_vm86_regs *)regs, error_code);
+ return;
+ }
++ current->thread.trap_no = 19;
++ current->thread.error_code = error_code;
++ die_if_kernel("cache flush denied", regs, error_code);
++ force_sig(SIGSEGV, current);
+ }
+
+ #ifndef CONFIG_XEN
+-void do_spurious_interrupt_bug(struct pt_regs * regs,
+- long error_code)
++void do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
+ {
+ #if 0
+ /* No need to warn about this any longer. */
+- printk("Ignoring P6 Local APIC Spurious Interrupt Bug...\n");
++ printk(KERN_INFO "Ignoring P6 Local APIC Spurious Interrupt Bug...\n");
+ #endif
+ }
+
+-unsigned long patch_espfix_desc(unsigned long uesp,
+- unsigned long kesp)
++unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp)
+ {
+ struct desc_struct *gdt = __get_cpu_var(gdt_page).gdt;
+ unsigned long base = (kesp - uesp) & -THREAD_SIZE;
+ unsigned long new_kesp = kesp - base;
+ unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT;
+ __u64 desc = *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS];
+
-+ cpu = get_cpu();
+ /* Set up base for espfix segment */
+- desc &= 0x00f0ff0000000000ULL;
+- desc |= ((((__u64)base) << 16) & 0x000000ffffff0000ULL) |
++ desc &= 0x00f0ff0000000000ULL;
++ desc |= ((((__u64)base) << 16) & 0x000000ffffff0000ULL) |
+ ((((__u64)base) << 32) & 0xff00000000000000ULL) |
+ ((((__u64)lim_pages) << 32) & 0x000f000000000000ULL) |
+ (lim_pages & 0xffff);
+ *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS] = desc;
+
-+ /*
-+ * Cannot iterate up to USER_PTRS_PER_PGD on x86-64 as these pagetables
-+ * may not be the 'current' task's pagetables (e.g., current may be
-+ * 32-bit, but the pagetables may be for a 64-bit task).
-+ * Subtracting 1 from TASK_SIZE64 means the loop limit is correct
-+ * regardless of whether TASK_SIZE64 is a multiple of PGDIR_SIZE.
-+ */
-+ for (g = 0, seq = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
-+ if (pgd_none(*pgd))
-+ continue;
-+ pud = pud_offset(pgd, 0);
-+ if (PTRS_PER_PUD > 1) /* not folded */
-+ seq = pgd_walk_set_prot(virt_to_page(pud),flags,cpu,seq);
-+ for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
-+ if (pud_none(*pud))
-+ continue;
-+ pmd = pmd_offset(pud, 0);
-+ if (PTRS_PER_PMD > 1) /* not folded */
-+ seq = pgd_walk_set_prot(virt_to_page(pmd),flags,cpu,seq);
-+ for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
-+ if (pmd_none(*pmd))
-+ continue;
-+ seq = pgd_walk_set_prot(pmd_page(*pmd),flags,cpu,seq);
-+ }
+ return new_kesp;
+ }
+ #endif
+
+ /*
+- * 'math_state_restore()' saves the current math information in the
++ * 'math_state_restore()' saves the current math information in the
+ * old math state array, and gets the new ones from the current task
+ *
+ * Careful.. There are problems with IBM-designed IRQ13 behaviour.
+@@ -1087,9 +1117,22 @@ asmlinkage void math_state_restore(void)
+ struct thread_info *thread = current_thread_info();
+ struct task_struct *tsk = thread->task;
+
++ if (!tsk_used_math(tsk)) {
++ local_irq_enable();
++ /*
++ * does a slab alloc which can sleep
++ */
++ if (init_fpu(tsk)) {
++ /*
++ * ran out of memory!
++ */
++ do_group_exit(SIGKILL);
++ return;
+ }
++ local_irq_disable();
+ }
+
-+ mcl = per_cpu(pb_mcl, cpu);
-+#ifdef CONFIG_X86_64
-+ if (unlikely(seq > PIN_BATCH - 2)) {
-+ if (unlikely(HYPERVISOR_multicall_check(mcl, seq, NULL)))
-+ BUG();
-+ seq = 0;
-+ }
-+ MULTI_update_va_mapping(mcl + seq,
-+ (unsigned long)__user_pgd(pgd_base),
-+ pfn_pte(virt_to_phys(__user_pgd(pgd_base))>>PAGE_SHIFT, flags),
-+ 0);
-+ MULTI_update_va_mapping(mcl + seq + 1,
-+ (unsigned long)pgd_base,
-+ pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
-+ UVMF_TLB_FLUSH);
-+ if (unlikely(HYPERVISOR_multicall_check(mcl, seq + 2, NULL)))
-+ BUG();
-+#else
-+ if (likely(seq != 0)) {
-+ MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
-+ (unsigned long)pgd_base,
-+ pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
-+ UVMF_TLB_FLUSH);
-+ if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
-+ seq + 1, NULL)))
-+ BUG();
-+ } else if(HYPERVISOR_update_va_mapping((unsigned long)pgd_base,
-+ pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
-+ UVMF_TLB_FLUSH))
-+ BUG();
-+#endif
-+
-+ put_cpu();
-+}
-+
-+static void __pgd_pin(pgd_t *pgd)
-+{
-+ pgd_walk(pgd, PAGE_KERNEL_RO);
-+ kmap_flush_unused();
-+ xen_pgd_pin(__pa(pgd)); /* kernel */
-+#ifdef CONFIG_X86_64
-+ xen_pgd_pin(__pa(__user_pgd(pgd))); /* user */
-+#endif
-+ SetPagePinned(virt_to_page(pgd));
-+}
-+
-+static void __pgd_unpin(pgd_t *pgd)
-+{
-+ xen_pgd_unpin(__pa(pgd));
-+#ifdef CONFIG_X86_64
-+ xen_pgd_unpin(__pa(__user_pgd(pgd)));
-+#endif
-+ pgd_walk(pgd, PAGE_KERNEL);
-+ ClearPagePinned(virt_to_page(pgd));
-+}
-+
-+static void pgd_test_and_unpin(pgd_t *pgd)
-+{
-+ if (PagePinned(virt_to_page(pgd)))
-+ __pgd_unpin(pgd);
-+}
-+
-+void mm_pin(struct mm_struct *mm)
-+{
-+ if (xen_feature(XENFEAT_writable_page_tables))
-+ return;
-+
-+ pin_lock(mm);
-+ __pgd_pin(mm->pgd);
-+ pin_unlock(mm);
-+}
-+
-+void mm_unpin(struct mm_struct *mm)
-+{
-+ if (xen_feature(XENFEAT_writable_page_tables))
-+ return;
+ /* NB. 'clts' is done for us by Xen during virtual trap. */
+- if (!tsk_used_math(tsk))
+- init_fpu(tsk);
+ restore_fpu(tsk);
+ thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */
+ tsk->fpu_counter++;
+@@ -1100,15 +1143,15 @@ EXPORT_SYMBOL_GPL(math_state_restore);
+
+ asmlinkage void math_emulate(long arg)
+ {
+- printk(KERN_EMERG "math-emulation not enabled and no coprocessor found.\n");
+- printk(KERN_EMERG "killing %s.\n",current->comm);
+- force_sig(SIGFPE,current);
++ printk(KERN_EMERG
++ "math-emulation not enabled and no coprocessor found.\n");
++ printk(KERN_EMERG "killing %s.\n", current->comm);
++ force_sig(SIGFPE, current);
+ schedule();
+ }
+
+ #endif /* CONFIG_MATH_EMULATION */
+
+-
+ /*
+ * NB. All these are "trap gates" (i.e. events_mask isn't set) except
+ * for those that specify <dpl>|4 in the second field.
+@@ -1146,25 +1189,21 @@ void __init trap_init(void)
+ if (ret)
+ printk("HYPERVISOR_set_trap_table failed: error %d\n", ret);
+
+- /*
+- * Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned.
+- * Generate a build-time error if the alignment is wrong.
+- */
+- BUILD_BUG_ON(offsetof(struct task_struct, thread.i387.fxsave) & 15);
+ if (cpu_has_fxsr) {
+ printk(KERN_INFO "Enabling fast FPU save and restore... ");
+ set_in_cr4(X86_CR4_OSFXSR);
+ printk("done.\n");
+ }
+ if (cpu_has_xmm) {
+- printk(KERN_INFO "Enabling unmasked SIMD FPU exception "
+- "support... ");
++ printk(KERN_INFO
++ "Enabling unmasked SIMD FPU exception support... ");
+ set_in_cr4(X86_CR4_OSXMMEXCPT);
+ printk("done.\n");
+ }
+
++ init_thread_xstate();
+ /*
+- * Should be a barrier for any external CPU state.
++ * Should be a barrier for any external CPU state:
+ */
+ cpu_init();
+ }
+@@ -1183,6 +1222,7 @@ void __cpuinit smp_trap_init(trap_info_t
+ static int __init kstack_setup(char *s)
+ {
+ kstack_depth_to_print = simple_strtoul(s, NULL, 0);
+
-+ pin_lock(mm);
-+ __pgd_unpin(mm->pgd);
-+ pin_unlock(mm);
-+}
+ return 1;
+ }
+ __setup("kstack=", kstack_setup);
+--- sle11-2009-05-14.orig/arch/x86/kernel/traps_64-xen.c 2009-03-16 16:33:40.000000000 +0100
++++ sle11-2009-05-14/arch/x86/kernel/traps_64-xen.c 2009-03-16 16:38:05.000000000 +0100
+@@ -33,6 +33,8 @@
+ #include <linux/kdebug.h>
+ #include <linux/utsname.h>
+
++#include <mach_traps.h>
+
-+void mm_pin_all(void)
-+{
-+ struct page *page;
+ #if defined(CONFIG_EDAC)
+ #include <linux/edac.h>
+ #endif
+@@ -601,10 +603,16 @@ void die(const char * str, struct pt_reg
+ }
+
+ #if defined(CONFIG_X86_LOCAL_APIC) || defined(CONFIG_SYSCTL)
+-void __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic)
++notrace __kprobes void
++die_nmi(char *str, struct pt_regs *regs, int do_panic)
+ {
+- unsigned long flags = oops_begin();
+ unsigned long flags;
+
-+ if (xen_feature(XENFEAT_writable_page_tables))
++ if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) ==
++ NOTIFY_STOP)
++ return;
+
++ flags = oops_begin();
+ /*
+ * We are in trouble anyway, lets at least try
+ * to get a message out.
+@@ -769,7 +777,7 @@ asmlinkage void __kprobes do_general_pro
+ die("general protection fault", regs, error_code);
+ }
+
+-static __kprobes void
++static notrace __kprobes void
+ mem_parity_error(unsigned char reason, struct pt_regs * regs)
+ {
+ printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
+@@ -792,7 +800,7 @@ mem_parity_error(unsigned char reason, s
+ clear_mem_error(reason);
+ }
+
+-static __kprobes void
++static notrace __kprobes void
+ io_check_error(unsigned char reason, struct pt_regs * regs)
+ {
+ printk("NMI: IOCK error (debug interrupt?)\n");
+@@ -802,9 +810,11 @@ io_check_error(unsigned char reason, str
+ clear_io_check_error(reason);
+ }
+
+-static __kprobes void
++static notrace __kprobes void
+ unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
+ {
++ if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
+ return;
+ printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
+ reason);
+ printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n");
+@@ -817,7 +827,7 @@ unknown_nmi_error(unsigned char reason,
+
+ /* Runs on IST stack. This code must keep interrupts off all the time.
+ Nested NMIs are prevented by the CPU. */
+-asmlinkage __kprobes void default_do_nmi(struct pt_regs *regs)
++asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs)
+ {
+ unsigned char reason = 0;
+ int cpu;
+@@ -1117,11 +1127,25 @@ asmlinkage void __attribute__((weak)) mc
+ asmlinkage void math_state_restore(void)
+ {
+ struct task_struct *me = current;
+
-+ /*
-+ * Allow uninterrupted access to the pgd_list. Also protects
-+ * __pgd_pin() by disabling preemption.
-+ * All other CPUs must be at a safe point (e.g., in stop_machine
-+ * or offlined entirely).
-+ */
-+ spin_lock_irqsave(&pgd_lock, flags);
-+ list_for_each_entry(page, &pgd_list, lru) {
-+ if (!PagePinned(page))
-+ __pgd_pin((pgd_t *)page_address(page));
++ if (!used_math()) {
++ local_irq_enable();
++ /*
++ * does a slab alloc which can sleep
++ */
++ if (init_fpu(me)) {
++ /*
++ * ran out of memory!
++ */
++ do_group_exit(SIGKILL);
++ return;
++ }
++ local_irq_disable();
+ }
-+ spin_unlock_irqrestore(&pgd_lock, flags);
-+}
-+
-+void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
-+{
-+ if (!PagePinned(virt_to_page(mm->pgd)))
-+ mm_pin(mm);
-+}
-+
-+void arch_exit_mmap(struct mm_struct *mm)
-+{
-+ struct task_struct *tsk = current;
-+
-+ task_lock(tsk);
+
-+ /*
-+ * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
-+ * *much* faster this way, as no tlb flushes means bigger wrpt batches.
+ /* clts(); */ /* 'clts' is done for us by Xen during virtual trap. */
+
+- if (!used_math())
+- init_fpu(me);
+- restore_fpu_checking(&me->thread.i387.fxsave);
++ restore_fpu_checking(&me->thread.xstate->fxsave);
+ task_thread_info(me)->status |= TS_USEDFPU;
+ me->fpu_counter++;
+ }
+@@ -1168,6 +1192,10 @@ void __init trap_init(void)
+ printk("HYPERVISOR_set_trap_table failed: error %d\n", ret);
+
+ /*
++ * initialize the per thread extended state:
+ */
-+ if (tsk->active_mm == mm) {
-+ tsk->active_mm = &init_mm;
-+ atomic_inc(&init_mm.mm_count);
-+
-+ switch_mm(mm, &init_mm, tsk);
-+
-+ atomic_dec(&mm->mm_count);
-+ BUG_ON(atomic_read(&mm->mm_count) == 0);
-+ }
-+
-+ task_unlock(tsk);
-+
-+ if (PagePinned(virt_to_page(mm->pgd))
-+ && atomic_read(&mm->mm_count) == 1
-+ && !mm->context.has_foreign_mappings)
-+ mm_unpin(mm);
-+}
++ init_thread_xstate();
++ /*
+ * Should be a barrier for any external CPU state.
+ */
+ cpu_init();
+--- sle11-2009-05-14.orig/arch/x86/kernel/vsyscall_64-xen.c 2009-03-16 16:33:40.000000000 +0100
++++ sle11-2009-05-14/arch/x86/kernel/vsyscall_64-xen.c 2009-03-16 16:38:05.000000000 +0100
+@@ -216,7 +216,7 @@ vgetcpu(unsigned *cpu, unsigned *node, s
+ return 0;
+ }
+
+-long __vsyscall(3) venosys_1(void)
++static long __vsyscall(3) venosys_1(void)
+ {
+ return -ENOSYS;
+ }
+--- sle11-2009-05-14.orig/arch/x86/mm/fault-xen.c 2009-03-16 16:33:40.000000000 +0100
++++ sle11-2009-05-14/arch/x86/mm/fault-xen.c 2009-03-16 16:38:05.000000000 +0100
+@@ -510,6 +510,11 @@ static int vmalloc_fault(unsigned long a
+ unsigned long pgd_paddr;
+ pmd_t *pmd_k;
+ pte_t *pte_k;
+
-+static inline void pgd_list_add(pgd_t *pgd)
-+{
-+ struct page *page = virt_to_page(pgd);
++ /* Make sure we are in vmalloc area */
++ if (!(address >= VMALLOC_START && address < VMALLOC_END))
++ return -1;
+
-+ list_add(&page->lru, &pgd_list);
-+}
+ /*
+ * Synchronize this task's top level page-table
+ * with the 'reference' page table.
+@@ -670,7 +675,7 @@ void __kprobes do_page_fault(struct pt_r
+ #ifdef CONFIG_X86_32
+ /* It's safe to allow irq's after cr2 has been saved and the vmalloc
+ fault has been handled. */
+- if (regs->flags & (X86_EFLAGS_IF|VM_MASK))
++ if (regs->flags & (X86_EFLAGS_IF | X86_VM_MASK))
+ local_irq_enable();
+
+ /*
+@@ -1017,9 +1022,5 @@ void vmalloc_sync_all(void)
+ if (address == start)
+ start = address + PGDIR_SIZE;
+ }
+- /* Check that there is no need to do the same for the modules area. */
+- BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
+- BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
+- (__START_KERNEL & PGDIR_MASK)));
+ #endif
+ }
+--- sle11-2009-05-14.orig/arch/x86/mm/highmem_32-xen.c 2009-03-16 16:33:40.000000000 +0100
++++ sle11-2009-05-14/arch/x86/mm/highmem_32-xen.c 2009-03-16 16:38:05.000000000 +0100
+@@ -200,6 +200,8 @@ EXPORT_SYMBOL(kmap);
+ EXPORT_SYMBOL(kunmap);
+ EXPORT_SYMBOL(kmap_atomic);
+ EXPORT_SYMBOL(kunmap_atomic);
++#ifdef CONFIG_HIGHPTE
+ EXPORT_SYMBOL(kmap_atomic_to_page);
++#endif
+ EXPORT_SYMBOL(clear_highpage);
+ EXPORT_SYMBOL(copy_highpage);
+--- sle11-2009-05-14.orig/arch/x86/mm/init_32-xen.c 2009-03-16 16:33:40.000000000 +0100
++++ sle11-2009-05-14/arch/x86/mm/init_32-xen.c 2009-03-16 16:38:05.000000000 +0100
+@@ -1,5 +1,4 @@
+ /*
+- * linux/arch/i386/mm/init.c
+ *
+ * Copyright (C) 1995 Linus Torvalds
+ *
+@@ -22,6 +21,7 @@
+ #include <linux/init.h>
+ #include <linux/highmem.h>
+ #include <linux/pagemap.h>
++#include <linux/pci.h>
+ #include <linux/pfn.h>
+ #include <linux/poison.h>
+ #include <linux/bootmem.h>
+@@ -54,6 +54,8 @@
+
+ unsigned int __VMALLOC_RESERVE = 128 << 20;
+
++unsigned long max_pfn_mapped;
+
-+static inline void pgd_list_del(pgd_t *pgd)
+ DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
+ unsigned long highstart_pfn, highend_pfn;
+
+@@ -73,7 +75,7 @@ static pmd_t * __init one_md_table_init(
+ if (!(__pgd_val(*pgd) & _PAGE_PRESENT)) {
+ pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
+
+- paravirt_alloc_pd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
++ paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
+ make_lowmem_page_readonly(pmd_table, XENFEAT_writable_page_tables);
+ set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
+ pud = pud_offset(pgd, 0);
+@@ -107,7 +109,7 @@ static pte_t * __init one_page_table_ini
+ (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE);
+ }
+
+- paravirt_alloc_pt(&init_mm, __pa(page_table) >> PAGE_SHIFT);
++ paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT);
+ make_lowmem_page_readonly(page_table,
+ XENFEAT_writable_page_tables);
+ set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
+@@ -209,8 +211,13 @@ static void __init kernel_physical_mappi
+ /*
+ * Map with big pages if possible, otherwise
+ * create normal page tables:
++ *
++ * Don't use a large page for the first 2/4MB of memory
++ * because there are often fixed size MTRRs in there
++ * and overlapping MTRRs into large pages can cause
++ * slowdowns.
+ */
+- if (cpu_has_pse) {
++ if (cpu_has_pse && !(pgd_idx == 0 && pmd_idx == 0)) {
+ unsigned int addr2;
+ pgprot_t prot = PAGE_KERNEL_LARGE;
+
+@@ -224,6 +231,7 @@ static void __init kernel_physical_mappi
+ set_pmd(pmd, pfn_pmd(pfn, prot));
+
+ pfn += PTRS_PER_PTE;
++ max_pfn_mapped = pfn;
+ continue;
+ }
+ pte = one_page_table_init(pmd);
+@@ -241,6 +249,7 @@ static void __init kernel_physical_mappi
+
+ set_pte(pte, pfn_pte(pfn, prot));
+ }
++ max_pfn_mapped = pfn;
+ pte_ofs = 0;
+ }
+ pmd_idx = 0;
+@@ -262,6 +271,25 @@ static inline int page_kills_ppro(unsign
+
+ #endif
+
++/*
++ * devmem_is_allowed() checks to see if /dev/mem access to a certain address
++ * is valid. The argument is a physical page number.
++ *
++ *
++ * On x86, access has to be given to the first megabyte of ram because that area
++ * contains bios code and data regions used by X and dosemu and similar apps.
++ * Access has to be given to non-kernel-ram areas as well, these contain the PCI
++ * mmio resources as well as potential bios/acpi data regions.
++ */
++int devmem_is_allowed(unsigned long pagenr)
+{
-+ struct page *page = virt_to_page(pgd);
-+
-+ list_del(&page->lru);
++ if (pagenr <= 256)
++ return 1;
++ if (mfn_to_local_pfn(pagenr) >= max_pfn)
++ return 1;
++ return 0;
+}
+
-+#define UNSHARED_PTRS_PER_PGD \
-+ (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
-+
-+static void pgd_ctor(void *p)
-+{
-+ pgd_t *pgd = p;
-+ unsigned long flags;
-+
-+ pgd_test_and_unpin(pgd);
-+
-+ /* Clear usermode parts of PGD */
-+ memset(pgd, 0, KERNEL_PGD_BOUNDARY*sizeof(pgd_t));
-+
-+ spin_lock_irqsave(&pgd_lock, flags);
-+
-+ /* If the pgd points to a shared pagetable level (either the
-+ ptes in non-PAE, or shared PMD in PAE), then just copy the
-+ references from swapper_pg_dir. */
-+ if (PAGETABLE_LEVELS == 2 ||
-+ (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD) ||
-+ PAGETABLE_LEVELS == 4) {
-+ clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
-+ swapper_pg_dir + KERNEL_PGD_BOUNDARY,
-+ KERNEL_PGD_PTRS);
-+ paravirt_alloc_pmd_clone(__pa(pgd) >> PAGE_SHIFT,
-+ __pa(swapper_pg_dir) >> PAGE_SHIFT,
-+ KERNEL_PGD_BOUNDARY,
-+ KERNEL_PGD_PTRS);
-+ }
-+
-+#ifdef CONFIG_X86_64
-+ /* set level3_user_pgt for vsyscall area */
-+ __user_pgd(pgd)[pgd_index(VSYSCALL_START)] =
-+ __pgd(__pa_symbol(level3_user_pgt) | _PAGE_TABLE);
-+#endif
-+
-+#ifndef CONFIG_X86_PAE
-+ /* list required to sync kernel mapping updates */
-+ if (!SHARED_KERNEL_PMD)
-+ pgd_list_add(pgd);
+ #ifdef CONFIG_HIGHMEM
+ pte_t *kmap_pte;
+ pgprot_t kmap_prot;
+@@ -303,48 +331,18 @@ static void __init permanent_kmaps_init(
+ pkmap_page_table = pte;
+ }
+
+-static void __meminit free_new_highpage(struct page *page, int pfn)
+-{
+- init_page_count(page);
+- if (pfn < xen_start_info->nr_pages)
+- __free_page(page);
+- totalhigh_pages++;
+-}
+-
+ void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro)
+ {
+ if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) {
+ ClearPageReserved(page);
+- free_new_highpage(page, pfn);
++ init_page_count(page);
++ if (pfn < xen_start_info->nr_pages)
++ __free_page(page);
++ totalhigh_pages++;
+ } else
+ SetPageReserved(page);
+ }
+
+-static int __meminit
+-add_one_highpage_hotplug(struct page *page, unsigned long pfn)
+-{
+- free_new_highpage(page, pfn);
+- totalram_pages++;
+-#ifdef CONFIG_FLATMEM
+- max_mapnr = max(pfn, max_mapnr);
+-#endif
+- num_physpages++;
+-
+- return 0;
+-}
+-
+-/*
+- * Not currently handling the NUMA case.
+- * Assuming single node and all memory that
+- * has been added dynamically that would be
+- * onlined here is in HIGHMEM.
+- */
+-void __meminit online_page(struct page *page)
+-{
+- ClearPageReserved(page);
+- add_one_highpage_hotplug(page, page_to_pfn(page));
+-}
+-
+ #ifndef CONFIG_NUMA
+ static void __init set_highmem_pages_init(int bad_ppro)
+ {
+@@ -459,15 +457,13 @@ void zap_low_mappings(void)
+ {
+ int i;
+
+- save_pg_dir();
+-
+ /*
+ * Zap initial low-memory mappings.
+ *
+ * Note that "pgd_clear()" doesn't do it for
+ * us, because pgd_clear() is a no-op on i386.
+ */
+- for (i = 0; i < USER_PTRS_PER_PGD; i++) {
++ for (i = 0; i < KERNEL_PGD_BOUNDARY; i++) {
+ #if defined(CONFIG_X86_PAE) && !defined(CONFIG_XEN)
+ set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page)));
+ #else
+@@ -572,9 +568,9 @@ void __init paging_init(void)
+
+ /*
+ * Test if the WP bit works in supervisor mode. It isn't supported on 386's
+- * and also on some strange 486's (NexGen etc.). All 586+'s are OK. This
+- * used to involve black magic jumps to work around some nasty CPU bugs,
+- * but fortunately the switch to using exceptions got rid of all that.
++ * and also on some strange 486's. All 586+'s are OK. This used to involve
++ * black magic jumps to work around some nasty CPU bugs, but fortunately the
++ * switch to using exceptions got rid of all that.
+ */
+ static void __init test_wp_bit(void)
+ {
+@@ -605,9 +601,7 @@ void __init mem_init(void)
+ int tmp, bad_ppro;
+ unsigned long pfn;
+
+-#if defined(CONFIG_SWIOTLB)
+- swiotlb_init();
+-#endif
++ pci_iommu_alloc();
+
+ #ifdef CONFIG_FLATMEM
+ BUG_ON(!mem_map);
+@@ -710,16 +704,8 @@ void __init mem_init(void)
+ test_wp_bit();
+
+ cpa_init();
+-
+- /*
+- * Subtle. SMP is doing it's boot stuff late (because it has to
+- * fork idle threads) - but it also needs low mappings for the
+- * protected-mode entry to work. We zap these entries only after
+- * the WP-bit has been tested.
+- */
+-#ifndef CONFIG_SMP
++ save_pg_dir();
+ zap_low_mappings();
+-#endif
+
+ SetPagePinned(virt_to_page(init_mm.pgd));
+ }
+@@ -769,25 +755,17 @@ void mark_rodata_ro(void)
+ unsigned long start = PFN_ALIGN(_text);
+ unsigned long size = PFN_ALIGN(_etext) - start;
+
+-#ifndef CONFIG_KPROBES
+-#ifdef CONFIG_HOTPLUG_CPU
+- /* It must still be possible to apply SMP alternatives. */
+- if (num_possible_cpus() <= 1)
+-#endif
+- {
+- set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
+- printk(KERN_INFO "Write protecting the kernel text: %luk\n",
+- size >> 10);
++ set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
++ printk(KERN_INFO "Write protecting the kernel text: %luk\n",
++ size >> 10);
+
+ #ifdef CONFIG_CPA_DEBUG
+- printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n",
+- start, start+size);
+- set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT);
++ printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n",
++ start, start+size);
++ set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT);
+
+- printk(KERN_INFO "Testing CPA: write protecting again\n");
+- set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
+-#endif
+- }
++ printk(KERN_INFO "Testing CPA: write protecting again\n");
++ set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
+ #endif
+ start += size;
+ size = (unsigned long)__end_rodata - start;
+--- sle11-2009-05-14.orig/arch/x86/mm/init_64-xen.c 2009-03-16 16:33:40.000000000 +0100
++++ sle11-2009-05-14/arch/x86/mm/init_64-xen.c 2009-03-16 16:38:05.000000000 +0100
+@@ -52,9 +52,6 @@
+
+ #include <xen/features.h>
+
+-const struct dma_mapping_ops *dma_ops;
+-EXPORT_SYMBOL(dma_ops);
+-
+ #if CONFIG_XEN_COMPAT <= 0x030002
+ unsigned int __kernel_page_user;
+ EXPORT_SYMBOL(__kernel_page_user);
+@@ -68,6 +65,28 @@ extern unsigned long start_pfn;
+ extern pmd_t level2_fixmap_pgt[PTRS_PER_PMD];
+ extern pte_t level1_fixmap_pgt[PTRS_PER_PTE];
+
++#ifndef CONFIG_XEN
++int direct_gbpages __meminitdata
++#ifdef CONFIG_DIRECT_GBPAGES
++ = 1
+#endif
++;
+
-+ spin_unlock_irqrestore(&pgd_lock, flags);
-+}
-+
-+static void pgd_dtor(void *pgd)
-+{
-+ unsigned long flags; /* can be called from interrupt context */
-+
-+ if (!SHARED_KERNEL_PMD) {
-+ spin_lock_irqsave(&pgd_lock, flags);
-+ pgd_list_del(pgd);
-+ spin_unlock_irqrestore(&pgd_lock, flags);
-+ }
-+
-+ pgd_test_and_unpin(pgd);
-+}
-+
-+/*
-+ * List of all pgd's needed for non-PAE so it can invalidate entries
-+ * in both cached and uncached pgd's; not needed for PAE since the
-+ * kernel pmd is shared. If PAE were not to share the pmd a similar
-+ * tactic would be needed. This is essentially codepath-based locking
-+ * against pageattr.c; it is the unique case in which a valid change
-+ * of kernel pagetables can't be lazily synchronized by vmalloc faults.
-+ * vmalloc faults work because attached pagetables are never freed.
-+ * -- wli
-+ */
-+
-+#ifdef CONFIG_X86_PAE
-+/*
-+ * Mop up any pmd pages which may still be attached to the pgd.
-+ * Normally they will be freed by munmap/exit_mmap, but any pmd we
-+ * preallocate which never got a corresponding vma will need to be
-+ * freed manually.
-+ */
-+static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
++static int __init parse_direct_gbpages_off(char *arg)
+{
-+ int i;
-+
-+ for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) {
-+ pgd_t pgd = pgdp[i];
-+
-+ if (__pgd_val(pgd) != 0) {
-+ pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
-+
-+ pgdp[i] = xen_make_pgd(0);
-+
-+ paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
-+ pmd_free(mm, pmd);
-+ }
-+ }
-+
-+ if (!xen_feature(XENFEAT_pae_pgdir_above_4gb))
-+ xen_destroy_contiguous_region((unsigned long)pgdp, 0);
++ direct_gbpages = 0;
++ return 0;
+}
++early_param("nogbpages", parse_direct_gbpages_off);
+
-+/*
-+ * In PAE mode, we need to do a cr3 reload (=tlb flush) when
-+ * updating the top-level pagetable entries to guarantee the
-+ * processor notices the update. Since this is expensive, and
-+ * all 4 top-level entries are used almost immediately in a
-+ * new process's life, we just pre-populate them here.
-+ *
-+ * Also, if we're in a paravirt environment where the kernel pmd is
-+ * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
-+ * and initialize the kernel pmds here.
-+ */
-+static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
++static int __init parse_direct_gbpages_on(char *arg)
+{
-+ pud_t *pud;
-+ pmd_t *pmds[UNSHARED_PTRS_PER_PGD];
-+ unsigned long addr, flags;
-+ int i;
-+
-+ /*
-+ * We can race save/restore (if we sleep during a GFP_KERNEL memory
-+ * allocation). We therefore store virtual addresses of pmds as they
-+ * do not change across save/restore, and poke the machine addresses
-+ * into the pgdir under the pgd_lock.
-+ */
-+ for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD; i++, addr += PUD_SIZE) {
-+ pmds[i] = pmd_alloc_one(mm, addr);
-+ if (!pmds[i])
-+ goto out_oom;
-+ }
-+
-+ spin_lock_irqsave(&pgd_lock, flags);
-+
-+ /* Protect against save/restore: move below 4GB under pgd_lock. */
-+ if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)
-+ && xen_create_contiguous_region((unsigned long)pgd, 0, 32)) {
-+ spin_unlock_irqrestore(&pgd_lock, flags);
-+out_oom:
-+ while (i--)
-+ pmd_free(mm, pmds[i]);
-+ return 0;
-+ }
-+
-+ /* Copy kernel pmd contents and write-protect the new pmds. */
-+ pud = pud_offset(pgd, 0);
-+ for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD;
-+ i++, pud++, addr += PUD_SIZE) {
-+ if (i >= KERNEL_PGD_BOUNDARY) {
-+ memcpy(pmds[i],
-+ (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
-+ sizeof(pmd_t) * PTRS_PER_PMD);
-+ make_lowmem_page_readonly(
-+ pmds[i], XENFEAT_writable_page_tables);
-+ }
-+
-+ /* It is safe to poke machine addresses of pmds under the pgd_lock. */
-+ pud_populate(mm, pud, pmds[i]);
-+ }
-+
-+ /* List required to sync kernel mapping updates and
-+ * to pin/unpin on save/restore. */
-+ pgd_list_add(pgd);
-+
-+ spin_unlock_irqrestore(&pgd_lock, flags);
-+
-+ return 1;
++ direct_gbpages = 1;
++ return 0;
+}
++early_param("gbpages", parse_direct_gbpages_on);
++#endif
+
-+void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
-+{
-+ struct page *page = virt_to_page(pmd);
-+ unsigned long pfn = page_to_pfn(page);
-+
-+ paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
+ /*
+ * Use this until direct mapping is established, i.e. before __va() is
+ * available in init_memory_mapping().
+@@ -135,9 +154,6 @@ void show_mem(void)
+
+ printk(KERN_INFO "Mem-info:\n");
+ show_free_areas();
+- printk(KERN_INFO "Free swap: %6ldkB\n",
+- nr_swap_pages << (PAGE_SHIFT-10));
+-
+ for_each_online_pgdat(pgdat) {
+ for (i = 0; i < pgdat->node_spanned_pages; ++i) {
+ /*
+@@ -328,7 +344,7 @@ void __init cleanup_highmap(void)
+ pmd_t *last_pmd = pmd + PTRS_PER_PMD;
+
+ for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) {
+- if (!pmd_present(*pmd))
++ if (pmd_none(*pmd))
+ continue;
+ if (vaddr < (unsigned long) _text || vaddr > end)
+ set_pmd(pmd, __pmd(0));
+@@ -337,8 +353,7 @@ void __init cleanup_highmap(void)
+ #endif
+
+ /* NOTE: this is meant to be run only at boot */
+-void __init
+-__set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
++void __init __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
+ {
+ unsigned long address = __fix_to_virt(idx);
+
+@@ -463,7 +478,7 @@ __meminit void early_iounmap(void *addr,
+ }
+ #endif
+
+-static void __meminit
++static unsigned long __meminit
+ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
+ {
+ int i = pmd_index(address);
+@@ -503,21 +518,26 @@ phys_pmd_init(pmd_t *pmd_page, unsigned
+ set_pmd(pmd, __pmd(pte_phys | _KERNPG_TABLE));
+ }
+ }
++ return address;
+ }
+
+-static void __meminit
++static unsigned long __meminit
+ phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
+ {
+ pmd_t *pmd = pmd_offset(pud, 0);
++ unsigned long last_map_addr;
+
-+ /* Note: almost everything apart from _PAGE_PRESENT is
-+ reserved at the pmd (PDPT) level. */
-+ if (PagePinned(virt_to_page(mm->pgd))) {
-+ BUG_ON(PageHighMem(page));
-+ BUG_ON(HYPERVISOR_update_va_mapping(
-+ (unsigned long)__va(pfn << PAGE_SHIFT),
-+ pfn_pte(pfn, PAGE_KERNEL_RO), 0));
-+ set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
-+ } else
-+ *pudp = __pud(__pa(pmd) | _PAGE_PRESENT);
+ spin_lock(&init_mm.page_table_lock);
+- phys_pmd_init(pmd, address, end);
++ last_map_addr = phys_pmd_init(pmd, address, end);
+ spin_unlock(&init_mm.page_table_lock);
+ __flush_tlb_all();
++ return last_map_addr;
+ }
+
+-static void __meminit
++static unsigned long __meminit
+ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
+ {
++ unsigned long last_map_addr = end;
+ int i = pud_index(addr);
+
+ for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) {
+@@ -529,7 +549,15 @@ phys_pud_init(pud_t *pud_page, unsigned
+ break;
+
+ if (__pud_val(*pud)) {
+- phys_pmd_update(pud, addr, end);
++ if (!pud_large(*pud))
++ last_map_addr = phys_pmd_update(pud, addr, end);
++ continue;
++ }
+
-+ /*
-+ * According to Intel App note "TLBs, Paging-Structure Caches,
-+ * and Their Invalidation", April 2007, document 317080-001,
-+ * section 8.1: in PAE mode we explicitly have to flush the
-+ * TLB via cr3 if the top-level pgd is changed...
-+ */
-+ if (mm == current->active_mm)
-+ xen_tlb_flush();
-+}
-+#else /* !CONFIG_X86_PAE */
-+/* No need to prepopulate any pagetable entries in non-PAE modes. */
-+static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
-+{
-+ return 1;
-+}
++ if (direct_gbpages) {
++ set_pte((pte_t *)pud,
++ pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
++ last_map_addr = (addr & PUD_MASK) + PUD_SIZE;
+ continue;
+ }
+
+@@ -537,12 +565,14 @@ phys_pud_init(pud_t *pud_page, unsigned
+
+ spin_lock(&init_mm.page_table_lock);
+ *pud = __pud(pmd_phys | _KERNPG_TABLE);
+- phys_pmd_init(pmd, addr, end);
++ last_map_addr = phys_pmd_init(pmd, addr, end);
+ spin_unlock(&init_mm.page_table_lock);
+
+ early_make_page_readonly(pmd, XENFEAT_writable_page_tables);
+ }
+ __flush_tlb_all();
+
-+static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgd)
++ return last_map_addr >> PAGE_SHIFT;
+ }
+
+ void __init xen_init_pt(void)
+@@ -754,16 +784,138 @@ static void __init xen_finish_init_mappi
+ table_end = start_pfn;
+ }
+
++static void __init init_gbpages(void)
+{
++#ifndef CONFIG_XEN
++ if (direct_gbpages && cpu_has_gbpages)
++ printk(KERN_INFO "Using GB pages for direct mapping\n");
++ else
++ direct_gbpages = 0;
++#endif
+}
-+#endif /* CONFIG_X86_PAE */
+
-+#ifdef CONFIG_X86_64
-+/* We allocate two contiguous pages for kernel and user. */
-+#define PGD_ORDER 1
-+#else
-+#define PGD_ORDER 0
-+#endif
++#ifdef CONFIG_MEMTEST_BOOTPARAM
+
-+pgd_t *pgd_alloc(struct mm_struct *mm)
++static void __init memtest(unsigned long start_phys, unsigned long size,
++ unsigned pattern)
+{
-+ pgd_t *pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, PGD_ORDER);
-+
-+ /* so that alloc_pd can use it */
-+ mm->pgd = pgd;
-+ if (pgd)
-+ pgd_ctor(pgd);
++ unsigned long i;
++ unsigned long *start;
++ unsigned long start_bad;
++ unsigned long last_bad;
++ unsigned long val;
++ unsigned long start_phys_aligned;
++ unsigned long count;
++ unsigned long incr;
+
-+ if (pgd && !pgd_prepopulate_pmd(mm, pgd)) {
-+ free_pages((unsigned long)pgd, PGD_ORDER);
-+ pgd = NULL;
++ switch (pattern) {
++ case 0:
++ val = 0UL;
++ break;
++ case 1:
++ val = -1UL;
++ break;
++ case 2:
++ val = 0x5555555555555555UL;
++ break;
++ case 3:
++ val = 0xaaaaaaaaaaaaaaaaUL;
++ break;
++ default:
++ return;
+ }
+
-+ return pgd;
-+}
++ incr = sizeof(unsigned long);
++ start_phys_aligned = ALIGN(start_phys, incr);
++ count = (size - (start_phys_aligned - start_phys))/incr;
++ start = __va(start_phys_aligned);
++ start_bad = 0;
++ last_bad = 0;
+
-+void pgd_free(struct mm_struct *mm, pgd_t *pgd)
-+{
-+ /*
-+ * After this the pgd should not be pinned for the duration of this
-+ * function's execution. We should never sleep and thus never race:
-+ * 1. User pmds will not become write-protected under our feet due
-+ * to a concurrent mm_pin_all().
-+ * 2. The machine addresses in PGD entries will not become invalid
-+ * due to a concurrent save/restore.
-+ */
-+ pgd_dtor(pgd);
++ for (i = 0; i < count; i++)
++ start[i] = val;
++ for (i = 0; i < count; i++, start++, start_phys_aligned += incr) {
++ if (*start != val) {
++ if (start_phys_aligned == last_bad + incr) {
++ last_bad += incr;
++ } else {
++ if (start_bad) {
++ printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved",
++ val, start_bad, last_bad + incr);
++ reserve_early(start_bad, last_bad + incr, "BAD RAM"); /* 2nd arg is the end address, not a length */
++ }
++ start_bad = last_bad = start_phys_aligned;
++ }
++ }
++ }
++ if (start_bad) {
++ printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved",
++ val, start_bad, last_bad + incr);
++ reserve_early(start_bad, last_bad + incr, "BAD RAM"); /* 2nd arg is the end address, not a length */
++ }
+
-+ pgd_mop_up_pmds(mm, pgd);
-+ free_pages((unsigned long)pgd, PGD_ORDER);
+}
+
-+/* blktap and gntdev need this, as otherwise they would implicitly (and
-+ * needlessly, as they never use it) reference init_mm. */
-+pte_t xen_ptep_get_and_clear_full(struct vm_area_struct *vma,
-+ unsigned long addr, pte_t *ptep, int full)
-+{
-+ return ptep_get_and_clear_full(vma->vm_mm, addr, ptep, full);
-+}
-+EXPORT_SYMBOL_GPL(xen_ptep_get_and_clear_full);
++static int memtest_pattern __initdata = CONFIG_MEMTEST_BOOTPARAM_VALUE;
+
-+int ptep_set_access_flags(struct vm_area_struct *vma,
-+ unsigned long address, pte_t *ptep,
-+ pte_t entry, int dirty)
++static int __init parse_memtest(char *arg)
+{
-+ int changed = !pte_same(*ptep, entry);
-+
-+ if (changed && dirty) {
-+ if (likely(vma->vm_mm == current->mm)) {
-+ if (HYPERVISOR_update_va_mapping(address,
-+ entry,
-+ (unsigned long)vma->vm_mm->cpu_vm_mask.bits|
-+ UVMF_INVLPG|UVMF_MULTI))
-+ BUG();
-+ } else {
-+ xen_l1_entry_update(ptep, entry);
-+ flush_tlb_page(vma, address);
-+ }
-+ }
-+
-+ return changed;
++ if (arg)
++ memtest_pattern = simple_strtoul(arg, NULL, 0);
++ return 0;
+}
+
-+int ptep_test_and_clear_young(struct vm_area_struct *vma,
-+ unsigned long addr, pte_t *ptep)
++early_param("memtest", parse_memtest);
++
++static void __init early_memtest(unsigned long start, unsigned long end)
+{
-+ int ret = 0;
++ u64 t_start, t_size;
++ unsigned pattern;
+
-+ if (pte_young(*ptep))
-+ ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
-+ &ptep->pte);
++ if (!memtest_pattern)
++ return;
+
-+ if (ret)
-+ pte_update(vma->vm_mm, addr, ptep);
++ printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern);
++ for (pattern = 0; pattern < memtest_pattern; pattern++) {
++ t_start = start;
++ t_size = 0;
++ while (t_start < end) {
++ t_start = find_e820_area_size(t_start, &t_size, 1);
+
-+ return ret;
-+}
++ /* done ? */
++ if (t_start >= end)
++ break;
++ if (t_start + t_size > end)
++ t_size = end - t_start;
+
-+int ptep_clear_flush_young(struct vm_area_struct *vma,
-+ unsigned long address, pte_t *ptep)
-+{
-+ pte_t pte = *ptep;
-+ int young = pte_young(pte);
++ printk(KERN_CONT "\n %016llx - %016llx pattern %d",
++ (unsigned long long)t_start,
++ (unsigned long long)t_start + t_size, pattern);
+
-+ pte = pte_mkold(pte);
-+ if (PagePinned(virt_to_page(vma->vm_mm->pgd)))
-+ ptep_set_access_flags(vma, address, ptep, pte, young);
-+ else if (young)
-+ ptep->pte_low = pte.pte_low;
++ memtest(t_start, t_size, pattern);
+
-+ return young;
-+}
---- a/arch/x86/pci/i386.c
-+++ b/arch/x86/pci/i386.c
-@@ -328,10 +328,14 @@ int pci_mmap_page_range(struct pci_dev *
- flags = new_flags;
- }
-
-+#ifndef CONFIG_XEN
- if (((vma->vm_pgoff < max_low_pfn_mapped) ||
- (vma->vm_pgoff >= (1UL<<(32 - PAGE_SHIFT)) &&
- vma->vm_pgoff < max_pfn_mapped)) &&
- ioremap_change_attr((unsigned long)__va(addr), len, flags)) {
-+#else
-+ if (ioremap_check_change_attr(vma->vm_pgoff, len, flags)) {
-+#endif
- free_memtype(addr, addr + len);
- return -EINVAL;
- }
---- a/arch/x86/pci/irq-xen.c
-+++ b/arch/x86/pci/irq-xen.c
-@@ -140,9 +140,11 @@ static void __init pirq_peer_trick(void)
- busmap[e->bus] = 1;
- }
- for(i = 1; i < 256; i++) {
-+ int node;
- if (!busmap[i] || pci_find_bus(0, i))
- continue;
-- if (pci_scan_bus_with_sysdata(i))
-+ node = get_mp_bus_to_node(i);
-+ if (pci_scan_bus_on_node(i, &pci_root_ops, node))
- printk(KERN_INFO "PCI: Discovered primary peer "
- "bus %02x [IRQ]\n", i);
- }
-@@ -204,7 +206,7 @@ static int pirq_ali_get(struct pci_dev *
++ t_start += t_size;
++ }
++ }
++ printk(KERN_CONT "\n");
++}
++#else
++static void __init early_memtest(unsigned long start, unsigned long end)
++{
++}
++#endif
++
+ /*
+ * Setup the direct mapping of the physical memory at PAGE_OFFSET.
+ * This runs before bootmem is initialized and gets pages directly from
+ * the physical memory. To access them they are temporarily mapped.
+ */
+-void __init_refok init_memory_mapping(unsigned long start, unsigned long end)
++unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned long end)
{
- static const unsigned char irqmap[16] = { 0, 9, 3, 10, 4, 5, 7, 6, 1, 11, 0, 12, 0, 14, 0, 15 };
-
-- WARN_ON_ONCE(pirq >= 16);
-+ WARN_ON_ONCE(pirq > 16);
- return irqmap[read_config_nybble(router, 0x48, pirq-1)];
- }
+- unsigned long next;
++ unsigned long next, last_map_addr = end;
++ unsigned long start_phys = start, end_phys = end;
-@@ -213,7 +215,7 @@ static int pirq_ali_set(struct pci_dev *
- static const unsigned char irqmap[16] = { 0, 8, 0, 2, 4, 5, 7, 6, 0, 1, 3, 9, 11, 0, 13, 15 };
- unsigned int val = irqmap[irq];
+- pr_debug("init_memory_mapping\n");
++ printk(KERN_INFO "init_memory_mapping\n");
-- WARN_ON_ONCE(pirq >= 16);
-+ WARN_ON_ONCE(pirq > 16);
- if (val) {
- write_config_nybble(router, 0x48, pirq-1, val);
- return 1;
-@@ -264,7 +266,7 @@ static int pirq_via586_get(struct pci_de
- {
- static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 };
+ /*
+ * Find space for the kernel direct mapping tables.
+@@ -772,8 +924,10 @@ void __init_refok init_memory_mapping(un
+ * memory mapped. Unfortunately this is done currently before the
+ * nodes are discovered.
+ */
+- if (!after_bootmem)
++ if (!after_bootmem) {
++ init_gbpages();
+ find_early_table_space(end);
++ }
-- WARN_ON_ONCE(pirq >= 5);
-+ WARN_ON_ONCE(pirq > 5);
- return read_config_nybble(router, 0x55, pirqmap[pirq-1]);
+ start = (unsigned long)__va(start);
+ end = (unsigned long)__va(end);
+@@ -790,7 +944,7 @@ void __init_refok init_memory_mapping(un
+ next = start + PGDIR_SIZE;
+ if (next > end)
+ next = end;
+- phys_pud_init(pud, __pa(start), __pa(next));
++ last_map_addr = phys_pud_init(pud, __pa(start), __pa(next));
+ if (!after_bootmem) {
+ early_make_page_readonly(pud, XENFEAT_writable_page_tables);
+ set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
+@@ -807,6 +961,11 @@ void __init_refok init_memory_mapping(un
+ if (!after_bootmem)
+ reserve_early(table_start << PAGE_SHIFT,
+ table_end << PAGE_SHIFT, "PGTABLE");
++
++ if (!after_bootmem)
++ early_memtest(start_phys, end_phys);
++
++ return last_map_addr;
}
-@@ -272,7 +274,7 @@ static int pirq_via586_set(struct pci_de
+ #ifndef CONFIG_NUMA
+@@ -830,15 +989,6 @@ void __init paging_init(void)
+ /*
+ * Memory hotplug specific functions
+ */
+-void online_page(struct page *page)
+-{
+- ClearPageReserved(page);
+- init_page_count(page);
+- __free_page(page);
+- totalram_pages++;
+- num_physpages++;
+-}
+-
+ #ifdef CONFIG_MEMORY_HOTPLUG
+ /*
+ * Memory is added always to NORMAL zone. This means you will never get
+@@ -848,11 +998,13 @@ int arch_add_memory(int nid, u64 start,
{
- static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 };
+ struct pglist_data *pgdat = NODE_DATA(nid);
+ struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
+- unsigned long start_pfn = start >> PAGE_SHIFT;
++ unsigned long last_mapped_pfn, start_pfn = start >> PAGE_SHIFT;
+ unsigned long nr_pages = size >> PAGE_SHIFT;
+ int ret;
-- WARN_ON_ONCE(pirq >= 5);
-+ WARN_ON_ONCE(pirq > 5);
- write_config_nybble(router, 0x55, pirqmap[pirq-1], irq);
- return 1;
- }
-@@ -286,7 +288,7 @@ static int pirq_ite_get(struct pci_dev *
- {
- static const unsigned char pirqmap[4] = { 1, 0, 2, 3 };
+- init_memory_mapping(start, start + size-1);
++ last_mapped_pfn = init_memory_mapping(start, start + size-1);
++ if (last_mapped_pfn > max_pfn_mapped)
++ max_pfn_mapped = last_mapped_pfn;
-- WARN_ON_ONCE(pirq >= 4);
-+ WARN_ON_ONCE(pirq > 4);
- return read_config_nybble(router,0x43, pirqmap[pirq-1]);
- }
+ ret = __add_pages(zone, start_pfn, nr_pages);
+ WARN_ON(1);
+@@ -871,6 +1023,26 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to
-@@ -294,7 +296,7 @@ static int pirq_ite_set(struct pci_dev *
- {
- static const unsigned char pirqmap[4] = { 1, 0, 2, 3 };
+ #endif /* CONFIG_MEMORY_HOTPLUG */
-- WARN_ON_ONCE(pirq >= 4);
-+ WARN_ON_ONCE(pirq > 4);
- write_config_nybble(router, 0x43, pirqmap[pirq-1], irq);
- return 1;
- }
-@@ -623,6 +625,13 @@ static __init int via_router_probe(struc
- */
- device = PCI_DEVICE_ID_VIA_8235;
- break;
-+ case PCI_DEVICE_ID_VIA_8237:
-+ /**
-+ * Asus a7v600 bios wrongly reports 8237
-+ * as 586-compatible
-+ */
-+ device = PCI_DEVICE_ID_VIA_8237;
-+ break;
- }
- }
++/*
++ * devmem_is_allowed() checks to see if /dev/mem access to a certain address
++ * is valid. The argument is a physical page number.
++ *
++ *
++ * On x86, access has to be given to the first megabyte of ram because that area
++ * contains bios code and data regions used by X and dosemu and similar apps.
++ * Access has to be given to non-kernel-ram areas as well, these contain the PCI
++ * mmio resources as well as potential bios/acpi data regions.
++ */
++int devmem_is_allowed(unsigned long pagenr)
++{
++ if (pagenr <= 256)
++ return 1;
++ if (mfn_to_local_pfn(pagenr) >= max_pfn)
++ return 1;
++ return 0;
++}
++
++
+ static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel,
+ kcore_modules, kcore_vsyscall;
---- a/arch/x86/vdso/vdso32-setup-xen.c
-+++ b/arch/x86/vdso/vdso32-setup-xen.c
-@@ -164,7 +164,7 @@ static __init void relocate_vdso(Elf32_E
- Elf32_Shdr *shdr;
- int i;
+@@ -979,24 +1151,7 @@ EXPORT_SYMBOL_GPL(rodata_test_data);
-- BUG_ON(memcmp(ehdr->e_ident, ELFMAG, 4) != 0 ||
-+ BUG_ON(memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0 ||
- !elf_check_arch_ia32(ehdr) ||
- ehdr->e_type != ET_DYN);
+ void mark_rodata_ro(void)
+ {
+- unsigned long start = (unsigned long)_stext, end;
+-
+-#ifdef CONFIG_HOTPLUG_CPU
+- /* It must still be possible to apply SMP alternatives. */
+- if (num_possible_cpus() > 1)
+- start = (unsigned long)_etext;
+-#endif
+-
+-#ifdef CONFIG_KPROBES
+- start = (unsigned long)__start_rodata;
+-#endif
+-
+- end = (unsigned long)__end_rodata;
+- start = (start + PAGE_SIZE - 1) & PAGE_MASK;
+- end &= PAGE_MASK;
+- if (end <= start)
+- return;
+-
++ unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata);
-@@ -233,8 +233,12 @@ void syscall32_cpu_init(void)
- BUG();
+ printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
+ (end - start) >> 10);
+@@ -1019,6 +1174,7 @@ void mark_rodata_ro(void)
+ set_memory_ro(start, (end-start) >> PAGE_SHIFT);
#endif
-
-- if (use_sysenter < 0)
-- use_sysenter = (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL);
-+ if (use_sysenter < 0) {
-+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
-+ use_sysenter = 1;
-+ if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR)
-+ use_sysenter = 1;
-+ }
}
-
- #define compat_uses_vma 1
-@@ -337,8 +341,6 @@ int __init sysenter_setup(void)
-
- #ifdef CONFIG_X86_32
- gate_vma_init();
--
-- printk("Compat vDSO mapped to %08lx.\n", __fix_to_virt(FIX_VDSO));
- #endif
-
- #if defined(CONFIG_X86_64) && CONFIG_XEN_COMPAT < 0x030200
-@@ -383,6 +385,9 @@ int arch_setup_additional_pages(struct l
- int ret = 0;
- bool compat;
-
-+ if (vdso_enabled == VDSO_DISABLED)
-+ return 0;
+
- down_write(&mm->mmap_sem);
-
- /* Test compat mode once here, in case someone
---- a/drivers/acpi/processor_core.c
-+++ b/drivers/acpi/processor_core.c
-@@ -657,7 +657,7 @@ static int acpi_processor_get_info(struc
- * of /proc/cpuinfo
- */
- status = acpi_evaluate_object(pr->handle, "_SUN", NULL, &buffer);
-- if (ACPI_SUCCESS(status))
-+ if (ACPI_SUCCESS(status) && pr->id != -1)
- arch_fix_phys_package_id(pr->id, object.integer.value);
-
- return 0;
---- a/drivers/input/xen-kbdfront.c
-+++ b/drivers/input/xen-kbdfront.c
-@@ -325,7 +325,6 @@ static struct xenbus_device_id xenkbd_id
+ #endif
- static struct xenbus_driver xenkbd = {
- .name = "vkbd",
-- .owner = THIS_MODULE,
- .ids = xenkbd_ids,
- .probe = xenkbd_probe,
- .remove = xenkbd_remove,
---- a/drivers/oprofile/cpu_buffer.c
-+++ b/drivers/oprofile/cpu_buffer.c
-@@ -310,7 +310,7 @@ void oprofile_add_trace(unsigned long pc
- #ifdef CONFIG_XEN
- int oprofile_add_domain_switch(int32_t domain_id)
+ #ifdef CONFIG_BLK_DEV_INITRD
+@@ -1031,7 +1187,7 @@ void free_initrd_mem(unsigned long start
+ void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
{
-- struct oprofile_cpu_buffer * cpu_buf = &cpu_buffer[smp_processor_id()];
-+ struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(cpu_buffer);
+ #ifdef CONFIG_NUMA
+- int nid = phys_to_nid(phys);
++ int nid, next_nid;
+ #endif
+ unsigned long pfn = phys >> PAGE_SHIFT;
- /* should have space for switching into and out of domain
- (2 slots each) plus one sample and one cpu mode switch */
---- a/drivers/pci/msi-xen.c
-+++ b/drivers/pci/msi-xen.c
-@@ -588,7 +588,7 @@ int pci_enable_msi(struct pci_dev* dev)
- EXPORT_SYMBOL(pci_enable_msi);
+@@ -1040,7 +1196,7 @@ void __init reserve_bootmem_generic(unsi
+ * This can happen with kdump kernels when accessing
+ * firmware tables:
+ */
+- if (pfn < end_pfn_map)
++ if (pfn < max_pfn_mapped)
+ return;
- extern void pci_frontend_disable_msi(struct pci_dev* dev);
--void pci_disable_msi(struct pci_dev* dev)
-+void pci_msi_shutdown(struct pci_dev* dev)
+ printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
+@@ -1050,10 +1206,16 @@ void __init reserve_bootmem_generic(unsi
+
+ /* Should check here against the e820 map to avoid double free */
+ #ifdef CONFIG_NUMA
+- reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT);
++ nid = phys_to_nid(phys);
++ next_nid = phys_to_nid(phys + len - 1);
++ if (nid == next_nid)
++ reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT);
++ else
++ reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
+ #else
+ reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
+ #endif
++
+ #ifndef CONFIG_XEN
+ if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
+ dma_reserve += len / PAGE_SIZE;
+@@ -1149,6 +1311,10 @@ const char *arch_vma_name(struct vm_area
+ /*
+ * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
+ */
++static long __meminitdata addr_start, addr_end;
++static void __meminitdata *p_start, *p_end;
++static int __meminitdata node_start;
++
+ int __meminit
+ vmemmap_populate(struct page *start_page, unsigned long size, int node)
{
- int pirq;
+@@ -1183,12 +1349,32 @@ vmemmap_populate(struct page *start_page
+ PAGE_KERNEL_LARGE);
+ set_pmd(pmd, __pmd_ma(__pte_val(entry)));
-@@ -617,6 +617,10 @@ void pci_disable_msi(struct pci_dev* dev
- pci_intx_for_msi(dev, 1);
- dev->msi_enabled = 0;
+- printk(KERN_DEBUG " [%lx-%lx] PMD ->%p on node %d\n",
+- addr, addr + PMD_SIZE - 1, p, node);
++ /* check to see if we have contiguous blocks */
++ if (p_end != p || node_start != node) {
++ if (p_start)
++ printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
++ addr_start, addr_end-1, p_start, p_end-1, node_start);
++ addr_start = addr;
++ node_start = node;
++ p_start = p;
++ }
++ addr_end = addr + PMD_SIZE;
++ p_end = p + PMD_SIZE;
+ } else {
+ vmemmap_verify((pte_t *)pmd, node, addr, next);
+ }
+ }
+ return 0;
}
-+void pci_disable_msi(struct pci_dev* dev)
++
++void __meminit vmemmap_populate_print_last(void)
+{
-+ pci_msi_shutdown(dev);
++ if (p_start) {
++ printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
++ addr_start, addr_end-1, p_start, p_end-1, node_start);
++ p_start = NULL;
++ p_end = NULL;
++ node_start = 0;
++ }
+}
- EXPORT_SYMBOL(pci_disable_msi);
+ #endif
+--- sle11-2009-05-14.orig/arch/x86/mm/ioremap-xen.c 2009-03-16 16:33:40.000000000 +0100
++++ sle11-2009-05-14/arch/x86/mm/ioremap-xen.c 2009-03-16 16:38:05.000000000 +0100
+@@ -20,14 +20,11 @@
+ #include <asm/pgtable.h>
+ #include <asm/tlbflush.h>
+ #include <asm/pgalloc.h>
++#include <asm/pat.h>
- /**
-@@ -719,7 +723,7 @@ int pci_enable_msix(struct pci_dev* dev,
- EXPORT_SYMBOL(pci_enable_msix);
+-enum ioremap_mode {
+- IOR_MODE_UNCACHED,
+- IOR_MODE_CACHED,
+-};
+-
+-#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
++#ifdef CONFIG_X86_64
- extern void pci_frontend_disable_msix(struct pci_dev* dev);
--void pci_disable_msix(struct pci_dev* dev)
-+void pci_msix_shutdown(struct pci_dev* dev)
++#ifndef CONFIG_XEN
+ unsigned long __phys_addr(unsigned long x)
{
- if (!pci_msi_enable)
- return;
-@@ -756,6 +760,10 @@ void pci_disable_msix(struct pci_dev* de
- pci_intx_for_msi(dev, 1);
- dev->msix_enabled = 0;
+ if (x >= __START_KERNEL_map)
+@@ -35,6 +32,19 @@ unsigned long __phys_addr(unsigned long
+ return x - PAGE_OFFSET;
}
-+void pci_disable_msix(struct pci_dev* dev)
+ EXPORT_SYMBOL(__phys_addr);
++#endif
++
++static inline int phys_addr_valid(unsigned long addr)
+{
-+ pci_msix_shutdown(dev);
++ return addr < (1UL << boot_cpu_data.x86_phys_bits);
++}
++
++#else
++
++static inline int phys_addr_valid(unsigned long addr)
++{
++ return 1;
+}
- EXPORT_SYMBOL(pci_disable_msix);
- /**
---- a/drivers/video/Kconfig
-+++ b/drivers/video/Kconfig
-@@ -2047,7 +2047,7 @@ config FB_VIRTUAL
+ #endif
- config XEN_FBDEV_FRONTEND
- tristate "Xen virtual frame buffer support"
-- depends on FB && XEN
-+ depends on FB && PARAVIRT_XEN
- select FB_SYS_FILLRECT
- select FB_SYS_COPYAREA
- select FB_SYS_IMAGEBLIT
---- a/drivers/video/xen-fbfront.c
-+++ b/drivers/video/xen-fbfront.c
-@@ -670,7 +670,6 @@ static struct xenbus_device_id xenfb_ids
+@@ -92,7 +102,8 @@ static int __direct_remap_pfn_range(stru
+ * Fill in the machine address: PTE ptr is done later by
+ * apply_to_page_range().
+ */
+- v->val = __pte_val(pfn_pte_ma(mfn, prot)) | _PAGE_IO;
++ pgprot_val(prot) |= _PAGE_IO;
++ v->val = __pte_val(pte_mkspecial(pfn_pte_ma(mfn, prot)));
- static struct xenbus_driver xenfb = {
- .name = "vfb",
-- .owner = THIS_MODULE,
- .ids = xenfb_ids,
- .probe = xenfb_probe,
- .remove = xenfb_remove,
---- a/drivers/xen/blkfront/blkfront.c
-+++ b/drivers/xen/blkfront/blkfront.c
-@@ -282,7 +282,9 @@ static void backend_changed(struct xenbu
- break;
+ mfn++;
+ address += PAGE_SIZE;
+@@ -189,10 +200,9 @@ int touch_pte_range(struct mm_struct *mm
- case XenbusStateClosing:
-- bd = bdget(info->dev);
-+ if (!info->gd)
-+ break;
-+ bd = bdget_disk(info->gd, 0);
- if (bd == NULL)
- xenbus_dev_fatal(dev, -ENODEV, "bdget failed");
+ EXPORT_SYMBOL(touch_pte_range);
---- a/drivers/xen/blkfront/block.h
-+++ b/drivers/xen/blkfront/block.h
-@@ -96,7 +96,6 @@ struct blk_shadow {
- struct blkfront_info
+-#ifdef CONFIG_X86_32
+ int page_is_ram(unsigned long pagenr)
{
- struct xenbus_device *xbdev;
-- dev_t dev;
- struct gendisk *gd;
- int vdevice;
- blkif_vdev_t handle;
---- a/drivers/xen/blkfront/vbd.c
-+++ b/drivers/xen/blkfront/vbd.c
-@@ -246,17 +246,32 @@ xlvbd_init_blk_queue(struct gendisk *gd,
+- unsigned long addr, end;
++ resource_size_t addr, end;
+ int i;
+
+ #ifndef CONFIG_XEN
+@@ -228,31 +238,51 @@ int page_is_ram(unsigned long pagenr)
+ }
return 0;
}
+-#endif
--static int
--xlvbd_alloc_gendisk(int major, int minor, blkif_sector_t capacity, int vdevice,
-- u16 vdisk_info, u16 sector_size,
-- struct blkfront_info *info)
-+int
-+xlvbd_add(blkif_sector_t capacity, int vdevice, u16 vdisk_info,
-+ u16 sector_size, struct blkfront_info *info)
+ /*
+ * Fix up the linear direct mapping of the kernel to avoid cache attribute
+ * conflicts.
+ */
+ static int ioremap_change_attr(unsigned long vaddr, unsigned long size,
+- enum ioremap_mode mode)
++ unsigned long prot_val)
{
-+ int major, minor;
- struct gendisk *gd;
- struct xlbd_major_info *mi;
- int nr_minors = 1;
- int err = -ENODEV;
- unsigned int offset;
-
-+ if ((vdevice>>EXT_SHIFT) > 1) {
-+ /* this is above the extended range; something is wrong */
-+ printk(KERN_WARNING "blkfront: vdevice 0x%x is above the extended range; ignoring\n", vdevice);
-+ return -ENODEV;
-+ }
-+
-+ if (!VDEV_IS_EXTENDED(vdevice)) {
-+ major = BLKIF_MAJOR(vdevice);
-+ minor = BLKIF_MINOR(vdevice);
-+ }
-+ else {
-+ major = 202;
-+ minor = BLKIF_MINOR_EXT(vdevice);
-+ }
-+
- BUG_ON(info->gd != NULL);
- BUG_ON(info->mi != NULL);
- BUG_ON(info->rq != NULL);
-@@ -337,41 +352,6 @@ xlvbd_alloc_gendisk(int major, int minor
- return err;
- }
+ unsigned long nrpages = size >> PAGE_SHIFT;
+ int err;
--int
--xlvbd_add(blkif_sector_t capacity, int vdevice, u16 vdisk_info,
-- u16 sector_size, struct blkfront_info *info)
--{
-- struct block_device *bd;
-- int err = 0;
-- int major, minor;
--
-- if ((vdevice>>EXT_SHIFT) > 1) {
-- /* this is above the extended range; something is wrong */
-- printk(KERN_WARNING "blkfront: vdevice 0x%x is above the extended range; ignoring\n", vdevice);
-- return -ENODEV;
-- }
--
-- if (!VDEV_IS_EXTENDED(vdevice)) {
-- major = BLKIF_MAJOR(vdevice);
-- minor = BLKIF_MINOR(vdevice);
-- }
-- else {
-- major = 202;
-- minor = BLKIF_MINOR_EXT(vdevice);
-- }
--
-- info->dev = MKDEV(major, minor);
-- bd = bdget(info->dev);
-- if (bd == NULL)
-- return -ENODEV;
--
-- err = xlvbd_alloc_gendisk(major, minor, capacity, vdevice, vdisk_info,
-- sector_size, info);
--
-- bdput(bd);
-- return err;
--}
--
- void
- xlvbd_del(struct blkfront_info *info)
- {
---- a/drivers/xen/blktap/blktap.c
-+++ b/drivers/xen/blktap/blktap.c
-@@ -111,6 +111,7 @@ typedef struct tap_blkif {
- unsigned long mode; /*current switching mode */
- int minor; /*Minor number for tapdisk device */
- pid_t pid; /*tapdisk process id */
-+ struct pid_namespace *pid_ns; /*... and its corresponding namespace */
- enum { RUNNING, CLEANSHUTDOWN } status; /*Detect a clean userspace
- shutdown */
- unsigned long *idx_map; /*Record the user ring id to kern
-@@ -295,16 +296,14 @@ static inline int OFFSET_TO_SEG(int offs
- * BLKTAP VM OPS
- */
+- switch (mode) {
+- case IOR_MODE_UNCACHED:
++ switch (prot_val) {
++ case _PAGE_CACHE_UC:
+ default:
+- err = set_memory_uc(vaddr, nrpages);
++ err = _set_memory_uc(vaddr, nrpages);
++ break;
++ case _PAGE_CACHE_WC:
++ err = _set_memory_wc(vaddr, nrpages);
+ break;
+- case IOR_MODE_CACHED:
+- err = set_memory_wb(vaddr, nrpages);
++ case _PAGE_CACHE_WB:
++ err = _set_memory_wb(vaddr, nrpages);
+ break;
+ }
--static struct page *blktap_nopage(struct vm_area_struct *vma,
-- unsigned long address,
-- int *type)
-+static int blktap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+ return err;
+ }
+
++int ioremap_check_change_attr(unsigned long mfn, unsigned long size,
++ unsigned long prot_val)
++{
++ unsigned long sz;
++ int rc;
++
++ for (sz = rc = 0; sz < size && !rc; ++mfn, sz += PAGE_SIZE) {
++ unsigned long pfn = mfn_to_local_pfn(mfn);
++
++ if (pfn >= max_pfn_mapped)
++ continue;
++ rc = ioremap_change_attr((unsigned long)__va(pfn << PAGE_SHIFT),
++ PAGE_SIZE, prot_val);
++ }
++
++ return rc;
++}
++
+ /*
+ * Remap an arbitrary physical address space into the kernel virtual
+ * address space. Needed when the kernel wants to access high addresses
+@@ -262,12 +292,15 @@ static int ioremap_change_attr(unsigned
+ * have to convert them into an offset in a page-aligned mapping, but the
+ * caller shouldn't need to know that small detail.
+ */
+-static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size,
+- enum ioremap_mode mode)
++static void __iomem *__ioremap_caller(resource_size_t phys_addr,
++ unsigned long size, unsigned long prot_val, void *caller)
{
+- unsigned long mfn, offset, last_addr, vaddr;
++ unsigned long mfn, offset, vaddr;
++ resource_size_t last_addr;
+ struct vm_struct *area;
++ unsigned long new_prot_val;
+ pgprot_t prot;
++ int retval;
+ domid_t domid = DOMID_IO;
+
+ /* Don't allow wraparound or zero size */
+@@ -275,6 +308,13 @@ static void __iomem *__ioremap(resource_
+ if (!size || last_addr < phys_addr)
+ return NULL;
+
++ if (!phys_addr_valid(phys_addr)) {
++ printk(KERN_WARNING "ioremap: invalid physical address %llx\n",
++ (unsigned long long)phys_addr);
++ WARN_ON_ONCE(1);
++ return NULL;
++ }
++
/*
- * if the page has not been mapped in by the driver then return
-- * NOPAGE_SIGBUS to the domain.
-+ * VM_FAULT_SIGBUS to the domain.
+ * Don't remap the low PCI/ISA area, it's always mapped..
*/
+@@ -287,55 +327,86 @@ static void __iomem *__ioremap(resource_
+ for (mfn = PFN_DOWN(phys_addr); mfn < PFN_UP(last_addr); mfn++) {
+ unsigned long pfn = mfn_to_local_pfn(mfn);
-- return NOPAGE_SIGBUS;
-+ return VM_FAULT_SIGBUS;
- }
-
- static pte_t blktap_clear_pte(struct vm_area_struct *vma,
-@@ -390,7 +389,7 @@ static pte_t blktap_clear_pte(struct vm_
- }
+- if (pfn >= max_pfn)
+- continue;
++ if (pfn_valid(pfn)) {
++ if (!PageReserved(pfn_to_page(pfn)))
++ return NULL;
++ domid = DOMID_SELF;
++ }
++ }
++ WARN_ON_ONCE(domid == DOMID_SELF);
- struct vm_operations_struct blktap_vm_ops = {
-- nopage: blktap_nopage,
-+ fault: blktap_fault,
- zap_pte: blktap_clear_pte,
- };
+- domid = DOMID_SELF;
++ /*
++ * Mappings have to be page-aligned
++ */
++ offset = phys_addr & ~PAGE_MASK;
++ phys_addr &= PAGE_MASK;
++ size = PAGE_ALIGN(last_addr+1) - phys_addr;
-@@ -483,9 +482,8 @@ found:
- tapfds[minor] = info;
+- if (pfn >= max_pfn_mapped) /* bogus */
+- continue;
++ retval = reserve_memtype(phys_addr, phys_addr + size,
++ prot_val, &new_prot_val);
++ if (retval) {
++ pr_debug("Warning: reserve_memtype returned %d\n", retval);
++ return NULL;
++ }
- if ((class = get_xen_class()) != NULL)
-- class_device_create(class, NULL,
-- MKDEV(blktap_major, minor), NULL,
-- "blktap%d", minor);
-+ device_create(class, NULL, MKDEV(blktap_major, minor),
-+ "blktap%d", minor);
+- if (pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn)))
++ if (prot_val != new_prot_val) {
++ /*
++ * Do not fallback to certain memory types with certain
++ * requested type:
++ * - request is uc-, return cannot be write-back
++ * - request is uc-, return cannot be write-combine
++ * - request is write-combine, return cannot be write-back
++ */
++ if ((prot_val == _PAGE_CACHE_UC_MINUS &&
++ (new_prot_val == _PAGE_CACHE_WB ||
++ new_prot_val == _PAGE_CACHE_WC)) ||
++ (prot_val == _PAGE_CACHE_WC &&
++ new_prot_val == _PAGE_CACHE_WB)) {
++ pr_debug(
++ "ioremap error for 0x%llx-0x%llx, requested 0x%lx, got 0x%lx\n",
++ (unsigned long long)phys_addr,
++ (unsigned long long)(phys_addr + size),
++ prot_val, new_prot_val);
++ free_memtype(phys_addr, phys_addr + size);
+ return NULL;
++ }
++ prot_val = new_prot_val;
}
- out:
-@@ -527,7 +525,7 @@ void signal_tapdisk(int idx)
- return;
+- switch (mode) {
+- case IOR_MODE_UNCACHED:
++ switch (prot_val) {
++ case _PAGE_CACHE_UC:
+ default:
+- /*
+- * FIXME: we will use UC MINUS for now, as video fb drivers
+- * depend on it. Upcoming ioremap_wc() will fix this behavior.
+- */
++ prot = PAGE_KERNEL_NOCACHE;
++ break;
++ case _PAGE_CACHE_UC_MINUS:
+ prot = PAGE_KERNEL_UC_MINUS;
+ break;
+- case IOR_MODE_CACHED:
++ case _PAGE_CACHE_WC:
++ prot = PAGE_KERNEL_WC;
++ break;
++ case _PAGE_CACHE_WB:
+ prot = PAGE_KERNEL;
+ break;
+ }
- if (info->pid > 0) {
-- ptask = find_task_by_pid(info->pid);
-+ ptask = find_task_by_pid_ns(info->pid, info->pid_ns);
- if (ptask)
- info->status = CLEANSHUTDOWN;
+ /*
+- * Mappings have to be page-aligned
+- */
+- offset = phys_addr & ~PAGE_MASK;
+- phys_addr &= PAGE_MASK;
+- size = PAGE_ALIGN(last_addr+1) - phys_addr;
+-
+- /*
+ * Ok, go for it..
+ */
+- area = get_vm_area(size, VM_IOREMAP | (mode << 20));
++ area = get_vm_area_caller(size, VM_IOREMAP, caller);
+ if (!area)
+ return NULL;
+ area->phys_addr = phys_addr;
+ vaddr = (unsigned long) area->addr;
+ if (__direct_remap_pfn_range(&init_mm, vaddr, PFN_DOWN(phys_addr),
+ size, prot, domid)) {
++ free_memtype(phys_addr, phys_addr + size);
+ free_vm_area(area);
+ return NULL;
}
-@@ -773,8 +771,9 @@ static int blktap_ioctl(struct inode *in
- {
- if (info) {
- info->pid = (pid_t)arg;
-- DPRINTK("blktap: pid received %d\n",
-- info->pid);
-+ info->pid_ns = current->nsproxy->pid_ns;
-+ DPRINTK("blktap: pid received %p:%d\n",
-+ info->pid_ns, info->pid);
- }
- return 0;
+
+- if (ioremap_change_attr(vaddr, size, mode) < 0) {
+- iounmap((void __iomem *) vaddr);
++ if (ioremap_change_attr(vaddr, size, prot_val) < 0) {
++ free_memtype(phys_addr, phys_addr + size);
++ vunmap(area->addr);
+ return NULL;
}
-@@ -1687,9 +1686,7 @@ static int __init blkif_init(void)
- * We only create the device when a request of a new device is
- * made.
- */
-- class_device_create(class, NULL,
-- MKDEV(blktap_major, 0), NULL,
-- "blktap0");
-+ device_create(class, NULL, MKDEV(blktap_major, 0), "blktap0");
- } else {
- /* this is bad, but not fatal */
- WPRINTK("blktap: sysfs xen_class not created\n");
---- a/drivers/xen/char/mem.c
-+++ b/drivers/xen/char/mem.c
-@@ -33,6 +33,27 @@ static inline int uncached_access(struct
- return 0;
+
+@@ -365,16 +436,72 @@ static void __iomem *__ioremap(resource_
+ */
+ void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size)
+ {
+- return __ioremap(phys_addr, size, IOR_MODE_UNCACHED);
++ /*
++ * Ideally, this should be:
++ * pat_wc_enabled ? _PAGE_CACHE_UC : _PAGE_CACHE_UC_MINUS;
++ *
++ * Till we fix all X drivers to use ioremap_wc(), we will use
++ * UC MINUS.
++ */
++ unsigned long val = _PAGE_CACHE_UC_MINUS;
++
++ return __ioremap_caller(phys_addr, size, val,
++ __builtin_return_address(0));
+ }
+ EXPORT_SYMBOL(ioremap_nocache);
+
++/**
++ * ioremap_wc - map memory into CPU space write combined
++ * @phys_addr: bus address of the memory
++ * @size: size of the resource to map
++ *
++ * This version of ioremap ensures that the memory is marked write combining.
++ * Write combining allows faster writes to some hardware devices.
++ *
++ * Must be freed with iounmap.
++ */
++void __iomem *ioremap_wc(unsigned long phys_addr, unsigned long size)
++{
++ if (pat_wc_enabled)
++ return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC,
++ __builtin_return_address(0));
++ else
++ return ioremap_nocache(phys_addr, size);
++}
++EXPORT_SYMBOL(ioremap_wc);
++
+ void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
+ {
+- return __ioremap(phys_addr, size, IOR_MODE_CACHED);
++ return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WB,
++ __builtin_return_address(0));
}
+ EXPORT_SYMBOL(ioremap_cache);
-+static inline int range_is_allowed(unsigned long pfn, unsigned long size)
++#ifndef CONFIG_XEN
++static void __iomem *ioremap_default(resource_size_t phys_addr,
++ unsigned long size)
+{
-+#ifdef CONFIG_NONPROMISC_DEVMEM
-+ u64 from = ((u64)pfn) << PAGE_SHIFT;
-+ u64 to = from + size;
-+ u64 cursor = from;
++ unsigned long flags;
++ void *ret;
++ int err;
+
-+ while (cursor < to) {
-+ if (!devmem_is_allowed(pfn)) {
-+ printk(KERN_INFO
-+ "Program %s tried to access /dev/mem between %Lx->%Lx.\n",
-+ current->comm, from, to);
-+ return 0;
-+ }
-+ cursor += PAGE_SIZE;
-+ pfn++;
-+ }
-+#endif
-+ return 1;
-+}
++ /*
++ * - WB for WB-able memory and no other conflicting mappings
++ * - UC_MINUS for non-WB-able memory with no other conflicting mappings
++ * - Inherit from conflicting mappings otherwise
++ */
++ err = reserve_memtype(phys_addr, phys_addr + size, -1, &flags);
++ if (err < 0)
++ return NULL;
+
- /*
- * This funcion reads the *physical* memory. The f_pos points directly to the
- * memory location.
-@@ -55,6 +76,9 @@ static ssize_t read_mem(struct file * fi
-
- sz = min_t(unsigned long, sz, count);
-
-+ if (!range_is_allowed(p >> PAGE_SHIFT, count))
-+ return -EPERM;
++ ret = (void *) __ioremap_caller(phys_addr, size, flags,
++ __builtin_return_address(0));
+
- v = ioremap(p, sz);
- if (IS_ERR(v) || v == NULL) {
- /*
-@@ -103,6 +127,9 @@ static ssize_t write_mem(struct file * f
++ free_memtype(phys_addr, phys_addr + size);
++ return (void __iomem *)ret;
++}
++#endif
++
+ /**
+ * iounmap - Free a IO remapping
+ * @addr: virtual address from ioremap_*
+@@ -417,15 +544,7 @@ void iounmap(volatile void __iomem *addr
+ return;
+ }
- sz = min_t(unsigned long, sz, count);
+- if ((p->flags >> 20) != IOR_MODE_CACHED) {
+- unsigned long n = get_vm_area_size(p) >> PAGE_SHIFT;
+- unsigned long mfn = p->phys_addr;
+- unsigned long va = (unsigned long)addr;
+-
+- for (; n > 0; n--, mfn++, va += PAGE_SIZE)
+- if (mfn_to_local_pfn(mfn) < max_pfn)
+- set_memory_wb(va, 1);
+- }
++ free_memtype(p->phys_addr, p->phys_addr + get_vm_area_size(p));
-+ if (!range_is_allowed(p >> PAGE_SHIFT, sz))
-+ return -EPERM;
-+
- v = ioremap(p, sz);
- if (v == NULL)
- break;
-@@ -131,6 +158,23 @@ static ssize_t write_mem(struct file * f
+ /* Finally remove it */
+ o = remove_vm_area((void *)addr);
+@@ -434,6 +553,37 @@ void iounmap(volatile void __iomem *addr
}
+ EXPORT_SYMBOL(iounmap);
- #ifndef ARCH_HAS_DEV_MEM_MMAP_MEM
-+static void mmap_mem_open(struct vm_area_struct *vma)
-+{
-+ map_devmem(vma->vm_pgoff, vma->vm_end - vma->vm_start,
-+ vma->vm_page_prot);
-+}
-+
-+static void mmap_mem_close(struct vm_area_struct *vma)
++#ifndef CONFIG_XEN
++/*
++ * Convert a physical pointer to a virtual kernel pointer for /dev/mem
++ * access
++ */
++void *xlate_dev_mem_ptr(unsigned long phys)
+{
-+ unmap_devmem(vma->vm_pgoff, vma->vm_end - vma->vm_start,
-+ vma->vm_page_prot);
-+}
++ void *addr;
++ unsigned long start = phys & PAGE_MASK;
+
-+static struct vm_operations_struct mmap_mem_ops = {
-+ .open = mmap_mem_open,
-+ .close = mmap_mem_close
-+};
++ /* If page is RAM, we can use __va. Otherwise ioremap and unmap. */
++ if (page_is_ram(start >> PAGE_SHIFT))
++ return __va(phys);
+
- static int xen_mmap_mem(struct file * file, struct vm_area_struct * vma)
- {
- size_t size = vma->vm_end - vma->vm_start;
-@@ -138,6 +182,15 @@ static int xen_mmap_mem(struct file * fi
- if (uncached_access(file))
- vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
-
-+ if (!range_is_allowed(vma->vm_pgoff, size))
-+ return -EPERM;
++ addr = (void *)ioremap_default(start, PAGE_SIZE);
++ if (addr)
++ addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK));
+
-+ if (!phys_mem_access_prot_allowed(file, vma->vm_pgoff, size,
-+ &vma->vm_page_prot))
-+ return -EINVAL;
++ return addr;
++}
+
-+ vma->vm_ops = &mmap_mem_ops;
++void unxlate_dev_mem_ptr(unsigned long phys, void *addr)
++{
++ if (page_is_ram(phys >> PAGE_SHIFT))
++ return;
+
- /* We want to return the real error code, not EAGAIN. */
- return direct_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
- size, vma->vm_page_prot, DOMID_IO);
---- a/drivers/xen/console/console.c
-+++ b/drivers/xen/console/console.c
-@@ -536,16 +536,18 @@ static int xencons_write(
- return i;
- }
-
--static void xencons_put_char(struct tty_struct *tty, u_char ch)
-+static int xencons_put_char(struct tty_struct *tty, u_char ch)
- {
- unsigned long flags;
-+ int ret;
-
- if (DUMMY_TTY(tty))
-- return;
-+ return 0;
-
- spin_lock_irqsave(&xencons_lock, flags);
-- (void)__xencons_put_char(ch);
-+ ret = __xencons_put_char(ch);
- spin_unlock_irqrestore(&xencons_lock, flags);
-+ return ret;
- }
-
- static void xencons_flush_chars(struct tty_struct *tty)
-@@ -567,7 +569,7 @@ static void xencons_wait_until_sent(stru
- if (DUMMY_TTY(tty))
- return;
-
-- while (DRV(tty->driver)->chars_in_buffer(tty)) {
-+ while (tty_chars_in_buffer(tty)) {
- set_current_state(TASK_INTERRUPTIBLE);
- schedule_timeout(1);
- if (signal_pending(current))
-@@ -616,8 +618,7 @@ static void xencons_close(struct tty_str
-
- tty->closing = 1;
- tty_wait_until_sent(tty, 0);
-- if (DRV(tty->driver)->flush_buffer != NULL)
-- DRV(tty->driver)->flush_buffer(tty);
-+ tty_driver_flush_buffer(tty);
- if (tty->ldisc.flush_buffer != NULL)
- tty->ldisc.flush_buffer(tty);
- tty->closing = 0;
---- a/drivers/xen/core/machine_kexec.c
-+++ b/drivers/xen/core/machine_kexec.c
-@@ -90,6 +90,9 @@ void __init xen_machine_kexec_setup_reso
- xen_hypervisor_res.start = range.start;
- xen_hypervisor_res.end = range.start + range.size - 1;
- xen_hypervisor_res.flags = IORESOURCE_BUSY | IORESOURCE_MEM;
-+#ifdef CONFIG_X86_64
-+ insert_resource(&iomem_resource, &xen_hypervisor_res);
++ iounmap((void __iomem *)((unsigned long)addr & PAGE_MASK));
++ return;
++}
+#endif
++
+ int __initdata early_ioremap_debug;
- /* fill in crashk_res if range is reserved by hypervisor */
-
-@@ -102,6 +105,9 @@ void __init xen_machine_kexec_setup_reso
- if (range.size) {
- crashk_res.start = range.start;
- crashk_res.end = range.start + range.size - 1;
-+#ifdef CONFIG_X86_64
-+ insert_resource(&iomem_resource, &crashk_res);
-+#endif
- }
+ static int __init early_ioremap_debug_setup(char *str)
+@@ -445,8 +595,8 @@ static int __init early_ioremap_debug_se
+ early_param("early_ioremap_debug", early_ioremap_debug_setup);
- /* get physical address of vmcoreinfo */
-@@ -146,11 +152,13 @@ void __init xen_machine_kexec_setup_reso
- return;
- }
+ static __initdata int after_paging_init;
+-static __initdata pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)]
+- __attribute__((aligned(PAGE_SIZE)));
++static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)]
++ __section(.bss.page_aligned);
-+#ifndef CONFIG_X86_64
- void __init xen_machine_kexec_register_resources(struct resource *res)
- {
- request_resource(res, &xen_hypervisor_res);
- machine_kexec_register_resources(res);
+ #ifdef CONFIG_X86_32
+ static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
+@@ -461,8 +611,8 @@ static inline pmd_t * __init early_iorem
}
-+#endif
+ #else
+ #define early_ioremap_pmd early_get_pmd
++#undef make_lowmem_page_readonly
+ #define make_lowmem_page_readonly early_make_page_readonly
+-#define make_lowmem_page_writable make_page_writable
+ #endif
- static void setup_load_arg(xen_kexec_image_t *xki, struct kimage *image)
- {
---- a/drivers/xen/core/machine_reboot.c
-+++ b/drivers/xen/core/machine_reboot.c
-@@ -52,6 +52,14 @@ void machine_power_off(void)
- HYPERVISOR_shutdown(SHUTDOWN_poweroff);
+ static inline pte_t * __init early_ioremap_pte(unsigned long addr)
+@@ -512,7 +662,7 @@ void __init early_ioremap_clear(void)
+ pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
+ pmd_clear(pmd);
+ make_lowmem_page_writable(bm_pte, XENFEAT_writable_page_tables);
+- /* paravirt_release_pt(__pa(bm_pte) >> PAGE_SHIFT); */
++ /* paravirt_release_pte(__pa(bm_pte) >> PAGE_SHIFT); */
+ __flush_tlb_all();
}
-+#ifdef CONFIG_KEXEC
-+#include <asm/reboot.h>
-+void machine_crash_shutdown(struct pt_regs *regs)
-+{
-+ native_machine_crash_shutdown(regs);
-+}
-+#endif
-+
- int reboot_thru_bios = 0; /* for dmi_scan.c */
- EXPORT_SYMBOL(machine_restart);
- EXPORT_SYMBOL(machine_halt);
---- a/drivers/xen/core/smpboot.c
-+++ b/drivers/xen/core/smpboot.c
-@@ -57,17 +57,16 @@ static DEFINE_PER_CPU(int, callfunc_irq)
- static char resched_name[NR_CPUS][15];
- static char callfunc_name[NR_CPUS][15];
+@@ -654,10 +804,11 @@ void __init early_iounmap(void *addr, un
+ unsigned long offset;
+ unsigned int nrpages;
+ enum fixed_addresses idx;
+- unsigned int nesting;
++ int nesting;
--u8 cpu_2_logical_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
-+#ifdef CONFIG_X86_LOCAL_APIC
-+#define set_cpu_to_apicid(cpu, apicid) (per_cpu(x86_cpu_to_apicid, cpu) = (apicid))
-+#else
-+#define set_cpu_to_apicid(cpu, apicid)
-+#endif
+ nesting = --early_ioremap_nested;
+- WARN_ON(nesting < 0);
++ if (WARN_ON(nesting < 0))
++ return;
- DEFINE_PER_CPU(cpumask_t, cpu_sibling_map);
- DEFINE_PER_CPU(cpumask_t, cpu_core_map);
- EXPORT_PER_CPU_SYMBOL(cpu_core_map);
+ if (early_ioremap_debug) {
+ printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr,
+--- sle11-2009-05-14.orig/arch/x86/mm/pageattr-xen.c 2009-03-16 16:37:14.000000000 +0100
++++ sle11-2009-05-14/arch/x86/mm/pageattr-xen.c 2009-03-16 16:38:05.000000000 +0100
+@@ -9,6 +9,8 @@
+ #include <linux/slab.h>
+ #include <linux/mm.h>
+ #include <linux/interrupt.h>
++#include <linux/seq_file.h>
++#include <linux/debugfs.h>
--#if defined(__i386__)
--DEFINE_PER_CPU(u8, x86_cpu_to_apicid) = BAD_APICID;
--EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid);
+ #include <asm/e820.h>
+ #include <asm/processor.h>
+@@ -17,370 +19,7 @@
+ #include <asm/uaccess.h>
+ #include <asm/pgalloc.h>
+ #include <asm/proto.h>
+-#include <asm/mmu_context.h>
+-
+-#ifndef CONFIG_X86_64
+-#define TASK_SIZE64 TASK_SIZE
+-#endif
+-
+-static void _pin_lock(struct mm_struct *mm, int lock) {
+- if (lock)
+- spin_lock(&mm->page_table_lock);
+-#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+- /* While mm->page_table_lock protects us against insertions and
+- * removals of higher level page table pages, it doesn't protect
+- * against updates of pte-s. Such updates, however, require the
+- * pte pages to be in consistent state (unpinned+writable or
+- * pinned+readonly). The pinning and attribute changes, however
+- * cannot be done atomically, which is why such updates must be
+- * prevented from happening concurrently.
+- * Note that no pte lock can ever elsewhere be acquired nesting
+- * with an already acquired one in the same mm, or with the mm's
+- * page_table_lock already acquired, as that would break in the
+- * non-split case (where all these are actually resolving to the
+- * one page_table_lock). Thus acquiring all of them here is not
+- * going to result in dead locks, and the order of acquires
+- * doesn't matter.
+- */
+- {
+- pgd_t *pgd = mm->pgd;
+- unsigned g;
+-
+- for (g = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
+- pud_t *pud;
+- unsigned u;
+-
+- if (pgd_none(*pgd))
+- continue;
+- pud = pud_offset(pgd, 0);
+- for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
+- pmd_t *pmd;
+- unsigned m;
+-
+- if (pud_none(*pud))
+- continue;
+- pmd = pmd_offset(pud, 0);
+- for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
+- spinlock_t *ptl;
+-
+- if (pmd_none(*pmd))
+- continue;
+- ptl = pte_lockptr(0, pmd);
+- if (lock)
+- spin_lock(ptl);
+- else
+- spin_unlock(ptl);
+- }
+- }
+- }
+- }
+-#endif
+- if (!lock)
+- spin_unlock(&mm->page_table_lock);
+-}
+-#define pin_lock(mm) _pin_lock(mm, 1)
+-#define pin_unlock(mm) _pin_lock(mm, 0)
+-
+-#define PIN_BATCH sizeof(void *)
+-static DEFINE_PER_CPU(multicall_entry_t[PIN_BATCH], pb_mcl);
+-
+-static inline unsigned int pgd_walk_set_prot(struct page *page, pgprot_t flags,
+- unsigned int cpu, unsigned int seq)
+-{
+- unsigned long pfn = page_to_pfn(page);
+-
+- if (PageHighMem(page)) {
+- if (pgprot_val(flags) & _PAGE_RW)
+- ClearPagePinned(page);
+- else
+- SetPagePinned(page);
+- } else {
+- MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
+- (unsigned long)__va(pfn << PAGE_SHIFT),
+- pfn_pte(pfn, flags), 0);
+- if (unlikely(++seq == PIN_BATCH)) {
+- if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
+- PIN_BATCH, NULL)))
+- BUG();
+- seq = 0;
+- }
+- }
+-
+- return seq;
+-}
+-
+-static void pgd_walk(pgd_t *pgd_base, pgprot_t flags)
+-{
+- pgd_t *pgd = pgd_base;
+- pud_t *pud;
+- pmd_t *pmd;
+- int g,u,m;
+- unsigned int cpu, seq;
+- multicall_entry_t *mcl;
+-
+- if (xen_feature(XENFEAT_auto_translated_physmap))
+- return;
+-
+- cpu = get_cpu();
+-
+- /*
+- * Cannot iterate up to USER_PTRS_PER_PGD on x86-64 as these pagetables
+- * may not be the 'current' task's pagetables (e.g., current may be
+- * 32-bit, but the pagetables may be for a 64-bit task).
+- * Subtracting 1 from TASK_SIZE64 means the loop limit is correct
+- * regardless of whether TASK_SIZE64 is a multiple of PGDIR_SIZE.
+- */
+- for (g = 0, seq = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
+- if (pgd_none(*pgd))
+- continue;
+- pud = pud_offset(pgd, 0);
+- if (PTRS_PER_PUD > 1) /* not folded */
+- seq = pgd_walk_set_prot(virt_to_page(pud),flags,cpu,seq);
+- for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
+- if (pud_none(*pud))
+- continue;
+- pmd = pmd_offset(pud, 0);
+- if (PTRS_PER_PMD > 1) /* not folded */
+- seq = pgd_walk_set_prot(virt_to_page(pmd),flags,cpu,seq);
+- for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
+- if (pmd_none(*pmd))
+- continue;
+- seq = pgd_walk_set_prot(pmd_page(*pmd),flags,cpu,seq);
+- }
+- }
+- }
+-
+- mcl = per_cpu(pb_mcl, cpu);
+-#ifdef CONFIG_X86_64
+- if (unlikely(seq > PIN_BATCH - 2)) {
+- if (unlikely(HYPERVISOR_multicall_check(mcl, seq, NULL)))
+- BUG();
+- seq = 0;
+- }
+- MULTI_update_va_mapping(mcl + seq,
+- (unsigned long)__user_pgd(pgd_base),
+- pfn_pte(virt_to_phys(__user_pgd(pgd_base))>>PAGE_SHIFT, flags),
+- 0);
+- MULTI_update_va_mapping(mcl + seq + 1,
+- (unsigned long)pgd_base,
+- pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
+- UVMF_TLB_FLUSH);
+- if (unlikely(HYPERVISOR_multicall_check(mcl, seq + 2, NULL)))
+- BUG();
+-#else
+- if (likely(seq != 0)) {
+- MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
+- (unsigned long)pgd_base,
+- pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
+- UVMF_TLB_FLUSH);
+- if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
+- seq + 1, NULL)))
+- BUG();
+- } else if(HYPERVISOR_update_va_mapping((unsigned long)pgd_base,
+- pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
+- UVMF_TLB_FLUSH))
+- BUG();
+-#endif
+-
+- put_cpu();
+-}
+-
+-static void __pgd_pin(pgd_t *pgd)
+-{
+- pgd_walk(pgd, PAGE_KERNEL_RO);
+- kmap_flush_unused();
+- xen_pgd_pin(__pa(pgd)); /* kernel */
+-#ifdef CONFIG_X86_64
+- xen_pgd_pin(__pa(__user_pgd(pgd))); /* user */
+-#endif
+- SetPagePinned(virt_to_page(pgd));
+-}
+-
+-static void __pgd_unpin(pgd_t *pgd)
+-{
+- xen_pgd_unpin(__pa(pgd));
+-#ifdef CONFIG_X86_64
+- xen_pgd_unpin(__pa(__user_pgd(pgd)));
+-#endif
+- pgd_walk(pgd, PAGE_KERNEL);
+- ClearPagePinned(virt_to_page(pgd));
+-}
+-
+-void pgd_test_and_unpin(pgd_t *pgd)
+-{
+- if (PagePinned(virt_to_page(pgd)))
+- __pgd_unpin(pgd);
+-}
+-
+-void mm_pin(struct mm_struct *mm)
+-{
+- if (xen_feature(XENFEAT_writable_page_tables))
+- return;
+-
+- pin_lock(mm);
+- __pgd_pin(mm->pgd);
+- pin_unlock(mm);
+-}
+-
+-void mm_unpin(struct mm_struct *mm)
+-{
+- if (xen_feature(XENFEAT_writable_page_tables))
+- return;
+-
+- pin_lock(mm);
+- __pgd_unpin(mm->pgd);
+- pin_unlock(mm);
+-}
+-
+-void mm_pin_all(void)
+-{
+- struct page *page;
+- unsigned long flags;
+-
+- if (xen_feature(XENFEAT_writable_page_tables))
+- return;
+-
+- /*
+- * Allow uninterrupted access to the pgd_list. Also protects
+- * __pgd_pin() by disabling preemption.
+- * All other CPUs must be at a safe point (e.g., in stop_machine
+- * or offlined entirely).
+- */
+- spin_lock_irqsave(&pgd_lock, flags);
+- list_for_each_entry(page, &pgd_list, lru) {
+- if (!PagePinned(page))
+- __pgd_pin((pgd_t *)page_address(page));
+- }
+- spin_unlock_irqrestore(&pgd_lock, flags);
+-}
+-
+-void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
+-{
+- if (!PagePinned(virt_to_page(mm->pgd)))
+- mm_pin(mm);
+-}
+-
+-void arch_exit_mmap(struct mm_struct *mm)
+-{
+- struct task_struct *tsk = current;
+-
+- task_lock(tsk);
+-
+- /*
+- * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
+- * *much* faster this way, as no tlb flushes means bigger wrpt batches.
+- */
+- if (tsk->active_mm == mm) {
+- tsk->active_mm = &init_mm;
+- atomic_inc(&init_mm.mm_count);
+-
+- switch_mm(mm, &init_mm, tsk);
+-
+- atomic_dec(&mm->mm_count);
+- BUG_ON(atomic_read(&mm->mm_count) == 0);
+- }
+-
+- task_unlock(tsk);
+-
+- if (PagePinned(virt_to_page(mm->pgd))
+- && atomic_read(&mm->mm_count) == 1
+- && !mm->context.has_foreign_mappings)
+- mm_unpin(mm);
+-}
+-
+-static void _pte_free(struct page *page, unsigned int order)
+-{
+- BUG_ON(order);
+- __pte_free(page);
+-}
+-
+-pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
+-{
+- struct page *pte;
+-
+-#ifdef CONFIG_HIGHPTE
+- pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
+-#else
+- pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
-#endif
+- if (pte) {
+- pgtable_page_ctor(pte);
+- SetPageForeign(pte, _pte_free);
+- init_page_count(pte);
+- }
+- return pte;
+-}
-
- void __init prefill_possible_map(void)
- {
- int i, rc;
-@@ -158,7 +157,7 @@ static int __cpuinit xen_smp_intr_init(u
- }
-
- #ifdef CONFIG_HOTPLUG_CPU
--static void xen_smp_intr_exit(unsigned int cpu)
-+static void __cpuexit xen_smp_intr_exit(unsigned int cpu)
- {
- if (cpu != 0)
- local_teardown_timer(cpu);
-@@ -267,8 +266,7 @@ void __init smp_prepare_cpus(unsigned in
- boot_cpu_data.apicid = apicid;
- cpu_data(0) = boot_cpu_data;
-
-- cpu_2_logical_apicid[0] = apicid;
-- per_cpu(x86_cpu_to_apicid, 0) = apicid;
-+ set_cpu_to_apicid(0, apicid);
-
- current_thread_info()->cpu = 0;
-
-@@ -323,8 +321,7 @@ void __init smp_prepare_cpus(unsigned in
- cpu_data(cpu).cpu_index = cpu;
- cpu_data(cpu).apicid = apicid;
-
-- cpu_2_logical_apicid[cpu] = apicid;
-- per_cpu(x86_cpu_to_apicid, cpu) = apicid;
-+ set_cpu_to_apicid(cpu, apicid);
-
- #ifdef __x86_64__
- cpu_pda(cpu)->pcurrent = idle;
-@@ -379,7 +376,7 @@ static int __init initialize_cpu_present
- }
- core_initcall(initialize_cpu_present_map);
-
--int __cpu_disable(void)
-+int __cpuexit __cpu_disable(void)
- {
- cpumask_t map = cpu_online_map;
- unsigned int cpu = smp_processor_id();
-@@ -396,7 +393,7 @@ int __cpu_disable(void)
- return 0;
- }
-
--void __cpu_die(unsigned int cpu)
-+void __cpuexit __cpu_die(unsigned int cpu)
- {
- while (HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL)) {
- current->state = TASK_UNINTERRUPTIBLE;
---- a/drivers/xen/core/xen_proc.c
-+++ b/drivers/xen/core/xen_proc.c
-@@ -8,7 +8,7 @@ static struct proc_dir_entry *xen_base;
- struct proc_dir_entry *create_xen_proc_entry(const char *name, mode_t mode)
- {
- if ( xen_base == NULL )
-- if ( (xen_base = proc_mkdir("xen", &proc_root)) == NULL )
-+ if ( (xen_base = proc_mkdir("xen", NULL)) == NULL )
- panic("Couldn't create /proc/xen");
- return create_proc_entry(name, mode, xen_base);
- }
---- a/drivers/xen/fbfront/xenfb.c
-+++ b/drivers/xen/fbfront/xenfb.c
-@@ -94,7 +94,7 @@ struct xenfb_info
- * only mappings. The former creates unfaulted pages. Preserves
- * invariant. The latter removes pages. Preserves invariant.
- *
-- * 3. Holding both locks: xenfb_vm_nopage(). Extends the dirty
-+ * 3. Holding both locks: xenfb_vm_fault(). Extends the dirty
- * rectangle and updates mappings consistently. Preserves
- * invariant.
- *
-@@ -113,13 +113,13 @@ struct xenfb_info
- *
- * But FIXME: the invariant is too weak. It misses that the fault
- * record in mappings must be consistent with the mapping of pages in
-- * the associated address space! do_no_page() updates the PTE after
-- * xenfb_vm_nopage() returns, i.e. outside the critical region. This
-+ * the associated address space! __do_fault() updates the PTE after
-+ * xenfb_vm_fault() returns, i.e. outside the critical region. This
- * allows the following race:
- *
- * X writes to some address in the Xen frame buffer
-- * Fault - call do_no_page()
-- * call xenfb_vm_nopage()
-+ * Fault - call __do_fault()
-+ * call xenfb_vm_fault()
- * grab mm_lock
- * map->faults++;
- * release mm_lock
-@@ -386,18 +386,17 @@ static void xenfb_vm_close(struct vm_are
- mutex_unlock(&info->mm_lock);
- }
-
--static struct page *xenfb_vm_nopage(struct vm_area_struct *vma,
-- unsigned long vaddr, int *type)
-+static int xenfb_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
- {
- struct xenfb_mapping *map = vma->vm_private_data;
- struct xenfb_info *info = map->info;
-- int pgnr = (vaddr - vma->vm_start) >> PAGE_SHIFT;
-+ int pgnr = ((long)vmf->virtual_address - vma->vm_start) >> PAGE_SHIFT;
- unsigned long flags;
- struct page *page;
- int y1, y2;
-
- if (pgnr >= info->nr_pages)
-- return NOPAGE_SIGBUS;
-+ return VM_FAULT_SIGBUS;
-
- mutex_lock(&info->mm_lock);
- spin_lock_irqsave(&info->dirty_lock, flags);
-@@ -413,16 +412,15 @@ static struct page *xenfb_vm_nopage(stru
- spin_unlock_irqrestore(&info->dirty_lock, flags);
- mutex_unlock(&info->mm_lock);
-
-- if (type)
-- *type = VM_FAULT_MINOR;
-+ vmf->page = page;
-
-- return page;
-+ return VM_FAULT_MINOR;
- }
-
- static struct vm_operations_struct xenfb_vm_ops = {
- .open = xenfb_vm_open,
- .close = xenfb_vm_close,
-- .nopage = xenfb_vm_nopage,
-+ .fault = xenfb_vm_fault,
- };
-
- static int xenfb_mmap(struct fb_info *fb_info, struct vm_area_struct *vma)
---- a/drivers/xen/gntdev/gntdev.c
-+++ b/drivers/xen/gntdev/gntdev.c
-@@ -392,7 +392,7 @@ nomem_out:
- static int __init gntdev_init(void)
- {
- struct class *class;
-- struct class_device *device;
-+ struct device *device;
-
- if (!is_running_on_xen()) {
- printk(KERN_ERR "You must be running Xen to use gntdev\n");
-@@ -417,8 +417,8 @@ static int __init gntdev_init(void)
- return 0;
- }
-
-- device = class_device_create(class, NULL, MKDEV(gntdev_major, 0),
-- NULL, GNTDEV_NAME);
-+ device = device_create(class, NULL, MKDEV(gntdev_major, 0),
-+ GNTDEV_NAME);
- if (IS_ERR(device)) {
- printk(KERN_ERR "Error creating gntdev device in xen_class\n");
- printk(KERN_ERR "gntdev created with major number = %d\n",
-@@ -435,7 +435,7 @@ static void __exit gntdev_exit(void)
- {
- struct class *class;
- if ((class = get_xen_class()) != NULL)
-- class_device_destroy(class, MKDEV(gntdev_major, 0));
-+ device_destroy(class, MKDEV(gntdev_major, 0));
- unregister_chrdev(gntdev_major, GNTDEV_NAME);
- }
-
---- a/drivers/xen/Kconfig
-+++ b/drivers/xen/Kconfig
-@@ -2,8 +2,6 @@
- # This Kconfig describe xen options
- #
-
--mainmenu "Xen Configuration"
+-void __pte_free(pgtable_t pte)
+-{
+- if (!PageHighMem(pte)) {
+- unsigned long va = (unsigned long)page_address(pte);
+- unsigned int level;
+- pte_t *ptep = lookup_address(va, &level);
+-
+- BUG_ON(!ptep || level != PG_LEVEL_4K || !pte_present(*ptep));
+- if (!pte_write(*ptep)
+- && HYPERVISOR_update_va_mapping(va,
+- mk_pte(pte, PAGE_KERNEL),
+- 0))
+- BUG();
+- } else
+-#ifdef CONFIG_HIGHPTE
+- ClearPagePinned(pte);
+-#else
+- BUG();
+-#endif
+-
+- ClearPageForeign(pte);
+- init_page_count(pte);
+- pgtable_page_dtor(pte);
+- __free_page(pte);
+-}
+-
+-#if PAGETABLE_LEVELS >= 3
+-static void _pmd_free(struct page *page, unsigned int order)
+-{
+- BUG_ON(order);
+- __pmd_free(page);
+-}
+-
+-pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address)
+-{
+- struct page *pmd;
+-
+- pmd = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
+- if (!pmd)
+- return NULL;
+- SetPageForeign(pmd, _pmd_free);
+- init_page_count(pmd);
+- return page_address(pmd);
+-}
+-
+-void __pmd_free(pgtable_t pmd)
+-{
+- unsigned long va = (unsigned long)page_address(pmd);
+- unsigned int level;
+- pte_t *ptep = lookup_address(va, &level);
+-
+- BUG_ON(!ptep || level != PG_LEVEL_4K || !pte_present(*ptep));
+- if (!pte_write(*ptep)
+- && HYPERVISOR_update_va_mapping(va, mk_pte(pmd, PAGE_KERNEL), 0))
+- BUG();
+-
+- ClearPageForeign(pmd);
+- init_page_count(pmd);
+- __free_page(pmd);
+-}
+-#endif
-
- config XEN
- bool
-
---- a/drivers/xen/Makefile
-+++ b/drivers/xen/Makefile
-@@ -1,5 +1,8 @@
--obj-$(CONFIG_PARAVIRT_XEN) += grant-table.o
-+obj-$(CONFIG_PARAVIRT_XEN) += grant-table.o features.o events.o
-+xen-xencomm-$(CONFIG_PARAVIRT_XEN) := xencomm.o
-+xen-balloon-$(CONFIG_PARAVIRT_XEN) := balloon.o
+-/* blktap and gntdev need this, as otherwise they would implicitly (and
+- * needlessly, as they never use it) reference init_mm. */
+-pte_t xen_ptep_get_and_clear_full(struct vm_area_struct *vma,
+- unsigned long addr, pte_t *ptep, int full)
+-{
+- return ptep_get_and_clear_full(vma->vm_mm, addr, ptep, full);
+-}
+-EXPORT_SYMBOL_GPL(xen_ptep_get_and_clear_full);
++#include <asm/pat.h>
-+xen-balloon-$(CONFIG_XEN) := balloon/
- obj-$(CONFIG_XEN) += core/
- obj-$(CONFIG_XEN) += console/
- obj-$(CONFIG_XEN) += evtchn/
-@@ -7,7 +10,8 @@ obj-y += xenbus/
- obj-$(CONFIG_XEN) += char/
+ /*
+ * The current flushing context - we pass it instead of 5 arguments:
+@@ -392,6 +31,7 @@ struct cpa_data {
+ int numpages;
+ int flushtlb;
+ unsigned long pfn;
++ unsigned force_split : 1;
+ };
- obj-$(CONFIG_XEN) += util.o
--obj-$(CONFIG_XEN_BALLOON) += balloon/
-+obj-$(CONFIG_XEN_XENCOMM) += $(xen-xencomm-y)
-+obj-$(CONFIG_XEN_BALLOON) += $(xen-balloon-y)
- obj-$(CONFIG_XEN_BLKDEV_BACKEND) += blkback/
- obj-$(CONFIG_XEN_BLKDEV_TAP) += blktap/
- obj-$(CONFIG_XEN_NETDEV_BACKEND) += netback/
---- a/drivers/xen/netfront/netfront.c
-+++ b/drivers/xen/netfront/netfront.c
-@@ -1464,8 +1464,7 @@ err:
- }
- }
+ #ifdef CONFIG_X86_64
+@@ -637,6 +277,9 @@ try_preserve_large_page(pte_t *kpte, uns
+ int i, do_split = 1;
+ unsigned int level;
-- while ((skb = __skb_dequeue(&errq)))
-- kfree_skb(skb);
-+ __skb_queue_purge(&errq);
++ if (cpa->force_split)
++ return 1;
++
+ spin_lock_irqsave(&pgd_lock, flags);
+ /*
+ * Check for races, another CPU might have split this page
+@@ -856,9 +499,7 @@ static int split_large_page(pte_t *kpte,
+ goto out_unlock;
- while ((skb = __skb_dequeue(&rxq)) != NULL) {
- struct page *page = NETFRONT_SKB_CB(skb)->page;
-@@ -1630,8 +1629,7 @@ static void netif_release_rx_bufs_flip(s
- }
- }
+ pbase = (pte_t *)page_address(base);
+-#ifdef CONFIG_X86_32
+- paravirt_alloc_pt(&init_mm, page_to_pfn(base));
+-#endif
++ paravirt_alloc_pte(&init_mm, page_to_pfn(base));
+ ref_prot = pte_pgprot(pte_clrhuge(*kpte));
-- while ((skb = __skb_dequeue(&free_list)) != NULL)
-- dev_kfree_skb(skb);
-+ __skb_queue_purge(&free_list);
+ #ifdef CONFIG_X86_64
+@@ -919,7 +560,7 @@ static int __change_page_attr(struct cpa
+ repeat:
+ kpte = lookup_address(address, &level);
+ if (!kpte)
+- return primary ? -EINVAL : 0;
++ return 0;
- spin_unlock_bh(&np->rx_lock);
- }
---- a/drivers/xen/privcmd/privcmd.c
-+++ b/drivers/xen/privcmd/privcmd.c
-@@ -261,15 +261,13 @@ static long privcmd_ioctl(struct file *f
+ old_pte = *kpte;
+ if (!__pte_val(old_pte)) {
+@@ -1078,7 +719,8 @@ static inline int cache_attr(pgprot_t at
}
- #ifndef HAVE_ARCH_PRIVCMD_MMAP
--static struct page *privcmd_nopage(struct vm_area_struct *vma,
-- unsigned long address,
-- int *type)
-+static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+ static int change_page_attr_set_clr(unsigned long addr, int numpages,
+- pgprot_t mask_set, pgprot_t mask_clr)
++ pgprot_t mask_set, pgprot_t mask_clr,
++ int force_split)
{
-- return NOPAGE_SIGBUS;
-+ return VM_FAULT_SIGBUS;
- }
-
- static struct vm_operations_struct privcmd_vm_ops = {
-- .nopage = privcmd_nopage
-+ .fault = privcmd_fault
- };
+ struct cpa_data cpa;
+ int ret, cache, checkalias;
+@@ -1089,7 +731,7 @@ static int change_page_attr_set_clr(unsi
+ */
+ mask_set = canon_pgprot(mask_set);
+ mask_clr = canon_pgprot(mask_clr);
+- if (!pgprot_val(mask_set) && !pgprot_val(mask_clr))
++ if (!pgprot_val(mask_set) && !pgprot_val(mask_clr) && !force_split)
+ return 0;
- static int privcmd_mmap(struct file * file, struct vm_area_struct * vma)
---- a/drivers/xen/xenbus/xenbus_client.c
-+++ b/drivers/xen/xenbus/xenbus_client.c
-@@ -440,7 +440,7 @@ int xenbus_map_ring_valloc(struct xenbus
+ /* Ensure we are PAGE_SIZE aligned */
+@@ -1106,6 +748,7 @@ static int change_page_attr_set_clr(unsi
+ cpa.mask_set = mask_set;
+ cpa.mask_clr = mask_clr;
+ cpa.flushtlb = 0;
++ cpa.force_split = force_split;
- *vaddr = NULL;
+ /* No alias checking for _NX bit modifications */
+ checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
+@@ -1144,26 +787,67 @@ out:
+ static inline int change_page_attr_set(unsigned long addr, int numpages,
+ pgprot_t mask)
+ {
+- return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0));
++ return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0);
+ }
-- area = alloc_vm_area(PAGE_SIZE);
-+ area = xen_alloc_vm_area(PAGE_SIZE);
- if (!area)
- return -ENOMEM;
+ static inline int change_page_attr_clear(unsigned long addr, int numpages,
+ pgprot_t mask)
+ {
+- return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask);
++ return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0);
+ }
-@@ -450,7 +450,7 @@ int xenbus_map_ring_valloc(struct xenbus
- BUG();
+-int set_memory_uc(unsigned long addr, int numpages)
++int _set_memory_uc(unsigned long addr, int numpages)
+ {
++ /*
++ * for now UC MINUS. see comments in ioremap_nocache()
++ */
+ return change_page_attr_set(addr, numpages,
+- __pgprot(_PAGE_PCD));
++ __pgprot(_PAGE_CACHE_UC_MINUS));
++}
++
++int set_memory_uc(unsigned long addr, int numpages)
++{
++ /*
++ * for now UC MINUS. see comments in ioremap_nocache()
++ */
++ if (reserve_memtype(addr, addr + numpages * PAGE_SIZE,
++ _PAGE_CACHE_UC_MINUS, NULL))
++ return -EINVAL;
++
++ return _set_memory_uc(addr, numpages);
+ }
+ EXPORT_SYMBOL(set_memory_uc);
- if (op.status != GNTST_okay) {
-- free_vm_area(area);
-+ xen_free_vm_area(area);
- xenbus_dev_fatal(dev, op.status,
- "mapping in shared page %d from domain %d",
- gnt_ref, dev->otherend_id);
-@@ -549,7 +549,7 @@ int xenbus_unmap_ring_vfree(struct xenbu
- BUG();
+-int set_memory_wb(unsigned long addr, int numpages)
++int _set_memory_wc(unsigned long addr, int numpages)
++{
++ return change_page_attr_set(addr, numpages,
++ __pgprot(_PAGE_CACHE_WC));
++}
++
++int set_memory_wc(unsigned long addr, int numpages)
++{
++ if (!pat_wc_enabled)
++ return set_memory_uc(addr, numpages);
++
++ if (reserve_memtype(addr, addr + numpages * PAGE_SIZE,
++ _PAGE_CACHE_WC, NULL))
++ return -EINVAL;
++
++ return _set_memory_wc(addr, numpages);
++}
++EXPORT_SYMBOL(set_memory_wc);
++
++int _set_memory_wb(unsigned long addr, int numpages)
+ {
+ return change_page_attr_clear(addr, numpages,
+- __pgprot(_PAGE_PCD | _PAGE_PWT));
++ __pgprot(_PAGE_CACHE_MASK));
++}
++
++int set_memory_wb(unsigned long addr, int numpages)
++{
++ free_memtype(addr, addr + numpages * PAGE_SIZE);
++
++ return _set_memory_wb(addr, numpages);
+ }
+ EXPORT_SYMBOL(set_memory_wb);
- if (op.status == GNTST_okay)
-- free_vm_area(area);
-+ xen_free_vm_area(area);
- else
- xenbus_dev_error(dev, op.status,
- "unmapping page at handle %d error %d",
---- a/drivers/xen/xenbus/xenbus_probe.c
-+++ b/drivers/xen/xenbus/xenbus_probe.c
-@@ -173,7 +173,7 @@ static int read_backend_details(struct x
- return read_otherend_details(xendev, "backend-id", "backend");
+@@ -1194,6 +878,12 @@ int set_memory_np(unsigned long addr, in
+ return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_PRESENT));
}
--#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16) && (defined(CONFIG_XEN) || defined(MODULE))
-+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16)
- static int xenbus_uevent_frontend(struct device *dev, struct kobj_uevent_env *env)
++int set_memory_4k(unsigned long addr, int numpages)
++{
++ return change_page_attr_set_clr(addr, numpages, __pgprot(0),
++ __pgprot(0), 1);
++}
++
+ int set_pages_uc(struct page *page, int numpages)
{
- struct xenbus_device *xdev;
-@@ -185,8 +185,10 @@ static int xenbus_uevent_frontend(struct
- return -ENODEV;
+ unsigned long addr = (unsigned long)page_address(page);
+@@ -1303,6 +993,45 @@ void kernel_map_pages(struct page *page,
+ cpa_fill_pool(NULL);
+ }
+
++#ifdef CONFIG_DEBUG_FS
++static int dpa_show(struct seq_file *m, void *v)
++{
++ seq_puts(m, "DEBUG_PAGEALLOC\n");
++ seq_printf(m, "pool_size : %lu\n", pool_size);
++ seq_printf(m, "pool_pages : %lu\n", pool_pages);
++ seq_printf(m, "pool_low : %lu\n", pool_low);
++ seq_printf(m, "pool_used : %lu\n", pool_used);
++ seq_printf(m, "pool_failed : %lu\n", pool_failed);
++
++ return 0;
++}
++
++static int dpa_open(struct inode *inode, struct file *filp)
++{
++ return single_open(filp, dpa_show, NULL);
++}
++
++static const struct file_operations dpa_fops = {
++ .open = dpa_open,
++ .read = seq_read,
++ .llseek = seq_lseek,
++ .release = single_release,
++};
++
++static int __init debug_pagealloc_proc_init(void)
++{
++ struct dentry *de;
++
++ de = debugfs_create_file("debug_pagealloc", 0600, NULL, NULL,
++ &dpa_fops);
++ if (!de)
++ return -ENOMEM;
++
++ return 0;
++}
++__initcall(debug_pagealloc_proc_init);
++#endif
++
+ #ifdef CONFIG_HIBERNATION
- /* stuff we want to pass to /sbin/hotplug */
-+#if defined(CONFIG_XEN) || defined(MODULE)
- add_uevent_var(env, "XENBUS_TYPE=%s", xdev->devicetype);
- add_uevent_var(env, "XENBUS_PATH=%s", xdev->nodename);
+ bool kernel_page_present(struct page *page)
+--- /dev/null 1970-01-01 00:00:00.000000000 +0000
++++ sle11-2009-05-14/arch/x86/mm/pat-xen.c 2009-03-16 16:38:05.000000000 +0100
+@@ -0,0 +1,602 @@
++/*
++ * Handle caching attributes in page tables (PAT)
++ *
++ * Authors: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
++ * Suresh B Siddha <suresh.b.siddha@intel.com>
++ *
++ * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen.
++ */
++
++#include <linux/mm.h>
++#include <linux/kernel.h>
++#include <linux/gfp.h>
++#include <linux/fs.h>
++#include <linux/bootmem.h>
++
++#include <asm/msr.h>
++#include <asm/tlbflush.h>
++#include <asm/processor.h>
++#include <asm/page.h>
++#include <asm/pgtable.h>
++#include <asm/pat.h>
++#include <asm/e820.h>
++#include <asm/cacheflush.h>
++#include <asm/fcntl.h>
++#include <asm/mtrr.h>
++#include <asm/io.h>
++
++#ifdef CONFIG_X86_PAT
++int __read_mostly pat_wc_enabled = 1;
++
++void __cpuinit pat_disable(char *reason)
++{
++ pat_wc_enabled = 0;
++ printk(KERN_INFO "%s\n", reason);
++}
++
++static int __init nopat(char *str)
++{
++ pat_disable("PAT support disabled.");
++ return 0;
++}
++early_param("nopat", nopat);
++#endif
++
++static u64 __read_mostly boot_pat_state;
++
++enum {
++ PAT_UC = 0, /* uncached */
++ PAT_WC = 1, /* Write combining */
++ PAT_WT = 4, /* Write Through */
++ PAT_WP = 5, /* Write Protected */
++ PAT_WB = 6, /* Write Back (default) */
++ PAT_UC_MINUS = 7, /* UC, but can be overridden by MTRR */
++};
++
++#define PAT(x,y) ((u64)PAT_ ## y << ((x)*8))
++
++void pat_init(void)
++{
++ u64 pat;
++
++ if (!pat_wc_enabled)
++ return;
++
++ /* Paranoia check. */
++ if (!cpu_has_pat) {
++ printk(KERN_ERR "PAT enabled, but CPU feature cleared\n");
++ /*
++ * Panic if this happens on the secondary CPU, and we
++ * switched to PAT on the boot CPU. We have no way to
++ * undo PAT.
++ */
++ BUG_ON(boot_pat_state);
++ }
++
++#ifndef CONFIG_XEN
++ /* Set PWT to Write-Combining. All other bits stay the same */
++ /*
++ * PTE encoding used in Linux:
++ * PAT
++ * |PCD
++ * ||PWT
++ * |||
++ * 000 WB _PAGE_CACHE_WB
++ * 001 WC _PAGE_CACHE_WC
++ * 010 UC- _PAGE_CACHE_UC_MINUS
++ * 011 UC _PAGE_CACHE_UC
++ * PAT bit unused
++ */
++ pat = PAT(0,WB) | PAT(1,WC) | PAT(2,UC_MINUS) | PAT(3,UC) |
++ PAT(4,WB) | PAT(5,WC) | PAT(6,UC_MINUS) | PAT(7,UC);
++
++ /* Boot CPU check */
++ if (!boot_pat_state)
++ rdmsrl(MSR_IA32_CR_PAT, boot_pat_state);
++
++ wrmsrl(MSR_IA32_CR_PAT, pat);
++#else
++ /*
++ * PAT settings are part of the hypervisor interface, and their
++ * assignment cannot be changed.
++ */
++ rdmsrl(MSR_IA32_CR_PAT, pat);
++ if (!boot_pat_state)
++ boot_pat_state = pat;
++#endif
++ printk(KERN_INFO "x86 PAT enabled: cpu %d, old 0x%Lx, new 0x%Lx\n",
++ smp_processor_id(), boot_pat_state, pat);
++}
++
++#undef PAT
++
++static char *cattr_name(unsigned long flags)
++{
++ switch (flags & _PAGE_CACHE_MASK) {
++ case _PAGE_CACHE_UC: return "uncached";
++ case _PAGE_CACHE_UC_MINUS: return "uncached-minus";
++ case _PAGE_CACHE_WB: return "write-back";
++ case _PAGE_CACHE_WC: return "write-combining";
++ case _PAGE_CACHE_WP: return "write-protected";
++ case _PAGE_CACHE_WT: return "write-through";
++ default: return "broken";
++ }
++}
++
++/*
++ * The global memtype list keeps track of memory type for specific
++ * physical memory areas. Conflicting memory types in different
++ * mappings can cause CPU cache corruption. To avoid this we keep track.
++ *
++ * The list is sorted based on starting address and can contain multiple
++ * entries for each address (this allows reference counting for overlapping
++ * areas). All the aliases have the same cache attributes of course.
++ * Zero attributes are represented as holes.
++ *
++ * Currently the data structure is a list because the number of mappings
++ * is expected to be relatively small. If this should be a problem
++ * it could be changed to a rbtree or similar.
++ *
++ * memtype_lock protects the whole list.
++ */
++
++struct memtype {
++ u64 start;
++ u64 end;
++ unsigned long type;
++ struct list_head nd;
++};
++
++static LIST_HEAD(memtype_list);
++static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */
++
++/*
++ * Intersects the PAT memory type requested in prot with the MTRR memory type
++ * looked up for start..end and stores the result, as PAT understands it, in
++ * *ret_prot. (A type in PAT and in MTRR will not have the same value.)
++ * The intersection is based on the "Effective Memory Type" tables in the
++ * IA-32 SDM vol 3a. Always returns 0.
++ */
++static int pat_x_mtrr_type(u64 start, u64 end, unsigned long prot,
++ unsigned long *ret_prot)
++{
++ unsigned long pat_type;
++ u8 mtrr_type;
++
++ pat_type = prot & _PAGE_CACHE_MASK;
++ prot &= (~_PAGE_CACHE_MASK);
++
++ /*
++ * We return the PAT request directly for types where PAT takes
++ * precedence with respect to MTRR and for UC_MINUS.
++ * Consistency checks with other PAT requests are done later
++ * while going through the memtype list.
++ */
++ if (pat_type == _PAGE_CACHE_WC) {
++ *ret_prot = prot | _PAGE_CACHE_WC;
++ return 0;
++ } else if (pat_type == _PAGE_CACHE_UC_MINUS) {
++ *ret_prot = prot | _PAGE_CACHE_UC_MINUS;
++ return 0;
++ } else if (pat_type == _PAGE_CACHE_UC) {
++ *ret_prot = prot | _PAGE_CACHE_UC;
++ return 0;
++ }
++
++ /*
++ * Look for MTRR hint to get the effective type in case where PAT
++ * request is for WB.
++ */
++ mtrr_type = mtrr_type_lookup(start, end);
++
++ if (mtrr_type == MTRR_TYPE_UNCACHABLE) {
++ *ret_prot = prot | _PAGE_CACHE_UC;
++ } else if (mtrr_type == MTRR_TYPE_WRCOMB) {
++ *ret_prot = prot | _PAGE_CACHE_WC;
++ } else {
++ *ret_prot = prot | _PAGE_CACHE_WB;
++ }
++
++ return 0;
++}
++
++/*
++ * Reserve the physical range [start, end) with a caching type and record it
++ * on memtype_list. req_type is typically one of:
++ * - _PAGE_CACHE_WB
++ * - _PAGE_CACHE_WC
++ * - _PAGE_CACHE_UC_MINUS
++ * - _PAGE_CACHE_UC
++ * or the special value -1, when the requester wants to inherit the type:
++ * WB if the MTRR lookup says write-back, otherwise UC_MINUS.
++ *
++ * If ret_type is NULL, a type conflict with an overlapping entry is an error
++ * (-EBUSY). If ret_type is non-NULL, the type of the first overlapping entry
++ * is adopted and returned in *ret_type; later overlapping entries must agree.
++ * Returns 0 on success, -ENOMEM if no entry can be allocated, else negative.
++ */
++int reserve_memtype(u64 start, u64 end, unsigned long req_type,
++ unsigned long *ret_type)
++{
++ struct memtype *new_entry = NULL;
++ struct memtype *parse;
++ unsigned long actual_type;
++ int err = 0;
++
++ /* Only track when pat_wc_enabled */
++ if (!pat_wc_enabled) {
++ /* This is identical to page table setting without PAT */
++ if (ret_type) {
++ if (req_type == -1) {
++ *ret_type = _PAGE_CACHE_WB;
++ } else {
++ *ret_type = req_type;
++ }
++ }
++ return 0;
++ }
++
++ /* Low ISA region is always mapped WB; not tracked (same test as free_memtype) */
++ if (start >= ISA_START_ADDRESS && end <= ISA_END_ADDRESS) {
++ if (ret_type)
++ *ret_type = _PAGE_CACHE_WB;
++
++ return 0;
++ }
++
++ if (req_type == -1) {
++ /*
++ * Call mtrr_lookup to get the type hint. This is an
++ * optimization for /dev/mem mmap'ers into WB memory (BIOS
++ * tools and ACPI tools). Use WB request for WB memory and use
++ * UC_MINUS otherwise.
++ */
++ u8 mtrr_type = mtrr_type_lookup(start, end);
++
++ if (mtrr_type == MTRR_TYPE_WRBACK) {
++ req_type = _PAGE_CACHE_WB;
++ actual_type = _PAGE_CACHE_WB;
++ } else {
++ req_type = _PAGE_CACHE_UC_MINUS;
++ actual_type = _PAGE_CACHE_UC_MINUS;
++ }
++ } else {
++ req_type &= _PAGE_CACHE_MASK;
++ err = pat_x_mtrr_type(start, end, req_type, &actual_type);
++ }
++
++ if (err) {
++ if (ret_type)
++ *ret_type = actual_type;
++
++ return -EINVAL;
++ }
++
++ new_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL);
++ if (!new_entry)
++ return -ENOMEM;
++
++ new_entry->start = start;
++ new_entry->end = end;
++ new_entry->type = actual_type;
++
++ if (ret_type)
++ *ret_type = actual_type;
++
++ spin_lock(&memtype_lock);
++
++ /* Search for existing mapping that overlaps the current range */
++ list_for_each_entry(parse, &memtype_list, nd) {
++ struct memtype *saved_ptr;
++
++ if (parse->start >= end) {
++ pr_debug("New Entry\n");
++ list_add(&new_entry->nd, parse->nd.prev);
++ new_entry = NULL;
++ break;
++ }
++
++ if (start <= parse->start && end >= parse->start) {
++ if (actual_type != parse->type && ret_type) {
++ actual_type = parse->type;
++ *ret_type = actual_type;
++ new_entry->type = actual_type;
++ }
++
++ if (actual_type != parse->type) {
++ printk(
++ KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
++ current->comm, current->pid,
++ start, end,
++ cattr_name(actual_type),
++ cattr_name(parse->type));
++ err = -EBUSY;
++ break;
++ }
++
++ saved_ptr = parse;
++ /*
++ * Check to see whether the request overlaps more
++ * than one entry in the list
++ */
++ list_for_each_entry_continue(parse, &memtype_list, nd) {
++ if (end <= parse->start) {
++ break;
++ }
++
++ if (actual_type != parse->type) {
++ printk(
++ KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
++ current->comm, current->pid,
++ start, end,
++ cattr_name(actual_type),
++ cattr_name(parse->type));
++ err = -EBUSY;
++ break;
++ }
++ }
++
++ if (err) {
++ break;
++ }
++
++ pr_debug("Overlap at 0x%Lx-0x%Lx\n",
++ saved_ptr->start, saved_ptr->end);
++ /* No conflict. Go ahead and add this new entry */
++ list_add(&new_entry->nd, saved_ptr->nd.prev);
++ new_entry = NULL;
++ break;
++ }
++
++ if (start < parse->end) {
++ if (actual_type != parse->type && ret_type) {
++ actual_type = parse->type;
++ *ret_type = actual_type;
++ new_entry->type = actual_type;
++ }
++
++ if (actual_type != parse->type) {
++ printk(
++ KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
++ current->comm, current->pid,
++ start, end,
++ cattr_name(actual_type),
++ cattr_name(parse->type));
++ err = -EBUSY;
++ break;
++ }
++
++ saved_ptr = parse;
++ /*
++ * Check to see whether the request overlaps more
++ * than one entry in the list
++ */
++ list_for_each_entry_continue(parse, &memtype_list, nd) {
++ if (end <= parse->start) {
++ break;
++ }
++
++ if (actual_type != parse->type) {
++ printk(
++ KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
++ current->comm, current->pid,
++ start, end,
++ cattr_name(actual_type),
++ cattr_name(parse->type));
++ err = -EBUSY;
++ break;
++ }
++ }
++
++ if (err) {
++ break;
++ }
++
++ pr_debug("Overlap at 0x%Lx-0x%Lx\n",
++ saved_ptr->start, saved_ptr->end);
++ /* No conflict. Go ahead and add this new entry */
++ list_add(&new_entry->nd, &saved_ptr->nd);
++ new_entry = NULL;
++ break;
++ }
++ }
++
++ if (err) {
++ printk(KERN_INFO
++ "reserve_memtype failed 0x%Lx-0x%Lx, track %s, req %s\n",
++ start, end, cattr_name(new_entry->type),
++ cattr_name(req_type));
++ kfree(new_entry);
++ spin_unlock(&memtype_lock);
++ return err;
++ }
++
++ if (new_entry) {
++ /* No conflict. Not yet added to the list. Add to the tail */
++ list_add_tail(&new_entry->nd, &memtype_list);
++ pr_debug("New Entry\n");
++ }
++
++ if (ret_type) {
++ pr_debug(
++ "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n",
++ start, end, cattr_name(actual_type),
++ cattr_name(req_type), cattr_name(*ret_type));
++ } else {
++ pr_debug(
++ "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s\n",
++ start, end, cattr_name(actual_type),
++ cattr_name(req_type));
++ }
++
++ spin_unlock(&memtype_lock);
++ return err;
++}
++
++int free_memtype(u64 start, u64 end)
++{
++ struct memtype *ml;
++ int err = -EINVAL;
++
++ /* Only track when pat_wc_enabled */
++ if (!pat_wc_enabled) {
++ return 0;
++ }
++
++ /* Low ISA region is always mapped WB. No need to track */
++ if (start >= ISA_START_ADDRESS && end <= ISA_END_ADDRESS) {
++ return 0;
++ }
++
++ spin_lock(&memtype_lock);
++ list_for_each_entry(ml, &memtype_list, nd) {
++ if (ml->start == start && ml->end == end) {
++ list_del(&ml->nd);
++ kfree(ml);
++ err = 0;
++ break;
++ }
++ }
++ spin_unlock(&memtype_lock);
++
++ if (err) {
++ printk(KERN_INFO "%s:%d freeing invalid memtype %Lx-%Lx\n",
++ current->comm, current->pid, start, end);
++ }
++
++ pr_debug("free_memtype request 0x%Lx-0x%Lx\n", start, end);
++ return err;
++}
++
++
++/*
++ * /dev/mem mmap interface. The memtype used for mapping varies:
++ * - Use UC for mappings with O_SYNC flag
++ * - Without O_SYNC flag, if there is any conflict in reserve_memtype,
++ * inherit the memtype from existing mapping.
++ * - Else use UC_MINUS memtype (for backward compatibility with existing
++ * X drivers).
++ */
++pgprot_t phys_mem_access_prot(struct file *file, unsigned long mfn,
++ unsigned long size, pgprot_t vma_prot)
++{
++ return vma_prot;
++}
++
++#ifdef CONFIG_NONPROMISC_DEVMEM
++/* This check is done in drivers/char/mem.c in case of NONPROMISC_DEVMEM */
++static inline int range_is_allowed(unsigned long mfn, unsigned long size)
++{
++ return 1;
++}
++#else
++static inline int range_is_allowed(unsigned long mfn, unsigned long size)
++{
++ u64 from = ((u64)mfn) << PAGE_SHIFT;
++ u64 to = from + size;
++ u64 cursor = from;
++
++ while (cursor < to) {
++ if (!devmem_is_allowed(mfn)) {
++ printk(KERN_INFO
++ "Program %s tried to access /dev/mem between %Lx->%Lx.\n",
++ current->comm, from, to);
++ return 0;
++ }
++ cursor += PAGE_SIZE;
++ mfn++;
++ }
++ return 1;
++}
++#endif /* CONFIG_NONPROMISC_DEVMEM */
++
++int phys_mem_access_prot_allowed(struct file *file, unsigned long mfn,
++ unsigned long size, pgprot_t *vma_prot)
++{
++ u64 addr = (u64)mfn << PAGE_SHIFT;
++ unsigned long flags = _PAGE_CACHE_UC_MINUS;
++ int retval;
++
++ if (!range_is_allowed(mfn, size))
++ return 0;
++
++ if (file->f_flags & O_SYNC) {
++ flags = _PAGE_CACHE_UC;
++ }
++
++#ifndef CONFIG_X86_32
++#ifndef CONFIG_XEN /* Xen sets correct MTRR type on non-RAM for us. */
++ /*
++ * On the PPro and successors, the MTRRs are used to set
++ * memory types for physical addresses outside main memory,
++ * so blindly setting UC or PWT on those pages is wrong.
++ * For Pentiums and earlier, the surround logic should disable
++ * caching for the high addresses through the KEN pin, but
++ * we maintain the tradition of paranoia in this code.
++ */
++ if (!pat_wc_enabled &&
++ ! ( test_bit(X86_FEATURE_MTRR, boot_cpu_data.x86_capability) ||
++ test_bit(X86_FEATURE_K6_MTRR, boot_cpu_data.x86_capability) ||
++ test_bit(X86_FEATURE_CYRIX_ARR, boot_cpu_data.x86_capability) ||
++ test_bit(X86_FEATURE_CENTAUR_MCR, boot_cpu_data.x86_capability)) &&
++ (pfn << PAGE_SHIFT) >= __pa(high_memory)) {
++ flags = _PAGE_CACHE_UC;
++ }
++#endif
++#endif
++
++ /*
++ * With O_SYNC, we can only take UC mapping. Fail if we cannot.
++ * Without O_SYNC, we want to get
++ * - WB for WB-able memory and no other conflicting mappings
++ * - UC_MINUS for non-WB-able memory with no other conflicting mappings
++ * - Inherit from conflicting mappings otherwise
++ */
++ if (flags != _PAGE_CACHE_UC_MINUS) {
++ retval = reserve_memtype(addr, addr + size, flags, NULL);
++ } else {
++ retval = reserve_memtype(addr, addr + size, -1, &flags);
++ }
++
++ if (retval < 0)
++ return 0;
++
++ if (ioremap_check_change_attr(mfn, size, flags) < 0) {
++ free_memtype(addr, addr + size);
++ printk(KERN_INFO
++ "%s:%d /dev/mem ioremap_change_attr failed %s for %Lx-%Lx\n",
++ current->comm, current->pid,
++ cattr_name(flags),
++ addr, addr + size);
++ return 0;
++ }
++
++ *vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) |
++ flags);
++ return 1;
++}
++
++void map_devmem(unsigned long mfn, unsigned long size, pgprot_t vma_prot)
++{
++ u64 addr = (u64)mfn << PAGE_SHIFT;
++ unsigned long flags;
++ unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK);
++
++ reserve_memtype(addr, addr + size, want_flags, &flags);
++ if (flags != want_flags) {
++ printk(KERN_INFO
++ "%s:%d /dev/mem expected mapping type %s for %Lx-%Lx, got %s\n",
++ current->comm, current->pid,
++ cattr_name(want_flags),
++ addr, (unsigned long long)(addr + size),
++ cattr_name(flags));
++ }
++}
++
++void unmap_devmem(unsigned long mfn, unsigned long size, pgprot_t vma_prot)
++{
++ u64 addr = (u64)mfn << PAGE_SHIFT;
++
++ free_memtype(addr, addr + size);
++}
++
+--- /dev/null 1970-01-01 00:00:00.000000000 +0000
++++ sle11-2009-05-14/arch/x86/mm/pgtable-xen.c 2009-03-16 16:38:05.000000000 +0100
+@@ -0,0 +1,709 @@
++#include <linux/mm.h>
++#include <linux/module.h>
++#include <xen/features.h>
++#include <asm/pgalloc.h>
++#include <asm/pgtable.h>
++#include <asm/tlb.h>
++#include <asm/hypervisor.h>
++#include <asm/mmu_context.h>
++
++pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
++{
++ pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
++ if (pte)
++ make_lowmem_page_readonly(pte, XENFEAT_writable_page_tables);
++ return pte;
++}
++
++static void _pte_free(struct page *page, unsigned int order)
++{
++ BUG_ON(order);
++ __pte_free(page);
++}
++
++pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
++{
++ struct page *pte;
++
++#ifdef CONFIG_HIGHPTE
++ pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
++#else
++ pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
++#endif
++ if (pte) {
++ pgtable_page_ctor(pte);
++ SetPageForeign(pte, _pte_free);
++ init_page_count(pte);
++ }
++ return pte;
++}
++
++void __pte_free(pgtable_t pte)
++{
++ if (!PageHighMem(pte)) {
++ unsigned long va = (unsigned long)page_address(pte);
++ unsigned int level;
++ pte_t *ptep = lookup_address(va, &level);
++
++ BUG_ON(!ptep || level != PG_LEVEL_4K || !pte_present(*ptep));
++ if (!pte_write(*ptep)
++ && HYPERVISOR_update_va_mapping(va,
++ mk_pte(pte, PAGE_KERNEL),
++ 0))
++ BUG();
++ } else
++#ifdef CONFIG_HIGHPTE
++ ClearPagePinned(pte);
++#else
++ BUG();
++#endif
++
++ ClearPageForeign(pte);
++ init_page_count(pte);
++ pgtable_page_dtor(pte);
++ __free_page(pte);
++}
++
++void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
++{
++ pgtable_page_dtor(pte);
++ paravirt_release_pte(page_to_pfn(pte));
++ tlb_remove_page(tlb, pte);
++}
++
++#if PAGETABLE_LEVELS > 2
++static void _pmd_free(struct page *page, unsigned int order)
++{
++ BUG_ON(order);
++ __pmd_free(page);
++}
++
++pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address)
++{
++ struct page *pmd;
++
++ pmd = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
++ if (!pmd)
++ return NULL;
++ SetPageForeign(pmd, _pmd_free);
++ init_page_count(pmd);
++ return page_address(pmd);
++}
++
++void __pmd_free(pgtable_t pmd)
++{
++ unsigned long va = (unsigned long)page_address(pmd);
++ unsigned int level;
++ pte_t *ptep = lookup_address(va, &level);
++
++ BUG_ON(!ptep || level != PG_LEVEL_4K || !pte_present(*ptep));
++ if (!pte_write(*ptep)
++ && HYPERVISOR_update_va_mapping(va, mk_pte(pmd, PAGE_KERNEL), 0))
++ BUG();
++
++ ClearPageForeign(pmd);
++ init_page_count(pmd);
++ __free_page(pmd);
++}
++
++void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
++{
++ paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);
++ tlb_remove_page(tlb, virt_to_page(pmd));
++}
++
++#if PAGETABLE_LEVELS > 3
++void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
++{
++ paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
++ tlb_remove_page(tlb, virt_to_page(pud));
++}
++#endif /* PAGETABLE_LEVELS > 3 */
++#endif /* PAGETABLE_LEVELS > 2 */
++
++#ifndef CONFIG_X86_64
++#define TASK_SIZE64 TASK_SIZE
++#endif
++
++static void _pin_lock(struct mm_struct *mm, int lock) {
++ if (lock)
++ spin_lock(&mm->page_table_lock);
++#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
++ /* While mm->page_table_lock protects us against insertions and
++ * removals of higher level page table pages, it doesn't protect
++ * against updates of pte-s. Such updates, however, require the
++ * pte pages to be in consistent state (unpinned+writable or
++ * pinned+readonly). The pinning and attribute changes, however
++ * cannot be done atomically, which is why such updates must be
++ * prevented from happening concurrently.
++ * Note that no pte lock can ever elsewhere be acquired nesting
++ * with an already acquired one in the same mm, or with the mm's
++ * page_table_lock already acquired, as that would break in the
++ * non-split case (where all these are actually resolving to the
++ * one page_table_lock). Thus acquiring all of them here is not
++ * going to result in deadlocks, and the order of acquires
++ * doesn't matter.
++ */
++ {
++ pgd_t *pgd = mm->pgd;
++ unsigned g;
++
++ for (g = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
++ pud_t *pud;
++ unsigned u;
++
++ if (pgd_none(*pgd))
++ continue;
++ pud = pud_offset(pgd, 0);
++ for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
++ pmd_t *pmd;
++ unsigned m;
++
++ if (pud_none(*pud))
++ continue;
++ pmd = pmd_offset(pud, 0);
++ for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
++ spinlock_t *ptl;
++
++ if (pmd_none(*pmd))
++ continue;
++ ptl = pte_lockptr(0, pmd);
++ if (lock)
++ spin_lock(ptl);
++ else
++ spin_unlock(ptl);
++ }
++ }
++ }
++ }
++#endif
++ if (!lock)
++ spin_unlock(&mm->page_table_lock);
++}
++#define pin_lock(mm) _pin_lock(mm, 1)
++#define pin_unlock(mm) _pin_lock(mm, 0)
++
++#define PIN_BATCH sizeof(void *)
++static DEFINE_PER_CPU(multicall_entry_t[PIN_BATCH], pb_mcl);
++
++static inline unsigned int pgd_walk_set_prot(struct page *page, pgprot_t flags,
++ unsigned int cpu, unsigned int seq)
++{
++ unsigned long pfn = page_to_pfn(page);
++
++ if (PageHighMem(page)) {
++ if (pgprot_val(flags) & _PAGE_RW)
++ ClearPagePinned(page);
++ else
++ SetPagePinned(page);
++ } else {
++ MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
++ (unsigned long)__va(pfn << PAGE_SHIFT),
++ pfn_pte(pfn, flags), 0);
++ if (unlikely(++seq == PIN_BATCH)) {
++ if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
++ PIN_BATCH, NULL)))
++ BUG();
++ seq = 0;
++ }
++ }
++
++ return seq;
++}
++
++static void pgd_walk(pgd_t *pgd_base, pgprot_t flags)
++{
++ pgd_t *pgd = pgd_base;
++ pud_t *pud;
++ pmd_t *pmd;
++ int g,u,m;
++ unsigned int cpu, seq;
++ multicall_entry_t *mcl;
++
++ if (xen_feature(XENFEAT_auto_translated_physmap))
++ return;
++
++ cpu = get_cpu();
++
++ /*
++ * Cannot iterate up to USER_PTRS_PER_PGD on x86-64 as these pagetables
++ * may not be the 'current' task's pagetables (e.g., current may be
++ * 32-bit, but the pagetables may be for a 64-bit task).
++ * Subtracting 1 from TASK_SIZE64 means the loop limit is correct
++ * regardless of whether TASK_SIZE64 is a multiple of PGDIR_SIZE.
++ */
++ for (g = 0, seq = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
++ if (pgd_none(*pgd))
++ continue;
++ pud = pud_offset(pgd, 0);
++ if (PTRS_PER_PUD > 1) /* not folded */
++ seq = pgd_walk_set_prot(virt_to_page(pud),flags,cpu,seq);
++ for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
++ if (pud_none(*pud))
++ continue;
++ pmd = pmd_offset(pud, 0);
++ if (PTRS_PER_PMD > 1) /* not folded */
++ seq = pgd_walk_set_prot(virt_to_page(pmd),flags,cpu,seq);
++ for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
++ if (pmd_none(*pmd))
++ continue;
++ seq = pgd_walk_set_prot(pmd_page(*pmd),flags,cpu,seq);
++ }
++ }
++ }
++
++ mcl = per_cpu(pb_mcl, cpu);
++#ifdef CONFIG_X86_64
++ if (unlikely(seq > PIN_BATCH - 2)) {
++ if (unlikely(HYPERVISOR_multicall_check(mcl, seq, NULL)))
++ BUG();
++ seq = 0;
++ }
++ MULTI_update_va_mapping(mcl + seq,
++ (unsigned long)__user_pgd(pgd_base),
++ pfn_pte(virt_to_phys(__user_pgd(pgd_base))>>PAGE_SHIFT, flags),
++ 0);
++ MULTI_update_va_mapping(mcl + seq + 1,
++ (unsigned long)pgd_base,
++ pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
++ UVMF_TLB_FLUSH);
++ if (unlikely(HYPERVISOR_multicall_check(mcl, seq + 2, NULL)))
++ BUG();
++#else
++ if (likely(seq != 0)) {
++ MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
++ (unsigned long)pgd_base,
++ pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
++ UVMF_TLB_FLUSH);
++ if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
++ seq + 1, NULL)))
++ BUG();
++ } else if(HYPERVISOR_update_va_mapping((unsigned long)pgd_base,
++ pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
++ UVMF_TLB_FLUSH))
++ BUG();
++#endif
++
++ put_cpu();
++}
++
++static void __pgd_pin(pgd_t *pgd)
++{
++ pgd_walk(pgd, PAGE_KERNEL_RO);
++ kmap_flush_unused();
++ xen_pgd_pin(__pa(pgd)); /* kernel */
++#ifdef CONFIG_X86_64
++ xen_pgd_pin(__pa(__user_pgd(pgd))); /* user */
++#endif
++ SetPagePinned(virt_to_page(pgd));
++}
++
++static void __pgd_unpin(pgd_t *pgd)
++{
++ xen_pgd_unpin(__pa(pgd));
++#ifdef CONFIG_X86_64
++ xen_pgd_unpin(__pa(__user_pgd(pgd)));
++#endif
++ pgd_walk(pgd, PAGE_KERNEL);
++ ClearPagePinned(virt_to_page(pgd));
++}
++
++static void pgd_test_and_unpin(pgd_t *pgd)
++{
++ if (PagePinned(virt_to_page(pgd)))
++ __pgd_unpin(pgd);
++}
++
++void mm_pin(struct mm_struct *mm)
++{
++ if (xen_feature(XENFEAT_writable_page_tables))
++ return;
++
++ pin_lock(mm);
++ __pgd_pin(mm->pgd);
++ pin_unlock(mm);
++}
++
++void mm_unpin(struct mm_struct *mm)
++{
++ if (xen_feature(XENFEAT_writable_page_tables))
++ return;
++
++ pin_lock(mm);
++ __pgd_unpin(mm->pgd);
++ pin_unlock(mm);
++}
++
++void mm_pin_all(void)
++{
++ struct page *page;
++ unsigned long flags;
++
++ if (xen_feature(XENFEAT_writable_page_tables))
++ return;
++
++ /*
++ * Allow uninterrupted access to the pgd_list. Also protects
++ * __pgd_pin() by disabling preemption.
++ * All other CPUs must be at a safe point (e.g., in stop_machine
++ * or offlined entirely).
++ */
++ spin_lock_irqsave(&pgd_lock, flags);
++ list_for_each_entry(page, &pgd_list, lru) {
++ if (!PagePinned(page))
++ __pgd_pin((pgd_t *)page_address(page));
++ }
++ spin_unlock_irqrestore(&pgd_lock, flags);
++}
++
++void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
++{
++ if (!PagePinned(virt_to_page(mm->pgd)))
++ mm_pin(mm);
++}
++
++void arch_exit_mmap(struct mm_struct *mm)
++{
++ struct task_struct *tsk = current;
++
++ task_lock(tsk);
++
++ /*
++ * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
++ * *much* faster this way, as no tlb flushes means bigger wrpt batches.
++ */
++ if (tsk->active_mm == mm) {
++ tsk->active_mm = &init_mm;
++ atomic_inc(&init_mm.mm_count);
++
++ switch_mm(mm, &init_mm, tsk);
++
++ atomic_dec(&mm->mm_count);
++ BUG_ON(atomic_read(&mm->mm_count) == 0);
++ }
++
++ task_unlock(tsk);
++
++ if (PagePinned(virt_to_page(mm->pgd))
++ && atomic_read(&mm->mm_count) == 1
++ && !mm->context.has_foreign_mappings)
++ mm_unpin(mm);
++}
++
++static inline void pgd_list_add(pgd_t *pgd)
++{
++ struct page *page = virt_to_page(pgd);
++
++ list_add(&page->lru, &pgd_list);
++}
++
++static inline void pgd_list_del(pgd_t *pgd)
++{
++ struct page *page = virt_to_page(pgd);
++
++ list_del(&page->lru);
++}
++
++#define UNSHARED_PTRS_PER_PGD \
++ (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
++
++static void pgd_ctor(void *p)
++{
++ pgd_t *pgd = p;
++ unsigned long flags;
++
++ pgd_test_and_unpin(pgd);
++
++ /* Clear usermode parts of PGD */
++ memset(pgd, 0, KERNEL_PGD_BOUNDARY*sizeof(pgd_t));
++
++ spin_lock_irqsave(&pgd_lock, flags);
++
++ /* If the pgd points to a shared pagetable level (either the
++ ptes in non-PAE, or shared PMD in PAE), then just copy the
++ references from swapper_pg_dir. */
++ if (PAGETABLE_LEVELS == 2 ||
++ (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD) ||
++ PAGETABLE_LEVELS == 4) {
++ clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
++ swapper_pg_dir + KERNEL_PGD_BOUNDARY,
++ KERNEL_PGD_PTRS);
++ paravirt_alloc_pmd_clone(__pa(pgd) >> PAGE_SHIFT,
++ __pa(swapper_pg_dir) >> PAGE_SHIFT,
++ KERNEL_PGD_BOUNDARY,
++ KERNEL_PGD_PTRS);
++ }
++
++#ifdef CONFIG_X86_64
++ /* set level3_user_pgt for vsyscall area */
++ __user_pgd(pgd)[pgd_index(VSYSCALL_START)] =
++ __pgd(__pa_symbol(level3_user_pgt) | _PAGE_TABLE);
+#endif
- add_uevent_var(env, "MODALIAS=xen:%s", xdev->devicetype);
-
- return 0;
-@@ -207,10 +209,8 @@ static struct xen_bus_type xenbus_fronte
- .probe = xenbus_dev_probe,
- .remove = xenbus_dev_remove,
- .shutdown = xenbus_dev_shutdown,
--#if defined(CONFIG_XEN) || defined(MODULE)
- .uevent = xenbus_uevent_frontend,
- #endif
--#endif
- },
- #if defined(CONFIG_XEN) || defined(MODULE)
- .dev = {
-@@ -519,6 +519,15 @@ static ssize_t xendev_show_devtype(struc
- }
- DEVICE_ATTR(devtype, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_devtype, NULL);
-
-+static ssize_t xendev_show_modalias(struct device *dev,
-+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,13)
-+ struct device_attribute *attr,
++
++#ifndef CONFIG_X86_PAE
++ /* list required to sync kernel mapping updates */
++ if (!SHARED_KERNEL_PMD)
++ pgd_list_add(pgd);
+#endif
-+ char *buf)
++
++ spin_unlock_irqrestore(&pgd_lock, flags);
++}
++
++static void pgd_dtor(void *pgd)
++{
++ unsigned long flags; /* can be called from interrupt context */
++
++ if (!SHARED_KERNEL_PMD) {
++ spin_lock_irqsave(&pgd_lock, flags);
++ pgd_list_del(pgd);
++ spin_unlock_irqrestore(&pgd_lock, flags);
++ }
++
++ pgd_test_and_unpin(pgd);
++}
++
++/*
++ * List of all pgd's needed for non-PAE so it can invalidate entries
++ * in both cached and uncached pgd's; not needed for PAE since the
++ * kernel pmd is shared. If PAE were not to share the pmd a similar
++ * tactic would be needed. This is essentially codepath-based locking
++ * against pageattr.c; it is the unique case in which a valid change
++ * of kernel pagetables can't be lazily synchronized by vmalloc faults.
++ * vmalloc faults work because attached pagetables are never freed.
++ * -- wli
++ */
++
++#ifdef CONFIG_X86_PAE
++/*
++ * Mop up any pmd pages which may still be attached to the pgd.
++ * Normally they will be freed by munmap/exit_mmap, but any pmd we
++ * preallocate which never got a corresponding vma will need to be
++ * freed manually.
++ */
++static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
++{
++ int i;
++
++ for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) {
++ pgd_t pgd = pgdp[i];
++
++ if (__pgd_val(pgd) != 0) {
++ pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
++
++ pgdp[i] = xen_make_pgd(0);
++
++ paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
++ pmd_free(mm, pmd);
++ }
++ }
++
++ if (!xen_feature(XENFEAT_pae_pgdir_above_4gb))
++ xen_destroy_contiguous_region((unsigned long)pgdp, 0);
++}
++
++/*
++ * In PAE mode, we need to do a cr3 reload (=tlb flush) when
++ * updating the top-level pagetable entries to guarantee the
++ * processor notices the update. Since this is expensive, and
++ * all 4 top-level entries are used almost immediately in a
++ * new process's life, we just pre-populate them here.
++ *
++ * Also, if we're in a paravirt environment where the kernel pmd is
++ * not shared between pagetables (!SHARED_KERNEL_PMD), we allocate
++ * and initialize the kernel pmds here.
++ */
++static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
++{
++ pud_t *pud;
++ pmd_t *pmds[UNSHARED_PTRS_PER_PGD];
++ unsigned long addr, flags;
++ int i;
++
++ /*
++ * We can race save/restore (if we sleep during a GFP_KERNEL memory
++ * allocation). We therefore store virtual addresses of pmds as they
++ * do not change across save/restore, and poke the machine addresses
++ * into the pgdir under the pgd_lock.
++ */
++ for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD; i++, addr += PUD_SIZE) {
++ pmds[i] = pmd_alloc_one(mm, addr);
++ if (!pmds[i])
++ goto out_oom;
++ }
++
++ spin_lock_irqsave(&pgd_lock, flags);
++
++ /* Protect against save/restore: move below 4GB under pgd_lock. */
++ if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)
++ && xen_create_contiguous_region((unsigned long)pgd, 0, 32)) {
++ spin_unlock_irqrestore(&pgd_lock, flags);
++out_oom:
++ while (i--)
++ pmd_free(mm, pmds[i]);
++ return 0;
++ }
++
++ /* Copy kernel pmd contents and write-protect the new pmds. */
++ pud = pud_offset(pgd, 0);
++ for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD;
++ i++, pud++, addr += PUD_SIZE) {
++ if (i >= KERNEL_PGD_BOUNDARY) {
++ memcpy(pmds[i],
++ (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
++ sizeof(pmd_t) * PTRS_PER_PMD);
++ make_lowmem_page_readonly(
++ pmds[i], XENFEAT_writable_page_tables);
++ }
++
++ /* It is safe to poke machine addresses of pmds under the pgd_lock. */
++ pud_populate(mm, pud, pmds[i]);
++ }
++
++ /* List required to sync kernel mapping updates and
++ * to pin/unpin on save/restore. */
++ pgd_list_add(pgd);
++
++ spin_unlock_irqrestore(&pgd_lock, flags);
++
++ return 1;
++}
++
++void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
++{
++ struct page *page = virt_to_page(pmd);
++ unsigned long pfn = page_to_pfn(page);
++
++ paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
++
++ /* Note: almost everything apart from _PAGE_PRESENT is
++ reserved at the pmd (PDPT) level. */
++ if (PagePinned(virt_to_page(mm->pgd))) {
++ BUG_ON(PageHighMem(page));
++ BUG_ON(HYPERVISOR_update_va_mapping(
++ (unsigned long)__va(pfn << PAGE_SHIFT),
++ pfn_pte(pfn, PAGE_KERNEL_RO), 0));
++ set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
++ } else
++ *pudp = __pud(__pa(pmd) | _PAGE_PRESENT);
++
++ /*
++ * According to Intel App note "TLBs, Paging-Structure Caches,
++ * and Their Invalidation", April 2007, document 317080-001,
++ * section 8.1: in PAE mode we explicitly have to flush the
++ * TLB via cr3 if the top-level pgd is changed...
++ */
++ if (mm == current->active_mm)
++ xen_tlb_flush();
++}
++#else /* !CONFIG_X86_PAE */
++/* No need to prepopulate any pagetable entries in non-PAE modes. */
++static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
++{
++ return 1;
++}
++
++static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgd)
++{
++}
++#endif /* CONFIG_X86_PAE */
++
++#ifdef CONFIG_X86_64
++/* We allocate two contiguous pages for kernel and user. */
++#define PGD_ORDER 1
++#else
++#define PGD_ORDER 0
++#endif
++
++pgd_t *pgd_alloc(struct mm_struct *mm)
++{
++ pgd_t *pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, PGD_ORDER);
++
++ /* so that alloc_pd can use it */
++ mm->pgd = pgd;
++ if (pgd)
++ pgd_ctor(pgd);
++
++ if (pgd && !pgd_prepopulate_pmd(mm, pgd)) {
++ free_pages((unsigned long)pgd, PGD_ORDER);
++ pgd = NULL;
++ }
++
++ return pgd;
++}
++
++void pgd_free(struct mm_struct *mm, pgd_t *pgd)
++{
++ /*
++ * After this the pgd should not be pinned for the duration of this
++ * function's execution. We should never sleep and thus never race:
++ * 1. User pmds will not become write-protected under our feet due
++ * to a concurrent mm_pin_all().
++ * 2. The machine addresses in PGD entries will not become invalid
++ * due to a concurrent save/restore.
++ */
++ pgd_dtor(pgd);
++
++ pgd_mop_up_pmds(mm, pgd);
++ free_pages((unsigned long)pgd, PGD_ORDER);
++}
++
++/* blktap and gntdev need this, as otherwise they would implicitly (and
++ * needlessly, as they never use it) reference init_mm. */
++pte_t xen_ptep_get_and_clear_full(struct vm_area_struct *vma,
++ unsigned long addr, pte_t *ptep, int full)
+{
-+ return sprintf(buf, "xen:%s\n", to_xenbus_device(dev)->devicetype);
++ return ptep_get_and_clear_full(vma->vm_mm, addr, ptep, full);
+}
-+DEVICE_ATTR(modalias, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_modalias, NULL);
-
- int xenbus_probe_node(struct xen_bus_type *bus,
- const char *type,
-@@ -579,10 +588,16 @@ int xenbus_probe_node(struct xen_bus_typ
-
- err = device_create_file(&xendev->dev, &dev_attr_devtype);
- if (err)
-- goto fail_remove_file;
-+ goto fail_remove_nodename;
++EXPORT_SYMBOL_GPL(xen_ptep_get_and_clear_full);
+
-+ err = device_create_file(&xendev->dev, &dev_attr_modalias);
-+ if (err)
-+ goto fail_remove_devtype;
-
- return 0;
--fail_remove_file:
-+fail_remove_devtype:
-+ device_remove_file(&xendev->dev, &dev_attr_devtype);
-+fail_remove_nodename:
- device_remove_file(&xendev->dev, &dev_attr_nodename);
- fail_unregister:
- device_unregister(&xendev->dev);
---- a/fs/aio.c
-+++ b/fs/aio.c
-@@ -1255,6 +1255,7 @@ static void io_destroy(struct kioctx *io
- #ifdef CONFIG_EPOLL
- /* forget the poll file, but it's up to the user to close it */
- if (ioctx->file) {
-+ fput(ioctx->file);
- ioctx->file->private_data = 0;
- ioctx->file = 0;
- }
-@@ -1279,6 +1280,7 @@ static int aio_queue_fd_close(struct ino
- spin_lock_irq(&ioctx->ctx_lock);
- ioctx->file = 0;
- spin_unlock_irq(&ioctx->ctx_lock);
-+ fput(file);
- }
- return 0;
- }
-@@ -1314,16 +1316,17 @@ static const struct file_operations aioq
-
- static int make_aio_fd(struct kioctx *ioctx)
- {
-- int error, fd;
-- struct inode *inode;
-+ int fd;
- struct file *file;
-
-- error = anon_inode_getfd(&fd, &inode, &file, "[aioq]",
-- &aioq_fops, ioctx);
-- if (error)
-- return error;
-+ fd = anon_inode_getfd("[aioq]", &aioq_fops, ioctx);
-+ if (fd < 0)
-+ return fd;
-
- /* associate the file with the IO context */
-+ file = fget(fd);
-+ if (!file)
-+ return -EBADF;
- file->private_data = ioctx;
- ioctx->file = file;
- init_waitqueue_head(&ioctx->poll_wait);
---- a/include/asm-x86/dma-mapping.h
-+++ b/include/asm-x86/dma-mapping.h
-@@ -223,8 +223,13 @@ static inline dma_addr_t dma_map_page(st
- struct dma_mapping_ops *ops = get_dma_ops(dev);
-
- BUG_ON(!valid_dma_direction(direction));
-+#ifndef CONFIG_XEN
- return ops->map_single(dev, page_to_phys(page) + offset,
- size, direction);
-+#else
-+ return ops->map_single(dev, page_to_pseudophys(page) + offset,
-+ size, direction);
-+#endif
- }
-
- static inline void dma_unmap_page(struct device *dev, dma_addr_t addr,
---- a/include/asm-x86/genapic_64.h
-+++ b/include/asm-x86/genapic_64.h
-@@ -46,5 +46,6 @@ extern struct genapic apic_x2apic_phys;
- extern int acpi_madt_oem_check(char *, char *);
-
-+#ifndef CONFIG_XEN
- enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC};
- extern enum uv_system_type get_uv_system_type(void);
- extern int is_uv_system(void);
-@@ -55,6 +56,10 @@ DECLARE_PER_CPU(int, x2apic_extra_bits);
- extern void uv_cpu_init(void);
- extern void uv_system_init(void);
- extern int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip);
-+#else
-+#define is_uv_system() 0
-+#define uv_cpu_init() ((void)0)
-+#endif
-
- extern void setup_apic_routing(void);
-
---- a/include/asm-x86/mach-xen/asm/desc.h
-+++ b/include/asm-x86/mach-xen/asm/desc.h
-@@ -64,8 +64,8 @@ static inline struct desc_struct *get_cp
- }
-
- static inline void pack_gate(gate_desc *gate, unsigned char type,
-- unsigned long base, unsigned dpl, unsigned flags, unsigned short seg)
--
-+ unsigned long base, unsigned dpl, unsigned flags,
-+ unsigned short seg)
- {
- gate->a = (seg << 16) | (base & 0xffff);
- gate->b = (base & 0xffff0000) |
-@@ -84,22 +84,23 @@ static inline int desc_empty(const void
- #define load_TR_desc() native_load_tr_desc()
- #define load_gdt(dtr) native_load_gdt(dtr)
- #define load_idt(dtr) native_load_idt(dtr)
--#define load_tr(tr) __asm__ __volatile("ltr %0"::"m" (tr))
--#define load_ldt(ldt) __asm__ __volatile("lldt %0"::"m" (ldt))
-+#define load_tr(tr) asm volatile("ltr %0"::"m" (tr))
-+#define load_ldt(ldt) asm volatile("lldt %0"::"m" (ldt))
-
- #define store_gdt(dtr) native_store_gdt(dtr)
- #define store_idt(dtr) native_store_idt(dtr)
- #define store_tr(tr) (tr = native_store_tr())
--#define store_ldt(ldt) __asm__ ("sldt %0":"=m" (ldt))
-+#define store_ldt(ldt) asm("sldt %0":"=m" (ldt))
-
- #define load_TLS(t, cpu) native_load_tls(t, cpu)
- #define set_ldt native_set_ldt
-
--#define write_ldt_entry(dt, entry, desc) \
-- native_write_ldt_entry(dt, entry, desc)
--#define write_gdt_entry(dt, entry, desc, type) \
-- native_write_gdt_entry(dt, entry, desc, type)
--#define write_idt_entry(dt, entry, g) native_write_idt_entry(dt, entry, g)
-+#define write_ldt_entry(dt, entry, desc) \
-+ native_write_ldt_entry(dt, entry, desc)
-+#define write_gdt_entry(dt, entry, desc, type) \
-+ native_write_gdt_entry(dt, entry, desc, type)
-+#define write_idt_entry(dt, entry, g) \
-+ native_write_idt_entry(dt, entry, g)
-
- static inline void native_write_idt_entry(gate_desc *idt, int entry,
- const gate_desc *gate)
-@@ -138,8 +139,8 @@ static inline void pack_descriptor(struc
- {
- desc->a = ((base & 0xffff) << 16) | (limit & 0xffff);
- desc->b = (base & 0xff000000) | ((base & 0xff0000) >> 16) |
-- (limit & 0x000f0000) | ((type & 0xff) << 8) |
-- ((flags & 0xf) << 20);
-+ (limit & 0x000f0000) | ((type & 0xff) << 8) |
-+ ((flags & 0xf) << 20);
- desc->p = 1;
- }
-
-@@ -160,7 +161,6 @@ static inline void set_tssldt_descriptor
- desc->base2 = (PTR_MIDDLE(addr) >> 8) & 0xFF;
- desc->base3 = PTR_HIGH(addr);
- #else
--
- pack_descriptor((struct desc_struct *)d, addr, size, 0x80 | type, 0);
- #endif
- }
-@@ -178,7 +178,8 @@ static inline void __set_tss_desc(unsign
- * last valid byte
- */
- set_tssldt_descriptor(&tss, (unsigned long)addr, DESC_TSS,
-- IO_BITMAP_OFFSET + IO_BITMAP_BYTES + sizeof(unsigned long) - 1);
-+ IO_BITMAP_OFFSET + IO_BITMAP_BYTES +
-+ sizeof(unsigned long) - 1);
- write_gdt_entry(d, entry, &tss, DESC_TSS);
- }
-
-@@ -187,16 +188,16 @@ static inline void __set_tss_desc(unsign
- static inline void native_set_ldt(const void *addr, unsigned int entries)
- {
- if (likely(entries == 0))
-- __asm__ __volatile__("lldt %w0"::"q" (0));
-+ asm volatile("lldt %w0"::"q" (0));
- else {
- unsigned cpu = smp_processor_id();
- ldt_desc ldt;
-
-- set_tssldt_descriptor(&ldt, (unsigned long)addr,
-- DESC_LDT, entries * sizeof(ldt) - 1);
-+ set_tssldt_descriptor(&ldt, (unsigned long)addr, DESC_LDT,
-+ entries * LDT_ENTRY_SIZE - 1);
- write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT,
- &ldt, DESC_LDT);
-- __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
-+ asm volatile("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
- }
- }
-
-@@ -261,15 +262,15 @@ static inline void xen_load_tls(struct t
- }
- #endif
-
--#define _LDT_empty(info) (\
-- (info)->base_addr == 0 && \
-- (info)->limit == 0 && \
-- (info)->contents == 0 && \
-- (info)->read_exec_only == 1 && \
-- (info)->seg_32bit == 0 && \
-- (info)->limit_in_pages == 0 && \
-- (info)->seg_not_present == 1 && \
-- (info)->useable == 0)
-+#define _LDT_empty(info) \
-+ ((info)->base_addr == 0 && \
-+ (info)->limit == 0 && \
-+ (info)->contents == 0 && \
-+ (info)->read_exec_only == 1 && \
-+ (info)->seg_32bit == 0 && \
-+ (info)->limit_in_pages == 0 && \
-+ (info)->seg_not_present == 1 && \
-+ (info)->useable == 0)
-
- #ifdef CONFIG_X86_64
- #define LDT_empty(info) (_LDT_empty(info) && ((info)->lm == 0))
-@@ -309,7 +310,7 @@ static inline unsigned long get_desc_lim
-
- #ifndef CONFIG_X86_NO_IDT
- static inline void _set_gate(int gate, unsigned type, void *addr,
-- unsigned dpl, unsigned ist, unsigned seg)
-+ unsigned dpl, unsigned ist, unsigned seg)
- {
- gate_desc s;
- pack_gate(&s, type, (unsigned long)addr, dpl, ist, seg);
-@@ -393,10 +394,10 @@ static inline void set_system_gate_ist(i
- * Will read the base address of GDT_ENTRY_ESPFIX_SS and put it into %eax.
- */
- #define GET_DESC_BASE(idx, gdt, base, lo_w, lo_b, hi_b) \
-- movb idx*8+4(gdt), lo_b; \
-- movb idx*8+7(gdt), hi_b; \
-- shll $16, base; \
-- movw idx*8+2(gdt), lo_w;
-+ movb idx * 8 + 4(gdt), lo_b; \
-+ movb idx * 8 + 7(gdt), hi_b; \
-+ shll $16, base; \
-+ movw idx * 8 + 2(gdt), lo_w;
-
-
- #endif /* __ASSEMBLY__ */
---- a/include/asm-x86/mach-xen/asm/dma-mapping_32.h
-+++ /dev/null
-@@ -1,141 +0,0 @@
--#ifndef _ASM_I386_DMA_MAPPING_H
--#define _ASM_I386_DMA_MAPPING_H
--
++int ptep_set_access_flags(struct vm_area_struct *vma,
++ unsigned long address, pte_t *ptep,
++ pte_t entry, int dirty)
++{
++ int changed = !pte_same(*ptep, entry);
++
++ if (changed && dirty) {
++ if (likely(vma->vm_mm == current->mm)) {
++ if (HYPERVISOR_update_va_mapping(address,
++ entry,
++ (unsigned long)vma->vm_mm->cpu_vm_mask.bits|
++ UVMF_INVLPG|UVMF_MULTI))
++ BUG();
++ } else {
++ xen_l1_entry_update(ptep, entry);
++ flush_tlb_page(vma, address);
++ }
++ }
++
++ return changed;
++}
++
++int ptep_test_and_clear_young(struct vm_area_struct *vma,
++ unsigned long addr, pte_t *ptep)
++{
++ int ret = 0;
++
++ if (pte_young(*ptep))
++ ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
++ &ptep->pte);
++
++ if (ret)
++ pte_update(vma->vm_mm, addr, ptep);
++
++ return ret;
++}
++
++int ptep_clear_flush_young(struct vm_area_struct *vma,
++ unsigned long address, pte_t *ptep)
++{
++ pte_t pte = *ptep;
++ int young = pte_young(pte);
++
++ pte = pte_mkold(pte);
++ if (PagePinned(virt_to_page(vma->vm_mm->pgd)))
++ ptep_set_access_flags(vma, address, ptep, pte, young);
++ else if (young)
++ ptep->pte_low = pte.pte_low;
++
++ return young;
++}
+--- sle11-2009-05-14.orig/arch/x86/mm/pgtable_32-xen.c 2009-03-16 16:33:40.000000000 +0100
++++ sle11-2009-05-14/arch/x86/mm/pgtable_32-xen.c 2009-03-16 16:38:05.000000000 +0100
+@@ -1,7 +1,3 @@
-/*
-- * IOMMU interface. See Documentation/DMA-mapping.txt and DMA-API.txt for
-- * documentation.
+- * linux/arch/i386/mm/pgtable.c
- */
-
--#include <linux/mm.h>
--#include <linux/scatterlist.h>
--#include <asm/cache.h>
--#include <asm/io.h>
--#include <asm/swiotlb.h>
--
--static inline int
--address_needs_mapping(struct device *hwdev, dma_addr_t addr)
--{
-- dma_addr_t mask = 0xffffffff;
-- /* If the device has a mask, use it, otherwise default to 32 bits */
-- if (hwdev && hwdev->dma_mask)
-- mask = *hwdev->dma_mask;
-- return (addr & ~mask) != 0;
--}
--
--extern int range_straddles_page_boundary(paddr_t p, size_t size);
--
--#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
--#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
--
--void *dma_alloc_coherent(struct device *dev, size_t size,
-- dma_addr_t *dma_handle, gfp_t flag);
--
--void dma_free_coherent(struct device *dev, size_t size,
-- void *vaddr, dma_addr_t dma_handle);
--
--extern dma_addr_t
--dma_map_single(struct device *dev, void *ptr, size_t size,
-- enum dma_data_direction direction);
--
--extern void
--dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
-- enum dma_data_direction direction);
--
--extern int dma_map_sg(struct device *hwdev, struct scatterlist *sg,
-- int nents, enum dma_data_direction direction);
--extern void dma_unmap_sg(struct device *hwdev, struct scatterlist *sg,
-- int nents, enum dma_data_direction direction);
--
--#ifdef CONFIG_HIGHMEM
--extern dma_addr_t
--dma_map_page(struct device *dev, struct page *page, unsigned long offset,
-- size_t size, enum dma_data_direction direction);
--
--extern void
--dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size,
-- enum dma_data_direction direction);
--#else
--#define dma_map_page(dev, page, offset, size, dir) \
-- dma_map_single(dev, page_address(page) + (offset), (size), (dir))
--#define dma_unmap_page dma_unmap_single
--#endif
--
--extern void
--dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size,
-- enum dma_data_direction direction);
--
--extern void
--dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size,
-- enum dma_data_direction direction);
--
--static inline void
--dma_sync_single_range_for_cpu(struct device *dev, dma_addr_t dma_handle,
-- unsigned long offset, size_t size,
-- enum dma_data_direction direction)
--{
-- dma_sync_single_for_cpu(dev, dma_handle+offset, size, direction);
--}
--
--static inline void
--dma_sync_single_range_for_device(struct device *dev, dma_addr_t dma_handle,
-- unsigned long offset, size_t size,
-- enum dma_data_direction direction)
--{
-- dma_sync_single_for_device(dev, dma_handle+offset, size, direction);
--}
--
--extern void
--dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems,
-- enum dma_data_direction direction);
--
--extern void
--dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems,
-- enum dma_data_direction direction);
--
--extern int
--dma_mapping_error(dma_addr_t dma_addr);
--
--extern int
--dma_supported(struct device *dev, u64 mask);
--
--static inline int
--dma_set_mask(struct device *dev, u64 mask)
--{
-- if(!dev->dma_mask || !dma_supported(dev, mask))
-- return -EIO;
--
-- *dev->dma_mask = mask;
--
-- return 0;
--}
--
--static inline int
--dma_get_cache_alignment(void)
--{
-- /* no easy way to get cache size on all x86, so return the
-- * maximum possible, to be safe */
-- return (1 << INTERNODE_CACHE_SHIFT);
--}
--
--#define dma_is_consistent(d, h) (1)
--
--static inline void
--dma_cache_sync(struct device *dev, void *vaddr, size_t size,
-- enum dma_data_direction direction)
+ #include <linux/sched.h>
+ #include <linux/kernel.h>
+ #include <linux/errno.h>
+@@ -41,7 +37,6 @@ void show_mem(void)
+
+ printk(KERN_INFO "Mem-info:\n");
+ show_free_areas();
+- printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
+ for_each_online_pgdat(pgdat) {
+ pgdat_resize_lock(pgdat, &flags);
+ for (i = 0; i < pgdat->node_spanned_pages; ++i) {
+@@ -157,243 +152,6 @@ void __init reserve_top_address(unsigned
+ __VMALLOC_RESERVE += reserve;
+ }
+
+-pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
-{
-- flush_write_buffers();
+- pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
+- if (pte)
+- make_lowmem_page_readonly(pte, XENFEAT_writable_page_tables);
+- return pte;
-}
-
--#define ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY
--extern int
--dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
-- dma_addr_t device_addr, size_t size, int flags);
--
--extern void
--dma_release_declared_memory(struct device *dev);
--
--extern void *
--dma_mark_declared_memory_occupied(struct device *dev,
-- dma_addr_t device_addr, size_t size);
--
--#endif
---- a/include/asm-x86/mach-xen/asm/dma-mapping_64.h
-+++ /dev/null
-@@ -1,205 +0,0 @@
--#ifndef _X8664_DMA_MAPPING_H
--#define _X8664_DMA_MAPPING_H 1
--
-/*
-- * IOMMU interface. See Documentation/DMA-mapping.txt and DMA-API.txt for
-- * documentation.
-- */
--
--#include <linux/scatterlist.h>
--#include <asm/io.h>
--
--struct dma_mapping_ops {
-- int (*mapping_error)(dma_addr_t dma_addr);
-- void* (*alloc_coherent)(struct device *dev, size_t size,
-- dma_addr_t *dma_handle, gfp_t gfp);
-- void (*free_coherent)(struct device *dev, size_t size,
-- void *vaddr, dma_addr_t dma_handle);
-- dma_addr_t (*map_single)(struct device *hwdev, void *ptr,
-- size_t size, int direction);
-- /* like map_single, but doesn't check the device mask */
-- dma_addr_t (*map_simple)(struct device *hwdev, char *ptr,
-- size_t size, int direction);
-- void (*unmap_single)(struct device *dev, dma_addr_t addr,
-- size_t size, int direction);
-- void (*sync_single_for_cpu)(struct device *hwdev,
-- dma_addr_t dma_handle, size_t size,
-- int direction);
-- void (*sync_single_for_device)(struct device *hwdev,
-- dma_addr_t dma_handle, size_t size,
-- int direction);
-- void (*sync_single_range_for_cpu)(struct device *hwdev,
-- dma_addr_t dma_handle, unsigned long offset,
-- size_t size, int direction);
-- void (*sync_single_range_for_device)(struct device *hwdev,
-- dma_addr_t dma_handle, unsigned long offset,
-- size_t size, int direction);
-- void (*sync_sg_for_cpu)(struct device *hwdev,
-- struct scatterlist *sg, int nelems,
-- int direction);
-- void (*sync_sg_for_device)(struct device *hwdev,
-- struct scatterlist *sg, int nelems,
-- int direction);
-- int (*map_sg)(struct device *hwdev, struct scatterlist *sg,
-- int nents, int direction);
-- void (*unmap_sg)(struct device *hwdev,
-- struct scatterlist *sg, int nents,
-- int direction);
-- int (*dma_supported)(struct device *hwdev, u64 mask);
-- int is_phys;
--};
+- * List of all pgd's needed for non-PAE so it can invalidate entries
+- * in both cached and uncached pgd's; not needed for PAE since the
+- * kernel pmd is shared. If PAE were not to share the pmd a similar
+- * tactic would be needed. This is essentially codepath-based locking
+- * against pageattr.c; it is the unique case in which a valid change
+- * of kernel pagetables can't be lazily synchronized by vmalloc faults.
+- * vmalloc faults work because attached pagetables are never freed.
+- * -- wli
+- */
+-static inline void pgd_list_add(pgd_t *pgd)
+-{
+- struct page *page = virt_to_page(pgd);
-
--extern dma_addr_t bad_dma_address;
--extern const struct dma_mapping_ops* dma_ops;
--extern int iommu_merge;
+- list_add(&page->lru, &pgd_list);
+-}
-
--#if 0
--static inline int dma_mapping_error(dma_addr_t dma_addr)
+-static inline void pgd_list_del(pgd_t *pgd)
-{
-- if (dma_ops->mapping_error)
-- return dma_ops->mapping_error(dma_addr);
+- struct page *page = virt_to_page(pgd);
-
-- return (dma_addr == bad_dma_address);
+- list_del(&page->lru);
-}
-
--#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
--#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
+-#define UNSHARED_PTRS_PER_PGD \
+- (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD)
-
--#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
--#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
+-static void pgd_ctor(void *p)
+-{
+- pgd_t *pgd = p;
+- unsigned long flags;
-
--extern void *dma_alloc_coherent(struct device *dev, size_t size,
-- dma_addr_t *dma_handle, gfp_t gfp);
--extern void dma_free_coherent(struct device *dev, size_t size, void *vaddr,
-- dma_addr_t dma_handle);
+- pgd_test_and_unpin(pgd);
-
--static inline dma_addr_t
--dma_map_single(struct device *hwdev, void *ptr, size_t size,
-- int direction)
--{
-- BUG_ON(!valid_dma_direction(direction));
-- return dma_ops->map_single(hwdev, ptr, size, direction);
--}
+- /* Clear usermode parts of PGD */
+- memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
-
--static inline void
--dma_unmap_single(struct device *dev, dma_addr_t addr,size_t size,
-- int direction)
--{
-- BUG_ON(!valid_dma_direction(direction));
-- dma_ops->unmap_single(dev, addr, size, direction);
--}
+- spin_lock_irqsave(&pgd_lock, flags);
-
--#define dma_map_page(dev,page,offset,size,dir) \
-- dma_map_single((dev), page_address(page)+(offset), (size), (dir))
+- /* If the pgd points to a shared pagetable level (either the
+- ptes in non-PAE, or shared PMD in PAE), then just copy the
+- references from swapper_pg_dir. */
+- if (PAGETABLE_LEVELS == 2 ||
+- (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD)) {
+- clone_pgd_range(pgd + USER_PTRS_PER_PGD,
+- swapper_pg_dir + USER_PTRS_PER_PGD,
+- KERNEL_PGD_PTRS);
+- paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
+- __pa(swapper_pg_dir) >> PAGE_SHIFT,
+- USER_PTRS_PER_PGD,
+- KERNEL_PGD_PTRS);
+- }
-
--#define dma_unmap_page dma_unmap_single
+- /* list required to sync kernel mapping updates */
+- if (PAGETABLE_LEVELS == 2)
+- pgd_list_add(pgd);
-
--static inline void
--dma_sync_single_for_cpu(struct device *hwdev, dma_addr_t dma_handle,
-- size_t size, int direction)
--{
-- BUG_ON(!valid_dma_direction(direction));
-- if (dma_ops->sync_single_for_cpu)
-- dma_ops->sync_single_for_cpu(hwdev, dma_handle, size,
-- direction);
-- flush_write_buffers();
+- spin_unlock_irqrestore(&pgd_lock, flags);
-}
-
--static inline void
--dma_sync_single_for_device(struct device *hwdev, dma_addr_t dma_handle,
-- size_t size, int direction)
+-static void pgd_dtor(void *pgd)
-{
-- BUG_ON(!valid_dma_direction(direction));
-- if (dma_ops->sync_single_for_device)
-- dma_ops->sync_single_for_device(hwdev, dma_handle, size,
-- direction);
-- flush_write_buffers();
--}
+- unsigned long flags; /* can be called from interrupt context */
-
--static inline void
--dma_sync_single_range_for_cpu(struct device *hwdev, dma_addr_t dma_handle,
-- unsigned long offset, size_t size, int direction)
--{
-- BUG_ON(!valid_dma_direction(direction));
-- if (dma_ops->sync_single_range_for_cpu) {
-- dma_ops->sync_single_range_for_cpu(hwdev, dma_handle, offset, size, direction);
+- if (!SHARED_KERNEL_PMD) {
+- spin_lock_irqsave(&pgd_lock, flags);
+- pgd_list_del(pgd);
+- spin_unlock_irqrestore(&pgd_lock, flags);
- }
-
-- flush_write_buffers();
+- pgd_test_and_unpin(pgd);
-}
-
--static inline void
--dma_sync_single_range_for_device(struct device *hwdev, dma_addr_t dma_handle,
-- unsigned long offset, size_t size, int direction)
+-#ifdef CONFIG_X86_PAE
+-/*
+- * Mop up any pmd pages which may still be attached to the pgd.
+- * Normally they will be freed by munmap/exit_mmap, but any pmd we
+- * preallocate which never got a corresponding vma will need to be
+- * freed manually.
+- */
+-static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
-{
-- BUG_ON(!valid_dma_direction(direction));
-- if (dma_ops->sync_single_range_for_device)
-- dma_ops->sync_single_range_for_device(hwdev, dma_handle,
-- offset, size, direction);
+- int i;
-
-- flush_write_buffers();
--}
+- for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) {
+- pgd_t pgd = pgdp[i];
-
--static inline void
--dma_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg,
-- int nelems, int direction)
--{
-- BUG_ON(!valid_dma_direction(direction));
-- if (dma_ops->sync_sg_for_cpu)
-- dma_ops->sync_sg_for_cpu(hwdev, sg, nelems, direction);
-- flush_write_buffers();
+- if (__pgd_val(pgd) != 0) {
+- pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
+-
+- pgdp[i] = xen_make_pgd(0);
+-
+- paravirt_release_pd(pgd_val(pgd) >> PAGE_SHIFT);
+- pmd_free(mm, pmd);
+- }
+- }
-}
-
--static inline void
--dma_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
-- int nelems, int direction)
+-/*
+- * In PAE mode, we need to do a cr3 reload (=tlb flush) when
+- * updating the top-level pagetable entries to guarantee the
+- * processor notices the update. Since this is expensive, and
+- * all 4 top-level entries are used almost immediately in a
+- * new process's life, we just pre-populate them here.
+- *
+- * Also, if we're in a paravirt environment where the kernel pmd is
+- * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
+- * and initialize the kernel pmds here.
+- */
+-static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
-{
-- BUG_ON(!valid_dma_direction(direction));
-- if (dma_ops->sync_sg_for_device) {
-- dma_ops->sync_sg_for_device(hwdev, sg, nelems, direction);
+- pud_t *pud;
+- pmd_t *pmds[UNSHARED_PTRS_PER_PGD];
+- unsigned long addr, flags;
+- int i;
+-
+- /*
+- * We can race save/restore (if we sleep during a GFP_KERNEL memory
+- * allocation). We therefore store virtual addresses of pmds as they
+- * do not change across save/restore, and poke the machine addresses
+- * into the pgdir under the pgd_lock.
+- */
+- for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD; i++, addr += PUD_SIZE) {
+- pmds[i] = pmd_alloc_one(mm, addr);
+- if (!pmds[i])
+- goto out_oom;
- }
-
-- flush_write_buffers();
+- spin_lock_irqsave(&pgd_lock, flags);
+-
+- /* Protect against save/restore: move below 4GB under pgd_lock. */
+- if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)
+- && xen_create_contiguous_region((unsigned long)pgd, 0, 32)) {
+- spin_unlock_irqrestore(&pgd_lock, flags);
+-out_oom:
+- while (i--)
+- pmd_free(mm, pmds[i]);
+- return 0;
+- }
+-
+- /* Copy kernel pmd contents and write-protect the new pmds. */
+- pud = pud_offset(pgd, 0);
+- for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD;
+- i++, pud++, addr += PUD_SIZE) {
+- if (i >= USER_PTRS_PER_PGD) {
+- memcpy(pmds[i],
+- (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
+- sizeof(pmd_t) * PTRS_PER_PMD);
+- make_lowmem_page_readonly(
+- pmds[i], XENFEAT_writable_page_tables);
+- }
+-
+- /* It is safe to poke machine addresses of pmds under the pgd_lock. */
+- pud_populate(mm, pud, pmds[i]);
+- }
+-
+- /* List required to sync kernel mapping updates and
+- * to pin/unpin on save/restore. */
+- pgd_list_add(pgd);
+-
+- spin_unlock_irqrestore(&pgd_lock, flags);
+-
+- return 1;
+-}
+-#else /* !CONFIG_X86_PAE */
+-/* No need to prepopulate any pagetable entries in non-PAE modes. */
+-static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
+-{
+- return 1;
-}
-
--static inline int
--dma_map_sg(struct device *hwdev, struct scatterlist *sg, int nents, int direction)
+-static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
-{
-- BUG_ON(!valid_dma_direction(direction));
-- return dma_ops->map_sg(hwdev, sg, nents, direction);
-}
+-#endif /* CONFIG_X86_PAE */
-
--static inline void
--dma_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents,
-- int direction)
+-pgd_t *pgd_alloc(struct mm_struct *mm)
-{
-- BUG_ON(!valid_dma_direction(direction));
-- dma_ops->unmap_sg(hwdev, sg, nents, direction);
+- pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+-
+- /* so that alloc_pd can use it */
+- mm->pgd = pgd;
+- if (pgd)
+- pgd_ctor(pgd);
+-
+- if (pgd && !pgd_prepopulate_pmd(mm, pgd)) {
+- free_page((unsigned long)pgd);
+- pgd = NULL;
+- }
+-
+- return pgd;
-}
-
--extern int dma_supported(struct device *hwdev, u64 mask);
+-void pgd_free(struct mm_struct *mm, pgd_t *pgd)
+-{
+- /*
+- * After this the pgd should not be pinned for the duration of this
+- * function's execution. We should never sleep and thus never race:
+- * 1. User pmds will not become write-protected under our feet due
+- * to a concurrent mm_pin_all().
+- * 2. The machine addresses in PGD entries will not become invalid
+- * due to a concurrent save/restore.
+- */
+- pgd_dtor(pgd);
+-
+- if (PTRS_PER_PMD > 1 && !xen_feature(XENFEAT_pae_pgdir_above_4gb))
+- xen_destroy_contiguous_region((unsigned long)pgd, 0);
+-
+- pgd_mop_up_pmds(mm, pgd);
+- free_page((unsigned long)pgd);
+-}
-
--/* same for gart, swiotlb, and nommu */
--static inline int dma_get_cache_alignment(void)
+-void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
-{
-- return boot_cpu_data.x86_clflush_size;
+- pgtable_page_dtor(pte);
+- paravirt_release_pt(page_to_pfn(pte));
+- tlb_remove_page(tlb, pte);
-}
-
--#define dma_is_consistent(d, h) 1
--
--extern int dma_set_mask(struct device *dev, u64 mask);
+-#ifdef CONFIG_X86_PAE
-
--static inline void
--dma_cache_sync(struct device *dev, void *vaddr, size_t size,
-- enum dma_data_direction dir)
+-void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
-{
-- flush_write_buffers();
+- paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
+- tlb_remove_page(tlb, virt_to_page(pmd));
-}
-
--extern struct device fallback_dev;
--extern int panic_on_overflow;
-#endif
-
--#endif /* _X8664_DMA_MAPPING_H */
+ void make_lowmem_page_readonly(void *va, unsigned int feature)
+ {
+ pte_t *pte;
+--- sle11-2009-05-14.orig/arch/x86/pci/i386.c 2009-05-14 10:56:29.000000000 +0200
++++ sle11-2009-05-14/arch/x86/pci/i386.c 2009-05-14 11:20:29.000000000 +0200
+@@ -331,10 +331,14 @@ int pci_mmap_page_range(struct pci_dev *
+ flags);
+ }
+
++#ifndef CONFIG_XEN
+ if (((vma->vm_pgoff < max_low_pfn_mapped) ||
+ (vma->vm_pgoff >= (1UL<<(32 - PAGE_SHIFT)) &&
+ vma->vm_pgoff < max_pfn_mapped)) &&
+ ioremap_change_attr((unsigned long)__va(addr), len, flags)) {
++#else
++ if (ioremap_check_change_attr(vma->vm_pgoff, len, flags)) {
++#endif
+ free_memtype(addr, addr + len);
+ return -EINVAL;
+ }
+--- sle11-2009-05-14.orig/arch/x86/pci/irq-xen.c 2009-03-16 16:33:40.000000000 +0100
++++ sle11-2009-05-14/arch/x86/pci/irq-xen.c 2009-03-16 16:38:05.000000000 +0100
+@@ -140,9 +140,11 @@ static void __init pirq_peer_trick(void)
+ busmap[e->bus] = 1;
+ }
+ for(i = 1; i < 256; i++) {
++ int node;
+ if (!busmap[i] || pci_find_bus(0, i))
+ continue;
+- if (pci_scan_bus_with_sysdata(i))
++ node = get_mp_bus_to_node(i);
++ if (pci_scan_bus_on_node(i, &pci_root_ops, node))
+ printk(KERN_INFO "PCI: Discovered primary peer "
+ "bus %02x [IRQ]\n", i);
+ }
+@@ -204,7 +206,7 @@ static int pirq_ali_get(struct pci_dev *
+ {
+ static const unsigned char irqmap[16] = { 0, 9, 3, 10, 4, 5, 7, 6, 1, 11, 0, 12, 0, 14, 0, 15 };
+
+- WARN_ON_ONCE(pirq >= 16);
++ WARN_ON_ONCE(pirq > 16);
+ return irqmap[read_config_nybble(router, 0x48, pirq-1)];
+ }
+
+@@ -213,7 +215,7 @@ static int pirq_ali_set(struct pci_dev *
+ static const unsigned char irqmap[16] = { 0, 8, 0, 2, 4, 5, 7, 6, 0, 1, 3, 9, 11, 0, 13, 15 };
+ unsigned int val = irqmap[irq];
+
+- WARN_ON_ONCE(pirq >= 16);
++ WARN_ON_ONCE(pirq > 16);
+ if (val) {
+ write_config_nybble(router, 0x48, pirq-1, val);
+ return 1;
+@@ -264,7 +266,7 @@ static int pirq_via586_get(struct pci_de
+ {
+ static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 };
+
+- WARN_ON_ONCE(pirq >= 5);
++ WARN_ON_ONCE(pirq > 5);
+ return read_config_nybble(router, 0x55, pirqmap[pirq-1]);
+ }
+
+@@ -272,7 +274,7 @@ static int pirq_via586_set(struct pci_de
+ {
+ static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 };
+
+- WARN_ON_ONCE(pirq >= 5);
++ WARN_ON_ONCE(pirq > 5);
+ write_config_nybble(router, 0x55, pirqmap[pirq-1], irq);
+ return 1;
+ }
+@@ -286,7 +288,7 @@ static int pirq_ite_get(struct pci_dev *
+ {
+ static const unsigned char pirqmap[4] = { 1, 0, 2, 3 };
+
+- WARN_ON_ONCE(pirq >= 4);
++ WARN_ON_ONCE(pirq > 4);
+ return read_config_nybble(router,0x43, pirqmap[pirq-1]);
+ }
+
+@@ -294,7 +296,7 @@ static int pirq_ite_set(struct pci_dev *
+ {
+ static const unsigned char pirqmap[4] = { 1, 0, 2, 3 };
+
+- WARN_ON_ONCE(pirq >= 4);
++ WARN_ON_ONCE(pirq > 4);
+ write_config_nybble(router, 0x43, pirqmap[pirq-1], irq);
+ return 1;
+ }
+@@ -623,6 +625,13 @@ static __init int via_router_probe(struc
+ */
+ device = PCI_DEVICE_ID_VIA_8235;
+ break;
++ case PCI_DEVICE_ID_VIA_8237:
++ /**
++ * Asus a7v600 bios wrongly reports 8237
++ * as 586-compatible
++ */
++ device = PCI_DEVICE_ID_VIA_8237;
++ break;
+ }
+ }
+
+--- sle11-2009-05-14.orig/arch/x86/vdso/vdso32-setup-xen.c 2009-03-16 16:33:40.000000000 +0100
++++ sle11-2009-05-14/arch/x86/vdso/vdso32-setup-xen.c 2009-03-16 16:38:05.000000000 +0100
+@@ -164,7 +164,7 @@ static __init void relocate_vdso(Elf32_E
+ Elf32_Shdr *shdr;
+ int i;
+
+- BUG_ON(memcmp(ehdr->e_ident, ELFMAG, 4) != 0 ||
++ BUG_ON(memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0 ||
+ !elf_check_arch_ia32(ehdr) ||
+ ehdr->e_type != ET_DYN);
+
+@@ -233,8 +233,12 @@ void syscall32_cpu_init(void)
+ BUG();
+ #endif
+
+- if (use_sysenter < 0)
+- use_sysenter = (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL);
++ if (use_sysenter < 0) {
++ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
++ use_sysenter = 1;
++ if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR)
++ use_sysenter = 1;
++ }
+ }
+
+ #define compat_uses_vma 1
+@@ -337,8 +341,6 @@ int __init sysenter_setup(void)
+
+ #ifdef CONFIG_X86_32
+ gate_vma_init();
-
--#include "dma-mapping_32.h"
---- a/include/asm-x86/mach-xen/asm/dma-mapping.h
-+++ b/include/asm-x86/mach-xen/asm/dma-mapping.h
-@@ -1,5 +1,17 @@
--#ifdef CONFIG_X86_32
--# include "dma-mapping_32.h"
--#else
--# include "dma-mapping_64.h"
--#endif
-+#ifndef _ASM_DMA_MAPPING_H_
-+
-+#include "../../dma-mapping.h"
+- printk("Compat vDSO mapped to %08lx.\n", __fix_to_virt(FIX_VDSO));
+ #endif
+
+ #if defined(CONFIG_X86_64) && CONFIG_XEN_COMPAT < 0x030200
+@@ -383,6 +385,9 @@ int arch_setup_additional_pages(struct l
+ int ret = 0;
+ bool compat;
+
++ if (vdso_enabled == VDSO_DISABLED)
++ return 0;
+
-+static inline int
-+address_needs_mapping(struct device *hwdev, dma_addr_t addr)
+ down_write(&mm->mmap_sem);
+
+ /* Test compat mode once here, in case someone
+--- sle11-2009-05-14.orig/drivers/acpi/processor_core.c 2009-02-16 15:58:14.000000000 +0100
++++ sle11-2009-05-14/drivers/acpi/processor_core.c 2009-03-16 16:38:05.000000000 +0100
+@@ -657,7 +657,7 @@ static int acpi_processor_get_info(struc
+ * of /proc/cpuinfo
+ */
+ status = acpi_evaluate_object(pr->handle, "_SUN", NULL, &buffer);
+- if (ACPI_SUCCESS(status))
++ if (ACPI_SUCCESS(status) && pr->id != -1)
+ arch_fix_phys_package_id(pr->id, object.integer.value);
+
+ return 0;
+--- sle11-2009-05-14.orig/drivers/input/xen-kbdfront.c 2009-05-14 10:56:29.000000000 +0200
++++ sle11-2009-05-14/drivers/input/xen-kbdfront.c 2009-03-16 16:38:05.000000000 +0100
+@@ -325,7 +325,6 @@ static struct xenbus_device_id xenkbd_id
+
+ static struct xenbus_driver xenkbd = {
+ .name = "vkbd",
+- .owner = THIS_MODULE,
+ .ids = xenkbd_ids,
+ .probe = xenkbd_probe,
+ .remove = xenkbd_remove,
+--- sle11-2009-05-14.orig/drivers/oprofile/cpu_buffer.c 2009-03-12 16:15:32.000000000 +0100
++++ sle11-2009-05-14/drivers/oprofile/cpu_buffer.c 2009-03-16 16:38:05.000000000 +0100
+@@ -341,7 +341,7 @@ void oprofile_add_mode(int cpu_mode)
+
+ int oprofile_add_domain_switch(int32_t domain_id)
+ {
+- struct oprofile_cpu_buffer * cpu_buf = &cpu_buffer[smp_processor_id()];
++ struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(cpu_buffer);
+
+ /* should have space for switching into and out of domain
+ (2 slots each) plus one sample and one cpu mode switch */
+--- sle11-2009-05-14.orig/drivers/pci/msi-xen.c 2009-03-16 16:33:40.000000000 +0100
++++ sle11-2009-05-14/drivers/pci/msi-xen.c 2009-03-16 16:38:05.000000000 +0100
+@@ -583,7 +583,7 @@ int pci_enable_msi(struct pci_dev* dev)
+ EXPORT_SYMBOL(pci_enable_msi);
+
+ extern void pci_frontend_disable_msi(struct pci_dev* dev);
+-void pci_disable_msi(struct pci_dev* dev)
++void pci_msi_shutdown(struct pci_dev* dev)
+ {
+ int pirq;
+
+@@ -612,6 +612,10 @@ void pci_disable_msi(struct pci_dev* dev
+ pci_intx_for_msi(dev, 1);
+ dev->msi_enabled = 0;
+ }
++void pci_disable_msi(struct pci_dev* dev)
+{
-+ dma_addr_t mask = 0xffffffff;
-+ /* If the device has a mask, use it, otherwise default to 32 bits */
-+ if (hwdev && hwdev->dma_mask)
-+ mask = *hwdev->dma_mask;
-+ return (addr & ~mask) != 0;
++ pci_msi_shutdown(dev);
+}
-+
-+extern int range_straddles_page_boundary(paddr_t p, size_t size);
-+
-+#endif /* _ASM_DMA_MAPPING_H_ */
---- a/include/asm-x86/mach-xen/asm/fixmap_32.h
-+++ b/include/asm-x86/mach-xen/asm/fixmap_32.h
-@@ -10,8 +10,8 @@
- * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
- */
-
--#ifndef _ASM_FIXMAP_H
--#define _ASM_FIXMAP_H
-+#ifndef _ASM_FIXMAP_32_H
-+#define _ASM_FIXMAP_32_H
+ EXPORT_SYMBOL(pci_disable_msi);
- /* used by vmalloc.c, vsyscall.lds.S.
- *
-@@ -102,8 +102,7 @@ enum fixed_addresses {
- */
- #define NR_FIX_BTMAPS 64
- #define FIX_BTMAPS_NESTING 4
-- FIX_BTMAP_END =
-- __end_of_permanent_fixed_addresses + 512 -
-+ FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 512 -
- (__end_of_permanent_fixed_addresses & 511),
- FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_NESTING - 1,
- FIX_WP_TEST,
-@@ -114,19 +113,16 @@ enum fixed_addresses {
- };
+ /**
+@@ -714,7 +718,7 @@ int pci_enable_msix(struct pci_dev* dev,
+ EXPORT_SYMBOL(pci_enable_msix);
- extern void __set_fixmap(enum fixed_addresses idx,
-- maddr_t phys, pgprot_t flags);
-+ maddr_t phys, pgprot_t flags);
- extern void reserve_top_address(unsigned long reserve);
+ extern void pci_frontend_disable_msix(struct pci_dev* dev);
+-void pci_disable_msix(struct pci_dev* dev)
++void pci_msix_shutdown(struct pci_dev* dev)
+ {
+ if (!pci_msi_enable)
+ return;
+@@ -751,6 +755,10 @@ void pci_disable_msix(struct pci_dev* de
+ pci_intx_for_msi(dev, 1);
+ dev->msix_enabled = 0;
+ }
++void pci_disable_msix(struct pci_dev* dev)
++{
++ pci_msix_shutdown(dev);
++}
+ EXPORT_SYMBOL(pci_disable_msix);
--#define set_fixmap(idx, phys) \
-- __set_fixmap(idx, phys, PAGE_KERNEL)
-+#define set_fixmap(idx, phys) \
-+ __set_fixmap(idx, phys, PAGE_KERNEL)
- /*
- * Some hardware wants to get fixmapped without caching.
- */
--#define set_fixmap_nocache(idx, phys) \
-- __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
--
--#define clear_fixmap(idx) \
-- __set_fixmap(idx, 0, __pgprot(0))
-+#define set_fixmap_nocache(idx, phys) \
-+ __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
+ /**
+--- sle11-2009-05-14.orig/drivers/video/Kconfig 2009-02-16 15:58:02.000000000 +0100
++++ sle11-2009-05-14/drivers/video/Kconfig 2009-03-16 16:38:05.000000000 +0100
+@@ -2029,7 +2029,7 @@ config FB_VIRTUAL
- #define FIXADDR_TOP ((unsigned long)__FIXADDR_TOP)
+ config XEN_FBDEV_FRONTEND
+ tristate "Xen virtual frame buffer support"
+- depends on FB && XEN
++ depends on FB && PARAVIRT_XEN
+ select FB_SYS_FILLRECT
+ select FB_SYS_COPYAREA
+ select FB_SYS_IMAGEBLIT
+--- sle11-2009-05-14.orig/drivers/video/xen-fbfront.c 2009-05-14 10:56:29.000000000 +0200
++++ sle11-2009-05-14/drivers/video/xen-fbfront.c 2009-03-16 16:38:05.000000000 +0100
+@@ -670,7 +670,6 @@ static struct xenbus_device_id xenfb_ids
-@@ -159,7 +155,7 @@ static __always_inline unsigned long fix
- if (idx >= __end_of_fixed_addresses)
- __this_fixmap_does_not_exist();
+ static struct xenbus_driver xenfb = {
+ .name = "vfb",
+- .owner = THIS_MODULE,
+ .ids = xenfb_ids,
+ .probe = xenfb_probe,
+ .remove = xenfb_remove,
+--- sle11-2009-05-14.orig/drivers/xen/Kconfig 2009-03-04 11:28:34.000000000 +0100
++++ sle11-2009-05-14/drivers/xen/Kconfig 2009-03-16 16:38:05.000000000 +0100
+@@ -2,8 +2,6 @@
+ # This Kconfig describe xen options
+ #
-- return __fix_to_virt(idx);
-+ return __fix_to_virt(idx);
- }
+-mainmenu "Xen Configuration"
+-
+ config XEN
+ bool
- static inline unsigned long virt_to_fix(const unsigned long vaddr)
---- a/include/asm-x86/mach-xen/asm/fixmap_64.h
-+++ b/include/asm-x86/mach-xen/asm/fixmap_64.h
-@@ -8,8 +8,8 @@
- * Copyright (C) 1998 Ingo Molnar
- */
+--- sle11-2009-05-14.orig/drivers/xen/Makefile 2009-02-16 16:17:21.000000000 +0100
++++ sle11-2009-05-14/drivers/xen/Makefile 2009-03-16 16:38:05.000000000 +0100
+@@ -1,5 +1,8 @@
+-obj-$(CONFIG_PARAVIRT_XEN) += grant-table.o
++obj-$(CONFIG_PARAVIRT_XEN) += grant-table.o features.o events.o
++xen-xencomm-$(CONFIG_PARAVIRT_XEN) := xencomm.o
++xen-balloon-$(CONFIG_PARAVIRT_XEN) := balloon.o
--#ifndef _ASM_FIXMAP_H
--#define _ASM_FIXMAP_H
-+#ifndef _ASM_FIXMAP_64_H
-+#define _ASM_FIXMAP_64_H
++xen-balloon-$(CONFIG_XEN) := balloon/
+ obj-$(CONFIG_XEN) += core/
+ obj-$(CONFIG_XEN) += console/
+ obj-$(CONFIG_XEN) += evtchn/
+@@ -7,7 +10,8 @@ obj-y += xenbus/
+ obj-$(CONFIG_XEN) += char/
- #include <linux/kernel.h>
- #include <asm/apicdef.h>
-@@ -35,7 +35,8 @@
+ obj-$(CONFIG_XEN) += util.o
+-obj-$(CONFIG_XEN_BALLOON) += balloon/
++obj-$(CONFIG_XEN_XENCOMM) += $(xen-xencomm-y)
++obj-$(CONFIG_XEN_BALLOON) += $(xen-balloon-y)
+ obj-$(CONFIG_XEN_BLKDEV_BACKEND) += blkback/
+ obj-$(CONFIG_XEN_BLKDEV_TAP) += blktap/
+ obj-$(CONFIG_XEN_NETDEV_BACKEND) += netback/
+--- sle11-2009-05-14.orig/drivers/xen/blkfront/blkfront.c 2009-03-24 10:12:53.000000000 +0100
++++ sle11-2009-05-14/drivers/xen/blkfront/blkfront.c 2009-05-19 10:38:53.000000000 +0200
+@@ -285,7 +285,11 @@ static void backend_changed(struct xenbu
+ break;
- enum fixed_addresses {
- VSYSCALL_LAST_PAGE,
-- VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1,
-+ VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE
-+ + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1,
- VSYSCALL_HPET,
- FIX_DBGP_BASE,
- FIX_EARLYCON_MEM_BASE,
-@@ -45,11 +46,12 @@ enum fixed_addresses {
- #endif
- #ifndef CONFIG_XEN
- FIX_IO_APIC_BASE_0,
-- FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1,
-+ FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1,
- #endif
- #ifdef CONFIG_EFI
- FIX_EFI_IO_MAP_LAST_PAGE,
-- FIX_EFI_IO_MAP_FIRST_PAGE = FIX_EFI_IO_MAP_LAST_PAGE+MAX_EFI_IO_PAGES-1,
-+ FIX_EFI_IO_MAP_FIRST_PAGE = FIX_EFI_IO_MAP_LAST_PAGE
-+ + MAX_EFI_IO_PAGES - 1,
- #endif
- #ifdef CONFIG_ACPI
- FIX_ACPI_BEGIN,
-@@ -79,19 +81,16 @@ enum fixed_addresses {
- __end_of_fixed_addresses
- };
+ case XenbusStateClosing:
+- bd = bdget(info->dev);
++ if (!info->gd) {
++ xenbus_frontend_closed(dev);
++ break;
++ }
++ bd = bdget_disk(info->gd, 0);
+ if (bd == NULL)
+ xenbus_dev_fatal(dev, -ENODEV, "bdget failed");
--extern void __set_fixmap (enum fixed_addresses idx,
-- unsigned long phys, pgprot_t flags);
-+extern void __set_fixmap(enum fixed_addresses idx,
-+ unsigned long phys, pgprot_t flags);
+--- sle11-2009-05-14.orig/drivers/xen/blkfront/block.h 2009-03-24 10:11:58.000000000 +0100
++++ sle11-2009-05-14/drivers/xen/blkfront/block.h 2009-03-16 16:38:05.000000000 +0100
+@@ -96,7 +96,6 @@ struct blk_shadow {
+ struct blkfront_info
+ {
+ struct xenbus_device *xbdev;
+- dev_t dev;
+ struct gendisk *gd;
+ int vdevice;
+ blkif_vdev_t handle;
+--- sle11-2009-05-14.orig/drivers/xen/blkfront/vbd.c 2009-02-16 16:17:21.000000000 +0100
++++ sle11-2009-05-14/drivers/xen/blkfront/vbd.c 2009-03-16 16:38:05.000000000 +0100
+@@ -246,17 +246,32 @@ xlvbd_init_blk_queue(struct gendisk *gd,
+ return 0;
+ }
--#define set_fixmap(idx, phys) \
-- __set_fixmap(idx, phys, PAGE_KERNEL)
-+#define set_fixmap(idx, phys) \
-+ __set_fixmap(idx, phys, PAGE_KERNEL)
- /*
- * Some hardware wants to get fixmapped without caching.
- */
--#define set_fixmap_nocache(idx, phys) \
-- __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
--
--#define clear_fixmap(idx) \
-- __set_fixmap(idx, 0, __pgprot(0))
-+#define set_fixmap_nocache(idx, phys) \
-+ __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
+-static int
+-xlvbd_alloc_gendisk(int major, int minor, blkif_sector_t capacity, int vdevice,
+- u16 vdisk_info, u16 sector_size,
+- struct blkfront_info *info)
++int
++xlvbd_add(blkif_sector_t capacity, int vdevice, u16 vdisk_info,
++ u16 sector_size, struct blkfront_info *info)
+ {
++ int major, minor;
+ struct gendisk *gd;
+ struct xlbd_major_info *mi;
+ int nr_minors = 1;
+ int err = -ENODEV;
+ unsigned int offset;
- #define FIXADDR_TOP (VSYSCALL_END-PAGE_SIZE)
- #define FIXADDR_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
---- a/include/asm-x86/mach-xen/asm/fixmap.h
-+++ b/include/asm-x86/mach-xen/asm/fixmap.h
-@@ -1,5 +1,13 @@
-+#ifndef _ASM_FIXMAP_H
-+#define _ASM_FIXMAP_H
-+
- #ifdef CONFIG_X86_32
- # include "fixmap_32.h"
- #else
- # include "fixmap_64.h"
- #endif
++ if ((vdevice>>EXT_SHIFT) > 1) {
++ /* this is above the extended range; something is wrong */
++ printk(KERN_WARNING "blkfront: vdevice 0x%x is above the extended range; ignoring\n", vdevice);
++ return -ENODEV;
++ }
+
-+#define clear_fixmap(idx) \
-+ __set_fixmap(idx, 0, __pgprot(0))
++ if (!VDEV_IS_EXTENDED(vdevice)) {
++ major = BLKIF_MAJOR(vdevice);
++ minor = BLKIF_MINOR(vdevice);
++ }
++ else {
++ major = 202;
++ minor = BLKIF_MINOR_EXT(vdevice);
++ }
+
-+#endif
---- a/include/asm-x86/mach-xen/asm/highmem.h
-+++ b/include/asm-x86/mach-xen/asm/highmem.h
-@@ -8,7 +8,7 @@
- * Gerhard.Wichert@pdb.siemens.de
- *
- *
-- * Redesigned the x86 32-bit VM architecture to deal with
-+ * Redesigned the x86 32-bit VM architecture to deal with
- * up to 16 Terabyte physical memory. With current x86 CPUs
- * we now support up to 64 Gigabytes physical RAM.
- *
---- a/include/asm-x86/mach-xen/asm/io_32.h
-+++ b/include/asm-x86/mach-xen/asm/io_32.h
-@@ -50,12 +50,6 @@
- #include <asm/fixmap.h>
+ BUG_ON(info->gd != NULL);
+ BUG_ON(info->mi != NULL);
+ BUG_ON(info->rq != NULL);
+@@ -337,41 +352,6 @@ xlvbd_alloc_gendisk(int major, int minor
+ return err;
+ }
- /*
-- * Convert a physical pointer to a virtual kernel pointer for /dev/mem
-- * access
-- */
--#define xlate_dev_mem_ptr(p) __va(p)
+-int
+-xlvbd_add(blkif_sector_t capacity, int vdevice, u16 vdisk_info,
+- u16 sector_size, struct blkfront_info *info)
+-{
+- struct block_device *bd;
+- int err = 0;
+- int major, minor;
-
--/*
- * Convert a virtual cached pointer to an uncached pointer
- */
- #define xlate_dev_kmem_ptr(p) p
-@@ -66,14 +60,14 @@
- *
- * The returned physical address is the physical (CPU) mapping for
- * the memory address given. It is only valid to use this function on
-- * addresses directly mapped or allocated via kmalloc.
-+ * addresses directly mapped or allocated via kmalloc.
- *
- * This function does not give bus mappings for DMA transfers. In
- * almost all conceivable cases a device driver should not be using
- * this function
- */
--
--static inline unsigned long virt_to_phys(volatile void * address)
-+
-+static inline unsigned long virt_to_phys(volatile void *address)
+- if ((vdevice>>EXT_SHIFT) > 1) {
+- /* this is above the extended range; something is wrong */
+- printk(KERN_WARNING "blkfront: vdevice 0x%x is above the extended range; ignoring\n", vdevice);
+- return -ENODEV;
+- }
+-
+- if (!VDEV_IS_EXTENDED(vdevice)) {
+- major = BLKIF_MAJOR(vdevice);
+- minor = BLKIF_MINOR(vdevice);
+- }
+- else {
+- major = 202;
+- minor = BLKIF_MINOR_EXT(vdevice);
+- }
+-
+- info->dev = MKDEV(major, minor);
+- bd = bdget(info->dev);
+- if (bd == NULL)
+- return -ENODEV;
+-
+- err = xlvbd_alloc_gendisk(major, minor, capacity, vdevice, vdisk_info,
+- sector_size, info);
+-
+- bdput(bd);
+- return err;
+-}
+-
+ void
+ xlvbd_del(struct blkfront_info *info)
+ {
+--- sle11-2009-05-14.orig/drivers/xen/blktap/blktap.c 2009-04-20 11:38:54.000000000 +0200
++++ sle11-2009-05-14/drivers/xen/blktap/blktap.c 2009-04-20 11:40:14.000000000 +0200
+@@ -111,6 +111,7 @@ typedef struct tap_blkif {
+ unsigned long mode; /*current switching mode */
+ int minor; /*Minor number for tapdisk device */
+ pid_t pid; /*tapdisk process id */
++ struct pid_namespace *pid_ns; /*... and its corresponding namespace */
+ enum { RUNNING, CLEANSHUTDOWN } status; /*Detect a clean userspace
+ shutdown */
+ unsigned long *idx_map; /*Record the user ring id to kern
+@@ -299,16 +300,14 @@ struct tap_vma_priv {
+ struct page *map[];
+ };
+
+-static struct page *blktap_nopage(struct vm_area_struct *vma,
+- unsigned long address,
+- int *type)
++static int blktap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
- return __pa(address);
+ /*
+ * if the page has not been mapped in by the driver then return
+- * NOPAGE_SIGBUS to the domain.
++ * VM_FAULT_SIGBUS to the domain.
+ */
+
+- return NOPAGE_SIGBUS;
++ return VM_FAULT_SIGBUS;
}
-@@ -91,7 +85,7 @@ static inline unsigned long virt_to_phys
- * this function
- */
--static inline void * phys_to_virt(unsigned long address)
-+static inline void *phys_to_virt(unsigned long address)
- {
- return __va(address);
+ static pte_t blktap_clear_pte(struct vm_area_struct *vma,
+@@ -404,7 +403,7 @@ static void blktap_vma_close(struct vm_a
}
-@@ -152,11 +146,6 @@ extern void *early_ioremap(unsigned long
- extern void early_iounmap(void *addr, unsigned long size);
- extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
--/* Use early IO mappings for DMI because it's initialized early */
--#define dmi_ioremap early_ioremap
--#define dmi_iounmap early_iounmap
--#define dmi_alloc alloc_bootmem
--
- /*
- * ISA I/O bus memory addresses are 1:1 with the physical address.
- */
-@@ -182,16 +171,19 @@ extern void __iomem *fix_ioremap(unsigne
+ struct vm_operations_struct blktap_vm_ops = {
+- nopage: blktap_nopage,
++ fault: blktap_fault,
+ zap_pte: blktap_clear_pte,
+ close: blktap_vma_close,
+ };
+@@ -498,9 +497,8 @@ found:
+ tapfds[minor] = info;
- static inline unsigned char readb(const volatile void __iomem *addr)
- {
-- return *(volatile unsigned char __force *) addr;
-+ return *(volatile unsigned char __force *)addr;
- }
-+
- static inline unsigned short readw(const volatile void __iomem *addr)
- {
-- return *(volatile unsigned short __force *) addr;
-+ return *(volatile unsigned short __force *)addr;
- }
-+
- static inline unsigned int readl(const volatile void __iomem *addr)
- {
- return *(volatile unsigned int __force *) addr;
- }
-+
- #define readb_relaxed(addr) readb(addr)
- #define readw_relaxed(addr) readw(addr)
- #define readl_relaxed(addr) readl(addr)
-@@ -201,15 +193,17 @@ static inline unsigned int readl(const v
+ if ((class = get_xen_class()) != NULL)
+- class_device_create(class, NULL,
+- MKDEV(blktap_major, minor), NULL,
+- "blktap%d", minor);
++ device_create(class, NULL, MKDEV(blktap_major, minor),
++ "blktap%d", minor);
+ }
- static inline void writeb(unsigned char b, volatile void __iomem *addr)
- {
-- *(volatile unsigned char __force *) addr = b;
-+ *(volatile unsigned char __force *)addr = b;
- }
-+
- static inline void writew(unsigned short b, volatile void __iomem *addr)
- {
-- *(volatile unsigned short __force *) addr = b;
-+ *(volatile unsigned short __force *)addr = b;
+ out:
+@@ -542,7 +540,7 @@ void signal_tapdisk(int idx)
+ return;
+
+ if (info->pid > 0) {
+- ptask = find_task_by_pid(info->pid);
++ ptask = find_task_by_pid_ns(info->pid, info->pid_ns);
+ if (ptask)
+ info->status = CLEANSHUTDOWN;
+ }
+@@ -770,8 +768,9 @@ static int blktap_ioctl(struct inode *in
+ {
+ if (info) {
+ info->pid = (pid_t)arg;
+- DPRINTK("blktap: pid received %d\n",
+- info->pid);
++ info->pid_ns = current->nsproxy->pid_ns;
++ DPRINTK("blktap: pid received %p:%d\n",
++ info->pid_ns, info->pid);
+ }
+ return 0;
+ }
+@@ -1684,9 +1683,7 @@ static int __init blkif_init(void)
+ * We only create the device when a request of a new device is
+ * made.
+ */
+- class_device_create(class, NULL,
+- MKDEV(blktap_major, 0), NULL,
+- "blktap0");
++ device_create(class, NULL, MKDEV(blktap_major, 0), "blktap0");
+ } else {
+ /* this is bad, but not fatal */
+ WPRINTK("blktap: sysfs xen_class not created\n");
+--- sle11-2009-05-14.orig/drivers/xen/char/mem.c 2008-12-15 11:27:22.000000000 +0100
++++ sle11-2009-05-14/drivers/xen/char/mem.c 2009-03-16 16:38:05.000000000 +0100
+@@ -33,6 +33,27 @@ static inline int uncached_access(struct
+ return 0;
}
+
++static inline int range_is_allowed(unsigned long pfn, unsigned long size)
++{
++#ifdef CONFIG_NONPROMISC_DEVMEM
++ u64 from = ((u64)pfn) << PAGE_SHIFT;
++ u64 to = from + size;
++ u64 cursor = from;
+
- static inline void writel(unsigned int b, volatile void __iomem *addr)
- {
-- *(volatile unsigned int __force *) addr = b;
-+ *(volatile unsigned int __force *)addr = b;
- }
- #define __raw_writeb writeb
- #define __raw_writew writew
-@@ -252,12 +246,12 @@ memcpy_toio(volatile void __iomem *dst,
- * 1. Out of order aware processors
- * 2. Accidentally out of order processors (PPro errata #51)
- */
--
++ while (cursor < to) {
++ if (!devmem_is_allowed(pfn)) {
++ printk(KERN_INFO
++ "Program %s tried to access /dev/mem between %Lx->%Lx.\n",
++ current->comm, from, to);
++ return 0;
++ }
++ cursor += PAGE_SIZE;
++ pfn++;
++ }
++#endif
++ return 1;
++}
+
- #if defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE)
+ /*
+ * This funcion reads the *physical* memory. The f_pos points directly to the
+ * memory location.
+@@ -55,6 +76,9 @@ static ssize_t read_mem(struct file * fi
- static inline void flush_write_buffers(void)
- {
-- __asm__ __volatile__ ("lock; addl $0,0(%%esp)": : :"memory");
-+ asm volatile("lock; addl $0,0(%%esp)": : :"memory");
- }
+ sz = min_t(unsigned long, sz, count);
- #else
-@@ -274,7 +268,8 @@ extern void xen_io_delay(void);
- extern int io_delay_type;
- extern void io_delay_init(void);
++ if (!range_is_allowed(p >> PAGE_SHIFT, count))
++ return -EPERM;
++
+ v = ioremap(p, sz);
+ if (IS_ERR(v) || v == NULL) {
+ /*
+@@ -103,6 +127,9 @@ static ssize_t write_mem(struct file * f
--static inline void slow_down_io(void) {
-+static inline void slow_down_io(void)
-+{
- native_io_delay();
- #ifdef REALLY_SLOW_IO
- native_io_delay();
-@@ -283,52 +278,75 @@ static inline void slow_down_io(void) {
- #endif
+ sz = min_t(unsigned long, sz, count);
+
++ if (!range_is_allowed(p >> PAGE_SHIFT, sz))
++ return -EPERM;
++
+ v = ioremap(p, sz);
+ if (v == NULL)
+ break;
+@@ -131,6 +158,23 @@ static ssize_t write_mem(struct file * f
}
--#define __BUILDIO(bwl,bw,type) \
--static inline void out##bwl(unsigned type value, int port) { \
-- out##bwl##_local(value, port); \
--} \
--static inline unsigned type in##bwl(int port) { \
-- return in##bwl##_local(port); \
--}
--
--#define BUILDIO(bwl,bw,type) \
--static inline void out##bwl##_local(unsigned type value, int port) { \
-- __asm__ __volatile__("out" #bwl " %" #bw "0, %w1" : : "a"(value), "Nd"(port)); \
--} \
--static inline unsigned type in##bwl##_local(int port) { \
-- unsigned type value; \
-- __asm__ __volatile__("in" #bwl " %w1, %" #bw "0" : "=a"(value) : "Nd"(port)); \
-- return value; \
--} \
--static inline void out##bwl##_local_p(unsigned type value, int port) { \
-- out##bwl##_local(value, port); \
-- slow_down_io(); \
--} \
--static inline unsigned type in##bwl##_local_p(int port) { \
-- unsigned type value = in##bwl##_local(port); \
-- slow_down_io(); \
-- return value; \
--} \
--__BUILDIO(bwl,bw,type) \
--static inline void out##bwl##_p(unsigned type value, int port) { \
-- out##bwl(value, port); \
-- slow_down_io(); \
--} \
--static inline unsigned type in##bwl##_p(int port) { \
-- unsigned type value = in##bwl(port); \
-- slow_down_io(); \
-- return value; \
--} \
--static inline void outs##bwl(int port, const void *addr, unsigned long count) { \
-- __asm__ __volatile__("rep; outs" #bwl : "+S"(addr), "+c"(count) : "d"(port)); \
--} \
--static inline void ins##bwl(int port, void *addr, unsigned long count) { \
-- __asm__ __volatile__("rep; ins" #bwl : "+D"(addr), "+c"(count) : "d"(port)); \
--}
--
--BUILDIO(b,b,char)
--BUILDIO(w,w,short)
--BUILDIO(l,,int)
-+#define __BUILDIO(bwl, bw, type) \
-+static inline void out##bwl(unsigned type value, int port) \
-+{ \
-+ out##bwl##_local(value, port); \
-+} \
-+ \
-+static inline unsigned type in##bwl(int port) \
-+{ \
-+ return in##bwl##_local(port); \
+ #ifndef ARCH_HAS_DEV_MEM_MMAP_MEM
++static void mmap_mem_open(struct vm_area_struct *vma)
++{
++ map_devmem(vma->vm_pgoff, vma->vm_end - vma->vm_start,
++ vma->vm_page_prot);
+}
+
-+#define BUILDIO(bwl, bw, type) \
-+static inline void out##bwl##_local(unsigned type value, int port) \
-+{ \
-+ asm volatile("out" #bwl " %" #bw "0, %w1" \
-+ : : "a"(value), "Nd"(port)); \
-+} \
-+ \
-+static inline unsigned type in##bwl##_local(int port) \
-+{ \
-+ unsigned type value; \
-+ asm volatile("in" #bwl " %w1, %" #bw "0" \
-+ : "=a"(value) : "Nd"(port)); \
-+ return value; \
-+} \
-+ \
-+static inline void out##bwl##_local_p(unsigned type value, int port) \
-+{ \
-+ out##bwl##_local(value, port); \
-+ slow_down_io(); \
-+} \
-+ \
-+static inline unsigned type in##bwl##_local_p(int port) \
-+{ \
-+ unsigned type value = in##bwl##_local(port); \
-+ slow_down_io(); \
-+ return value; \
-+} \
-+ \
-+__BUILDIO(bwl, bw, type) \
-+ \
-+static inline void out##bwl##_p(unsigned type value, int port) \
-+{ \
-+ out##bwl(value, port); \
-+ slow_down_io(); \
-+} \
-+ \
-+static inline unsigned type in##bwl##_p(int port) \
-+{ \
-+ unsigned type value = in##bwl(port); \
-+ slow_down_io(); \
-+ return value; \
-+} \
-+ \
-+static inline void outs##bwl(int port, const void *addr, unsigned long count) \
-+{ \
-+ asm volatile("rep; outs" #bwl \
-+ : "+S"(addr), "+c"(count) : "d"(port)); \
-+} \
-+ \
-+static inline void ins##bwl(int port, void *addr, unsigned long count) \
-+{ \
-+ asm volatile("rep; ins" #bwl \
-+ : "+D"(addr), "+c"(count) : "d"(port)); \
++static void mmap_mem_close(struct vm_area_struct *vma)
++{
++ unmap_devmem(vma->vm_pgoff, vma->vm_end - vma->vm_start,
++ vma->vm_page_prot);
+}
+
-+BUILDIO(b, b, char)
-+BUILDIO(w, w, short)
-+BUILDIO(l, , int)
-
- /* We will be supplying our own /dev/mem implementation */
- #define ARCH_HAS_DEV_MEM
---- a/include/asm-x86/mach-xen/asm/io_64.h
-+++ b/include/asm-x86/mach-xen/asm/io_64.h
-@@ -55,60 +55,75 @@ static inline void slow_down_io(void)
- /*
- * Talk about misusing macros..
- */
--#define __OUT1(s,x) \
-+#define __OUT1(s, x) \
- static inline void out##s(unsigned x value, unsigned short port) {
-
--#define __OUT2(s,s1,s2) \
--__asm__ __volatile__ ("out" #s " %" s1 "0,%" s2 "1"
-+#define __OUT2(s, s1, s2) \
-+asm volatile ("out" #s " %" s1 "0,%" s2 "1"
-
- #ifndef REALLY_SLOW_IO
- #define REALLY_SLOW_IO
- #define UNSET_REALLY_SLOW_IO
- #endif
++static struct vm_operations_struct mmap_mem_ops = {
++ .open = mmap_mem_open,
++ .close = mmap_mem_close
++};
++
+ static int xen_mmap_mem(struct file * file, struct vm_area_struct * vma)
+ {
+ size_t size = vma->vm_end - vma->vm_start;
+@@ -138,6 +182,15 @@ static int xen_mmap_mem(struct file * fi
+ if (uncached_access(file))
+ vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
--#define __OUT(s,s1,x) \
--__OUT1(s,x) __OUT2(s,s1,"w") : : "a" (value), "Nd" (port)); } \
--__OUT1(s##_p, x) __OUT2(s, s1, "w") : : "a" (value), "Nd" (port)); \
-- slow_down_io(); }
--
--#define __IN1(s) \
--static inline RETURN_TYPE in##s(unsigned short port) { RETURN_TYPE _v;
--
--#define __IN2(s,s1,s2) \
--__asm__ __volatile__ ("in" #s " %" s2 "1,%" s1 "0"
--
--#define __IN(s,s1,i...) \
--__IN1(s) __IN2(s, s1, "w") : "=a" (_v) : "Nd" (port), ##i); return _v; } \
--__IN1(s##_p) __IN2(s, s1, "w") : "=a" (_v) : "Nd" (port), ##i); \
-- slow_down_io(); return _v; }
-+#define __OUT(s, s1, x) \
-+ __OUT1(s, x) __OUT2(s, s1, "w") : : "a" (value), "Nd" (port)); \
-+ } \
-+ __OUT1(s##_p, x) __OUT2(s, s1, "w") : : "a" (value), "Nd" (port)); \
-+ slow_down_io(); \
-+}
++ if (!range_is_allowed(vma->vm_pgoff, size))
++ return -EPERM;
+
-+#define __IN1(s) \
-+static inline RETURN_TYPE in##s(unsigned short port) \
-+{ \
-+ RETURN_TYPE _v;
++ if (!phys_mem_access_prot_allowed(file, vma->vm_pgoff, size,
++ &vma->vm_page_prot))
++ return -EINVAL;
+
-+#define __IN2(s, s1, s2) \
-+ asm volatile ("in" #s " %" s2 "1,%" s1 "0"
++ vma->vm_ops = &mmap_mem_ops;
+
-+#define __IN(s, s1, i...) \
-+ __IN1(s) __IN2(s, s1, "w") : "=a" (_v) : "Nd" (port), ##i); \
-+ return _v; \
-+ } \
-+ __IN1(s##_p) __IN2(s, s1, "w") : "=a" (_v) : "Nd" (port), ##i); \
-+ slow_down_io(); \
-+ return _v; }
+ /* We want to return the real error code, not EAGAIN. */
+ return direct_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
+ size, vma->vm_page_prot, DOMID_IO);
+--- sle11-2009-05-14.orig/drivers/xen/console/console.c 2008-12-15 11:26:44.000000000 +0100
++++ sle11-2009-05-14/drivers/xen/console/console.c 2009-03-16 16:38:05.000000000 +0100
+@@ -552,16 +552,18 @@ static int xencons_write(
+ return i;
+ }
- #ifdef UNSET_REALLY_SLOW_IO
- #undef REALLY_SLOW_IO
- #endif
+-static void xencons_put_char(struct tty_struct *tty, u_char ch)
++static int xencons_put_char(struct tty_struct *tty, u_char ch)
+ {
+ unsigned long flags;
++ int ret;
--#define __INS(s) \
--static inline void ins##s(unsigned short port, void * addr, unsigned long count) \
--{ __asm__ __volatile__ ("rep ; ins" #s \
--: "=D" (addr), "=c" (count) : "d" (port),"0" (addr),"1" (count)); }
--
--#define __OUTS(s) \
--static inline void outs##s(unsigned short port, const void * addr, unsigned long count) \
--{ __asm__ __volatile__ ("rep ; outs" #s \
--: "=S" (addr), "=c" (count) : "d" (port),"0" (addr),"1" (count)); }
-+#define __INS(s) \
-+static inline void ins##s(unsigned short port, void *addr, \
-+ unsigned long count) \
-+{ \
-+ asm volatile ("rep ; ins" #s \
-+ : "=D" (addr), "=c" (count) \
-+ : "d" (port), "0" (addr), "1" (count)); \
-+}
-+
-+#define __OUTS(s) \
-+static inline void outs##s(unsigned short port, const void *addr, \
-+ unsigned long count) \
-+{ \
-+ asm volatile ("rep ; outs" #s \
-+ : "=S" (addr), "=c" (count) \
-+ : "d" (port), "0" (addr), "1" (count)); \
-+}
+ if (DUMMY_TTY(tty))
+- return;
++ return 0;
- #define RETURN_TYPE unsigned char
--__IN(b,"")
-+__IN(b, "")
- #undef RETURN_TYPE
- #define RETURN_TYPE unsigned short
--__IN(w,"")
-+__IN(w, "")
- #undef RETURN_TYPE
- #define RETURN_TYPE unsigned int
--__IN(l,"")
-+__IN(l, "")
- #undef RETURN_TYPE
-
--__OUT(b,"b",char)
--__OUT(w,"w",short)
--__OUT(l,,int)
-+__OUT(b, "b", char)
-+__OUT(w, "w", short)
-+__OUT(l, , int)
-
- __INS(b)
- __INS(w)
-@@ -129,12 +144,12 @@ __OUTS(l)
- * Change virtual addresses to physical addresses and vv.
- * These are pretty trivial
- */
--static inline unsigned long virt_to_phys(volatile void * address)
-+static inline unsigned long virt_to_phys(volatile void *address)
- {
- return __pa(address);
+ spin_lock_irqsave(&xencons_lock, flags);
+- (void)__xencons_put_char(ch);
++ ret = __xencons_put_char(ch);
+ spin_unlock_irqrestore(&xencons_lock, flags);
++ return ret;
}
--static inline void * phys_to_virt(unsigned long address)
-+static inline void *phys_to_virt(unsigned long address)
- {
- return __va(address);
- }
-@@ -216,18 +231,22 @@ static inline __u8 __readb(const volatil
- {
- return *(__force volatile __u8 *)addr;
+ static void xencons_flush_chars(struct tty_struct *tty)
+@@ -583,7 +585,7 @@ static void xencons_wait_until_sent(stru
+ if (DUMMY_TTY(tty))
+ return;
+
+- while (DRV(tty->driver)->chars_in_buffer(tty)) {
++ while (tty_chars_in_buffer(tty)) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule_timeout(1);
+ if (signal_pending(current))
+@@ -632,8 +634,7 @@ static void xencons_close(struct tty_str
+
+ tty->closing = 1;
+ tty_wait_until_sent(tty, 0);
+- if (DRV(tty->driver)->flush_buffer != NULL)
+- DRV(tty->driver)->flush_buffer(tty);
++ tty_driver_flush_buffer(tty);
+ if (tty->ldisc.flush_buffer != NULL)
+ tty->ldisc.flush_buffer(tty);
+ tty->closing = 0;
+--- sle11-2009-05-14.orig/drivers/xen/core/machine_kexec.c 2009-02-17 11:46:41.000000000 +0100
++++ sle11-2009-05-14/drivers/xen/core/machine_kexec.c 2009-03-16 16:38:05.000000000 +0100
+@@ -5,6 +5,7 @@
+
+ #include <linux/kexec.h>
+ #include <xen/interface/kexec.h>
++#include <linux/reboot.h>
+ #include <linux/mm.h>
+ #include <linux/bootmem.h>
+
+@@ -90,6 +91,9 @@ void __init xen_machine_kexec_setup_reso
+ xen_hypervisor_res.start = range.start;
+ xen_hypervisor_res.end = range.start + range.size - 1;
+ xen_hypervisor_res.flags = IORESOURCE_BUSY | IORESOURCE_MEM;
++#ifdef CONFIG_X86_64
++ insert_resource(&iomem_resource, &xen_hypervisor_res);
++#endif
+
+ /* fill in crashk_res if range is reserved by hypervisor */
+
+@@ -102,6 +106,9 @@ void __init xen_machine_kexec_setup_reso
+ if (range.size) {
+ crashk_res.start = range.start;
+ crashk_res.end = range.start + range.size - 1;
++#ifdef CONFIG_X86_64
++ insert_resource(&iomem_resource, &crashk_res);
++#endif
+ }
+
+ /* get physical address of vmcoreinfo */
+@@ -153,11 +160,13 @@ void __init xen_machine_kexec_setup_reso
+ return;
}
-+
- static inline __u16 __readw(const volatile void __iomem *addr)
+
++#ifndef CONFIG_X86_64
+ void __init xen_machine_kexec_register_resources(struct resource *res)
{
- return *(__force volatile __u16 *)addr;
+ request_resource(res, &xen_hypervisor_res);
+ machine_kexec_register_resources(res);
}
-+
- static __always_inline __u32 __readl(const volatile void __iomem *addr)
++#endif
+
+ static void setup_load_arg(xen_kexec_image_t *xki, struct kimage *image)
{
- return *(__force volatile __u32 *)addr;
+@@ -228,6 +237,11 @@ void machine_shutdown(void)
+ /* do nothing */
}
-+
- static inline __u64 __readq(const volatile void __iomem *addr)
+
++void machine_crash_shutdown(struct pt_regs *regs)
++{
++ /* The kernel is broken so disable interrupts */
++ local_irq_disable();
++}
+
+ /*
+ * Local variables:
+--- sle11-2009-05-14.orig/drivers/xen/core/smpboot.c 2009-03-16 16:33:40.000000000 +0100
++++ sle11-2009-05-14/drivers/xen/core/smpboot.c 2009-03-16 16:38:05.000000000 +0100
+@@ -53,17 +53,16 @@ static DEFINE_PER_CPU(int, callfunc_irq)
+ static char resched_name[NR_CPUS][15];
+ static char callfunc_name[NR_CPUS][15];
+
+-u8 cpu_2_logical_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
++#ifdef CONFIG_X86_LOCAL_APIC
++#define set_cpu_to_apicid(cpu, apicid) (per_cpu(x86_cpu_to_apicid, cpu) = (apicid))
++#else
++#define set_cpu_to_apicid(cpu, apicid)
++#endif
+
+ DEFINE_PER_CPU(cpumask_t, cpu_sibling_map);
+ DEFINE_PER_CPU(cpumask_t, cpu_core_map);
+ EXPORT_PER_CPU_SYMBOL(cpu_core_map);
+
+-#if defined(__i386__)
+-DEFINE_PER_CPU(u8, x86_cpu_to_apicid) = BAD_APICID;
+-EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid);
+-#endif
+-
+ void __init prefill_possible_map(void)
{
- return *(__force volatile __u64 *)addr;
+ int i, rc;
+@@ -154,7 +153,7 @@ static int __cpuinit xen_smp_intr_init(u
}
-+
- #define readb(x) __readb(x)
- #define readw(x) __readw(x)
- #define readl(x) __readl(x)
-@@ -247,37 +266,44 @@ static inline void __writel(__u32 b, vol
+
+ #ifdef CONFIG_HOTPLUG_CPU
+-static void xen_smp_intr_exit(unsigned int cpu)
++static void __cpuexit xen_smp_intr_exit(unsigned int cpu)
{
- *(__force volatile __u32 *)addr = b;
+ if (cpu != 0)
+ local_teardown_timer(cpu);
+@@ -263,8 +262,7 @@ void __init smp_prepare_cpus(unsigned in
+ boot_cpu_data.apicid = apicid;
+ cpu_data(0) = boot_cpu_data;
+
+- cpu_2_logical_apicid[0] = apicid;
+- per_cpu(x86_cpu_to_apicid, 0) = apicid;
++ set_cpu_to_apicid(0, apicid);
+
+ current_thread_info()->cpu = 0;
+
+@@ -319,8 +317,7 @@ void __init smp_prepare_cpus(unsigned in
+ cpu_data(cpu).cpu_index = cpu;
+ cpu_data(cpu).apicid = apicid;
+
+- cpu_2_logical_apicid[cpu] = apicid;
+- per_cpu(x86_cpu_to_apicid, cpu) = apicid;
++ set_cpu_to_apicid(cpu, apicid);
+
+ #ifdef __x86_64__
+ cpu_pda(cpu)->pcurrent = idle;
+@@ -375,7 +372,7 @@ static int __init initialize_cpu_present
}
-+
- static inline void __writeq(__u64 b, volatile void __iomem *addr)
+ core_initcall(initialize_cpu_present_map);
+
+-int __cpu_disable(void)
++int __cpuexit __cpu_disable(void)
{
- *(__force volatile __u64 *)addr = b;
+ cpumask_t map = cpu_online_map;
+ unsigned int cpu = smp_processor_id();
+@@ -392,7 +389,7 @@ int __cpu_disable(void)
+ return 0;
}
-+
- static inline void __writeb(__u8 b, volatile void __iomem *addr)
+
+-void __cpu_die(unsigned int cpu)
++void __cpuexit __cpu_die(unsigned int cpu)
{
- *(__force volatile __u8 *)addr = b;
- }
-+
- static inline void __writew(__u16 b, volatile void __iomem *addr)
+ while (HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL)) {
+ current->state = TASK_UNINTERRUPTIBLE;
+--- sle11-2009-05-14.orig/drivers/xen/core/xen_proc.c 2009-05-14 10:56:29.000000000 +0200
++++ sle11-2009-05-14/drivers/xen/core/xen_proc.c 2009-03-16 16:38:05.000000000 +0100
+@@ -8,7 +8,7 @@ static struct proc_dir_entry *xen_base;
+ struct proc_dir_entry *create_xen_proc_entry(const char *name, mode_t mode)
{
- *(__force volatile __u16 *)addr = b;
+ if ( xen_base == NULL )
+- if ( (xen_base = proc_mkdir("xen", &proc_root)) == NULL )
++ if ( (xen_base = proc_mkdir("xen", NULL)) == NULL )
+ panic("Couldn't create /proc/xen");
+ return create_proc_entry(name, mode, xen_base);
}
--#define writeq(val,addr) __writeq((val),(addr))
--#define writel(val,addr) __writel((val),(addr))
--#define writew(val,addr) __writew((val),(addr))
--#define writeb(val,addr) __writeb((val),(addr))
-+
-+#define writeq(val, addr) __writeq((val), (addr))
-+#define writel(val, addr) __writel((val), (addr))
-+#define writew(val, addr) __writew((val), (addr))
-+#define writeb(val, addr) __writeb((val), (addr))
- #define __raw_writeb writeb
- #define __raw_writew writew
- #define __raw_writel writel
- #define __raw_writeq writeq
-
--void __memcpy_fromio(void*,unsigned long,unsigned);
--void __memcpy_toio(unsigned long,const void*,unsigned);
-+void __memcpy_fromio(void *, unsigned long, unsigned);
-+void __memcpy_toio(unsigned long, const void *, unsigned);
-
--static inline void memcpy_fromio(void *to, const volatile void __iomem *from, unsigned len)
-+static inline void memcpy_fromio(void *to, const volatile void __iomem *from,
-+ unsigned len)
- {
-- __memcpy_fromio(to,(unsigned long)from,len);
-+ __memcpy_fromio(to, (unsigned long)from, len);
+--- sle11-2009-05-14.orig/drivers/xen/fbfront/xenfb.c 2009-03-04 11:25:55.000000000 +0100
++++ sle11-2009-05-14/drivers/xen/fbfront/xenfb.c 2009-03-16 16:38:05.000000000 +0100
+@@ -93,7 +93,7 @@ struct xenfb_info
+ * only mappings. The former creates unfaulted pages. Preserves
+ * invariant. The latter removes pages. Preserves invariant.
+ *
+- * 3. Holding both locks: xenfb_vm_nopage(). Extends the dirty
++ * 3. Holding both locks: xenfb_vm_fault(). Extends the dirty
+ * rectangle and updates mappings consistently. Preserves
+ * invariant.
+ *
+@@ -112,13 +112,13 @@ struct xenfb_info
+ *
+ * But FIXME: the invariant is too weak. It misses that the fault
+ * record in mappings must be consistent with the mapping of pages in
+- * the associated address space! do_no_page() updates the PTE after
+- * xenfb_vm_nopage() returns, i.e. outside the critical region. This
++ * the associated address space! __do_fault() updates the PTE after
++ * xenfb_vm_fault() returns, i.e. outside the critical region. This
+ * allows the following race:
+ *
+ * X writes to some address in the Xen frame buffer
+- * Fault - call do_no_page()
+- * call xenfb_vm_nopage()
++ * Fault - call __do_fault()
++ * call xenfb_vm_fault()
+ * grab mm_lock
+ * map->faults++;
+ * release mm_lock
+@@ -387,18 +387,17 @@ static void xenfb_vm_close(struct vm_are
+ mutex_unlock(&info->mm_lock);
}
--static inline void memcpy_toio(volatile void __iomem *to, const void *from, unsigned len)
-+
-+static inline void memcpy_toio(volatile void __iomem *to, const void *from,
-+ unsigned len)
+
+-static struct page *xenfb_vm_nopage(struct vm_area_struct *vma,
+- unsigned long vaddr, int *type)
++static int xenfb_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
-- __memcpy_toio((unsigned long)to,from,len);
-+ __memcpy_toio((unsigned long)to, from, len);
- }
+ struct xenfb_mapping *map = vma->vm_private_data;
+ struct xenfb_info *info = map->info;
+- int pgnr = (vaddr - vma->vm_start) >> PAGE_SHIFT;
++ int pgnr = ((long)vmf->virtual_address - vma->vm_start) >> PAGE_SHIFT;
+ unsigned long flags;
+ struct page *page;
+ int y1, y2;
- void memset_io(volatile void __iomem *a, int b, size_t c);
-@@ -292,18 +318,12 @@ void memset_io(volatile void __iomem *a,
- */
- #define __ISA_IO_base ((char __iomem *)(fix_to_virt(FIX_ISAMAP_BEGIN)))
+ if (pgnr >= info->nr_pages)
+- return NOPAGE_SIGBUS;
++ return VM_FAULT_SIGBUS;
+
+ mutex_lock(&info->mm_lock);
+ spin_lock_irqsave(&info->dirty_lock, flags);
+@@ -414,16 +413,15 @@ static struct page *xenfb_vm_nopage(stru
+ spin_unlock_irqrestore(&info->dirty_lock, flags);
+ mutex_unlock(&info->mm_lock);
--#define flush_write_buffers()
-+#define flush_write_buffers()
+- if (type)
+- *type = VM_FAULT_MINOR;
++ vmf->page = page;
- extern int iommu_bio_merge;
- #define BIO_VMERGE_BOUNDARY iommu_bio_merge
+- return page;
++ return VM_FAULT_MINOR;
+ }
- /*
-- * Convert a physical pointer to a virtual kernel pointer for /dev/mem
-- * access
-- */
--#define xlate_dev_mem_ptr(p) __va(p)
--
--/*
- * Convert a virtual cached pointer to an uncached pointer
- */
- #define xlate_dev_kmem_ptr(p) p
---- a/include/asm-x86/mach-xen/asm/io.h
-+++ b/include/asm-x86/mach-xen/asm/io.h
-@@ -1,5 +1,22 @@
-+#ifndef _ASM_X86_IO_H
-+#define _ASM_X86_IO_H
-+
-+#define ARCH_HAS_IOREMAP_WC
-+
- #ifdef CONFIG_X86_32
- # include "io_32.h"
- #else
- # include "io_64.h"
- #endif
-+
-+extern void *xlate_dev_mem_ptr(unsigned long phys);
-+extern void unxlate_dev_mem_ptr(unsigned long phys, void *addr);
-+
-+extern void map_devmem(unsigned long pfn, unsigned long len, pgprot_t);
-+extern void unmap_devmem(unsigned long pfn, unsigned long len, pgprot_t);
-+
-+extern int ioremap_check_change_attr(unsigned long mfn, unsigned long size,
-+ unsigned long prot_val);
-+extern void __iomem *ioremap_wc(unsigned long offset, unsigned long size);
-+
-+#endif /* _ASM_X86_IO_H */
---- a/include/asm-x86/mach-xen/asm/irqflags.h
-+++ b/include/asm-x86/mach-xen/asm/irqflags.h
-@@ -137,11 +137,11 @@ sysexit_ecrit: /**** END OF SYSEXIT CRIT
- #endif /* __ASSEMBLY__ */
+ static struct vm_operations_struct xenfb_vm_ops = {
+ .open = xenfb_vm_open,
+ .close = xenfb_vm_close,
+- .nopage = xenfb_vm_nopage,
++ .fault = xenfb_vm_fault,
+ };
- #ifndef __ASSEMBLY__
--#define raw_local_save_flags(flags) \
-- do { (flags) = __raw_local_save_flags(); } while (0)
-+#define raw_local_save_flags(flags) \
-+ do { (flags) = __raw_local_save_flags(); } while (0)
+ static int xenfb_mmap(struct fb_info *fb_info, struct vm_area_struct *vma)
+--- sle11-2009-05-14.orig/drivers/xen/gntdev/gntdev.c 2009-03-16 16:33:40.000000000 +0100
++++ sle11-2009-05-14/drivers/xen/gntdev/gntdev.c 2009-03-16 16:38:05.000000000 +0100
+@@ -392,7 +392,7 @@ nomem_out:
+ static int __init gntdev_init(void)
+ {
+ struct class *class;
+- struct class_device *device;
++ struct device *device;
--#define raw_local_irq_save(flags) \
-- do { (flags) = __raw_local_irq_save(); } while (0)
-+#define raw_local_irq_save(flags) \
-+ do { (flags) = __raw_local_irq_save(); } while (0)
+ if (!is_running_on_xen()) {
+ printk(KERN_ERR "You must be running Xen to use gntdev\n");
+@@ -417,8 +417,8 @@ static int __init gntdev_init(void)
+ return 0;
+ }
- static inline int raw_irqs_disabled_flags(unsigned long flags)
+- device = class_device_create(class, NULL, MKDEV(gntdev_major, 0),
+- NULL, GNTDEV_NAME);
++ device = device_create(class, NULL, MKDEV(gntdev_major, 0),
++ GNTDEV_NAME);
+ if (IS_ERR(device)) {
+ printk(KERN_ERR "Error creating gntdev device in xen_class\n");
+ printk(KERN_ERR "gntdev created with major number = %d\n",
+@@ -435,7 +435,7 @@ static void __exit gntdev_exit(void)
{
---- a/include/asm-x86/mach-xen/asm/mmu_context_32.h
-+++ b/include/asm-x86/mach-xen/asm/mmu_context_32.h
-@@ -94,7 +94,7 @@ static inline void switch_mm(struct mm_s
- BUG_ON(per_cpu(cpu_tlbstate, cpu).active_mm != next);
+ struct class *class;
+ if ((class = get_xen_class()) != NULL)
+- class_device_destroy(class, MKDEV(gntdev_major, 0));
++ device_destroy(class, MKDEV(gntdev_major, 0));
+ unregister_chrdev(gntdev_major, GNTDEV_NAME);
+ }
- if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
-- /* We were in lazy tlb mode and leave_mm disabled
-+ /* We were in lazy tlb mode and leave_mm disabled
- * tlb flush IPI delivery. We must reload %cr3.
- */
- load_cr3(next->pgd);
-@@ -107,10 +107,10 @@ static inline void switch_mm(struct mm_s
- #define deactivate_mm(tsk, mm) \
- asm("movl %0,%%gs": :"r" (0));
+--- sle11-2009-05-14.orig/drivers/xen/netfront/netfront.c 2009-03-30 16:39:44.000000000 +0200
++++ sle11-2009-05-14/drivers/xen/netfront/netfront.c 2009-03-30 16:40:17.000000000 +0200
+@@ -1464,8 +1464,7 @@ err:
+ }
+ }
--#define activate_mm(prev, next) \
-- do { \
-- xen_activate_mm(prev, next); \
-- switch_mm((prev),(next),NULL); \
-- } while(0)
-+#define activate_mm(prev, next) \
-+do { \
-+ xen_activate_mm(prev, next); \
-+ switch_mm((prev), (next), NULL); \
-+} while (0)
+- while ((skb = __skb_dequeue(&errq)))
+- kfree_skb(skb);
++ __skb_queue_purge(&errq);
- #endif
---- a/include/asm-x86/mach-xen/asm/mmu_context_64.h
-+++ b/include/asm-x86/mach-xen/asm/mmu_context_64.h
-@@ -21,7 +21,7 @@ void destroy_context(struct mm_struct *m
- static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
- {
- #if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
-- if (read_pda(mmu_state) == TLBSTATE_OK)
-+ if (read_pda(mmu_state) == TLBSTATE_OK)
- write_pda(mmu_state, TLBSTATE_LAZY);
- #endif
+ while ((skb = __skb_dequeue(&rxq)) != NULL) {
+ struct page *page = NETFRONT_SKB_CB(skb)->page;
+@@ -1630,8 +1629,7 @@ static void netif_release_rx_bufs_flip(s
+ }
+ }
+
+- while ((skb = __skb_dequeue(&free_list)) != NULL)
+- dev_kfree_skb(skb);
++ __skb_queue_purge(&free_list);
+
+ spin_unlock_bh(&np->rx_lock);
+ }
+--- sle11-2009-05-14.orig/drivers/xen/privcmd/privcmd.c 2009-03-04 11:28:34.000000000 +0100
++++ sle11-2009-05-14/drivers/xen/privcmd/privcmd.c 2009-03-16 16:38:05.000000000 +0100
+@@ -261,15 +261,13 @@ static long privcmd_ioctl(struct file *f
}
-@@ -62,7 +62,7 @@ extern void mm_pin(struct mm_struct *mm)
- extern void mm_unpin(struct mm_struct *mm);
- void mm_pin_all(void);
--static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
-+static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
- struct task_struct *tsk)
+ #ifndef HAVE_ARCH_PRIVCMD_MMAP
+-static struct page *privcmd_nopage(struct vm_area_struct *vma,
+- unsigned long address,
+- int *type)
++static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
- unsigned cpu = smp_processor_id();
-@@ -106,7 +106,7 @@ static inline void switch_mm(struct mm_s
- if (read_pda(active_mm) != next)
- BUG();
- if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
-- /* We were in lazy tlb mode and leave_mm disabled
-+ /* We were in lazy tlb mode and leave_mm disabled
- * tlb flush IPI delivery. We must reload CR3
- * to make sure to use no freed page tables.
- */
-@@ -118,10 +118,11 @@ static inline void switch_mm(struct mm_s
- #endif
+- return NOPAGE_SIGBUS;
++ return VM_FAULT_SIGBUS;
}
--#define deactivate_mm(tsk,mm) do { \
-- load_gs_index(0); \
-- asm volatile("movl %0,%%fs"::"r"(0)); \
--} while(0)
-+#define deactivate_mm(tsk, mm) \
-+do { \
-+ load_gs_index(0); \
-+ asm volatile("movl %0,%%fs"::"r"(0)); \
-+} while (0)
+ static struct vm_operations_struct privcmd_vm_ops = {
+- .nopage = privcmd_nopage
++ .fault = privcmd_fault
+ };
- static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next)
- {
---- a/include/asm-x86/mach-xen/asm/page_64.h
-+++ b/include/asm-x86/mach-xen/asm/page_64.h
-@@ -5,7 +5,7 @@
+ static int privcmd_mmap(struct file * file, struct vm_area_struct * vma)
+--- sle11-2009-05-14.orig/drivers/xen/xenbus/xenbus_client.c 2009-03-24 10:12:22.000000000 +0100
++++ sle11-2009-05-14/drivers/xen/xenbus/xenbus_client.c 2009-03-24 10:13:17.000000000 +0100
+@@ -442,7 +442,7 @@ int xenbus_map_ring_valloc(struct xenbus
- #define THREAD_ORDER 1
- #define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER)
--#define CURRENT_MASK (~(THREAD_SIZE-1))
-+#define CURRENT_MASK (~(THREAD_SIZE - 1))
+ *vaddr = NULL;
- #define EXCEPTION_STACK_ORDER 0
- #define EXCEPTION_STKSZ (PAGE_SIZE << EXCEPTION_STACK_ORDER)
-@@ -53,10 +53,10 @@
- #define __VIRTUAL_MASK_SHIFT 48
+- area = alloc_vm_area(PAGE_SIZE);
++ area = xen_alloc_vm_area(PAGE_SIZE);
+ if (!area)
+ return -ENOMEM;
- /*
-- * Kernel image size is limited to 128 MB (see level2_kernel_pgt in
-+ * Kernel image size is limited to 512 MB (see level2_kernel_pgt in
- * arch/x86/kernel/head_64.S), and it is mapped here:
- */
--#define KERNEL_IMAGE_SIZE (128*1024*1024)
-+#define KERNEL_IMAGE_SIZE (512 * 1024 * 1024)
- #define KERNEL_IMAGE_START _AC(0xffffffff80000000, UL)
+@@ -452,7 +452,7 @@ int xenbus_map_ring_valloc(struct xenbus
+ BUG();
- #ifndef __ASSEMBLY__
-@@ -64,7 +64,6 @@ void clear_page(void *page);
- void copy_page(void *to, void *from);
+ if (op.status != GNTST_okay) {
+- free_vm_area(area);
++ xen_free_vm_area(area);
+ xenbus_dev_fatal(dev, op.status,
+ "mapping in shared page %d from domain %d",
+ gnt_ref, dev->otherend_id);
+@@ -551,7 +551,7 @@ int xenbus_unmap_ring_vfree(struct xenbu
+ BUG();
- extern unsigned long end_pfn;
--extern unsigned long end_pfn_map;
+ if (op.status == GNTST_okay)
+- free_vm_area(area);
++ xen_free_vm_area(area);
+ else
+ xenbus_dev_error(dev, op.status,
+ "unmapping page at handle %d error %d",
+--- sle11-2009-05-14.orig/drivers/xen/xenbus/xenbus_probe.c 2009-02-16 16:18:36.000000000 +0100
++++ sle11-2009-05-14/drivers/xen/xenbus/xenbus_probe.c 2009-03-16 16:38:05.000000000 +0100
+@@ -173,7 +173,7 @@ static int read_backend_details(struct x
+ return read_otherend_details(xendev, "backend-id", "backend");
+ }
- static inline unsigned long __phys_addr(unsigned long x)
+-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16) && (defined(CONFIG_XEN) || defined(MODULE))
++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16)
+ static int xenbus_uevent_frontend(struct device *dev, struct kobj_uevent_env *env)
{
-@@ -89,6 +88,9 @@ typedef union { pteval_t pte; unsigned i
+ struct xenbus_device *xdev;
+@@ -185,8 +185,10 @@ static int xenbus_uevent_frontend(struct
+ return -ENODEV;
- #define vmemmap ((struct page *)VMEMMAP_START)
+ /* stuff we want to pass to /sbin/hotplug */
++#if defined(CONFIG_XEN) || defined(MODULE)
+ add_uevent_var(env, "XENBUS_TYPE=%s", xdev->devicetype);
+ add_uevent_var(env, "XENBUS_PATH=%s", xdev->nodename);
++#endif
+ add_uevent_var(env, "MODALIAS=xen:%s", xdev->devicetype);
-+extern unsigned long init_memory_mapping(unsigned long start,
-+ unsigned long end);
-+
- #endif /* !__ASSEMBLY__ */
+ return 0;
+@@ -207,10 +209,8 @@ static struct xen_bus_type xenbus_fronte
+ .probe = xenbus_dev_probe,
+ .remove = xenbus_dev_remove,
+ .shutdown = xenbus_dev_shutdown,
+-#if defined(CONFIG_XEN) || defined(MODULE)
+ .uevent = xenbus_uevent_frontend,
+ #endif
+-#endif
+ },
+ #if defined(CONFIG_XEN) || defined(MODULE)
+ .dev = {
+@@ -519,6 +519,15 @@ static ssize_t xendev_show_devtype(struc
+ }
+ DEVICE_ATTR(devtype, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_devtype, NULL);
- #ifdef CONFIG_FLATMEM
---- a/include/asm-x86/mach-xen/asm/page.h
-+++ b/include/asm-x86/mach-xen/asm/page.h
-@@ -20,8 +20,16 @@
- #define _PAGE_BIT_IO 9
- #define _PAGE_IO (_AC(1, L)<<_PAGE_BIT_IO)
++static ssize_t xendev_show_modalias(struct device *dev,
++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,13)
++ struct device_attribute *attr,
++#endif
++ char *buf)
++{
++ return sprintf(buf, "xen:%s\n", to_xenbus_device(dev)->devicetype);
++}
++DEVICE_ATTR(modalias, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_modalias, NULL);
--#define PHYSICAL_PAGE_MASK (~(_AT(phys_addr_t, PAGE_SIZE) - 1) & __PHYSICAL_MASK)
--#define PTE_MASK _AT(pteval_t, PHYSICAL_PAGE_MASK)
-+#define __PHYSICAL_MASK ((phys_addr_t)(1ULL << __PHYSICAL_MASK_SHIFT) - 1)
-+#define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1)
-+
-+/* Cast PAGE_MASK to a signed type so that it is sign-extended if
-+ virtual addresses are 32-bits but physical addresses are larger
-+ (ie, 32-bit PAE). */
-+#define PHYSICAL_PAGE_MASK (((signed long)PAGE_MASK) & __PHYSICAL_MASK)
+ int xenbus_probe_node(struct xen_bus_type *bus,
+ const char *type,
+@@ -579,10 +588,16 @@ int xenbus_probe_node(struct xen_bus_typ
+
+ err = device_create_file(&xendev->dev, &dev_attr_devtype);
+ if (err)
+- goto fail_remove_file;
++ goto fail_remove_nodename;
+
-+/* PTE_MASK extracts the PFN from a (pte|pmd|pud|pgd)val_t */
-+#define PTE_MASK ((pteval_t)PHYSICAL_PAGE_MASK)
++ err = device_create_file(&xendev->dev, &dev_attr_modalias);
++ if (err)
++ goto fail_remove_devtype;
- #define PMD_PAGE_SIZE (_AC(1, UL) << PMD_SHIFT)
- #define PMD_PAGE_MASK (~(PMD_PAGE_SIZE-1))
-@@ -34,19 +42,14 @@
- /* to align the pointer to the (next) page boundary */
- #define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK)
+ return 0;
+-fail_remove_file:
++fail_remove_devtype:
++ device_remove_file(&xendev->dev, &dev_attr_devtype);
++fail_remove_nodename:
+ device_remove_file(&xendev->dev, &dev_attr_nodename);
+ fail_unregister:
+ device_unregister(&xendev->dev);
+--- sle11-2009-05-14.orig/fs/aio.c 2009-03-24 10:11:37.000000000 +0100
++++ sle11-2009-05-14/fs/aio.c 2009-03-24 10:13:25.000000000 +0100
+@@ -1271,6 +1271,7 @@ static void io_destroy(struct kioctx *io
+ #ifdef CONFIG_EPOLL
+ /* forget the poll file, but it's up to the user to close it */
+ if (ioctx->file) {
++ fput(ioctx->file);
+ ioctx->file->private_data = 0;
+ ioctx->file = 0;
+ }
+@@ -1295,6 +1296,7 @@ static int aio_queue_fd_close(struct ino
+ spin_lock_irq(&ioctx->ctx_lock);
+ ioctx->file = 0;
+ spin_unlock_irq(&ioctx->ctx_lock);
++ fput(file);
+ }
+ return 0;
+ }
+@@ -1330,16 +1332,17 @@ static const struct file_operations aioq
--#define __PHYSICAL_MASK _AT(phys_addr_t, (_AC(1,ULL) << __PHYSICAL_MASK_SHIFT) - 1)
--#define __VIRTUAL_MASK ((_AC(1,UL) << __VIRTUAL_MASK_SHIFT) - 1)
--
- #ifndef __ASSEMBLY__
- #include <linux/types.h>
- #endif
+ static int make_aio_fd(struct kioctx *ioctx)
+ {
+- int error, fd;
+- struct inode *inode;
++ int fd;
+ struct file *file;
- #ifdef CONFIG_X86_64
- #include <asm/page_64.h>
--#define max_pfn_mapped end_pfn_map
- #else
- #include <asm/page_32.h>
--#define max_pfn_mapped max_low_pfn
- #endif /* CONFIG_X86_64 */
+- error = anon_inode_getfd(&fd, &inode, &file, "[aioq]",
+- &aioq_fops, ioctx);
+- if (error)
+- return error;
++ fd = anon_inode_getfd("[aioq]", &aioq_fops, ioctx);
++ if (fd < 0)
++ return fd;
- #define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET)
-@@ -59,6 +62,9 @@
- #ifndef __ASSEMBLY__
+ /* associate the file with the IO context */
++ file = fget(fd);
++ if (!file)
++ return -EBADF;
+ file->private_data = ioctx;
+ ioctx->file = file;
+ init_waitqueue_head(&ioctx->poll_wait);
+--- sle11-2009-05-14.orig/include/asm-x86/dma-mapping.h 2009-05-14 10:56:29.000000000 +0200
++++ sle11-2009-05-14/include/asm-x86/dma-mapping.h 2009-03-16 16:38:05.000000000 +0100
+@@ -223,8 +223,13 @@ static inline dma_addr_t dma_map_page(st
+ struct dma_mapping_ops *ops = get_dma_ops(dev);
- extern int page_is_ram(unsigned long pagenr);
-+extern int devmem_is_allowed(unsigned long pagenr);
-+
-+extern unsigned long max_pfn_mapped;
+ BUG_ON(!valid_dma_direction(direction));
++#ifndef CONFIG_XEN
+ return ops->map_single(dev, page_to_phys(page) + offset,
+ size, direction);
++#else
++ return ops->map_single(dev, page_to_pseudophys(page) + offset,
++ size, direction);
++#endif
+ }
- struct page;
+ static inline void dma_unmap_page(struct device *dev, dma_addr_t addr,
+--- sle11-2009-05-14.orig/include/asm-x86/genapic_64.h 2009-05-14 10:56:29.000000000 +0200
++++ sle11-2009-05-14/include/asm-x86/genapic_64.h 2009-03-16 16:38:05.000000000 +0100
+@@ -46,6 +46,7 @@ extern struct genapic apic_x2apic_phys;
+ extern int acpi_madt_oem_check(char *, char *);
+
+ extern void apic_send_IPI_self(int vector);
++#ifndef CONFIG_XEN
+ enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC};
+ extern enum uv_system_type get_uv_system_type(void);
+ extern int is_uv_system(void);
+@@ -55,6 +56,10 @@ DECLARE_PER_CPU(int, x2apic_extra_bits);
+ extern void uv_cpu_init(void);
+ extern void uv_system_init(void);
+ extern int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip);
++#else
++#define is_uv_system() 0
++#define uv_cpu_init() ((void)0)
++#endif
---- a/include/asm-x86/mach-xen/asm/pci_64.h
-+++ b/include/asm-x86/mach-xen/asm/pci_64.h
-@@ -1,12 +1,10 @@
- #ifndef __x8664_PCI_H
- #define __x8664_PCI_H
+ extern void setup_apic_routing(void);
--
- #ifdef __KERNEL__
+--- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/desc.h 2009-03-16 16:33:40.000000000 +0100
++++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/desc.h 2009-03-16 16:38:05.000000000 +0100
+@@ -64,8 +64,8 @@ static inline struct desc_struct *get_cp
+ }
+ static inline void pack_gate(gate_desc *gate, unsigned char type,
+- unsigned long base, unsigned dpl, unsigned flags, unsigned short seg)
-
- #ifdef CONFIG_CALGARY_IOMMU
--static inline void* pci_iommu(struct pci_bus *bus)
-+static inline void *pci_iommu(struct pci_bus *bus)
++ unsigned long base, unsigned dpl, unsigned flags,
++ unsigned short seg)
{
- struct pci_sysdata *sd = bus->sysdata;
- return sd->iommu;
-@@ -19,13 +17,12 @@ static inline void set_pci_iommu(struct
- }
- #endif /* CONFIG_CALGARY_IOMMU */
-
-+extern int (*pci_config_read)(int seg, int bus, int dev, int fn,
-+ int reg, int len, u32 *value);
-+extern int (*pci_config_write)(int seg, int bus, int dev, int fn,
-+ int reg, int len, u32 value);
+ gate->a = (seg << 16) | (base & 0xffff);
+ gate->b = (base & 0xffff0000) |
+@@ -84,22 +84,23 @@ static inline int desc_empty(const void
+ #define load_TR_desc() native_load_tr_desc()
+ #define load_gdt(dtr) native_load_gdt(dtr)
+ #define load_idt(dtr) native_load_idt(dtr)
+-#define load_tr(tr) __asm__ __volatile("ltr %0"::"m" (tr))
+-#define load_ldt(ldt) __asm__ __volatile("lldt %0"::"m" (ldt))
++#define load_tr(tr) asm volatile("ltr %0"::"m" (tr))
++#define load_ldt(ldt) asm volatile("lldt %0"::"m" (ldt))
--extern int (*pci_config_read)(int seg, int bus, int dev, int fn, int reg, int len, u32 *value);
--extern int (*pci_config_write)(int seg, int bus, int dev, int fn, int reg, int len, u32 value);
--
--
--
--extern void pci_iommu_alloc(void);
-+extern void dma32_reserve_bootmem(void);
+ #define store_gdt(dtr) native_store_gdt(dtr)
+ #define store_idt(dtr) native_store_idt(dtr)
+ #define store_tr(tr) (tr = native_store_tr())
+-#define store_ldt(ldt) __asm__ ("sldt %0":"=m" (ldt))
++#define store_ldt(ldt) asm("sldt %0":"=m" (ldt))
- /* The PCI address space does equal the physical memory
- * address space. The networking and block device layers use
-@@ -82,5 +79,4 @@ extern void pci_iommu_alloc(void);
+ #define load_TLS(t, cpu) native_load_tls(t, cpu)
+ #define set_ldt native_set_ldt
- #endif /* __KERNEL__ */
+-#define write_ldt_entry(dt, entry, desc) \
+- native_write_ldt_entry(dt, entry, desc)
+-#define write_gdt_entry(dt, entry, desc, type) \
+- native_write_gdt_entry(dt, entry, desc, type)
+-#define write_idt_entry(dt, entry, g) native_write_idt_entry(dt, entry, g)
++#define write_ldt_entry(dt, entry, desc) \
++ native_write_ldt_entry(dt, entry, desc)
++#define write_gdt_entry(dt, entry, desc, type) \
++ native_write_gdt_entry(dt, entry, desc, type)
++#define write_idt_entry(dt, entry, g) \
++ native_write_idt_entry(dt, entry, g)
--
- #endif /* __x8664_PCI_H */
---- a/include/asm-x86/mach-xen/asm/pci.h
-+++ b/include/asm-x86/mach-xen/asm/pci.h
-@@ -8,14 +8,13 @@
- #include <asm/scatterlist.h>
- #include <asm/io.h>
+ static inline void native_write_idt_entry(gate_desc *idt, int entry,
+ const gate_desc *gate)
+@@ -138,8 +139,8 @@ static inline void pack_descriptor(struc
+ {
+ desc->a = ((base & 0xffff) << 16) | (limit & 0xffff);
+ desc->b = (base & 0xff000000) | ((base & 0xff0000) >> 16) |
+- (limit & 0x000f0000) | ((type & 0xff) << 8) |
+- ((flags & 0xf) << 20);
++ (limit & 0x000f0000) | ((type & 0xff) << 8) |
++ ((flags & 0xf) << 20);
+ desc->p = 1;
+ }
+@@ -160,7 +161,6 @@ static inline void set_tssldt_descriptor
+ desc->base2 = (PTR_MIDDLE(addr) >> 8) & 0xFF;
+ desc->base3 = PTR_HIGH(addr);
+ #else
-
- #ifdef __KERNEL__
-
- struct pci_sysdata {
- int domain; /* PCI domain */
- int node; /* NUMA node */
- #ifdef CONFIG_X86_64
-- void* iommu; /* IOMMU private data */
-+ void *iommu; /* IOMMU private data */
+ pack_descriptor((struct desc_struct *)d, addr, size, 0x80 | type, 0);
#endif
- #ifdef CONFIG_XEN_PCIDEV_FRONTEND
- struct pcifront_device *pdev;
-@@ -23,6 +22,8 @@ struct pci_sysdata {
- };
+ }
+@@ -178,7 +178,8 @@ static inline void __set_tss_desc(unsign
+ * last valid byte
+ */
+ set_tssldt_descriptor(&tss, (unsigned long)addr, DESC_TSS,
+- IO_BITMAP_OFFSET + IO_BITMAP_BYTES + sizeof(unsigned long) - 1);
++ IO_BITMAP_OFFSET + IO_BITMAP_BYTES +
++ sizeof(unsigned long) - 1);
+ write_gdt_entry(d, entry, &tss, DESC_TSS);
+ }
- /* scan a bus after allocating a pci_sysdata for it */
-+extern struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops,
-+ int node);
- extern struct pci_bus *pci_scan_bus_with_sysdata(int busno);
+@@ -187,16 +188,16 @@ static inline void __set_tss_desc(unsign
+ static inline void native_set_ldt(const void *addr, unsigned int entries)
+ {
+ if (likely(entries == 0))
+- __asm__ __volatile__("lldt %w0"::"q" (0));
++ asm volatile("lldt %w0"::"q" (0));
+ else {
+ unsigned cpu = smp_processor_id();
+ ldt_desc ldt;
- static inline int pci_domain_nr(struct pci_bus *bus)
-@@ -36,6 +37,7 @@ static inline int pci_proc_domain(struct
- return pci_domain_nr(bus);
+- set_tssldt_descriptor(&ldt, (unsigned long)addr,
+- DESC_LDT, entries * sizeof(ldt) - 1);
++ set_tssldt_descriptor(&ldt, (unsigned long)addr, DESC_LDT,
++ entries * LDT_ENTRY_SIZE - 1);
+ write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT,
+ &ldt, DESC_LDT);
+- __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
++ asm volatile("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
+ }
}
-+extern void pci_iommu_alloc(void);
-
- /* Can be used to override the logic in pci_scan_bus for skipping
- already-configured bus numbers - to be used for buggy BIOSes
-@@ -57,7 +59,7 @@ extern unsigned long pci_mem_start;
- #define PCIBIOS_MIN_CARDBUS_IO 0x4000
+@@ -261,15 +262,15 @@ static inline void xen_load_tls(struct t
+ }
+ #endif
- void pcibios_config_init(void);
--struct pci_bus * pcibios_scan_root(int bus);
-+struct pci_bus *pcibios_scan_root(int bus);
+-#define _LDT_empty(info) (\
+- (info)->base_addr == 0 && \
+- (info)->limit == 0 && \
+- (info)->contents == 0 && \
+- (info)->read_exec_only == 1 && \
+- (info)->seg_32bit == 0 && \
+- (info)->limit_in_pages == 0 && \
+- (info)->seg_not_present == 1 && \
+- (info)->useable == 0)
++#define _LDT_empty(info) \
++ ((info)->base_addr == 0 && \
++ (info)->limit == 0 && \
++ (info)->contents == 0 && \
++ (info)->read_exec_only == 1 && \
++ (info)->seg_32bit == 0 && \
++ (info)->limit_in_pages == 0 && \
++ (info)->seg_not_present == 1 && \
++ (info)->useable == 0)
- void pcibios_set_master(struct pci_dev *dev);
- void pcibios_penalize_isa_irq(int irq, int active);
-@@ -67,7 +69,8 @@ int pcibios_set_irq_routing(struct pci_d
+ #ifdef CONFIG_X86_64
+ #define LDT_empty(info) (_LDT_empty(info) && ((info)->lm == 0))
+@@ -309,7 +310,7 @@ static inline unsigned long get_desc_lim
- #define HAVE_PCI_MMAP
- extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
-- enum pci_mmap_state mmap_state, int write_combine);
-+ enum pci_mmap_state mmap_state,
-+ int write_combine);
+ #ifndef CONFIG_X86_NO_IDT
+ static inline void _set_gate(int gate, unsigned type, void *addr,
+- unsigned dpl, unsigned ist, unsigned seg)
++ unsigned dpl, unsigned ist, unsigned seg)
+ {
+ gate_desc s;
+ pack_gate(&s, type, (unsigned long)addr, dpl, ist, seg);
+@@ -393,10 +394,10 @@ static inline void set_system_gate_ist(i
+ * Will read the base address of GDT_ENTRY_ESPFIX_SS and put it into %eax.
+ */
+ #define GET_DESC_BASE(idx, gdt, base, lo_w, lo_b, hi_b) \
+- movb idx*8+4(gdt), lo_b; \
+- movb idx*8+7(gdt), hi_b; \
+- shll $16, base; \
+- movw idx*8+2(gdt), lo_w;
++ movb idx * 8 + 4(gdt), lo_b; \
++ movb idx * 8 + 7(gdt), hi_b; \
++ shll $16, base; \
++ movw idx * 8 + 2(gdt), lo_w;
- #ifdef CONFIG_PCI
---- a/include/asm-x86/mach-xen/asm/pgalloc_32.h
-+++ /dev/null
-@@ -1,111 +0,0 @@
--#ifndef _I386_PGALLOC_H
--#define _I386_PGALLOC_H
--
--#include <linux/threads.h>
--#include <linux/mm.h> /* for struct page */
--#include <linux/pagemap.h>
--#include <asm/tlb.h>
--#include <asm-generic/tlb.h>
--#include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */
--
--#define paravirt_alloc_pt(mm, pfn) do { } while (0)
--#define paravirt_alloc_pd(mm, pfn) do { } while (0)
--#define paravirt_alloc_pd_clone(pfn, clonepfn, start, count) do { } while (0)
--#define paravirt_release_pt(pfn) do { } while (0)
--#define paravirt_release_pd(pfn) do { } while (0)
--
--static inline void pmd_populate_kernel(struct mm_struct *mm,
-- pmd_t *pmd, pte_t *pte)
--{
-- paravirt_alloc_pt(mm, __pa(pte) >> PAGE_SHIFT);
-- set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE));
--}
--
--static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte)
--{
-- unsigned long pfn = page_to_pfn(pte);
--
-- paravirt_alloc_pt(mm, pfn);
-- if (PagePinned(virt_to_page(mm->pgd))) {
-- if (!PageHighMem(pte))
-- BUG_ON(HYPERVISOR_update_va_mapping(
-- (unsigned long)__va(pfn << PAGE_SHIFT),
-- pfn_pte(pfn, PAGE_KERNEL_RO), 0));
-- else if (!test_and_set_bit(PG_pinned, &pte->flags))
-- kmap_flush_unused();
-- set_pmd(pmd, __pmd(((pmdval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE));
-- } else
-- *pmd = __pmd(((pmdval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE);
--}
--#define pmd_pgtable(pmd) pmd_page(pmd)
--
--/*
-- * Allocate and free page tables.
-- */
--extern void pgd_test_and_unpin(pgd_t *);
--extern pgd_t *pgd_alloc(struct mm_struct *);
--extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
--
--extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long);
--extern pgtable_t pte_alloc_one(struct mm_struct *, unsigned long);
--
--static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
--{
-- make_lowmem_page_writable(pte, XENFEAT_writable_page_tables);
-- free_page((unsigned long)pte);
--}
--
--extern void __pte_free(pgtable_t);
--static inline void pte_free(struct mm_struct *mm, pgtable_t pte)
--{
-- __pte_free(pte);
--}
--
--
--extern void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte);
+ #endif /* __ASSEMBLY__ */
+--- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/dma-mapping.h 2009-02-16 16:18:36.000000000 +0100
++++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/dma-mapping.h 2009-03-16 16:38:05.000000000 +0100
+@@ -1,5 +1,17 @@
+-#ifdef CONFIG_X86_32
+-# include "dma-mapping_32.h"
+-#else
+-# include "dma-mapping_64.h"
+-#endif
++#ifndef _ASM_DMA_MAPPING_H_
++
++#include "../../dma-mapping.h"
++
++static inline int
++address_needs_mapping(struct device *hwdev, dma_addr_t addr)
++{
++ dma_addr_t mask = 0xffffffff;
++ /* If the device has a mask, use it, otherwise default to 32 bits */
++ if (hwdev && hwdev->dma_mask)
++ mask = *hwdev->dma_mask;
++ return (addr & ~mask) != 0;
++}
++
++extern int range_straddles_page_boundary(paddr_t p, size_t size);
++
++#endif /* _ASM_DMA_MAPPING_H_ */
+--- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/dma-mapping_32.h 2009-03-16 16:33:40.000000000 +0100
++++ /dev/null 1970-01-01 00:00:00.000000000 +0000
+@@ -1,141 +0,0 @@
+-#ifndef _ASM_I386_DMA_MAPPING_H
+-#define _ASM_I386_DMA_MAPPING_H
-
--#ifdef CONFIG_X86_PAE
-/*
-- * In the PAE case we free the pmds as part of the pgd.
+- * IOMMU interface. See Documentation/DMA-mapping.txt and DMA-API.txt for
+- * documentation.
- */
--extern pmd_t *pmd_alloc_one(struct mm_struct *, unsigned long);
-
--extern void __pmd_free(pgtable_t);
--static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
+-#include <linux/mm.h>
+-#include <linux/scatterlist.h>
+-#include <asm/cache.h>
+-#include <asm/io.h>
+-#include <asm/swiotlb.h>
+-
+-static inline int
+-address_needs_mapping(struct device *hwdev, dma_addr_t addr)
-{
-- BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
-- __pmd_free(virt_to_page(pmd));
+- dma_addr_t mask = 0xffffffff;
+- /* If the device has a mask, use it, otherwise default to 32 bits */
+- if (hwdev && hwdev->dma_mask)
+- mask = *hwdev->dma_mask;
+- return (addr & ~mask) != 0;
-}
-
--extern void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd);
+-extern int range_straddles_page_boundary(paddr_t p, size_t size);
-
--static inline void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
--{
-- struct page *page = virt_to_page(pmd);
-- unsigned long pfn = page_to_pfn(page);
+-#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
+-#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
-
-- paravirt_alloc_pd(mm, pfn);
+-void *dma_alloc_coherent(struct device *dev, size_t size,
+- dma_addr_t *dma_handle, gfp_t flag);
-
-- /* Note: almost everything apart from _PAGE_PRESENT is
-- reserved at the pmd (PDPT) level. */
-- if (PagePinned(virt_to_page(mm->pgd))) {
-- BUG_ON(PageHighMem(page));
-- BUG_ON(HYPERVISOR_update_va_mapping(
-- (unsigned long)__va(pfn << PAGE_SHIFT),
-- pfn_pte(pfn, PAGE_KERNEL_RO), 0));
-- set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
-- } else
-- *pudp = __pud(__pa(pmd) | _PAGE_PRESENT);
+-void dma_free_coherent(struct device *dev, size_t size,
+- void *vaddr, dma_addr_t dma_handle);
-
-- /*
-- * According to Intel App note "TLBs, Paging-Structure Caches,
-- * and Their Invalidation", April 2007, document 317080-001,
-- * section 8.1: in PAE mode we explicitly have to flush the
-- * TLB via cr3 if the top-level pgd is changed...
-- */
-- if (mm == current->active_mm)
-- xen_tlb_flush();
--}
--#endif /* CONFIG_X86_PAE */
+-extern dma_addr_t
+-dma_map_single(struct device *dev, void *ptr, size_t size,
+- enum dma_data_direction direction);
-
--#endif /* _I386_PGALLOC_H */
---- a/include/asm-x86/mach-xen/asm/pgalloc_64.h
-+++ /dev/null
-@@ -1,179 +0,0 @@
--#ifndef _X86_64_PGALLOC_H
--#define _X86_64_PGALLOC_H
+-extern void
+-dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
+- enum dma_data_direction direction);
-
--#include <asm/pda.h>
--#include <linux/threads.h>
--#include <linux/mm.h>
--#include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */
+-extern int dma_map_sg(struct device *hwdev, struct scatterlist *sg,
+- int nents, enum dma_data_direction direction);
+-extern void dma_unmap_sg(struct device *hwdev, struct scatterlist *sg,
+- int nents, enum dma_data_direction direction);
-
--pmd_t *early_get_pmd(unsigned long va);
--void early_make_page_readonly(void *va, unsigned int feature);
+-#ifdef CONFIG_HIGHMEM
+-extern dma_addr_t
+-dma_map_page(struct device *dev, struct page *page, unsigned long offset,
+- size_t size, enum dma_data_direction direction);
-
--#define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD)
+-extern void
+-dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size,
+- enum dma_data_direction direction);
+-#else
+-#define dma_map_page(dev, page, offset, size, dir) \
+- dma_map_single(dev, page_address(page) + (offset), (size), (dir))
+-#define dma_unmap_page dma_unmap_single
+-#endif
-
--#define pmd_populate_kernel(mm, pmd, pte) \
-- set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte)))
+-extern void
+-dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size,
+- enum dma_data_direction direction);
-
--static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
+-extern void
+-dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size,
+- enum dma_data_direction direction);
+-
+-static inline void
+-dma_sync_single_range_for_cpu(struct device *dev, dma_addr_t dma_handle,
+- unsigned long offset, size_t size,
+- enum dma_data_direction direction)
-{
-- if (unlikely(PagePinned(virt_to_page((mm)->pgd)))) {
-- BUG_ON(HYPERVISOR_update_va_mapping(
-- (unsigned long)pmd,
-- pfn_pte(virt_to_phys(pmd)>>PAGE_SHIFT,
-- PAGE_KERNEL_RO), 0));
-- set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd)));
-- } else {
-- *(pud) = __pud(_PAGE_TABLE | __pa(pmd));
-- }
+- dma_sync_single_for_cpu(dev, dma_handle+offset, size, direction);
-}
-
--/*
-- * We need to use the batch mode here, but pgd_pupulate() won't be
-- * be called frequently.
-- */
--static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
+-static inline void
+-dma_sync_single_range_for_device(struct device *dev, dma_addr_t dma_handle,
+- unsigned long offset, size_t size,
+- enum dma_data_direction direction)
-{
-- if (unlikely(PagePinned(virt_to_page((mm)->pgd)))) {
-- BUG_ON(HYPERVISOR_update_va_mapping(
-- (unsigned long)pud,
-- pfn_pte(virt_to_phys(pud)>>PAGE_SHIFT,
-- PAGE_KERNEL_RO), 0));
-- set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud)));
-- set_pgd(__user_pgd(pgd), __pgd(_PAGE_TABLE | __pa(pud)));
-- } else {
-- *(pgd) = __pgd(_PAGE_TABLE | __pa(pud));
-- *(__user_pgd(pgd)) = *(pgd);
-- }
+- dma_sync_single_for_device(dev, dma_handle+offset, size, direction);
-}
-
--#define pmd_pgtable(pmd) pmd_page(pmd)
+-extern void
+-dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems,
+- enum dma_data_direction direction);
-
--static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte)
--{
-- if (unlikely(PagePinned(virt_to_page((mm)->pgd)))) {
-- BUG_ON(HYPERVISOR_update_va_mapping(
-- (unsigned long)__va(page_to_pfn(pte) << PAGE_SHIFT),
-- pfn_pte(page_to_pfn(pte), PAGE_KERNEL_RO), 0));
-- set_pmd(pmd, __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT)));
-- } else {
-- *(pmd) = __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT));
-- }
--}
+-extern void
+-dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems,
+- enum dma_data_direction direction);
-
--extern void __pmd_free(pgtable_t);
--static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
--{
-- BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
-- __pmd_free(virt_to_page(pmd));
--}
+-extern int
+-dma_mapping_error(dma_addr_t dma_addr);
-
--extern pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr);
+-extern int
+-dma_supported(struct device *dev, u64 mask);
-
--static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
+-static inline int
+-dma_set_mask(struct device *dev, u64 mask)
-{
-- return (pud_t *)pmd_alloc_one(mm, addr);
--}
+- if(!dev->dma_mask || !dma_supported(dev, mask))
+- return -EIO;
-
--static inline void pud_free(struct mm_struct *mm, pud_t *pud)
--{
-- BUG_ON((unsigned long)pud & (PAGE_SIZE-1));
-- __pmd_free(virt_to_page(pud));
+- *dev->dma_mask = mask;
+-
+- return 0;
-}
-
--static inline void pgd_list_add(pgd_t *pgd)
+-static inline int
+-dma_get_cache_alignment(void)
-{
-- struct page *page = virt_to_page(pgd);
-- unsigned long flags;
--
-- spin_lock_irqsave(&pgd_lock, flags);
-- list_add(&page->lru, &pgd_list);
-- spin_unlock_irqrestore(&pgd_lock, flags);
+- /* no easy way to get cache size on all x86, so return the
+- * maximum possible, to be safe */
+- return (1 << INTERNODE_CACHE_SHIFT);
-}
-
--static inline void pgd_list_del(pgd_t *pgd)
--{
-- struct page *page = virt_to_page(pgd);
-- unsigned long flags;
+-#define dma_is_consistent(d, h) (1)
-
-- spin_lock_irqsave(&pgd_lock, flags);
-- list_del(&page->lru);
-- spin_unlock_irqrestore(&pgd_lock, flags);
+-static inline void
+-dma_cache_sync(struct device *dev, void *vaddr, size_t size,
+- enum dma_data_direction direction)
+-{
+- flush_write_buffers();
-}
-
--extern void pgd_test_and_unpin(pgd_t *);
+-#define ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY
+-extern int
+-dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
+- dma_addr_t device_addr, size_t size, int flags);
-
--static inline pgd_t *pgd_alloc(struct mm_struct *mm)
--{
-- /*
-- * We allocate two contiguous pages for kernel and user.
-- */
-- unsigned boundary;
-- pgd_t *pgd = (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_REPEAT, 1);
-- if (!pgd)
-- return NULL;
-- pgd_list_add(pgd);
-- pgd_test_and_unpin(pgd);
-- /*
-- * Copy kernel pointers in from init.
-- * Could keep a freelist or slab cache of those because the kernel
-- * part never changes.
-- */
-- boundary = pgd_index(__PAGE_OFFSET);
-- memset(pgd, 0, boundary * sizeof(pgd_t));
-- memcpy(pgd + boundary,
-- init_level4_pgt + boundary,
-- (PTRS_PER_PGD - boundary) * sizeof(pgd_t));
+-extern void
+-dma_release_declared_memory(struct device *dev);
-
-- memset(__user_pgd(pgd), 0, PAGE_SIZE); /* clean up user pgd */
-- /*
-- * Set level3_user_pgt for vsyscall area
-- */
-- __user_pgd(pgd)[pgd_index(VSYSCALL_START)] =
-- __pgd(__pa_symbol(level3_user_pgt) | _PAGE_TABLE);
-- return pgd;
--}
+-extern void *
+-dma_mark_declared_memory_occupied(struct device *dev,
+- dma_addr_t device_addr, size_t size);
-
--static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
--{
-- pgd_test_and_unpin(pgd);
-- pgd_list_del(pgd);
-- free_pages((unsigned long)pgd, 1);
--}
+-#endif
+--- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/dma-mapping_64.h 2009-02-16 16:18:36.000000000 +0100
++++ /dev/null 1970-01-01 00:00:00.000000000 +0000
+@@ -1,205 +0,0 @@
+-#ifndef _X8664_DMA_MAPPING_H
+-#define _X8664_DMA_MAPPING_H 1
-
--static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
--{
-- pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
-- if (pte)
-- make_page_readonly(pte, XENFEAT_writable_page_tables);
+-/*
+- * IOMMU interface. See Documentation/DMA-mapping.txt and DMA-API.txt for
+- * documentation.
+- */
-
-- return pte;
--}
+-#include <linux/scatterlist.h>
+-#include <asm/io.h>
-
--extern pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long addr);
+-struct dma_mapping_ops {
+- int (*mapping_error)(dma_addr_t dma_addr);
+- void* (*alloc_coherent)(struct device *dev, size_t size,
+- dma_addr_t *dma_handle, gfp_t gfp);
+- void (*free_coherent)(struct device *dev, size_t size,
+- void *vaddr, dma_addr_t dma_handle);
+- dma_addr_t (*map_single)(struct device *hwdev, void *ptr,
+- size_t size, int direction);
+- /* like map_single, but doesn't check the device mask */
+- dma_addr_t (*map_simple)(struct device *hwdev, char *ptr,
+- size_t size, int direction);
+- void (*unmap_single)(struct device *dev, dma_addr_t addr,
+- size_t size, int direction);
+- void (*sync_single_for_cpu)(struct device *hwdev,
+- dma_addr_t dma_handle, size_t size,
+- int direction);
+- void (*sync_single_for_device)(struct device *hwdev,
+- dma_addr_t dma_handle, size_t size,
+- int direction);
+- void (*sync_single_range_for_cpu)(struct device *hwdev,
+- dma_addr_t dma_handle, unsigned long offset,
+- size_t size, int direction);
+- void (*sync_single_range_for_device)(struct device *hwdev,
+- dma_addr_t dma_handle, unsigned long offset,
+- size_t size, int direction);
+- void (*sync_sg_for_cpu)(struct device *hwdev,
+- struct scatterlist *sg, int nelems,
+- int direction);
+- void (*sync_sg_for_device)(struct device *hwdev,
+- struct scatterlist *sg, int nelems,
+- int direction);
+- int (*map_sg)(struct device *hwdev, struct scatterlist *sg,
+- int nents, int direction);
+- void (*unmap_sg)(struct device *hwdev,
+- struct scatterlist *sg, int nents,
+- int direction);
+- int (*dma_supported)(struct device *hwdev, u64 mask);
+- int is_phys;
+-};
-
--/* Should really implement gc for free page table pages. This could be
-- done with a reference count in struct page. */
+-extern dma_addr_t bad_dma_address;
+-extern const struct dma_mapping_ops* dma_ops;
+-extern int iommu_merge;
-
--static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
+-#if 0
+-static inline int dma_mapping_error(dma_addr_t dma_addr)
-{
-- BUG_ON((unsigned long)pte & (PAGE_SIZE-1));
-- make_page_writable(pte, XENFEAT_writable_page_tables);
-- free_page((unsigned long)pte);
--}
+- if (dma_ops->mapping_error)
+- return dma_ops->mapping_error(dma_addr);
-
--extern void __pte_free(pgtable_t);
--static inline void pte_free(struct mm_struct *mm, pgtable_t pte)
--{
-- __pte_free(pte);
+- return (dma_addr == bad_dma_address);
-}
-
--#define __pte_free_tlb(tlb,pte) \
--do { \
-- pgtable_page_dtor((pte)); \
-- tlb_remove_page((tlb), (pte)); \
--} while (0)
+-#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
+-#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
-
--#define __pmd_free_tlb(tlb,x) tlb_remove_page((tlb),virt_to_page(x))
--#define __pud_free_tlb(tlb,x) tlb_remove_page((tlb),virt_to_page(x))
+-#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
+-#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
-
--#endif /* _X86_64_PGALLOC_H */
---- a/include/asm-x86/mach-xen/asm/pgalloc.h
-+++ b/include/asm-x86/mach-xen/asm/pgalloc.h
-@@ -1,5 +1,149 @@
--#ifdef CONFIG_X86_32
--# include "pgalloc_32.h"
--#else
--# include "pgalloc_64.h"
-+#ifndef _ASM_X86_PGALLOC_H
-+#define _ASM_X86_PGALLOC_H
-+
-+#include <linux/threads.h>
-+#include <linux/mm.h> /* for struct page */
-+#include <linux/pagemap.h>
-+
-+#include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */
-+
-+static inline void paravirt_alloc_pte(struct mm_struct *mm, unsigned long pfn) {}
-+static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn) {}
-+static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long clonepfn,
-+ unsigned long start, unsigned long count) {}
-+static inline void paravirt_alloc_pud(struct mm_struct *mm, unsigned long pfn) {}
-+static inline void paravirt_release_pte(unsigned long pfn) {}
-+static inline void paravirt_release_pmd(unsigned long pfn) {}
-+static inline void paravirt_release_pud(unsigned long pfn) {}
-+
-+#ifdef CONFIG_X86_64
-+void early_make_page_readonly(void *va, unsigned int feature);
-+pmd_t *early_get_pmd(unsigned long va);
-+#define make_lowmem_page_readonly make_page_readonly
-+#define make_lowmem_page_writable make_page_writable
- #endif
-+
-+/*
-+ * Allocate and free page tables.
-+ */
-+extern pgd_t *pgd_alloc(struct mm_struct *);
-+extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
-+
-+extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long);
-+extern pgtable_t pte_alloc_one(struct mm_struct *, unsigned long);
-+
-+/* Should really implement gc for free page table pages. This could be
-+ done with a reference count in struct page. */
-+
-+static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
-+{
-+ BUG_ON((unsigned long)pte & (PAGE_SIZE-1));
-+ make_lowmem_page_writable(pte, XENFEAT_writable_page_tables);
-+ free_page((unsigned long)pte);
-+}
-+
-+extern void __pte_free(pgtable_t);
-+static inline void pte_free(struct mm_struct *mm, struct page *pte)
-+{
-+ __pte_free(pte);
-+}
-+
-+extern void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte);
-+
-+static inline void pmd_populate_kernel(struct mm_struct *mm,
-+ pmd_t *pmd, pte_t *pte)
-+{
-+ paravirt_alloc_pte(mm, __pa(pte) >> PAGE_SHIFT);
-+ set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE));
-+}
-+
-+static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
-+ struct page *pte)
-+{
-+ unsigned long pfn = page_to_pfn(pte);
-+
-+ paravirt_alloc_pte(mm, pfn);
-+ if (PagePinned(virt_to_page(mm->pgd))) {
-+ if (!PageHighMem(pte))
-+ BUG_ON(HYPERVISOR_update_va_mapping(
-+ (unsigned long)__va(pfn << PAGE_SHIFT),
-+ pfn_pte(pfn, PAGE_KERNEL_RO), 0));
-+#ifndef CONFIG_X86_64
-+ else if (!TestSetPagePinned(pte))
-+ kmap_flush_unused();
-+#endif
-+ set_pmd(pmd, __pmd(((pmdval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE));
-+ } else
-+ *pmd = __pmd(((pmdval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE);
-+}
-+
-+#define pmd_pgtable(pmd) pmd_page(pmd)
-+
-+#if PAGETABLE_LEVELS > 2
-+extern pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr);
-+extern void __pmd_free(pgtable_t);
-+
-+static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
-+{
-+ BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
-+ __pmd_free(virt_to_page(pmd));
-+}
-+
-+extern void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd);
-+
-+#ifdef CONFIG_X86_PAE
-+extern void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd);
-+#else /* !CONFIG_X86_PAE */
-+static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
-+{
-+ paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
-+ if (unlikely(PagePinned(virt_to_page((mm)->pgd)))) {
-+ BUG_ON(HYPERVISOR_update_va_mapping(
-+ (unsigned long)pmd,
-+ pfn_pte(virt_to_phys(pmd)>>PAGE_SHIFT,
-+ PAGE_KERNEL_RO), 0));
-+ set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd)));
-+ } else
-+ *pud = __pud(_PAGE_TABLE | __pa(pmd));
-+}
-+#endif /* CONFIG_X86_PAE */
-+
-+#if PAGETABLE_LEVELS > 3
-+#define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD)
-+
-+/*
-+ * We need to use the batch mode here, but pgd_pupulate() won't be
-+ * be called frequently.
-+ */
-+static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
-+{
-+ paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT);
-+ if (unlikely(PagePinned(virt_to_page((mm)->pgd)))) {
-+ BUG_ON(HYPERVISOR_update_va_mapping(
-+ (unsigned long)pud,
-+ pfn_pte(virt_to_phys(pud)>>PAGE_SHIFT,
-+ PAGE_KERNEL_RO), 0));
-+ set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud)));
-+ set_pgd(__user_pgd(pgd), __pgd(_PAGE_TABLE | __pa(pud)));
-+ } else {
-+ *(pgd) = __pgd(_PAGE_TABLE | __pa(pud));
-+ *__user_pgd(pgd) = *(pgd);
-+ }
-+}
-+
-+static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
-+{
-+ return (pud_t *)pmd_alloc_one(mm, addr);
-+}
-+
-+static inline void pud_free(struct mm_struct *mm, pud_t *pud)
-+{
-+ BUG_ON((unsigned long)pud & (PAGE_SIZE-1));
-+ __pmd_free(virt_to_page(pud));
-+}
-+
-+extern void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud);
-+#endif /* PAGETABLE_LEVELS > 3 */
-+#endif /* PAGETABLE_LEVELS > 2 */
-+
-+#endif /* _ASM_X86_PGALLOC_H */
---- a/include/asm-x86/mach-xen/asm/pgtable_32.h
-+++ b/include/asm-x86/mach-xen/asm/pgtable_32.h
-@@ -38,16 +38,13 @@ void paging_init(void);
- #ifdef CONFIG_X86_PAE
- # include <asm/pgtable-3level-defs.h>
- # define PMD_SIZE (1UL << PMD_SHIFT)
--# define PMD_MASK (~(PMD_SIZE-1))
-+# define PMD_MASK (~(PMD_SIZE - 1))
- #else
- # include <asm/pgtable-2level-defs.h>
- #endif
-
- #define PGDIR_SIZE (1UL << PGDIR_SHIFT)
--#define PGDIR_MASK (~(PGDIR_SIZE-1))
+-extern void *dma_alloc_coherent(struct device *dev, size_t size,
+- dma_addr_t *dma_handle, gfp_t gfp);
+-extern void dma_free_coherent(struct device *dev, size_t size, void *vaddr,
+- dma_addr_t dma_handle);
+-
+-static inline dma_addr_t
+-dma_map_single(struct device *hwdev, void *ptr, size_t size,
+- int direction)
+-{
+- BUG_ON(!valid_dma_direction(direction));
+- return dma_ops->map_single(hwdev, ptr, size, direction);
+-}
-
--#define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT)
--#define KERNEL_PGD_PTRS (PTRS_PER_PGD-USER_PGD_PTRS)
-+#define PGDIR_MASK (~(PGDIR_SIZE - 1))
-
- /* Just any arbitrary offset to the start of the vmalloc VM area: the
- * current 8MB value just means that there will be a 8MB "hole" after the
-@@ -56,21 +53,22 @@ void paging_init(void);
- * The vmalloc() routines leaves a hole of 4kB between each vmalloced
- * area for the same reason. ;)
- */
--#define VMALLOC_OFFSET (8*1024*1024)
--#define VMALLOC_START (((unsigned long) high_memory + \
-- 2*VMALLOC_OFFSET-1) & ~(VMALLOC_OFFSET-1))
-+#define VMALLOC_OFFSET (8 * 1024 * 1024)
-+#define VMALLOC_START (((unsigned long)high_memory + 2 * VMALLOC_OFFSET - 1) \
-+ & ~(VMALLOC_OFFSET - 1))
- #ifdef CONFIG_X86_PAE
- #define LAST_PKMAP 512
- #else
- #define LAST_PKMAP 1024
- #endif
-
--#define PKMAP_BASE ((FIXADDR_BOOT_START - PAGE_SIZE*(LAST_PKMAP + 1)) & PMD_MASK)
-+#define PKMAP_BASE ((FIXADDR_BOOT_START - PAGE_SIZE * (LAST_PKMAP + 1)) \
-+ & PMD_MASK)
-
- #ifdef CONFIG_HIGHMEM
--# define VMALLOC_END (PKMAP_BASE-2*PAGE_SIZE)
-+# define VMALLOC_END (PKMAP_BASE - 2 * PAGE_SIZE)
- #else
--# define VMALLOC_END (FIXADDR_START-2*PAGE_SIZE)
-+# define VMALLOC_END (FIXADDR_START - 2 * PAGE_SIZE)
- #endif
-
- /*
-@@ -91,10 +89,10 @@ extern unsigned long pg0[];
- /* pmd_present doesn't just test the _PAGE_PRESENT bit since wr.p.t.
- can temporarily clear it. */
- #define pmd_present(x) (__pmd_val(x))
--#define pmd_bad(x) ((__pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT))
-+#define pmd_bad(x) ((__pmd_val(x) & (~PTE_MASK & ~_PAGE_USER & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT))
- #else
- #define pmd_present(x) (__pmd_val(x) & _PAGE_PRESENT)
--#define pmd_bad(x) ((__pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
-+#define pmd_bad(x) ((__pmd_val(x) & (~PTE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
- #endif
-
-
-@@ -107,32 +105,18 @@ extern unsigned long pg0[];
- #endif
-
- /*
-- * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
-- *
-- * dst - pointer to pgd range anwhere on a pgd page
-- * src - ""
-- * count - the number of pgds to copy.
-- *
-- * dst and src can be on the same page, but the range must not overlap,
-- * and must not cross a page boundary.
-+ * Macro to mark a page protection value as "uncacheable".
-+ * On processors which do not support it, this is a no-op.
- */
--static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
+-static inline void
+-dma_unmap_single(struct device *dev, dma_addr_t addr,size_t size,
+- int direction)
-{
-- memcpy(dst, src, count * sizeof(pgd_t));
+- BUG_ON(!valid_dma_direction(direction));
+- dma_ops->unmap_single(dev, addr, size, direction);
-}
-
--/*
-- * Macro to mark a page protection value as "uncacheable". On processors which do not support
-- * it, this is a no-op.
-- */
--#define pgprot_noncached(prot) ((boot_cpu_data.x86 > 3) \
-- ? (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT)) : (prot))
-+#define pgprot_noncached(prot) \
-+ ((boot_cpu_data.x86 > 3) \
-+ ? (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT)) \
-+ : (prot))
-
- /*
- * Conversion functions: convert a page and protection to a page entry,
- * and a page entry and page directory to the page they refer to.
- */
+-#define dma_map_page(dev,page,offset,size,dir) \
+- dma_map_single((dev), page_address(page)+(offset), (size), (dir))
-
- #define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
-
- /*
-@@ -141,20 +125,20 @@ static inline void clone_pgd_range(pgd_t
- * this macro returns the index of the entry in the pgd page which would
- * control the given virtual address
- */
--#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1))
--#define pgd_index_k(addr) pgd_index(addr)
-+#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
-+#define pgd_index_k(addr) pgd_index((addr))
-
- /*
- * pgd_offset() returns a (pgd_t *)
- * pgd_index() is used get the offset into the pgd page's array of pgd_t's;
- */
--#define pgd_offset(mm, address) ((mm)->pgd+pgd_index(address))
-+#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address)))
-
- /*
- * a shortcut which implies the use of the kernel's pgd, instead
- * of a process's
- */
--#define pgd_offset_k(address) pgd_offset(&init_mm, address)
-+#define pgd_offset_k(address) pgd_offset(&init_mm, (address))
-
- static inline int pud_large(pud_t pud) { return 0; }
-
-@@ -164,8 +148,8 @@ static inline int pud_large(pud_t pud) {
- * this macro returns the index of the entry in the pmd page which would
- * control the given virtual address
- */
--#define pmd_index(address) \
-- (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1))
-+#define pmd_index(address) \
-+ (((address) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))
-
- /*
- * the pte page can be thought of an array like this: pte_t[PTRS_PER_PTE]
-@@ -173,33 +157,36 @@ static inline int pud_large(pud_t pud) {
- * this macro returns the index of the entry in the pte page which would
- * control the given virtual address
- */
--#define pte_index(address) \
-- (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
--#define pte_offset_kernel(dir, address) \
-- ((pte_t *) pmd_page_vaddr(*(dir)) + pte_index(address))
-+#define pte_index(address) \
-+ (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
-+#define pte_offset_kernel(dir, address) \
-+ ((pte_t *)pmd_page_vaddr(*(dir)) + pte_index((address)))
-
--#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
-+#define pmd_page(pmd) (pfn_to_page(pmd_val((pmd)) >> PAGE_SHIFT))
-
--#define pmd_page_vaddr(pmd) \
-- ((unsigned long) __va(pmd_val(pmd) & PAGE_MASK))
-+#define pmd_page_vaddr(pmd) \
-+ ((unsigned long)__va(pmd_val((pmd)) & PTE_MASK))
-
- #if defined(CONFIG_HIGHPTE)
--#define pte_offset_map(dir, address) \
-- ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE0) + pte_index(address))
--#define pte_offset_map_nested(dir, address) \
-- ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE1) + pte_index(address))
--#define pte_unmap(pte) kunmap_atomic(pte, KM_PTE0)
--#define pte_unmap_nested(pte) kunmap_atomic(pte, KM_PTE1)
--#else
--#define pte_offset_map(dir, address) \
-- ((pte_t *)page_address(pmd_page(*(dir))) + pte_index(address))
--#define pte_offset_map_nested(dir, address) pte_offset_map(dir, address)
-+#define pte_offset_map(dir, address) \
-+ ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE0) + \
-+ pte_index((address)))
-+#define pte_offset_map_nested(dir, address) \
-+ ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE1) + \
-+ pte_index((address)))
-+#define pte_unmap(pte) kunmap_atomic((pte), KM_PTE0)
-+#define pte_unmap_nested(pte) kunmap_atomic((pte), KM_PTE1)
-+#else
-+#define pte_offset_map(dir, address) \
-+ ((pte_t *)page_address(pmd_page(*(dir))) + pte_index((address)))
-+#define pte_offset_map_nested(dir, address) pte_offset_map((dir), (address))
- #define pte_unmap(pte) do { } while (0)
- #define pte_unmap_nested(pte) do { } while (0)
- #endif
-
- /* Clear a kernel PTE and flush it from the TLB */
--#define kpte_clear_flush(ptep, vaddr) do { \
-+#define kpte_clear_flush(ptep, vaddr) \
-+do { \
- if (HYPERVISOR_update_va_mapping(vaddr, __pte(0), UVMF_INVLPG)) \
- BUG(); \
- } while (0)
-@@ -208,7 +195,7 @@ static inline int pud_large(pud_t pud) {
- * The i386 doesn't have any external MMU info: the kernel page
- * tables contain all the necessary information.
- */
--#define update_mmu_cache(vma,address,pte) do { } while (0)
-+#define update_mmu_cache(vma, address, pte) do { } while (0)
-
- void make_lowmem_page_readonly(void *va, unsigned int feature);
- void make_lowmem_page_writable(void *va, unsigned int feature);
-@@ -225,7 +212,7 @@ void make_lowmem_page_writable(void *va,
- #define kern_addr_valid(kaddr) (0)
- #endif
-
--#define io_remap_pfn_range(vma,from,pfn,size,prot) \
--direct_remap_pfn_range(vma,from,pfn,size,prot,DOMID_IO)
-+#define io_remap_pfn_range(vma, from, pfn, size, prot) \
-+ direct_remap_pfn_range(vma, from, pfn, size, prot, DOMID_IO)
-
- #endif /* _I386_PGTABLE_H */
---- a/include/asm-x86/mach-xen/asm/pgtable-3level.h
-+++ b/include/asm-x86/mach-xen/asm/pgtable-3level.h
-@@ -8,25 +8,28 @@
- * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
- */
-
--#define pte_ERROR(e) \
-- printk("%s:%d: bad pte %p(%016Lx pfn %08lx).\n", __FILE__, __LINE__, \
-- &(e), __pte_val(e), pte_pfn(e))
--#define pmd_ERROR(e) \
-- printk("%s:%d: bad pmd %p(%016Lx pfn %08Lx).\n", __FILE__, __LINE__, \
-- &(e), __pmd_val(e), (pmd_val(e) & PTE_MASK) >> PAGE_SHIFT)
--#define pgd_ERROR(e) \
-- printk("%s:%d: bad pgd %p(%016Lx pfn %08Lx).\n", __FILE__, __LINE__, \
-- &(e), __pgd_val(e), (pgd_val(e) & PTE_MASK) >> PAGE_SHIFT)
+-#define dma_unmap_page dma_unmap_single
+-
+-static inline void
+-dma_sync_single_for_cpu(struct device *hwdev, dma_addr_t dma_handle,
+- size_t size, int direction)
+-{
+- BUG_ON(!valid_dma_direction(direction));
+- if (dma_ops->sync_single_for_cpu)
+- dma_ops->sync_single_for_cpu(hwdev, dma_handle, size,
+- direction);
+- flush_write_buffers();
+-}
+-
+-static inline void
+-dma_sync_single_for_device(struct device *hwdev, dma_addr_t dma_handle,
+- size_t size, int direction)
+-{
+- BUG_ON(!valid_dma_direction(direction));
+- if (dma_ops->sync_single_for_device)
+- dma_ops->sync_single_for_device(hwdev, dma_handle, size,
+- direction);
+- flush_write_buffers();
+-}
+-
+-static inline void
+-dma_sync_single_range_for_cpu(struct device *hwdev, dma_addr_t dma_handle,
+- unsigned long offset, size_t size, int direction)
+-{
+- BUG_ON(!valid_dma_direction(direction));
+- if (dma_ops->sync_single_range_for_cpu) {
+- dma_ops->sync_single_range_for_cpu(hwdev, dma_handle, offset, size, direction);
+- }
+-
+- flush_write_buffers();
+-}
+-
+-static inline void
+-dma_sync_single_range_for_device(struct device *hwdev, dma_addr_t dma_handle,
+- unsigned long offset, size_t size, int direction)
+-{
+- BUG_ON(!valid_dma_direction(direction));
+- if (dma_ops->sync_single_range_for_device)
+- dma_ops->sync_single_range_for_device(hwdev, dma_handle,
+- offset, size, direction);
-
-+#define pte_ERROR(e) \
-+ printk("%s:%d: bad pte %p(%016Lx pfn %08lx).\n", \
-+ __FILE__, __LINE__, &(e), __pte_val(e), pte_pfn(e))
-+#define pmd_ERROR(e) \
-+ printk("%s:%d: bad pmd %p(%016Lx pfn %08Lx).\n", \
-+ __FILE__, __LINE__, &(e), __pmd_val(e), \
-+ (pmd_val(e) & PTE_MASK) >> PAGE_SHIFT)
-+#define pgd_ERROR(e) \
-+ printk("%s:%d: bad pgd %p(%016Lx pfn %08Lx).\n", \
-+ __FILE__, __LINE__, &(e), __pgd_val(e), \
-+ (pgd_val(e) & PTE_MASK) >> PAGE_SHIFT)
-
- static inline int pud_none(pud_t pud)
- {
- return __pud_val(pud) == 0;
-+
- }
- static inline int pud_bad(pud_t pud)
- {
- return (__pud_val(pud) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER)) != 0;
- }
+- flush_write_buffers();
+-}
+-
+-static inline void
+-dma_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg,
+- int nelems, int direction)
+-{
+- BUG_ON(!valid_dma_direction(direction));
+- if (dma_ops->sync_sg_for_cpu)
+- dma_ops->sync_sg_for_cpu(hwdev, sg, nelems, direction);
+- flush_write_buffers();
+-}
+-
+-static inline void
+-dma_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
+- int nelems, int direction)
+-{
+- BUG_ON(!valid_dma_direction(direction));
+- if (dma_ops->sync_sg_for_device) {
+- dma_ops->sync_sg_for_device(hwdev, sg, nelems, direction);
+- }
+-
+- flush_write_buffers();
+-}
+-
+-static inline int
+-dma_map_sg(struct device *hwdev, struct scatterlist *sg, int nents, int direction)
+-{
+- BUG_ON(!valid_dma_direction(direction));
+- return dma_ops->map_sg(hwdev, sg, nents, direction);
+-}
+-
+-static inline void
+-dma_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents,
+- int direction)
+-{
+- BUG_ON(!valid_dma_direction(direction));
+- dma_ops->unmap_sg(hwdev, sg, nents, direction);
+-}
+-
+-extern int dma_supported(struct device *hwdev, u64 mask);
+-
+-/* same for gart, swiotlb, and nommu */
+-static inline int dma_get_cache_alignment(void)
+-{
+- return boot_cpu_data.x86_clflush_size;
+-}
+-
+-#define dma_is_consistent(d, h) 1
+-
+-extern int dma_set_mask(struct device *dev, u64 mask);
+-
+-static inline void
+-dma_cache_sync(struct device *dev, void *vaddr, size_t size,
+- enum dma_data_direction dir)
+-{
+- flush_write_buffers();
+-}
+-
+-extern struct device fallback_dev;
+-extern int panic_on_overflow;
+-#endif
+-
+-#endif /* _X8664_DMA_MAPPING_H */
+-
+-#include "dma-mapping_32.h"
+--- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/fixmap.h 2009-02-16 16:18:36.000000000 +0100
++++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/fixmap.h 2009-03-16 16:38:05.000000000 +0100
+@@ -1,5 +1,13 @@
++#ifndef _ASM_FIXMAP_H
++#define _ASM_FIXMAP_H
+
- static inline int pud_present(pud_t pud)
- {
- return __pud_val(pud) & _PAGE_PRESENT;
-@@ -48,12 +51,14 @@ static inline void xen_set_pte(pte_t *pt
-
- static inline void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
- {
-- set_64bit((unsigned long long *)(ptep),__pte_val(pte));
-+ set_64bit((unsigned long long *)(ptep), __pte_val(pte));
- }
+ #ifdef CONFIG_X86_32
+ # include "fixmap_32.h"
+ #else
+ # include "fixmap_64.h"
+ #endif
+
- static inline void xen_set_pmd(pmd_t *pmdp, pmd_t pmd)
- {
- xen_l2_entry_update(pmdp, pmd);
- }
++#define clear_fixmap(idx) \
++ __set_fixmap(idx, 0, __pgprot(0))
+
- static inline void xen_set_pud(pud_t *pudp, pud_t pud)
- {
- xen_l3_entry_update(pudp, pud);
-@@ -92,20 +97,19 @@ static inline void pud_clear(pud_t *pudp
- * current pgd to avoid unnecessary TLB flushes.
- */
- pgd = read_cr3();
-- if (__pa(pudp) >= pgd && __pa(pudp) < (pgd + sizeof(pgd_t)*PTRS_PER_PGD))
-+ if (__pa(pudp) >= pgd && __pa(pudp) <
-+ (pgd + sizeof(pgd_t)*PTRS_PER_PGD))
- xen_tlb_flush();
- }
-
--#define pud_page(pud) \
--((struct page *) __va(pud_val(pud) & PAGE_MASK))
-+#define pud_page(pud) ((struct page *) __va(pud_val(pud) & PTE_MASK))
-
--#define pud_page_vaddr(pud) \
--((unsigned long) __va(pud_val(pud) & PAGE_MASK))
-+#define pud_page_vaddr(pud) ((unsigned long) __va(pud_val(pud) & PTE_MASK))
-
-
- /* Find an entry in the second-level page table.. */
--#define pmd_offset(pud, address) ((pmd_t *) pud_page(*(pud)) + \
-- pmd_index(address))
-+#define pmd_offset(pud, address) ((pmd_t *)pud_page(*(pud)) + \
-+ pmd_index(address))
-
- #ifdef CONFIG_SMP
- static inline pte_t xen_ptep_get_and_clear(pte_t *ptep, pte_t res)
-@@ -150,7 +154,8 @@ static inline int pte_none(pte_t pte)
- * put the 32 bits of offset into the high part.
++#endif
+--- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/fixmap_32.h 2009-03-16 16:33:40.000000000 +0100
++++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/fixmap_32.h 2009-03-16 16:38:05.000000000 +0100
+@@ -10,8 +10,8 @@
+ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
*/
- #define pte_to_pgoff(pte) ((pte).pte_high)
--#define pgoff_to_pte(off) ((pte_t) { { .pte_low = _PAGE_FILE, .pte_high = (off) } })
-+#define pgoff_to_pte(off) \
-+ ((pte_t) { { .pte_low = _PAGE_FILE, .pte_high = (off) } })
- #define PTE_FILE_MAX_BITS 32
- /* Encode and de-code a swap entry */
---- a/include/asm-x86/mach-xen/asm/pgtable_64.h
-+++ b/include/asm-x86/mach-xen/asm/pgtable_64.h
-@@ -31,7 +31,7 @@ extern void paging_init(void);
+-#ifndef _ASM_FIXMAP_H
+-#define _ASM_FIXMAP_H
++#ifndef _ASM_FIXMAP_32_H
++#define _ASM_FIXMAP_32_H
- #endif /* !__ASSEMBLY__ */
+ /* used by vmalloc.c, vsyscall.lds.S.
+ *
+@@ -102,8 +102,7 @@ enum fixed_addresses {
+ */
+ #define NR_FIX_BTMAPS 64
+ #define FIX_BTMAPS_NESTING 4
+- FIX_BTMAP_END =
+- __end_of_permanent_fixed_addresses + 512 -
++ FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 512 -
+ (__end_of_permanent_fixed_addresses & 511),
+ FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_NESTING - 1,
+ FIX_WP_TEST,
+@@ -114,19 +113,16 @@ enum fixed_addresses {
+ };
--#define SHARED_KERNEL_PMD 1
-+#define SHARED_KERNEL_PMD 0
+ extern void __set_fixmap(enum fixed_addresses idx,
+- maddr_t phys, pgprot_t flags);
++ maddr_t phys, pgprot_t flags);
+ extern void reserve_top_address(unsigned long reserve);
+-#define set_fixmap(idx, phys) \
+- __set_fixmap(idx, phys, PAGE_KERNEL)
++#define set_fixmap(idx, phys) \
++ __set_fixmap(idx, phys, PAGE_KERNEL)
/*
- * PGDIR_SHIFT determines what a top-level page table entry can map
-@@ -59,18 +59,20 @@ extern void paging_init(void);
-
- #ifndef __ASSEMBLY__
-
--#define pte_ERROR(e) \
-- printk("%s:%d: bad pte %p(%016lx pfn %010lx).\n", __FILE__, __LINE__, \
-- &(e), __pte_val(e), pte_pfn(e))
--#define pmd_ERROR(e) \
-- printk("%s:%d: bad pmd %p(%016lx pfn %010lx).\n", __FILE__, __LINE__, \
-- &(e), __pmd_val(e), pmd_pfn(e))
--#define pud_ERROR(e) \
-- printk("%s:%d: bad pud %p(%016lx pfn %010lx).\n", __FILE__, __LINE__, \
-- &(e), __pud_val(e), (pud_val(e) & __PHYSICAL_MASK) >> PAGE_SHIFT)
--#define pgd_ERROR(e) \
-- printk("%s:%d: bad pgd %p(%016lx pfn %010lx).\n", __FILE__, __LINE__, \
-- &(e), __pgd_val(e), (pgd_val(e) & __PHYSICAL_MASK) >> PAGE_SHIFT)
-+#define pte_ERROR(e) \
-+ printk("%s:%d: bad pte %p(%016lx pfn %010lx).\n", \
-+ __FILE__, __LINE__, &(e), __pte_val(e), pte_pfn(e))
-+#define pmd_ERROR(e) \
-+ printk("%s:%d: bad pmd %p(%016lx pfn %010lx).\n", \
-+ __FILE__, __LINE__, &(e), __pmd_val(e), pmd_pfn(e))
-+#define pud_ERROR(e) \
-+ printk("%s:%d: bad pud %p(%016lx pfn %010lx).\n", \
-+ __FILE__, __LINE__, &(e), __pud_val(e), \
-+ (pud_val(e) & __PHYSICAL_MASK) >> PAGE_SHIFT)
-+#define pgd_ERROR(e) \
-+ printk("%s:%d: bad pgd %p(%016lx pfn %010lx).\n", \
-+ __FILE__, __LINE__, &(e), __pgd_val(e), \
-+ (pgd_val(e) & __PHYSICAL_MASK) >> PAGE_SHIFT)
-
- #define pgd_none(x) (!__pgd_val(x))
- #define pud_none(x) (!__pud_val(x))
-@@ -125,7 +127,7 @@ static inline void xen_set_pgd(pgd_t *pg
- xen_l4_entry_update(pgdp, pgd);
- }
-
--static inline void xen_pgd_clear(pgd_t * pgd)
-+static inline void xen_pgd_clear(pgd_t *pgd)
- {
- xen_set_pgd(pgd, xen_make_pgd(0));
- xen_set_pgd(__user_pgd(pgd), xen_make_pgd(0));
-@@ -135,43 +137,43 @@ static inline void xen_pgd_clear(pgd_t *
-
- #endif /* !__ASSEMBLY__ */
-
--#define PMD_SIZE (_AC(1,UL) << PMD_SHIFT)
--#define PMD_MASK (~(PMD_SIZE-1))
--#define PUD_SIZE (_AC(1,UL) << PUD_SHIFT)
--#define PUD_MASK (~(PUD_SIZE-1))
--#define PGDIR_SIZE (_AC(1,UL) << PGDIR_SHIFT)
--#define PGDIR_MASK (~(PGDIR_SIZE-1))
-+#define PMD_SIZE (_AC(1, UL) << PMD_SHIFT)
-+#define PMD_MASK (~(PMD_SIZE - 1))
-+#define PUD_SIZE (_AC(1, UL) << PUD_SHIFT)
-+#define PUD_MASK (~(PUD_SIZE - 1))
-+#define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT)
-+#define PGDIR_MASK (~(PGDIR_SIZE - 1))
-
-
--#define MAXMEM _AC(0x3fffffffffff, UL)
-+#define MAXMEM _AC(0x00003fffffffffff, UL)
- #define VMALLOC_START _AC(0xffffc20000000000, UL)
- #define VMALLOC_END _AC(0xffffe1ffffffffff, UL)
- #define VMEMMAP_START _AC(0xffffe20000000000, UL)
--#define MODULES_VADDR _AC(0xffffffff88000000, UL)
-+#define MODULES_VADDR _AC(0xffffffffa0000000, UL)
- #define MODULES_END _AC(0xfffffffffff00000, UL)
- #define MODULES_LEN (MODULES_END - MODULES_VADDR)
+ * Some hardware wants to get fixmapped without caching.
+ */
+-#define set_fixmap_nocache(idx, phys) \
+- __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
+-
+-#define clear_fixmap(idx) \
+- __set_fixmap(idx, 0, __pgprot(0))
++#define set_fixmap_nocache(idx, phys) \
++ __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
- #ifndef __ASSEMBLY__
+ #define FIXADDR_TOP ((unsigned long)__FIXADDR_TOP)
--static inline unsigned long pgd_bad(pgd_t pgd)
-+static inline int pgd_bad(pgd_t pgd)
- {
-- return __pgd_val(pgd) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER);
-+ return (__pgd_val(pgd) & ~(PTE_MASK | _PAGE_USER)) != _KERNPG_TABLE;
- }
+@@ -159,7 +155,7 @@ static __always_inline unsigned long fix
+ if (idx >= __end_of_fixed_addresses)
+ __this_fixmap_does_not_exist();
--static inline unsigned long pud_bad(pud_t pud)
-+static inline int pud_bad(pud_t pud)
- {
-- return __pud_val(pud) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER);
-+ return (__pud_val(pud) & ~(PTE_MASK | _PAGE_USER)) != _KERNPG_TABLE;
+- return __fix_to_virt(idx);
++ return __fix_to_virt(idx);
}
--static inline unsigned long pmd_bad(pmd_t pmd)
-+static inline int pmd_bad(pmd_t pmd)
- {
-- return __pmd_val(pmd) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER);
-+ return (__pmd_val(pmd) & ~(PTE_MASK | _PAGE_USER)) != _KERNPG_TABLE;
- }
+ static inline unsigned long virt_to_fix(const unsigned long vaddr)
+--- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/fixmap_64.h 2009-03-16 16:33:40.000000000 +0100
++++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/fixmap_64.h 2009-03-16 16:38:05.000000000 +0100
+@@ -8,8 +8,8 @@
+ * Copyright (C) 1998 Ingo Molnar
+ */
- #define pte_none(x) (!(x).pte)
- #define pte_present(x) ((x).pte & (_PAGE_PRESENT | _PAGE_PROTNONE))
+-#ifndef _ASM_FIXMAP_H
+-#define _ASM_FIXMAP_H
++#ifndef _ASM_FIXMAP_64_H
++#define _ASM_FIXMAP_64_H
--#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT)) /* FIXME: is this right? */
-+#define pages_to_mb(x) ((x) >> (20 - PAGE_SHIFT)) /* FIXME: is this right? */
+ #include <linux/kernel.h>
+ #include <asm/apicdef.h>
+@@ -35,7 +35,8 @@
- #define __pte_mfn(_pte) (((_pte).pte & PTE_MASK) >> PAGE_SHIFT)
- #define pte_mfn(_pte) ((_pte).pte & _PAGE_PRESENT ? \
-@@ -181,13 +183,13 @@ static inline unsigned long pmd_bad(pmd_
- mfn_to_local_pfn(__pte_mfn(_pte)) : \
- __pte_mfn(_pte))
+ enum fixed_addresses {
+ VSYSCALL_LAST_PAGE,
+- VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1,
++ VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE
++ + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1,
+ VSYSCALL_HPET,
+ FIX_DBGP_BASE,
+ FIX_EARLYCON_MEM_BASE,
+@@ -45,11 +46,12 @@ enum fixed_addresses {
+ #endif
+ #ifndef CONFIG_XEN
+ FIX_IO_APIC_BASE_0,
+- FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1,
++ FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1,
+ #endif
+ #ifdef CONFIG_EFI
+ FIX_EFI_IO_MAP_LAST_PAGE,
+- FIX_EFI_IO_MAP_FIRST_PAGE = FIX_EFI_IO_MAP_LAST_PAGE+MAX_EFI_IO_PAGES-1,
++ FIX_EFI_IO_MAP_FIRST_PAGE = FIX_EFI_IO_MAP_LAST_PAGE
++ + MAX_EFI_IO_PAGES - 1,
+ #endif
+ #ifdef CONFIG_ACPI
+ FIX_ACPI_BEGIN,
+@@ -79,19 +81,16 @@ enum fixed_addresses {
+ __end_of_fixed_addresses
+ };
--#define pte_page(x) pfn_to_page(pte_pfn(x))
-+#define pte_page(x) pfn_to_page(pte_pfn((x)))
+-extern void __set_fixmap (enum fixed_addresses idx,
+- unsigned long phys, pgprot_t flags);
++extern void __set_fixmap(enum fixed_addresses idx,
++ unsigned long phys, pgprot_t flags);
+-#define set_fixmap(idx, phys) \
+- __set_fixmap(idx, phys, PAGE_KERNEL)
++#define set_fixmap(idx, phys) \
++ __set_fixmap(idx, phys, PAGE_KERNEL)
/*
- * Macro to mark a page protection value as "uncacheable".
+ * Some hardware wants to get fixmapped without caching.
*/
--#define pgprot_noncached(prot) (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT))
+-#define set_fixmap_nocache(idx, phys) \
+- __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
-
-+#define pgprot_noncached(prot) \
-+ (__pgprot(pgprot_val((prot)) | _PAGE_PCD | _PAGE_PWT))
+-#define clear_fixmap(idx) \
+- __set_fixmap(idx, 0, __pgprot(0))
++#define set_fixmap_nocache(idx, phys) \
++ __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
- /*
- * Conversion functions: convert a page and protection to a page entry,
-@@ -197,36 +199,39 @@ static inline unsigned long pmd_bad(pmd_
- /*
- * Level 4 access.
- */
--#define pgd_page_vaddr(pgd) ((unsigned long) __va(pgd_val(pgd) & PTE_MASK))
--#define pgd_page(pgd) (pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT))
--#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1))
--#define pgd_offset(mm, addr) ((mm)->pgd + pgd_index(addr))
--#define pgd_offset_k(address) (init_level4_pgt + pgd_index(address))
-+#define pgd_page_vaddr(pgd) \
-+ ((unsigned long)__va((unsigned long)pgd_val((pgd)) & PTE_MASK))
-+#define pgd_page(pgd) (pfn_to_page(pgd_val((pgd)) >> PAGE_SHIFT))
-+#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
-+#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address)))
-+#define pgd_offset_k(address) (init_level4_pgt + pgd_index((address)))
- #define pgd_present(pgd) (__pgd_val(pgd) & _PAGE_PRESENT)
- static inline int pgd_large(pgd_t pgd) { return 0; }
- #define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE)
+ #define FIXADDR_TOP (VSYSCALL_END-PAGE_SIZE)
+ #define FIXADDR_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
+--- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/highmem.h 2009-03-16 16:33:40.000000000 +0100
++++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/highmem.h 2009-03-16 16:38:05.000000000 +0100
+@@ -8,7 +8,7 @@
+ * Gerhard.Wichert@pdb.siemens.de
+ *
+ *
+- * Redesigned the x86 32-bit VM architecture to deal with
++ * Redesigned the x86 32-bit VM architecture to deal with
+ * up to 16 Terabyte physical memory. With current x86 CPUs
+ * we now support up to 64 Gigabytes physical RAM.
+ *
+--- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/io.h 2009-02-16 16:18:36.000000000 +0100
++++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/io.h 2009-03-16 16:38:05.000000000 +0100
+@@ -1,5 +1,22 @@
++#ifndef _ASM_X86_IO_H
++#define _ASM_X86_IO_H
++
++#define ARCH_HAS_IOREMAP_WC
++
+ #ifdef CONFIG_X86_32
+ # include "io_32.h"
+ #else
+ # include "io_64.h"
+ #endif
++
++extern void *xlate_dev_mem_ptr(unsigned long phys);
++extern void unxlate_dev_mem_ptr(unsigned long phys, void *addr);
++
++extern void map_devmem(unsigned long pfn, unsigned long len, pgprot_t);
++extern void unmap_devmem(unsigned long pfn, unsigned long len, pgprot_t);
++
++extern int ioremap_check_change_attr(unsigned long mfn, unsigned long size,
++ unsigned long prot_val);
++extern void __iomem *ioremap_wc(unsigned long offset, unsigned long size);
++
++#endif /* _ASM_X86_IO_H */
+--- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/irqflags.h 2009-03-16 16:33:40.000000000 +0100
++++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/irqflags.h 2009-03-16 16:38:05.000000000 +0100
+@@ -137,11 +137,11 @@ sysexit_ecrit: /**** END OF SYSEXIT CRIT
+ #endif /* __ASSEMBLY__ */
- /* PUD - Level3 access */
- /* to find an entry in a page-table-directory. */
--#define pud_page_vaddr(pud) ((unsigned long) __va(pud_val(pud) & PHYSICAL_PAGE_MASK))
--#define pud_page(pud) (pfn_to_page(pud_val(pud) >> PAGE_SHIFT))
--#define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
--#define pud_offset(pgd, address) ((pud_t *) pgd_page_vaddr(*(pgd)) + pud_index(address))
-+#define pud_page_vaddr(pud) \
-+ ((unsigned long)__va(pud_val((pud)) & PHYSICAL_PAGE_MASK))
-+#define pud_page(pud) (pfn_to_page(pud_val((pud)) >> PAGE_SHIFT))
-+#define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD - 1))
-+#define pud_offset(pgd, address) \
-+ ((pud_t *)pgd_page_vaddr(*(pgd)) + pud_index((address)))
- #define pud_present(pud) (__pud_val(pud) & _PAGE_PRESENT)
+ #ifndef __ASSEMBLY__
+-#define raw_local_save_flags(flags) \
+- do { (flags) = __raw_local_save_flags(); } while (0)
++#define raw_local_save_flags(flags) \
++ do { (flags) = __raw_local_save_flags(); } while (0)
- static inline int pud_large(pud_t pte)
+-#define raw_local_irq_save(flags) \
+- do { (flags) = __raw_local_irq_save(); } while (0)
++#define raw_local_irq_save(flags) \
++ do { (flags) = __raw_local_irq_save(); } while (0)
+
+ static inline int raw_irqs_disabled_flags(unsigned long flags)
{
-- return (__pud_val(pte) & (_PAGE_PSE|_PAGE_PRESENT)) ==
-- (_PAGE_PSE|_PAGE_PRESENT);
-+ return (__pud_val(pte) & (_PAGE_PSE | _PAGE_PRESENT)) ==
-+ (_PAGE_PSE | _PAGE_PRESENT);
+--- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/mmu_context_32.h 2009-03-16 16:33:40.000000000 +0100
++++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/mmu_context_32.h 2009-03-16 16:38:05.000000000 +0100
+@@ -94,7 +94,7 @@ static inline void switch_mm(struct mm_s
+ BUG_ON(per_cpu(cpu_tlbstate, cpu).active_mm != next);
+
+ if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
+- /* We were in lazy tlb mode and leave_mm disabled
++ /* We were in lazy tlb mode and leave_mm disabled
+ * tlb flush IPI delivery. We must reload %cr3.
+ */
+ load_cr3(next->pgd);
+@@ -107,10 +107,10 @@ static inline void switch_mm(struct mm_s
+ #define deactivate_mm(tsk, mm) \
+ asm("movl %0,%%gs": :"r" (0));
+
+-#define activate_mm(prev, next) \
+- do { \
+- xen_activate_mm(prev, next); \
+- switch_mm((prev),(next),NULL); \
+- } while(0)
++#define activate_mm(prev, next) \
++do { \
++ xen_activate_mm(prev, next); \
++ switch_mm((prev), (next), NULL); \
++} while (0)
+
+ #endif
+--- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/mmu_context_64.h 2009-03-16 16:33:40.000000000 +0100
++++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/mmu_context_64.h 2009-03-16 16:38:05.000000000 +0100
+@@ -21,7 +21,7 @@ void destroy_context(struct mm_struct *m
+ static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
+ {
+ #if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
+- if (read_pda(mmu_state) == TLBSTATE_OK)
++ if (read_pda(mmu_state) == TLBSTATE_OK)
+ write_pda(mmu_state, TLBSTATE_LAZY);
+ #endif
+ }
+@@ -62,7 +62,7 @@ extern void mm_pin(struct mm_struct *mm)
+ extern void mm_unpin(struct mm_struct *mm);
+ void mm_pin_all(void);
+
+-static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
++static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
+ struct task_struct *tsk)
+ {
+ unsigned cpu = smp_processor_id();
+@@ -106,7 +106,7 @@ static inline void switch_mm(struct mm_s
+ if (read_pda(active_mm) != next)
+ BUG();
+ if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
+- /* We were in lazy tlb mode and leave_mm disabled
++ /* We were in lazy tlb mode and leave_mm disabled
+ * tlb flush IPI delivery. We must reload CR3
+ * to make sure to use no freed page tables.
+ */
+@@ -118,10 +118,11 @@ static inline void switch_mm(struct mm_s
+ #endif
}
- /* PMD - Level 2 access */
--#define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val(pmd) & PTE_MASK))
--#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
-+#define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val((pmd)) & PTE_MASK))
-+#define pmd_page(pmd) (pfn_to_page(pmd_val((pmd)) >> PAGE_SHIFT))
+-#define deactivate_mm(tsk,mm) do { \
+- load_gs_index(0); \
+- asm volatile("movl %0,%%fs"::"r"(0)); \
+-} while(0)
++#define deactivate_mm(tsk, mm) \
++do { \
++ load_gs_index(0); \
++ asm volatile("movl %0,%%fs"::"r"(0)); \
++} while (0)
+
+ static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next)
+ {
+--- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/page.h 2009-03-16 16:33:40.000000000 +0100
++++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/page.h 2009-03-16 16:38:05.000000000 +0100
+@@ -20,8 +20,16 @@
+ #define _PAGE_BIT_IO 9
+ #define _PAGE_IO (_AC(1, L)<<_PAGE_BIT_IO)
+
+-#define PHYSICAL_PAGE_MASK (~(_AT(phys_addr_t, PAGE_SIZE) - 1) & __PHYSICAL_MASK)
+-#define PTE_MASK _AT(pteval_t, PHYSICAL_PAGE_MASK)
++#define __PHYSICAL_MASK ((phys_addr_t)(1ULL << __PHYSICAL_MASK_SHIFT) - 1)
++#define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1)
++
++/* Cast PAGE_MASK to a signed type so that it is sign-extended if
++ virtual addresses are 32-bits but physical addresses are larger
++ (ie, 32-bit PAE). */
++#define PHYSICAL_PAGE_MASK (((signed long)PAGE_MASK) & __PHYSICAL_MASK)
++
++/* PTE_MASK extracts the PFN from a (pte|pmd|pud|pgd)val_t */
++#define PTE_MASK ((pteval_t)PHYSICAL_PAGE_MASK)
+
+ #define PMD_PAGE_SIZE (_AC(1, UL) << PMD_SHIFT)
+ #define PMD_PAGE_MASK (~(PMD_PAGE_SIZE-1))
+@@ -34,19 +42,14 @@
+ /* to align the pointer to the (next) page boundary */
+ #define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK)
--#define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1))
--#define pmd_offset(dir, address) ((pmd_t *) pud_page_vaddr(*(dir)) + \
-- pmd_index(address))
-+#define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))
-+#define pmd_offset(dir, address) ((pmd_t *)pud_page_vaddr(*(dir)) + \
-+ pmd_index(address))
- #define pmd_none(x) (!__pmd_val(x))
- #if CONFIG_XEN_COMPAT <= 0x030002
- /* pmd_present doesn't just test the _PAGE_PRESENT bit since wr.p.t.
-@@ -235,43 +240,56 @@ static inline int pud_large(pud_t pte)
- #else
- #define pmd_present(x) (__pmd_val(x) & _PAGE_PRESENT)
+-#define __PHYSICAL_MASK _AT(phys_addr_t, (_AC(1,ULL) << __PHYSICAL_MASK_SHIFT) - 1)
+-#define __VIRTUAL_MASK ((_AC(1,UL) << __VIRTUAL_MASK_SHIFT) - 1)
+-
+ #ifndef __ASSEMBLY__
+ #include <linux/types.h>
#endif
--#define pfn_pmd(nr,prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val(prot)))
--#define pmd_pfn(x) ((pmd_val(x) & __PHYSICAL_MASK) >> PAGE_SHIFT)
-+#define pfn_pmd(nr, prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val((prot))))
-+#define pmd_pfn(x) ((pmd_val((x)) & __PHYSICAL_MASK) >> PAGE_SHIFT)
-
- #define pte_to_pgoff(pte) ((__pte_val(pte) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT)
--#define pgoff_to_pte(off) ((pte_t) { .pte = ((off) << PAGE_SHIFT) | _PAGE_FILE })
-+#define pgoff_to_pte(off) ((pte_t) { .pte = ((off) << PAGE_SHIFT) | \
-+ _PAGE_FILE })
- #define PTE_FILE_MAX_BITS __PHYSICAL_MASK_SHIFT
- /* PTE - Level 1 access. */
+ #ifdef CONFIG_X86_64
+ #include <asm/page_64.h>
+-#define max_pfn_mapped end_pfn_map
+ #else
+ #include <asm/page_32.h>
+-#define max_pfn_mapped max_low_pfn
+ #endif /* CONFIG_X86_64 */
- /* page, protection -> pte */
--#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
--
--#define pte_index(address) \
-- (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
-+#define mk_pte(page, pgprot) pfn_pte(page_to_pfn((page)), (pgprot))
-+
-+#define pte_index(address) (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
- #define pte_offset_kernel(dir, address) ((pte_t *) pmd_page_vaddr(*(dir)) + \
-- pte_index(address))
-+ pte_index((address)))
+ #define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET)
+@@ -59,6 +62,9 @@
+ #ifndef __ASSEMBLY__
- /* x86-64 always has all page tables mapped. */
--#define pte_offset_map(dir,address) pte_offset_kernel(dir,address)
--#define pte_offset_map_nested(dir,address) pte_offset_kernel(dir,address)
-+#define pte_offset_map(dir, address) pte_offset_kernel((dir), (address))
-+#define pte_offset_map_nested(dir, address) pte_offset_kernel((dir), (address))
- #define pte_unmap(pte) /* NOP */
--#define pte_unmap_nested(pte) /* NOP */
-+#define pte_unmap_nested(pte) /* NOP */
+ extern int page_is_ram(unsigned long pagenr);
++extern int devmem_is_allowed(unsigned long pagenr);
+
-+#define update_mmu_cache(vma, address, pte) do { } while (0)
++extern unsigned long max_pfn_mapped;
--#define update_mmu_cache(vma,address,pte) do { } while (0)
-+extern int direct_gbpages;
+ struct page;
- /* Encode and de-code a swap entry */
--#define __swp_type(x) (((x).val >> 1) & 0x3f)
--#define __swp_offset(x) ((x).val >> 8)
--#define __swp_entry(type, offset) ((swp_entry_t) { ((type) << 1) | ((offset) << 8) })
-+#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE
-+#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1)
-+#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1)
-+#else
-+#define SWP_TYPE_BITS (_PAGE_BIT_PROTNONE - _PAGE_BIT_PRESENT - 1)
-+#define SWP_OFFSET_SHIFT (_PAGE_BIT_FILE + 1)
-+#endif
-+
-+#define __swp_type(x) (((x).val >> (_PAGE_BIT_PRESENT + 1)) \
-+ & ((1U << SWP_TYPE_BITS) - 1))
-+#define __swp_offset(x) ((x).val >> SWP_OFFSET_SHIFT)
-+#define __swp_entry(type, offset) ((swp_entry_t) { \
-+ ((type) << (_PAGE_BIT_PRESENT + 1)) \
-+ | ((offset) << SWP_OFFSET_SHIFT) })
- #define __pte_to_swp_entry(pte) ((swp_entry_t) { __pte_val(pte) })
- #define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val })
+--- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/page_64.h 2009-03-16 16:33:40.000000000 +0100
++++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/page_64.h 2009-03-16 16:38:05.000000000 +0100
+@@ -5,7 +5,7 @@
--extern int kern_addr_valid(unsigned long addr);
-+extern int kern_addr_valid(unsigned long addr);
- extern void cleanup_highmap(void);
+ #define THREAD_ORDER 1
+ #define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER)
+-#define CURRENT_MASK (~(THREAD_SIZE-1))
++#define CURRENT_MASK (~(THREAD_SIZE - 1))
--#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
-- direct_remap_pfn_range(vma,vaddr,pfn,size,prot,DOMID_IO)
-+#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
-+ direct_remap_pfn_range(vma, vaddr, pfn, size, prot, DOMID_IO)
+ #define EXCEPTION_STACK_ORDER 0
+ #define EXCEPTION_STKSZ (PAGE_SIZE << EXCEPTION_STACK_ORDER)
+@@ -53,10 +53,10 @@
+ #define __VIRTUAL_MASK_SHIFT 48
- #define HAVE_ARCH_UNMAPPED_AREA
- #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
-@@ -284,8 +302,10 @@ extern void cleanup_highmap(void);
+ /*
+- * Kernel image size is limited to 128 MB (see level2_kernel_pgt in
++ * Kernel image size is limited to 512 MB (see level2_kernel_pgt in
+ * arch/x86/kernel/head_64.S), and it is mapped here:
+ */
+-#define KERNEL_IMAGE_SIZE (128*1024*1024)
++#define KERNEL_IMAGE_SIZE (512 * 1024 * 1024)
+ #define KERNEL_IMAGE_START _AC(0xffffffff80000000, UL)
- /* fs/proc/kcore.c */
- #define kc_vaddr_to_offset(v) ((v) & __VIRTUAL_MASK)
--#define kc_offset_to_vaddr(o) \
-- (((o) & (1UL << (__VIRTUAL_MASK_SHIFT-1))) ? ((o) | (~__VIRTUAL_MASK)) : (o))
-+#define kc_offset_to_vaddr(o) \
-+ (((o) & (1UL << (__VIRTUAL_MASK_SHIFT - 1))) \
-+ ? ((o) | ~__VIRTUAL_MASK) \
-+ : (o))
+ #ifndef __ASSEMBLY__
+@@ -64,7 +64,6 @@ void clear_page(void *page);
+ void copy_page(void *to, void *from);
- #define __HAVE_ARCH_PTE_SAME
- #endif /* !__ASSEMBLY__ */
---- a/include/asm-x86/mach-xen/asm/pgtable.h
-+++ b/include/asm-x86/mach-xen/asm/pgtable.h
-@@ -1,17 +1,15 @@
- #ifndef _ASM_X86_PGTABLE_H
- #define _ASM_X86_PGTABLE_H
+ extern unsigned long end_pfn;
+-extern unsigned long end_pfn_map;
--#define USER_PTRS_PER_PGD ((TASK_SIZE-1)/PGDIR_SIZE+1)
- #define FIRST_USER_ADDRESS 0
+ static inline unsigned long __phys_addr(unsigned long x)
+ {
+@@ -89,6 +88,9 @@ typedef union { pteval_t pte; unsigned i
--#define _PAGE_BIT_PRESENT 0
--#define _PAGE_BIT_RW 1
--#define _PAGE_BIT_USER 2
--#define _PAGE_BIT_PWT 3
--#define _PAGE_BIT_PCD 4
--#define _PAGE_BIT_ACCESSED 5
--#define _PAGE_BIT_DIRTY 6
--#define _PAGE_BIT_FILE 6
-+#define _PAGE_BIT_PRESENT 0 /* is present */
-+#define _PAGE_BIT_RW 1 /* writeable */
-+#define _PAGE_BIT_USER 2 /* userspace addressable */
-+#define _PAGE_BIT_PWT 3 /* page write through */
-+#define _PAGE_BIT_PCD 4 /* page cache disabled */
-+#define _PAGE_BIT_ACCESSED 5 /* was accessed (raised by CPU) */
-+#define _PAGE_BIT_DIRTY 6 /* was written to (raised by CPU) */
- #define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */
- #define _PAGE_BIT_PAT 7 /* on 4KB pages */
- #define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
-@@ -22,6 +20,14 @@
- #define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
- #define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
+ #define vmemmap ((struct page *)VMEMMAP_START)
-+/* If _PAGE_BIT_PRESENT is clear, we use these: */
-+
-+/* set: nonlinear file mapping, saved PTE; unset:swap */
-+#define _PAGE_BIT_FILE _PAGE_BIT_DIRTY
-+
-+/* if the user mapped it with PROT_NONE; pte_present gives true */
-+#define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL
++extern unsigned long init_memory_mapping(unsigned long start,
++ unsigned long end);
+
- /*
- * Note: we use _AC(1, L) instead of _AC(1, UL) so that we get a
- * sign-extended value on 32-bit with all 1's in the upper word,
-@@ -48,10 +54,8 @@
- #define _PAGE_NX 0
- #endif
+ #endif /* !__ASSEMBLY__ */
--/* If _PAGE_PRESENT is clear, we use these: */
--#define _PAGE_FILE _PAGE_DIRTY /* nonlinear file mapping, saved PTE; unset:swap */
--#define _PAGE_PROTNONE _PAGE_PSE /* if the user mapped it with PROT_NONE;
-- pte_present gives true */
-+#define _PAGE_FILE (_AC(1, L)<<_PAGE_BIT_FILE)
-+#define _PAGE_PROTNONE (_AC(1, L)<<_PAGE_BIT_PROTNONE)
+ #ifdef CONFIG_FLATMEM
+--- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/pci.h 2009-03-16 16:33:40.000000000 +0100
++++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/pci.h 2009-03-16 16:38:05.000000000 +0100
+@@ -8,14 +8,13 @@
+ #include <asm/scatterlist.h>
+ #include <asm/io.h>
- #ifndef __ASSEMBLY__
- #if defined(CONFIG_X86_64) && CONFIG_XEN_COMPAT <= 0x030002
-@@ -61,20 +65,42 @@ extern unsigned int __kernel_page_user;
- #endif
+-
+ #ifdef __KERNEL__
+
+ struct pci_sysdata {
+ int domain; /* PCI domain */
+ int node; /* NUMA node */
+ #ifdef CONFIG_X86_64
+- void* iommu; /* IOMMU private data */
++ void *iommu; /* IOMMU private data */
#endif
+ #ifdef CONFIG_XEN_PCIDEV_FRONTEND
+ struct pcifront_device *pdev;
+@@ -23,6 +22,8 @@ struct pci_sysdata {
+ };
--#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
--#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY | __kernel_page_user)
-+#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
-+ _PAGE_ACCESSED | _PAGE_DIRTY)
-+#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \
-+ _PAGE_DIRTY | __kernel_page_user)
-+
-+/* Set of bits not changed in pte_modify */
-+#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_CACHE_MASK | _PAGE_IO | \
-+ _PAGE_ACCESSED | _PAGE_DIRTY)
+ /* scan a bus after allocating a pci_sysdata for it */
++extern struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops,
++ int node);
+ extern struct pci_bus *pci_scan_bus_with_sysdata(int busno);
--#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_IO)
-+/*
-+ * PAT settings are part of the hypervisor interface, which sets the
-+ * MSR to 0x050100070406 (i.e. WB, WT, UC-, UC, WC, WP [, UC, UC]).
-+ */
-+#define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT | _PAGE_PAT)
-+#define _PAGE_CACHE_WB (0)
-+#define _PAGE_CACHE_WT (_PAGE_PWT)
-+#define _PAGE_CACHE_WC (_PAGE_PAT)
-+#define _PAGE_CACHE_WP (_PAGE_PAT | _PAGE_PWT)
-+#define _PAGE_CACHE_UC_MINUS (_PAGE_PCD)
-+#define _PAGE_CACHE_UC (_PAGE_PCD | _PAGE_PWT)
+ static inline int pci_domain_nr(struct pci_bus *bus)
+@@ -36,6 +37,7 @@ static inline int pci_proc_domain(struct
+ return pci_domain_nr(bus);
+ }
+
++extern void pci_iommu_alloc(void);
+
+ /* Can be used to override the logic in pci_scan_bus for skipping
+ already-configured bus numbers - to be used for buggy BIOSes
+@@ -57,7 +59,7 @@ extern unsigned long pci_mem_start;
+ #define PCIBIOS_MIN_CARDBUS_IO 0x4000
+
+ void pcibios_config_init(void);
+-struct pci_bus * pcibios_scan_root(int bus);
++struct pci_bus *pcibios_scan_root(int bus);
- #define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
--#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
-+#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
-+ _PAGE_ACCESSED | _PAGE_NX)
+ void pcibios_set_master(struct pci_dev *dev);
+ void pcibios_penalize_isa_irq(int irq, int active);
+@@ -67,7 +69,8 @@ int pcibios_set_irq_routing(struct pci_d
--#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
--#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
--#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
-+#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | \
-+ _PAGE_USER | _PAGE_ACCESSED)
-+#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
-+ _PAGE_ACCESSED | _PAGE_NX)
-+#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
-+ _PAGE_ACCESSED)
- #define PAGE_COPY PAGE_COPY_NOEXEC
--#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
--#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
-+#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | \
-+ _PAGE_ACCESSED | _PAGE_NX)
-+#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
-+ _PAGE_ACCESSED)
+ #define HAVE_PCI_MMAP
+ extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
+- enum pci_mmap_state mmap_state, int write_combine);
++ enum pci_mmap_state mmap_state,
++ int write_combine);
- #ifdef CONFIG_X86_32
- #define _PAGE_KERNEL_EXEC \
-@@ -93,6 +119,7 @@ extern pteval_t __PAGE_KERNEL, __PAGE_KE
- #define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW)
- #define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW)
- #define __PAGE_KERNEL_EXEC_NOCACHE (__PAGE_KERNEL_EXEC | _PAGE_PCD | _PAGE_PWT)
-+#define __PAGE_KERNEL_WC (__PAGE_KERNEL | _PAGE_CACHE_WC)
- #define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD | _PAGE_PWT)
- #define __PAGE_KERNEL_UC_MINUS (__PAGE_KERNEL | _PAGE_PCD)
- #define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER)
-@@ -109,6 +136,7 @@ extern pteval_t __PAGE_KERNEL, __PAGE_KE
- #define PAGE_KERNEL_RO MAKE_GLOBAL(__PAGE_KERNEL_RO)
- #define PAGE_KERNEL_EXEC MAKE_GLOBAL(__PAGE_KERNEL_EXEC)
- #define PAGE_KERNEL_RX MAKE_GLOBAL(__PAGE_KERNEL_RX)
-+#define PAGE_KERNEL_WC MAKE_GLOBAL(__PAGE_KERNEL_WC)
- #define PAGE_KERNEL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_NOCACHE)
- #define PAGE_KERNEL_UC_MINUS MAKE_GLOBAL(__PAGE_KERNEL_UC_MINUS)
- #define PAGE_KERNEL_EXEC_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_EXEC_NOCACHE)
-@@ -142,7 +170,7 @@ extern pteval_t __PAGE_KERNEL, __PAGE_KE
- * ZERO_PAGE is a global shared page that is always zero: used
- * for zero-mapped memory areas etc..
- */
--extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)];
-+extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)];
- #define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
- extern spinlock_t pgd_lock;
-@@ -152,30 +180,111 @@ extern struct list_head pgd_list;
- * The following only work if pte_present() is true.
- * Undefined behaviour if not..
- */
--static inline int pte_dirty(pte_t pte) { return __pte_val(pte) & _PAGE_DIRTY; }
--static inline int pte_young(pte_t pte) { return __pte_val(pte) & _PAGE_ACCESSED; }
--static inline int pte_write(pte_t pte) { return __pte_val(pte) & _PAGE_RW; }
--static inline int pte_file(pte_t pte) { return __pte_val(pte) & _PAGE_FILE; }
--static inline int pte_huge(pte_t pte) { return __pte_val(pte) & _PAGE_PSE; }
--static inline int pte_global(pte_t pte) { return 0; }
--static inline int pte_exec(pte_t pte) { return !(__pte_val(pte) & _PAGE_NX); }
--
--static inline int pmd_large(pmd_t pte) {
-- return (__pmd_val(pte) & (_PAGE_PSE|_PAGE_PRESENT)) ==
-- (_PAGE_PSE|_PAGE_PRESENT);
--}
--
--static inline pte_t pte_mkclean(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_DIRTY); }
--static inline pte_t pte_mkold(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_ACCESSED); }
--static inline pte_t pte_wrprotect(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_RW); }
--static inline pte_t pte_mkexec(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_NX); }
--static inline pte_t pte_mkdirty(pte_t pte) { return __pte_ma(__pte_val(pte) | _PAGE_DIRTY); }
--static inline pte_t pte_mkyoung(pte_t pte) { return __pte_ma(__pte_val(pte) | _PAGE_ACCESSED); }
--static inline pte_t pte_mkwrite(pte_t pte) { return __pte_ma(__pte_val(pte) | _PAGE_RW); }
--static inline pte_t pte_mkhuge(pte_t pte) { return __pte_ma(__pte_val(pte) | _PAGE_PSE); }
--static inline pte_t pte_clrhuge(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_PSE); }
--static inline pte_t pte_mkglobal(pte_t pte) { return pte; }
--static inline pte_t pte_clrglobal(pte_t pte) { return pte; }
-+static inline int pte_dirty(pte_t pte)
-+{
-+ return __pte_val(pte) & _PAGE_DIRTY;
-+}
+ #ifdef CONFIG_PCI
+--- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/pgalloc.h 2009-02-16 16:18:36.000000000 +0100
++++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/pgalloc.h 2009-03-16 16:38:05.000000000 +0100
+@@ -1,5 +1,149 @@
+-#ifdef CONFIG_X86_32
+-# include "pgalloc_32.h"
+-#else
+-# include "pgalloc_64.h"
++#ifndef _ASM_X86_PGALLOC_H
++#define _ASM_X86_PGALLOC_H
+
-+static inline int pte_young(pte_t pte)
-+{
-+ return __pte_val(pte) & _PAGE_ACCESSED;
-+}
++#include <linux/threads.h>
++#include <linux/mm.h> /* for struct page */
++#include <linux/pagemap.h>
+
-+static inline int pte_write(pte_t pte)
-+{
-+ return __pte_val(pte) & _PAGE_RW;
-+}
++#include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */
+
-+static inline int pte_file(pte_t pte)
-+{
-+ return __pte_val(pte) & _PAGE_FILE;
-+}
++static inline void paravirt_alloc_pte(struct mm_struct *mm, unsigned long pfn) {}
++static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn) {}
++static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long clonepfn,
++ unsigned long start, unsigned long count) {}
++static inline void paravirt_alloc_pud(struct mm_struct *mm, unsigned long pfn) {}
++static inline void paravirt_release_pte(unsigned long pfn) {}
++static inline void paravirt_release_pmd(unsigned long pfn) {}
++static inline void paravirt_release_pud(unsigned long pfn) {}
+
-+static inline int pte_huge(pte_t pte)
-+{
-+ return __pte_val(pte) & _PAGE_PSE;
-+}
++#ifdef CONFIG_X86_64
++void early_make_page_readonly(void *va, unsigned int feature);
++pmd_t *early_get_pmd(unsigned long va);
++#define make_lowmem_page_readonly make_page_readonly
++#define make_lowmem_page_writable make_page_writable
+ #endif
+
-+static inline int pte_global(pte_t pte)
-+{
-+ return 0;
-+}
++/*
++ * Allocate and free page tables.
++ */
++extern pgd_t *pgd_alloc(struct mm_struct *);
++extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
+
-+static inline int pte_exec(pte_t pte)
-+{
-+ return !(__pte_val(pte) & _PAGE_NX);
-+}
++extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long);
++extern pgtable_t pte_alloc_one(struct mm_struct *, unsigned long);
+
-+static inline int pte_special(pte_t pte)
-+{
-+ return 0;
-+}
++/* Should really implement gc for free page table pages. This could be
++ done with a reference count in struct page. */
+
-+static inline int pmd_large(pmd_t pte)
++static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
+{
-+ return (__pmd_val(pte) & (_PAGE_PSE | _PAGE_PRESENT)) ==
-+ (_PAGE_PSE | _PAGE_PRESENT);
++ BUG_ON((unsigned long)pte & (PAGE_SIZE-1));
++ make_lowmem_page_writable(pte, XENFEAT_writable_page_tables);
++ free_page((unsigned long)pte);
+}
+
-+static inline pte_t pte_mkclean(pte_t pte)
++extern void __pte_free(pgtable_t);
++static inline void pte_free(struct mm_struct *mm, struct page *pte)
+{
-+ return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_DIRTY);
++ __pte_free(pte);
+}
+
-+static inline pte_t pte_mkold(pte_t pte)
-+{
-+ return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_ACCESSED);
-+}
++extern void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte);
+
-+static inline pte_t pte_wrprotect(pte_t pte)
++static inline void pmd_populate_kernel(struct mm_struct *mm,
++ pmd_t *pmd, pte_t *pte)
+{
-+ return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_RW);
++ paravirt_alloc_pte(mm, __pa(pte) >> PAGE_SHIFT);
++ set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE));
+}
+
-+static inline pte_t pte_mkexec(pte_t pte)
++static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
++ struct page *pte)
+{
-+ return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_NX);
-+}
++ unsigned long pfn = page_to_pfn(pte);
+
-+static inline pte_t pte_mkdirty(pte_t pte)
-+{
-+ return __pte_ma(__pte_val(pte) | _PAGE_DIRTY);
++ paravirt_alloc_pte(mm, pfn);
++ if (PagePinned(virt_to_page(mm->pgd))) {
++ if (!PageHighMem(pte))
++ BUG_ON(HYPERVISOR_update_va_mapping(
++ (unsigned long)__va(pfn << PAGE_SHIFT),
++ pfn_pte(pfn, PAGE_KERNEL_RO), 0));
++#ifndef CONFIG_X86_64
++ else if (!TestSetPagePinned(pte))
++ kmap_flush_unused();
++#endif
++ set_pmd(pmd, __pmd(((pmdval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE));
++ } else
++ *pmd = __pmd(((pmdval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE);
+}
+
-+static inline pte_t pte_mkyoung(pte_t pte)
-+{
-+ return __pte_ma(__pte_val(pte) | _PAGE_ACCESSED);
-+}
++#define pmd_pgtable(pmd) pmd_page(pmd)
+
-+static inline pte_t pte_mkwrite(pte_t pte)
-+{
-+ return __pte_ma(__pte_val(pte) | _PAGE_RW);
-+}
++#if PAGETABLE_LEVELS > 2
++extern pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr);
++extern void __pmd_free(pgtable_t);
+
-+static inline pte_t pte_mkhuge(pte_t pte)
++static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
+{
-+ return __pte_ma(__pte_val(pte) | _PAGE_PSE);
++ BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
++ __pmd_free(virt_to_page(pmd));
+}
+
-+static inline pte_t pte_clrhuge(pte_t pte)
++extern void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd);
++
++#ifdef CONFIG_X86_PAE
++extern void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd);
++#else /* !CONFIG_X86_PAE */
++static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
+{
-+ return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_PSE);
++ paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
++ if (unlikely(PagePinned(virt_to_page((mm)->pgd)))) {
++ BUG_ON(HYPERVISOR_update_va_mapping(
++ (unsigned long)pmd,
++ pfn_pte(virt_to_phys(pmd)>>PAGE_SHIFT,
++ PAGE_KERNEL_RO), 0));
++ set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd)));
++ } else
++ *pud = __pud(_PAGE_TABLE | __pa(pmd));
+}
++#endif /* CONFIG_X86_PAE */
+
-+static inline pte_t pte_mkglobal(pte_t pte)
++#if PAGETABLE_LEVELS > 3
++#define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD)
++
++/*
++ * We need to use the batch mode here, but pgd_pupulate() won't be
++ * be called frequently.
++ */
++static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
+{
-+ return pte;
++ paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT);
++ if (unlikely(PagePinned(virt_to_page((mm)->pgd)))) {
++ BUG_ON(HYPERVISOR_update_va_mapping(
++ (unsigned long)pud,
++ pfn_pte(virt_to_phys(pud)>>PAGE_SHIFT,
++ PAGE_KERNEL_RO), 0));
++ set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud)));
++ set_pgd(__user_pgd(pgd), __pgd(_PAGE_TABLE | __pa(pud)));
++ } else {
++ *(pgd) = __pgd(_PAGE_TABLE | __pa(pud));
++ *__user_pgd(pgd) = *(pgd);
++ }
+}
+
-+static inline pte_t pte_clrglobal(pte_t pte)
++static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
+{
-+ return pte;
++ return (pud_t *)pmd_alloc_one(mm, addr);
+}
+
-+static inline pte_t pte_mkspecial(pte_t pte)
++static inline void pud_free(struct mm_struct *mm, pud_t *pud)
+{
-+ return pte;
++ BUG_ON((unsigned long)pud & (PAGE_SIZE-1));
++ __pmd_free(virt_to_page(pud));
+}
++
++extern void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud);
++#endif /* PAGETABLE_LEVELS > 3 */
++#endif /* PAGETABLE_LEVELS > 2 */
++
++#endif /* _ASM_X86_PGALLOC_H */
+--- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/pgalloc_32.h 2009-03-16 16:33:40.000000000 +0100
++++ /dev/null 1970-01-01 00:00:00.000000000 +0000
+@@ -1,111 +0,0 @@
+-#ifndef _I386_PGALLOC_H
+-#define _I386_PGALLOC_H
+-
+-#include <linux/threads.h>
+-#include <linux/mm.h> /* for struct page */
+-#include <linux/pagemap.h>
+-#include <asm/tlb.h>
+-#include <asm-generic/tlb.h>
+-#include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */
+-
+-#define paravirt_alloc_pt(mm, pfn) do { } while (0)
+-#define paravirt_alloc_pd(mm, pfn) do { } while (0)
+-#define paravirt_alloc_pd_clone(pfn, clonepfn, start, count) do { } while (0)
+-#define paravirt_release_pt(pfn) do { } while (0)
+-#define paravirt_release_pd(pfn) do { } while (0)
+-
+-static inline void pmd_populate_kernel(struct mm_struct *mm,
+- pmd_t *pmd, pte_t *pte)
+-{
+- paravirt_alloc_pt(mm, __pa(pte) >> PAGE_SHIFT);
+- set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE));
+-}
+-
+-static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte)
+-{
+- unsigned long pfn = page_to_pfn(pte);
+-
+- paravirt_alloc_pt(mm, pfn);
+- if (PagePinned(virt_to_page(mm->pgd))) {
+- if (!PageHighMem(pte))
+- BUG_ON(HYPERVISOR_update_va_mapping(
+- (unsigned long)__va(pfn << PAGE_SHIFT),
+- pfn_pte(pfn, PAGE_KERNEL_RO), 0));
+- else if (!test_and_set_bit(PG_pinned, &pte->flags))
+- kmap_flush_unused();
+- set_pmd(pmd, __pmd(((pmdval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE));
+- } else
+- *pmd = __pmd(((pmdval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE);
+-}
+-#define pmd_pgtable(pmd) pmd_page(pmd)
+-
+-/*
+- * Allocate and free page tables.
+- */
+-extern void pgd_test_and_unpin(pgd_t *);
+-extern pgd_t *pgd_alloc(struct mm_struct *);
+-extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
+-
+-extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long);
+-extern pgtable_t pte_alloc_one(struct mm_struct *, unsigned long);
+-
+-static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
+-{
+- make_lowmem_page_writable(pte, XENFEAT_writable_page_tables);
+- free_page((unsigned long)pte);
+-}
+-
+-extern void __pte_free(pgtable_t);
+-static inline void pte_free(struct mm_struct *mm, pgtable_t pte)
+-{
+- __pte_free(pte);
+-}
+-
+-
+-extern void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte);
+-
+-#ifdef CONFIG_X86_PAE
+-/*
+- * In the PAE case we free the pmds as part of the pgd.
+- */
+-extern pmd_t *pmd_alloc_one(struct mm_struct *, unsigned long);
+-
+-extern void __pmd_free(pgtable_t);
+-static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
+-{
+- BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
+- __pmd_free(virt_to_page(pmd));
+-}
+-
+-extern void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd);
+-
+-static inline void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
+-{
+- struct page *page = virt_to_page(pmd);
+- unsigned long pfn = page_to_pfn(page);
+-
+- paravirt_alloc_pd(mm, pfn);
+-
+- /* Note: almost everything apart from _PAGE_PRESENT is
+- reserved at the pmd (PDPT) level. */
+- if (PagePinned(virt_to_page(mm->pgd))) {
+- BUG_ON(PageHighMem(page));
+- BUG_ON(HYPERVISOR_update_va_mapping(
+- (unsigned long)__va(pfn << PAGE_SHIFT),
+- pfn_pte(pfn, PAGE_KERNEL_RO), 0));
+- set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
+- } else
+- *pudp = __pud(__pa(pmd) | _PAGE_PRESENT);
+-
+- /*
+- * According to Intel App note "TLBs, Paging-Structure Caches,
+- * and Their Invalidation", April 2007, document 317080-001,
+- * section 8.1: in PAE mode we explicitly have to flush the
+- * TLB via cr3 if the top-level pgd is changed...
+- */
+- if (mm == current->active_mm)
+- xen_tlb_flush();
+-}
+-#endif /* CONFIG_X86_PAE */
+-
+-#endif /* _I386_PGALLOC_H */
+--- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/pgalloc_64.h 2009-03-16 16:33:40.000000000 +0100
++++ /dev/null 1970-01-01 00:00:00.000000000 +0000
+@@ -1,179 +0,0 @@
+-#ifndef _X86_64_PGALLOC_H
+-#define _X86_64_PGALLOC_H
+-
+-#include <asm/pda.h>
+-#include <linux/threads.h>
+-#include <linux/mm.h>
+-#include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */
+-
+-pmd_t *early_get_pmd(unsigned long va);
+-void early_make_page_readonly(void *va, unsigned int feature);
+-
+-#define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD)
+-
+-#define pmd_populate_kernel(mm, pmd, pte) \
+- set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte)))
+-
+-static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
+-{
+- if (unlikely(PagePinned(virt_to_page((mm)->pgd)))) {
+- BUG_ON(HYPERVISOR_update_va_mapping(
+- (unsigned long)pmd,
+- pfn_pte(virt_to_phys(pmd)>>PAGE_SHIFT,
+- PAGE_KERNEL_RO), 0));
+- set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd)));
+- } else {
+- *(pud) = __pud(_PAGE_TABLE | __pa(pmd));
+- }
+-}
+-
+-/*
+- * We need to use the batch mode here, but pgd_pupulate() won't be
+- * be called frequently.
+- */
+-static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
+-{
+- if (unlikely(PagePinned(virt_to_page((mm)->pgd)))) {
+- BUG_ON(HYPERVISOR_update_va_mapping(
+- (unsigned long)pud,
+- pfn_pte(virt_to_phys(pud)>>PAGE_SHIFT,
+- PAGE_KERNEL_RO), 0));
+- set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud)));
+- set_pgd(__user_pgd(pgd), __pgd(_PAGE_TABLE | __pa(pud)));
+- } else {
+- *(pgd) = __pgd(_PAGE_TABLE | __pa(pud));
+- *(__user_pgd(pgd)) = *(pgd);
+- }
+-}
+-
+-#define pmd_pgtable(pmd) pmd_page(pmd)
+-
+-static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte)
+-{
+- if (unlikely(PagePinned(virt_to_page((mm)->pgd)))) {
+- BUG_ON(HYPERVISOR_update_va_mapping(
+- (unsigned long)__va(page_to_pfn(pte) << PAGE_SHIFT),
+- pfn_pte(page_to_pfn(pte), PAGE_KERNEL_RO), 0));
+- set_pmd(pmd, __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT)));
+- } else {
+- *(pmd) = __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT));
+- }
+-}
+-
+-extern void __pmd_free(pgtable_t);
+-static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
+-{
+- BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
+- __pmd_free(virt_to_page(pmd));
+-}
+-
+-extern pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr);
+-
+-static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
+-{
+- return (pud_t *)pmd_alloc_one(mm, addr);
+-}
+-
+-static inline void pud_free(struct mm_struct *mm, pud_t *pud)
+-{
+- BUG_ON((unsigned long)pud & (PAGE_SIZE-1));
+- __pmd_free(virt_to_page(pud));
+-}
+-
+-static inline void pgd_list_add(pgd_t *pgd)
+-{
+- struct page *page = virt_to_page(pgd);
+- unsigned long flags;
+-
+- spin_lock_irqsave(&pgd_lock, flags);
+- list_add(&page->lru, &pgd_list);
+- spin_unlock_irqrestore(&pgd_lock, flags);
+-}
+-
+-static inline void pgd_list_del(pgd_t *pgd)
+-{
+- struct page *page = virt_to_page(pgd);
+- unsigned long flags;
+-
+- spin_lock_irqsave(&pgd_lock, flags);
+- list_del(&page->lru);
+- spin_unlock_irqrestore(&pgd_lock, flags);
+-}
+-
+-extern void pgd_test_and_unpin(pgd_t *);
+-
+-static inline pgd_t *pgd_alloc(struct mm_struct *mm)
+-{
+- /*
+- * We allocate two contiguous pages for kernel and user.
+- */
+- unsigned boundary;
+- pgd_t *pgd = (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_REPEAT, 1);
+- if (!pgd)
+- return NULL;
+- pgd_list_add(pgd);
+- pgd_test_and_unpin(pgd);
+- /*
+- * Copy kernel pointers in from init.
+- * Could keep a freelist or slab cache of those because the kernel
+- * part never changes.
+- */
+- boundary = pgd_index(__PAGE_OFFSET);
+- memset(pgd, 0, boundary * sizeof(pgd_t));
+- memcpy(pgd + boundary,
+- init_level4_pgt + boundary,
+- (PTRS_PER_PGD - boundary) * sizeof(pgd_t));
+-
+- memset(__user_pgd(pgd), 0, PAGE_SIZE); /* clean up user pgd */
+- /*
+- * Set level3_user_pgt for vsyscall area
+- */
+- __user_pgd(pgd)[pgd_index(VSYSCALL_START)] =
+- __pgd(__pa_symbol(level3_user_pgt) | _PAGE_TABLE);
+- return pgd;
+-}
+-
+-static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
+-{
+- pgd_test_and_unpin(pgd);
+- pgd_list_del(pgd);
+- free_pages((unsigned long)pgd, 1);
+-}
+-
+-static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
+-{
+- pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
+- if (pte)
+- make_page_readonly(pte, XENFEAT_writable_page_tables);
+-
+- return pte;
+-}
+-
+-extern pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long addr);
+-
+-/* Should really implement gc for free page table pages. This could be
+- done with a reference count in struct page. */
+-
+-static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
+-{
+- BUG_ON((unsigned long)pte & (PAGE_SIZE-1));
+- make_page_writable(pte, XENFEAT_writable_page_tables);
+- free_page((unsigned long)pte);
+-}
+-
+-extern void __pte_free(pgtable_t);
+-static inline void pte_free(struct mm_struct *mm, pgtable_t pte)
+-{
+- __pte_free(pte);
+-}
+-
+-#define __pte_free_tlb(tlb,pte) \
+-do { \
+- pgtable_page_dtor((pte)); \
+- tlb_remove_page((tlb), (pte)); \
+-} while (0)
+-
+-#define __pmd_free_tlb(tlb,x) tlb_remove_page((tlb),virt_to_page(x))
+-#define __pud_free_tlb(tlb,x) tlb_remove_page((tlb),virt_to_page(x))
+-
+-#endif /* _X86_64_PGALLOC_H */
+--- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/pgtable.h 2009-03-16 16:33:40.000000000 +0100
++++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/pgtable.h 2009-03-16 16:38:05.000000000 +0100
+@@ -1,17 +1,15 @@
+ #ifndef _ASM_X86_PGTABLE_H
+ #define _ASM_X86_PGTABLE_H
- extern pteval_t __supported_pte_mask;
-
-@@ -202,15 +311,33 @@ static inline pte_t pte_modify(pte_t pte
- pteval_t val = pte_val(pte);
-
- val &= _PAGE_CHG_MASK;
-- val |= pgprot_val(newprot) & __supported_pte_mask;
-+ val |= pgprot_val(newprot) & (~_PAGE_CHG_MASK) & __supported_pte_mask;
+-#define USER_PTRS_PER_PGD ((TASK_SIZE-1)/PGDIR_SIZE+1)
+ #define FIRST_USER_ADDRESS 0
- return __pte(val);
- }
+-#define _PAGE_BIT_PRESENT 0
+-#define _PAGE_BIT_RW 1
+-#define _PAGE_BIT_USER 2
+-#define _PAGE_BIT_PWT 3
+-#define _PAGE_BIT_PCD 4
+-#define _PAGE_BIT_ACCESSED 5
+-#define _PAGE_BIT_DIRTY 6
+-#define _PAGE_BIT_FILE 6
++#define _PAGE_BIT_PRESENT 0 /* is present */
++#define _PAGE_BIT_RW 1 /* writeable */
++#define _PAGE_BIT_USER 2 /* userspace addressable */
++#define _PAGE_BIT_PWT 3 /* page write through */
++#define _PAGE_BIT_PCD 4 /* page cache disabled */
++#define _PAGE_BIT_ACCESSED 5 /* was accessed (raised by CPU) */
++#define _PAGE_BIT_DIRTY 6 /* was written to (raised by CPU) */
+ #define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */
+ #define _PAGE_BIT_PAT 7 /* on 4KB pages */
+ #define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
+@@ -22,6 +20,14 @@
+ #define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
+ #define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
--#define pte_pgprot(x) __pgprot(pte_val(x) & (0xfff | _PAGE_NX))
-+/* mprotect needs to preserve PAT bits when updating vm_page_prot */
-+#define pgprot_modify pgprot_modify
-+static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
-+{
-+ pgprotval_t preservebits = pgprot_val(oldprot) & _PAGE_CHG_MASK;
-+ pgprotval_t addbits = pgprot_val(newprot);
-+ return __pgprot(preservebits | addbits);
-+}
++/* If _PAGE_BIT_PRESENT is clear, we use these: */
+
-+#define pte_pgprot(x) __pgprot(__pte_val(x) & ~PTE_MASK)
-
- #define canon_pgprot(p) __pgprot(pgprot_val(p) & __supported_pte_mask)
-
-+#ifndef __ASSEMBLY__
-+#define __HAVE_PHYS_MEM_ACCESS_PROT
-+struct file;
-+pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
-+ unsigned long size, pgprot_t vma_prot);
-+int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
-+ unsigned long size, pgprot_t *vma_prot);
-+#endif
++/* set: nonlinear file mapping, saved PTE; unset:swap */
++#define _PAGE_BIT_FILE _PAGE_BIT_DIRTY
+
- #define set_pte(ptep, pte) xen_set_pte(ptep, pte)
- #define set_pte_at(mm, addr, ptep, pte) xen_set_pte_at(mm, addr, ptep, pte)
-
-@@ -246,6 +373,9 @@ static inline pte_t pte_modify(pte_t pte
- # include "pgtable_64.h"
++/* if the user mapped it with PROT_NONE; pte_present gives true */
++#define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL
++
+ /*
+ * Note: we use _AC(1, L) instead of _AC(1, UL) so that we get a
+ * sign-extended value on 32-bit with all 1's in the upper word,
+@@ -48,10 +54,8 @@
+ #define _PAGE_NX 0
#endif
-+#define KERNEL_PGD_BOUNDARY pgd_index(PAGE_OFFSET)
-+#define KERNEL_PGD_PTRS (PTRS_PER_PGD - KERNEL_PGD_BOUNDARY)
-+
- #ifndef __ASSEMBLY__
+-/* If _PAGE_PRESENT is clear, we use these: */
+-#define _PAGE_FILE _PAGE_DIRTY /* nonlinear file mapping, saved PTE; unset:swap */
+-#define _PAGE_PROTNONE _PAGE_PSE /* if the user mapped it with PROT_NONE;
+- pte_present gives true */
++#define _PAGE_FILE (_AC(1, L)<<_PAGE_BIT_FILE)
++#define _PAGE_PROTNONE (_AC(1, L)<<_PAGE_BIT_PROTNONE)
- enum {
-@@ -312,46 +442,17 @@ static inline void xen_pte_clear(struct
- * bit at the same time.
- */
- #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
--#define ptep_set_access_flags(vma, address, ptep, entry, dirty) \
--({ \
-- int __changed = !pte_same(*(ptep), entry); \
-- if (__changed && (dirty)) { \
-- if ( likely((vma)->vm_mm == current->mm) ) { \
-- BUG_ON(HYPERVISOR_update_va_mapping(address, \
-- entry, \
-- (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
-- UVMF_INVLPG|UVMF_MULTI)); \
-- } else { \
-- xen_l1_entry_update(ptep, entry); \
-- flush_tlb_page(vma, address); \
-- } \
-- } \
-- __changed; \
--})
-+extern int ptep_set_access_flags(struct vm_area_struct *vma,
-+ unsigned long address, pte_t *ptep,
-+ pte_t entry, int dirty);
+ #ifndef __ASSEMBLY__
+ #if defined(CONFIG_X86_64) && CONFIG_XEN_COMPAT <= 0x030002
+@@ -61,20 +65,42 @@ extern unsigned int __kernel_page_user;
+ #endif
+ #endif
- #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
--#define ptep_test_and_clear_young(vma, addr, ptep) ({ \
-- int __ret = 0; \
-- if (pte_young(*(ptep))) \
-- __ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, \
-- &(ptep)->pte); \
-- if (__ret) \
-- pte_update((vma)->vm_mm, addr, ptep); \
-- __ret; \
--})
-+extern int ptep_test_and_clear_young(struct vm_area_struct *vma,
-+ unsigned long addr, pte_t *ptep);
+-#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
+-#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY | __kernel_page_user)
++#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
++ _PAGE_ACCESSED | _PAGE_DIRTY)
++#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \
++ _PAGE_DIRTY | __kernel_page_user)
++
++/* Set of bits not changed in pte_modify */
++#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_CACHE_MASK | _PAGE_IO | \
++ _PAGE_ACCESSED | _PAGE_DIRTY)
- #define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
--#define ptep_clear_flush_young(vma, address, ptep) \
--({ \
-- pte_t __pte = *(ptep); \
-- int __young = pte_young(__pte); \
-- __pte = pte_mkold(__pte); \
-- if (PagePinned(virt_to_page((vma)->vm_mm->pgd))) \
-- (void)ptep_set_access_flags(vma, address, ptep, __pte, __young); \
-- else if (__young) \
-- (ptep)->pte_low = __pte.pte_low; \
-- __young; \
--})
-+extern int ptep_clear_flush_young(struct vm_area_struct *vma,
-+ unsigned long address, pte_t *ptep);
+-#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_IO)
++/*
++ * PAT settings are part of the hypervisor interface, which sets the
++ * MSR to 0x050100070406 (i.e. WB, WT, UC-, UC, WC, WP [, UC, UC]).
++ */
++#define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT | _PAGE_PAT)
++#define _PAGE_CACHE_WB (0)
++#define _PAGE_CACHE_WT (_PAGE_PWT)
++#define _PAGE_CACHE_WC (_PAGE_PAT)
++#define _PAGE_CACHE_WP (_PAGE_PAT | _PAGE_PWT)
++#define _PAGE_CACHE_UC_MINUS (_PAGE_PCD)
++#define _PAGE_CACHE_UC (_PAGE_PCD | _PAGE_PWT)
- #define __HAVE_ARCH_PTEP_CLEAR_FLUSH
- #define ptep_clear_flush(vma, addr, ptep) \
-@@ -370,7 +471,8 @@ static inline void xen_pte_clear(struct
- })
+ #define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
+-#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
++#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
++ _PAGE_ACCESSED | _PAGE_NX)
- #define __HAVE_ARCH_PTEP_GET_AND_CLEAR
--static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
-+static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
-+ pte_t *ptep)
- {
- pte_t pte = *ptep;
- if (!pte_none(pte)
-@@ -398,13 +500,29 @@ static inline pte_t ptep_get_and_clear(s
- pte_t xen_ptep_get_and_clear_full(struct vm_area_struct *, unsigned long, pte_t *, int);
+-#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
+-#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
+-#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
++#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | \
++ _PAGE_USER | _PAGE_ACCESSED)
++#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
++ _PAGE_ACCESSED | _PAGE_NX)
++#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
++ _PAGE_ACCESSED)
+ #define PAGE_COPY PAGE_COPY_NOEXEC
+-#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
+-#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
++#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | \
++ _PAGE_ACCESSED | _PAGE_NX)
++#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
++ _PAGE_ACCESSED)
- #define __HAVE_ARCH_PTEP_SET_WRPROTECT
--static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
-+static inline void ptep_set_wrprotect(struct mm_struct *mm,
-+ unsigned long addr, pte_t *ptep)
- {
- pte_t pte = *ptep;
- if (pte_write(pte))
- set_pte_at(mm, addr, ptep, pte_wrprotect(pte));
- }
+ #ifdef CONFIG_X86_32
+ #define _PAGE_KERNEL_EXEC \
+@@ -93,6 +119,7 @@ extern pteval_t __PAGE_KERNEL, __PAGE_KE
+ #define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW)
+ #define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW)
+ #define __PAGE_KERNEL_EXEC_NOCACHE (__PAGE_KERNEL_EXEC | _PAGE_PCD | _PAGE_PWT)
++#define __PAGE_KERNEL_WC (__PAGE_KERNEL | _PAGE_CACHE_WC)
+ #define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD | _PAGE_PWT)
+ #define __PAGE_KERNEL_UC_MINUS (__PAGE_KERNEL | _PAGE_PCD)
+ #define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER)
+@@ -109,6 +136,7 @@ extern pteval_t __PAGE_KERNEL, __PAGE_KE
+ #define PAGE_KERNEL_RO MAKE_GLOBAL(__PAGE_KERNEL_RO)
+ #define PAGE_KERNEL_EXEC MAKE_GLOBAL(__PAGE_KERNEL_EXEC)
+ #define PAGE_KERNEL_RX MAKE_GLOBAL(__PAGE_KERNEL_RX)
++#define PAGE_KERNEL_WC MAKE_GLOBAL(__PAGE_KERNEL_WC)
+ #define PAGE_KERNEL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_NOCACHE)
+ #define PAGE_KERNEL_UC_MINUS MAKE_GLOBAL(__PAGE_KERNEL_UC_MINUS)
+ #define PAGE_KERNEL_EXEC_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_EXEC_NOCACHE)
+@@ -142,7 +170,7 @@ extern pteval_t __PAGE_KERNEL, __PAGE_KE
+ * ZERO_PAGE is a global shared page that is always zero: used
+ * for zero-mapped memory areas etc..
+ */
+-extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)];
++extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)];
+ #define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
-+/*
-+ * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
-+ *
-+ * dst - pointer to pgd range anwhere on a pgd page
-+ * src - ""
-+ * count - the number of pgds to copy.
-+ *
-+ * dst and src can be on the same page, but the range must not overlap,
-+ * and must not cross a page boundary.
-+ */
-+static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
+ extern spinlock_t pgd_lock;
+@@ -152,30 +180,111 @@ extern struct list_head pgd_list;
+ * The following only work if pte_present() is true.
+ * Undefined behaviour if not..
+ */
+-static inline int pte_dirty(pte_t pte) { return __pte_val(pte) & _PAGE_DIRTY; }
+-static inline int pte_young(pte_t pte) { return __pte_val(pte) & _PAGE_ACCESSED; }
+-static inline int pte_write(pte_t pte) { return __pte_val(pte) & _PAGE_RW; }
+-static inline int pte_file(pte_t pte) { return __pte_val(pte) & _PAGE_FILE; }
+-static inline int pte_huge(pte_t pte) { return __pte_val(pte) & _PAGE_PSE; }
+-static inline int pte_global(pte_t pte) { return 0; }
+-static inline int pte_exec(pte_t pte) { return !(__pte_val(pte) & _PAGE_NX); }
+-
+-static inline int pmd_large(pmd_t pte) {
+- return (__pmd_val(pte) & (_PAGE_PSE|_PAGE_PRESENT)) ==
+- (_PAGE_PSE|_PAGE_PRESENT);
+-}
+-
+-static inline pte_t pte_mkclean(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_DIRTY); }
+-static inline pte_t pte_mkold(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_ACCESSED); }
+-static inline pte_t pte_wrprotect(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_RW); }
+-static inline pte_t pte_mkexec(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_NX); }
+-static inline pte_t pte_mkdirty(pte_t pte) { return __pte_ma(__pte_val(pte) | _PAGE_DIRTY); }
+-static inline pte_t pte_mkyoung(pte_t pte) { return __pte_ma(__pte_val(pte) | _PAGE_ACCESSED); }
+-static inline pte_t pte_mkwrite(pte_t pte) { return __pte_ma(__pte_val(pte) | _PAGE_RW); }
+-static inline pte_t pte_mkhuge(pte_t pte) { return __pte_ma(__pte_val(pte) | _PAGE_PSE); }
+-static inline pte_t pte_clrhuge(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_PSE); }
+-static inline pte_t pte_mkglobal(pte_t pte) { return pte; }
+-static inline pte_t pte_clrglobal(pte_t pte) { return pte; }
++static inline int pte_dirty(pte_t pte)
+{
-+ memcpy(dst, src, count * sizeof(pgd_t));
++ return __pte_val(pte) & _PAGE_DIRTY;
+}
+
- #define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) \
- xen_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable)
-
---- a/include/asm-x86/mach-xen/asm/processor.h
-+++ b/include/asm-x86/mach-xen/asm/processor.h
-@@ -3,10 +3,6 @@
-
- #include <asm/processor-flags.h>
-
--/* migration helpers, for KVM - will be removed in 2.6.25: */
--#include <asm/vm86.h>
--#define Xgt_desc_struct desc_ptr
--
- /* Forward declaration, a strange C thing */
- struct task_struct;
- struct mm_struct;
-@@ -24,6 +20,7 @@ struct mm_struct;
- #include <asm/msr.h>
- #include <asm/desc_defs.h>
- #include <asm/nops.h>
++static inline int pte_young(pte_t pte)
++{
++ return __pte_val(pte) & _PAGE_ACCESSED;
++}
+
- #include <linux/personality.h>
- #include <linux/cpumask.h>
- #include <linux/cache.h>
-@@ -38,16 +35,18 @@ struct mm_struct;
- static inline void *current_text_addr(void)
- {
- void *pc;
-- asm volatile("mov $1f,%0\n1:":"=r" (pc));
++static inline int pte_write(pte_t pte)
++{
++ return __pte_val(pte) & _PAGE_RW;
++}
+
-+ asm volatile("mov $1f, %0; 1:":"=r" (pc));
++static inline int pte_file(pte_t pte)
++{
++ return __pte_val(pte) & _PAGE_FILE;
++}
+
- return pc;
- }
-
- #ifdef CONFIG_X86_VSMP
--#define ARCH_MIN_TASKALIGN (1 << INTERNODE_CACHE_SHIFT)
--#define ARCH_MIN_MMSTRUCT_ALIGN (1 << INTERNODE_CACHE_SHIFT)
-+# define ARCH_MIN_TASKALIGN (1 << INTERNODE_CACHE_SHIFT)
-+# define ARCH_MIN_MMSTRUCT_ALIGN (1 << INTERNODE_CACHE_SHIFT)
- #else
--#define ARCH_MIN_TASKALIGN 16
--#define ARCH_MIN_MMSTRUCT_ALIGN 0
-+# define ARCH_MIN_TASKALIGN 16
-+# define ARCH_MIN_MMSTRUCT_ALIGN 0
- #endif
-
- /*
-@@ -57,68 +56,80 @@ static inline void *current_text_addr(vo
- */
-
- struct cpuinfo_x86 {
-- __u8 x86; /* CPU family */
-- __u8 x86_vendor; /* CPU vendor */
-- __u8 x86_model;
-- __u8 x86_mask;
-+ __u8 x86; /* CPU family */
-+ __u8 x86_vendor; /* CPU vendor */
-+ __u8 x86_model;
-+ __u8 x86_mask;
- #ifdef CONFIG_X86_32
-- char wp_works_ok; /* It doesn't on 386's */
-- char hlt_works_ok; /* Problems on some 486Dx4's and old 386's */
-- char hard_math;
-- char rfu;
-- char fdiv_bug;
-- char f00f_bug;
-- char coma_bug;
-- char pad0;
-+ char wp_works_ok; /* It doesn't on 386's */
++static inline int pte_huge(pte_t pte)
++{
++ return __pte_val(pte) & _PAGE_PSE;
++}
+
-+ /* Problems on some 486Dx4's and old 386's: */
-+ char hlt_works_ok;
-+ char hard_math;
-+ char rfu;
-+ char fdiv_bug;
-+ char f00f_bug;
-+ char coma_bug;
-+ char pad0;
- #else
-- /* number of 4K pages in DTLB/ITLB combined(in pages)*/
-- int x86_tlbsize;
-- __u8 x86_virt_bits, x86_phys_bits;
-- /* cpuid returned core id bits */
-- __u8 x86_coreid_bits;
-- /* Max extended CPUID function supported */
-- __u32 extended_cpuid_level;
--#endif
-- int cpuid_level; /* Maximum supported CPUID level, -1=no CPUID */
-- __u32 x86_capability[NCAPINTS];
-- char x86_vendor_id[16];
-- char x86_model_id[64];
-- int x86_cache_size; /* in KB - valid for CPUS which support this
-- call */
-- int x86_cache_alignment; /* In bytes */
-- int x86_power;
-- unsigned long loops_per_jiffy;
-+ /* Number of 4K pages in DTLB/ITLB combined(in pages): */
-+ int x86_tlbsize;
-+ __u8 x86_virt_bits;
-+ __u8 x86_phys_bits;
-+ /* CPUID returned core id bits: */
-+ __u8 x86_coreid_bits;
-+ /* Max extended CPUID function supported: */
-+ __u32 extended_cpuid_level;
-+#endif
-+ /* Maximum supported CPUID level, -1=no CPUID: */
-+ int cpuid_level;
-+ __u32 x86_capability[NCAPINTS];
-+ char x86_vendor_id[16];
-+ char x86_model_id[64];
-+ /* in KB - valid for CPUS which support this call: */
-+ int x86_cache_size;
-+ int x86_cache_alignment; /* In bytes */
-+ int x86_power;
-+ unsigned long loops_per_jiffy;
- #ifdef CONFIG_SMP
-- cpumask_t llc_shared_map; /* cpus sharing the last level cache */
-+ /* cpus sharing the last level cache: */
-+ cpumask_t llc_shared_map;
- #endif
-- u16 x86_max_cores; /* cpuid returned max cores value */
-- u16 apicid;
-- u16 x86_clflush_size;
-+ /* cpuid returned max cores value: */
-+ u16 x86_max_cores;
-+ u16 apicid;
-+ u16 initial_apicid;
-+ u16 x86_clflush_size;
- #ifdef CONFIG_SMP
-- u16 booted_cores; /* number of cores as seen by OS */
-- u16 phys_proc_id; /* Physical processor id. */
-- u16 cpu_core_id; /* Core id */
-- u16 cpu_index; /* index into per_cpu list */
-+ /* number of cores as seen by the OS: */
-+ u16 booted_cores;
-+ /* Physical processor id: */
-+ u16 phys_proc_id;
-+ /* Core id: */
-+ u16 cpu_core_id;
-+ /* Index into per_cpu list: */
-+ u16 cpu_index;
- #endif
- } __attribute__((__aligned__(SMP_CACHE_BYTES)));
-
--#define X86_VENDOR_INTEL 0
--#define X86_VENDOR_CYRIX 1
--#define X86_VENDOR_AMD 2
--#define X86_VENDOR_UMC 3
--#define X86_VENDOR_NEXGEN 4
--#define X86_VENDOR_CENTAUR 5
--#define X86_VENDOR_TRANSMETA 7
--#define X86_VENDOR_NSC 8
--#define X86_VENDOR_NUM 9
--#define X86_VENDOR_UNKNOWN 0xff
-+#define X86_VENDOR_INTEL 0
-+#define X86_VENDOR_CYRIX 1
-+#define X86_VENDOR_AMD 2
-+#define X86_VENDOR_UMC 3
-+#define X86_VENDOR_CENTAUR 5
-+#define X86_VENDOR_TRANSMETA 7
-+#define X86_VENDOR_NSC 8
-+#define X86_VENDOR_NUM 9
++static inline int pte_global(pte_t pte)
++{
++ return 0;
++}
++
++static inline int pte_exec(pte_t pte)
++{
++ return !(__pte_val(pte) & _PAGE_NX);
++}
++
++static inline int pte_special(pte_t pte)
++{
++ return 0;
++}
++
++static inline int pmd_large(pmd_t pte)
++{
++ return (__pmd_val(pte) & (_PAGE_PSE | _PAGE_PRESENT)) ==
++ (_PAGE_PSE | _PAGE_PRESENT);
++}
+
-+#define X86_VENDOR_UNKNOWN 0xff
-
- /*
- * capabilities of CPUs
- */
--extern struct cpuinfo_x86 boot_cpu_data;
--extern struct cpuinfo_x86 new_cpu_data;
--extern __u32 cleared_cpu_caps[NCAPINTS];
-+extern struct cpuinfo_x86 boot_cpu_data;
-+extern struct cpuinfo_x86 new_cpu_data;
++static inline pte_t pte_mkclean(pte_t pte)
++{
++ return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_DIRTY);
++}
+
-+extern __u32 cleared_cpu_caps[NCAPINTS];
-
- #ifdef CONFIG_SMP
- DECLARE_PER_CPU(struct cpuinfo_x86, cpu_info);
-@@ -129,7 +140,18 @@ DECLARE_PER_CPU(struct cpuinfo_x86, cpu_
- #define current_cpu_data boot_cpu_data
- #endif
-
--void cpu_detect(struct cpuinfo_x86 *c);
-+static inline int hlt_works(int cpu)
++static inline pte_t pte_mkold(pte_t pte)
+{
-+#ifdef CONFIG_X86_32
-+ return cpu_data(cpu).hlt_works_ok;
-+#else
-+ return 1;
-+#endif
++ return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_ACCESSED);
+}
+
-+#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
++static inline pte_t pte_wrprotect(pte_t pte)
++{
++ return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_RW);
++}
+
-+extern void cpu_detect(struct cpuinfo_x86 *c);
++static inline pte_t pte_mkexec(pte_t pte)
++{
++ return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_NX);
++}
++
++static inline pte_t pte_mkdirty(pte_t pte)
++{
++ return __pte_ma(__pte_val(pte) | _PAGE_DIRTY);
++}
++
++static inline pte_t pte_mkyoung(pte_t pte)
++{
++ return __pte_ma(__pte_val(pte) | _PAGE_ACCESSED);
++}
++
++static inline pte_t pte_mkwrite(pte_t pte)
++{
++ return __pte_ma(__pte_val(pte) | _PAGE_RW);
++}
++
++static inline pte_t pte_mkhuge(pte_t pte)
++{
++ return __pte_ma(__pte_val(pte) | _PAGE_PSE);
++}
++
++static inline pte_t pte_clrhuge(pte_t pte)
++{
++ return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_PSE);
++}
++
++static inline pte_t pte_mkglobal(pte_t pte)
++{
++ return pte;
++}
++
++static inline pte_t pte_clrglobal(pte_t pte)
++{
++ return pte;
++}
++
++static inline pte_t pte_mkspecial(pte_t pte)
++{
++ return pte;
++}
- extern void identify_cpu(struct cpuinfo_x86 *);
- extern void identify_boot_cpu(void);
-@@ -149,12 +171,12 @@ static inline void xen_cpuid(unsigned in
- unsigned int *ecx, unsigned int *edx)
- {
- /* ecx is often an input as well as an output. */
-- __asm__(XEN_CPUID
-- : "=a" (*eax),
-- "=b" (*ebx),
-- "=c" (*ecx),
-- "=d" (*edx)
-- : "0" (*eax), "2" (*ecx));
-+ asm(XEN_CPUID
-+ : "=a" (*eax),
-+ "=b" (*ebx),
-+ "=c" (*ecx),
-+ "=d" (*edx)
-+ : "0" (*eax), "2" (*ecx));
+ extern pteval_t __supported_pte_mask;
+
+@@ -202,15 +311,33 @@ static inline pte_t pte_modify(pte_t pte
+ pteval_t val = pte_val(pte);
+
+ val &= _PAGE_CHG_MASK;
+- val |= pgprot_val(newprot) & __supported_pte_mask;
++ val |= pgprot_val(newprot) & (~_PAGE_CHG_MASK) & __supported_pte_mask;
+
+ return __pte(val);
}
- static inline void load_cr3(pgd_t *pgdir)
-@@ -166,57 +188,70 @@ static inline void load_cr3(pgd_t *pgdir
- #ifdef CONFIG_X86_32
- /* This is the TSS defined by the hardware. */
- struct x86_hw_tss {
-- unsigned short back_link, __blh;
-- unsigned long sp0;
-- unsigned short ss0, __ss0h;
-- unsigned long sp1;
-- unsigned short ss1, __ss1h; /* ss1 caches MSR_IA32_SYSENTER_CS */
-- unsigned long sp2;
-- unsigned short ss2, __ss2h;
-- unsigned long __cr3;
-- unsigned long ip;
-- unsigned long flags;
-- unsigned long ax, cx, dx, bx;
-- unsigned long sp, bp, si, di;
-- unsigned short es, __esh;
-- unsigned short cs, __csh;
-- unsigned short ss, __ssh;
-- unsigned short ds, __dsh;
-- unsigned short fs, __fsh;
-- unsigned short gs, __gsh;
-- unsigned short ldt, __ldth;
-- unsigned short trace, io_bitmap_base;
-+ unsigned short back_link, __blh;
-+ unsigned long sp0;
-+ unsigned short ss0, __ss0h;
-+ unsigned long sp1;
-+ /* ss1 caches MSR_IA32_SYSENTER_CS: */
-+ unsigned short ss1, __ss1h;
-+ unsigned long sp2;
-+ unsigned short ss2, __ss2h;
-+ unsigned long __cr3;
-+ unsigned long ip;
-+ unsigned long flags;
-+ unsigned long ax;
-+ unsigned long cx;
-+ unsigned long dx;
-+ unsigned long bx;
-+ unsigned long sp;
-+ unsigned long bp;
-+ unsigned long si;
-+ unsigned long di;
-+ unsigned short es, __esh;
-+ unsigned short cs, __csh;
-+ unsigned short ss, __ssh;
-+ unsigned short ds, __dsh;
-+ unsigned short fs, __fsh;
-+ unsigned short gs, __gsh;
-+ unsigned short ldt, __ldth;
-+ unsigned short trace;
-+ unsigned short io_bitmap_base;
+-#define pte_pgprot(x) __pgprot(pte_val(x) & (0xfff | _PAGE_NX))
++/* mprotect needs to preserve PAT bits when updating vm_page_prot */
++#define pgprot_modify pgprot_modify
++static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
++{
++ pgprotval_t preservebits = pgprot_val(oldprot) & _PAGE_CHG_MASK;
++ pgprotval_t addbits = pgprot_val(newprot);
++ return __pgprot(preservebits | addbits);
++}
+
- } __attribute__((packed));
- extern struct tss_struct doublefault_tss;
- #else
- struct x86_hw_tss {
-- u32 reserved1;
-- u64 sp0;
-- u64 sp1;
-- u64 sp2;
-- u64 reserved2;
-- u64 ist[7];
-- u32 reserved3;
-- u32 reserved4;
-- u16 reserved5;
-- u16 io_bitmap_base;
-+ u32 reserved1;
-+ u64 sp0;
-+ u64 sp1;
-+ u64 sp2;
-+ u64 reserved2;
-+ u64 ist[7];
-+ u32 reserved3;
-+ u32 reserved4;
-+ u16 reserved5;
-+ u16 io_bitmap_base;
++#define pte_pgprot(x) __pgprot(__pte_val(x) & ~PTE_MASK)
+
+ #define canon_pgprot(p) __pgprot(pgprot_val(p) & __supported_pte_mask)
+
++#ifndef __ASSEMBLY__
++#define __HAVE_PHYS_MEM_ACCESS_PROT
++struct file;
++pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
++ unsigned long size, pgprot_t vma_prot);
++int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
++ unsigned long size, pgprot_t *vma_prot);
++#endif
+
- } __attribute__((packed)) ____cacheline_aligned;
+ #define set_pte(ptep, pte) xen_set_pte(ptep, pte)
+ #define set_pte_at(mm, addr, ptep, pte) xen_set_pte_at(mm, addr, ptep, pte)
+
+@@ -246,6 +373,9 @@ static inline pte_t pte_modify(pte_t pte
+ # include "pgtable_64.h"
#endif
- #endif /* CONFIG_X86_NO_TSS */
- /*
-- * Size of io_bitmap.
-+ * IO-bitmap sizes:
- */
--#define IO_BITMAP_BITS 65536
--#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
--#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
--#define IO_BITMAP_OFFSET offsetof(struct tss_struct, io_bitmap)
--#define INVALID_IO_BITMAP_OFFSET 0x8000
--#define INVALID_IO_BITMAP_OFFSET_LAZY 0x9000
-+#define IO_BITMAP_BITS 65536
-+#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
-+#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
-+#define IO_BITMAP_OFFSET offsetof(struct tss_struct, io_bitmap)
-+#define INVALID_IO_BITMAP_OFFSET 0x8000
-+#define INVALID_IO_BITMAP_OFFSET_LAZY 0x9000
++#define KERNEL_PGD_BOUNDARY pgd_index(PAGE_OFFSET)
++#define KERNEL_PGD_PTRS (PTRS_PER_PGD - KERNEL_PGD_BOUNDARY)
++
+ #ifndef __ASSEMBLY__
+
+ enum {
+@@ -312,46 +442,17 @@ static inline void xen_pte_clear(struct
+ * bit at the same time.
+ */
+ #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
+-#define ptep_set_access_flags(vma, address, ptep, entry, dirty) \
+-({ \
+- int __changed = !pte_same(*(ptep), entry); \
+- if (__changed && (dirty)) { \
+- if ( likely((vma)->vm_mm == current->mm) ) { \
+- BUG_ON(HYPERVISOR_update_va_mapping(address, \
+- entry, \
+- (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
+- UVMF_INVLPG|UVMF_MULTI)); \
+- } else { \
+- xen_l1_entry_update(ptep, entry); \
+- flush_tlb_page(vma, address); \
+- } \
+- } \
+- __changed; \
+-})
++extern int ptep_set_access_flags(struct vm_area_struct *vma,
++ unsigned long address, pte_t *ptep,
++ pte_t entry, int dirty);
+
+ #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
+-#define ptep_test_and_clear_young(vma, addr, ptep) ({ \
+- int __ret = 0; \
+- if (pte_young(*(ptep))) \
+- __ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, \
+- &(ptep)->pte); \
+- if (__ret) \
+- pte_update((vma)->vm_mm, addr, ptep); \
+- __ret; \
+-})
++extern int ptep_test_and_clear_young(struct vm_area_struct *vma,
++ unsigned long addr, pte_t *ptep);
+
+ #define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
+-#define ptep_clear_flush_young(vma, address, ptep) \
+-({ \
+- pte_t __pte = *(ptep); \
+- int __young = pte_young(__pte); \
+- __pte = pte_mkold(__pte); \
+- if (PagePinned(virt_to_page((vma)->vm_mm->pgd))) \
+- (void)ptep_set_access_flags(vma, address, ptep, __pte, __young); \
+- else if (__young) \
+- (ptep)->pte_low = __pte.pte_low; \
+- __young; \
+-})
++extern int ptep_clear_flush_young(struct vm_area_struct *vma,
++ unsigned long address, pte_t *ptep);
- #ifndef CONFIG_X86_NO_TSS
- struct tss_struct {
-- struct x86_hw_tss x86_tss;
-+ /*
-+ * The hardware state:
-+ */
-+ struct x86_hw_tss x86_tss;
+ #define __HAVE_ARCH_PTEP_CLEAR_FLUSH
+ #define ptep_clear_flush(vma, addr, ptep) \
+@@ -370,7 +471,8 @@ static inline void xen_pte_clear(struct
+ })
- /*
- * The extra 1 is there because the CPU will access an
-@@ -224,136 +259,162 @@ struct tss_struct {
- * bitmap. The extra byte must be all 1 bits, and must
- * be within the limit.
- */
-- unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
-+ unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
- /*
- * Cache the current maximum and the last task that used the bitmap:
- */
-- unsigned long io_bitmap_max;
-- struct thread_struct *io_bitmap_owner;
-+ unsigned long io_bitmap_max;
-+ struct thread_struct *io_bitmap_owner;
-+
- /*
-- * pads the TSS to be cacheline-aligned (size is 0x100)
-+ * Pad the TSS to be cacheline-aligned (size is 0x100):
- */
-- unsigned long __cacheline_filler[35];
-+ unsigned long __cacheline_filler[35];
- /*
-- * .. and then another 0x100 bytes for emergency kernel stack
-+ * .. and then another 0x100 bytes for the emergency kernel stack:
- */
-- unsigned long stack[64];
-+ unsigned long stack[64];
-+
- } __attribute__((packed));
+ #define __HAVE_ARCH_PTEP_GET_AND_CLEAR
+-static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
++static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
++ pte_t *ptep)
+ {
+ pte_t pte = *ptep;
+ if (!pte_none(pte)
+@@ -398,13 +500,29 @@ static inline pte_t ptep_get_and_clear(s
+ pte_t xen_ptep_get_and_clear_full(struct vm_area_struct *, unsigned long, pte_t *, int);
- DECLARE_PER_CPU(struct tss_struct, init_tss);
+ #define __HAVE_ARCH_PTEP_SET_WRPROTECT
+-static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
++static inline void ptep_set_wrprotect(struct mm_struct *mm,
++ unsigned long addr, pte_t *ptep)
+ {
+ pte_t pte = *ptep;
+ if (pte_write(pte))
+ set_pte_at(mm, addr, ptep, pte_wrprotect(pte));
+ }
--/* Save the original ist values for checking stack pointers during debugging */
+/*
-+ * Save the original ist values for checking stack pointers during debugging
++ * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
++ *
++ * dst - pointer to pgd range anwhere on a pgd page
++ * src - ""
++ * count - the number of pgds to copy.
++ *
++ * dst and src can be on the same page, but the range must not overlap,
++ * and must not cross a page boundary.
+ */
- struct orig_ist {
-- unsigned long ist[7];
-+ unsigned long ist[7];
- };
- #endif /* CONFIG_X86_NO_TSS */
++static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
++{
++ memcpy(dst, src, count * sizeof(pgd_t));
++}
++
+ #define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) \
+ xen_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable)
- #define MXCSR_DEFAULT 0x1f80
+--- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/pgtable-3level.h 2009-03-16 16:33:40.000000000 +0100
++++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/pgtable-3level.h 2009-03-16 16:38:05.000000000 +0100
+@@ -8,25 +8,28 @@
+ * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
+ */
- struct i387_fsave_struct {
-- u32 cwd;
-- u32 swd;
-- u32 twd;
-- u32 fip;
-- u32 fcs;
-- u32 foo;
-- u32 fos;
-- u32 st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */
-- u32 status; /* software status information */
-+ u32 cwd; /* FPU Control Word */
-+ u32 swd; /* FPU Status Word */
-+ u32 twd; /* FPU Tag Word */
-+ u32 fip; /* FPU IP Offset */
-+ u32 fcs; /* FPU IP Selector */
-+ u32 foo; /* FPU Operand Pointer Offset */
-+ u32 fos; /* FPU Operand Pointer Selector */
-+
-+ /* 8*10 bytes for each FP-reg = 80 bytes: */
-+ u32 st_space[20];
-+
-+ /* Software status information [not touched by FSAVE ]: */
-+ u32 status;
- };
+-#define pte_ERROR(e) \
+- printk("%s:%d: bad pte %p(%016Lx pfn %08lx).\n", __FILE__, __LINE__, \
+- &(e), __pte_val(e), pte_pfn(e))
+-#define pmd_ERROR(e) \
+- printk("%s:%d: bad pmd %p(%016Lx pfn %08Lx).\n", __FILE__, __LINE__, \
+- &(e), __pmd_val(e), (pmd_val(e) & PTE_MASK) >> PAGE_SHIFT)
+-#define pgd_ERROR(e) \
+- printk("%s:%d: bad pgd %p(%016Lx pfn %08Lx).\n", __FILE__, __LINE__, \
+- &(e), __pgd_val(e), (pgd_val(e) & PTE_MASK) >> PAGE_SHIFT)
+-
++#define pte_ERROR(e) \
++ printk("%s:%d: bad pte %p(%016Lx pfn %08lx).\n", \
++ __FILE__, __LINE__, &(e), __pte_val(e), pte_pfn(e))
++#define pmd_ERROR(e) \
++ printk("%s:%d: bad pmd %p(%016Lx pfn %08Lx).\n", \
++ __FILE__, __LINE__, &(e), __pmd_val(e), \
++ (pmd_val(e) & PTE_MASK) >> PAGE_SHIFT)
++#define pgd_ERROR(e) \
++ printk("%s:%d: bad pgd %p(%016Lx pfn %08Lx).\n", \
++ __FILE__, __LINE__, &(e), __pgd_val(e), \
++ (pgd_val(e) & PTE_MASK) >> PAGE_SHIFT)
- struct i387_fxsave_struct {
-- u16 cwd;
-- u16 swd;
-- u16 twd;
-- u16 fop;
-+ u16 cwd; /* Control Word */
-+ u16 swd; /* Status Word */
-+ u16 twd; /* Tag Word */
-+ u16 fop; /* Last Instruction Opcode */
- union {
- struct {
-- u64 rip;
-- u64 rdp;
-+ u64 rip; /* Instruction Pointer */
-+ u64 rdp; /* Data Pointer */
- };
- struct {
-- u32 fip;
-- u32 fcs;
-- u32 foo;
-- u32 fos;
-+ u32 fip; /* FPU IP Offset */
-+ u32 fcs; /* FPU IP Selector */
-+ u32 foo; /* FPU Operand Offset */
-+ u32 fos; /* FPU Operand Selector */
- };
- };
-- u32 mxcsr;
-- u32 mxcsr_mask;
-- u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
-- u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */
-- u32 padding[24];
-+ u32 mxcsr; /* MXCSR Register State */
-+ u32 mxcsr_mask; /* MXCSR Mask */
+ static inline int pud_none(pud_t pud)
+ {
+ return __pud_val(pud) == 0;
+
-+ /* 8*16 bytes for each FP-reg = 128 bytes: */
-+ u32 st_space[32];
+ }
+ static inline int pud_bad(pud_t pud)
+ {
+ return (__pud_val(pud) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER)) != 0;
+ }
+
-+ /* 16*16 bytes for each XMM-reg = 256 bytes: */
-+ u32 xmm_space[64];
+ static inline int pud_present(pud_t pud)
+ {
+ return __pud_val(pud) & _PAGE_PRESENT;
+@@ -48,12 +51,14 @@ static inline void xen_set_pte(pte_t *pt
+
+ static inline void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
+ {
+- set_64bit((unsigned long long *)(ptep),__pte_val(pte));
++ set_64bit((unsigned long long *)(ptep), __pte_val(pte));
+ }
+
-+ u32 padding[24];
+ static inline void xen_set_pmd(pmd_t *pmdp, pmd_t pmd)
+ {
+ xen_l2_entry_update(pmdp, pmd);
+ }
+
- } __attribute__((aligned(16)));
+ static inline void xen_set_pud(pud_t *pudp, pud_t pud)
+ {
+ xen_l3_entry_update(pudp, pud);
+@@ -92,20 +97,19 @@ static inline void pud_clear(pud_t *pudp
+ * current pgd to avoid unnecessary TLB flushes.
+ */
+ pgd = read_cr3();
+- if (__pa(pudp) >= pgd && __pa(pudp) < (pgd + sizeof(pgd_t)*PTRS_PER_PGD))
++ if (__pa(pudp) >= pgd && __pa(pudp) <
++ (pgd + sizeof(pgd_t)*PTRS_PER_PGD))
+ xen_tlb_flush();
+ }
+
+-#define pud_page(pud) \
+-((struct page *) __va(pud_val(pud) & PAGE_MASK))
++#define pud_page(pud) ((struct page *) __va(pud_val(pud) & PTE_MASK))
+
+-#define pud_page_vaddr(pud) \
+-((unsigned long) __va(pud_val(pud) & PAGE_MASK))
++#define pud_page_vaddr(pud) ((unsigned long) __va(pud_val(pud) & PTE_MASK))
+
+
+ /* Find an entry in the second-level page table.. */
+-#define pmd_offset(pud, address) ((pmd_t *) pud_page(*(pud)) + \
+- pmd_index(address))
++#define pmd_offset(pud, address) ((pmd_t *)pud_page(*(pud)) + \
++ pmd_index(address))
+
+ #ifdef CONFIG_SMP
+ static inline pte_t xen_ptep_get_and_clear(pte_t *ptep, pte_t res)
+@@ -150,7 +154,8 @@ static inline int pte_none(pte_t pte)
+ * put the 32 bits of offset into the high part.
+ */
+ #define pte_to_pgoff(pte) ((pte).pte_high)
+-#define pgoff_to_pte(off) ((pte_t) { { .pte_low = _PAGE_FILE, .pte_high = (off) } })
++#define pgoff_to_pte(off) \
++ ((pte_t) { { .pte_low = _PAGE_FILE, .pte_high = (off) } })
+ #define PTE_FILE_MAX_BITS 32
- struct i387_soft_struct {
-- u32 cwd;
-- u32 swd;
-- u32 twd;
-- u32 fip;
-- u32 fcs;
-- u32 foo;
-- u32 fos;
-- u32 st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */
-- u8 ftop, changed, lookahead, no_update, rm, alimit;
-- struct info *info;
-- u32 entry_eip;
-+ u32 cwd;
-+ u32 swd;
-+ u32 twd;
-+ u32 fip;
-+ u32 fcs;
-+ u32 foo;
-+ u32 fos;
-+ /* 8*10 bytes for each FP-reg = 80 bytes: */
-+ u32 st_space[20];
-+ u8 ftop;
-+ u8 changed;
-+ u8 lookahead;
-+ u8 no_update;
-+ u8 rm;
-+ u8 alimit;
-+ struct info *info;
-+ u32 entry_eip;
- };
+ /* Encode and de-code a swap entry */
+--- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/pgtable_32.h 2009-03-16 16:33:40.000000000 +0100
++++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/pgtable_32.h 2009-03-16 16:38:05.000000000 +0100
+@@ -38,16 +38,13 @@ void paging_init(void);
+ #ifdef CONFIG_X86_PAE
+ # include <asm/pgtable-3level-defs.h>
+ # define PMD_SIZE (1UL << PMD_SHIFT)
+-# define PMD_MASK (~(PMD_SIZE-1))
++# define PMD_MASK (~(PMD_SIZE - 1))
+ #else
+ # include <asm/pgtable-2level-defs.h>
+ #endif
--union i387_union {
-+union thread_xstate {
- struct i387_fsave_struct fsave;
- struct i387_fxsave_struct fxsave;
-- struct i387_soft_struct soft;
-+ struct i387_soft_struct soft;
- };
+ #define PGDIR_SIZE (1UL << PGDIR_SHIFT)
+-#define PGDIR_MASK (~(PGDIR_SIZE-1))
+-
+-#define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT)
+-#define KERNEL_PGD_PTRS (PTRS_PER_PGD-USER_PGD_PTRS)
++#define PGDIR_MASK (~(PGDIR_SIZE - 1))
--#ifdef CONFIG_X86_32
--DECLARE_PER_CPU(u8, cpu_llc_id);
--#elif !defined(CONFIG_X86_NO_TSS)
-+#if defined(CONFIG_X86_64) && !defined(CONFIG_X86_NO_TSS)
- DECLARE_PER_CPU(struct orig_ist, orig_ist);
+ /* Just any arbitrary offset to the start of the vmalloc VM area: the
+ * current 8MB value just means that there will be a 8MB "hole" after the
+@@ -56,21 +53,22 @@ void paging_init(void);
+ * The vmalloc() routines leaves a hole of 4kB between each vmalloced
+ * area for the same reason. ;)
+ */
+-#define VMALLOC_OFFSET (8*1024*1024)
+-#define VMALLOC_START (((unsigned long) high_memory + \
+- 2*VMALLOC_OFFSET-1) & ~(VMALLOC_OFFSET-1))
++#define VMALLOC_OFFSET (8 * 1024 * 1024)
++#define VMALLOC_START (((unsigned long)high_memory + 2 * VMALLOC_OFFSET - 1) \
++ & ~(VMALLOC_OFFSET - 1))
+ #ifdef CONFIG_X86_PAE
+ #define LAST_PKMAP 512
+ #else
+ #define LAST_PKMAP 1024
#endif
- extern void print_cpu_info(struct cpuinfo_x86 *);
-+extern unsigned int xstate_size;
-+extern void free_thread_xstate(struct task_struct *);
-+extern struct kmem_cache *task_xstate_cachep;
- extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
- extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
- extern unsigned short num_cache_leaves;
+-#define PKMAP_BASE ((FIXADDR_BOOT_START - PAGE_SIZE*(LAST_PKMAP + 1)) & PMD_MASK)
++#define PKMAP_BASE ((FIXADDR_BOOT_START - PAGE_SIZE * (LAST_PKMAP + 1)) \
++ & PMD_MASK)
- struct thread_struct {
--/* cached TLS descriptors. */
-- struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
-- unsigned long sp0;
-- unsigned long sp;
-+ /* Cached TLS descriptors: */
-+ struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
-+ unsigned long sp0;
-+ unsigned long sp;
- #ifdef CONFIG_X86_32
-- unsigned long sysenter_cs;
-+ unsigned long sysenter_cs;
+ #ifdef CONFIG_HIGHMEM
+-# define VMALLOC_END (PKMAP_BASE-2*PAGE_SIZE)
++# define VMALLOC_END (PKMAP_BASE - 2 * PAGE_SIZE)
#else
-- unsigned long usersp; /* Copy from PDA */
-- unsigned short es, ds, fsindex, gsindex;
--#endif
-- unsigned long ip;
-- unsigned long fs;
-- unsigned long gs;
--/* Hardware debugging registers */
-- unsigned long debugreg0;
-- unsigned long debugreg1;
-- unsigned long debugreg2;
-- unsigned long debugreg3;
-- unsigned long debugreg6;
-- unsigned long debugreg7;
--/* fault info */
-- unsigned long cr2, trap_no, error_code;
--/* floating point info */
-- union i387_union i387 __attribute__((aligned(16)));;
-+ unsigned long usersp; /* Copy from PDA */
-+ unsigned short es;
-+ unsigned short ds;
-+ unsigned short fsindex;
-+ unsigned short gsindex;
-+#endif
-+ unsigned long ip;
-+ unsigned long fs;
-+ unsigned long gs;
-+ /* Hardware debugging registers: */
-+ unsigned long debugreg0;
-+ unsigned long debugreg1;
-+ unsigned long debugreg2;
-+ unsigned long debugreg3;
-+ unsigned long debugreg6;
-+ unsigned long debugreg7;
-+ /* Fault info: */
-+ unsigned long cr2;
-+ unsigned long trap_no;
-+ unsigned long error_code;
-+ /* floating point and extended processor state */
-+ union thread_xstate *xstate;
- #ifdef CONFIG_X86_32
--/* virtual 86 mode info */
-+ /* Virtual 86 mode info */
- struct vm86_struct __user *vm86_info;
- unsigned long screen_bitmap;
- unsigned long v86flags, v86mask, saved_sp0;
- unsigned int saved_fs, saved_gs;
+-# define VMALLOC_END (FIXADDR_START-2*PAGE_SIZE)
++# define VMALLOC_END (FIXADDR_START - 2 * PAGE_SIZE)
#endif
--/* IO permissions */
-- unsigned long *io_bitmap_ptr;
-- unsigned long iopl;
--/* max allowed port in the bitmap, in bytes: */
-- unsigned io_bitmap_max;
-+ /* IO permissions: */
-+ unsigned long *io_bitmap_ptr;
-+ unsigned long iopl;
-+ /* Max allowed port in the bitmap, in bytes: */
-+ unsigned io_bitmap_max;
- /* MSR_IA32_DEBUGCTLMSR value to switch in if TIF_DEBUGCTLMSR is set. */
- unsigned long debugctlmsr;
- /* Debug Store - if not 0 points to a DS Save Area configuration;
-@@ -384,12 +445,12 @@ static inline void xen_set_iopl_mask(uns
- }
- #ifndef CONFIG_X86_NO_TSS
--static inline void native_load_sp0(struct tss_struct *tss,
-- struct thread_struct *thread)
-+static inline void
-+native_load_sp0(struct tss_struct *tss, struct thread_struct *thread)
- {
- tss->x86_tss.sp0 = thread->sp0;
- #ifdef CONFIG_X86_32
-- /* Only happens when SEP is enabled, no need to test "SEP"arately */
-+ /* Only happens when SEP is enabled, no need to test "SEP"arately: */
- if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
- tss->x86_tss.ss1 = thread->sysenter_cs;
- wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
-@@ -403,8 +464,8 @@ static inline void native_load_sp0(struc
- } while (0)
+ /*
+@@ -91,10 +89,10 @@ extern unsigned long pg0[];
+ /* pmd_present doesn't just test the _PAGE_PRESENT bit since wr.p.t.
+ can temporarily clear it. */
+ #define pmd_present(x) (__pmd_val(x))
+-#define pmd_bad(x) ((__pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT))
++#define pmd_bad(x) ((__pmd_val(x) & (~PTE_MASK & ~_PAGE_USER & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT))
+ #else
+ #define pmd_present(x) (__pmd_val(x) & _PAGE_PRESENT)
+-#define pmd_bad(x) ((__pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
++#define pmd_bad(x) ((__pmd_val(x) & (~PTE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
+ #endif
+
+
+@@ -107,32 +105,18 @@ extern unsigned long pg0[];
#endif
--#define __cpuid xen_cpuid
--#define paravirt_enabled() 0
-+#define __cpuid xen_cpuid
-+#define paravirt_enabled() 0
+ /*
+- * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
+- *
+- * dst - pointer to pgd range anwhere on a pgd page
+- * src - ""
+- * count - the number of pgds to copy.
+- *
+- * dst and src can be on the same page, but the range must not overlap,
+- * and must not cross a page boundary.
++ * Macro to mark a page protection value as "uncacheable".
++ * On processors which do not support it, this is a no-op.
+ */
+-static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
+-{
+- memcpy(dst, src, count * sizeof(pgd_t));
+-}
+-
+-/*
+- * Macro to mark a page protection value as "uncacheable". On processors which do not support
+- * it, this is a no-op.
+- */
+-#define pgprot_noncached(prot) ((boot_cpu_data.x86 > 3) \
+- ? (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT)) : (prot))
++#define pgprot_noncached(prot) \
++ ((boot_cpu_data.x86 > 3) \
++ ? (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT)) \
++ : (prot))
+
+ /*
+ * Conversion functions: convert a page and protection to a page entry,
+ * and a page entry and page directory to the page they refer to.
+ */
+-
+ #define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
+
+ /*
+@@ -141,20 +125,20 @@ static inline void clone_pgd_range(pgd_t
+ * this macro returns the index of the entry in the pgd page which would
+ * control the given virtual address
+ */
+-#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1))
+-#define pgd_index_k(addr) pgd_index(addr)
++#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
++#define pgd_index_k(addr) pgd_index((addr))
+
+ /*
+ * pgd_offset() returns a (pgd_t *)
+ * pgd_index() is used get the offset into the pgd page's array of pgd_t's;
+ */
+-#define pgd_offset(mm, address) ((mm)->pgd+pgd_index(address))
++#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address)))
+
+ /*
+ * a shortcut which implies the use of the kernel's pgd, instead
+ * of a process's
+ */
+-#define pgd_offset_k(address) pgd_offset(&init_mm, address)
++#define pgd_offset_k(address) pgd_offset(&init_mm, (address))
+
+ static inline int pud_large(pud_t pud) { return 0; }
+
+@@ -164,8 +148,8 @@ static inline int pud_large(pud_t pud) {
+ * this macro returns the index of the entry in the pmd page which would
+ * control the given virtual address
+ */
+-#define pmd_index(address) \
+- (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1))
++#define pmd_index(address) \
++ (((address) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))
/*
- * These special macros can be used to get or set a debugging register
-@@ -424,11 +485,12 @@ static inline void native_load_sp0(struc
- * enable), so that any CPU's that boot up
- * after us can get the correct flags.
+ * the pte page can be thought of an array like this: pte_t[PTRS_PER_PTE]
+@@ -173,33 +157,36 @@ static inline int pud_large(pud_t pud) {
+ * this macro returns the index of the entry in the pte page which would
+ * control the given virtual address
*/
--extern unsigned long mmu_cr4_features;
-+extern unsigned long mmu_cr4_features;
+-#define pte_index(address) \
+- (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
+-#define pte_offset_kernel(dir, address) \
+- ((pte_t *) pmd_page_vaddr(*(dir)) + pte_index(address))
++#define pte_index(address) \
++ (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
++#define pte_offset_kernel(dir, address) \
++ ((pte_t *)pmd_page_vaddr(*(dir)) + pte_index((address)))
- static inline void set_in_cr4(unsigned long mask)
- {
- unsigned cr4;
-+
- mmu_cr4_features |= mask;
- cr4 = read_cr4();
- cr4 |= mask;
-@@ -438,6 +500,7 @@ static inline void set_in_cr4(unsigned l
- static inline void clear_in_cr4(unsigned long mask)
- {
- unsigned cr4;
-+
- mmu_cr4_features &= ~mask;
- cr4 = read_cr4();
- cr4 &= ~mask;
-@@ -445,42 +508,42 @@ static inline void clear_in_cr4(unsigned
- }
+-#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
++#define pmd_page(pmd) (pfn_to_page(pmd_val((pmd)) >> PAGE_SHIFT))
- struct microcode_header {
-- unsigned int hdrver;
-- unsigned int rev;
-- unsigned int date;
-- unsigned int sig;
-- unsigned int cksum;
-- unsigned int ldrver;
-- unsigned int pf;
-- unsigned int datasize;
-- unsigned int totalsize;
-- unsigned int reserved[3];
-+ unsigned int hdrver;
-+ unsigned int rev;
-+ unsigned int date;
-+ unsigned int sig;
-+ unsigned int cksum;
-+ unsigned int ldrver;
-+ unsigned int pf;
-+ unsigned int datasize;
-+ unsigned int totalsize;
-+ unsigned int reserved[3];
- };
+-#define pmd_page_vaddr(pmd) \
+- ((unsigned long) __va(pmd_val(pmd) & PAGE_MASK))
++#define pmd_page_vaddr(pmd) \
++ ((unsigned long)__va(pmd_val((pmd)) & PTE_MASK))
- struct microcode {
-- struct microcode_header hdr;
-- unsigned int bits[0];
-+ struct microcode_header hdr;
-+ unsigned int bits[0];
- };
+ #if defined(CONFIG_HIGHPTE)
+-#define pte_offset_map(dir, address) \
+- ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE0) + pte_index(address))
+-#define pte_offset_map_nested(dir, address) \
+- ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE1) + pte_index(address))
+-#define pte_unmap(pte) kunmap_atomic(pte, KM_PTE0)
+-#define pte_unmap_nested(pte) kunmap_atomic(pte, KM_PTE1)
+-#else
+-#define pte_offset_map(dir, address) \
+- ((pte_t *)page_address(pmd_page(*(dir))) + pte_index(address))
+-#define pte_offset_map_nested(dir, address) pte_offset_map(dir, address)
++#define pte_offset_map(dir, address) \
++ ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE0) + \
++ pte_index((address)))
++#define pte_offset_map_nested(dir, address) \
++ ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE1) + \
++ pte_index((address)))
++#define pte_unmap(pte) kunmap_atomic((pte), KM_PTE0)
++#define pte_unmap_nested(pte) kunmap_atomic((pte), KM_PTE1)
++#else
++#define pte_offset_map(dir, address) \
++ ((pte_t *)page_address(pmd_page(*(dir))) + pte_index((address)))
++#define pte_offset_map_nested(dir, address) pte_offset_map((dir), (address))
+ #define pte_unmap(pte) do { } while (0)
+ #define pte_unmap_nested(pte) do { } while (0)
+ #endif
--typedef struct microcode microcode_t;
--typedef struct microcode_header microcode_header_t;
-+typedef struct microcode microcode_t;
-+typedef struct microcode_header microcode_header_t;
+ /* Clear a kernel PTE and flush it from the TLB */
+-#define kpte_clear_flush(ptep, vaddr) do { \
++#define kpte_clear_flush(ptep, vaddr) \
++do { \
+ if (HYPERVISOR_update_va_mapping(vaddr, __pte(0), UVMF_INVLPG)) \
+ BUG(); \
+ } while (0)
+@@ -208,7 +195,7 @@ static inline int pud_large(pud_t pud) {
+ * The i386 doesn't have any external MMU info: the kernel page
+ * tables contain all the necessary information.
+ */
+-#define update_mmu_cache(vma,address,pte) do { } while (0)
++#define update_mmu_cache(vma, address, pte) do { } while (0)
- /* microcode format is extended from prescott processors */
- struct extended_signature {
-- unsigned int sig;
-- unsigned int pf;
-- unsigned int cksum;
-+ unsigned int sig;
-+ unsigned int pf;
-+ unsigned int cksum;
- };
+ void make_lowmem_page_readonly(void *va, unsigned int feature);
+ void make_lowmem_page_writable(void *va, unsigned int feature);
+@@ -225,7 +212,7 @@ void make_lowmem_page_writable(void *va,
+ #define kern_addr_valid(kaddr) (0)
+ #endif
- struct extended_sigtable {
-- unsigned int count;
-- unsigned int cksum;
-- unsigned int reserved[3];
-+ unsigned int count;
-+ unsigned int cksum;
-+ unsigned int reserved[3];
- struct extended_signature sigs[0];
- };
+-#define io_remap_pfn_range(vma,from,pfn,size,prot) \
+-direct_remap_pfn_range(vma,from,pfn,size,prot,DOMID_IO)
++#define io_remap_pfn_range(vma, from, pfn, size, prot) \
++ direct_remap_pfn_range(vma, from, pfn, size, prot, DOMID_IO)
- typedef struct {
-- unsigned long seg;
-+ unsigned long seg;
- } mm_segment_t;
+ #endif /* _I386_PGTABLE_H */
+--- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/pgtable_64.h 2009-03-16 16:33:40.000000000 +0100
++++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/pgtable_64.h 2009-03-16 16:38:05.000000000 +0100
+@@ -31,7 +31,7 @@ extern void paging_init(void);
+ #endif /* !__ASSEMBLY__ */
-@@ -492,7 +555,7 @@ extern int kernel_thread(int (*fn)(void
- /* Free all resources held by a thread. */
- extern void release_thread(struct task_struct *);
+-#define SHARED_KERNEL_PMD 1
++#define SHARED_KERNEL_PMD 0
--/* Prepare to copy thread state - unlazy all lazy status */
-+/* Prepare to copy thread state - unlazy all lazy state */
- extern void prepare_to_copy(struct task_struct *tsk);
+ /*
+ * PGDIR_SHIFT determines what a top-level page table entry can map
+@@ -59,18 +59,20 @@ extern void paging_init(void);
- unsigned long get_wchan(struct task_struct *p);
-@@ -529,118 +592,138 @@ static inline unsigned int cpuid_eax(uns
- unsigned int eax, ebx, ecx, edx;
+ #ifndef __ASSEMBLY__
- cpuid(op, &eax, &ebx, &ecx, &edx);
-+
- return eax;
- }
-+
- static inline unsigned int cpuid_ebx(unsigned int op)
- {
- unsigned int eax, ebx, ecx, edx;
+-#define pte_ERROR(e) \
+- printk("%s:%d: bad pte %p(%016lx pfn %010lx).\n", __FILE__, __LINE__, \
+- &(e), __pte_val(e), pte_pfn(e))
+-#define pmd_ERROR(e) \
+- printk("%s:%d: bad pmd %p(%016lx pfn %010lx).\n", __FILE__, __LINE__, \
+- &(e), __pmd_val(e), pmd_pfn(e))
+-#define pud_ERROR(e) \
+- printk("%s:%d: bad pud %p(%016lx pfn %010lx).\n", __FILE__, __LINE__, \
+- &(e), __pud_val(e), (pud_val(e) & __PHYSICAL_MASK) >> PAGE_SHIFT)
+-#define pgd_ERROR(e) \
+- printk("%s:%d: bad pgd %p(%016lx pfn %010lx).\n", __FILE__, __LINE__, \
+- &(e), __pgd_val(e), (pgd_val(e) & __PHYSICAL_MASK) >> PAGE_SHIFT)
++#define pte_ERROR(e) \
++ printk("%s:%d: bad pte %p(%016lx pfn %010lx).\n", \
++ __FILE__, __LINE__, &(e), __pte_val(e), pte_pfn(e))
++#define pmd_ERROR(e) \
++ printk("%s:%d: bad pmd %p(%016lx pfn %010lx).\n", \
++ __FILE__, __LINE__, &(e), __pmd_val(e), pmd_pfn(e))
++#define pud_ERROR(e) \
++ printk("%s:%d: bad pud %p(%016lx pfn %010lx).\n", \
++ __FILE__, __LINE__, &(e), __pud_val(e), \
++ (pud_val(e) & __PHYSICAL_MASK) >> PAGE_SHIFT)
++#define pgd_ERROR(e) \
++ printk("%s:%d: bad pgd %p(%016lx pfn %010lx).\n", \
++ __FILE__, __LINE__, &(e), __pgd_val(e), \
++ (pgd_val(e) & __PHYSICAL_MASK) >> PAGE_SHIFT)
- cpuid(op, &eax, &ebx, &ecx, &edx);
-+
- return ebx;
+ #define pgd_none(x) (!__pgd_val(x))
+ #define pud_none(x) (!__pud_val(x))
+@@ -125,7 +127,7 @@ static inline void xen_set_pgd(pgd_t *pg
+ xen_l4_entry_update(pgdp, pgd);
}
-+
- static inline unsigned int cpuid_ecx(unsigned int op)
- {
- unsigned int eax, ebx, ecx, edx;
- cpuid(op, &eax, &ebx, &ecx, &edx);
-+
- return ecx;
- }
-+
- static inline unsigned int cpuid_edx(unsigned int op)
+-static inline void xen_pgd_clear(pgd_t * pgd)
++static inline void xen_pgd_clear(pgd_t *pgd)
{
- unsigned int eax, ebx, ecx, edx;
+ xen_set_pgd(pgd, xen_make_pgd(0));
+ xen_set_pgd(__user_pgd(pgd), xen_make_pgd(0));
+@@ -135,43 +137,43 @@ static inline void xen_pgd_clear(pgd_t *
+
+ #endif /* !__ASSEMBLY__ */
+
+-#define PMD_SIZE (_AC(1,UL) << PMD_SHIFT)
+-#define PMD_MASK (~(PMD_SIZE-1))
+-#define PUD_SIZE (_AC(1,UL) << PUD_SHIFT)
+-#define PUD_MASK (~(PUD_SIZE-1))
+-#define PGDIR_SIZE (_AC(1,UL) << PGDIR_SHIFT)
+-#define PGDIR_MASK (~(PGDIR_SIZE-1))
++#define PMD_SIZE (_AC(1, UL) << PMD_SHIFT)
++#define PMD_MASK (~(PMD_SIZE - 1))
++#define PUD_SIZE (_AC(1, UL) << PUD_SHIFT)
++#define PUD_MASK (~(PUD_SIZE - 1))
++#define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT)
++#define PGDIR_MASK (~(PGDIR_SIZE - 1))
+
+
+-#define MAXMEM _AC(0x3fffffffffff, UL)
++#define MAXMEM _AC(0x00003fffffffffff, UL)
+ #define VMALLOC_START _AC(0xffffc20000000000, UL)
+ #define VMALLOC_END _AC(0xffffe1ffffffffff, UL)
+ #define VMEMMAP_START _AC(0xffffe20000000000, UL)
+-#define MODULES_VADDR _AC(0xffffffff88000000, UL)
++#define MODULES_VADDR _AC(0xffffffffa0000000, UL)
+ #define MODULES_END _AC(0xfffffffffff00000, UL)
+ #define MODULES_LEN (MODULES_END - MODULES_VADDR)
- cpuid(op, &eax, &ebx, &ecx, &edx);
-+
- return edx;
- }
+ #ifndef __ASSEMBLY__
- /* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
- static inline void rep_nop(void)
+-static inline unsigned long pgd_bad(pgd_t pgd)
++static inline int pgd_bad(pgd_t pgd)
{
-- __asm__ __volatile__("rep;nop": : :"memory");
-+ asm volatile("rep; nop" ::: "memory");
+- return __pgd_val(pgd) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER);
++ return (__pgd_val(pgd) & ~(PTE_MASK | _PAGE_USER)) != _KERNPG_TABLE;
}
--/* Stop speculative execution */
-+static inline void cpu_relax(void)
-+{
-+ rep_nop();
-+}
-+
-+/* Stop speculative execution: */
- static inline void sync_core(void)
+-static inline unsigned long pud_bad(pud_t pud)
++static inline int pud_bad(pud_t pud)
{
- int tmp;
-+
- asm volatile("cpuid" : "=a" (tmp) : "0" (1)
-- : "ebx", "ecx", "edx", "memory");
-+ : "ebx", "ecx", "edx", "memory");
+- return __pud_val(pud) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER);
++ return (__pud_val(pud) & ~(PTE_MASK | _PAGE_USER)) != _KERNPG_TABLE;
}
--#define cpu_relax() rep_nop()
--
- static inline void __monitor(const void *eax, unsigned long ecx,
-- unsigned long edx)
-+ unsigned long edx)
+-static inline unsigned long pmd_bad(pmd_t pmd)
++static inline int pmd_bad(pmd_t pmd)
{
-- /* "monitor %eax,%ecx,%edx;" */
-- asm volatile(
-- ".byte 0x0f,0x01,0xc8;"
-- : :"a" (eax), "c" (ecx), "d"(edx));
-+ /* "monitor %eax, %ecx, %edx;" */
-+ asm volatile(".byte 0x0f, 0x01, 0xc8;"
-+ :: "a" (eax), "c" (ecx), "d"(edx));
+- return __pmd_val(pmd) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER);
++ return (__pmd_val(pmd) & ~(PTE_MASK | _PAGE_USER)) != _KERNPG_TABLE;
}
- static inline void __mwait(unsigned long eax, unsigned long ecx)
- {
-- /* "mwait %eax,%ecx;" */
-- asm volatile(
-- ".byte 0x0f,0x01,0xc9;"
-- : :"a" (eax), "c" (ecx));
-+ /* "mwait %eax, %ecx;" */
-+ asm volatile(".byte 0x0f, 0x01, 0xc9;"
-+ :: "a" (eax), "c" (ecx));
- }
+ #define pte_none(x) (!(x).pte)
+ #define pte_present(x) ((x).pte & (_PAGE_PRESENT | _PAGE_PROTNONE))
- static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
+-#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT)) /* FIXME: is this right? */
++#define pages_to_mb(x) ((x) >> (20 - PAGE_SHIFT)) /* FIXME: is this right? */
+
+ #define __pte_mfn(_pte) (((_pte).pte & PTE_MASK) >> PAGE_SHIFT)
+ #define pte_mfn(_pte) ((_pte).pte & _PAGE_PRESENT ? \
+@@ -181,13 +183,13 @@ static inline unsigned long pmd_bad(pmd_
+ mfn_to_local_pfn(__pte_mfn(_pte)) : \
+ __pte_mfn(_pte))
+
+-#define pte_page(x) pfn_to_page(pte_pfn(x))
++#define pte_page(x) pfn_to_page(pte_pfn((x)))
+
+ /*
+ * Macro to mark a page protection value as "uncacheable".
+ */
+-#define pgprot_noncached(prot) (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT))
+-
++#define pgprot_noncached(prot) \
++ (__pgprot(pgprot_val((prot)) | _PAGE_PCD | _PAGE_PWT))
+
+ /*
+ * Conversion functions: convert a page and protection to a page entry,
+@@ -197,36 +199,39 @@ static inline unsigned long pmd_bad(pmd_
+ /*
+ * Level 4 access.
+ */
+-#define pgd_page_vaddr(pgd) ((unsigned long) __va(pgd_val(pgd) & PTE_MASK))
+-#define pgd_page(pgd) (pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT))
+-#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1))
+-#define pgd_offset(mm, addr) ((mm)->pgd + pgd_index(addr))
+-#define pgd_offset_k(address) (init_level4_pgt + pgd_index(address))
++#define pgd_page_vaddr(pgd) \
++ ((unsigned long)__va((unsigned long)pgd_val((pgd)) & PTE_MASK))
++#define pgd_page(pgd) (pfn_to_page(pgd_val((pgd)) >> PAGE_SHIFT))
++#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
++#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address)))
++#define pgd_offset_k(address) (init_level4_pgt + pgd_index((address)))
+ #define pgd_present(pgd) (__pgd_val(pgd) & _PAGE_PRESENT)
+ static inline int pgd_large(pgd_t pgd) { return 0; }
+ #define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE)
+
+ /* PUD - Level3 access */
+ /* to find an entry in a page-table-directory. */
+-#define pud_page_vaddr(pud) ((unsigned long) __va(pud_val(pud) & PHYSICAL_PAGE_MASK))
+-#define pud_page(pud) (pfn_to_page(pud_val(pud) >> PAGE_SHIFT))
+-#define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
+-#define pud_offset(pgd, address) ((pud_t *) pgd_page_vaddr(*(pgd)) + pud_index(address))
++#define pud_page_vaddr(pud) \
++ ((unsigned long)__va(pud_val((pud)) & PHYSICAL_PAGE_MASK))
++#define pud_page(pud) (pfn_to_page(pud_val((pud)) >> PAGE_SHIFT))
++#define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD - 1))
++#define pud_offset(pgd, address) \
++ ((pud_t *)pgd_page_vaddr(*(pgd)) + pud_index((address)))
+ #define pud_present(pud) (__pud_val(pud) & _PAGE_PRESENT)
+
+ static inline int pud_large(pud_t pte)
{
-- /* "mwait %eax,%ecx;" */
-- asm volatile(
-- "sti; .byte 0x0f,0x01,0xc9;"
-- : :"a" (eax), "c" (ecx));
-+ trace_hardirqs_on();
-+ /* "mwait %eax, %ecx;" */
-+ asm volatile("sti; .byte 0x0f, 0x01, 0xc9;"
-+ :: "a" (eax), "c" (ecx));
+- return (__pud_val(pte) & (_PAGE_PSE|_PAGE_PRESENT)) ==
+- (_PAGE_PSE|_PAGE_PRESENT);
++ return (__pud_val(pte) & (_PAGE_PSE | _PAGE_PRESENT)) ==
++ (_PAGE_PSE | _PAGE_PRESENT);
}
- extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
+ /* PMD - Level 2 access */
+-#define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val(pmd) & PTE_MASK))
+-#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
++#define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val((pmd)) & PTE_MASK))
++#define pmd_page(pmd) (pfn_to_page(pmd_val((pmd)) >> PAGE_SHIFT))
--extern int force_mwait;
-+extern int force_mwait;
+-#define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1))
+-#define pmd_offset(dir, address) ((pmd_t *) pud_page_vaddr(*(dir)) + \
+- pmd_index(address))
++#define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))
++#define pmd_offset(dir, address) ((pmd_t *)pud_page_vaddr(*(dir)) + \
++ pmd_index(address))
+ #define pmd_none(x) (!__pmd_val(x))
+ #if CONFIG_XEN_COMPAT <= 0x030002
+ /* pmd_present doesn't just test the _PAGE_PRESENT bit since wr.p.t.
+@@ -235,43 +240,56 @@ static inline int pud_large(pud_t pte)
+ #else
+ #define pmd_present(x) (__pmd_val(x) & _PAGE_PRESENT)
+ #endif
+-#define pfn_pmd(nr,prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val(prot)))
+-#define pmd_pfn(x) ((pmd_val(x) & __PHYSICAL_MASK) >> PAGE_SHIFT)
++#define pfn_pmd(nr, prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val((prot))))
++#define pmd_pfn(x) ((pmd_val((x)) & __PHYSICAL_MASK) >> PAGE_SHIFT)
- extern void select_idle_routine(const struct cpuinfo_x86 *c);
+ #define pte_to_pgoff(pte) ((__pte_val(pte) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT)
+-#define pgoff_to_pte(off) ((pte_t) { .pte = ((off) << PAGE_SHIFT) | _PAGE_FILE })
++#define pgoff_to_pte(off) ((pte_t) { .pte = ((off) << PAGE_SHIFT) | \
++ _PAGE_FILE })
+ #define PTE_FILE_MAX_BITS __PHYSICAL_MASK_SHIFT
--extern unsigned long boot_option_idle_override;
-+extern unsigned long boot_option_idle_override;
+ /* PTE - Level 1 access. */
- extern void enable_sep_cpu(void);
- extern int sysenter_setup(void);
+ /* page, protection -> pte */
+-#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
+-
+-#define pte_index(address) \
+- (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
++#define mk_pte(page, pgprot) pfn_pte(page_to_pfn((page)), (pgprot))
++
++#define pte_index(address) (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
+ #define pte_offset_kernel(dir, address) ((pte_t *) pmd_page_vaddr(*(dir)) + \
+- pte_index(address))
++ pte_index((address)))
- /* Defined in head.S */
--extern struct desc_ptr early_gdt_descr;
-+extern struct desc_ptr early_gdt_descr;
+ /* x86-64 always has all page tables mapped. */
+-#define pte_offset_map(dir,address) pte_offset_kernel(dir,address)
+-#define pte_offset_map_nested(dir,address) pte_offset_kernel(dir,address)
++#define pte_offset_map(dir, address) pte_offset_kernel((dir), (address))
++#define pte_offset_map_nested(dir, address) pte_offset_kernel((dir), (address))
+ #define pte_unmap(pte) /* NOP */
+-#define pte_unmap_nested(pte) /* NOP */
++#define pte_unmap_nested(pte) /* NOP */
++
++#define update_mmu_cache(vma, address, pte) do { } while (0)
- extern void cpu_set_gdt(int);
- extern void switch_to_new_gdt(void);
- extern void cpu_init(void);
- extern void init_gdt(int cpu);
+-#define update_mmu_cache(vma,address,pte) do { } while (0)
++#define direct_gbpages 0
--/* from system description table in BIOS. Mostly for MCA use, but
-- * others may find it useful. */
--extern unsigned int machine_id;
--extern unsigned int machine_submodel_id;
--extern unsigned int BIOS_revision;
-+static inline void update_debugctlmsr(unsigned long debugctlmsr)
-+{
-+#ifndef CONFIG_X86_DEBUGCTLMSR
-+ if (boot_cpu_data.x86 < 6)
-+ return;
+ /* Encode and de-code a swap entry */
+-#define __swp_type(x) (((x).val >> 1) & 0x3f)
+-#define __swp_offset(x) ((x).val >> 8)
+-#define __swp_entry(type, offset) ((swp_entry_t) { ((type) << 1) | ((offset) << 8) })
++#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE
++#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1)
++#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1)
++#else
++#define SWP_TYPE_BITS (_PAGE_BIT_PROTNONE - _PAGE_BIT_PRESENT - 1)
++#define SWP_OFFSET_SHIFT (_PAGE_BIT_FILE + 1)
+#endif
-+ wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr);
-+}
-
--/* Boot loader type from the setup header */
--extern int bootloader_type;
-+/*
-+ * from system description table in BIOS. Mostly for MCA use, but
-+ * others may find it useful:
-+ */
-+extern unsigned int machine_id;
-+extern unsigned int machine_submodel_id;
-+extern unsigned int BIOS_revision;
+
-+/* Boot loader type from the setup header: */
-+extern int bootloader_type;
++#define __swp_type(x) (((x).val >> (_PAGE_BIT_PRESENT + 1)) \
++ & ((1U << SWP_TYPE_BITS) - 1))
++#define __swp_offset(x) ((x).val >> SWP_OFFSET_SHIFT)
++#define __swp_entry(type, offset) ((swp_entry_t) { \
++ ((type) << (_PAGE_BIT_PRESENT + 1)) \
++ | ((offset) << SWP_OFFSET_SHIFT) })
+ #define __pte_to_swp_entry(pte) ((swp_entry_t) { __pte_val(pte) })
+ #define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val })
--extern char ignore_fpu_irq;
--#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
-+extern char ignore_fpu_irq;
+-extern int kern_addr_valid(unsigned long addr);
++extern int kern_addr_valid(unsigned long addr);
+ extern void cleanup_highmap(void);
- #define HAVE_ARCH_PICK_MMAP_LAYOUT 1
- #define ARCH_HAS_PREFETCHW
- #define ARCH_HAS_SPINLOCK_PREFETCH
+-#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
+- direct_remap_pfn_range(vma,vaddr,pfn,size,prot,DOMID_IO)
++#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
++ direct_remap_pfn_range(vma, vaddr, pfn, size, prot, DOMID_IO)
+
+ #define HAVE_ARCH_UNMAPPED_AREA
+ #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
+@@ -284,8 +302,10 @@ extern void cleanup_highmap(void);
+
+ /* fs/proc/kcore.c */
+ #define kc_vaddr_to_offset(v) ((v) & __VIRTUAL_MASK)
+-#define kc_offset_to_vaddr(o) \
+- (((o) & (1UL << (__VIRTUAL_MASK_SHIFT-1))) ? ((o) | (~__VIRTUAL_MASK)) : (o))
++#define kc_offset_to_vaddr(o) \
++ (((o) & (1UL << (__VIRTUAL_MASK_SHIFT - 1))) \
++ ? ((o) | ~__VIRTUAL_MASK) \
++ : (o))
- #ifdef CONFIG_X86_32
--#define BASE_PREFETCH ASM_NOP4
--#define ARCH_HAS_PREFETCH
-+# define BASE_PREFETCH ASM_NOP4
-+# define ARCH_HAS_PREFETCH
- #else
--#define BASE_PREFETCH "prefetcht0 (%1)"
-+# define BASE_PREFETCH "prefetcht0 (%1)"
- #endif
+ #define __HAVE_ARCH_PTE_SAME
+ #endif /* !__ASSEMBLY__ */
+--- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/processor.h 2009-03-16 16:33:40.000000000 +0100
++++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/processor.h 2009-03-16 16:38:05.000000000 +0100
+@@ -3,10 +3,6 @@
--/* Prefetch instructions for Pentium III and AMD Athlon */
--/* It's not worth to care about 3dnow! prefetches for the K6
-- because they are microcoded there and very slow.
-- However we don't do prefetches for pre XP Athlons currently
-- That should be fixed. */
-+/*
-+ * Prefetch instructions for Pentium III (+) and AMD Athlon (+)
-+ *
-+ * It's not worth to care about 3dnow prefetches for the K6
-+ * because they are microcoded there and very slow.
-+ */
- static inline void prefetch(const void *x)
- {
- alternative_input(BASE_PREFETCH,
-@@ -649,8 +732,11 @@ static inline void prefetch(const void *
- "r" (x));
- }
+ #include <asm/processor-flags.h>
--/* 3dnow! prefetch to get an exclusive cache line. Useful for
-- spinlocks to avoid one state transition in the cache coherency protocol. */
-+/*
-+ * 3dnow prefetch to get an exclusive cache line.
-+ * Useful for spinlocks to avoid one state transition in the
-+ * cache coherency protocol:
-+ */
- static inline void prefetchw(const void *x)
+-/* migration helpers, for KVM - will be removed in 2.6.25: */
+-#include <asm/vm86.h>
+-#define Xgt_desc_struct desc_ptr
+-
+ /* Forward declaration, a strange C thing */
+ struct task_struct;
+ struct mm_struct;
+@@ -24,6 +20,7 @@ struct mm_struct;
+ #include <asm/msr.h>
+ #include <asm/desc_defs.h>
+ #include <asm/nops.h>
++
+ #include <linux/personality.h>
+ #include <linux/cpumask.h>
+ #include <linux/cache.h>
+@@ -38,16 +35,18 @@ struct mm_struct;
+ static inline void *current_text_addr(void)
{
- alternative_input(BASE_PREFETCH,
-@@ -659,21 +745,25 @@ static inline void prefetchw(const void
- "r" (x));
- }
-
--#define spin_lock_prefetch(x) prefetchw(x)
-+static inline void spin_lock_prefetch(const void *x)
-+{
-+ prefetchw(x);
-+}
+ void *pc;
+- asm volatile("mov $1f,%0\n1:":"=r" (pc));
+
- #ifdef CONFIG_X86_32
- /*
- * User space process size: 3GB (default).
- */
--#define TASK_SIZE (PAGE_OFFSET)
--#define STACK_TOP TASK_SIZE
--#define STACK_TOP_MAX STACK_TOP
--
--#define INIT_THREAD { \
-- .sp0 = sizeof(init_stack) + (long)&init_stack, \
-- .vm86_info = NULL, \
-- .sysenter_cs = __KERNEL_CS, \
-- .io_bitmap_ptr = NULL, \
-- .fs = __KERNEL_PERCPU, \
-+#define TASK_SIZE PAGE_OFFSET
-+#define STACK_TOP TASK_SIZE
-+#define STACK_TOP_MAX STACK_TOP
++ asm volatile("mov $1f, %0; 1:":"=r" (pc));
+
-+#define INIT_THREAD { \
-+ .sp0 = sizeof(init_stack) + (long)&init_stack, \
-+ .vm86_info = NULL, \
-+ .sysenter_cs = __KERNEL_CS, \
-+ .io_bitmap_ptr = NULL, \
-+ .fs = __KERNEL_PERCPU, \
+ return pc;
}
+ #ifdef CONFIG_X86_VSMP
+-#define ARCH_MIN_TASKALIGN (1 << INTERNODE_CACHE_SHIFT)
+-#define ARCH_MIN_MMSTRUCT_ALIGN (1 << INTERNODE_CACHE_SHIFT)
++# define ARCH_MIN_TASKALIGN (1 << INTERNODE_CACHE_SHIFT)
++# define ARCH_MIN_MMSTRUCT_ALIGN (1 << INTERNODE_CACHE_SHIFT)
+ #else
+-#define ARCH_MIN_TASKALIGN 16
+-#define ARCH_MIN_MMSTRUCT_ALIGN 0
++# define ARCH_MIN_TASKALIGN 16
++# define ARCH_MIN_MMSTRUCT_ALIGN 0
+ #endif
+
/*
-@@ -682,28 +772,15 @@ static inline void prefetchw(const void
- * permission bitmap. The extra byte must be all 1 bits, and must
- * be within the limit.
+@@ -57,68 +56,80 @@ static inline void *current_text_addr(vo
*/
--#define INIT_TSS { \
-- .x86_tss = { \
-+#define INIT_TSS { \
-+ .x86_tss = { \
- .sp0 = sizeof(init_stack) + (long)&init_stack, \
-- .ss0 = __KERNEL_DS, \
-- .ss1 = __KERNEL_CS, \
-- .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \
-- }, \
-- .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, \
--}
--
--#define start_thread(regs, new_eip, new_esp) do { \
-- __asm__("movl %0,%%gs": :"r" (0)); \
-- regs->fs = 0; \
-- set_fs(USER_DS); \
-- regs->ds = __USER_DS; \
-- regs->es = __USER_DS; \
-- regs->ss = __USER_DS; \
-- regs->cs = __USER_CS; \
-- regs->ip = new_eip; \
-- regs->sp = new_esp; \
--} while (0)
--
-+ .ss0 = __KERNEL_DS, \
-+ .ss1 = __KERNEL_CS, \
-+ .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \
-+ }, \
-+ .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, \
-+}
-
- extern unsigned long thread_saved_pc(struct task_struct *tsk);
-@@ -731,24 +808,24 @@ extern unsigned long thread_saved_pc(str
- __regs__ - 1; \
- })
+ struct cpuinfo_x86 {
+- __u8 x86; /* CPU family */
+- __u8 x86_vendor; /* CPU vendor */
+- __u8 x86_model;
+- __u8 x86_mask;
++ __u8 x86; /* CPU family */
++ __u8 x86_vendor; /* CPU vendor */
++ __u8 x86_model;
++ __u8 x86_mask;
+ #ifdef CONFIG_X86_32
+- char wp_works_ok; /* It doesn't on 386's */
+- char hlt_works_ok; /* Problems on some 486Dx4's and old 386's */
+- char hard_math;
+- char rfu;
+- char fdiv_bug;
+- char f00f_bug;
+- char coma_bug;
+- char pad0;
++ char wp_works_ok; /* It doesn't on 386's */
++
++ /* Problems on some 486Dx4's and old 386's: */
++ char hlt_works_ok;
++ char hard_math;
++ char rfu;
++ char fdiv_bug;
++ char f00f_bug;
++ char coma_bug;
++ char pad0;
+ #else
+- /* number of 4K pages in DTLB/ITLB combined(in pages)*/
+- int x86_tlbsize;
+- __u8 x86_virt_bits, x86_phys_bits;
+- /* cpuid returned core id bits */
+- __u8 x86_coreid_bits;
+- /* Max extended CPUID function supported */
+- __u32 extended_cpuid_level;
+-#endif
+- int cpuid_level; /* Maximum supported CPUID level, -1=no CPUID */
+- __u32 x86_capability[NCAPINTS];
+- char x86_vendor_id[16];
+- char x86_model_id[64];
+- int x86_cache_size; /* in KB - valid for CPUS which support this
+- call */
+- int x86_cache_alignment; /* In bytes */
+- int x86_power;
+- unsigned long loops_per_jiffy;
++ /* Number of 4K pages in DTLB/ITLB combined(in pages): */
++ int x86_tlbsize;
++ __u8 x86_virt_bits;
++ __u8 x86_phys_bits;
++ /* CPUID returned core id bits: */
++ __u8 x86_coreid_bits;
++ /* Max extended CPUID function supported: */
++ __u32 extended_cpuid_level;
++#endif
++ /* Maximum supported CPUID level, -1=no CPUID: */
++ int cpuid_level;
++ __u32 x86_capability[NCAPINTS];
++ char x86_vendor_id[16];
++ char x86_model_id[64];
++ /* in KB - valid for CPUS which support this call: */
++ int x86_cache_size;
++ int x86_cache_alignment; /* In bytes */
++ int x86_power;
++ unsigned long loops_per_jiffy;
+ #ifdef CONFIG_SMP
+- cpumask_t llc_shared_map; /* cpus sharing the last level cache */
++ /* cpus sharing the last level cache: */
++ cpumask_t llc_shared_map;
+ #endif
+- u16 x86_max_cores; /* cpuid returned max cores value */
+- u16 apicid;
+- u16 x86_clflush_size;
++ /* cpuid returned max cores value: */
++ u16 x86_max_cores;
++ u16 apicid;
++ u16 initial_apicid;
++ u16 x86_clflush_size;
+ #ifdef CONFIG_SMP
+- u16 booted_cores; /* number of cores as seen by OS */
+- u16 phys_proc_id; /* Physical processor id. */
+- u16 cpu_core_id; /* Core id */
+- u16 cpu_index; /* index into per_cpu list */
++ /* number of cores as seen by the OS: */
++ u16 booted_cores;
++ /* Physical processor id: */
++ u16 phys_proc_id;
++ /* Core id: */
++ u16 cpu_core_id;
++ /* Index into per_cpu list: */
++ u16 cpu_index;
+ #endif
+ } __attribute__((__aligned__(SMP_CACHE_BYTES)));
--#define KSTK_ESP(task) (task_pt_regs(task)->sp)
-+#define KSTK_ESP(task) (task_pt_regs(task)->sp)
+-#define X86_VENDOR_INTEL 0
+-#define X86_VENDOR_CYRIX 1
+-#define X86_VENDOR_AMD 2
+-#define X86_VENDOR_UMC 3
+-#define X86_VENDOR_NEXGEN 4
+-#define X86_VENDOR_CENTAUR 5
+-#define X86_VENDOR_TRANSMETA 7
+-#define X86_VENDOR_NSC 8
+-#define X86_VENDOR_NUM 9
+-#define X86_VENDOR_UNKNOWN 0xff
++#define X86_VENDOR_INTEL 0
++#define X86_VENDOR_CYRIX 1
++#define X86_VENDOR_AMD 2
++#define X86_VENDOR_UMC 3
++#define X86_VENDOR_CENTAUR 5
++#define X86_VENDOR_TRANSMETA 7
++#define X86_VENDOR_NSC 8
++#define X86_VENDOR_NUM 9
++
++#define X86_VENDOR_UNKNOWN 0xff
- #else
/*
- * User space process size. 47bits minus one guard page.
+ * capabilities of CPUs
*/
--#define TASK_SIZE64 (0x800000000000UL - 4096)
-+#define TASK_SIZE64 ((1UL << 47) - PAGE_SIZE)
+-extern struct cpuinfo_x86 boot_cpu_data;
+-extern struct cpuinfo_x86 new_cpu_data;
+-extern __u32 cleared_cpu_caps[NCAPINTS];
++extern struct cpuinfo_x86 boot_cpu_data;
++extern struct cpuinfo_x86 new_cpu_data;
++
++extern __u32 cleared_cpu_caps[NCAPINTS];
- /* This decides where the kernel will search for a free chunk of vm
- * space during mmap's.
- */
--#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? \
-- 0xc0000000 : 0xFFFFe000)
-+#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? \
-+ 0xc0000000 : 0xFFFFe000)
+ #ifdef CONFIG_SMP
+ DECLARE_PER_CPU(struct cpuinfo_x86, cpu_info);
+@@ -129,7 +140,18 @@ DECLARE_PER_CPU(struct cpuinfo_x86, cpu_
+ #define current_cpu_data boot_cpu_data
+ #endif
--#define TASK_SIZE (test_thread_flag(TIF_IA32) ? \
-- IA32_PAGE_OFFSET : TASK_SIZE64)
--#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? \
-- IA32_PAGE_OFFSET : TASK_SIZE64)
-+#define TASK_SIZE (test_thread_flag(TIF_IA32) ? \
-+ IA32_PAGE_OFFSET : TASK_SIZE64)
-+#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? \
-+ IA32_PAGE_OFFSET : TASK_SIZE64)
+-void cpu_detect(struct cpuinfo_x86 *c);
++static inline int hlt_works(int cpu)
++{
++#ifdef CONFIG_X86_32
++ return cpu_data(cpu).hlt_works_ok;
++#else
++ return 1;
++#endif
++}
++
++#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
++
++extern void cpu_detect(struct cpuinfo_x86 *c);
- #define STACK_TOP TASK_SIZE
- #define STACK_TOP_MAX TASK_SIZE64
-@@ -761,33 +838,32 @@ extern unsigned long thread_saved_pc(str
- .x86_tss.sp0 = (unsigned long)&init_stack + sizeof(init_stack) \
+ extern void identify_cpu(struct cpuinfo_x86 *);
+ extern void identify_boot_cpu(void);
+@@ -149,12 +171,12 @@ static inline void xen_cpuid(unsigned in
+ unsigned int *ecx, unsigned int *edx)
+ {
+ /* ecx is often an input as well as an output. */
+- __asm__(XEN_CPUID
+- : "=a" (*eax),
+- "=b" (*ebx),
+- "=c" (*ecx),
+- "=d" (*edx)
+- : "0" (*eax), "2" (*ecx));
++ asm(XEN_CPUID
++ : "=a" (*eax),
++ "=b" (*ebx),
++ "=c" (*ecx),
++ "=d" (*edx)
++ : "0" (*eax), "2" (*ecx));
}
--#define start_thread(regs, new_rip, new_rsp) do { \
-- asm volatile("movl %0,%%fs; movl %0,%%es; movl %0,%%ds": :"r" (0)); \
-- load_gs_index(0); \
-- (regs)->ip = (new_rip); \
-- (regs)->sp = (new_rsp); \
-- write_pda(oldrsp, (new_rsp)); \
-- (regs)->cs = __USER_CS; \
-- (regs)->ss = __USER_DS; \
-- (regs)->flags = 0x200; \
-- set_fs(USER_DS); \
--} while (0)
--
+ static inline void load_cr3(pgd_t *pgdir)
+@@ -166,57 +188,70 @@ static inline void load_cr3(pgd_t *pgdir
+ #ifdef CONFIG_X86_32
+ /* This is the TSS defined by the hardware. */
+ struct x86_hw_tss {
+- unsigned short back_link, __blh;
+- unsigned long sp0;
+- unsigned short ss0, __ss0h;
+- unsigned long sp1;
+- unsigned short ss1, __ss1h; /* ss1 caches MSR_IA32_SYSENTER_CS */
+- unsigned long sp2;
+- unsigned short ss2, __ss2h;
+- unsigned long __cr3;
+- unsigned long ip;
+- unsigned long flags;
+- unsigned long ax, cx, dx, bx;
+- unsigned long sp, bp, si, di;
+- unsigned short es, __esh;
+- unsigned short cs, __csh;
+- unsigned short ss, __ssh;
+- unsigned short ds, __dsh;
+- unsigned short fs, __fsh;
+- unsigned short gs, __gsh;
+- unsigned short ldt, __ldth;
+- unsigned short trace, io_bitmap_base;
++ unsigned short back_link, __blh;
++ unsigned long sp0;
++ unsigned short ss0, __ss0h;
++ unsigned long sp1;
++ /* ss1 caches MSR_IA32_SYSENTER_CS: */
++ unsigned short ss1, __ss1h;
++ unsigned long sp2;
++ unsigned short ss2, __ss2h;
++ unsigned long __cr3;
++ unsigned long ip;
++ unsigned long flags;
++ unsigned long ax;
++ unsigned long cx;
++ unsigned long dx;
++ unsigned long bx;
++ unsigned long sp;
++ unsigned long bp;
++ unsigned long si;
++ unsigned long di;
++ unsigned short es, __esh;
++ unsigned short cs, __csh;
++ unsigned short ss, __ssh;
++ unsigned short ds, __dsh;
++ unsigned short fs, __fsh;
++ unsigned short gs, __gsh;
++ unsigned short ldt, __ldth;
++ unsigned short trace;
++ unsigned short io_bitmap_base;
++
+ } __attribute__((packed));
+ extern struct tss_struct doublefault_tss;
+ #else
+ struct x86_hw_tss {
+- u32 reserved1;
+- u64 sp0;
+- u64 sp1;
+- u64 sp2;
+- u64 reserved2;
+- u64 ist[7];
+- u32 reserved3;
+- u32 reserved4;
+- u16 reserved5;
+- u16 io_bitmap_base;
++ u32 reserved1;
++ u64 sp0;
++ u64 sp1;
++ u64 sp2;
++ u64 reserved2;
++ u64 ist[7];
++ u32 reserved3;
++ u32 reserved4;
++ u16 reserved5;
++ u16 io_bitmap_base;
++
+ } __attribute__((packed)) ____cacheline_aligned;
+ #endif
+ #endif /* CONFIG_X86_NO_TSS */
+
/*
- * Return saved PC of a blocked thread.
- * What is this good for? it will be always the scheduler or ret_from_fork.
+- * Size of io_bitmap.
++ * IO-bitmap sizes:
*/
--#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.sp - 8))
-+#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.sp - 8))
-
--#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1)
--#define KSTK_ESP(tsk) -1 /* sorry. doesn't work for syscall. */
-+#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1)
-+#define KSTK_ESP(tsk) -1 /* sorry. doesn't work for syscall. */
- #endif /* CONFIG_X86_64 */
+-#define IO_BITMAP_BITS 65536
+-#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
+-#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
+-#define IO_BITMAP_OFFSET offsetof(struct tss_struct, io_bitmap)
+-#define INVALID_IO_BITMAP_OFFSET 0x8000
+-#define INVALID_IO_BITMAP_OFFSET_LAZY 0x9000
++#define IO_BITMAP_BITS 65536
++#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
++#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
++#define IO_BITMAP_OFFSET offsetof(struct tss_struct, io_bitmap)
++#define INVALID_IO_BITMAP_OFFSET 0x8000
++#define INVALID_IO_BITMAP_OFFSET_LAZY 0x9000
--/* This decides where the kernel will search for a free chunk of vm
-+extern void start_thread(struct pt_regs *regs, unsigned long new_ip,
-+ unsigned long new_sp);
-+
-+/*
-+ * This decides where the kernel will search for a free chunk of vm
- * space during mmap's.
- */
- #define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3))
+ #ifndef CONFIG_X86_NO_TSS
+ struct tss_struct {
+- struct x86_hw_tss x86_tss;
++ /*
++ * The hardware state:
++ */
++ struct x86_hw_tss x86_tss;
--#define KSTK_EIP(task) (task_pt_regs(task)->ip)
-+#define KSTK_EIP(task) (task_pt_regs(task)->ip)
+ /*
+ * The extra 1 is there because the CPU will access an
+@@ -224,136 +259,162 @@ struct tss_struct {
+ * bitmap. The extra byte must be all 1 bits, and must
+ * be within the limit.
+ */
+- unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
++ unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
+ /*
+ * Cache the current maximum and the last task that used the bitmap:
+ */
+- unsigned long io_bitmap_max;
+- struct thread_struct *io_bitmap_owner;
++ unsigned long io_bitmap_max;
++ struct thread_struct *io_bitmap_owner;
+
-+/* Get/set a process' ability to use the timestamp counter instruction */
-+#define GET_TSC_CTL(adr) get_tsc_mode((adr))
-+#define SET_TSC_CTL(val) set_tsc_mode((val))
+ /*
+- * pads the TSS to be cacheline-aligned (size is 0x100)
++ * Pad the TSS to be cacheline-aligned (size is 0x100):
+ */
+- unsigned long __cacheline_filler[35];
++ unsigned long __cacheline_filler[35];
+ /*
+- * .. and then another 0x100 bytes for emergency kernel stack
++ * .. and then another 0x100 bytes for the emergency kernel stack:
+ */
+- unsigned long stack[64];
++ unsigned long stack[64];
+
-+extern int get_tsc_mode(unsigned long adr);
-+extern int set_tsc_mode(unsigned int val);
-
- #endif
---- a/include/asm-x86/mach-xen/asm/segment.h
-+++ b/include/asm-x86/mach-xen/asm/segment.h
-@@ -191,13 +191,14 @@
- #define SEGMENT_TI_MASK 0x4
+ } __attribute__((packed));
- #define IDT_ENTRIES 256
-+#define NUM_EXCEPTION_VECTORS 32
- #define GDT_SIZE (GDT_ENTRIES * 8)
- #define GDT_ENTRY_TLS_ENTRIES 3
- #define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8)
+ DECLARE_PER_CPU(struct tss_struct, init_tss);
- #ifdef __KERNEL__
- #ifndef __ASSEMBLY__
--extern const char early_idt_handlers[IDT_ENTRIES][10];
-+extern const char early_idt_handlers[NUM_EXCEPTION_VECTORS][10];
- #endif
- #endif
+-/* Save the original ist values for checking stack pointers during debugging */
++/*
++ * Save the original ist values for checking stack pointers during debugging
++ */
+ struct orig_ist {
+- unsigned long ist[7];
++ unsigned long ist[7];
+ };
+ #endif /* CONFIG_X86_NO_TSS */
---- a/include/asm-x86/mach-xen/asm/smp_32.h
-+++ /dev/null
-@@ -1,178 +0,0 @@
--#ifndef __ASM_SMP_H
--#define __ASM_SMP_H
--
--#ifndef __ASSEMBLY__
--#include <linux/cpumask.h>
--#include <linux/init.h>
--
--/*
-- * We need the APIC definitions automatically as part of 'smp.h'
-- */
--#ifdef CONFIG_X86_LOCAL_APIC
--# include <asm/mpspec.h>
--# include <asm/apic.h>
--# ifdef CONFIG_X86_IO_APIC
--# include <asm/io_apic.h>
--# endif
--#endif
--
--#define cpu_callout_map cpu_possible_map
--#define cpu_callin_map cpu_possible_map
--
--extern int smp_num_siblings;
--extern unsigned int num_processors;
--
--extern void smp_alloc_memory(void);
--extern void lock_ipi_call_lock(void);
--extern void unlock_ipi_call_lock(void);
--
--extern void (*mtrr_hook) (void);
--extern void zap_low_mappings (void);
--
--DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
--DECLARE_PER_CPU(cpumask_t, cpu_core_map);
--DECLARE_PER_CPU(u8, cpu_llc_id);
--DECLARE_PER_CPU(u8, x86_cpu_to_apicid);
--
--#ifdef CONFIG_HOTPLUG_CPU
--extern void cpu_exit_clear(void);
--extern void cpu_uninit(void);
--#endif
--
--#ifdef CONFIG_SMP
--
--#ifndef CONFIG_XEN
--
--/* Globals due to paravirt */
--extern void set_cpu_sibling_map(int cpu);
--
--struct smp_ops
--{
-- void (*smp_prepare_boot_cpu)(void);
-- void (*smp_prepare_cpus)(unsigned max_cpus);
-- int (*cpu_up)(unsigned cpu);
-- void (*smp_cpus_done)(unsigned max_cpus);
--
-- void (*smp_send_stop)(void);
-- void (*smp_send_reschedule)(int cpu);
-- int (*smp_call_function_mask)(cpumask_t mask,
-- void (*func)(void *info), void *info,
-- int wait);
--};
--
--extern struct smp_ops smp_ops;
--
--static inline void smp_prepare_boot_cpu(void)
--{
-- smp_ops.smp_prepare_boot_cpu();
--}
--static inline void smp_prepare_cpus(unsigned int max_cpus)
--{
-- smp_ops.smp_prepare_cpus(max_cpus);
--}
--static inline int __cpu_up(unsigned int cpu)
--{
-- return smp_ops.cpu_up(cpu);
--}
--static inline void smp_cpus_done(unsigned int max_cpus)
--{
-- smp_ops.smp_cpus_done(max_cpus);
--}
--
--static inline void smp_send_stop(void)
--{
-- smp_ops.smp_send_stop();
--}
--static inline void smp_send_reschedule(int cpu)
--{
-- smp_ops.smp_send_reschedule(cpu);
--}
--static inline int smp_call_function_mask(cpumask_t mask,
-- void (*func) (void *info), void *info,
-- int wait)
--{
-- return smp_ops.smp_call_function_mask(mask, func, info, wait);
--}
--
--void native_smp_prepare_boot_cpu(void);
--void native_smp_prepare_cpus(unsigned int max_cpus);
--int native_cpu_up(unsigned int cpunum);
--void native_smp_cpus_done(unsigned int max_cpus);
--
--#ifndef CONFIG_PARAVIRT
--#define startup_ipi_hook(phys_apicid, start_eip, start_esp) do { } while (0)
--#endif
--
--#else /* CONFIG_XEN */
--
--void xen_smp_send_stop(void);
--void xen_smp_send_reschedule(int cpu);
--int xen_smp_call_function_mask(cpumask_t mask,
-- void (*func) (void *info), void *info,
-- int wait);
--
--#define smp_send_stop xen_smp_send_stop
--#define smp_send_reschedule xen_smp_send_reschedule
--#define smp_call_function_mask xen_smp_call_function_mask
--
--extern void prefill_possible_map(void);
--
--#endif /* CONFIG_XEN */
--
--extern int __cpu_disable(void);
--extern void __cpu_die(unsigned int cpu);
--
--/*
-- * This function is needed by all SMP systems. It must _always_ be valid
-- * from the initial startup. We map APIC_BASE very early in page_setup(),
-- * so this is correct in the x86 case.
-- */
--DECLARE_PER_CPU(int, cpu_number);
--#define raw_smp_processor_id() (x86_read_percpu(cpu_number))
--
--#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
--
--#define safe_smp_processor_id() smp_processor_id()
--
--/* We don't mark CPUs online until __cpu_up(), so we need another measure */
--static inline int num_booting_cpus(void)
--{
-- return cpus_weight(cpu_callout_map);
--}
--
--#else /* CONFIG_SMP */
--
--#define safe_smp_processor_id() 0
--#define cpu_physical_id(cpu) boot_cpu_physical_apicid
--
--#endif /* !CONFIG_SMP */
--
--#ifdef CONFIG_X86_LOCAL_APIC
--
--static __inline int logical_smp_processor_id(void)
--{
-- /* we don't want to mark this access volatile - bad code generation */
-- return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR));
--}
--
--# ifdef APIC_DEFINITION
--extern int hard_smp_processor_id(void);
--# else
--# include <mach_apicdef.h>
--static inline int hard_smp_processor_id(void)
--{
-- /* we don't want to mark this access volatile - bad code generation */
-- return GET_APIC_ID(*(u32 *)(APIC_BASE + APIC_ID));
--}
--# endif /* APIC_DEFINITION */
--
--#else /* CONFIG_X86_LOCAL_APIC */
--
--# ifndef CONFIG_SMP
--# define hard_smp_processor_id() 0
--# endif
--
--#endif /* CONFIG_X86_LOCAL_APIC */
--
--#endif /* !ASSEMBLY */
--#endif
---- a/include/asm-x86/mach-xen/asm/smp_64.h
-+++ /dev/null
-@@ -1,103 +0,0 @@
--#ifndef __ASM_SMP_H
--#define __ASM_SMP_H
--
--#include <linux/cpumask.h>
--#include <linux/init.h>
--
--#ifdef CONFIG_X86_LOCAL_APIC
--/*
-- * We need the APIC definitions automatically as part of 'smp.h'
-- */
--#include <asm/apic.h>
--#ifdef CONFIG_X86_IO_APIC
--#include <asm/io_apic.h>
--#endif
--#include <asm/mpspec.h>
--#endif
--#include <asm/pda.h>
--#include <asm/thread_info.h>
--
--extern cpumask_t cpu_initialized;
--
--extern int smp_num_siblings;
--extern unsigned int num_processors;
--
--extern void smp_alloc_memory(void);
--extern void lock_ipi_call_lock(void);
--extern void unlock_ipi_call_lock(void);
--
--extern int smp_call_function_mask(cpumask_t mask, void (*func)(void *),
-- void *info, int wait);
--
--DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
--DECLARE_PER_CPU(cpumask_t, cpu_core_map);
--DECLARE_PER_CPU(u16, cpu_llc_id);
--DECLARE_PER_CPU(u16, x86_cpu_to_apicid);
--DECLARE_PER_CPU(u16, x86_bios_cpu_apicid);
--
--#ifdef CONFIG_X86_LOCAL_APIC
--static inline int cpu_present_to_apicid(int mps_cpu)
--{
-- if (cpu_present(mps_cpu))
-- return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu);
-- else
-- return BAD_APICID;
--}
+ #define MXCSR_DEFAULT 0x1f80
+
+ struct i387_fsave_struct {
+- u32 cwd;
+- u32 swd;
+- u32 twd;
+- u32 fip;
+- u32 fcs;
+- u32 foo;
+- u32 fos;
+- u32 st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */
+- u32 status; /* software status information */
++ u32 cwd; /* FPU Control Word */
++ u32 swd; /* FPU Status Word */
++ u32 twd; /* FPU Tag Word */
++ u32 fip; /* FPU IP Offset */
++ u32 fcs; /* FPU IP Selector */
++ u32 foo; /* FPU Operand Pointer Offset */
++ u32 fos; /* FPU Operand Pointer Selector */
++
++ /* 8*10 bytes for each FP-reg = 80 bytes: */
++ u32 st_space[20];
++
++ /* Software status information [not touched by FSAVE ]: */
++ u32 status;
+ };
+
+ struct i387_fxsave_struct {
+- u16 cwd;
+- u16 swd;
+- u16 twd;
+- u16 fop;
++ u16 cwd; /* Control Word */
++ u16 swd; /* Status Word */
++ u16 twd; /* Tag Word */
++ u16 fop; /* Last Instruction Opcode */
+ union {
+ struct {
+- u64 rip;
+- u64 rdp;
++ u64 rip; /* Instruction Pointer */
++ u64 rdp; /* Data Pointer */
+ };
+ struct {
+- u32 fip;
+- u32 fcs;
+- u32 foo;
+- u32 fos;
++ u32 fip; /* FPU IP Offset */
++ u32 fcs; /* FPU IP Selector */
++ u32 foo; /* FPU Operand Offset */
++ u32 fos; /* FPU Operand Selector */
+ };
+ };
+- u32 mxcsr;
+- u32 mxcsr_mask;
+- u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
+- u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */
+- u32 padding[24];
++ u32 mxcsr; /* MXCSR Register State */
++ u32 mxcsr_mask; /* MXCSR Mask */
++
++ /* 8*16 bytes for each FP-reg = 128 bytes: */
++ u32 st_space[32];
++
++ /* 16*16 bytes for each XMM-reg = 256 bytes: */
++ u32 xmm_space[64];
++
++ u32 padding[24];
++
+ } __attribute__((aligned(16)));
+
+ struct i387_soft_struct {
+- u32 cwd;
+- u32 swd;
+- u32 twd;
+- u32 fip;
+- u32 fcs;
+- u32 foo;
+- u32 fos;
+- u32 st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */
+- u8 ftop, changed, lookahead, no_update, rm, alimit;
+- struct info *info;
+- u32 entry_eip;
++ u32 cwd;
++ u32 swd;
++ u32 twd;
++ u32 fip;
++ u32 fcs;
++ u32 foo;
++ u32 fos;
++ /* 8*10 bytes for each FP-reg = 80 bytes: */
++ u32 st_space[20];
++ u8 ftop;
++ u8 changed;
++ u8 lookahead;
++ u8 no_update;
++ u8 rm;
++ u8 alimit;
++ struct info *info;
++ u32 entry_eip;
+ };
+
+-union i387_union {
++union thread_xstate {
+ struct i387_fsave_struct fsave;
+ struct i387_fxsave_struct fxsave;
+- struct i387_soft_struct soft;
++ struct i387_soft_struct soft;
+ };
+
+-#ifdef CONFIG_X86_32
+-DECLARE_PER_CPU(u8, cpu_llc_id);
+-#elif !defined(CONFIG_X86_NO_TSS)
++#if defined(CONFIG_X86_64) && !defined(CONFIG_X86_NO_TSS)
+ DECLARE_PER_CPU(struct orig_ist, orig_ist);
+ #endif
+
+ extern void print_cpu_info(struct cpuinfo_x86 *);
++extern unsigned int xstate_size;
++extern void free_thread_xstate(struct task_struct *);
++extern struct kmem_cache *task_xstate_cachep;
+ extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
+ extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
+ extern unsigned short num_cache_leaves;
+
+ struct thread_struct {
+-/* cached TLS descriptors. */
+- struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
+- unsigned long sp0;
+- unsigned long sp;
++ /* Cached TLS descriptors: */
++ struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
++ unsigned long sp0;
++ unsigned long sp;
+ #ifdef CONFIG_X86_32
+- unsigned long sysenter_cs;
++ unsigned long sysenter_cs;
+ #else
+- unsigned long usersp; /* Copy from PDA */
+- unsigned short es, ds, fsindex, gsindex;
-#endif
+- unsigned long ip;
+- unsigned long fs;
+- unsigned long gs;
+-/* Hardware debugging registers */
+- unsigned long debugreg0;
+- unsigned long debugreg1;
+- unsigned long debugreg2;
+- unsigned long debugreg3;
+- unsigned long debugreg6;
+- unsigned long debugreg7;
+-/* fault info */
+- unsigned long cr2, trap_no, error_code;
+-/* floating point info */
+- union i387_union i387 __attribute__((aligned(16)));;
++ unsigned long usersp; /* Copy from PDA */
++ unsigned short es;
++ unsigned short ds;
++ unsigned short fsindex;
++ unsigned short gsindex;
++#endif
++ unsigned long ip;
++ unsigned long fs;
++ unsigned long gs;
++ /* Hardware debugging registers: */
++ unsigned long debugreg0;
++ unsigned long debugreg1;
++ unsigned long debugreg2;
++ unsigned long debugreg3;
++ unsigned long debugreg6;
++ unsigned long debugreg7;
++ /* Fault info: */
++ unsigned long cr2;
++ unsigned long trap_no;
++ unsigned long error_code;
++ /* floating point and extended processor state */
++ union thread_xstate *xstate;
+ #ifdef CONFIG_X86_32
+-/* virtual 86 mode info */
++ /* Virtual 86 mode info */
+ struct vm86_struct __user *vm86_info;
+ unsigned long screen_bitmap;
+ unsigned long v86flags, v86mask, saved_sp0;
+ unsigned int saved_fs, saved_gs;
+ #endif
+-/* IO permissions */
+- unsigned long *io_bitmap_ptr;
+- unsigned long iopl;
+-/* max allowed port in the bitmap, in bytes: */
+- unsigned io_bitmap_max;
++ /* IO permissions: */
++ unsigned long *io_bitmap_ptr;
++ unsigned long iopl;
++ /* Max allowed port in the bitmap, in bytes: */
++ unsigned io_bitmap_max;
+ /* MSR_IA32_DEBUGCTLMSR value to switch in if TIF_DEBUGCTLMSR is set. */
+ unsigned long debugctlmsr;
+ /* Debug Store - if not 0 points to a DS Save Area configuration;
+@@ -384,12 +445,12 @@ static inline void xen_set_iopl_mask(uns
+ }
+
+ #ifndef CONFIG_X86_NO_TSS
+-static inline void native_load_sp0(struct tss_struct *tss,
+- struct thread_struct *thread)
++static inline void
++native_load_sp0(struct tss_struct *tss, struct thread_struct *thread)
+ {
+ tss->x86_tss.sp0 = thread->sp0;
+ #ifdef CONFIG_X86_32
+- /* Only happens when SEP is enabled, no need to test "SEP"arately */
++ /* Only happens when SEP is enabled, no need to test "SEP"arately: */
+ if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
+ tss->x86_tss.ss1 = thread->sysenter_cs;
+ wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
+@@ -403,8 +464,8 @@ static inline void native_load_sp0(struc
+ } while (0)
+ #endif
+
+-#define __cpuid xen_cpuid
+-#define paravirt_enabled() 0
++#define __cpuid xen_cpuid
++#define paravirt_enabled() 0
+
+ /*
+ * These special macros can be used to get or set a debugging register
+@@ -424,11 +485,12 @@ static inline void native_load_sp0(struc
+ * enable), so that any CPU's that boot up
+ * after us can get the correct flags.
+ */
+-extern unsigned long mmu_cr4_features;
++extern unsigned long mmu_cr4_features;
+
+ static inline void set_in_cr4(unsigned long mask)
+ {
+ unsigned cr4;
++
+ mmu_cr4_features |= mask;
+ cr4 = read_cr4();
+ cr4 |= mask;
+@@ -438,6 +500,7 @@ static inline void set_in_cr4(unsigned l
+ static inline void clear_in_cr4(unsigned long mask)
+ {
+ unsigned cr4;
++
+ mmu_cr4_features &= ~mask;
+ cr4 = read_cr4();
+ cr4 &= ~mask;
+@@ -445,42 +508,42 @@ static inline void clear_in_cr4(unsigned
+ }
+
+ struct microcode_header {
+- unsigned int hdrver;
+- unsigned int rev;
+- unsigned int date;
+- unsigned int sig;
+- unsigned int cksum;
+- unsigned int ldrver;
+- unsigned int pf;
+- unsigned int datasize;
+- unsigned int totalsize;
+- unsigned int reserved[3];
++ unsigned int hdrver;
++ unsigned int rev;
++ unsigned int date;
++ unsigned int sig;
++ unsigned int cksum;
++ unsigned int ldrver;
++ unsigned int pf;
++ unsigned int datasize;
++ unsigned int totalsize;
++ unsigned int reserved[3];
+ };
+
+ struct microcode {
+- struct microcode_header hdr;
+- unsigned int bits[0];
++ struct microcode_header hdr;
++ unsigned int bits[0];
+ };
+
+-typedef struct microcode microcode_t;
+-typedef struct microcode_header microcode_header_t;
++typedef struct microcode microcode_t;
++typedef struct microcode_header microcode_header_t;
+
+ /* microcode format is extended from prescott processors */
+ struct extended_signature {
+- unsigned int sig;
+- unsigned int pf;
+- unsigned int cksum;
++ unsigned int sig;
++ unsigned int pf;
++ unsigned int cksum;
+ };
+
+ struct extended_sigtable {
+- unsigned int count;
+- unsigned int cksum;
+- unsigned int reserved[3];
++ unsigned int count;
++ unsigned int cksum;
++ unsigned int reserved[3];
+ struct extended_signature sigs[0];
+ };
+
+ typedef struct {
+- unsigned long seg;
++ unsigned long seg;
+ } mm_segment_t;
+
+
+@@ -492,7 +555,7 @@ extern int kernel_thread(int (*fn)(void
+ /* Free all resources held by a thread. */
+ extern void release_thread(struct task_struct *);
+
+-/* Prepare to copy thread state - unlazy all lazy status */
++/* Prepare to copy thread state - unlazy all lazy state */
+ extern void prepare_to_copy(struct task_struct *tsk);
+
+ unsigned long get_wchan(struct task_struct *p);
+@@ -529,118 +592,138 @@ static inline unsigned int cpuid_eax(uns
+ unsigned int eax, ebx, ecx, edx;
+
+ cpuid(op, &eax, &ebx, &ecx, &edx);
++
+ return eax;
+ }
++
+ static inline unsigned int cpuid_ebx(unsigned int op)
+ {
+ unsigned int eax, ebx, ecx, edx;
+
+ cpuid(op, &eax, &ebx, &ecx, &edx);
++
+ return ebx;
+ }
++
+ static inline unsigned int cpuid_ecx(unsigned int op)
+ {
+ unsigned int eax, ebx, ecx, edx;
+
+ cpuid(op, &eax, &ebx, &ecx, &edx);
++
+ return ecx;
+ }
++
+ static inline unsigned int cpuid_edx(unsigned int op)
+ {
+ unsigned int eax, ebx, ecx, edx;
+
+ cpuid(op, &eax, &ebx, &ecx, &edx);
++
+ return edx;
+ }
+
+ /* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
+ static inline void rep_nop(void)
+ {
+- __asm__ __volatile__("rep;nop": : :"memory");
++ asm volatile("rep; nop" ::: "memory");
+ }
+
+-/* Stop speculative execution */
++static inline void cpu_relax(void)
++{
++ rep_nop();
++}
++
++/* Stop speculative execution: */
+ static inline void sync_core(void)
+ {
+ int tmp;
++
+ asm volatile("cpuid" : "=a" (tmp) : "0" (1)
+- : "ebx", "ecx", "edx", "memory");
++ : "ebx", "ecx", "edx", "memory");
+ }
+
+-#define cpu_relax() rep_nop()
-
--#ifdef CONFIG_SMP
--
--#define SMP_TRAMPOLINE_BASE 0x6000
--
--extern int __cpu_disable(void);
--extern void __cpu_die(unsigned int cpu);
--extern void prefill_possible_map(void);
--extern unsigned __cpuinitdata disabled_cpus;
--
--#define raw_smp_processor_id() read_pda(cpunumber)
--#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
--
--#define stack_smp_processor_id() \
-- ({ \
-- struct thread_info *ti; \
-- __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \
-- ti->cpu; \
--})
--
--/*
-- * On x86 all CPUs are mapped 1:1 to the APIC space. This simplifies
-- * scheduling and IPI sending and compresses data structures.
-- */
--static inline int num_booting_cpus(void)
--{
-- return cpus_weight(cpu_possible_map);
--}
--
--extern void smp_send_reschedule(int cpu);
--
--#else /* CONFIG_SMP */
--
--extern unsigned int boot_cpu_id;
--#define cpu_physical_id(cpu) boot_cpu_id
--#define stack_smp_processor_id() 0
--
--#endif /* !CONFIG_SMP */
--
--#define safe_smp_processor_id() smp_processor_id()
+ static inline void __monitor(const void *eax, unsigned long ecx,
+- unsigned long edx)
++ unsigned long edx)
+ {
+- /* "monitor %eax,%ecx,%edx;" */
+- asm volatile(
+- ".byte 0x0f,0x01,0xc8;"
+- : :"a" (eax), "c" (ecx), "d"(edx));
++ /* "monitor %eax, %ecx, %edx;" */
++ asm volatile(".byte 0x0f, 0x01, 0xc8;"
++ :: "a" (eax), "c" (ecx), "d"(edx));
+ }
+
+ static inline void __mwait(unsigned long eax, unsigned long ecx)
+ {
+- /* "mwait %eax,%ecx;" */
+- asm volatile(
+- ".byte 0x0f,0x01,0xc9;"
+- : :"a" (eax), "c" (ecx));
++ /* "mwait %eax, %ecx;" */
++ asm volatile(".byte 0x0f, 0x01, 0xc9;"
++ :: "a" (eax), "c" (ecx));
+ }
+
+ static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
+ {
+- /* "mwait %eax,%ecx;" */
+- asm volatile(
+- "sti; .byte 0x0f,0x01,0xc9;"
+- : :"a" (eax), "c" (ecx));
++ trace_hardirqs_on();
++ /* "mwait %eax, %ecx;" */
++ asm volatile("sti; .byte 0x0f, 0x01, 0xc9;"
++ :: "a" (eax), "c" (ecx));
+ }
+
+ extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
+
+-extern int force_mwait;
++extern int force_mwait;
+
+ extern void select_idle_routine(const struct cpuinfo_x86 *c);
+
+-extern unsigned long boot_option_idle_override;
++extern unsigned long boot_option_idle_override;
+
+ extern void enable_sep_cpu(void);
+ extern int sysenter_setup(void);
+
+ /* Defined in head.S */
+-extern struct desc_ptr early_gdt_descr;
++extern struct desc_ptr early_gdt_descr;
+
+ extern void cpu_set_gdt(int);
+ extern void switch_to_new_gdt(void);
+ extern void cpu_init(void);
+ extern void init_gdt(int cpu);
+
+-/* from system description table in BIOS. Mostly for MCA use, but
+- * others may find it useful. */
+-extern unsigned int machine_id;
+-extern unsigned int machine_submodel_id;
+-extern unsigned int BIOS_revision;
++static inline void update_debugctlmsr(unsigned long debugctlmsr)
++{
++#ifndef CONFIG_X86_DEBUGCTLMSR
++ if (boot_cpu_data.x86 < 6)
++ return;
++#endif
++ wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr);
++}
+
+-/* Boot loader type from the setup header */
+-extern int bootloader_type;
++/*
++ * from system description table in BIOS. Mostly for MCA use, but
++ * others may find it useful:
++ */
++extern unsigned int machine_id;
++extern unsigned int machine_submodel_id;
++extern unsigned int BIOS_revision;
++
++/* Boot loader type from the setup header: */
++extern int bootloader_type;
+
+-extern char ignore_fpu_irq;
+-#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
++extern char ignore_fpu_irq;
+
+ #define HAVE_ARCH_PICK_MMAP_LAYOUT 1
+ #define ARCH_HAS_PREFETCHW
+ #define ARCH_HAS_SPINLOCK_PREFETCH
+
+ #ifdef CONFIG_X86_32
+-#define BASE_PREFETCH ASM_NOP4
+-#define ARCH_HAS_PREFETCH
++# define BASE_PREFETCH ASM_NOP4
++# define ARCH_HAS_PREFETCH
+ #else
+-#define BASE_PREFETCH "prefetcht0 (%1)"
++# define BASE_PREFETCH "prefetcht0 (%1)"
+ #endif
+
+-/* Prefetch instructions for Pentium III and AMD Athlon */
+-/* It's not worth to care about 3dnow! prefetches for the K6
+- because they are microcoded there and very slow.
+- However we don't do prefetches for pre XP Athlons currently
+- That should be fixed. */
++/*
++ * Prefetch instructions for Pentium III (+) and AMD Athlon (+)
++ *
++ * It's not worth to care about 3dnow prefetches for the K6
++ * because they are microcoded there and very slow.
++ */
+ static inline void prefetch(const void *x)
+ {
+ alternative_input(BASE_PREFETCH,
+@@ -649,8 +732,11 @@ static inline void prefetch(const void *
+ "r" (x));
+ }
+
+-/* 3dnow! prefetch to get an exclusive cache line. Useful for
+- spinlocks to avoid one state transition in the cache coherency protocol. */
++/*
++ * 3dnow prefetch to get an exclusive cache line.
++ * Useful for spinlocks to avoid one state transition in the
++ * cache coherency protocol:
++ */
+ static inline void prefetchw(const void *x)
+ {
+ alternative_input(BASE_PREFETCH,
+@@ -659,21 +745,25 @@ static inline void prefetchw(const void
+ "r" (x));
+ }
+
+-#define spin_lock_prefetch(x) prefetchw(x)
++static inline void spin_lock_prefetch(const void *x)
++{
++ prefetchw(x);
++}
++
+ #ifdef CONFIG_X86_32
+ /*
+ * User space process size: 3GB (default).
+ */
+-#define TASK_SIZE (PAGE_OFFSET)
+-#define STACK_TOP TASK_SIZE
+-#define STACK_TOP_MAX STACK_TOP
-
--#ifdef CONFIG_X86_LOCAL_APIC
--static __inline int logical_smp_processor_id(void)
--{
-- /* we don't want to mark this access volatile - bad code generation */
-- return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR));
+-#define INIT_THREAD { \
+- .sp0 = sizeof(init_stack) + (long)&init_stack, \
+- .vm86_info = NULL, \
+- .sysenter_cs = __KERNEL_CS, \
+- .io_bitmap_ptr = NULL, \
+- .fs = __KERNEL_PERCPU, \
++#define TASK_SIZE PAGE_OFFSET
++#define STACK_TOP TASK_SIZE
++#define STACK_TOP_MAX STACK_TOP
++
++#define INIT_THREAD { \
++ .sp0 = sizeof(init_stack) + (long)&init_stack, \
++ .vm86_info = NULL, \
++ .sysenter_cs = __KERNEL_CS, \
++ .io_bitmap_ptr = NULL, \
++ .fs = __KERNEL_PERCPU, \
+ }
+
+ /*
+@@ -682,28 +772,15 @@ static inline void prefetchw(const void
+ * permission bitmap. The extra byte must be all 1 bits, and must
+ * be within the limit.
+ */
+-#define INIT_TSS { \
+- .x86_tss = { \
++#define INIT_TSS { \
++ .x86_tss = { \
+ .sp0 = sizeof(init_stack) + (long)&init_stack, \
+- .ss0 = __KERNEL_DS, \
+- .ss1 = __KERNEL_CS, \
+- .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \
+- }, \
+- .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, \
-}
-
--static inline int hard_smp_processor_id(void)
--{
-- /* we don't want to mark this access volatile - bad code generation */
-- return GET_APIC_ID(*(u32 *)(APIC_BASE + APIC_ID));
--}
--#endif
+-#define start_thread(regs, new_eip, new_esp) do { \
+- __asm__("movl %0,%%gs": :"r" (0)); \
+- regs->fs = 0; \
+- set_fs(USER_DS); \
+- regs->ds = __USER_DS; \
+- regs->es = __USER_DS; \
+- regs->ss = __USER_DS; \
+- regs->cs = __USER_CS; \
+- regs->ip = new_eip; \
+- regs->sp = new_esp; \
+-} while (0)
-
--#endif
++ .ss0 = __KERNEL_DS, \
++ .ss1 = __KERNEL_CS, \
++ .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \
++ }, \
++ .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, \
++}
+
+ extern unsigned long thread_saved_pc(struct task_struct *tsk);
+
+@@ -731,24 +808,24 @@ extern unsigned long thread_saved_pc(str
+ __regs__ - 1; \
+ })
+
+-#define KSTK_ESP(task) (task_pt_regs(task)->sp)
++#define KSTK_ESP(task) (task_pt_regs(task)->sp)
+
+ #else
+ /*
+ * User space process size. 47bits minus one guard page.
+ */
+-#define TASK_SIZE64 (0x800000000000UL - 4096)
++#define TASK_SIZE64 ((1UL << 47) - PAGE_SIZE)
+
+ /* This decides where the kernel will search for a free chunk of vm
+ * space during mmap's.
+ */
+-#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? \
+- 0xc0000000 : 0xFFFFe000)
++#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? \
++ 0xc0000000 : 0xFFFFe000)
+
+-#define TASK_SIZE (test_thread_flag(TIF_IA32) ? \
+- IA32_PAGE_OFFSET : TASK_SIZE64)
+-#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? \
+- IA32_PAGE_OFFSET : TASK_SIZE64)
++#define TASK_SIZE (test_thread_flag(TIF_IA32) ? \
++ IA32_PAGE_OFFSET : TASK_SIZE64)
++#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? \
++ IA32_PAGE_OFFSET : TASK_SIZE64)
+
+ #define STACK_TOP TASK_SIZE
+ #define STACK_TOP_MAX TASK_SIZE64
+@@ -761,33 +838,32 @@ extern unsigned long thread_saved_pc(str
+ .x86_tss.sp0 = (unsigned long)&init_stack + sizeof(init_stack) \
+ }
+
+-#define start_thread(regs, new_rip, new_rsp) do { \
+- asm volatile("movl %0,%%fs; movl %0,%%es; movl %0,%%ds": :"r" (0)); \
+- load_gs_index(0); \
+- (regs)->ip = (new_rip); \
+- (regs)->sp = (new_rsp); \
+- write_pda(oldrsp, (new_rsp)); \
+- (regs)->cs = __USER_CS; \
+- (regs)->ss = __USER_DS; \
+- (regs)->flags = 0x200; \
+- set_fs(USER_DS); \
+-} while (0)
-
---- a/include/asm-x86/mach-xen/asm/smp.h
-+++ b/include/asm-x86/mach-xen/asm/smp.h
+ /*
+ * Return saved PC of a blocked thread.
+ * What is this good for? it will be always the scheduler or ret_from_fork.
+ */
+-#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.sp - 8))
++#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.sp - 8))
+
+-#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1)
+-#define KSTK_ESP(tsk) -1 /* sorry. doesn't work for syscall. */
++#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1)
++#define KSTK_ESP(tsk) -1 /* sorry. doesn't work for syscall. */
+ #endif /* CONFIG_X86_64 */
+
+-/* This decides where the kernel will search for a free chunk of vm
++extern void start_thread(struct pt_regs *regs, unsigned long new_ip,
++ unsigned long new_sp);
++
++/*
++ * This decides where the kernel will search for a free chunk of vm
+ * space during mmap's.
+ */
+ #define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3))
+
+-#define KSTK_EIP(task) (task_pt_regs(task)->ip)
++#define KSTK_EIP(task) (task_pt_regs(task)->ip)
++
++/* Get/set a process' ability to use the timestamp counter instruction */
++#define GET_TSC_CTL(adr) get_tsc_mode((adr))
++#define SET_TSC_CTL(val) set_tsc_mode((val))
++
++extern int get_tsc_mode(unsigned long adr);
++extern int set_tsc_mode(unsigned int val);
+
+ #endif
+--- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/segment.h 2009-03-16 16:33:40.000000000 +0100
++++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/segment.h 2009-03-16 16:38:05.000000000 +0100
+@@ -191,13 +191,14 @@
+ #define SEGMENT_TI_MASK 0x4
+
+ #define IDT_ENTRIES 256
++#define NUM_EXCEPTION_VECTORS 32
+ #define GDT_SIZE (GDT_ENTRIES * 8)
+ #define GDT_ENTRY_TLS_ENTRIES 3
+ #define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8)
+
+ #ifdef __KERNEL__
+ #ifndef __ASSEMBLY__
+-extern const char early_idt_handlers[IDT_ENTRIES][10];
++extern const char early_idt_handlers[NUM_EXCEPTION_VECTORS][10];
+ #endif
+ #endif
+
+--- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/smp.h 2009-02-16 16:18:36.000000000 +0100
++++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/smp.h 2009-03-16 16:38:05.000000000 +0100
@@ -1,5 +1,227 @@
-#ifdef CONFIG_X86_32
-# include "smp_32.h"
+extern void unlock_ipi_call_lock(void);
+#endif /* __ASSEMBLY__ */
#endif
---- a/include/asm-x86/mach-xen/asm/spinlock.h
-+++ b/include/asm-x86/mach-xen/asm/spinlock.h
+--- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/smp_32.h 2009-03-16 16:33:40.000000000 +0100
++++ /dev/null 1970-01-01 00:00:00.000000000 +0000
+@@ -1,178 +0,0 @@
+-#ifndef __ASM_SMP_H
+-#define __ASM_SMP_H
+-
+-#ifndef __ASSEMBLY__
+-#include <linux/cpumask.h>
+-#include <linux/init.h>
+-
+-/*
+- * We need the APIC definitions automatically as part of 'smp.h'
+- */
+-#ifdef CONFIG_X86_LOCAL_APIC
+-# include <asm/mpspec.h>
+-# include <asm/apic.h>
+-# ifdef CONFIG_X86_IO_APIC
+-# include <asm/io_apic.h>
+-# endif
+-#endif
+-
+-#define cpu_callout_map cpu_possible_map
+-#define cpu_callin_map cpu_possible_map
+-
+-extern int smp_num_siblings;
+-extern unsigned int num_processors;
+-
+-extern void smp_alloc_memory(void);
+-extern void lock_ipi_call_lock(void);
+-extern void unlock_ipi_call_lock(void);
+-
+-extern void (*mtrr_hook) (void);
+-extern void zap_low_mappings (void);
+-
+-DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
+-DECLARE_PER_CPU(cpumask_t, cpu_core_map);
+-DECLARE_PER_CPU(u8, cpu_llc_id);
+-DECLARE_PER_CPU(u8, x86_cpu_to_apicid);
+-
+-#ifdef CONFIG_HOTPLUG_CPU
+-extern void cpu_exit_clear(void);
+-extern void cpu_uninit(void);
+-#endif
+-
+-#ifdef CONFIG_SMP
+-
+-#ifndef CONFIG_XEN
+-
+-/* Globals due to paravirt */
+-extern void set_cpu_sibling_map(int cpu);
+-
+-struct smp_ops
+-{
+- void (*smp_prepare_boot_cpu)(void);
+- void (*smp_prepare_cpus)(unsigned max_cpus);
+- int (*cpu_up)(unsigned cpu);
+- void (*smp_cpus_done)(unsigned max_cpus);
+-
+- void (*smp_send_stop)(void);
+- void (*smp_send_reschedule)(int cpu);
+- int (*smp_call_function_mask)(cpumask_t mask,
+- void (*func)(void *info), void *info,
+- int wait);
+-};
+-
+-extern struct smp_ops smp_ops;
+-
+-static inline void smp_prepare_boot_cpu(void)
+-{
+- smp_ops.smp_prepare_boot_cpu();
+-}
+-static inline void smp_prepare_cpus(unsigned int max_cpus)
+-{
+- smp_ops.smp_prepare_cpus(max_cpus);
+-}
+-static inline int __cpu_up(unsigned int cpu)
+-{
+- return smp_ops.cpu_up(cpu);
+-}
+-static inline void smp_cpus_done(unsigned int max_cpus)
+-{
+- smp_ops.smp_cpus_done(max_cpus);
+-}
+-
+-static inline void smp_send_stop(void)
+-{
+- smp_ops.smp_send_stop();
+-}
+-static inline void smp_send_reschedule(int cpu)
+-{
+- smp_ops.smp_send_reschedule(cpu);
+-}
+-static inline int smp_call_function_mask(cpumask_t mask,
+- void (*func) (void *info), void *info,
+- int wait)
+-{
+- return smp_ops.smp_call_function_mask(mask, func, info, wait);
+-}
+-
+-void native_smp_prepare_boot_cpu(void);
+-void native_smp_prepare_cpus(unsigned int max_cpus);
+-int native_cpu_up(unsigned int cpunum);
+-void native_smp_cpus_done(unsigned int max_cpus);
+-
+-#ifndef CONFIG_PARAVIRT
+-#define startup_ipi_hook(phys_apicid, start_eip, start_esp) do { } while (0)
+-#endif
+-
+-#else /* CONFIG_XEN */
+-
+-void xen_smp_send_stop(void);
+-void xen_smp_send_reschedule(int cpu);
+-int xen_smp_call_function_mask(cpumask_t mask,
+- void (*func) (void *info), void *info,
+- int wait);
+-
+-#define smp_send_stop xen_smp_send_stop
+-#define smp_send_reschedule xen_smp_send_reschedule
+-#define smp_call_function_mask xen_smp_call_function_mask
+-
+-extern void prefill_possible_map(void);
+-
+-#endif /* CONFIG_XEN */
+-
+-extern int __cpu_disable(void);
+-extern void __cpu_die(unsigned int cpu);
+-
+-/*
+- * This function is needed by all SMP systems. It must _always_ be valid
+- * from the initial startup. We map APIC_BASE very early in page_setup(),
+- * so this is correct in the x86 case.
+- */
+-DECLARE_PER_CPU(int, cpu_number);
+-#define raw_smp_processor_id() (x86_read_percpu(cpu_number))
+-
+-#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
+-
+-#define safe_smp_processor_id() smp_processor_id()
+-
+-/* We don't mark CPUs online until __cpu_up(), so we need another measure */
+-static inline int num_booting_cpus(void)
+-{
+- return cpus_weight(cpu_callout_map);
+-}
+-
+-#else /* CONFIG_SMP */
+-
+-#define safe_smp_processor_id() 0
+-#define cpu_physical_id(cpu) boot_cpu_physical_apicid
+-
+-#endif /* !CONFIG_SMP */
+-
+-#ifdef CONFIG_X86_LOCAL_APIC
+-
+-static __inline int logical_smp_processor_id(void)
+-{
+- /* we don't want to mark this access volatile - bad code generation */
+- return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR));
+-}
+-
+-# ifdef APIC_DEFINITION
+-extern int hard_smp_processor_id(void);
+-# else
+-# include <mach_apicdef.h>
+-static inline int hard_smp_processor_id(void)
+-{
+- /* we don't want to mark this access volatile - bad code generation */
+- return GET_APIC_ID(*(u32 *)(APIC_BASE + APIC_ID));
+-}
+-# endif /* APIC_DEFINITION */
+-
+-#else /* CONFIG_X86_LOCAL_APIC */
+-
+-# ifndef CONFIG_SMP
+-# define hard_smp_processor_id() 0
+-# endif
+-
+-#endif /* CONFIG_X86_LOCAL_APIC */
+-
+-#endif /* !ASSEMBLY */
+-#endif
+--- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/smp_64.h 2009-03-16 16:33:40.000000000 +0100
++++ /dev/null 1970-01-01 00:00:00.000000000 +0000
+@@ -1,103 +0,0 @@
+-#ifndef __ASM_SMP_H
+-#define __ASM_SMP_H
+-
+-#include <linux/cpumask.h>
+-#include <linux/init.h>
+-
+-#ifdef CONFIG_X86_LOCAL_APIC
+-/*
+- * We need the APIC definitions automatically as part of 'smp.h'
+- */
+-#include <asm/apic.h>
+-#ifdef CONFIG_X86_IO_APIC
+-#include <asm/io_apic.h>
+-#endif
+-#include <asm/mpspec.h>
+-#endif
+-#include <asm/pda.h>
+-#include <asm/thread_info.h>
+-
+-extern cpumask_t cpu_initialized;
+-
+-extern int smp_num_siblings;
+-extern unsigned int num_processors;
+-
+-extern void smp_alloc_memory(void);
+-extern void lock_ipi_call_lock(void);
+-extern void unlock_ipi_call_lock(void);
+-
+-extern int smp_call_function_mask(cpumask_t mask, void (*func)(void *),
+- void *info, int wait);
+-
+-DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
+-DECLARE_PER_CPU(cpumask_t, cpu_core_map);
+-DECLARE_PER_CPU(u16, cpu_llc_id);
+-DECLARE_PER_CPU(u16, x86_cpu_to_apicid);
+-DECLARE_PER_CPU(u16, x86_bios_cpu_apicid);
+-
+-#ifdef CONFIG_X86_LOCAL_APIC
+-static inline int cpu_present_to_apicid(int mps_cpu)
+-{
+- if (cpu_present(mps_cpu))
+- return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu);
+- else
+- return BAD_APICID;
+-}
+-#endif
+-
+-#ifdef CONFIG_SMP
+-
+-#define SMP_TRAMPOLINE_BASE 0x6000
+-
+-extern int __cpu_disable(void);
+-extern void __cpu_die(unsigned int cpu);
+-extern void prefill_possible_map(void);
+-extern unsigned __cpuinitdata disabled_cpus;
+-
+-#define raw_smp_processor_id() read_pda(cpunumber)
+-#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
+-
+-#define stack_smp_processor_id() \
+- ({ \
+- struct thread_info *ti; \
+- __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \
+- ti->cpu; \
+-})
+-
+-/*
+- * On x86 all CPUs are mapped 1:1 to the APIC space. This simplifies
+- * scheduling and IPI sending and compresses data structures.
+- */
+-static inline int num_booting_cpus(void)
+-{
+- return cpus_weight(cpu_possible_map);
+-}
+-
+-extern void smp_send_reschedule(int cpu);
+-
+-#else /* CONFIG_SMP */
+-
+-extern unsigned int boot_cpu_id;
+-#define cpu_physical_id(cpu) boot_cpu_id
+-#define stack_smp_processor_id() 0
+-
+-#endif /* !CONFIG_SMP */
+-
+-#define safe_smp_processor_id() smp_processor_id()
+-
+-#ifdef CONFIG_X86_LOCAL_APIC
+-static __inline int logical_smp_processor_id(void)
+-{
+- /* we don't want to mark this access volatile - bad code generation */
+- return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR));
+-}
+-
+-static inline int hard_smp_processor_id(void)
+-{
+- /* we don't want to mark this access volatile - bad code generation */
+- return GET_APIC_ID(*(u32 *)(APIC_BASE + APIC_ID));
+-}
+-#endif
+-
+-#endif
+-
+--- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/spinlock.h 2009-03-16 16:33:40.000000000 +0100
++++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/spinlock.h 2009-03-16 16:38:05.000000000 +0100
@@ -88,7 +88,7 @@ extern void xen_spin_kick(raw_spinlock_t
: "memory", "cc")
{
unsigned int token, count;
bool free;
---- a/include/asm-x86/mach-xen/asm/swiotlb_32.h
-+++ /dev/null
+--- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/swiotlb.h 2009-02-16 16:18:36.000000000 +0100
++++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/swiotlb.h 2009-03-16 16:38:05.000000000 +0100
+@@ -1,5 +1,8 @@
+-#ifdef CONFIG_X86_32
+-# include "swiotlb_32.h"
+-#else
+-# include "../../swiotlb.h"
+-#endif
++#ifndef _ASM_SWIOTLB_H
++
++#include "../../swiotlb.h"
++
++dma_addr_t swiotlb_map_single_phys(struct device *, phys_addr_t, size_t size,
++ int dir);
++
++#endif /* _ASM_SWIOTLB_H */
+--- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/swiotlb_32.h 2009-05-14 10:56:29.000000000 +0200
++++ /dev/null 1970-01-01 00:00:00.000000000 +0000
@@ -1,43 +0,0 @@
-#ifndef _ASM_SWIOTLB_H
-#define _ASM_SWIOTLB_H 1
-#endif
-
-#endif
---- a/include/asm-x86/mach-xen/asm/swiotlb.h
-+++ b/include/asm-x86/mach-xen/asm/swiotlb.h
-@@ -1,5 +1,8 @@
--#ifdef CONFIG_X86_32
--# include "swiotlb_32.h"
--#else
--# include "../../swiotlb.h"
--#endif
-+#ifndef _ASM_SWIOTLB_H
-+
-+#include "../../swiotlb.h"
-+
-+dma_addr_t swiotlb_map_single_phys(struct device *, phys_addr_t, size_t size,
-+ int dir);
-+
-+#endif /* _ASM_SWIOTLB_H */
---- a/include/asm-x86/mach-xen/asm/system.h
-+++ b/include/asm-x86/mach-xen/asm/system.h
+--- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/system.h 2009-03-16 16:33:40.000000000 +0100
++++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/system.h 2009-03-16 16:38:05.000000000 +0100
@@ -28,22 +28,44 @@ struct task_struct *__switch_to(struct t
* Saving eflags is important. It switches not only IOPL between tasks,
* it also protects other tasks from NT leaking through sysenter etc.
#else
#define smp_mb() barrier()
#define smp_rmb() barrier()
---- a/include/asm-x86/mach-xen/asm/tlbflush.h
-+++ b/include/asm-x86/mach-xen/asm/tlbflush.h
+--- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/tlbflush.h 2009-03-16 16:33:40.000000000 +0100
++++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/tlbflush.h 2009-03-16 16:38:05.000000000 +0100
@@ -86,8 +86,7 @@ static inline void flush_tlb_range(struc
#define TLBSTATE_LAZY 2
struct mm_struct *active_mm;
int state;
char __cacheline_padding[L1_CACHE_BYTES-8];
---- a/include/asm-x86/mach-xen/asm/vga.h
-+++ b/include/asm-x86/mach-xen/asm/vga.h
+--- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/vga.h 2009-05-14 10:56:29.000000000 +0200
++++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/vga.h 2009-03-16 16:38:05.000000000 +0100
@@ -12,9 +12,9 @@
* access the videoram directly without any black magic.
*/
+#define vga_writeb(x, y) (*(y) = (x))
#endif
---- a/include/asm-x86/mach-xen/asm/xor_64.h
-+++ b/include/asm-x86/mach-xen/asm/xor_64.h
+--- sle11-2009-05-14.orig/include/asm-x86/mach-xen/asm/xor_64.h 2009-05-14 10:56:29.000000000 +0200
++++ sle11-2009-05-14/include/asm-x86/mach-xen/asm/xor_64.h 2009-03-16 16:38:05.000000000 +0100
@@ -1,20 +1,23 @@
/*
- * x86-64 changes / gcc fixes from Andi Kleen.
/* We force the use of the SSE xor block because it can write around L2.
We may also be able to load into the L1 only depending on how the cpu
---- a/include/asm-x86/scatterlist.h
-+++ b/include/asm-x86/scatterlist.h
+--- sle11-2009-05-14.orig/include/asm-x86/scatterlist.h 2009-05-14 10:56:29.000000000 +0200
++++ sle11-2009-05-14/include/asm-x86/scatterlist.h 2009-03-16 16:38:05.000000000 +0100
@@ -24,7 +24,7 @@ struct scatterlist {
* returns.
*/
# define sg_dma_len(sg) ((sg)->length)
#else
# define sg_dma_len(sg) ((sg)->dma_length)
---- a/include/linux/page-flags.h
-+++ b/include/linux/page-flags.h
-@@ -276,18 +276,25 @@ static inline void SetPageUptodate(struc
+--- sle11-2009-05-14.orig/include/linux/page-flags.h 2009-03-16 16:33:40.000000000 +0100
++++ sle11-2009-05-14/include/linux/page-flags.h 2009-03-16 16:38:05.000000000 +0100
+@@ -278,18 +278,25 @@ static inline void SetPageUptodate(struc
CLEARPAGEFLAG(Uptodate, uptodate)
extern void cancel_dirty_page(struct page *page, unsigned int account_size);
---- a/include/xen/balloon.h
-+++ b/include/xen/balloon.h
+--- sle11-2009-05-14.orig/include/xen/balloon.h 2008-11-25 12:35:56.000000000 +0100
++++ sle11-2009-05-14/include/xen/balloon.h 2009-03-16 16:38:05.000000000 +0100
@@ -31,9 +31,12 @@
* IN THE SOFTWARE.
*/
-#endif /* __ASM_BALLOON_H__ */
+#endif /* __XEN_BALLOON_H__ */
---- a/include/xen/interface/grant_table.h
-+++ b/include/xen/interface/grant_table.h
+--- sle11-2009-05-14.orig/include/xen/interface/grant_table.h 2008-11-25 12:22:34.000000000 +0100
++++ sle11-2009-05-14/include/xen/interface/grant_table.h 2009-03-16 16:38:05.000000000 +0100
@@ -193,6 +193,7 @@ struct gnttab_map_grant_ref {
grant_handle_t handle;
uint64_t dev_bus_addr;
typedef struct gnttab_query_size gnttab_query_size_t;
DEFINE_XEN_GUEST_HANDLE(gnttab_query_size_t);
---- a/include/xen/interface/io/fbif.h
-+++ b/include/xen/interface/io/fbif.h
+--- sle11-2009-05-14.orig/include/xen/interface/io/fbif.h 2008-11-25 12:35:56.000000000 +0100
++++ sle11-2009-05-14/include/xen/interface/io/fbif.h 2009-03-16 16:38:05.000000000 +0100
@@ -150,7 +150,12 @@ struct xenfb_page
* framebuffer with a max resolution of 12,800x10,240. Should
* be enough for a while with room leftover for expansion.
};
/*
---- a/include/xen/interface/memory.h
-+++ b/include/xen/interface/memory.h
+--- sle11-2009-05-14.orig/include/xen/interface/memory.h 2009-02-16 16:17:21.000000000 +0100
++++ sle11-2009-05-14/include/xen/interface/memory.h 2009-03-16 16:38:05.000000000 +0100
@@ -62,7 +62,7 @@ struct xen_memory_reservation {
* OUT: GMFN bases of extents that were allocated
* (NB. This command also updates the mach_to_phys translation table)
};
DEFINE_XEN_GUEST_HANDLE_STRUCT(xen_translate_gpfn_list);
typedef struct xen_translate_gpfn_list xen_translate_gpfn_list_t;
---- a/include/xen/interface/vcpu.h
-+++ b/include/xen/interface/vcpu.h
+--- sle11-2009-05-14.orig/include/xen/interface/vcpu.h 2008-11-25 12:35:56.000000000 +0100
++++ sle11-2009-05-14/include/xen/interface/vcpu.h 2009-03-16 16:38:05.000000000 +0100
@@ -85,6 +85,7 @@ struct vcpu_runstate_info {
*/
uint64_t time[4];
typedef struct vcpu_register_vcpu_info vcpu_register_vcpu_info_t;
DEFINE_XEN_GUEST_HANDLE(vcpu_register_vcpu_info_t);
---- a/lib/swiotlb-xen.c
-+++ b/lib/swiotlb-xen.c
+--- sle11-2009-05-14.orig/lib/swiotlb-xen.c 2009-03-16 16:33:40.000000000 +0100
++++ sle11-2009-05-14/lib/swiotlb-xen.c 2009-03-16 16:38:05.000000000 +0100
@@ -20,6 +20,7 @@
#include <linux/ctype.h>
#include <linux/init.h>