--- /dev/null
+From: kernel.org
+Subject: 2.6.25
+Patch-mainline: 2.6.25
+
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+Automatically created from "patches.kernel.org/patch-2.6.25" by xen-port-patches.py
+
+--- sle11-2009-06-29.orig/arch/x86/Kconfig 2009-02-16 16:18:36.000000000 +0100
++++ sle11-2009-06-29/arch/x86/Kconfig 2009-03-16 16:33:40.000000000 +0100
+@@ -27,7 +27,7 @@ config X86
+ select HAVE_KRETPROBES
+ select HAVE_DYNAMIC_FTRACE
+ select HAVE_FTRACE
+- select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64)
++ select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) && !XEN
+ select HAVE_ARCH_KGDB if !X86_VOYAGER
+ select HAVE_ARCH_TRACEHOOK
+ select HAVE_GENERIC_DMA_COHERENT if X86_32
+@@ -211,14 +211,12 @@ config X86_TRAMPOLINE
+ default y
+
+ config X86_NO_TSS
+- bool
++ def_bool y
+ depends on XEN
+- default y
+
+ config X86_NO_IDT
+- bool
++ def_bool y
+ depends on XEN
+- default y
+
+ config KTIME_SCALAR
+ def_bool X86_32
+@@ -728,9 +726,8 @@ config X86_VISWS_APIC
+ depends on X86_32 && X86_VISWS
+
+ config X86_XEN_GENAPIC
+- bool
++ def_bool y
+ depends on X86_64_XEN
+- default y
+
+ config X86_MCE
+ bool "Machine Check Exception"
+@@ -1117,7 +1114,7 @@ config ARCH_DISCONTIGMEM_DEFAULT
+
+ config ARCH_SPARSEMEM_DEFAULT
+ def_bool y
+- depends on X86_64
++ depends on X86_64 && !X86_64_XEN
+
+ config ARCH_SPARSEMEM_ENABLE
+ def_bool y
+@@ -1747,10 +1744,10 @@ config PCI_MMCONFIG
+ depends on X86_64 && PCI && ACPI
+
+ config XEN_PCIDEV_FRONTEND
+- bool "Xen PCI Frontend" if X86_64
++ def_bool y
++ prompt "Xen PCI Frontend" if X86_64
+ depends on PCI && XEN && (PCI_GOXEN_FE || PCI_GOANY || X86_64)
+ select HOTPLUG
+- default y
+ help
+ The PCI device frontend driver allows the kernel to import arbitrary
+ PCI devices from a PCI backend to support PCI driver domains.
+@@ -1758,7 +1755,6 @@ config XEN_PCIDEV_FRONTEND
+ config XEN_PCIDEV_FE_DEBUG
+ bool "Xen PCI Frontend Debugging"
+ depends on XEN_PCIDEV_FRONTEND
+- default n
+ help
+ Enables some debug statements within the PCI Frontend.
+
+--- sle11-2009-06-29.orig/arch/x86/Kconfig.debug 2009-02-02 09:40:56.000000000 +0100
++++ sle11-2009-06-29/arch/x86/Kconfig.debug 2009-03-16 16:33:40.000000000 +0100
+@@ -279,6 +279,7 @@ config DEBUG_BOOT_PARAMS
+ bool "Debug boot parameters"
+ depends on DEBUG_KERNEL
+ depends on DEBUG_FS
++ depends on !XEN
+ help
+ This option will cause struct boot_params to be exported via debugfs.
+
+--- sle11-2009-06-29.orig/arch/x86/ia32/ia32entry-xen.S 2009-02-16 16:18:36.000000000 +0100
++++ sle11-2009-06-29/arch/x86/ia32/ia32entry-xen.S 2009-03-16 16:33:40.000000000 +0100
+@@ -12,7 +12,6 @@
+ #include <asm/ia32_unistd.h>
+ #include <asm/thread_info.h>
+ #include <asm/segment.h>
+-#include <asm/vsyscall32.h>
+ #include <asm/irqflags.h>
+ #include <linux/linkage.h>
+
+@@ -99,10 +98,11 @@ ENTRY(ia32_sysenter_target)
+ CFI_RESTORE rcx
+ movl %ebp,%ebp /* zero extension */
+ movl %eax,%eax
++ movl 48-THREAD_SIZE+threadinfo_sysenter_return(%rsp),%r10d
+ movl $__USER32_DS,40(%rsp)
+ movq %rbp,32(%rsp)
+ movl $__USER32_CS,16(%rsp)
+- movl $VSYSCALL32_SYSEXIT,8(%rsp)
++ movq %r10,8(%rsp)
+ movq %rax,(%rsp)
+ cld
+ SAVE_ARGS 0,0,1
+@@ -582,8 +582,8 @@ ia32_sys_call_table:
+ .quad compat_sys_futex /* 240 */
+ .quad compat_sys_sched_setaffinity
+ .quad compat_sys_sched_getaffinity
+- .quad sys32_set_thread_area
+- .quad sys32_get_thread_area
++ .quad sys_set_thread_area
++ .quad sys_get_thread_area
+ .quad compat_sys_io_setup /* 245 */
+ .quad sys_io_destroy
+ .quad compat_sys_io_getevents
+@@ -661,7 +661,9 @@ ia32_sys_call_table:
+ .quad sys_epoll_pwait
+ .quad compat_sys_utimensat /* 320 */
+ .quad compat_sys_signalfd
+- .quad compat_sys_timerfd
++ .quad sys_timerfd_create
+ .quad sys_eventfd
+ .quad sys32_fallocate
++ .quad compat_sys_timerfd_settime /* 325 */
++ .quad compat_sys_timerfd_gettime
+ ia32_syscall_end:
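The table indices follow from the /* 320 */ marker above: slot 322 goes from compat_sys_timerfd to sys_timerfd_create, and the timerfd_settime/gettime split lands in slots 325/326. For illustration only, a minimal userspace sketch of this kind of number-indexed dispatch table; the stub handlers and the -38 (-ENOSYS) constant are assumptions, not kernel code:

#include <stdio.h>

typedef long (*syscall_fn)(void);

static long sys_timerfd_create_stub(void)         { return 0; }
static long compat_sys_timerfd_settime_stub(void) { return 0; }
static long compat_sys_timerfd_gettime_stub(void) { return 0; }

/* Sparse table: unnamed slots stay NULL, standing in for unimplemented calls. */
static syscall_fn table_sketch[] = {
	[322] = sys_timerfd_create_stub,		/* was compat_sys_timerfd */
	[325] = compat_sys_timerfd_settime_stub,	/* new in 2.6.25 */
	[326] = compat_sys_timerfd_gettime_stub,
};

static long dispatch_sketch(unsigned int nr)
{
	unsigned int max = sizeof(table_sketch) / sizeof(table_sketch[0]);

	if (nr >= max || !table_sketch[nr])
		return -38;	/* assumed stand-in for -ENOSYS */
	return table_sketch[nr]();
}

int main(void)
{
	printf("nr 322 -> %ld, nr 999 -> %ld\n",
	       dispatch_sketch(322), dispatch_sketch(999));
	return 0;
}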
+--- sle11-2009-06-29.orig/arch/x86/kernel/Makefile 2009-02-16 16:18:36.000000000 +0100
++++ sle11-2009-06-29/arch/x86/kernel/Makefile 2009-03-16 16:33:40.000000000 +0100
+@@ -120,11 +120,10 @@ ifeq ($(CONFIG_X86_64),y)
+
+ obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o
+
++ obj-$(CONFIG_XEN) += nmi_64.o
+ time_64-$(CONFIG_XEN) += time_32.o
+ pci-dma_64-$(CONFIG_XEN) += pci-dma_32.o
+ endif
+
+ disabled-obj-$(CONFIG_XEN) := early-quirks.o hpet.o i8253.o i8259_$(BITS).o reboot.o \
+ smpboot_$(BITS).o tsc_$(BITS).o tsc_sync.o
+-disabled-obj-$(CONFIG_XEN_UNPRIVILEGED_GUEST) += mpparse_64.o
+-%/head_64.o %/head_64.s: asflags-$(CONFIG_XEN) :=
+--- sle11-2009-06-29.orig/arch/x86/kernel/acpi/boot.c 2008-12-01 11:11:08.000000000 +0100
++++ sle11-2009-06-29/arch/x86/kernel/acpi/boot.c 2009-03-16 16:33:40.000000000 +0100
+@@ -133,6 +133,9 @@ char *__init __acpi_map_table(unsigned l
+ #ifndef CONFIG_XEN
+ if (phys+size <= (max_low_pfn_mapped << PAGE_SHIFT))
+ return __va(phys);
++#else
++ if (phys + size <= (NR_FIX_ISAMAPS << PAGE_SHIFT))
++ return isa_bus_to_virt(phys);
+ #endif
+
+ offset = phys & (PAGE_SIZE - 1);
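The hunk above gives the Xen side its own fast path: low physical memory is not identity-mapped under Xen, so tables that fit inside the statically mapped ISA range are answered via isa_bus_to_virt() instead of __va(). A standalone sketch of just the bounds check, with assumed values for the page shift and window size:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT_SK     12
#define NR_FIX_ISAMAPS_SK 256	/* assumption: pages in the premapped ISA window */
#define ISA_LIMIT_SK ((uint64_t)NR_FIX_ISAMAPS_SK << PAGE_SHIFT_SK)

static char isa_window[NR_FIX_ISAMAPS_SK << PAGE_SHIFT_SK];	/* stand-in fixmap */

/* Translate a bus address that is already covered by the fixed window. */
static void *isa_bus_to_virt_sketch(uint64_t phys)
{
	return &isa_window[phys];
}

static void *map_table_sketch(uint64_t phys, uint64_t size)
{
	if (phys + size <= ISA_LIMIT_SK)
		return isa_bus_to_virt_sketch(phys);	/* no new mapping needed */
	return NULL;	/* a real implementation falls back to a fixmap slot */
}

int main(void)
{
	printf("RSDP at 0xe0000 -> %p\n", map_table_sketch(0xe0000, 36));
	printf("above the window -> %p\n", map_table_sketch(0x200000, 36));
	return 0;
}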
+--- /dev/null 1970-01-01 00:00:00.000000000 +0000
++++ sle11-2009-06-29/arch/x86/kernel/acpi/sleep-xen.c 2009-03-16 16:33:40.000000000 +0100
+@@ -0,0 +1,95 @@
++/*
++ * sleep.c - x86-specific ACPI sleep support.
++ *
++ * Copyright (C) 2001-2003 Patrick Mochel
++ * Copyright (C) 2001-2003 Pavel Machek <pavel@suse.cz>
++ */
++
++#include <linux/acpi.h>
++#include <linux/bootmem.h>
++#include <linux/dmi.h>
++#include <linux/cpumask.h>
++
++#include <asm/smp.h>
++
++#ifndef CONFIG_ACPI_PV_SLEEP
++/* address in low memory of the wakeup routine. */
++unsigned long acpi_wakeup_address = 0;
++unsigned long acpi_realmode_flags;
++extern char wakeup_start, wakeup_end;
++
++extern unsigned long acpi_copy_wakeup_routine(unsigned long);
++#endif
++
++/**
++ * acpi_save_state_mem - save kernel state
++ *
++ * Create an identity mapped page table and copy the wakeup routine to
++ * low memory.
++ */
++int acpi_save_state_mem(void)
++{
++#ifndef CONFIG_ACPI_PV_SLEEP
++ if (!acpi_wakeup_address) {
++ printk(KERN_ERR "Could not allocate memory during boot, S3 disabled\n");
++ return -ENOMEM;
++ }
++ memcpy((void *)acpi_wakeup_address, &wakeup_start,
++ &wakeup_end - &wakeup_start);
++ acpi_copy_wakeup_routine(acpi_wakeup_address);
++#endif
++
++ return 0;
++}
++
++/*
++ * acpi_restore_state - undo effects of acpi_save_state_mem
++ */
++void acpi_restore_state_mem(void)
++{
++}
++
++
++/**
++ * acpi_reserve_bootmem - do _very_ early ACPI initialisation
++ *
++ * We allocate a page from the first 1MB of memory for the wakeup
++ * routine for when we come back from a sleep state. The
++ * runtime allocator allows specification of <16MB pages, but not
++ * <1MB pages.
++ */
++void __init acpi_reserve_bootmem(void)
++{
++#ifndef CONFIG_ACPI_PV_SLEEP
++ if ((&wakeup_end - &wakeup_start) > PAGE_SIZE*2) {
++ printk(KERN_ERR
++ "ACPI: Wakeup code way too big, S3 disabled.\n");
++ return;
++ }
++
++ acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE*2);
++ if (!acpi_wakeup_address)
++ printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n");
++#endif
++}
++
++
++#ifndef CONFIG_ACPI_PV_SLEEP
++static int __init acpi_sleep_setup(char *str)
++{
++ while ((str != NULL) && (*str != '\0')) {
++ if (strncmp(str, "s3_bios", 7) == 0)
++ acpi_realmode_flags |= 1;
++ if (strncmp(str, "s3_mode", 7) == 0)
++ acpi_realmode_flags |= 2;
++ if (strncmp(str, "s3_beep", 7) == 0)
++ acpi_realmode_flags |= 4;
++ str = strchr(str, ',');
++ if (str != NULL)
++ str += strspn(str, ", \t");
++ }
++ return 1;
++}
++
++__setup("acpi_sleep=", acpi_sleep_setup);
++#endif /* CONFIG_ACPI_PV_SLEEP */
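The acpi_sleep= handler above folds comma-separated keywords into acpi_realmode_flags (1: call the video BIOS on resume, 2: reset the video mode, 4: beep from the wakeup stub). A runnable userspace copy of that loop, with main() added purely as test scaffolding:

#include <stdio.h>
#include <string.h>

static unsigned long parse_acpi_sleep(char *str)
{
	unsigned long flags = 0;

	while (str != NULL && *str != '\0') {
		if (strncmp(str, "s3_bios", 7) == 0)
			flags |= 1;	/* call the video BIOS on resume */
		if (strncmp(str, "s3_mode", 7) == 0)
			flags |= 2;	/* reset the video mode on resume */
		if (strncmp(str, "s3_beep", 7) == 0)
			flags |= 4;	/* beep from the wakeup stub */
		str = strchr(str, ',');
		if (str != NULL)
			str += strspn(str, ", \t");
	}
	return flags;
}

int main(void)
{
	char arg[] = "s3_bios,s3_mode";

	printf("flags = %lu\n", parse_acpi_sleep(arg));	/* prints 3 */
	return 0;
}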
+--- sle11-2009-06-29.orig/arch/x86/kernel/acpi/sleep_32-xen.c 2009-02-16 16:18:36.000000000 +0100
++++ /dev/null 1970-01-01 00:00:00.000000000 +0000
+@@ -1,117 +0,0 @@
+-/*
+- * sleep.c - x86-specific ACPI sleep support.
+- *
+- * Copyright (C) 2001-2003 Patrick Mochel
+- * Copyright (C) 2001-2003 Pavel Machek <pavel@suse.cz>
+- */
+-
+-#include <linux/acpi.h>
+-#include <linux/bootmem.h>
+-#include <linux/dmi.h>
+-#include <linux/cpumask.h>
+-
+-#include <asm/smp.h>
+-
+-#ifndef CONFIG_ACPI_PV_SLEEP
+-/* address in low memory of the wakeup routine. */
+-unsigned long acpi_wakeup_address = 0;
+-unsigned long acpi_realmode_flags;
+-extern char wakeup_start, wakeup_end;
+-
+-extern unsigned long FASTCALL(acpi_copy_wakeup_routine(unsigned long));
+-#endif
+-
+-/**
+- * acpi_save_state_mem - save kernel state
+- *
+- * Create an identity mapped page table and copy the wakeup routine to
+- * low memory.
+- */
+-int acpi_save_state_mem(void)
+-{
+-#ifndef CONFIG_ACPI_PV_SLEEP
+- if (!acpi_wakeup_address)
+- return 1;
+- memcpy((void *)acpi_wakeup_address, &wakeup_start,
+- &wakeup_end - &wakeup_start);
+- acpi_copy_wakeup_routine(acpi_wakeup_address);
+-#endif
+- return 0;
+-}
+-
+-/*
+- * acpi_restore_state - undo effects of acpi_save_state_mem
+- */
+-void acpi_restore_state_mem(void)
+-{
+-}
+-
+-/**
+- * acpi_reserve_bootmem - do _very_ early ACPI initialisation
+- *
+- * We allocate a page from the first 1MB of memory for the wakeup
+- * routine for when we come back from a sleep state. The
+- * runtime allocator allows specification of <16MB pages, but not
+- * <1MB pages.
+- */
+-void __init acpi_reserve_bootmem(void)
+-{
+-#ifndef CONFIG_ACPI_PV_SLEEP
+- if ((&wakeup_end - &wakeup_start) > PAGE_SIZE) {
+- printk(KERN_ERR
+- "ACPI: Wakeup code way too big, S3 disabled.\n");
+- return;
+- }
+-
+- acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE);
+- if (!acpi_wakeup_address)
+- printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n");
+-#endif
+-}
+-
+-#ifndef CONFIG_ACPI_PV_SLEEP
+-static int __init acpi_sleep_setup(char *str)
+-{
+- while ((str != NULL) && (*str != '\0')) {
+- if (strncmp(str, "s3_bios", 7) == 0)
+- acpi_realmode_flags |= 1;
+- if (strncmp(str, "s3_mode", 7) == 0)
+- acpi_realmode_flags |= 2;
+- if (strncmp(str, "s3_beep", 7) == 0)
+- acpi_realmode_flags |= 4;
+- str = strchr(str, ',');
+- if (str != NULL)
+- str += strspn(str, ", \t");
+- }
+- return 1;
+-}
+-
+-__setup("acpi_sleep=", acpi_sleep_setup);
+-
+-/* Ouch, we want to delete this. We already have better version in userspace, in
+- s2ram from suspend.sf.net project */
+-static __init int reset_videomode_after_s3(const struct dmi_system_id *d)
+-{
+- acpi_realmode_flags |= 2;
+- return 0;
+-}
+-
+-static __initdata struct dmi_system_id acpisleep_dmi_table[] = {
+- { /* Reset video mode after returning from ACPI S3 sleep */
+- .callback = reset_videomode_after_s3,
+- .ident = "Toshiba Satellite 4030cdt",
+- .matches = {
+- DMI_MATCH(DMI_PRODUCT_NAME, "S4030CDT/4.3"),
+- },
+- },
+- {}
+-};
+-
+-static int __init acpisleep_dmi_init(void)
+-{
+- dmi_check_system(acpisleep_dmi_table);
+- return 0;
+-}
+-
+-core_initcall(acpisleep_dmi_init);
+-#endif /* CONFIG_ACPI_PV_SLEEP */
+--- sle11-2009-06-29.orig/arch/x86/kernel/acpi/sleep_64-xen.c 2009-02-16 16:18:36.000000000 +0100
++++ /dev/null 1970-01-01 00:00:00.000000000 +0000
+@@ -1,125 +0,0 @@
+-/*
+- * acpi.c - Architecture-Specific Low-Level ACPI Support
+- *
+- * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
+- * Copyright (C) 2001 Jun Nakajima <jun.nakajima@intel.com>
+- * Copyright (C) 2001 Patrick Mochel <mochel@osdl.org>
+- * Copyright (C) 2002 Andi Kleen, SuSE Labs (x86-64 port)
+- * Copyright (C) 2003 Pavel Machek, SuSE Labs
+- *
+- * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+- *
+- * This program is free software; you can redistribute it and/or modify
+- * it under the terms of the GNU General Public License as published by
+- * the Free Software Foundation; either version 2 of the License, or
+- * (at your option) any later version.
+- *
+- * This program is distributed in the hope that it will be useful,
+- * but WITHOUT ANY WARRANTY; without even the implied warranty of
+- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+- * GNU General Public License for more details.
+- *
+- * You should have received a copy of the GNU General Public License
+- * along with this program; if not, write to the Free Software
+- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+- *
+- * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+- */
+-
+-#include <linux/kernel.h>
+-#include <linux/init.h>
+-#include <linux/types.h>
+-#include <linux/stddef.h>
+-#include <linux/slab.h>
+-#include <linux/pci.h>
+-#include <linux/bootmem.h>
+-#include <linux/acpi.h>
+-#include <linux/cpumask.h>
+-
+-#include <asm/mpspec.h>
+-#include <asm/io.h>
+-#include <asm/apic.h>
+-#include <asm/apicdef.h>
+-#include <asm/page.h>
+-#include <asm/pgtable.h>
+-#include <asm/pgalloc.h>
+-#include <asm/io_apic.h>
+-#include <asm/proto.h>
+-#include <asm/tlbflush.h>
+-
+-/* --------------------------------------------------------------------------
+- Low-Level Sleep Support
+- -------------------------------------------------------------------------- */
+-
+-#ifndef CONFIG_ACPI_PV_SLEEP
+-/* address in low memory of the wakeup routine. */
+-unsigned long acpi_wakeup_address = 0;
+-unsigned long acpi_realmode_flags;
+-extern char wakeup_start, wakeup_end;
+-
+-extern unsigned long acpi_copy_wakeup_routine(unsigned long);
+-#endif
+-
+-/**
+- * acpi_save_state_mem - save kernel state
+- *
+- * Create an identity mapped page table and copy the wakeup routine to
+- * low memory.
+- */
+-int acpi_save_state_mem(void)
+-{
+-#ifndef CONFIG_ACPI_PV_SLEEP
+- memcpy((void *)acpi_wakeup_address, &wakeup_start,
+- &wakeup_end - &wakeup_start);
+- acpi_copy_wakeup_routine(acpi_wakeup_address);
+-#endif
+- return 0;
+-}
+-
+-/*
+- * acpi_restore_state
+- */
+-void acpi_restore_state_mem(void)
+-{
+-}
+-
+-/**
+- * acpi_reserve_bootmem - do _very_ early ACPI initialisation
+- *
+- * We allocate a page in low memory for the wakeup
+- * routine for when we come back from a sleep state. The
+- * runtime allocator allows specification of <16M pages, but not
+- * <1M pages.
+- */
+-void __init acpi_reserve_bootmem(void)
+-{
+-#ifndef CONFIG_ACPI_PV_SLEEP
+- acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE*2);
+- if ((&wakeup_end - &wakeup_start) > (PAGE_SIZE*2))
+- printk(KERN_CRIT
+- "ACPI: Wakeup code way too big, will crash on attempt"
+- " to suspend\n");
+-#endif
+-}
+-
+-#ifndef CONFIG_ACPI_PV_SLEEP
+-static int __init acpi_sleep_setup(char *str)
+-{
+- while ((str != NULL) && (*str != '\0')) {
+- if (strncmp(str, "s3_bios", 7) == 0)
+- acpi_realmode_flags |= 1;
+- if (strncmp(str, "s3_mode", 7) == 0)
+- acpi_realmode_flags |= 2;
+- if (strncmp(str, "s3_beep", 7) == 0)
+- acpi_realmode_flags |= 4;
+- str = strchr(str, ',');
+- if (str != NULL)
+- str += strspn(str, ", \t");
+- }
+-
+- return 1;
+-}
+-
+-__setup("acpi_sleep=", acpi_sleep_setup);
+-#endif /* CONFIG_ACPI_PV_SLEEP */
+-
+--- sle11-2009-06-29.orig/arch/x86/kernel/apic_32-xen.c 2008-12-15 11:27:22.000000000 +0100
++++ sle11-2009-06-29/arch/x86/kernel/apic_32-xen.c 2009-03-16 16:33:40.000000000 +0100
+@@ -86,7 +86,7 @@ int setup_profiling_timer(unsigned int m
+ * This initializes the IO-APIC and APIC hardware if this is
+ * a UP kernel.
+ */
+-int __init APIC_init_uniprocessor (void)
++int __init APIC_init_uniprocessor(void)
+ {
+ #ifdef CONFIG_X86_IO_APIC
+ if (smp_found_config)
+--- sle11-2009-06-29.orig/arch/x86/kernel/apic_64-xen.c 2009-02-16 16:18:36.000000000 +0100
++++ sle11-2009-06-29/arch/x86/kernel/apic_64-xen.c 2009-03-16 16:33:40.000000000 +0100
+@@ -34,34 +34,17 @@
+ #include <asm/hpet.h>
+ #include <asm/idle.h>
+
+-int apic_verbosity;
++int disable_apic;
+
+ /*
+- * 'what should we do if we get a hw irq event on an illegal vector'.
+- * each architecture has to answer this themselves.
++ * Debug level, exported for io_apic.c
+ */
+-void ack_bad_irq(unsigned int irq)
+-{
+- printk("unexpected IRQ trap at irq %02x\n", irq);
+- /*
+- * Currently unexpected vectors happen only on SMP and APIC.
+- * We _must_ ack these because every local APIC has only N
+- * irq slots per priority level, and a 'hanging, unacked' IRQ
+- * holds up an irq slot - in excessive cases (when multiple
+- * unexpected vectors occur) that might lock up the APIC
+- * completely.
+- * But don't ack when the APIC is disabled. -AK
+- */
+- if (!disable_apic)
+- ack_APIC_irq();
+-}
+-
+-int setup_profiling_timer(unsigned int multiplier)
+-{
+- return -EINVAL;
+-}
++int apic_verbosity;
+
+-void smp_local_timer_interrupt(void)
++/*
++ * The guts of the apic timer interrupt
++ */
++static void local_apic_timer_interrupt(void)
+ {
+ #ifndef CONFIG_XEN
+ int cpu = smp_processor_id();
+@@ -121,11 +104,34 @@ void smp_apic_timer_interrupt(struct pt_
+ */
+ exit_idle();
+ irq_enter();
+- smp_local_timer_interrupt();
++ local_apic_timer_interrupt();
+ irq_exit();
+ set_irq_regs(old_regs);
+ }
+
++int setup_profiling_timer(unsigned int multiplier)
++{
++ return -EINVAL;
++}
++
++/*
++ * This initializes the IO-APIC and APIC hardware if this is
++ * a UP kernel.
++ */
++int __init APIC_init_uniprocessor(void)
++{
++#ifdef CONFIG_X86_IO_APIC
++ if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
++ setup_IO_APIC();
++#endif
++
++ return 1;
++}
++
++/*
++ * Local APIC interrupts
++ */
++
+ /*
+ * This interrupt should _never_ happen with our APIC/SMP architecture
+ */
+@@ -150,7 +156,6 @@ asmlinkage void smp_spurious_interrupt(v
+ /*
+ * This interrupt should never happen with our APIC/SMP architecture
+ */
+-
+ asmlinkage void smp_error_interrupt(void)
+ {
+ unsigned int v, v1;
+@@ -178,19 +183,3 @@ asmlinkage void smp_error_interrupt(void
+ smp_processor_id(), v , v1);
+ irq_exit();
+ }
+-
+-int disable_apic;
+-
+-/*
+- * This initializes the IO-APIC and APIC hardware if this is
+- * a UP kernel.
+- */
+-int __init APIC_init_uniprocessor (void)
+-{
+-#ifdef CONFIG_X86_IO_APIC
+- if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
+- setup_IO_APIC();
+-#endif
+-
+- return 1;
+-}
+--- sle11-2009-06-29.orig/arch/x86/kernel/asm-offsets_32.c 2009-02-16 16:17:21.000000000 +0100
++++ sle11-2009-06-29/arch/x86/kernel/asm-offsets_32.c 2009-03-16 16:33:40.000000000 +0100
+@@ -23,8 +23,10 @@
+ #include <xen/interface/xen.h>
+ #endif
+
++#ifdef CONFIG_LGUEST_GUEST
+ #include <linux/lguest.h>
+ #include "../../../drivers/lguest/lg.h"
++#endif
+
+ /* workaround for a warning with -Wmissing-prototypes */
+ void foo(void);
+--- sle11-2009-06-29.orig/arch/x86/kernel/cpu/common-xen.c 2009-02-16 16:18:36.000000000 +0100
++++ sle11-2009-06-29/arch/x86/kernel/cpu/common-xen.c 2009-03-16 16:33:40.000000000 +0100
+@@ -27,45 +27,50 @@
+ #include "cpu.h"
+
+ DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
+- [GDT_ENTRY_KERNEL_CS] = { 0x0000ffff, 0x00cf9a00 },
+- [GDT_ENTRY_KERNEL_DS] = { 0x0000ffff, 0x00cf9200 },
+- [GDT_ENTRY_DEFAULT_USER_CS] = { 0x0000ffff, 0x00cffa00 },
+- [GDT_ENTRY_DEFAULT_USER_DS] = { 0x0000ffff, 0x00cff200 },
++ [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } },
++ [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } },
++ [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } },
++ [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff200 } } },
+ #ifndef CONFIG_XEN
+ /*
+ * Segments used for calling PnP BIOS have byte granularity.
+ * Their code segments and data segments have fixed 64k limits,
+ * the transfer segment sizes are set at run time.
+ */
+- [GDT_ENTRY_PNPBIOS_CS32] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */
+- [GDT_ENTRY_PNPBIOS_CS16] = { 0x0000ffff, 0x00009a00 },/* 16-bit code */
+- [GDT_ENTRY_PNPBIOS_DS] = { 0x0000ffff, 0x00009200 }, /* 16-bit data */
+- [GDT_ENTRY_PNPBIOS_TS1] = { 0x00000000, 0x00009200 },/* 16-bit data */
+- [GDT_ENTRY_PNPBIOS_TS2] = { 0x00000000, 0x00009200 },/* 16-bit data */
++ /* 32-bit code */
++ [GDT_ENTRY_PNPBIOS_CS32] = { { { 0x0000ffff, 0x00409a00 } } },
++ /* 16-bit code */
++ [GDT_ENTRY_PNPBIOS_CS16] = { { { 0x0000ffff, 0x00009a00 } } },
++ /* 16-bit data */
++ [GDT_ENTRY_PNPBIOS_DS] = { { { 0x0000ffff, 0x00009200 } } },
++ /* 16-bit data */
++ [GDT_ENTRY_PNPBIOS_TS1] = { { { 0x00000000, 0x00009200 } } },
++ /* 16-bit data */
++ [GDT_ENTRY_PNPBIOS_TS2] = { { { 0x00000000, 0x00009200 } } },
+ /*
+ * The APM segments have byte granularity and their bases
+ * are set at run time. All have 64k limits.
+ */
+- [GDT_ENTRY_APMBIOS_BASE] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */
++ /* 32-bit code */
++ [GDT_ENTRY_APMBIOS_BASE] = { { { 0x0000ffff, 0x00409a00 } } },
+ /* 16-bit code */
+- [GDT_ENTRY_APMBIOS_BASE+1] = { 0x0000ffff, 0x00009a00 },
+- [GDT_ENTRY_APMBIOS_BASE+2] = { 0x0000ffff, 0x00409200 }, /* data */
++ [GDT_ENTRY_APMBIOS_BASE+1] = { { { 0x0000ffff, 0x00009a00 } } },
++ /* data */
++ [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } },
+
+- [GDT_ENTRY_ESPFIX_SS] = { 0x00000000, 0x00c09200 },
++ [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } },
+ #endif
+- [GDT_ENTRY_PERCPU] = { 0x00000000, 0x00000000 },
++ [GDT_ENTRY_PERCPU] = { { { 0x00000000, 0x00000000 } } },
+ } };
+ EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
+
++__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
++
+ static int cachesize_override __cpuinitdata = -1;
+-static int disable_x86_fxsr __cpuinitdata;
+ static int disable_x86_serial_nr __cpuinitdata = 1;
+-static int disable_x86_sep __cpuinitdata;
+
+ struct cpu_dev * cpu_devs[X86_VENDOR_NUM] = {};
+
+-extern int disable_pse;
+-
+ static void __cpuinit default_init(struct cpuinfo_x86 * c)
+ {
+ /* Not much we can do here... */
+@@ -214,16 +219,8 @@ static void __cpuinit get_cpu_vendor(str
+
+ static int __init x86_fxsr_setup(char * s)
+ {
+- /* Tell all the other CPUs to not use it... */
+- disable_x86_fxsr = 1;
+-
+- /*
+- * ... and clear the bits early in the boot_cpu_data
+- * so that the bootup process doesn't try to do this
+- * either.
+- */
+- clear_bit(X86_FEATURE_FXSR, boot_cpu_data.x86_capability);
+- clear_bit(X86_FEATURE_XMM, boot_cpu_data.x86_capability);
++ setup_clear_cpu_cap(X86_FEATURE_FXSR);
++ setup_clear_cpu_cap(X86_FEATURE_XMM);
+ return 1;
+ }
+ __setup("nofxsr", x86_fxsr_setup);
+@@ -231,7 +228,7 @@ __setup("nofxsr", x86_fxsr_setup);
+
+ static int __init x86_sep_setup(char * s)
+ {
+- disable_x86_sep = 1;
++ setup_clear_cpu_cap(X86_FEATURE_SEP);
+ return 1;
+ }
+ __setup("nosep", x86_sep_setup);
+@@ -268,10 +265,10 @@ static int __cpuinit have_cpuid_p(void)
+ void __init cpu_detect(struct cpuinfo_x86 *c)
+ {
+ /* Get vendor name */
+- cpuid(0x00000000, &c->cpuid_level,
+- (int *)&c->x86_vendor_id[0],
+- (int *)&c->x86_vendor_id[8],
+- (int *)&c->x86_vendor_id[4]);
++ cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
++ (unsigned int *)&c->x86_vendor_id[0],
++ (unsigned int *)&c->x86_vendor_id[8],
++ (unsigned int *)&c->x86_vendor_id[4]);
+
+ c->x86 = 4;
+ if (c->cpuid_level >= 0x00000001) {
+@@ -284,9 +281,38 @@ void __init cpu_detect(struct cpuinfo_x8
+ if (c->x86 >= 0x6)
+ c->x86_model += ((tfms >> 16) & 0xF) << 4;
+ c->x86_mask = tfms & 15;
+- if (cap0 & (1<<19))
++ if (cap0 & (1<<19)) {
+ c->x86_cache_alignment = ((misc >> 8) & 0xff) * 8;
++ c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
++ }
++ }
++}
++static void __cpuinit early_get_cap(struct cpuinfo_x86 *c)
++{
++ u32 tfms, xlvl;
++ unsigned int ebx;
++
++ memset(&c->x86_capability, 0, sizeof c->x86_capability);
++ if (have_cpuid_p()) {
++ /* Intel-defined flags: level 0x00000001 */
++ if (c->cpuid_level >= 0x00000001) {
++ u32 capability, excap;
++ cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
++ c->x86_capability[0] = capability;
++ c->x86_capability[4] = excap;
++ }
++
++ /* AMD-defined flags: level 0x80000001 */
++ xlvl = cpuid_eax(0x80000000);
++ if ((xlvl & 0xffff0000) == 0x80000000) {
++ if (xlvl >= 0x80000001) {
++ c->x86_capability[1] = cpuid_edx(0x80000001);
++ c->x86_capability[6] = cpuid_ecx(0x80000001);
++ }
++ }
++
+ }
++
+ }
+
+ /* Do minimum CPU detection early.
+@@ -300,6 +326,7 @@ static void __init early_cpu_detect(void
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+
+ c->x86_cache_alignment = 32;
++ c->x86_clflush_size = 32;
+
+ if (!have_cpuid_p())
+ return;
+@@ -307,19 +334,30 @@ static void __init early_cpu_detect(void
+ cpu_detect(c);
+
+ get_cpu_vendor(c, 1);
++
++ switch (c->x86_vendor) {
++ case X86_VENDOR_AMD:
++ early_init_amd(c);
++ break;
++ case X86_VENDOR_INTEL:
++ early_init_intel(c);
++ break;
++ }
++
++ early_get_cap(c);
+ }
+
+ static void __cpuinit generic_identify(struct cpuinfo_x86 * c)
+ {
+ u32 tfms, xlvl;
+- int ebx;
++ unsigned int ebx;
+
+ if (have_cpuid_p()) {
+ /* Get vendor name */
+- cpuid(0x00000000, &c->cpuid_level,
+- (int *)&c->x86_vendor_id[0],
+- (int *)&c->x86_vendor_id[8],
+- (int *)&c->x86_vendor_id[4]);
++ cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
++ (unsigned int *)&c->x86_vendor_id[0],
++ (unsigned int *)&c->x86_vendor_id[8],
++ (unsigned int *)&c->x86_vendor_id[4]);
+
+ get_cpu_vendor(c, 0);
+ /* Initialize the standard set of capabilities */
+@@ -364,8 +402,6 @@ static void __cpuinit generic_identify(s
+ init_scattered_cpuid_features(c);
+ }
+
+- early_intel_workaround(c);
+-
+ #ifdef CONFIG_X86_HT
+ c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff;
+ #endif
+@@ -399,7 +435,7 @@ __setup("serialnumber", x86_serial_nr_se
+ /*
+ * This does the hard work of actually picking apart the CPU stuff...
+ */
+-static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
++void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
+ {
+ int i;
+
+@@ -425,20 +461,9 @@ static void __cpuinit identify_cpu(struc
+
+ generic_identify(c);
+
+- printk(KERN_DEBUG "CPU: After generic identify, caps:");
+- for (i = 0; i < NCAPINTS; i++)
+- printk(" %08lx", c->x86_capability[i]);
+- printk("\n");
+-
+- if (this_cpu->c_identify) {
++ if (this_cpu->c_identify)
+ this_cpu->c_identify(c);
+
+- printk(KERN_DEBUG "CPU: After vendor identify, caps:");
+- for (i = 0; i < NCAPINTS; i++)
+- printk(" %08lx", c->x86_capability[i]);
+- printk("\n");
+- }
+-
+ /*
+ * Vendor-specific initialization. In this section we
+ * canonicalize the feature flags, meaning if there are
+@@ -460,23 +485,6 @@ static void __cpuinit identify_cpu(struc
+ * we do "generic changes."
+ */
+
+- /* TSC disabled? */
+- if ( tsc_disable )
+- clear_bit(X86_FEATURE_TSC, c->x86_capability);
+-
+- /* FXSR disabled? */
+- if (disable_x86_fxsr) {
+- clear_bit(X86_FEATURE_FXSR, c->x86_capability);
+- clear_bit(X86_FEATURE_XMM, c->x86_capability);
+- }
+-
+- /* SEP disabled? */
+- if (disable_x86_sep)
+- clear_bit(X86_FEATURE_SEP, c->x86_capability);
+-
+- if (disable_pse)
+- clear_bit(X86_FEATURE_PSE, c->x86_capability);
+-
+ /* If the model name is still unset, do table lookup. */
+ if ( !c->x86_model_id[0] ) {
+ char *p;
+@@ -489,13 +497,6 @@ static void __cpuinit identify_cpu(struc
+ c->x86, c->x86_model);
+ }
+
+- /* Now the feature flags better reflect actual CPU features! */
+-
+- printk(KERN_DEBUG "CPU: After all inits, caps:");
+- for (i = 0; i < NCAPINTS; i++)
+- printk(" %08lx", c->x86_capability[i]);
+- printk("\n");
+-
+ /*
+ * On SMP, boot_cpu_data holds the common feature set between
+ * all CPUs; so make sure that we indicate which features are
+@@ -508,8 +509,14 @@ static void __cpuinit identify_cpu(struc
+ boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
+ }
+
++ /* Clear all flags overridden by options */
++ for (i = 0; i < NCAPINTS; i++)
++ c->x86_capability[i] &= ~cleared_cpu_caps[i];
++
+ /* Init Machine Check Exception if available. */
+ mcheck_init(c);
++
++ select_idle_routine(c);
+ }
+
+ void __init identify_boot_cpu(void)
+@@ -517,7 +524,6 @@ void __init identify_boot_cpu(void)
+ identify_cpu(&boot_cpu_data);
+ sysenter_setup();
+ enable_sep_cpu();
+- mtrr_bp_init();
+ }
+
+ void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
+@@ -574,6 +580,13 @@ void __cpuinit detect_ht(struct cpuinfo_
+ }
+ #endif
+
++static __init int setup_noclflush(char *arg)
++{
++ setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
++ return 1;
++}
++__setup("noclflush", setup_noclflush);
++
+ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
+ {
+ char *vendor = NULL;
+@@ -597,6 +610,17 @@ void __cpuinit print_cpu_info(struct cpu
+ printk("\n");
+ }
+
++static __init int setup_disablecpuid(char *arg)
++{
++ int bit;
++ if (get_option(&arg, &bit) && bit < NCAPINTS*32)
++ setup_clear_cpu_cap(bit);
++ else
++ return 0;
++ return 1;
++}
++__setup("clearcpuid=", setup_disablecpuid);
++
+ cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
+
+ /* This is hacky. :)
+@@ -606,16 +630,6 @@ cpumask_t cpu_initialized __cpuinitdata
+ * They will insert themselves into the cpu_devs structure.
+ * Then, when cpu_init() is called, we can just iterate over that array.
+ */
+-
+-extern int intel_cpu_init(void);
+-extern int cyrix_init_cpu(void);
+-extern int nsc_init_cpu(void);
+-extern int amd_init_cpu(void);
+-extern int centaur_init_cpu(void);
+-extern int transmeta_init_cpu(void);
+-extern int nexgen_init_cpu(void);
+-extern int umc_init_cpu(void);
+-
+ void __init early_cpu_init(void)
+ {
+ intel_cpu_init();
+@@ -627,21 +641,13 @@ void __init early_cpu_init(void)
+ nexgen_init_cpu();
+ umc_init_cpu();
+ early_cpu_detect();
+-
+-#ifdef CONFIG_DEBUG_PAGEALLOC
+- /* pse is not compatible with on-the-fly unmapping,
+- * disable it even if the cpus claim to support it.
+- */
+- clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
+- disable_pse = 1;
+-#endif
+ }
+
+ /* Make sure %fs is initialized properly in idle threads */
+-struct pt_regs * __devinit idle_regs(struct pt_regs *regs)
++struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs)
+ {
+ memset(regs, 0, sizeof(struct pt_regs));
+- regs->xfs = __KERNEL_PERCPU;
++ regs->fs = __KERNEL_PERCPU;
+ return regs;
+ }
+
+@@ -649,7 +655,7 @@ struct pt_regs * __devinit idle_regs(str
+ * it's on the real one. */
+ void switch_to_new_gdt(void)
+ {
+- struct Xgt_desc_struct gdt_descr;
++ struct desc_ptr gdt_descr;
+ unsigned long va, frames[16];
+ int f;
+
+@@ -692,12 +698,6 @@ void __cpuinit cpu_init(void)
+
+ if (cpu_has_vme || cpu_has_de)
+ clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
+- if (tsc_disable && cpu_has_tsc) {
+- printk(KERN_NOTICE "Disabling TSC...\n");
+- /**** FIX-HPA: DOES THIS REALLY BELONG HERE? ****/
+- clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability);
+- set_in_cr4(X86_CR4_TSD);
+- }
+
+ switch_to_new_gdt();
+
+@@ -710,7 +710,7 @@ void __cpuinit cpu_init(void)
+ BUG();
+ enter_lazy_tlb(&init_mm, curr);
+
+- load_esp0(t, thread);
++ load_sp0(t, thread);
+
+ load_LDT(&init_mm.context);
+
+--- sle11-2009-06-29.orig/arch/x86/kernel/cpu/mtrr/main-xen.c 2009-02-16 16:17:21.000000000 +0100
++++ sle11-2009-06-29/arch/x86/kernel/cpu/mtrr/main-xen.c 2009-03-16 16:33:40.000000000 +0100
+@@ -33,7 +33,7 @@ struct mtrr_ops generic_mtrr_ops = {
+
+ struct mtrr_ops *mtrr_if = &generic_mtrr_ops;
+ unsigned int num_var_ranges;
+-unsigned int *usage_table;
++unsigned int mtrr_usage_table[MAX_VAR_RANGES];
+
+ static void __init set_num_var_ranges(void)
+ {
+@@ -52,17 +52,12 @@ static void __init init_table(void)
+ int i, max;
+
+ max = num_var_ranges;
+- if ((usage_table = kmalloc(max * sizeof *usage_table, GFP_KERNEL))
+- == NULL) {
+- printk(KERN_ERR "mtrr: could not allocate\n");
+- return;
+- }
+ for (i = 0; i < max; i++)
+- usage_table[i] = 0;
++ mtrr_usage_table[i] = 0;
+ }
+
+ int mtrr_add_page(unsigned long base, unsigned long size,
+- unsigned int type, char increment)
++ unsigned int type, bool increment)
+ {
+ int error;
+ struct xen_platform_op op;
+@@ -81,7 +76,7 @@ int mtrr_add_page(unsigned long base, un
+ }
+
+ if (increment)
+- ++usage_table[op.u.add_memtype.reg];
++ ++mtrr_usage_table[op.u.add_memtype.reg];
+
+ mutex_unlock(&mtrr_mutex);
+
+@@ -103,7 +98,7 @@ static int mtrr_check(unsigned long base
+
+ int
+ mtrr_add(unsigned long base, unsigned long size, unsigned int type,
+- char increment)
++ bool increment)
+ {
+ if (mtrr_check(base, size))
+ return -EINVAL;
+@@ -136,11 +131,11 @@ int mtrr_del_page(int reg, unsigned long
+ goto out;
+ }
+ }
+- if (usage_table[reg] < 1) {
++ if (mtrr_usage_table[reg] < 1) {
+ printk(KERN_WARNING "mtrr: reg: %d has count=0\n", reg);
+ goto out;
+ }
+- if (--usage_table[reg] < 1) {
++ if (--mtrr_usage_table[reg] < 1) {
+ op.cmd = XENPF_del_memtype;
+ op.u.del_memtype.handle = 0;
+ op.u.del_memtype.reg = reg;
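The mtrr change trades the kmalloc'ed usage_table for a fixed mtrr_usage_table[MAX_VAR_RANGES] but keeps the same reference counting: bump on add when increment is set, and only delete the memtype once the count drops below one. A sketch of just that discipline (the bound and return conventions are assumptions; the hypercall plumbing is elided):

#include <stdio.h>

#define MAX_VAR_RANGES_SK 256	/* assumption: upper bound on variable ranges */

static unsigned int mtrr_usage_table_sk[MAX_VAR_RANGES_SK];

static void mtrr_get_sketch(int reg, int increment)
{
	if (increment)
		++mtrr_usage_table_sk[reg];	/* one more user of this register */
}

static int mtrr_put_sketch(int reg)
{
	if (mtrr_usage_table_sk[reg] < 1)
		return -1;	/* the "reg has count=0" warning case */
	if (--mtrr_usage_table_sk[reg] < 1)
		return 1;	/* last user gone: issue the delete */
	return 0;		/* register still referenced elsewhere */
}

int main(void)
{
	int a, b, c;

	mtrr_get_sketch(3, 1);
	mtrr_get_sketch(3, 1);
	a = mtrr_put_sketch(3);
	b = mtrr_put_sketch(3);
	c = mtrr_put_sketch(3);
	printf("%d %d %d\n", a, b, c);	/* prints "0 1 -1" */
	return 0;
}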
+--- sle11-2009-06-29.orig/arch/x86/kernel/e820_32-xen.c 2009-02-16 16:18:36.000000000 +0100
++++ sle11-2009-06-29/arch/x86/kernel/e820_32-xen.c 2009-03-16 16:33:40.000000000 +0100
+@@ -7,7 +7,6 @@
+ #include <linux/kexec.h>
+ #include <linux/module.h>
+ #include <linux/mm.h>
+-#include <linux/efi.h>
+ #include <linux/pfn.h>
+ #include <linux/uaccess.h>
+ #include <linux/suspend.h>
+@@ -18,11 +17,6 @@
+ #include <asm/setup.h>
+ #include <xen/interface/memory.h>
+
+-#ifdef CONFIG_EFI
+-int efi_enabled = 0;
+-EXPORT_SYMBOL(efi_enabled);
+-#endif
+-
+ struct e820map e820;
+ struct change_member {
+ struct e820entry *pbios; /* pointer to original bios entry */
+@@ -38,26 +32,6 @@ unsigned long pci_mem_start = 0x10000000
+ EXPORT_SYMBOL(pci_mem_start);
+ #endif
+ extern int user_defined_memmap;
+-struct resource data_resource = {
+- .name = "Kernel data",
+- .start = 0,
+- .end = 0,
+- .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+-};
+-
+-struct resource code_resource = {
+- .name = "Kernel code",
+- .start = 0,
+- .end = 0,
+- .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+-};
+-
+-struct resource bss_resource = {
+- .name = "Kernel bss",
+- .start = 0,
+- .end = 0,
+- .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+-};
+
+ static struct resource system_rom_resource = {
+ .name = "System ROM",
+@@ -112,60 +86,6 @@ static struct resource video_rom_resourc
+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+ };
+
+-static struct resource video_ram_resource = {
+- .name = "Video RAM area",
+- .start = 0xa0000,
+- .end = 0xbffff,
+- .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+-};
+-
+-static struct resource standard_io_resources[] = { {
+- .name = "dma1",
+- .start = 0x0000,
+- .end = 0x001f,
+- .flags = IORESOURCE_BUSY | IORESOURCE_IO
+-}, {
+- .name = "pic1",
+- .start = 0x0020,
+- .end = 0x0021,
+- .flags = IORESOURCE_BUSY | IORESOURCE_IO
+-}, {
+- .name = "timer0",
+- .start = 0x0040,
+- .end = 0x0043,
+- .flags = IORESOURCE_BUSY | IORESOURCE_IO
+-}, {
+- .name = "timer1",
+- .start = 0x0050,
+- .end = 0x0053,
+- .flags = IORESOURCE_BUSY | IORESOURCE_IO
+-}, {
+- .name = "keyboard",
+- .start = 0x0060,
+- .end = 0x006f,
+- .flags = IORESOURCE_BUSY | IORESOURCE_IO
+-}, {
+- .name = "dma page reg",
+- .start = 0x0080,
+- .end = 0x008f,
+- .flags = IORESOURCE_BUSY | IORESOURCE_IO
+-}, {
+- .name = "pic2",
+- .start = 0x00a0,
+- .end = 0x00a1,
+- .flags = IORESOURCE_BUSY | IORESOURCE_IO
+-}, {
+- .name = "dma2",
+- .start = 0x00c0,
+- .end = 0x00df,
+- .flags = IORESOURCE_BUSY | IORESOURCE_IO
+-}, {
+- .name = "fpu",
+- .start = 0x00f0,
+- .end = 0x00ff,
+- .flags = IORESOURCE_BUSY | IORESOURCE_IO
+-} };
+-
+ #define ROMSIGNATURE 0xaa55
+
+ static int __init romsignature(const unsigned char *rom)
+@@ -272,10 +192,9 @@ static struct e820map machine_e820;
+ * Request address space for all standard RAM and ROM resources
+ * and also for regions reported as reserved by the e820.
+ */
+-static void __init
+-legacy_init_iomem_resources(struct resource *code_resource,
+- struct resource *data_resource,
+- struct resource *bss_resource)
++void __init init_iomem_resources(struct resource *code_resource,
++ struct resource *data_resource,
++ struct resource *bss_resource)
+ {
+ int i;
+
+@@ -324,39 +243,6 @@ legacy_init_iomem_resources(struct resou
+
+ #undef e820
+
+-/*
+- * Request address space for all standard resources
+- *
+- * This is called just before pcibios_init(), which is also a
+- * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
+- */
+-static int __init request_standard_resources(void)
+-{
+- int i;
+-
+- /* Nothing to do if not running in dom0. */
+- if (!is_initial_xendomain())
+- return 0;
+-
+- printk("Setting up standard PCI resources\n");
+- if (efi_enabled)
+- efi_initialize_iomem_resources(&code_resource,
+- &data_resource, &bss_resource);
+- else
+- legacy_init_iomem_resources(&code_resource,
+- &data_resource, &bss_resource);
+-
+- /* EFI systems may still have VGA */
+- request_resource(&iomem_resource, &video_ram_resource);
+-
+- /* request I/O space for devices used on all i[345]86 PCs */
+- for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
+- request_resource(&ioport_resource, &standard_io_resources[i]);
+- return 0;
+-}
+-
+-subsys_initcall(request_standard_resources);
+-
+ #if defined(CONFIG_PM) && defined(CONFIG_HIBERNATION)
+ /**
+ * e820_mark_nosave_regions - Find the ranges of physical addresses that do not
+@@ -393,19 +279,17 @@ void __init add_memory_region(unsigned l
+ {
+ int x;
+
+- if (!efi_enabled) {
+- x = e820.nr_map;
+-
+- if (x == E820MAX) {
+- printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
+- return;
+- }
++ x = e820.nr_map;
+
+- e820.map[x].addr = start;
+- e820.map[x].size = size;
+- e820.map[x].type = type;
+- e820.nr_map++;
++ if (x == E820MAX) {
++ printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
++ return;
+ }
++
++ e820.map[x].addr = start;
++ e820.map[x].size = size;
++ e820.map[x].type = type;
++ e820.nr_map++;
+ } /* add_memory_region */
+
+ /*
+@@ -642,29 +526,6 @@ int __init copy_e820_map(struct e820entr
+ }
+
+ /*
+- * Callback for efi_memory_walk.
+- */
+-static int __init
+-efi_find_max_pfn(unsigned long start, unsigned long end, void *arg)
+-{
+- unsigned long *max_pfn = arg, pfn;
+-
+- if (start < end) {
+- pfn = PFN_UP(end -1);
+- if (pfn > *max_pfn)
+- *max_pfn = pfn;
+- }
+- return 0;
+-}
+-
+-static int __init
+-efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg)
+-{
+- memory_present(0, PFN_UP(start), PFN_DOWN(end));
+- return 0;
+-}
+-
+-/*
+ * Find the highest page frame number we have available
+ */
+ void __init find_max_pfn(void)
+@@ -672,11 +533,6 @@ void __init find_max_pfn(void)
+ int i;
+
+ max_pfn = 0;
+- if (efi_enabled) {
+- efi_memmap_walk(efi_find_max_pfn, &max_pfn);
+- efi_memmap_walk(efi_memory_present_wrapper, NULL);
+- return;
+- }
+
+ for (i = 0; i < e820.nr_map; i++) {
+ unsigned long start, end;
+@@ -694,34 +550,12 @@ void __init find_max_pfn(void)
+ }
+
+ /*
+- * Free all available memory for boot time allocation. Used
+- * as a callback function by efi_memory_walk()
+- */
+-
+-static int __init
+-free_available_memory(unsigned long start, unsigned long end, void *arg)
+-{
+- /* check max_low_pfn */
+- if (start >= (max_low_pfn << PAGE_SHIFT))
+- return 0;
+- if (end >= (max_low_pfn << PAGE_SHIFT))
+- end = max_low_pfn << PAGE_SHIFT;
+- if (start < end)
+- free_bootmem(start, end - start);
+-
+- return 0;
+-}
+-/*
+ * Register fully available low RAM pages with the bootmem allocator.
+ */
+ void __init register_bootmem_low_pages(unsigned long max_low_pfn)
+ {
+ int i;
+
+- if (efi_enabled) {
+- efi_memmap_walk(free_available_memory, NULL);
+- return;
+- }
+ for (i = 0; i < e820.nr_map; i++) {
+ unsigned long curr_pfn, last_pfn, size;
+ /*
+@@ -855,56 +689,12 @@ void __init print_memory_map(char *who)
+ }
+ }
+
+-static __init __always_inline void efi_limit_regions(unsigned long long size)
+-{
+- unsigned long long current_addr = 0;
+- efi_memory_desc_t *md, *next_md;
+- void *p, *p1;
+- int i, j;
+-
+- j = 0;
+- p1 = memmap.map;
+- for (p = p1, i = 0; p < memmap.map_end; p += memmap.desc_size, i++) {
+- md = p;
+- next_md = p1;
+- current_addr = md->phys_addr +
+- PFN_PHYS(md->num_pages);
+- if (is_available_memory(md)) {
+- if (md->phys_addr >= size) continue;
+- memcpy(next_md, md, memmap.desc_size);
+- if (current_addr >= size) {
+- next_md->num_pages -=
+- PFN_UP(current_addr-size);
+- }
+- p1 += memmap.desc_size;
+- next_md = p1;
+- j++;
+- } else if ((md->attribute & EFI_MEMORY_RUNTIME) ==
+- EFI_MEMORY_RUNTIME) {
+- /* In order to make runtime services
+- * available we have to include runtime
+- * memory regions in memory map */
+- memcpy(next_md, md, memmap.desc_size);
+- p1 += memmap.desc_size;
+- next_md = p1;
+- j++;
+- }
+- }
+- memmap.nr_map = j;
+- memmap.map_end = memmap.map +
+- (memmap.nr_map * memmap.desc_size);
+-}
+-
+ void __init limit_regions(unsigned long long size)
+ {
+ unsigned long long current_addr = 0;
+ int i;
+
+ print_memory_map("limit_regions start");
+- if (efi_enabled) {
+- efi_limit_regions(size);
+- return;
+- }
+ for (i = 0; i < e820.nr_map; i++) {
+ current_addr = e820.map[i].addr + e820.map[i].size;
+ if (current_addr < size)
+@@ -1056,3 +846,44 @@ static int __init parse_memmap(char *arg
+ return 0;
+ }
+ early_param("memmap", parse_memmap);
++
++#ifndef CONFIG_XEN
++void __init update_memory_range(u64 start, u64 size, unsigned old_type,
++ unsigned new_type)
++{
++ int i;
++
++ BUG_ON(old_type == new_type);
++
++ for (i = 0; i < e820.nr_map; i++) {
++ struct e820entry *ei = &e820.map[i];
++ u64 final_start, final_end;
++ if (ei->type != old_type)
++ continue;
++ /* totally covered? */
++ if (ei->addr >= start && ei->size <= size) {
++ ei->type = new_type;
++ continue;
++ }
++ /* partially covered */
++ final_start = max(start, ei->addr);
++ final_end = min(start + size, ei->addr + ei->size);
++ if (final_start >= final_end)
++ continue;
++ add_memory_region(final_start, final_end - final_start,
++ new_type);
++ }
++}
++
++void __init update_e820(void)
++{
++ u8 nr_map;
++
++ nr_map = e820.nr_map;
++ if (sanitize_e820_map(e820.map, &nr_map))
++ return;
++ e820.nr_map = nr_map;
++ printk(KERN_INFO "modified physical RAM map:\n");
++ print_memory_map("modified");
++}
++#endif
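In update_memory_range() above, the retyped slice of a partially covered e820 entry is simply the intersection of the requested range with the entry. A worked example with made-up addresses:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t start = 0x2000, size = 0x3000;	/* range being retyped */
	uint64_t addr = 0x1000, esize = 0x4000;	/* existing e820 entry */
	uint64_t final_start, final_end;

	final_start = start > addr ? start : addr;		/* max() */
	final_end = start + size < addr + esize
			? start + size : addr + esize;		/* min() */
	if (final_start < final_end)
		printf("retype [%llx, %llx)\n",
		       (unsigned long long)final_start,
		       (unsigned long long)final_end);	/* [2000, 5000) */
	return 0;
}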
+--- sle11-2009-06-29.orig/arch/x86/kernel/e820_64-xen.c 2009-02-16 16:18:36.000000000 +0100
++++ sle11-2009-06-29/arch/x86/kernel/e820_64-xen.c 2009-03-16 16:33:40.000000000 +0100
+@@ -1,4 +1,4 @@
+-/*
++/*
+ * Handle the memory map.
+ * The functions here do the job until bootmem takes over.
+ *
+@@ -26,6 +26,7 @@
+ #include <asm/proto.h>
+ #include <asm/setup.h>
+ #include <asm/sections.h>
++#include <asm/kdebug.h>
+ #include <xen/interface/memory.h>
+
+ struct e820map e820 __initdata;
+@@ -33,96 +34,103 @@ struct e820map e820 __initdata;
+ struct e820map machine_e820;
+ #endif
+
+-/*
++/*
+ * PFN of last memory page.
+ */
+-unsigned long end_pfn;
+-EXPORT_SYMBOL(end_pfn);
++unsigned long end_pfn;
+
+-/*
++/*
+ * end_pfn only includes RAM, while end_pfn_map includes all e820 entries.
+ * The direct mapping extends to end_pfn_map, so that we can directly access
+ * apertures, ACPI and other tables without having to play with fixmaps.
+- */
+-unsigned long end_pfn_map;
++ */
++unsigned long end_pfn_map;
+
+-/*
++/*
+ * Last pfn which the user wants to use.
+ */
+ static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT;
+
+-extern struct resource code_resource, data_resource, bss_resource;
+-
+-/* Check for some hardcoded bad areas that early boot is not allowed to touch */
+-static inline int bad_addr(unsigned long *addrp, unsigned long size)
+-{
+- unsigned long addr = *addrp, last = addr + size;
++/*
++ * Early reserved memory areas.
++ */
++#define MAX_EARLY_RES 20
+
++struct early_res {
++ unsigned long start, end;
++ char name[16];
++};
++static struct early_res early_res[MAX_EARLY_RES] __initdata = {
+ #ifndef CONFIG_XEN
+- /* various gunk below that needed for SMP startup */
+- if (addr < 0x8000) {
+- *addrp = PAGE_ALIGN(0x8000);
+- return 1;
+- }
+-
+- /* direct mapping tables of the kernel */
+- if (last >= table_start<<PAGE_SHIFT && addr < table_end<<PAGE_SHIFT) {
+- *addrp = PAGE_ALIGN(table_end << PAGE_SHIFT);
+- return 1;
+- }
+-
+- /* initrd */
+-#ifdef CONFIG_BLK_DEV_INITRD
+- if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
+- unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
+- unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
+- unsigned long ramdisk_end = ramdisk_image+ramdisk_size;
+-
+- if (last >= ramdisk_image && addr < ramdisk_end) {
+- *addrp = PAGE_ALIGN(ramdisk_end);
+- return 1;
+- }
+- }
++ { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */
++#ifdef CONFIG_SMP
++ { SMP_TRAMPOLINE_BASE, SMP_TRAMPOLINE_BASE + 2*PAGE_SIZE, "SMP_TRAMPOLINE" },
+ #endif
+- /* kernel code */
+- if (last >= __pa_symbol(&_text) && addr < __pa_symbol(&_end)) {
+- *addrp = PAGE_ALIGN(__pa_symbol(&_end));
+- return 1;
+- }
++#endif
++ {}
++};
+
+- if (last >= ebda_addr && addr < ebda_addr + ebda_size) {
+- *addrp = PAGE_ALIGN(ebda_addr + ebda_size);
+- return 1;
++void __init reserve_early(unsigned long start, unsigned long end, char *name)
++{
++ int i;
++ struct early_res *r;
++ for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
++ r = &early_res[i];
++ if (end > r->start && start < r->end)
++ panic("Overlapping early reservations %lx-%lx %s to %lx-%lx %s\n",
++ start, end - 1, name?name:"", r->start, r->end - 1, r->name);
+ }
++ if (i >= MAX_EARLY_RES)
++ panic("Too many early reservations");
++ r = &early_res[i];
++ r->start = start;
++ r->end = end;
++ if (name)
++ strncpy(r->name, name, sizeof(r->name) - 1);
++}
+
+-#ifdef CONFIG_NUMA
+- /* NUMA memory to node map */
+- if (last >= nodemap_addr && addr < nodemap_addr + nodemap_size) {
+- *addrp = nodemap_addr + nodemap_size;
+- return 1;
++void __init early_res_to_bootmem(void)
++{
++ int i;
++ for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
++ struct early_res *r = &early_res[i];
++ printk(KERN_INFO "early res: %d [%lx-%lx] %s\n", i,
++ r->start, r->end - 1, r->name);
++ reserve_bootmem_generic(r->start, r->end - r->start);
+ }
+-#endif
+- /* XXX ramdisk image here? */
+-#else
+- if (last < (table_end<<PAGE_SHIFT)) {
+- *addrp = table_end << PAGE_SHIFT;
+- return 1;
++}
++
++/* Check for already reserved areas */
++static inline int bad_addr(unsigned long *addrp, unsigned long size)
++{
++ int i;
++ unsigned long addr = *addrp, last;
++ int changed = 0;
++again:
++ last = addr + size;
++ for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
++ struct early_res *r = &early_res[i];
++ if (last >= r->start && addr < r->end) {
++ *addrp = addr = r->end;
++ changed = 1;
++ goto again;
++ }
+ }
+-#endif
+- return 0;
+-}
++ return changed;
++}
+
+ /*
+ * This function checks if any part of the range <start,end> is mapped
+ * with type.
+ */
+-int e820_any_mapped(unsigned long start, unsigned long end, unsigned type)
+-{
++int
++e820_any_mapped(unsigned long start, unsigned long end, unsigned type)
++{
+ int i;
+
+ #ifndef CONFIG_XEN
+- for (i = 0; i < e820.nr_map; i++) {
+- struct e820entry *ei = &e820.map[i];
++ for (i = 0; i < e820.nr_map; i++) {
++ struct e820entry *ei = &e820.map[i];
+ #else
+ if (!is_initial_xendomain())
+ return 0;
+@@ -130,12 +138,12 @@ int e820_any_mapped(unsigned long start,
+ const struct e820entry *ei = &machine_e820.map[i];
+ #endif
+
+- if (type && ei->type != type)
++ if (type && ei->type != type)
+ continue;
+ if (ei->addr >= end || ei->addr + ei->size <= start)
+- continue;
+- return 1;
+- }
++ continue;
++ return 1;
++ }
+ return 0;
+ }
+ EXPORT_SYMBOL_GPL(e820_any_mapped);
+@@ -146,7 +154,8 @@ EXPORT_SYMBOL_GPL(e820_any_mapped);
+ * Note: this function only works correct if the e820 table is sorted and
+ * not-overlapping, which is the case
+ */
+-int __init e820_all_mapped(unsigned long start, unsigned long end, unsigned type)
++int __init e820_all_mapped(unsigned long start, unsigned long end,
++ unsigned type)
+ {
+ int i;
+
+@@ -171,65 +180,77 @@ int __init e820_all_mapped(unsigned long
+ */
+ if (ei->addr <= start)
+ start = ei->addr + ei->size;
+- /* if start is now at or beyond end, we're done, full coverage */
++ /*
++ * if start is now at or beyond end, we're done, full
++ * coverage
++ */
+ if (start >= end)
+- return 1; /* we're done */
++ return 1;
+ }
+ return 0;
+ }
+
+-/*
+- * Find a free area in a specific range.
+- */
+-unsigned long __init find_e820_area(unsigned long start, unsigned long end, unsigned size)
+-{
+- int i;
+- for (i = 0; i < e820.nr_map; i++) {
+- struct e820entry *ei = &e820.map[i];
+- unsigned long addr = ei->addr, last;
+- if (ei->type != E820_RAM)
+- continue;
+- if (addr < start)
++/*
++ * Find a free area with specified alignment in a specific range.
++ */
++unsigned long __init find_e820_area(unsigned long start, unsigned long end,
++ unsigned size, unsigned long align)
++{
++ int i;
++ unsigned long mask = ~(align - 1);
++
++ for (i = 0; i < e820.nr_map; i++) {
++ struct e820entry *ei = &e820.map[i];
++ unsigned long addr = ei->addr, last;
++
++ if (ei->type != E820_RAM)
++ continue;
++ if (addr < start)
+ addr = start;
+- if (addr > ei->addr + ei->size)
+- continue;
++ if (addr > ei->addr + ei->size)
++ continue;
+ while (bad_addr(&addr, size) && addr+size <= ei->addr+ei->size)
+ ;
+- last = PAGE_ALIGN(addr) + size;
++ addr = (addr + align - 1) & mask;
++ last = addr + size;
+ if (last > ei->addr + ei->size)
+ continue;
+- if (last > end)
++ if (last > end)
+ continue;
+- return addr;
+- }
+- return -1UL;
+-}
++ return addr;
++ }
++ return -1UL;
++}
+
+ /*
+ * Find the highest page frame number we have available
+ */
+ unsigned long __init e820_end_of_ram(void)
+ {
+- unsigned long end_pfn = 0;
++ unsigned long end_pfn;
++
+ end_pfn = find_max_pfn_with_active_regions();
+-
+- if (end_pfn > end_pfn_map)
++
++ if (end_pfn > end_pfn_map)
+ end_pfn_map = end_pfn;
+ if (end_pfn_map > MAXMEM>>PAGE_SHIFT)
+ end_pfn_map = MAXMEM>>PAGE_SHIFT;
+ if (end_pfn > end_user_pfn)
+ end_pfn = end_user_pfn;
+- if (end_pfn > end_pfn_map)
+- end_pfn = end_pfn_map;
++ if (end_pfn > end_pfn_map)
++ end_pfn = end_pfn_map;
+
+- printk("end_pfn_map = %lu\n", end_pfn_map);
+- return end_pfn;
++ printk(KERN_INFO "end_pfn_map = %lu\n", end_pfn_map);
++ return end_pfn;
+ }
+
+ /*
+ * Mark e820 reserved areas as busy for the resource manager.
+ */
+-void __init e820_reserve_resources(struct e820entry *e820, int nr_map)
++void __init e820_reserve_resources(struct e820entry *e820, int nr_map,
++ struct resource *code_resource,
++ struct resource *data_resource,
++ struct resource *bss_resource)
+ {
+ int i;
+ for (i = 0; i < nr_map; i++) {
+@@ -247,14 +268,14 @@ void __init e820_reserve_resources(struc
+ request_resource(&iomem_resource, res);
+ if (e820[i].type == E820_RAM) {
+ /*
+- * We don't know which RAM region contains kernel data,
+- * so we try it repeatedly and let the resource manager
+- * test it.
++ * We don't know which RAM region contains kernel data,
++ * so we try it repeatedly and let the resource manager
++ * test it.
+ */
+ #ifndef CONFIG_XEN
+- request_resource(res, &code_resource);
+- request_resource(res, &data_resource);
+- request_resource(res, &bss_resource);
++ request_resource(res, code_resource);
++ request_resource(res, data_resource);
++ request_resource(res, bss_resource);
+ #endif
+ #ifdef CONFIG_KEXEC
+ if (crashk_res.start != crashk_res.end)
+@@ -357,9 +378,9 @@ e820_register_active_regions(int nid, un
+ add_active_range(nid, ei_startpfn, ei_endpfn);
+ }
+
+-/*
++/*
+ * Add a memory region to the kernel e820 map.
+- */
++ */
+ void __init add_memory_region(unsigned long start, unsigned long size, int type)
+ {
+ int x = e820.nr_map;
+@@ -384,9 +405,7 @@ unsigned long __init e820_hole_size(unsi
+ {
+ unsigned long start_pfn = start >> PAGE_SHIFT;
+ unsigned long end_pfn = end >> PAGE_SHIFT;
+- unsigned long ei_startpfn;
+- unsigned long ei_endpfn;
+- unsigned long ram = 0;
++ unsigned long ei_startpfn, ei_endpfn, ram = 0;
+ int i;
+
+ for (i = 0; i < e820.nr_map; i++) {
+@@ -398,28 +417,31 @@ unsigned long __init e820_hole_size(unsi
+ return end - start - (ram << PAGE_SHIFT);
+ }
+
+-void __init e820_print_map(char *who)
++static void __init e820_print_map(char *who)
+ {
+ int i;
+
+ for (i = 0; i < e820.nr_map; i++) {
+ printk(KERN_INFO " %s: %016Lx - %016Lx ", who,
+- (unsigned long long) e820.map[i].addr,
+- (unsigned long long) (e820.map[i].addr + e820.map[i].size));
++ (unsigned long long) e820.map[i].addr,
++ (unsigned long long)
++ (e820.map[i].addr + e820.map[i].size));
+ switch (e820.map[i].type) {
+- case E820_RAM: printk("(usable)\n");
+- break;
++ case E820_RAM:
++ printk(KERN_CONT "(usable)\n");
++ break;
+ case E820_RESERVED:
+- printk("(reserved)\n");
+- break;
++ printk(KERN_CONT "(reserved)\n");
++ break;
+ case E820_ACPI:
+- printk("(ACPI data)\n");
+- break;
++ printk(KERN_CONT "(ACPI data)\n");
++ break;
+ case E820_NVS:
+- printk("(ACPI NVS)\n");
+- break;
+- default: printk("type %u\n", e820.map[i].type);
+- break;
++ printk(KERN_CONT "(ACPI NVS)\n");
++ break;
++ default:
++ printk(KERN_CONT "type %u\n", e820.map[i].type);
++ break;
+ }
+ }
+ }
+@@ -427,11 +449,11 @@ void __init e820_print_map(char *who)
+ /*
+ * Sanitize the BIOS e820 map.
+ *
+- * Some e820 responses include overlapping entries. The following
++ * Some e820 responses include overlapping entries. The following
+ * replaces the original e820 map with a new one, removing overlaps.
+ *
+ */
+-static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
++static int __init sanitize_e820_map(struct e820entry *biosmap, char *pnr_map)
+ {
+ struct change_member {
+ struct e820entry *pbios; /* pointer to original bios entry */
+@@ -451,7 +473,8 @@ static int __init sanitize_e820_map(stru
+ int i;
+
+ /*
+- Visually we're performing the following (1,2,3,4 = memory types)...
++ Visually we're performing the following
++ (1,2,3,4 = memory types)...
+
+ Sample memory map (w/overlaps):
+ ____22__________________
+@@ -493,22 +516,23 @@ static int __init sanitize_e820_map(stru
+ old_nr = *pnr_map;
+
+ /* bail out if we find any unreasonable addresses in bios map */
+- for (i=0; i<old_nr; i++)
++ for (i = 0; i < old_nr; i++)
+ if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
+ return -1;
+
+ /* create pointers for initial change-point information (for sorting) */
+- for (i=0; i < 2*old_nr; i++)
++ for (i = 0; i < 2 * old_nr; i++)
+ change_point[i] = &change_point_list[i];
+
+ /* record all known change-points (starting and ending addresses),
+ omitting those that are for empty memory regions */
+ chgidx = 0;
+- for (i=0; i < old_nr; i++) {
++ for (i = 0; i < old_nr; i++) {
+ if (biosmap[i].size != 0) {
+ change_point[chgidx]->addr = biosmap[i].addr;
+ change_point[chgidx++]->pbios = &biosmap[i];
+- change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
++ change_point[chgidx]->addr = biosmap[i].addr +
++ biosmap[i].size;
+ change_point[chgidx++]->pbios = &biosmap[i];
+ }
+ }
+@@ -518,75 +542,106 @@ static int __init sanitize_e820_map(stru
+ still_changing = 1;
+ while (still_changing) {
+ still_changing = 0;
+- for (i=1; i < chg_nr; i++) {
+- /* if <current_addr> > <last_addr>, swap */
+- /* or, if current=<start_addr> & last=<end_addr>, swap */
+- if ((change_point[i]->addr < change_point[i-1]->addr) ||
+- ((change_point[i]->addr == change_point[i-1]->addr) &&
+- (change_point[i]->addr == change_point[i]->pbios->addr) &&
+- (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
+- )
+- {
++ for (i = 1; i < chg_nr; i++) {
++ unsigned long long curaddr, lastaddr;
++ unsigned long long curpbaddr, lastpbaddr;
++
++ curaddr = change_point[i]->addr;
++ lastaddr = change_point[i - 1]->addr;
++ curpbaddr = change_point[i]->pbios->addr;
++ lastpbaddr = change_point[i - 1]->pbios->addr;
++
++ /*
++ * swap entries, when:
++ *
++ * curaddr > lastaddr or
++ * curaddr == lastaddr and curaddr == curpbaddr and
++ * lastaddr != lastpbaddr
++ */
++ if (curaddr < lastaddr ||
++ (curaddr == lastaddr && curaddr == curpbaddr &&
++ lastaddr != lastpbaddr)) {
+ change_tmp = change_point[i];
+ change_point[i] = change_point[i-1];
+ change_point[i-1] = change_tmp;
+- still_changing=1;
++ still_changing = 1;
+ }
+ }
+ }
+
+ /* create a new bios memory map, removing overlaps */
+- overlap_entries=0; /* number of entries in the overlap table */
+- new_bios_entry=0; /* index for creating new bios map entries */
++ overlap_entries = 0; /* number of entries in the overlap table */
++ new_bios_entry = 0; /* index for creating new bios map entries */
+ last_type = 0; /* start with undefined memory type */
+ last_addr = 0; /* start with 0 as last starting address */
++
+ /* loop through change-points, determining the effect on the new bios map */
+- for (chgidx=0; chgidx < chg_nr; chgidx++)
+- {
++ for (chgidx = 0; chgidx < chg_nr; chgidx++) {
+ /* keep track of all overlapping bios entries */
+- if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
+- {
+- /* add map entry to overlap list (> 1 entry implies an overlap) */
+- overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
+- }
+- else
+- {
+- /* remove entry from list (order independent, so swap with last) */
+- for (i=0; i<overlap_entries; i++)
+- {
+- if (overlap_list[i] == change_point[chgidx]->pbios)
+- overlap_list[i] = overlap_list[overlap_entries-1];
++ if (change_point[chgidx]->addr ==
++ change_point[chgidx]->pbios->addr) {
++ /*
++ * add map entry to overlap list (> 1 entry
++ * implies an overlap)
++ */
++ overlap_list[overlap_entries++] =
++ change_point[chgidx]->pbios;
++ } else {
++ /*
++ * remove entry from list (order independent,
++ * so swap with last)
++ */
++ for (i = 0; i < overlap_entries; i++) {
++ if (overlap_list[i] ==
++ change_point[chgidx]->pbios)
++ overlap_list[i] =
++ overlap_list[overlap_entries-1];
+ }
+ overlap_entries--;
+ }
+- /* if there are overlapping entries, decide which "type" to use */
+- /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
++ /*
++ * if there are overlapping entries, decide which
++ * "type" to use (larger value takes precedence --
++ * 1=usable, 2,3,4,4+=unusable)
++ */
+ current_type = 0;
+- for (i=0; i<overlap_entries; i++)
++ for (i = 0; i < overlap_entries; i++)
+ if (overlap_list[i]->type > current_type)
+ current_type = overlap_list[i]->type;
+- /* continue building up new bios map based on this information */
++ /*
++ * continue building up new bios map based on this
++ * information
++ */
+ if (current_type != last_type) {
+ if (last_type != 0) {
+ new_bios[new_bios_entry].size =
+ change_point[chgidx]->addr - last_addr;
+- /* move forward only if the new size was non-zero */
++ /*
++ * move forward only if the new size
++ * was non-zero
++ */
+ if (new_bios[new_bios_entry].size != 0)
++ /*
++ * no more space left for new
++ * bios entries ?
++ */
+ if (++new_bios_entry >= E820MAX)
+- break; /* no more space left for new bios entries */
++ break;
+ }
+ if (current_type != 0) {
+- new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
++ new_bios[new_bios_entry].addr =
++ change_point[chgidx]->addr;
+ new_bios[new_bios_entry].type = current_type;
+- last_addr=change_point[chgidx]->addr;
++ last_addr = change_point[chgidx]->addr;
+ }
+ last_type = current_type;
+ }
+ }
+- new_nr = new_bios_entry; /* retain count for new bios entries */
++ /* retain count for new bios entries */
++ new_nr = new_bios_entry;
+
+ /* copy new bios mapping into original location */
+- memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
++ memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry));
+ *pnr_map = new_nr;
+
+ return 0;
+@@ -601,7 +656,7 @@ static int __init sanitize_e820_map(stru
+ * will have given us a memory map that we can use to properly
+ * set up memory. If we aren't, we'll fake a memory map.
+ */
+-static int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
++static int __init copy_e820_map(struct e820entry *biosmap, int nr_map)
+ {
+ #ifndef CONFIG_XEN
+ /* Only one memory region (or negative)? Ignore it */
+@@ -622,7 +677,7 @@ static int __init copy_e820_map(struct e
+ return -1;
+
+ add_memory_region(start, size, type);
+- } while (biosmap++,--nr_map);
++ } while (biosmap++, --nr_map);
+
+ #ifdef CONFIG_XEN
+ if (is_initial_xendomain()) {
+@@ -641,15 +696,17 @@ static int __init copy_e820_map(struct e
+ return 0;
+ }
+
+-void early_panic(char *msg)
++static void early_panic(char *msg)
+ {
+ early_printk(msg);
+ panic(msg);
+ }
+
+-#ifndef CONFIG_XEN
+-void __init setup_memory_region(void)
++/* Not void solely for compatibility with the x86 32-bit code */
++char * __init machine_specific_memory_setup(void)
+ {
++#ifndef CONFIG_XEN
++ char *who = "BIOS-e820";
+ /*
+ * Try to copy the BIOS-supplied E820-map.
+ *
+@@ -659,14 +716,8 @@ void __init setup_memory_region(void)
+ sanitize_e820_map(boot_params.e820_map, &boot_params.e820_entries);
+ if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0)
+ early_panic("Cannot find a valid memory map");
+- printk(KERN_INFO "BIOS-provided physical RAM map:\n");
+- e820_print_map("BIOS-e820");
+-}
+-
+ #else /* CONFIG_XEN */
+-
+-void __init setup_memory_region(void)
+-{
++ char *who = "Xen";
+ int rc;
+ struct xen_memory_map memmap;
+ /*
+@@ -694,11 +745,13 @@ void __init setup_memory_region(void)
+
+ if (copy_e820_map(map, (char)memmap.nr_entries) < 0)
+ early_panic("Cannot find a valid memory map");
+-
++#endif
+ printk(KERN_INFO "BIOS-provided physical RAM map:\n");
+- e820_print_map("Xen");
++ e820_print_map(who);
++
++ /* In case someone cares... */
++ return who;
+ }
+-#endif
+
+ static int __init parse_memopt(char *p)
+ {
+@@ -709,7 +762,7 @@ static int __init parse_memopt(char *p)
+ if (!p)
+ return -EINVAL;
+ end_user_pfn = memparse(p, &p);
+- end_user_pfn >>= PAGE_SHIFT;
++ end_user_pfn >>= PAGE_SHIFT;
+
+ end = end_user_pfn<<PAGE_SHIFT;
+ i = e820.nr_map-1;
+@@ -727,7 +780,7 @@ static int __init parse_memopt(char *p)
+ }
+
+ return 0;
+-}
++}
+ early_param("mem", parse_memopt);
+
+ static int userdef __initdata;
+@@ -739,9 +792,9 @@ static int __init parse_memmap_opt(char
+
+ if (!strcmp(p, "exactmap")) {
+ #ifdef CONFIG_CRASH_DUMP
+- /* If we are doing a crash dump, we
+- * still need to know the real mem
+- * size before original memory map is
++ /*
++ * If we are doing a crash dump, we still need to know
++ * the real mem size before original memory map is
+ * reset.
+ */
+ e820_register_active_regions(0, 0, -1UL);
+@@ -758,6 +811,8 @@ static int __init parse_memmap_opt(char
+ mem_size = memparse(p, &p);
+ if (p == oldp)
+ return -EINVAL;
++
++ userdef = 1;
+ if (*p == '@') {
+ start_at = memparse(p+1, &p);
+ add_memory_region(start_at, mem_size, E820_RAM);
+@@ -777,11 +832,58 @@ early_param("memmap", parse_memmap_opt);
+ void __init finish_e820_parsing(void)
+ {
+ if (userdef) {
++ char nr = e820.nr_map;
++
++ if (sanitize_e820_map(e820.map, &nr) < 0)
++ early_panic("Invalid user supplied memory map");
++ e820.nr_map = nr;
++
+ printk(KERN_INFO "user-defined physical RAM map:\n");
+ e820_print_map("user");
+ }
+ }
+
++#ifndef CONFIG_XEN
++void __init update_memory_range(u64 start, u64 size, unsigned old_type,
++ unsigned new_type)
++{
++ int i;
++
++ BUG_ON(old_type == new_type);
++
++ for (i = 0; i < e820.nr_map; i++) {
++ struct e820entry *ei = &e820.map[i];
++ u64 final_start, final_end;
++ if (ei->type != old_type)
++ continue;
++ /* totally covered? */
++ if (ei->addr >= start && ei->size <= size) {
++ ei->type = new_type;
++ continue;
++ }
++ /* partially covered */
++ final_start = max(start, ei->addr);
++ final_end = min(start + size, ei->addr + ei->size);
++ if (final_start >= final_end)
++ continue;
++ add_memory_region(final_start, final_end - final_start,
++ new_type);
++ }
++}
++
++void __init update_e820(void)
++{
++ u8 nr_map;
++
++ nr_map = e820.nr_map;
++ if (sanitize_e820_map(e820.map, &nr_map))
++ return;
++ e820.nr_map = nr_map;
++ printk(KERN_INFO "modified physical RAM map:\n");
++ e820_print_map("modified");
++}
++#endif
++
+ unsigned long pci_mem_start = 0xaeedbabe;
+ EXPORT_SYMBOL(pci_mem_start);
+
+@@ -825,8 +927,10 @@ __init void e820_setup_gap(struct e820en
+
+ if (!found) {
+ gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024;
+- printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit address range\n"
+- KERN_ERR "PCI: Unassigned devices with 32bit resource registers may break!\n");
++ printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit "
++ "address range\n"
++ KERN_ERR "PCI: Unassigned devices with 32bit resource "
++ "registers may break!\n");
+ }
+
+ /*
+@@ -839,8 +943,9 @@ __init void e820_setup_gap(struct e820en
+ /* Fun with two's complement */
+ pci_mem_start = (gapstart + round) & -round;
+
+- printk(KERN_INFO "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
+- pci_mem_start, gapstart, gapsize);
++ printk(KERN_INFO
++ "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
++ pci_mem_start, gapstart, gapsize);
+ }
+
+ int __init arch_get_ram_range(int slot, u64 *addr, u64 *size)
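+
+The rework above leaves sanitize_e820_map()'s algorithm intact: every region
+start and end becomes a change-point, the change-points are sorted, and the
+walk over them emits non-overlapping ranges whose type is the largest among
+the regions currently open. A minimal standalone model of that walk (the toy
+map and pre-sorted change-points are hypothetical, not kernel data):
+
+#include <stdio.h>
+
+struct region { unsigned long long addr, size; int type; };
+
+int main(void)
+{
+	/* sample map with an overlap: [0,100) type 1 and [50,120) type 2 */
+	struct region map[] = { { 0, 100, 1 }, { 50, 70, 2 } };
+	unsigned long long points[] = { 0, 50, 100, 120 };
+	int i, j, n = sizeof(points) / sizeof(points[0]);
+
+	for (i = 0; i + 1 < n; i++) {
+		int type = 0;	/* largest type covering this span wins */
+
+		for (j = 0; j < 2; j++)
+			if (map[j].addr <= points[i] &&
+			    points[i] < map[j].addr + map[j].size &&
+			    map[j].type > type)
+				type = map[j].type;
+		if (type)
+			printf("[%llu,%llu) type %d\n",
+			       points[i], points[i + 1], type);
+	}
+	return 0;
+}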
+--- sle11-2009-06-29.orig/arch/x86/kernel/early_printk-xen.c 2009-02-16 16:18:36.000000000 +0100
++++ sle11-2009-06-29/arch/x86/kernel/early_printk-xen.c 2009-03-16 16:33:40.000000000 +0100
+@@ -222,7 +222,7 @@ static struct console simnow_console = {
+ };
+
+ /* Direct interface for emergencies */
+-struct console *early_console = &early_vga_console;
++static struct console *early_console = &early_vga_console;
+ static int early_console_initialized = 0;
+
+ void early_printk(const char *fmt, ...)
+--- sle11-2009-06-29.orig/arch/x86/kernel/entry_32-xen.S 2009-05-14 11:18:18.000000000 +0200
++++ sle11-2009-06-29/arch/x86/kernel/entry_32-xen.S 2009-05-14 11:18:32.000000000 +0200
+@@ -59,7 +59,7 @@
+ * for paravirtualization. The following will never clobber any registers:
+ * INTERRUPT_RETURN (aka. "iret")
+ * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
+- * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
++ * ENABLE_INTERRUPTS_SYSCALL_RET (aka "sti; sysexit").
+ *
+ * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
+ * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
+@@ -282,16 +282,21 @@ END(resume_kernel)
+ #endif
+ CFI_ENDPROC
+
++ .macro test_tif ti_reg # system call tracing in operation / emulation
++ /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
++ testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(\ti_reg)
++ .endm
++
+ /* SYSENTER_RETURN points to after the "sysenter" instruction in
+ the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */
+
+ # sysenter call handler stub
+-ENTRY(sysenter_entry)
++ENTRY(ia32_sysenter_target)
+ CFI_STARTPROC simple
+ CFI_SIGNAL_FRAME
+ CFI_DEF_CFA esp, 0
+ CFI_REGISTER esp, ebp
+- movl SYSENTER_stack_esp0(%esp),%esp
++ movl SYSENTER_stack_sp0(%esp),%esp
+ sysenter_past_esp:
+ /*
+ * No need to follow this irqs on/off section: the syscall
+@@ -334,9 +339,7 @@ sysenter_past_esp:
+ CFI_ADJUST_CFA_OFFSET 4
+ SAVE_ALL
+ GET_THREAD_INFO(%ebp)
+-
+- /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
+- testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
++ test_tif %ebp
+ jnz syscall_trace_entry
+ cmpl $(nr_syscalls), %eax
+ jae syscall_badsys
+@@ -354,7 +357,7 @@ sysenter_past_esp:
+ xorl %ebp,%ebp
+ TRACE_IRQS_ON
+ 1: mov PT_FS(%esp), %fs
+- ENABLE_INTERRUPTS_SYSEXIT
++ ENABLE_INTERRUPTS_SYSCALL_RET
+ CFI_ENDPROC
+ .pushsection .fixup,"ax"
+ 2: movl $0,PT_FS(%esp)
+@@ -363,10 +366,10 @@ sysenter_past_esp:
+ .align 4
+ .long 1b,2b
+ .popsection
+-ENDPROC(sysenter_entry)
++ENDPROC(ia32_sysenter_target)
+
+ # pv sysenter call handler stub
+-ENTRY(sysenter_entry_pv)
++ENTRY(ia32pv_sysenter_target)
+ RING0_INT_FRAME
+ movl $__USER_DS,16(%esp)
+ movl %ebp,12(%esp)
+@@ -389,7 +392,7 @@ ENTRY(sysenter_entry_pv)
+ .previous
+ /* fall through */
+ CFI_ENDPROC
+-ENDPROC(sysenter_entry_pv)
++ENDPROC(ia32pv_sysenter_target)
+
+ # system call handler stub
+ ENTRY(system_call)
+@@ -398,9 +401,7 @@ ENTRY(system_call)
+ CFI_ADJUST_CFA_OFFSET 4
+ SAVE_ALL
+ GET_THREAD_INFO(%ebp)
+- # system call tracing in operation / emulation
+- /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
+- testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
++ test_tif %ebp
+ jnz syscall_trace_entry
+ cmpl $(nr_syscalls), %eax
+ jae syscall_badsys
+@@ -452,7 +453,8 @@ restore_nocheck_notrace:
+ RESTORE_REGS
+ addl $4, %esp # skip orig_eax/error_code
+ CFI_ADJUST_CFA_OFFSET -4
+-1: INTERRUPT_RETURN
++irq_return:
++ INTERRUPT_RETURN
+ .section .fixup,"ax"
+ iret_exc:
+ pushl $0 # no error code
+@@ -461,7 +463,7 @@ iret_exc:
+ .previous
+ .section __ex_table,"a"
+ .align 4
+- .long 1b,iret_exc
++ .long irq_return,iret_exc
+ .previous
+
+ CFI_RESTORE_STATE
+@@ -657,7 +659,7 @@ END(syscall_badsys)
+ * Build the entry stubs and pointer table with
+ * some assembler magic.
+ */
+-.data
++.section .rodata,"a"
+ ENTRY(interrupt)
+ .text
+
+@@ -963,7 +965,7 @@ END(device_not_available)
+ * that sets up the real kernel stack. Check here, since we can't
+ * allow the wrong stack to be used.
+ *
+- * "SYSENTER_stack_esp0+12" is because the NMI/debug handler will have
++ * "SYSENTER_stack_sp0+12" is because the NMI/debug handler will have
+ * already pushed 3 words if it hits on the sysenter instruction:
+ * eflags, cs and eip.
+ *
+@@ -975,7 +977,7 @@ END(device_not_available)
+ cmpw $__KERNEL_CS,4(%esp); \
+ jne ok; \
+ label: \
+- movl SYSENTER_stack_esp0+offset(%esp),%esp; \
++ movl SYSENTER_stack_sp0+offset(%esp),%esp; \
+ CFI_DEF_CFA esp, 0; \
+ CFI_UNDEFINED eip; \
+ pushfl; \
+@@ -990,7 +992,7 @@ label: \
+ KPROBE_ENTRY(debug)
+ RING0_INT_FRAME
+ #ifndef CONFIG_XEN
+- cmpl $sysenter_entry,(%esp)
++ cmpl $ia32_sysenter_target,(%esp)
+ jne debug_stack_correct
+ FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn)
+ debug_stack_correct:
+@@ -1023,7 +1025,7 @@ KPROBE_ENTRY(nmi)
+ popl %eax
+ CFI_ADJUST_CFA_OFFSET -4
+ je nmi_espfix_stack
+- cmpl $sysenter_entry,(%esp)
++ cmpl $ia32_sysenter_target,(%esp)
+ je nmi_stack_fixup
+ pushl %eax
+ CFI_ADJUST_CFA_OFFSET 4
+@@ -1036,7 +1038,7 @@ KPROBE_ENTRY(nmi)
+ popl %eax
+ CFI_ADJUST_CFA_OFFSET -4
+ jae nmi_stack_correct
+- cmpl $sysenter_entry,12(%esp)
++ cmpl $ia32_sysenter_target,12(%esp)
+ je nmi_debug_stack_check
+ nmi_stack_correct:
+ /* We have a RING0_INT_FRAME here */
+@@ -1089,12 +1091,8 @@ nmi_espfix_stack:
+ RESTORE_REGS
+ lss 12+4(%esp), %esp # back to espfix stack
+ CFI_ADJUST_CFA_OFFSET -24
+-1: INTERRUPT_RETURN
++ jmp irq_return
+ CFI_ENDPROC
+-.section __ex_table,"a"
+- .align 4
+- .long 1b,iret_exc
+-.previous
+ #else
+ KPROBE_ENTRY(nmi)
+ RING0_INT_FRAME
+@@ -1112,17 +1110,17 @@ KPROBE_END(nmi)
+
+ #ifdef CONFIG_PARAVIRT
+ ENTRY(native_iret)
+-1: iret
++ iret
+ .section __ex_table,"a"
+ .align 4
+- .long 1b,iret_exc
++ .long native_iret, iret_exc
+ .previous
+ END(native_iret)
+
+-ENTRY(native_irq_enable_sysexit)
++ENTRY(native_irq_enable_syscall_ret)
+ sti
+ sysexit
+-END(native_irq_enable_sysexit)
++END(native_irq_enable_syscall_ret)
+ #endif
+
+ KPROBE_ENTRY(int3)
+@@ -1271,7 +1269,144 @@ ENTRY(kernel_thread_helper)
+ CFI_ENDPROC
+ ENDPROC(kernel_thread_helper)
+
++#include <asm/alternative-asm.h>
++
++ # pv syscall call handler stub
++ENTRY(ia32pv_cstar_target)
++ RING0_INT_FRAME
++ movl $__USER_DS,16(%esp)
++ movl %ebp,%ecx
++ movl $__USER_CS,4(%esp)
++ movl 12(%esp),%ebp
++ pushl %eax # save orig_eax
++ CFI_ADJUST_CFA_OFFSET 4
++/*
++ * Load the potential sixth argument from user stack.
++ * Careful about security.
++ */
++ cmpl $__PAGE_OFFSET-4,%ebp
++ CFI_REMEMBER_STATE
++ ja cstar_fault
++1: movl (%ebp),%ebp
++.section __ex_table,"a"
++ .align 4
++ .long 1b,cstar_fault
++.previous
++ SAVE_ALL
++ GET_THREAD_INFO(%ebp)
++ test_tif %ebp
++ jnz cstar_trace_entry
++ cmpl $nr_syscalls,%eax
++ jae cstar_badsys
++.Lcstar_call:
++ btl %eax,cstar_special
++ jc .Lcstar_special
++ call *cstar_call_table(,%eax,4)
++ movl %eax,PT_EAX(%esp) # store the return value
++.Lcstar_exit:
++ movl PT_ECX(%esp),%ecx
++ movl %ecx,PT_EBP(%esp) # put user EBP back in place
++ jmp syscall_exit
++.Lcstar_special:
++ movl PT_ECX(%esp),%ecx
++ movl %ecx,PT_EBP(%esp) # put user EBP back in place
++ jmp syscall_call
++cstar_set_tif:
++ movl $cstar_clear_tif,(%esp) # replace return address
++ LOCK_PREFIX
++ orl $_TIF_CSTAR,TI_flags(%ebp)
++ jmp *sys_call_table(,%eax,4)
++cstar_clear_tif:
++ movl %eax,PT_EAX(%esp) # store the return value
++ LOCK_PREFIX
++ andl $~_TIF_CSTAR,TI_flags(%ebp)
++ jmp .Lcstar_exit
++cstar_trace_entry:
++ movl $-ENOSYS,PT_EAX(%esp)
++ cmpl $nr_syscalls,%eax
++ jae 1f
++ btl %eax,cstar_special
++ jc .Lcstar_trace_special
++1: movl %esp,%eax
++ xorl %edx,%edx
++ LOCK_PREFIX
++ orl $_TIF_CSTAR,TI_flags(%ebp)
++ call do_syscall_trace
++ LOCK_PREFIX
++ andl $~_TIF_CSTAR,TI_flags(%ebp)
++ testl %eax,%eax
++ jne .Lcstar_resume # ret != 0 -> running under PTRACE_SYSEMU,
++ # so must skip actual syscall
++ movl PT_ORIG_EAX(%esp),%eax
++ cmpl $nr_syscalls,%eax
++ jb .Lcstar_call
++ jmp .Lcstar_exit
++.Lcstar_trace_special:
++ movl PT_ECX(%esp),%ecx
++ movl %esp,%eax
++ xorl %edx,%edx
++ movl %ecx,PT_EBP(%esp) # put user EBP back in place
++ call do_syscall_trace
++ testl %eax,%eax
++ jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU,
++ # so must skip actual syscall
++ movl PT_ORIG_EAX(%esp),%eax
++ cmpl $nr_syscalls,%eax
++ jb syscall_call
++ jmp syscall_exit
++cstar_badsys:
++ movl $-ENOSYS,PT_EAX(%esp)
++.Lcstar_resume:
++ movl PT_ECX(%esp),%ecx
++ movl %ecx,PT_EBP(%esp) # put user EBP back in place
++ jmp resume_userspace
++ CFI_RESTORE_STATE
++cstar_fault:
++ movl $-EFAULT,%eax
++ SAVE_ALL
++ GET_THREAD_INFO(%ebp)
++ jmp .Lcstar_resume
++ CFI_ENDPROC
++ENDPROC(ia32pv_cstar_target)
++
++ENTRY(cstar_ret_from_fork)
++ CFI_STARTPROC
++ movl PT_ECX(%esp),%ecx
++ GET_THREAD_INFO(%ebp)
++ movl %ecx,PT_EBP(%esp) # put user EBP back in place
++ LOCK_PREFIX
++ andl $~_TIF_CSTAR,TI_flags(%ebp)
++ jmp ret_from_fork
++ CFI_ENDPROC
++END(cstar_ret_from_fork)
++
+ .section .rodata,"a"
+ #include "syscall_table_32.S"
+
+ syscall_table_size=(.-sys_call_table)
++
++#include <asm/unistd.h>
++cstar_special:
++nr=0
++mask=0
++.rept nr_syscalls+31
++ .irp n, __NR_sigreturn, __NR_rt_sigreturn
++ .if nr == \n
++ mask = mask | (1 << (\n & 31))
++ .endif
++ .endr
++ nr = nr + 1
++ .if (nr & 31) == 0
++ .long mask
++ mask = 0
++ .endif
++.endr
++#define sys_call_table cstar_call_table
++#define sys_fork cstar_set_tif
++#define sys_clone cstar_set_tif
++#define sys_vfork cstar_set_tif
++#include "syscall_table_32.S"
++#undef sys_call_table
++#undef sys_fork
++#undef sys_clone
++#undef sys_vfork
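+
+The cstar_special table emitted by the .rept block above is a bitmap with one
+32-bit word per 32 syscall numbers, and the btl in .Lcstar_call indexes
+straight into it. A C sketch of the same computation; 119 and 173 are the
+i386 __NR_sigreturn/__NR_rt_sigreturn numbers, while the table size used here
+is only an assumption:
+
+#include <stdio.h>
+#include <stdint.h>
+
+int main(void)
+{
+	enum { NR_SYSCALLS = 325 };		/* assumed for illustration */
+	static const int special[] = { 119, 173 };
+	uint32_t mask[(NR_SYSCALLS + 31) / 32] = { 0 };
+	unsigned int i;
+
+	for (i = 0; i < sizeof(special) / sizeof(special[0]); i++)
+		mask[special[i] / 32] |= 1u << (special[i] % 32);
+
+	for (i = 0; i < sizeof(mask) / sizeof(mask[0]); i++)
+		printf("word %2u: %#010x\n", i, mask[i]);
+	return 0;
+}
+
+Word 3 comes out as 0x00800000 (bit 23, syscall 119) and word 5 as 0x00002000
+(bit 13, syscall 173); every other word is zero.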
+--- sle11-2009-06-29.orig/arch/x86/kernel/entry_64-xen.S 2009-02-16 16:18:36.000000000 +0100
++++ sle11-2009-06-29/arch/x86/kernel/entry_64-xen.S 2009-03-16 16:33:40.000000000 +0100
+@@ -54,17 +54,22 @@
+ #include <asm/page.h>
+ #include <asm/irqflags.h>
+ #include <asm/errno.h>
+-#include <xen/interface/arch-x86_64.h>
++#include <xen/interface/xen.h>
+ #include <xen/interface/features.h>
+
+-#include "xen_entry_64.S"
+-
+ .code64
+
+ #ifndef CONFIG_PREEMPT
+ #define retint_kernel retint_restore_args
+ #endif
+
++#ifdef CONFIG_PARAVIRT
++ENTRY(native_irq_enable_syscall_ret)
++ movq %gs:pda_oldrsp,%rsp
++ swapgs
++ sysretq
++#endif /* CONFIG_PARAVIRT */
++
+
+ .macro TRACE_IRQS_IRETQ offset=ARGOFFSET
+ #ifdef CONFIG_TRACE_IRQFLAGS
+@@ -277,7 +282,7 @@ ret_from_sys_call:
+ sysret_check:
+ LOCKDEP_SYS_EXIT
+ GET_THREAD_INFO(%rcx)
+- XEN_BLOCK_EVENTS(%rsi)
++ DISABLE_INTERRUPTS(CLBR_NONE)
+ TRACE_IRQS_OFF
+ movl threadinfo_flags(%rcx),%edx
+ andl %edi,%edx
+@@ -287,7 +292,7 @@ sysret_check:
+ * sysretq will re-enable interrupts:
+ */
+ TRACE_IRQS_ON
+- XEN_UNBLOCK_EVENTS(%rsi)
++ ENABLE_INTERRUPTS(CLBR_NONE)
+ RESTORE_ARGS 0,8,0
+ HYPERVISOR_IRET VGCF_IN_SYSCALL
+
+@@ -298,7 +303,7 @@ sysret_careful:
+ bt $TIF_NEED_RESCHED,%edx
+ jnc sysret_signal
+ TRACE_IRQS_ON
+- XEN_UNBLOCK_EVENTS(%rsi)
++ ENABLE_INTERRUPTS(CLBR_NONE)
+ pushq %rdi
+ CFI_ADJUST_CFA_OFFSET 8
+ call schedule
+@@ -309,9 +314,8 @@ sysret_careful:
+ /* Handle a signal */
+ sysret_signal:
+ TRACE_IRQS_ON
+-/* sti */
+- XEN_UNBLOCK_EVENTS(%rsi)
+- testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx
++ ENABLE_INTERRUPTS(CLBR_NONE)
++ testl $_TIF_DO_NOTIFY_MASK,%edx
+ jz 1f
+
+ /* Really a signal */
+@@ -323,7 +327,7 @@ sysret_signal:
+ 1: movl $_TIF_NEED_RESCHED,%edi
+ /* Use IRET because user could have changed frame. This
+ works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
+- XEN_BLOCK_EVENTS(%rsi)
++ DISABLE_INTERRUPTS(CLBR_NONE)
+ TRACE_IRQS_OFF
+ jmp int_with_check
+
+@@ -355,7 +359,7 @@ tracesys:
+ */
+ .globl int_ret_from_sys_call
+ int_ret_from_sys_call:
+- XEN_BLOCK_EVENTS(%rsi)
++ DISABLE_INTERRUPTS(CLBR_NONE)
+ TRACE_IRQS_OFF
+ testb $3,CS-ARGOFFSET(%rsp)
+ jnz 1f
+@@ -381,22 +385,20 @@ int_careful:
+ bt $TIF_NEED_RESCHED,%edx
+ jnc int_very_careful
+ TRACE_IRQS_ON
+-/* sti */
+- XEN_UNBLOCK_EVENTS(%rsi)
++ ENABLE_INTERRUPTS(CLBR_NONE)
+ pushq %rdi
+ CFI_ADJUST_CFA_OFFSET 8
+ call schedule
+ popq %rdi
+ CFI_ADJUST_CFA_OFFSET -8
+- XEN_BLOCK_EVENTS(%rsi)
++ DISABLE_INTERRUPTS(CLBR_NONE)
+ TRACE_IRQS_OFF
+ jmp int_with_check
+
+ /* handle signals and tracing -- both require a full stack frame */
+ int_very_careful:
+ TRACE_IRQS_ON
+-/* sti */
+- XEN_UNBLOCK_EVENTS(%rsi)
++ ENABLE_INTERRUPTS(CLBR_NONE)
+ SAVE_REST
+ /* Check for syscall exit trace */
+ testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx
+@@ -411,7 +413,7 @@ int_very_careful:
+ jmp int_restore_rest
+
+ int_signal:
+- testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx
++ testl $_TIF_DO_NOTIFY_MASK,%edx
+ jz 1f
+ movq %rsp,%rdi # &ptregs -> arg1
+ xorl %esi,%esi # oldset -> arg2
+@@ -419,7 +421,7 @@ int_signal:
+ 1: movl $_TIF_NEED_RESCHED,%edi
+ int_restore_rest:
+ RESTORE_REST
+- XEN_BLOCK_EVENTS(%rsi)
++ DISABLE_INTERRUPTS(CLBR_NONE)
+ TRACE_IRQS_OFF
+ jmp int_with_check
+ CFI_ENDPROC
+@@ -474,6 +476,7 @@ ENTRY(stub_execve)
+ CFI_REGISTER rip, r11
+ SAVE_REST
+ FIXUP_TOP_OF_STACK %r11
++ movq %rsp, %rcx
+ call sys_execve
+ RESTORE_TOP_OF_STACK %r11
+ movq %rax,RAX(%rsp)
+@@ -526,11 +529,10 @@ retint_check:
+ retint_restore_args: /* return to kernel space */
+ movl EFLAGS-REST_SKIP(%rsp), %eax
+ shr $9, %eax # EAX[0] == IRET_EFLAGS.IF
+- XEN_GET_VCPU_INFO(%rsi)
++ GET_VCPU_INFO
+ andb evtchn_upcall_mask(%rsi),%al
+ andb $1,%al # EAX[0] == IRET_EFLAGS.IF & event_mask
+ jnz restore_all_enable_events # != 0 => enable event delivery
+- XEN_PUT_VCPU_INFO(%rsi)
+
+ RESTORE_ARGS 0,8,0
+ HYPERVISOR_IRET 0
+@@ -541,31 +543,29 @@ retint_careful:
+ bt $TIF_NEED_RESCHED,%edx
+ jnc retint_signal
+ TRACE_IRQS_ON
+- XEN_UNBLOCK_EVENTS(%rsi)
+-/* sti */
++ ENABLE_INTERRUPTS(CLBR_NONE)
+ pushq %rdi
+ CFI_ADJUST_CFA_OFFSET 8
+ call schedule
+ popq %rdi
+ CFI_ADJUST_CFA_OFFSET -8
+ GET_THREAD_INFO(%rcx)
+- XEN_BLOCK_EVENTS(%rsi)
+-/* cli */
++ DISABLE_INTERRUPTS(CLBR_NONE)
+ TRACE_IRQS_OFF
+ jmp retint_check
+
+ retint_signal:
+- testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx
++ testl $_TIF_DO_NOTIFY_MASK,%edx
+ jz retint_restore_args
+ TRACE_IRQS_ON
+- XEN_UNBLOCK_EVENTS(%rsi)
++ ENABLE_INTERRUPTS(CLBR_NONE)
+ SAVE_REST
+ movq $-1,ORIG_RAX(%rsp)
+ xorl %esi,%esi # oldset
+ movq %rsp,%rdi # &pt_regs
+ call do_notify_resume
+ RESTORE_REST
+- XEN_BLOCK_EVENTS(%rsi)
++ DISABLE_INTERRUPTS(CLBR_NONE)
+ TRACE_IRQS_OFF
+ movl $_TIF_NEED_RESCHED,%edi
+ GET_THREAD_INFO(%rcx)
+@@ -702,7 +702,7 @@ END(spurious_interrupt)
+ rdmsr
+ testl %edx,%edx
+ js 1f
+- swapgs
++ SWAPGS
+ xorl %ebx,%ebx
+ 1:
+ #endif
+@@ -719,8 +719,7 @@ END(spurious_interrupt)
+ .if \ist
+ addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
+ .endif
+-/* cli */
+- XEN_BLOCK_EVENTS(%rsi)
++ DISABLE_INTERRUPTS(CLBR_NONE)
+ .if \irqtrace
+ TRACE_IRQS_OFF
+ .endif
+@@ -749,10 +748,10 @@ paranoid_swapgs\trace:
+ .if \trace
+ TRACE_IRQS_IRETQ 0
+ .endif
+- swapgs
++ SWAPGS_UNSAFE_STACK
+ paranoid_restore\trace:
+ RESTORE_ALL 8
+- iretq
++ jmp irq_return
+ paranoid_userspace\trace:
+ GET_THREAD_INFO(%rcx)
+ movl threadinfo_flags(%rcx),%ebx
+@@ -767,11 +766,11 @@ paranoid_userspace\trace:
+ .if \trace
+ TRACE_IRQS_ON
+ .endif
+- sti
++ ENABLE_INTERRUPTS(CLBR_NONE)
+ xorl %esi,%esi /* arg2: oldset */
+ movq %rsp,%rdi /* arg1: &pt_regs */
+ call do_notify_resume
+- cli
++ DISABLE_INTERRUPTS(CLBR_NONE)
+ .if \trace
+ TRACE_IRQS_OFF
+ .endif
+@@ -780,9 +779,9 @@ paranoid_schedule\trace:
+ .if \trace
+ TRACE_IRQS_ON
+ .endif
+- sti
++ ENABLE_INTERRUPTS(CLBR_ANY)
+ call schedule
+- cli
++ DISABLE_INTERRUPTS(CLBR_ANY)
+ .if \trace
+ TRACE_IRQS_OFF
+ .endif
+@@ -846,8 +845,7 @@ error_call_handler:
+ call *%rax
+ error_exit:
+ RESTORE_REST
+-/* cli */
+- XEN_BLOCK_EVENTS(%rsi)
++ DISABLE_INTERRUPTS(CLBR_NONE)
+ TRACE_IRQS_OFF
+ GET_THREAD_INFO(%rcx)
+ testb $3,CS-ARGOFFSET(%rsp)
+@@ -875,7 +873,7 @@ error_kernelspace:
+ iret run with kernel gs again, so don't set the user space flag.
+ B stepping K8s sometimes report a truncated RIP for IRET
+ exceptions returning to compat mode. Check for these here too. */
+- leaq iret_label(%rip),%rbp
++ leaq irq_return(%rip),%rbp
+ cmpq %rbp,RIP(%rsp)
+ je error_swapgs
+ movl %ebp,%ebp /* zero extend */
+@@ -930,19 +928,17 @@ END(do_hypervisor_callback)
+ restore_all_enable_events:
+ CFI_DEFAULT_STACK adj=1
+ TRACE_IRQS_ON
+- XEN_UNBLOCK_EVENTS(%rsi) # %rsi is already set up...
++ __ENABLE_INTERRUPTS
+
+ scrit: /**** START OF CRITICAL REGION ****/
+- XEN_TEST_PENDING(%rsi)
++ __TEST_PENDING
+ CFI_REMEMBER_STATE
+ jnz 14f # process more events if necessary...
+- XEN_PUT_VCPU_INFO(%rsi)
+ RESTORE_ARGS 0,8,0
+ HYPERVISOR_IRET 0
+
+ CFI_RESTORE_STATE
+-14: XEN_LOCKED_BLOCK_EVENTS(%rsi)
+- XEN_PUT_VCPU_INFO(%rsi)
++14: __DISABLE_INTERRUPTS
+ SAVE_REST
+ movq %rsp,%rdi # set the argument again
+ jmp 11b
+@@ -1086,15 +1082,16 @@ ENDPROC(child_rip)
+ * rdi: name, rsi: argv, rdx: envp
+ *
+ * We want to fallback into:
+- * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs regs)
++ * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs *regs)
+ *
+ * do_sys_execve asm fallback arguments:
+- * rdi: name, rsi: argv, rdx: envp, fake frame on the stack
++ * rdi: name, rsi: argv, rdx: envp, rcx: fake frame on the stack
+ */
+ ENTRY(kernel_execve)
+ CFI_STARTPROC
+ FAKE_STACK_FRAME $0
+ SAVE_ALL
++ movq %rsp,%rcx
+ call sys_execve
+ movq %rax, RAX(%rsp)
+ RESTORE_REST
+@@ -1144,7 +1141,7 @@ do_nmi_callback:
+ call do_nmi
+ orl $NMI_MASK,EFLAGS(%rsp)
+ RESTORE_REST
+- XEN_BLOCK_EVENTS(%rsi)
++ DISABLE_INTERRUPTS(CLBR_NONE)
+ TRACE_IRQS_OFF
+ GET_THREAD_INFO(%rcx)
+ jmp retint_restore_args
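+
+The conversions above from XEN_BLOCK_EVENTS/XEN_UNBLOCK_EVENTS to the common
+DISABLE_INTERRUPTS/ENABLE_INTERRUPTS macros go through the paravirt hooks; in
+a Xen PV guest these toggle the per-vCPU event mask instead of executing
+cli/sti. A rough C model under that assumption (reduced types, not the actual
+macro expansion):
+
+#include <stdio.h>
+
+struct vcpu_info {			/* stand-in for the Xen shared info */
+	unsigned char evtchn_upcall_pending;
+	unsigned char evtchn_upcall_mask;
+};
+
+static void xen_irq_disable(struct vcpu_info *v)
+{
+	v->evtchn_upcall_mask = 1;	/* block event (interrupt) delivery */
+	__asm__ __volatile__("" ::: "memory");
+}
+
+static void xen_irq_enable(struct vcpu_info *v)
+{
+	v->evtchn_upcall_mask = 0;
+	__asm__ __volatile__("" ::: "memory");
+	if (v->evtchn_upcall_pending)	/* events arrived while masked */
+		printf("would force an upcall here\n");
+}
+
+int main(void)
+{
+	struct vcpu_info v = { 1, 0 };
+
+	xen_irq_disable(&v);
+	xen_irq_enable(&v);
+	return 0;
+}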
+--- sle11-2009-06-29.orig/arch/x86/kernel/fixup.c 2009-06-29 15:14:52.000000000 +0200
++++ sle11-2009-06-29/arch/x86/kernel/fixup.c 2009-03-16 16:33:40.000000000 +0100
+@@ -36,7 +36,7 @@
+
+ #define DP(_f, _args...) printk(KERN_ALERT " " _f "\n" , ## _args )
+
+-fastcall void do_fixup_4gb_segment(struct pt_regs *regs, long error_code)
++void do_fixup_4gb_segment(struct pt_regs *regs, long error_code)
+ {
+ static unsigned long printed = 0;
+ char info[100];
+--- sle11-2009-06-29.orig/arch/x86/kernel/genapic_64-xen.c 2009-02-16 16:18:36.000000000 +0100
++++ sle11-2009-06-29/arch/x86/kernel/genapic_64-xen.c 2009-03-16 16:33:40.000000000 +0100
+@@ -24,20 +24,13 @@
+ #include <acpi/acpi_bus.h>
+ #endif
+
+-/*
+- * which logical CPU number maps to which CPU (physical APIC ID)
+- *
+- * The following static array is used during kernel startup
+- * and the x86_cpu_to_apicid_ptr contains the address of the
+- * array during this time. Is it zeroed when the per_cpu
+- * data area is removed.
+- */
++/* which logical CPU number maps to which CPU (physical APIC ID) */
+ #ifndef CONFIG_XEN
+-u8 x86_cpu_to_apicid_init[NR_CPUS] __initdata
++u16 x86_cpu_to_apicid_init[NR_CPUS] __initdata
+ = { [0 ... NR_CPUS-1] = BAD_APICID };
+-void *x86_cpu_to_apicid_ptr;
++void *x86_cpu_to_apicid_early_ptr;
+ #endif
+-DEFINE_PER_CPU(u8, x86_cpu_to_apicid) = BAD_APICID;
++DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID;
+ EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid);
+
+ #ifndef CONFIG_XEN
+--- sle11-2009-06-29.orig/arch/x86/kernel/head64-xen.c 2009-02-16 16:18:36.000000000 +0100
++++ sle11-2009-06-29/arch/x86/kernel/head64-xen.c 2009-03-16 16:33:40.000000000 +0100
+@@ -16,6 +16,7 @@
+ #include <linux/kernel.h>
+ #include <linux/string.h>
+ #include <linux/percpu.h>
++#include <linux/start_kernel.h>
+ #include <linux/module.h>
+
+ #include <asm/processor.h>
+@@ -26,6 +27,8 @@
+ #include <asm/pgtable.h>
+ #include <asm/tlbflush.h>
+ #include <asm/sections.h>
++#include <asm/kdebug.h>
++#include <asm/e820.h>
+
+ unsigned long start_pfn;
+
+@@ -34,7 +37,7 @@ static void __init zap_identity_mappings
+ {
+ pgd_t *pgd = pgd_offset_k(0UL);
+ pgd_clear(pgd);
+- __flush_tlb();
++ __flush_tlb_all();
+ }
+
+ /* Don't add a printk in there. printk relies on the PDA which is not initialized
+@@ -72,6 +75,37 @@ EXPORT_SYMBOL(machine_to_phys_mapping);
+ unsigned int machine_to_phys_order;
+ EXPORT_SYMBOL(machine_to_phys_order);
+
++#define EBDA_ADDR_POINTER 0x40E
++
++static __init void reserve_ebda(void)
++{
++#ifndef CONFIG_XEN
++ unsigned ebda_addr, ebda_size;
++
++ /*
++ * there is a real-mode segmented pointer pointing to the
++ * 4K EBDA area at 0x40E
++ */
++ ebda_addr = *(unsigned short *)__va(EBDA_ADDR_POINTER);
++ ebda_addr <<= 4;
++
++ if (!ebda_addr)
++ return;
++
++ ebda_size = *(unsigned short *)__va(ebda_addr);
++
++ /* Round EBDA up to pages */
++ if (ebda_size == 0)
++ ebda_size = 1;
++ ebda_size <<= 10;
++ ebda_size = round_up(ebda_size + (ebda_addr & ~PAGE_MASK), PAGE_SIZE);
++ if (ebda_size > 64*1024)
++ ebda_size = 64*1024;
++
++ reserve_early(ebda_addr, ebda_addr + ebda_size, "EBDA");
++#endif
++}
++
+ void __init x86_64_start_kernel(char * real_mode_data)
+ {
+ struct xen_machphys_mapping mapping;
+@@ -103,8 +137,16 @@ void __init x86_64_start_kernel(char * r
+ /* Make NULL pointers segfault */
+ zap_identity_mappings();
+
+- for (i = 0; i < IDT_ENTRIES; i++)
++ /* Cleanup the over mapped high alias */
++ cleanup_highmap();
++
++ for (i = 0; i < IDT_ENTRIES; i++) {
++#ifdef CONFIG_EARLY_PRINTK
++ set_intr_gate(i, &early_idt_handlers[i]);
++#else
+ set_intr_gate(i, early_idt_handler);
++#endif
++ }
+ load_idt((const struct desc_ptr *)&idt_descr);
+ #endif
+
+@@ -115,8 +157,19 @@ void __init x86_64_start_kernel(char * r
+
+ pda_init(0);
+ copy_bootdata(__va(real_mode_data));
+-#ifdef CONFIG_SMP
+- cpu_set(0, cpu_online_map);
+-#endif
++
++ reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
++
++ reserve_early(round_up(__pa_symbol(&_end), PAGE_SIZE),
++ start_pfn << PAGE_SHIFT, "Xen provided");
++
++ reserve_ebda();
++
++ /*
++ * At this point everything still needed from the boot loader
++ * or BIOS or kernel text should be early reserved or marked not
++ * RAM in e820. All other memory is free game.
++ */
++
+ start_kernel();
+ }
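+
+reserve_ebda() above decodes the classic real-mode pointer: the 16-bit
+segment value stored at 0x40E, shifted left four bits, is the EBDA's physical
+address, and the first 16-bit word of the EBDA holds its size in KiB. A
+worked example with made-up BDA contents:
+
+#include <stdio.h>
+
+int main(void)
+{
+	unsigned ebda_segment = 0x9FC0;		/* hypothetical *(u16 *)0x40E */
+	unsigned ebda_addr = ebda_segment << 4;	/* -> physical 0x9FC00 */
+	unsigned ebda_size = 1 << 10;		/* size word said 1 KiB */
+
+	/* round to whole pages as reserve_ebda() does (it also caps at 64K) */
+	ebda_size = (ebda_size + (ebda_addr & 0xfff) + 0xfff) & ~0xfffu;
+	printf("reserve_early(%#x, %#x, \"EBDA\")\n",
+	       ebda_addr, ebda_addr + ebda_size);	/* one 4 KiB page */
+	return 0;
+}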
+--- sle11-2009-06-29.orig/arch/x86/kernel/head_32-xen.S 2009-02-16 16:17:21.000000000 +0100
++++ sle11-2009-06-29/arch/x86/kernel/head_32-xen.S 2009-03-16 16:33:40.000000000 +0100
+@@ -3,6 +3,7 @@
+ .text
+ #include <linux/elfnote.h>
+ #include <linux/threads.h>
++#include <linux/init.h>
+ #include <linux/linkage.h>
+ #include <asm/segment.h>
+ #include <asm/page.h>
+@@ -88,7 +89,7 @@ ENTRY(_stext)
+ */
+ .section ".bss.page_aligned","wa"
+ .align PAGE_SIZE_asm
+-ENTRY(swapper_pg_pmd)
++ENTRY(swapper_pg_fixmap)
+ .fill 1024,4,0
+ ENTRY(empty_zero_page)
+ .fill 4096,1,0
+--- sle11-2009-06-29.orig/arch/x86/kernel/init_task-xen.c 2009-02-16 16:18:36.000000000 +0100
++++ sle11-2009-06-29/arch/x86/kernel/init_task-xen.c 2009-03-16 16:33:40.000000000 +0100
+@@ -19,7 +19,7 @@ static struct sighand_struct init_sighan
+ #endif
+ struct mm_struct init_mm = INIT_MM(init_mm);
+ #undef swapper_pg_dir
+-EXPORT_SYMBOL(init_mm);
++EXPORT_UNUSED_SYMBOL(init_mm); /* will be removed in 2.6.26 */
+
+ /*
+ * Initial thread structure.
+--- sle11-2009-06-29.orig/arch/x86/kernel/io_apic_32-xen.c 2009-02-16 16:18:36.000000000 +0100
++++ sle11-2009-06-29/arch/x86/kernel/io_apic_32-xen.c 2009-03-16 16:33:40.000000000 +0100
+@@ -35,6 +35,7 @@
+ #include <linux/htirq.h>
+ #include <linux/freezer.h>
+ #include <linux/kthread.h>
++#include <linux/jiffies.h> /* time_after() */
+
+ #include <asm/io.h>
+ #include <asm/smp.h>
+@@ -48,8 +49,6 @@
+ #include <mach_apic.h>
+ #include <mach_apicdef.h>
+
+-#include "io_ports.h"
+-
+ #ifdef CONFIG_XEN
+ #include <xen/interface/xen.h>
+ #include <xen/interface/physdev.h>
+@@ -400,7 +399,7 @@ static void set_ioapic_affinity_irq(unsi
+ # include <asm/processor.h> /* kernel_thread() */
+ # include <linux/kernel_stat.h> /* kstat */
+ # include <linux/slab.h> /* kmalloc() */
+-# include <linux/timer.h> /* time_after() */
++# include <linux/timer.h>
+
+ #define IRQBALANCE_CHECK_ARCH -999
+ #define MAX_BALANCED_IRQ_INTERVAL (5*HZ)
+@@ -777,7 +776,7 @@ late_initcall(balanced_irq_init);
+ #endif
+
+ #ifndef CONFIG_SMP
+-void fastcall send_IPI_self(int vector)
++void send_IPI_self(int vector)
+ {
+ #ifndef CONFIG_XEN
+ unsigned int cfg;
+@@ -1959,7 +1958,7 @@ static int __init timer_irq_works(void)
+ * might have cached one ExtINT interrupt. Finally, at
+ * least one tick may be lost due to delays.
+ */
+- if (jiffies - t1 > 4)
++ if (time_after(jiffies, t1 + 4))
+ return 1;
+
+ return 0;
+@@ -2142,7 +2141,7 @@ static struct irq_chip lapic_chip __read
+ .eoi = ack_apic,
+ };
+
+-static void setup_nmi (void)
++static void __init setup_nmi(void)
+ {
+ /*
+ * Dirty trick to enable the NMI watchdog ...
+@@ -2155,7 +2154,7 @@ static void setup_nmi (void)
+ */
+ apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
+
+- on_each_cpu(enable_NMI_through_LVT0, NULL, 1, 1);
++ enable_NMI_through_LVT0();
+
+ apic_printk(APIC_VERBOSE, " done.\n");
+ }
+@@ -2479,7 +2478,7 @@ static int ioapic_resume(struct sys_devi
+ }
+
+ static struct sysdev_class ioapic_sysdev_class = {
+- set_kset_name("ioapic"),
++ .name = "ioapic",
+ .suspend = ioapic_suspend,
+ .resume = ioapic_resume,
+ };
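+
+Both timer_irq_works() hunks replace the open-coded "jiffies - t1 > 4" with
+time_after(), the standard wrap-safe comparison idiom from <linux/jiffies.h>.
+A standalone model of the idiom (not the header's implementation verbatim):
+
+#include <stdio.h>
+
+#define my_time_after(a, b) ((long)(b) - (long)(a) < 0)
+
+int main(void)
+{
+	unsigned long t1 = (unsigned long)-2;	/* two ticks before the wrap */
+	unsigned long now = 3;			/* five ticks later */
+
+	/* the signed difference stays correct across the counter wrap */
+	printf("more than 4 ticks elapsed: %d\n", my_time_after(now, t1 + 4));
+	return 0;
+}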
+--- sle11-2009-06-29.orig/arch/x86/kernel/io_apic_64-xen.c 2009-02-16 16:18:36.000000000 +0100
++++ sle11-2009-06-29/arch/x86/kernel/io_apic_64-xen.c 2009-03-16 16:33:40.000000000 +0100
+@@ -32,9 +32,11 @@
+ #include <linux/msi.h>
+ #include <linux/htirq.h>
+ #include <linux/dmar.h>
++#include <linux/jiffies.h>
+ #ifdef CONFIG_ACPI
+ #include <acpi/acpi_bus.h>
+ #endif
++#include <linux/bootmem.h>
+
+ #include <asm/idle.h>
+ #include <asm/io.h>
+@@ -1064,7 +1066,7 @@ void __apicdebuginit print_local_APIC(vo
+ v = apic_read(APIC_LVR);
+ printk(KERN_INFO "... APIC VERSION: %08x\n", v);
+ ver = GET_APIC_VERSION(v);
+- maxlvt = get_maxlvt();
++ maxlvt = lapic_get_maxlvt();
+
+ v = apic_read(APIC_TASKPRI);
+ printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
+@@ -1165,7 +1167,7 @@ void __apicdebuginit print_PIC(void)
+ }
+ #endif /* !CONFIG_XEN */
+
+-static void __init enable_IO_APIC(void)
++void __init enable_IO_APIC(void)
+ {
+ union IO_APIC_reg_01 reg_01;
+ #ifndef CONFIG_XEN
+@@ -1299,7 +1301,7 @@ static int __init timer_irq_works(void)
+ */
+
+ /* jiffies wrap? */
+- if (jiffies - t1 > 4)
++ if (time_after(jiffies, t1 + 4))
+ return 1;
+ return 0;
+ }
+@@ -1412,7 +1414,7 @@ static void irq_complete_move(unsigned i
+ if (likely(!cfg->move_in_progress))
+ return;
+
+- vector = ~get_irq_regs()->orig_rax;
++ vector = ~get_irq_regs()->orig_ax;
+ me = smp_processor_id();
+ if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) {
+ cpumask_t cleanup_mask;
+@@ -1439,7 +1441,7 @@ static void ack_apic_level(unsigned int
+ int do_unmask_irq = 0;
+
+ irq_complete_move(irq);
+-#if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE)
++#ifdef CONFIG_GENERIC_PENDING_IRQ
+ /* If we are moving the irq we need to mask it */
+ if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) {
+ do_unmask_irq = 1;
+@@ -1570,7 +1572,7 @@ static struct hw_interrupt_type lapic_ir
+ .end = end_lapic_irq,
+ };
+
+-static void setup_nmi (void)
++static void __init setup_nmi(void)
+ {
+ /*
+ * Dirty trick to enable the NMI watchdog ...
+@@ -1583,7 +1585,7 @@ static void setup_nmi (void)
+ */
+ printk(KERN_INFO "activating NMI Watchdog ...");
+
+- enable_NMI_through_LVT0(NULL);
++ enable_NMI_through_LVT0();
+
+ printk(" done.\n");
+ }
+@@ -1659,7 +1661,7 @@ static inline void unlock_ExtINT_logic(v
+ *
+ * FIXME: really need to revamp this for modern platforms only.
+ */
+-static inline void check_timer(void)
++static inline void __init check_timer(void)
+ {
+ struct irq_cfg *cfg = irq_cfg + 0;
+ int apic1, pin1, apic2, pin2;
+@@ -1863,7 +1865,7 @@ static int ioapic_resume(struct sys_devi
+ }
+
+ static struct sysdev_class ioapic_sysdev_class = {
+- set_kset_name("ioapic"),
++ .name = "ioapic",
+ .suspend = ioapic_suspend,
+ .resume = ioapic_resume,
+ };
+@@ -2303,5 +2305,93 @@ void __init setup_ioapic_dest(void)
+ }
+ }
+ #endif
+-#endif /* !CONFIG_XEN */
+
++#define IOAPIC_RESOURCE_NAME_SIZE 11
++
++static struct resource *ioapic_resources;
++
++static struct resource * __init ioapic_setup_resources(void)
++{
++ unsigned long n;
++ struct resource *res;
++ char *mem;
++ int i;
++
++ if (nr_ioapics <= 0)
++ return NULL;
++
++ n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource);
++ n *= nr_ioapics;
++
++ mem = alloc_bootmem(n);
++ res = (void *)mem;
++
++ if (mem != NULL) {
++ memset(mem, 0, n);
++ mem += sizeof(struct resource) * nr_ioapics;
++
++ for (i = 0; i < nr_ioapics; i++) {
++ res[i].name = mem;
++ res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY;
++ sprintf(mem, "IOAPIC %u", i);
++ mem += IOAPIC_RESOURCE_NAME_SIZE;
++ }
++ }
++
++ ioapic_resources = res;
++
++ return res;
++}
++
++void __init ioapic_init_mappings(void)
++{
++ unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
++ struct resource *ioapic_res;
++ int i;
++
++ ioapic_res = ioapic_setup_resources();
++ for (i = 0; i < nr_ioapics; i++) {
++ if (smp_found_config) {
++ ioapic_phys = mp_ioapics[i].mpc_apicaddr;
++ } else {
++ ioapic_phys = (unsigned long)
++ alloc_bootmem_pages(PAGE_SIZE);
++ ioapic_phys = __pa(ioapic_phys);
++ }
++ set_fixmap_nocache(idx, ioapic_phys);
++ apic_printk(APIC_VERBOSE,
++ "mapped IOAPIC to %016lx (%016lx)\n",
++ __fix_to_virt(idx), ioapic_phys);
++ idx++;
++
++ if (ioapic_res != NULL) {
++ ioapic_res->start = ioapic_phys;
++ ioapic_res->end = ioapic_phys + (4 * 1024) - 1;
++ ioapic_res++;
++ }
++ }
++}
++
++static int __init ioapic_insert_resources(void)
++{
++ int i;
++ struct resource *r = ioapic_resources;
++
++ if (!r) {
++ printk(KERN_ERR
++ "IO APIC resources could be not be allocated.\n");
++ return -1;
++ }
++
++ for (i = 0; i < nr_ioapics; i++) {
++ insert_resource(&iomem_resource, r);
++ r++;
++ }
++
++ return 0;
++}
++
++/* Insert the IO APIC resources after PCI initialization has occurred to handle
++ * IO APICS that are mapped in on a BAR in PCI space. */
++late_initcall(ioapic_insert_resources);
++#endif /* !CONFIG_XEN */
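+
+ioapic_setup_resources() above makes one bootmem allocation and splits it by
+hand: the array of struct resource comes first, followed by the 11-byte name
+strings the entries point at. The same layout modelled with malloc and a
+hypothetical IO-APIC count:
+
+#include <stdio.h>
+#include <stdlib.h>
+
+struct resource { const char *name; };	/* reduced stand-in */
+#define NAME_SIZE 11			/* "IOAPIC NNN" plus NUL */
+
+int main(void)
+{
+	int i, nr_ioapics = 2;
+	char *base = calloc(nr_ioapics, sizeof(struct resource) + NAME_SIZE);
+	struct resource *res = (void *)base;
+	char *name = base + sizeof(struct resource) * nr_ioapics;
+
+	if (!base)
+		return 1;
+	for (i = 0; i < nr_ioapics; i++) {
+		res[i].name = name;	/* points into the trailing string area */
+		sprintf(name, "IOAPIC %u", (unsigned)i);
+		name += NAME_SIZE;
+	}
+	for (i = 0; i < nr_ioapics; i++)
+		printf("%s\n", res[i].name);
+	free(base);
+	return 0;
+}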
+--- /dev/null 1970-01-01 00:00:00.000000000 +0000
++++ sle11-2009-06-29/arch/x86/kernel/ioport-xen.c 2009-03-16 16:33:40.000000000 +0100
+@@ -0,0 +1,112 @@
++/*
++ * This contains the io-permission bitmap code - written by obz, with changes
++ * by Linus. 32/64 bits code unification by Miguel Botón.
++ */
++
++#include <linux/sched.h>
++#include <linux/kernel.h>
++#include <linux/capability.h>
++#include <linux/errno.h>
++#include <linux/types.h>
++#include <linux/ioport.h>
++#include <linux/smp.h>
++#include <linux/stddef.h>
++#include <linux/slab.h>
++#include <linux/thread_info.h>
++#include <linux/syscalls.h>
++#include <xen/interface/physdev.h>
++
++/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
++static void set_bitmap(unsigned long *bitmap, unsigned int base,
++ unsigned int extent, int new_value)
++{
++ unsigned int i;
++
++ for (i = base; i < base + extent; i++) {
++ if (new_value)
++ __set_bit(i, bitmap);
++ else
++ __clear_bit(i, bitmap);
++ }
++}
++
++/*
++ * this changes the io permissions bitmap in the current task.
++ */
++asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
++{
++	struct thread_struct * t = &current->thread;
++ struct physdev_set_iobitmap set_iobitmap;
++
++ if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
++ return -EINVAL;
++ if (turn_on && !capable(CAP_SYS_RAWIO))
++ return -EPERM;
++
++ /*
++ * If it's the first ioperm() call in this thread's lifetime, set the
++ * IO bitmap up. ioperm() is much less timing critical than clone(),
++ * this is why we delay this operation until now:
++ */
++ if (!t->io_bitmap_ptr) {
++ unsigned long *bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
++
++ if (!bitmap)
++ return -ENOMEM;
++
++ memset(bitmap, 0xff, IO_BITMAP_BYTES);
++ t->io_bitmap_ptr = bitmap;
++ set_thread_flag(TIF_IO_BITMAP);
++
++ set_xen_guest_handle(set_iobitmap.bitmap, (char *)bitmap);
++ set_iobitmap.nr_ports = IO_BITMAP_BITS;
++ WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap,
++ &set_iobitmap));
++ }
++
++ set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
++
++ return 0;
++}
++
++/*
++ * sys_iopl has to be used when you want to access the IO ports
++ * beyond the 0x3ff range: to get the full 65536 ports bitmapped
++ * you'd need 8kB of bitmaps/process, which is a bit excessive.
++ */
++static int do_iopl(unsigned int level, struct thread_struct *t)
++{
++ unsigned int old = t->iopl >> 12;
++
++ if (level > 3)
++ return -EINVAL;
++ /* Trying to gain more privileges? */
++ if (level > old) {
++ if (!capable(CAP_SYS_RAWIO))
++ return -EPERM;
++ }
++
++ return 0;
++}
++
++#ifdef CONFIG_X86_32
++asmlinkage long sys_iopl(unsigned long regsp)
++{
++	struct pt_regs *regs = (struct pt_regs *)&regsp;
++ unsigned int level = regs->bx;
++#else
++asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs)
++{
++#endif
++	struct thread_struct *t = &current->thread;
++ int rc;
++
++ rc = do_iopl(level, t);
++ if (rc < 0)
++ goto out;
++
++ t->iopl = level << 12;
++ set_iopl_mask(t->iopl);
++out:
++ return rc;
++}
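+
+From userspace the two entry points above behave as on native x86: ioperm()
+edits the per-thread bitmap for ports below 0x400, while iopl(3) lifts the
+restriction wholesale. A usage sketch (needs CAP_SYS_RAWIO; the port choice
+is only illustrative):
+
+#include <stdio.h>
+#include <sys/io.h>
+
+int main(void)
+{
+	/* grant this thread the two RTC/CMOS ports at 0x70 and 0x71 */
+	if (ioperm(0x70, 2, 1) != 0) {
+		perror("ioperm");
+		return 1;
+	}
+	/* or raise IOPL to reach ports beyond 0x3ff as well */
+	if (iopl(3) != 0) {
+		perror("iopl");
+		return 1;
+	}
+	printf("port access granted\n");
+	return 0;
+}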
+--- sle11-2009-06-29.orig/arch/x86/kernel/ioport_32-xen.c 2009-02-16 16:18:36.000000000 +0100
++++ /dev/null 1970-01-01 00:00:00.000000000 +0000
+@@ -1,121 +0,0 @@
+-/*
+- * This contains the io-permission bitmap code - written by obz, with changes
+- * by Linus.
+- */
+-
+-#include <linux/sched.h>
+-#include <linux/kernel.h>
+-#include <linux/capability.h>
+-#include <linux/errno.h>
+-#include <linux/types.h>
+-#include <linux/ioport.h>
+-#include <linux/smp.h>
+-#include <linux/stddef.h>
+-#include <linux/slab.h>
+-#include <linux/thread_info.h>
+-#include <linux/syscalls.h>
+-#include <xen/interface/physdev.h>
+-
+-/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
+-static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
+-{
+- unsigned long mask;
+- unsigned long *bitmap_base = bitmap + (base / BITS_PER_LONG);
+- unsigned int low_index = base & (BITS_PER_LONG-1);
+- int length = low_index + extent;
+-
+- if (low_index != 0) {
+- mask = (~0UL << low_index);
+- if (length < BITS_PER_LONG)
+- mask &= ~(~0UL << length);
+- if (new_value)
+- *bitmap_base++ |= mask;
+- else
+- *bitmap_base++ &= ~mask;
+- length -= BITS_PER_LONG;
+- }
+-
+- mask = (new_value ? ~0UL : 0UL);
+- while (length >= BITS_PER_LONG) {
+- *bitmap_base++ = mask;
+- length -= BITS_PER_LONG;
+- }
+-
+- if (length > 0) {
+- mask = ~(~0UL << length);
+- if (new_value)
+- *bitmap_base++ |= mask;
+- else
+- *bitmap_base++ &= ~mask;
+- }
+-}
+-
+-
+-/*
+- * this changes the io permissions bitmap in the current task.
+- */
+-asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
+-{
+-	struct thread_struct * t = &current->thread;
+- unsigned long *bitmap;
+- struct physdev_set_iobitmap set_iobitmap;
+-
+- if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
+- return -EINVAL;
+- if (turn_on && !capable(CAP_SYS_RAWIO))
+- return -EPERM;
+-
+- /*
+- * If it's the first ioperm() call in this thread's lifetime, set the
+- * IO bitmap up. ioperm() is much less timing critical than clone(),
+- * this is why we delay this operation until now:
+- */
+- if (!t->io_bitmap_ptr) {
+- bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
+- if (!bitmap)
+- return -ENOMEM;
+-
+- memset(bitmap, 0xff, IO_BITMAP_BYTES);
+- t->io_bitmap_ptr = bitmap;
+- set_thread_flag(TIF_IO_BITMAP);
+-
+- set_xen_guest_handle(set_iobitmap.bitmap, (char *)bitmap);
+- set_iobitmap.nr_ports = IO_BITMAP_BITS;
+- WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap,
+- &set_iobitmap));
+- }
+-
+- set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
+-
+- return 0;
+-}
+-
+-/*
+- * sys_iopl has to be used when you want to access the IO ports
+- * beyond the 0x3ff range: to get the full 65536 ports bitmapped
+- * you'd need 8kB of bitmaps/process, which is a bit excessive.
+- *
+- * Here we just change the eflags value on the stack: we allow
+- * only the super-user to do it. This depends on the stack-layout
+- * on system-call entry - see also fork() and the signal handling
+- * code.
+- */
+-
+-asmlinkage long sys_iopl(unsigned long unused)
+-{
+- volatile struct pt_regs * regs = (struct pt_regs *) &unused;
+- unsigned int level = regs->ebx;
+-	struct thread_struct *t = &current->thread;
+- unsigned int old = (t->iopl >> 12) & 3;
+-
+- if (level > 3)
+- return -EINVAL;
+- /* Trying to gain more privileges? */
+- if (level > old) {
+- if (!capable(CAP_SYS_RAWIO))
+- return -EPERM;
+- }
+- t->iopl = level << 12;
+- set_iopl_mask(t->iopl);
+- return 0;
+-}
+--- sle11-2009-06-29.orig/arch/x86/kernel/ioport_64-xen.c 2009-02-16 16:18:36.000000000 +0100
++++ /dev/null 1970-01-01 00:00:00.000000000 +0000
+@@ -1,99 +0,0 @@
+-/*
+- * This contains the io-permission bitmap code - written by obz, with changes
+- * by Linus.
+- */
+-
+-#include <linux/sched.h>
+-#include <linux/kernel.h>
+-#include <linux/capability.h>
+-#include <linux/errno.h>
+-#include <linux/types.h>
+-#include <linux/ioport.h>
+-#include <linux/mm.h>
+-#include <linux/smp.h>
+-#include <linux/stddef.h>
+-#include <linux/slab.h>
+-#include <linux/thread_info.h>
+-#include <linux/syscalls.h>
+-#include <xen/interface/physdev.h>
+-
+-/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
+-static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
+-{
+- int i;
+-
+- if (new_value)
+- for (i = base; i < base + extent; i++)
+- __set_bit(i, bitmap);
+- else
+- for (i = base; i < base + extent; i++)
+- clear_bit(i, bitmap);
+-}
+-
+-/*
+- * this changes the io permissions bitmap in the current task.
+- */
+-asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
+-{
+-	struct thread_struct * t = &current->thread;
+- unsigned long *bitmap;
+- struct physdev_set_iobitmap set_iobitmap;
+-
+- if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
+- return -EINVAL;
+- if (turn_on && !capable(CAP_SYS_RAWIO))
+- return -EPERM;
+-
+- /*
+- * If it's the first ioperm() call in this thread's lifetime, set the
+- * IO bitmap up. ioperm() is much less timing critical than clone(),
+- * this is why we delay this operation until now:
+- */
+- if (!t->io_bitmap_ptr) {
+- bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
+- if (!bitmap)
+- return -ENOMEM;
+-
+- memset(bitmap, 0xff, IO_BITMAP_BYTES);
+- t->io_bitmap_ptr = bitmap;
+- set_thread_flag(TIF_IO_BITMAP);
+-
+- set_xen_guest_handle(set_iobitmap.bitmap, (char *)bitmap);
+- set_iobitmap.nr_ports = IO_BITMAP_BITS;
+- WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap,
+- &set_iobitmap));
+- }
+-
+- set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
+-
+- return 0;
+-}
+-
+-/*
+- * sys_iopl has to be used when you want to access the IO ports
+- * beyond the 0x3ff range: to get the full 65536 ports bitmapped
+- * you'd need 8kB of bitmaps/process, which is a bit excessive.
+- *
+- */
+-
+-asmlinkage long sys_iopl(unsigned int new_iopl, struct pt_regs *regs)
+-{
+- unsigned int old_iopl = current->thread.iopl;
+- struct physdev_set_iopl set_iopl;
+-
+- if (new_iopl > 3)
+- return -EINVAL;
+-
+- /* Need "raw I/O" privileges for direct port access. */
+- if ((new_iopl > old_iopl) && !capable(CAP_SYS_RAWIO))
+- return -EPERM;
+-
+- /* Change our version of the privilege levels. */
+- current->thread.iopl = new_iopl;
+-
+- /* Force the change at ring 0. */
+- set_iopl.iopl = (new_iopl == 0) ? 1 : new_iopl;
+- WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
+-
+- return 0;
+-}
+--- sle11-2009-06-29.orig/arch/x86/kernel/irq_32-xen.c 2009-02-16 16:18:36.000000000 +0100
++++ sle11-2009-06-29/arch/x86/kernel/irq_32-xen.c 2009-03-16 16:33:40.000000000 +0100
+@@ -66,11 +66,11 @@ static union irq_ctx *softirq_ctx[NR_CPU
+ * SMP cross-CPU interrupts have their own specific
+ * handlers).
+ */
+-fastcall unsigned int do_IRQ(struct pt_regs *regs)
++unsigned int do_IRQ(struct pt_regs *regs)
+ {
+ struct pt_regs *old_regs;
+ /* high bit used in ret_from_ code */
+- int irq = ~regs->orig_eax;
++ int irq = ~regs->orig_ax;
+ struct irq_desc *desc = irq_desc + irq;
+ #ifdef CONFIG_4KSTACKS
+ union irq_ctx *curctx, *irqctx;
+@@ -88,13 +88,13 @@ fastcall unsigned int do_IRQ(struct pt_r
+ #ifdef CONFIG_DEBUG_STACKOVERFLOW
+ /* Debugging check for stack overflow: is there less than 1KB free? */
+ {
+- long esp;
++ long sp;
+
+ __asm__ __volatile__("andl %%esp,%0" :
+- "=r" (esp) : "0" (THREAD_SIZE - 1));
+- if (unlikely(esp < (sizeof(struct thread_info) + STACK_WARN))) {
++ "=r" (sp) : "0" (THREAD_SIZE - 1));
++ if (unlikely(sp < (sizeof(struct thread_info) + STACK_WARN))) {
+ printk("do_IRQ: stack overflow: %ld\n",
+- esp - sizeof(struct thread_info));
++ sp - sizeof(struct thread_info));
+ dump_stack();
+ }
+ }
+@@ -112,7 +112,7 @@ fastcall unsigned int do_IRQ(struct pt_r
+ * current stack (which is the irq stack already after all)
+ */
+ if (curctx != irqctx) {
+- int arg1, arg2, ebx;
++ int arg1, arg2, bx;
+
+ /* build the stack frame on the IRQ stack */
+ isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
+@@ -128,10 +128,10 @@ fastcall unsigned int do_IRQ(struct pt_r
+ (curctx->tinfo.preempt_count & SOFTIRQ_MASK);
+
+ asm volatile(
+- " xchgl %%ebx,%%esp \n"
+- " call *%%edi \n"
+- " movl %%ebx,%%esp \n"
+- : "=a" (arg1), "=d" (arg2), "=b" (ebx)
++ " xchgl %%ebx,%%esp \n"
++ " call *%%edi \n"
++ " movl %%ebx,%%esp \n"
++ : "=a" (arg1), "=d" (arg2), "=b" (bx)
+ : "0" (irq), "1" (desc), "2" (isp),
+ "D" (desc->handle_irq)
+ : "memory", "cc"
+--- sle11-2009-06-29.orig/arch/x86/kernel/irq_64-xen.c 2009-02-16 16:18:36.000000000 +0100
++++ sle11-2009-06-29/arch/x86/kernel/irq_64-xen.c 2009-03-16 16:33:40.000000000 +0100
+@@ -20,6 +20,28 @@
+
+ atomic_t irq_err_count;
+
++/*
++ * 'what should we do if we get a hw irq event on an illegal vector'.
++ * each architecture has to answer this themselves.
++ */
++void ack_bad_irq(unsigned int irq)
++{
++ printk(KERN_WARNING "unexpected IRQ trap at irq %02x\n", irq);
++#ifdef CONFIG_X86_LOCAL_APIC
++ /*
++ * Currently unexpected vectors happen only on SMP and APIC.
++ * We _must_ ack these because every local APIC has only N
++ * irq slots per priority level, and a 'hanging, unacked' IRQ
++ * holds up an irq slot - in excessive cases (when multiple
++ * unexpected vectors occur) that might lock up the APIC
++ * completely.
++ * But don't ack when the APIC is disabled. -AK
++ */
++ if (!disable_apic)
++ ack_APIC_irq();
++#endif
++}
++
+ #ifdef CONFIG_DEBUG_STACKOVERFLOW
+ /*
+ * Probabilistic stack overflow check:
+@@ -33,11 +55,11 @@ static inline void stack_overflow_check(
+ u64 curbase = (u64)task_stack_page(current);
+ static unsigned long warned = -60*HZ;
+
+- if (regs->rsp >= curbase && regs->rsp <= curbase + THREAD_SIZE &&
+- regs->rsp < curbase + sizeof(struct thread_info) + 128 &&
++ if (regs->sp >= curbase && regs->sp <= curbase + THREAD_SIZE &&
++ regs->sp < curbase + sizeof(struct thread_info) + 128 &&
+ time_after(jiffies, warned + 60*HZ)) {
+- printk("do_IRQ: %s near stack overflow (cur:%Lx,rsp:%lx)\n",
+- current->comm, curbase, regs->rsp);
++ printk("do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n",
++ current->comm, curbase, regs->sp);
+ show_stack(NULL,NULL);
+ warned = jiffies;
+ }
+@@ -150,7 +172,7 @@ asmlinkage unsigned int do_IRQ(struct pt
+ struct pt_regs *old_regs = set_irq_regs(regs);
+
+ /* high bit used in ret_from_ code */
+- unsigned irq = ~regs->orig_rax;
++ unsigned irq = ~regs->orig_ax;
+
+ /*exit_idle();*/
+ /*irq_enter();*/
+@@ -251,14 +273,3 @@ asmlinkage void do_softirq(void)
+ }
+ local_irq_restore(flags);
+ }
+-
+-#ifndef CONFIG_X86_LOCAL_APIC
+-/*
+- * 'what should we do if we get a hw irq event on an illegal vector'.
+- * each architecture has to answer this themselves.
+- */
+-void ack_bad_irq(unsigned int irq)
+-{
+- printk("unexpected IRQ trap at irq %02x\n", irq);
+-}
+-#endif
+--- /dev/null 1970-01-01 00:00:00.000000000 +0000
++++ sle11-2009-06-29/arch/x86/kernel/ldt-xen.c 2009-03-16 16:33:40.000000000 +0100
+@@ -0,0 +1,272 @@
++/*
++ * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
++ * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
++ * Copyright (C) 2002 Andi Kleen
++ *
++ * This handles calls from both 32bit and 64bit mode.
++ */
++
++#include <linux/errno.h>
++#include <linux/sched.h>
++#include <linux/string.h>
++#include <linux/mm.h>
++#include <linux/smp.h>
++#include <linux/vmalloc.h>
++
++#include <asm/uaccess.h>
++#include <asm/system.h>
++#include <asm/ldt.h>
++#include <asm/desc.h>
++#include <asm/mmu_context.h>
++
++#ifdef CONFIG_SMP
++static void flush_ldt(void *null)
++{
++ if (current->active_mm)
++		load_LDT(&current->active_mm->context);
++}
++#endif
++
++static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
++{
++ void *oldldt, *newldt;
++ int oldsize;
++
++ if (mincount <= pc->size)
++ return 0;
++ oldsize = pc->size;
++ mincount = (mincount + (PAGE_SIZE / LDT_ENTRY_SIZE - 1)) &
++ (~(PAGE_SIZE / LDT_ENTRY_SIZE - 1));
++ if (mincount * LDT_ENTRY_SIZE > PAGE_SIZE)
++ newldt = vmalloc(mincount * LDT_ENTRY_SIZE);
++ else
++ newldt = (void *)__get_free_page(GFP_KERNEL);
++
++ if (!newldt)
++ return -ENOMEM;
++
++ if (oldsize)
++ memcpy(newldt, pc->ldt, oldsize * LDT_ENTRY_SIZE);
++ oldldt = pc->ldt;
++ memset(newldt + oldsize * LDT_ENTRY_SIZE, 0,
++ (mincount - oldsize) * LDT_ENTRY_SIZE);
++
++#ifdef CONFIG_X86_64
++ /* CHECKME: Do we really need this ? */
++ wmb();
++#endif
++ pc->ldt = newldt;
++ wmb();
++ pc->size = mincount;
++ wmb();
++
++ if (reload) {
++#ifdef CONFIG_SMP
++ cpumask_t mask;
++
++ preempt_disable();
++#endif
++ make_pages_readonly(newldt,
++ (mincount * LDT_ENTRY_SIZE) / PAGE_SIZE,
++ XENFEAT_writable_descriptor_tables);
++ load_LDT(pc);
++#ifdef CONFIG_SMP
++ mask = cpumask_of_cpu(smp_processor_id());
++ if (!cpus_equal(current->mm->cpu_vm_mask, mask))
++ smp_call_function(flush_ldt, NULL, 1, 1);
++ preempt_enable();
++#endif
++ }
++ if (oldsize) {
++ make_pages_writable(oldldt,
++ (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE,
++ XENFEAT_writable_descriptor_tables);
++ if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE)
++ vfree(oldldt);
++ else
++ put_page(virt_to_page(oldldt));
++ }
++ return 0;
++}
++
++static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
++{
++ int err = alloc_ldt(new, old->size, 0);
++
++ if (err < 0)
++ return err;
++ memcpy(new->ldt, old->ldt, old->size * LDT_ENTRY_SIZE);
++ make_pages_readonly(new->ldt,
++ (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
++ XENFEAT_writable_descriptor_tables);
++ return 0;
++}
++
++/*
++ * we do not have to muck with descriptors here, that is
++ * done in switch_mm() as needed.
++ */
++int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
++{
++ struct mm_struct *old_mm;
++ int retval = 0;
++
++ memset(&mm->context, 0, sizeof(mm->context));
++ mutex_init(&mm->context.lock);
++ old_mm = current->mm;
++ if (old_mm)
++ mm->context.vdso = old_mm->context.vdso;
++ if (old_mm && old_mm->context.size > 0) {
++ mutex_lock(&old_mm->context.lock);
++ retval = copy_ldt(&mm->context, &old_mm->context);
++ mutex_unlock(&old_mm->context.lock);
++ }
++ return retval;
++}
++
++/*
++ * No need to lock the MM as we are the last user
++ *
++ * 64bit: Don't touch the LDT register - we're already in the next thread.
++ */
++void destroy_context(struct mm_struct *mm)
++{
++ if (mm->context.size) {
++ /* CHECKME: Can this ever happen ? */
++ if (mm == current->active_mm)
++ clear_LDT();
++ make_pages_writable(mm->context.ldt,
++ (mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE,
++ XENFEAT_writable_descriptor_tables);
++ if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE)
++ vfree(mm->context.ldt);
++ else
++ put_page(virt_to_page(mm->context.ldt));
++ mm->context.size = 0;
++ }
++}
++
++static int read_ldt(void __user *ptr, unsigned long bytecount)
++{
++ int err;
++ unsigned long size;
++ struct mm_struct *mm = current->mm;
++
++ if (!mm->context.size)
++ return 0;
++ if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES)
++ bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES;
++
++ mutex_lock(&mm->context.lock);
++ size = mm->context.size * LDT_ENTRY_SIZE;
++ if (size > bytecount)
++ size = bytecount;
++
++ err = 0;
++ if (copy_to_user(ptr, mm->context.ldt, size))
++ err = -EFAULT;
++ mutex_unlock(&mm->context.lock);
++ if (err < 0)
++ goto error_return;
++ if (size != bytecount) {
++ /* zero-fill the rest */
++ if (clear_user(ptr + size, bytecount - size) != 0) {
++ err = -EFAULT;
++ goto error_return;
++ }
++ }
++ return bytecount;
++error_return:
++ return err;
++}
++
++static int read_default_ldt(void __user *ptr, unsigned long bytecount)
++{
++ /* CHECKME: Can we use _one_ random number ? */
++#ifdef CONFIG_X86_32
++ unsigned long size = 5 * sizeof(struct desc_struct);
++#else
++ unsigned long size = 128;
++#endif
++ if (bytecount > size)
++ bytecount = size;
++ if (clear_user(ptr, bytecount))
++ return -EFAULT;
++ return bytecount;
++}
++
++static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
++{
++ struct mm_struct *mm = current->mm;
++ struct desc_struct ldt;
++ int error;
++ struct user_desc ldt_info;
++
++ error = -EINVAL;
++ if (bytecount != sizeof(ldt_info))
++ goto out;
++ error = -EFAULT;
++ if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info)))
++ goto out;
++
++ error = -EINVAL;
++ if (ldt_info.entry_number >= LDT_ENTRIES)
++ goto out;
++ if (ldt_info.contents == 3) {
++ if (oldmode)
++ goto out;
++ if (ldt_info.seg_not_present == 0)
++ goto out;
++ }
++
++ mutex_lock(&mm->context.lock);
++ if (ldt_info.entry_number >= mm->context.size) {
++ error = alloc_ldt(&current->mm->context,
++ ldt_info.entry_number + 1, 1);
++ if (error < 0)
++ goto out_unlock;
++ }
++
++ /* Allow LDTs to be cleared by the user. */
++ if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
++ if (oldmode || LDT_empty(&ldt_info)) {
++ memset(&ldt, 0, sizeof(ldt));
++ goto install;
++ }
++ }
++
++ fill_ldt(&ldt, &ldt_info);
++ if (oldmode)
++ ldt.avl = 0;
++
++ /* Install the new entry ... */
++install:
++ error = write_ldt_entry(mm->context.ldt, ldt_info.entry_number, &ldt);
++
++out_unlock:
++ mutex_unlock(&mm->context.lock);
++out:
++ return error;
++}
++
++asmlinkage int sys_modify_ldt(int func, void __user *ptr,
++ unsigned long bytecount)
++{
++ int ret = -ENOSYS;
++
++ switch (func) {
++ case 0:
++ ret = read_ldt(ptr, bytecount);
++ break;
++ case 1:
++ ret = write_ldt(ptr, bytecount, 1);
++ break;
++ case 2:
++ ret = read_default_ldt(ptr, bytecount);
++ break;
++ case 0x11:
++ ret = write_ldt(ptr, bytecount, 0);
++ break;
++ }
++ return ret;
++}
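+
+A note on the unified ldt-xen.c created above: alloc_ldt(), like the 32-
+and 64-bit variants it replaces (removed below), rounds every request up
+to a multiple of 512 entries. With LDT_ENTRY_SIZE == 8 that is exactly
+PAGE_SIZE, so the table always occupies whole pages, which is what lets
+the Xen port flip protections with make_pages_readonly()/_writable() and
+free single-page tables with put_page() rather than vfree(). A minimal
+standalone sketch of that sizing rule (hypothetical round_ldt_entries()
+helper, not part of the patch):
+
+#include <stdio.h>
+
+#define LDT_ENTRY_SIZE	8
+#define PAGE_SIZE	4096
+
+/* Round an entry count up to the next 512-entry multiple, as the
+ * kernel's alloc_ldt() does with (mincount + 511) & ~511. */
+static unsigned int round_ldt_entries(unsigned int mincount)
+{
+	return (mincount + 511) & ~511;
+}
+
+int main(void)
+{
+	unsigned int n;
+
+	for (n = 1; n <= 1025; n += 512) {
+		unsigned int r = round_ldt_entries(n);
+
+		printf("%4u entries -> %4u allocated (%u page(s))\n",
+		       n, r, r * LDT_ENTRY_SIZE / PAGE_SIZE);
+	}
+	return 0;
+}
+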
+--- sle11-2009-06-29.orig/arch/x86/kernel/ldt_32-xen.c 2009-02-16 16:18:36.000000000 +0100
++++ /dev/null 1970-01-01 00:00:00.000000000 +0000
+@@ -1,265 +0,0 @@
+-/*
+- * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
+- * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
+- */
+-
+-#include <linux/errno.h>
+-#include <linux/sched.h>
+-#include <linux/string.h>
+-#include <linux/mm.h>
+-#include <linux/smp.h>
+-#include <linux/vmalloc.h>
+-#include <linux/slab.h>
+-
+-#include <asm/uaccess.h>
+-#include <asm/system.h>
+-#include <asm/ldt.h>
+-#include <asm/desc.h>
+-#include <asm/mmu_context.h>
+-
+-#ifdef CONFIG_SMP /* avoids "defined but not used" warnig */
+-static void flush_ldt(void *null)
+-{
+- if (current->active_mm)
+- load_LDT(&current->active_mm->context);
+-}
+-#endif
+-
+-static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
+-{
+- void *oldldt;
+- void *newldt;
+- int oldsize;
+-
+- if (mincount <= pc->size)
+- return 0;
+- oldsize = pc->size;
+- mincount = (mincount+511)&(~511);
+- if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE)
+- newldt = vmalloc(mincount*LDT_ENTRY_SIZE);
+- else
+- newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL);
+-
+- if (!newldt)
+- return -ENOMEM;
+-
+- if (oldsize)
+- memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE);
+- oldldt = pc->ldt;
+- memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE);
+- pc->ldt = newldt;
+- wmb();
+- pc->size = mincount;
+- wmb();
+-
+- if (reload) {
+-#ifdef CONFIG_SMP
+- cpumask_t mask;
+- preempt_disable();
+-#endif
+- make_pages_readonly(
+- pc->ldt,
+- (pc->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
+- XENFEAT_writable_descriptor_tables);
+- load_LDT(pc);
+-#ifdef CONFIG_SMP
+- mask = cpumask_of_cpu(smp_processor_id());
+- if (!cpus_equal(current->mm->cpu_vm_mask, mask))
+- smp_call_function(flush_ldt, NULL, 1, 1);
+- preempt_enable();
+-#endif
+- }
+- if (oldsize) {
+- make_pages_writable(
+- oldldt,
+- (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE,
+- XENFEAT_writable_descriptor_tables);
+- if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
+- vfree(oldldt);
+- else
+- kfree(oldldt);
+- }
+- return 0;
+-}
+-
+-static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
+-{
+- int err = alloc_ldt(new, old->size, 0);
+- if (err < 0)
+- return err;
+- memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
+- make_pages_readonly(
+- new->ldt,
+- (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
+- XENFEAT_writable_descriptor_tables);
+- return 0;
+-}
+-
+-/*
+- * we do not have to muck with descriptors here, that is
+- * done in switch_mm() as needed.
+- */
+-int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
+-{
+- struct mm_struct * old_mm;
+- int retval = 0;
+-
+- mutex_init(&mm->context.lock);
+- mm->context.size = 0;
+- mm->context.has_foreign_mappings = 0;
+- old_mm = current->mm;
+- if (old_mm && old_mm->context.size > 0) {
+- mutex_lock(&old_mm->context.lock);
+- retval = copy_ldt(&mm->context, &old_mm->context);
+- mutex_unlock(&old_mm->context.lock);
+- }
+- return retval;
+-}
+-
+-/*
+- * No need to lock the MM as we are the last user
+- */
+-void destroy_context(struct mm_struct *mm)
+-{
+- if (mm->context.size) {
+- if (mm == current->active_mm)
+- clear_LDT();
+- make_pages_writable(
+- mm->context.ldt,
+- (mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE,
+- XENFEAT_writable_descriptor_tables);
+- if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE)
+- vfree(mm->context.ldt);
+- else
+- kfree(mm->context.ldt);
+- mm->context.size = 0;
+- }
+-}
+-
+-static int read_ldt(void __user * ptr, unsigned long bytecount)
+-{
+- int err;
+- unsigned long size;
+- struct mm_struct * mm = current->mm;
+-
+- if (!mm->context.size)
+- return 0;
+- if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES)
+- bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
+-
+- mutex_lock(&mm->context.lock);
+- size = mm->context.size*LDT_ENTRY_SIZE;
+- if (size > bytecount)
+- size = bytecount;
+-
+- err = 0;
+- if (copy_to_user(ptr, mm->context.ldt, size))
+- err = -EFAULT;
+- mutex_unlock(&mm->context.lock);
+- if (err < 0)
+- goto error_return;
+- if (size != bytecount) {
+- /* zero-fill the rest */
+- if (clear_user(ptr+size, bytecount-size) != 0) {
+- err = -EFAULT;
+- goto error_return;
+- }
+- }
+- return bytecount;
+-error_return:
+- return err;
+-}
+-
+-static int read_default_ldt(void __user * ptr, unsigned long bytecount)
+-{
+- int err;
+- unsigned long size;
+-
+- err = 0;
+- size = 5*sizeof(struct desc_struct);
+- if (size > bytecount)
+- size = bytecount;
+-
+- err = size;
+- if (clear_user(ptr, size))
+- err = -EFAULT;
+-
+- return err;
+-}
+-
+-static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
+-{
+- struct mm_struct * mm = current->mm;
+- __u32 entry_1, entry_2;
+- int error;
+- struct user_desc ldt_info;
+-
+- error = -EINVAL;
+- if (bytecount != sizeof(ldt_info))
+- goto out;
+- error = -EFAULT;
+- if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info)))
+- goto out;
+-
+- error = -EINVAL;
+- if (ldt_info.entry_number >= LDT_ENTRIES)
+- goto out;
+- if (ldt_info.contents == 3) {
+- if (oldmode)
+- goto out;
+- if (ldt_info.seg_not_present == 0)
+- goto out;
+- }
+-
+- mutex_lock(&mm->context.lock);
+- if (ldt_info.entry_number >= mm->context.size) {
+- error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1);
+- if (error < 0)
+- goto out_unlock;
+- }
+-
+- /* Allow LDTs to be cleared by the user. */
+- if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
+- if (oldmode || LDT_empty(&ldt_info)) {
+- entry_1 = 0;
+- entry_2 = 0;
+- goto install;
+- }
+- }
+-
+- entry_1 = LDT_entry_a(&ldt_info);
+- entry_2 = LDT_entry_b(&ldt_info);
+- if (oldmode)
+- entry_2 &= ~(1 << 20);
+-
+- /* Install the new entry ... */
+-install:
+- error = write_ldt_entry(mm->context.ldt, ldt_info.entry_number,
+- entry_1, entry_2);
+-
+-out_unlock:
+- mutex_unlock(&mm->context.lock);
+-out:
+- return error;
+-}
+-
+-asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount)
+-{
+- int ret = -ENOSYS;
+-
+- switch (func) {
+- case 0:
+- ret = read_ldt(ptr, bytecount);
+- break;
+- case 1:
+- ret = write_ldt(ptr, bytecount, 1);
+- break;
+- case 2:
+- ret = read_default_ldt(ptr, bytecount);
+- break;
+- case 0x11:
+- ret = write_ldt(ptr, bytecount, 0);
+- break;
+- }
+- return ret;
+-}
+--- sle11-2009-06-29.orig/arch/x86/kernel/ldt_64-xen.c 2009-02-16 16:18:36.000000000 +0100
++++ /dev/null 1970-01-01 00:00:00.000000000 +0000
+@@ -1,271 +0,0 @@
+-/*
+- * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
+- * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
+- * Copyright (C) 2002 Andi Kleen
+- *
+- * This handles calls from both 32bit and 64bit mode.
+- */
+-
+-#include <linux/errno.h>
+-#include <linux/sched.h>
+-#include <linux/string.h>
+-#include <linux/mm.h>
+-#include <linux/smp.h>
+-#include <linux/vmalloc.h>
+-#include <linux/slab.h>
+-
+-#include <asm/uaccess.h>
+-#include <asm/system.h>
+-#include <asm/ldt.h>
+-#include <asm/desc.h>
+-#include <asm/proto.h>
+-#include <asm/pgalloc.h>
+-
+-#ifdef CONFIG_SMP /* avoids "defined but not used" warnig */
+-static void flush_ldt(void *null)
+-{
+- if (current->active_mm)
+- load_LDT(&current->active_mm->context);
+-}
+-#endif
+-
+-static int alloc_ldt(mm_context_t *pc, unsigned mincount, int reload)
+-{
+- void *oldldt;
+- void *newldt;
+- unsigned oldsize;
+-
+- if (mincount <= (unsigned)pc->size)
+- return 0;
+- oldsize = pc->size;
+- mincount = (mincount+511)&(~511);
+- if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE)
+- newldt = vmalloc(mincount*LDT_ENTRY_SIZE);
+- else
+- newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL);
+-
+- if (!newldt)
+- return -ENOMEM;
+-
+- if (oldsize)
+- memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE);
+- oldldt = pc->ldt;
+- memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE);
+- wmb();
+- pc->ldt = newldt;
+- wmb();
+- pc->size = mincount;
+- wmb();
+- if (reload) {
+-#ifdef CONFIG_SMP
+- cpumask_t mask;
+-
+- preempt_disable();
+-#endif
+- make_pages_readonly(
+- pc->ldt,
+- (pc->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
+- XENFEAT_writable_descriptor_tables);
+- load_LDT(pc);
+-#ifdef CONFIG_SMP
+- mask = cpumask_of_cpu(smp_processor_id());
+- if (!cpus_equal(current->mm->cpu_vm_mask, mask))
+- smp_call_function(flush_ldt, NULL, 1, 1);
+- preempt_enable();
+-#endif
+- }
+- if (oldsize) {
+- make_pages_writable(
+- oldldt,
+- (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE,
+- XENFEAT_writable_descriptor_tables);
+- if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
+- vfree(oldldt);
+- else
+- kfree(oldldt);
+- }
+- return 0;
+-}
+-
+-static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
+-{
+- int err = alloc_ldt(new, old->size, 0);
+- if (err < 0)
+- return err;
+- memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
+- make_pages_readonly(
+- new->ldt,
+- (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
+- XENFEAT_writable_descriptor_tables);
+- return 0;
+-}
+-
+-/*
+- * we do not have to muck with descriptors here, that is
+- * done in switch_mm() as needed.
+- */
+-int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
+-{
+- struct mm_struct * old_mm;
+- int retval = 0;
+-
+- memset(&mm->context, 0, sizeof(mm->context));
+- mutex_init(&mm->context.lock);
+- old_mm = current->mm;
+- if (old_mm)
+- mm->context.vdso = old_mm->context.vdso;
+- if (old_mm && old_mm->context.size > 0) {
+- mutex_lock(&old_mm->context.lock);
+- retval = copy_ldt(&mm->context, &old_mm->context);
+- mutex_unlock(&old_mm->context.lock);
+- }
+- return retval;
+-}
+-
+-/*
+- *
+- * Don't touch the LDT register - we're already in the next thread.
+- */
+-void destroy_context(struct mm_struct *mm)
+-{
+- if (mm->context.size) {
+- if (mm == current->active_mm)
+- clear_LDT();
+- make_pages_writable(
+- mm->context.ldt,
+- (mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE,
+- XENFEAT_writable_descriptor_tables);
+- if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE)
+- vfree(mm->context.ldt);
+- else
+- kfree(mm->context.ldt);
+- mm->context.size = 0;
+- }
+-}
+-
+-static int read_ldt(void __user * ptr, unsigned long bytecount)
+-{
+- int err;
+- unsigned long size;
+- struct mm_struct * mm = current->mm;
+-
+- if (!mm->context.size)
+- return 0;
+- if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES)
+- bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
+-
+- mutex_lock(&mm->context.lock);
+- size = mm->context.size*LDT_ENTRY_SIZE;
+- if (size > bytecount)
+- size = bytecount;
+-
+- err = 0;
+- if (copy_to_user(ptr, mm->context.ldt, size))
+- err = -EFAULT;
+- mutex_unlock(&mm->context.lock);
+- if (err < 0)
+- goto error_return;
+- if (size != bytecount) {
+- /* zero-fill the rest */
+- if (clear_user(ptr+size, bytecount-size) != 0) {
+- err = -EFAULT;
+- goto error_return;
+- }
+- }
+- return bytecount;
+-error_return:
+- return err;
+-}
+-
+-static int read_default_ldt(void __user * ptr, unsigned long bytecount)
+-{
+- /* Arbitrary number */
+- /* x86-64 default LDT is all zeros */
+- if (bytecount > 128)
+- bytecount = 128;
+- if (clear_user(ptr, bytecount))
+- return -EFAULT;
+- return bytecount;
+-}
+-
+-static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
+-{
+- struct task_struct *me = current;
+- struct mm_struct * mm = me->mm;
+- __u32 entry_1, entry_2, *lp;
+- unsigned long mach_lp;
+- int error;
+- struct user_desc ldt_info;
+-
+- error = -EINVAL;
+-
+- if (bytecount != sizeof(ldt_info))
+- goto out;
+- error = -EFAULT;
+- if (copy_from_user(&ldt_info, ptr, bytecount))
+- goto out;
+-
+- error = -EINVAL;
+- if (ldt_info.entry_number >= LDT_ENTRIES)
+- goto out;
+- if (ldt_info.contents == 3) {
+- if (oldmode)
+- goto out;
+- if (ldt_info.seg_not_present == 0)
+- goto out;
+- }
+-
+- mutex_lock(&mm->context.lock);
+- if (ldt_info.entry_number >= (unsigned)mm->context.size) {
+- error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1);
+- if (error < 0)
+- goto out_unlock;
+- }
+-
+- lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt);
+- mach_lp = arbitrary_virt_to_machine(lp);
+-
+- /* Allow LDTs to be cleared by the user. */
+- if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
+- if (oldmode || LDT_empty(&ldt_info)) {
+- entry_1 = 0;
+- entry_2 = 0;
+- goto install;
+- }
+- }
+-
+- entry_1 = LDT_entry_a(&ldt_info);
+- entry_2 = LDT_entry_b(&ldt_info);
+- if (oldmode)
+- entry_2 &= ~(1 << 20);
+-
+- /* Install the new entry ... */
+-install:
+- error = HYPERVISOR_update_descriptor(mach_lp, (unsigned long)((entry_1 | (unsigned long) entry_2 << 32)));
+-
+-out_unlock:
+- mutex_unlock(&mm->context.lock);
+-out:
+- return error;
+-}
+-
+-asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount)
+-{
+- int ret = -ENOSYS;
+-
+- switch (func) {
+- case 0:
+- ret = read_ldt(ptr, bytecount);
+- break;
+- case 1:
+- ret = write_ldt(ptr, bytecount, 1);
+- break;
+- case 2:
+- ret = read_default_ldt(ptr, bytecount);
+- break;
+- case 0x11:
+- ret = write_ldt(ptr, bytecount, 0);
+- break;
+- }
+- return ret;
+-}
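+
+For reference, the removed 64-bit write_ldt() above assembled the 8-byte
+descriptor from the two 32-bit words produced by LDT_entry_a() and
+LDT_entry_b() and handed it to HYPERVISOR_update_descriptor() as a single
+64-bit value, low word first; the unified file now goes through fill_ldt()
+and write_ldt_entry() instead. A sketch of the packing (hypothetical
+pack_descriptor() name, not kernel code):
+
+#include <stdint.h>
+
+/* entry_1 is the low dword (LDT_entry_a), entry_2 the high dword
+ * (LDT_entry_b) of an x86 segment descriptor. */
+static uint64_t pack_descriptor(uint32_t entry_1, uint32_t entry_2)
+{
+	return (uint64_t)entry_1 | ((uint64_t)entry_2 << 32);
+}
+
+/* Old-mode callers additionally cleared bit 20 of the high dword (the
+ * AVL bit) before installing: entry_2 &= ~(1 << 20); */
+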
+--- sle11-2009-06-29.orig/arch/x86/kernel/machine_kexec_64.c 2008-11-25 12:35:54.000000000 +0100
++++ sle11-2009-06-29/arch/x86/kernel/machine_kexec_64.c 2009-03-16 16:33:40.000000000 +0100
+@@ -300,7 +300,9 @@ void machine_kexec(struct kimage *image)
+
+ void arch_crash_save_vmcoreinfo(void)
+ {
++#ifndef CONFIG_XEN /* could really be CONFIG_RELOCATABLE */
+ VMCOREINFO_SYMBOL(phys_base);
++#endif
+ VMCOREINFO_SYMBOL(init_level4_pgt);
+
+ #ifdef CONFIG_NUMA
+--- sle11-2009-06-29.orig/arch/x86/kernel/microcode-xen.c 2009-02-16 16:17:21.000000000 +0100
++++ sle11-2009-06-29/arch/x86/kernel/microcode-xen.c 2009-03-16 16:33:40.000000000 +0100
+@@ -167,7 +167,7 @@ static int request_microcode(void)
+ }
+
+ op.cmd = XENPF_microcode_update;
+- set_xen_guest_handle(op.u.microcode.data, (void *)firmware->data);
++ set_xen_guest_handle(op.u.microcode.data, firmware->data);
+ op.u.microcode.length = firmware->size;
+ error = HYPERVISOR_platform_op(&op);
+
+--- sle11-2009-06-29.orig/arch/x86/kernel/mpparse_32-xen.c 2009-02-16 16:18:36.000000000 +0100
++++ sle11-2009-06-29/arch/x86/kernel/mpparse_32-xen.c 2009-03-16 16:33:40.000000000 +0100
+@@ -68,7 +68,7 @@ unsigned int def_to_bigsmp = 0;
+ /* Processor that is doing the boot up */
+ unsigned int boot_cpu_physical_apicid = -1U;
+ /* Internal processor count */
+-unsigned int __cpuinitdata num_processors;
++unsigned int num_processors;
+
+ /* Bitmask of physically existing CPUs */
+ physid_mask_t phys_cpu_present_map;
+@@ -265,7 +265,7 @@ static void __init MP_ioapic_info (struc
+ if (!(m->mpc_flags & MPC_APIC_USABLE))
+ return;
+
+- printk(KERN_INFO "I/O APIC #%d Version %d at 0x%lX.\n",
++ printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n",
+ m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
+ if (nr_ioapics >= MAX_IO_APICS) {
+ printk(KERN_CRIT "Max # of I/O APICs (%d) exceeded (found %d).\n",
+@@ -412,9 +412,9 @@ static int __init smp_read_mpc(struct mp
+
+ mps_oem_check(mpc, oem, str);
+
+- printk("APIC at: 0x%lX\n",mpc->mpc_lapic);
++ printk("APIC at: 0x%X\n", mpc->mpc_lapic);
+
+- /*
++ /*
+ * Save the local APIC address (it might be non-default) -- but only
+ * if we're not using ACPI.
+ */
+@@ -728,7 +728,7 @@ static int __init smp_scan_config (unsig
+ unsigned long *bp = isa_bus_to_virt(base);
+ struct intel_mp_floating *mpf;
+
+- Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length);
++ printk(KERN_INFO "Scan SMP from %p for %ld bytes.\n", bp,length);
+ if (sizeof(*mpf) != 16)
+ printk("Error: MPF size\n");
+
+@@ -742,9 +742,10 @@ static int __init smp_scan_config (unsig
+
+ smp_found_config = 1;
+ #ifndef CONFIG_XEN
+- printk(KERN_INFO "found SMP MP-table at %08lx\n",
+- virt_to_phys(mpf));
+- reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE);
++ printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
++ mpf, virt_to_phys(mpf));
++ reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE,
++ BOOTMEM_DEFAULT);
+ if (mpf->mpf_physptr) {
+ /*
+ * We cannot access to MPC table to compute
+@@ -759,11 +760,12 @@ static int __init smp_scan_config (unsig
+ unsigned long end = max_low_pfn * PAGE_SIZE;
+ if (mpf->mpf_physptr + size > end)
+ size = end - mpf->mpf_physptr;
+- reserve_bootmem(mpf->mpf_physptr, size);
++ reserve_bootmem(mpf->mpf_physptr, size,
++ BOOTMEM_DEFAULT);
+ }
+ #else
+- printk(KERN_INFO "found SMP MP-table at %08lx\n",
+- ((unsigned long)bp - (unsigned long)isa_bus_to_virt(base)) + base);
++ printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
++ mpf, ((void *)bp - isa_bus_to_virt(base)) + base);
+ #endif
+
+ mpf_found = mpf;
+@@ -940,14 +942,14 @@ void __init mp_register_ioapic(u8 id, u3
+ */
+ mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
+ mp_ioapic_routing[idx].gsi_base = gsi_base;
+- mp_ioapic_routing[idx].gsi_end = gsi_base +
++ mp_ioapic_routing[idx].gsi_end = gsi_base +
+ io_apic_get_redir_entries(idx);
+
+- printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%lx, "
+- "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
+- mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
+- mp_ioapic_routing[idx].gsi_base,
+- mp_ioapic_routing[idx].gsi_end);
++ printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
++ "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
++ mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
++ mp_ioapic_routing[idx].gsi_base,
++ mp_ioapic_routing[idx].gsi_end);
+ }
+
+ void __init
+@@ -1063,15 +1065,16 @@ void __init mp_config_acpi_legacy_irqs (
+ }
+
+ #define MAX_GSI_NUM 4096
++#define IRQ_COMPRESSION_START 64
+
+ int mp_register_gsi(u32 gsi, int triggering, int polarity)
+ {
+ int ioapic = -1;
+ int ioapic_pin = 0;
+ int idx, bit = 0;
+- static int pci_irq = 16;
++ static int pci_irq = IRQ_COMPRESSION_START;
+ /*
+- * Mapping between Global System Interrups, which
++ * Mapping between Global System Interrupts, which
+ * represent all possible interrupts, and IRQs
+ * assigned to actual devices.
+ */
+@@ -1108,12 +1111,16 @@ int mp_register_gsi(u32 gsi, int trigger
+ if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
+ Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
+ mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
+- return gsi_to_irq[gsi];
++ return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
+ }
+
+ mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
+
+- if (triggering == ACPI_LEVEL_SENSITIVE) {
++ /*
++ * For GSI >= 64, use IRQ compression
++ */
++ if ((gsi >= IRQ_COMPRESSION_START)
++ && (triggering == ACPI_LEVEL_SENSITIVE)) {
+ /*
+ * For PCI devices assign IRQs in order, avoiding gaps
+ * due to unused I/O APIC pins.
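+
+The mp_register_gsi() change above narrows IRQ compression to GSIs at or
+above IRQ_COMPRESSION_START (64): GSIs below 64 keep their identity
+mapping (see the changed early return), and only level-triggered GSIs
+from 64 upward are remapped onto the next free IRQ starting at 64. A toy
+userspace model of the numbering rule (hypothetical register_gsi(), much
+simplified from the kernel code):
+
+#include <stdio.h>
+
+#define IRQ_COMPRESSION_START	64
+#define MAX_GSI_NUM		4096
+
+static int gsi_to_irq[MAX_GSI_NUM];
+static int next_pci_irq = IRQ_COMPRESSION_START;
+
+/* Low GSIs map 1:1; level-triggered GSIs >= 64 are compressed onto
+ * sequential IRQs so unused I/O APIC pins leave no gaps. */
+static int register_gsi(int gsi, int level_triggered)
+{
+	if (gsi >= IRQ_COMPRESSION_START && level_triggered)
+		gsi_to_irq[gsi] = next_pci_irq++;
+	else
+		gsi_to_irq[gsi] = gsi;
+	return gsi_to_irq[gsi];
+}
+
+int main(void)
+{
+	printf("GSI 9  -> IRQ %d\n", register_gsi(9, 1));
+	printf("GSI 70 -> IRQ %d\n", register_gsi(70, 1));
+	printf("GSI 90 -> IRQ %d\n", register_gsi(90, 1));
+	return 0;
+}
+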
+--- sle11-2009-06-29.orig/arch/x86/kernel/mpparse_64-xen.c 2009-02-16 16:18:36.000000000 +0100
++++ sle11-2009-06-29/arch/x86/kernel/mpparse_64-xen.c 2009-03-16 16:33:40.000000000 +0100
+@@ -60,14 +60,20 @@ unsigned int boot_cpu_id = -1U;
+ EXPORT_SYMBOL(boot_cpu_id);
+
+ /* Internal processor count */
+-unsigned int num_processors __cpuinitdata = 0;
++unsigned int num_processors;
+
+ unsigned disabled_cpus __cpuinitdata;
+
+ /* Bitmask of physically existing CPUs */
+ physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE;
+
+-u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
++#ifndef CONFIG_XEN
++u16 x86_bios_cpu_apicid_init[NR_CPUS] __initdata
++ = { [0 ... NR_CPUS-1] = BAD_APICID };
++void *x86_bios_cpu_apicid_early_ptr;
++#endif
++DEFINE_PER_CPU(u16, x86_bios_cpu_apicid) = BAD_APICID;
++EXPORT_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
+
+
+ /*
+@@ -119,24 +125,22 @@ static void __cpuinit MP_processor_info(
+ physid_set(m->mpc_apicid, phys_cpu_present_map);
+ if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
+ /*
+- * bios_cpu_apicid is required to have processors listed
++ * x86_bios_cpu_apicid is required to have processors listed
+ * in same order as logical cpu numbers. Hence the first
+ * entry is BSP, and so on.
+ */
+ cpu = 0;
+ }
+- bios_cpu_apicid[cpu] = m->mpc_apicid;
+- /*
+- * We get called early in the the start_kernel initialization
+- * process when the per_cpu data area is not yet setup, so we
+- * use a static array that is removed after the per_cpu data
+- * area is created.
+- */
+- if (x86_cpu_to_apicid_ptr) {
+- u8 *x86_cpu_to_apicid = (u8 *)x86_cpu_to_apicid_ptr;
+- x86_cpu_to_apicid[cpu] = m->mpc_apicid;
++ /* are we being called early in kernel startup? */
++ if (x86_cpu_to_apicid_early_ptr) {
++ u16 *cpu_to_apicid = x86_cpu_to_apicid_early_ptr;
++ u16 *bios_cpu_apicid = x86_bios_cpu_apicid_early_ptr;
++
++ cpu_to_apicid[cpu] = m->mpc_apicid;
++ bios_cpu_apicid[cpu] = m->mpc_apicid;
+ } else {
+ per_cpu(x86_cpu_to_apicid, cpu) = m->mpc_apicid;
++ per_cpu(x86_bios_cpu_apicid, cpu) = m->mpc_apicid;
+ }
+
+ cpu_set(cpu, cpu_possible_map);
+--- sle11-2009-06-29.orig/arch/x86/kernel/pci-dma-xen.c 2009-02-16 16:18:36.000000000 +0100
++++ sle11-2009-06-29/arch/x86/kernel/pci-dma-xen.c 2009-03-16 16:33:40.000000000 +0100
+@@ -434,3 +434,23 @@ dma_sync_single_for_device(struct device
+ swiotlb_sync_single_for_device(dev, dma_handle, size, direction);
+ }
+ EXPORT_SYMBOL(dma_sync_single_for_device);
++
++void
++dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems,
++ enum dma_data_direction direction)
++{
++ if (swiotlb)
++ swiotlb_sync_sg_for_cpu(dev,sg,nelems,direction);
++ flush_write_buffers();
++}
++EXPORT_SYMBOL(dma_sync_sg_for_cpu);
++
++void
++dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems,
++ enum dma_data_direction direction)
++{
++ if (swiotlb)
++ swiotlb_sync_sg_for_device(dev,sg,nelems,direction);
++ flush_write_buffers();
++}
++EXPORT_SYMBOL(dma_sync_sg_for_device);
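+
+The two exports added above complete the streaming-DMA scatterlist API on
+Xen, mirroring the single-buffer helpers earlier in this file: defer to
+swiotlb when it is active, then flush CPU write buffers. The expected
+caller pattern, as a hypothetical driver fragment (dev, sg and nelems
+assumed to come from an earlier dma_map_sg() call):
+
+	/* Device has written into the buffers; make the data visible
+	 * to the CPU before reading it. */
+	dma_sync_sg_for_cpu(dev, sg, nelems, DMA_FROM_DEVICE);
+
+	/* ... CPU inspects the received data ... */
+
+	/* Hand ownership back to the device before it writes again. */
+	dma_sync_sg_for_device(dev, sg, nelems, DMA_FROM_DEVICE);
+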
+--- sle11-2009-06-29.orig/arch/x86/kernel/process_32-xen.c 2009-02-16 16:18:36.000000000 +0100
++++ sle11-2009-06-29/arch/x86/kernel/process_32-xen.c 2009-03-16 16:33:40.000000000 +0100
+@@ -23,7 +23,6 @@
+ #include <linux/slab.h>
+ #include <linux/vmalloc.h>
+ #include <linux/user.h>
+-#include <linux/a.out.h>
+ #include <linux/interrupt.h>
+ #include <linux/utsname.h>
+ #include <linux/delay.h>
+@@ -59,8 +58,10 @@
+
+ #include <asm/tlbflush.h>
+ #include <asm/cpu.h>
++#include <asm/kdebug.h>
+
+ asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
++asmlinkage void cstar_ret_from_fork(void) __asm__("cstar_ret_from_fork");
+
+ static int hlt_counter;
+
+@@ -78,7 +79,7 @@ EXPORT_PER_CPU_SYMBOL(cpu_number);
+ */
+ unsigned long thread_saved_pc(struct task_struct *tsk)
+ {
+- return ((unsigned long *)tsk->thread.esp)[3];
++ return ((unsigned long *)tsk->thread.sp)[3];
+ }
+
+ /*
+@@ -86,7 +87,6 @@ unsigned long thread_saved_pc(struct tas
+ */
+ void (*pm_idle)(void);
+ EXPORT_SYMBOL(pm_idle);
+-static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
+
+ void disable_hlt(void)
+ {
+@@ -107,7 +107,7 @@ EXPORT_SYMBOL(enable_hlt);
+ * to poll the ->work.need_resched flag instead of waiting for the
+ * cross-CPU IPI to arrive. Use this option with caution.
+ */
+-static void poll_idle (void)
++static void poll_idle(void)
+ {
+ cpu_relax();
+ }
+@@ -122,10 +122,19 @@ static void xen_idle(void)
+ smp_mb();
+
+ local_irq_disable();
+- if (!need_resched())
++ if (!need_resched()) {
++ ktime_t t0, t1;
++ u64 t0n, t1n;
++
++ t0 = ktime_get();
++ t0n = ktime_to_ns(t0);
+ safe_halt(); /* enables interrupts racelessly */
+- else
+- local_irq_enable();
++ local_irq_disable();
++ t1 = ktime_get();
++ t1n = ktime_to_ns(t1);
++ sched_clock_idle_wakeup_event(t1n - t0n);
++ }
++ local_irq_enable();
+ current_thread_info()->status |= TS_POLLING;
+ }
+ #ifdef CONFIG_APM_MODULE
+@@ -168,13 +177,13 @@ void cpu_idle(void)
+ while (!need_resched()) {
+ void (*idle)(void);
+
+- if (__get_cpu_var(cpu_idle_state))
+- __get_cpu_var(cpu_idle_state) = 0;
+-
+ check_pgt_cache();
+ rmb();
+ idle = xen_idle; /* no alternatives */
+
++ if (rcu_pending(cpu))
++ rcu_check_callbacks(cpu, 0);
++
+ if (cpu_is_offline(cpu))
+ play_dead();
+
+@@ -192,40 +201,19 @@ static void do_nothing(void *unused)
+ {
+ }
+
++/*
++ * cpu_idle_wait - Used to ensure that all the CPUs discard old value of
++ * pm_idle and update to new pm_idle value. Required while changing pm_idle
++ * handler on SMP systems.
++ *
++ * Caller must have changed pm_idle to the new value before the call. Old
++ * pm_idle value will not be used by any CPU after the return of this function.
++ */
+ void cpu_idle_wait(void)
+ {
+- unsigned int cpu, this_cpu = get_cpu();
+- cpumask_t map, tmp = current->cpus_allowed;
+-
+- set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
+- put_cpu();
+-
+- cpus_clear(map);
+- for_each_online_cpu(cpu) {
+- per_cpu(cpu_idle_state, cpu) = 1;
+- cpu_set(cpu, map);
+- }
+-
+- __get_cpu_var(cpu_idle_state) = 0;
+-
+- wmb();
+- do {
+- ssleep(1);
+- for_each_online_cpu(cpu) {
+- if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu))
+- cpu_clear(cpu, map);
+- }
+- cpus_and(map, map, cpu_online_map);
+- /*
+- * We waited 1 sec, if a CPU still did not call idle
+- * it may be because it is in idle and not waking up
+- * because it has nothing to do.
+- * Give all the remaining CPUS a kick.
+- */
+- smp_call_function_mask(map, do_nothing, 0, 0);
+- } while (!cpus_empty(map));
+-
+- set_cpus_allowed(current, tmp);
++ smp_mb();
++ /* kick all the CPUs so that they exit out of pm_idle */
++ smp_call_function(do_nothing, NULL, 0, 1);
+ }
+ EXPORT_SYMBOL_GPL(cpu_idle_wait);
+
+@@ -251,15 +239,15 @@ void __show_registers(struct pt_regs *re
+ {
+ unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
+ unsigned long d0, d1, d2, d3, d6, d7;
+- unsigned long esp;
++ unsigned long sp;
+ unsigned short ss, gs;
+
+ if (user_mode_vm(regs)) {
+- esp = regs->esp;
+- ss = regs->xss & 0xffff;
++ sp = regs->sp;
++ ss = regs->ss & 0xffff;
+ savesegment(gs, gs);
+ } else {
+- esp = (unsigned long) (&regs->esp);
++ sp = (unsigned long) (&regs->sp);
+ savesegment(ss, ss);
+ savesegment(gs, gs);
+ }
+@@ -272,17 +260,17 @@ void __show_registers(struct pt_regs *re
+ init_utsname()->version);
+
+ printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n",
+- 0xffff & regs->xcs, regs->eip, regs->eflags,
++ 0xffff & regs->cs, regs->ip, regs->flags,
+ smp_processor_id());
+- print_symbol("EIP is at %s\n", regs->eip);
++ print_symbol("EIP is at %s\n", regs->ip);
+
+ printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
+- regs->eax, regs->ebx, regs->ecx, regs->edx);
++ regs->ax, regs->bx, regs->cx, regs->dx);
+ printk("ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n",
+- regs->esi, regs->edi, regs->ebp, esp);
++ regs->si, regs->di, regs->bp, sp);
+ printk(" DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n",
+- regs->xds & 0xffff, regs->xes & 0xffff,
+- regs->xfs & 0xffff, gs, ss);
++ regs->ds & 0xffff, regs->es & 0xffff,
++ regs->fs & 0xffff, gs, ss);
+
+ if (!all)
+ return;
+@@ -310,12 +298,12 @@ void __show_registers(struct pt_regs *re
+ void show_regs(struct pt_regs *regs)
+ {
+ __show_registers(regs, 1);
+- show_trace(NULL, regs, &regs->esp);
++ show_trace(NULL, regs, &regs->sp, regs->bp);
+ }
+
+ /*
+- * This gets run with %ebx containing the
+- * function to call, and %edx containing
++ * This gets run with %bx containing the
++ * function to call, and %dx containing
+ * the "args".
+ */
+ extern void kernel_thread_helper(void);
+@@ -329,16 +317,16 @@ int kernel_thread(int (*fn)(void *), voi
+
+ memset(&regs, 0, sizeof(regs));
+
+- regs.ebx = (unsigned long) fn;
+- regs.edx = (unsigned long) arg;
++ regs.bx = (unsigned long) fn;
++ regs.dx = (unsigned long) arg;
+
+- regs.xds = __USER_DS;
+- regs.xes = __USER_DS;
+- regs.xfs = __KERNEL_PERCPU;
+- regs.orig_eax = -1;
+- regs.eip = (unsigned long) kernel_thread_helper;
+- regs.xcs = __KERNEL_CS | get_kernel_rpl();
+- regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
++ regs.ds = __USER_DS;
++ regs.es = __USER_DS;
++ regs.fs = __KERNEL_PERCPU;
++ regs.orig_ax = -1;
++ regs.ip = (unsigned long) kernel_thread_helper;
++ regs.cs = __KERNEL_CS | get_kernel_rpl();
++ regs.flags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
+
+ /* Ok, create the new process.. */
+ return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
+@@ -368,7 +356,12 @@ void flush_thread(void)
+ {
+ struct task_struct *tsk = current;
+
+- memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8);
++ tsk->thread.debugreg0 = 0;
++ tsk->thread.debugreg1 = 0;
++ tsk->thread.debugreg2 = 0;
++ tsk->thread.debugreg3 = 0;
++ tsk->thread.debugreg6 = 0;
++ tsk->thread.debugreg7 = 0;
+ memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
+ clear_tsk_thread_flag(tsk, TIF_DEBUG);
+ /*
+@@ -393,7 +386,7 @@ void prepare_to_copy(struct task_struct
+ unlazy_fpu(tsk);
+ }
+
+-int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
++int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
+ unsigned long unused,
+ struct task_struct * p, struct pt_regs * regs)
+ {
+@@ -403,17 +396,19 @@ int copy_thread(int nr, unsigned long cl
+
+ childregs = task_pt_regs(p);
+ *childregs = *regs;
+- childregs->eax = 0;
+- childregs->esp = esp;
++ childregs->ax = 0;
++ childregs->sp = sp;
+
+- p->thread.esp = (unsigned long) childregs;
+- p->thread.esp0 = (unsigned long) (childregs+1);
++ p->thread.sp = (unsigned long) childregs;
++ p->thread.sp0 = (unsigned long) (childregs+1);
+
+- p->thread.eip = (unsigned long) ret_from_fork;
++ p->thread.ip = (unsigned long) ret_from_fork;
+
+- savesegment(gs,p->thread.gs);
++ savesegment(gs, p->thread.gs);
+
+ tsk = current;
++ if (test_tsk_thread_flag(tsk, TIF_CSTAR))
++ p->thread.ip = (unsigned long) cstar_ret_from_fork;
+ if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
+ p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
+ IO_BITMAP_BYTES, GFP_KERNEL);
+@@ -424,34 +419,17 @@ int copy_thread(int nr, unsigned long cl
+ set_tsk_thread_flag(p, TIF_IO_BITMAP);
+ }
+
++ err = 0;
++
+ /*
+ * Set a new TLS for the child thread?
+ */
+- if (clone_flags & CLONE_SETTLS) {
+- struct desc_struct *desc;
+- struct user_desc info;
+- int idx;
+-
+- err = -EFAULT;
+- if (copy_from_user(&info, (void __user *)childregs->esi, sizeof(info)))
+- goto out;
+- err = -EINVAL;
+- if (LDT_empty(&info))
+- goto out;
+-
+- idx = info.entry_number;
+- if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
+- goto out;
+-
+- desc = p->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
+- desc->a = LDT_entry_a(&info);
+- desc->b = LDT_entry_b(&info);
+- }
++ if (clone_flags & CLONE_SETTLS)
++ err = do_set_thread_area(p, -1,
++ (struct user_desc __user *)childregs->si, 0);
+
+ p->thread.iopl = current->thread.iopl;
+
+- err = 0;
+- out:
+ if (err && p->thread.io_bitmap_ptr) {
+ kfree(p->thread.io_bitmap_ptr);
+ p->thread.io_bitmap_max = 0;
+@@ -459,67 +437,8 @@ int copy_thread(int nr, unsigned long cl
+ return err;
+ }
+
+-/*
+- * fill in the user structure for a core dump..
+- */
+-void dump_thread(struct pt_regs * regs, struct user * dump)
+-{
+- int i;
+-
+-/* changed the size calculations - should hopefully work better. lbt */
+- dump->magic = CMAGIC;
+- dump->start_code = 0;
+- dump->start_stack = regs->esp & ~(PAGE_SIZE - 1);
+- dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT;
+- dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT;
+- dump->u_dsize -= dump->u_tsize;
+- dump->u_ssize = 0;
+- for (i = 0; i < 8; i++)
+- dump->u_debugreg[i] = current->thread.debugreg[i];
+-
+- if (dump->start_stack < TASK_SIZE)
+- dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT;
+-
+- dump->regs.ebx = regs->ebx;
+- dump->regs.ecx = regs->ecx;
+- dump->regs.edx = regs->edx;
+- dump->regs.esi = regs->esi;
+- dump->regs.edi = regs->edi;
+- dump->regs.ebp = regs->ebp;
+- dump->regs.eax = regs->eax;
+- dump->regs.ds = regs->xds;
+- dump->regs.es = regs->xes;
+- dump->regs.fs = regs->xfs;
+- savesegment(gs,dump->regs.gs);
+- dump->regs.orig_eax = regs->orig_eax;
+- dump->regs.eip = regs->eip;
+- dump->regs.cs = regs->xcs;
+- dump->regs.eflags = regs->eflags;
+- dump->regs.esp = regs->esp;
+- dump->regs.ss = regs->xss;
+-
+- dump->u_fpvalid = dump_fpu (regs, &dump->i387);
+-}
+-EXPORT_SYMBOL(dump_thread);
+-
+-/*
+- * Capture the user space registers if the task is not running (in user space)
+- */
+-int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
+-{
+- struct pt_regs ptregs = *task_pt_regs(tsk);
+- ptregs.xcs &= 0xffff;
+- ptregs.xds &= 0xffff;
+- ptregs.xes &= 0xffff;
+- ptregs.xss &= 0xffff;
+-
+- elf_core_copy_regs(regs, &ptregs);
+-
+- return 1;
+-}
+-
+ #ifdef CONFIG_SECCOMP
+-void hard_disable_TSC(void)
++static void hard_disable_TSC(void)
+ {
+ write_cr4(read_cr4() | X86_CR4_TSD);
+ }
+@@ -534,7 +453,7 @@ void disable_TSC(void)
+ hard_disable_TSC();
+ preempt_enable();
+ }
+-void hard_enable_TSC(void)
++static void hard_enable_TSC(void)
+ {
+ write_cr4(read_cr4() & ~X86_CR4_TSD);
+ }
+@@ -543,18 +462,32 @@ void hard_enable_TSC(void)
+ static noinline void
+ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p)
+ {
+- struct thread_struct *next;
++ struct thread_struct *prev, *next;
++ unsigned long debugctl;
+
++ prev = &prev_p->thread;
+ next = &next_p->thread;
+
++ debugctl = prev->debugctlmsr;
++ if (next->ds_area_msr != prev->ds_area_msr) {
++ /* we clear debugctl to make sure DS
++ * is not in use when we change it */
++ debugctl = 0;
++ wrmsrl(MSR_IA32_DEBUGCTLMSR, 0);
++ wrmsr(MSR_IA32_DS_AREA, next->ds_area_msr, 0);
++ }
++
++ if (next->debugctlmsr != debugctl)
++ wrmsr(MSR_IA32_DEBUGCTLMSR, next->debugctlmsr, 0);
++
+ if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
+- set_debugreg(next->debugreg[0], 0);
+- set_debugreg(next->debugreg[1], 1);
+- set_debugreg(next->debugreg[2], 2);
+- set_debugreg(next->debugreg[3], 3);
++ set_debugreg(next->debugreg0, 0);
++ set_debugreg(next->debugreg1, 1);
++ set_debugreg(next->debugreg2, 2);
++ set_debugreg(next->debugreg3, 3);
+ /* no 4 and 5 */
+- set_debugreg(next->debugreg[6], 6);
+- set_debugreg(next->debugreg[7], 7);
++ set_debugreg(next->debugreg6, 6);
++ set_debugreg(next->debugreg7, 7);
+ }
+
+ #ifdef CONFIG_SECCOMP
+@@ -567,6 +500,14 @@ __switch_to_xtra(struct task_struct *pre
+ hard_enable_TSC();
+ }
+ #endif
++
++#ifdef X86_BTS
++ if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
++ ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
++
++ if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
++ ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
++#endif
+ }
+
+ /*
+@@ -592,11 +533,11 @@ __switch_to_xtra(struct task_struct *pre
+ * More important, however, is the fact that this allows us much
+ * more flexibility.
+ *
+- * The return value (in %eax) will be the "prev" task after
++ * The return value (in %ax) will be the "prev" task after
+ * the task-switch, and shows up in ret_from_fork in entry.S,
+ * for example.
+ */
+-struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
++struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
+ {
+ struct thread_struct *prev = &prev_p->thread,
+ *next = &next_p->thread;
+@@ -632,12 +573,12 @@ struct task_struct fastcall * __switch_t
+ #endif
+
+ /*
+- * Reload esp0.
+- * This is load_esp0(tss, next) with a multicall.
++ * Reload sp0.
++ * This is load_sp0(tss, next) with a multicall.
+ */
+ mcl->op = __HYPERVISOR_stack_switch;
+ mcl->args[0] = __KERNEL_DS;
+- mcl->args[1] = next->esp0;
++ mcl->args[1] = next->sp0;
+ mcl++;
+
+ /*
+@@ -734,7 +675,7 @@ struct task_struct fastcall * __switch_t
+
+ asmlinkage int sys_fork(struct pt_regs regs)
+ {
+- return do_fork(SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
++ return do_fork(SIGCHLD, regs.sp, &regs, 0, NULL, NULL);
+ }
+
+ asmlinkage int sys_clone(struct pt_regs regs)
+@@ -743,12 +684,12 @@ asmlinkage int sys_clone(struct pt_regs
+ unsigned long newsp;
+ int __user *parent_tidptr, *child_tidptr;
+
+- clone_flags = regs.ebx;
+- newsp = regs.ecx;
+- parent_tidptr = (int __user *)regs.edx;
+- child_tidptr = (int __user *)regs.edi;
++ clone_flags = regs.bx;
++ newsp = regs.cx;
++ parent_tidptr = (int __user *)regs.dx;
++ child_tidptr = (int __user *)regs.di;
+ if (!newsp)
+- newsp = regs.esp;
++ newsp = regs.sp;
+ return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr);
+ }
+
+@@ -764,7 +705,7 @@ asmlinkage int sys_clone(struct pt_regs
+ */
+ asmlinkage int sys_vfork(struct pt_regs regs)
+ {
+- return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
++ return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.sp, &regs, 0, NULL, NULL);
+ }
+
+ /*
+@@ -775,18 +716,15 @@ asmlinkage int sys_execve(struct pt_regs
+ int error;
+ char * filename;
+
+- filename = getname((char __user *) regs.ebx);
++ filename = getname((char __user *) regs.bx);
+ error = PTR_ERR(filename);
+ if (IS_ERR(filename))
+ goto out;
+ error = do_execve(filename,
+- (char __user * __user *) regs.ecx,
+- (char __user * __user *) regs.edx,
++ (char __user * __user *) regs.cx,
++ (char __user * __user *) regs.dx,
+ &regs);
+ if (error == 0) {
+- task_lock(current);
+- current->ptrace &= ~PT_DTRACE;
+- task_unlock(current);
+ /* Make sure we don't return using sysenter.. */
+ set_thread_flag(TIF_IRET);
+ }
+@@ -800,145 +738,37 @@ out:
+
+ unsigned long get_wchan(struct task_struct *p)
+ {
+- unsigned long ebp, esp, eip;
++ unsigned long bp, sp, ip;
+ unsigned long stack_page;
+ int count = 0;
+ if (!p || p == current || p->state == TASK_RUNNING)
+ return 0;
+ stack_page = (unsigned long)task_stack_page(p);
+- esp = p->thread.esp;
+- if (!stack_page || esp < stack_page || esp > top_esp+stack_page)
++ sp = p->thread.sp;
++ if (!stack_page || sp < stack_page || sp > top_esp+stack_page)
+ return 0;
+- /* include/asm-i386/system.h:switch_to() pushes ebp last. */
+- ebp = *(unsigned long *) esp;
++ /* include/asm-i386/system.h:switch_to() pushes bp last. */
++ bp = *(unsigned long *) sp;
+ do {
+- if (ebp < stack_page || ebp > top_ebp+stack_page)
++ if (bp < stack_page || bp > top_ebp+stack_page)
+ return 0;
+- eip = *(unsigned long *) (ebp+4);
+- if (!in_sched_functions(eip))
+- return eip;
+- ebp = *(unsigned long *) ebp;
++ ip = *(unsigned long *) (bp+4);
++ if (!in_sched_functions(ip))
++ return ip;
++ bp = *(unsigned long *) bp;
+ } while (count++ < 16);
+ return 0;
+ }
+
+-/*
+- * sys_alloc_thread_area: get a yet unused TLS descriptor index.
+- */
+-static int get_free_idx(void)
+-{
+- struct thread_struct *t = &current->thread;
+- int idx;
+-
+- for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++)
+- if (desc_empty(t->tls_array + idx))
+- return idx + GDT_ENTRY_TLS_MIN;
+- return -ESRCH;
+-}
+-
+-/*
+- * Set a given TLS descriptor:
+- */
+-asmlinkage int sys_set_thread_area(struct user_desc __user *u_info)
+-{
+- struct thread_struct *t = &current->thread;
+- struct user_desc info;
+- struct desc_struct *desc;
+- int cpu, idx;
+-
+- if (copy_from_user(&info, u_info, sizeof(info)))
+- return -EFAULT;
+- idx = info.entry_number;
+-
+- /*
+- * index -1 means the kernel should try to find and
+- * allocate an empty descriptor:
+- */
+- if (idx == -1) {
+- idx = get_free_idx();
+- if (idx < 0)
+- return idx;
+- if (put_user(idx, &u_info->entry_number))
+- return -EFAULT;
+- }
+-
+- if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
+- return -EINVAL;
+-
+- desc = t->tls_array + idx - GDT_ENTRY_TLS_MIN;
+-
+- /*
+- * We must not get preempted while modifying the TLS.
+- */
+- cpu = get_cpu();
+-
+- if (LDT_empty(&info)) {
+- desc->a = 0;
+- desc->b = 0;
+- } else {
+- desc->a = LDT_entry_a(&info);
+- desc->b = LDT_entry_b(&info);
+- }
+- load_TLS(t, cpu);
+-
+- put_cpu();
+-
+- return 0;
+-}
+-
+-/*
+- * Get the current Thread-Local Storage area:
+- */
+-
+-#define GET_BASE(desc) ( \
+- (((desc)->a >> 16) & 0x0000ffff) | \
+- (((desc)->b << 16) & 0x00ff0000) | \
+- ( (desc)->b & 0xff000000) )
+-
+-#define GET_LIMIT(desc) ( \
+- ((desc)->a & 0x0ffff) | \
+- ((desc)->b & 0xf0000) )
+-
+-#define GET_32BIT(desc) (((desc)->b >> 22) & 1)
+-#define GET_CONTENTS(desc) (((desc)->b >> 10) & 3)
+-#define GET_WRITABLE(desc) (((desc)->b >> 9) & 1)
+-#define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1)
+-#define GET_PRESENT(desc) (((desc)->b >> 15) & 1)
+-#define GET_USEABLE(desc) (((desc)->b >> 20) & 1)
+-
+-asmlinkage int sys_get_thread_area(struct user_desc __user *u_info)
+-{
+- struct user_desc info;
+- struct desc_struct *desc;
+- int idx;
+-
+- if (get_user(idx, &u_info->entry_number))
+- return -EFAULT;
+- if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
+- return -EINVAL;
+-
+- memset(&info, 0, sizeof(info));
+-
+- desc = current->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
+-
+- info.entry_number = idx;
+- info.base_addr = GET_BASE(desc);
+- info.limit = GET_LIMIT(desc);
+- info.seg_32bit = GET_32BIT(desc);
+- info.contents = GET_CONTENTS(desc);
+- info.read_exec_only = !GET_WRITABLE(desc);
+- info.limit_in_pages = GET_LIMIT_PAGES(desc);
+- info.seg_not_present = !GET_PRESENT(desc);
+- info.useable = GET_USEABLE(desc);
+-
+- if (copy_to_user(u_info, &info, sizeof(info)))
+- return -EFAULT;
+- return 0;
+-}
+-
+ unsigned long arch_align_stack(unsigned long sp)
+ {
+ if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
+ sp -= get_random_int() % 8192;
+ return sp & ~0xf;
+ }
++
++unsigned long arch_randomize_brk(struct mm_struct *mm)
++{
++ unsigned long range_end = mm->brk + 0x02000000;
++ return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
++}
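+
+Both idle-loop rewrites in this patch (xen_idle() in process_32 above and
+in process_64 below) take the same shape: the no-resched path brackets
+safe_halt() with ktime_get() and feeds the interval actually slept to the
+scheduler via sched_clock_idle_wakeup_event(), replacing the old branch
+that merely re-enabled interrupts. Condensed sketch of the pattern
+(kernel context assumed, not standalone):
+
+	local_irq_disable();
+	if (!need_resched()) {
+		ktime_t t0 = ktime_get();
+
+		safe_halt();		/* enables interrupts racelessly */
+		local_irq_disable();	/* back from halt; mask for timing */
+		sched_clock_idle_wakeup_event(ktime_to_ns(ktime_get()) -
+					      ktime_to_ns(t0));
+	}
+	local_irq_enable();
+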
+--- sle11-2009-06-29.orig/arch/x86/kernel/process_64-xen.c 2009-02-16 16:18:36.000000000 +0100
++++ sle11-2009-06-29/arch/x86/kernel/process_64-xen.c 2009-03-16 16:33:40.000000000 +0100
+@@ -3,7 +3,7 @@
+ *
+ * Pentium III FXSR, SSE support
+ * Gareth Hughes <gareth@valinux.com>, May 2000
+- *
++ *
+ * X86-64 port
+ * Andi Kleen.
+ *
+@@ -22,19 +22,18 @@
+ #include <linux/cpu.h>
+ #include <linux/errno.h>
+ #include <linux/sched.h>
++#include <linux/fs.h>
+ #include <linux/kernel.h>
+ #include <linux/mm.h>
+-#include <linux/fs.h>
+ #include <linux/elfcore.h>
+ #include <linux/smp.h>
+ #include <linux/slab.h>
+ #include <linux/user.h>
+-#include <linux/module.h>
+-#include <linux/a.out.h>
+ #include <linux/interrupt.h>
++#include <linux/utsname.h>
+ #include <linux/delay.h>
++#include <linux/module.h>
+ #include <linux/ptrace.h>
+-#include <linux/utsname.h>
+ #include <linux/random.h>
+ #include <linux/notifier.h>
+ #include <linux/kprobes.h>
+@@ -73,7 +72,6 @@ EXPORT_SYMBOL(boot_option_idle_override)
+ */
+ void (*pm_idle)(void);
+ EXPORT_SYMBOL(pm_idle);
+-static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
+
+ static ATOMIC_NOTIFIER_HEAD(idle_notifier);
+
+@@ -81,13 +79,6 @@ void idle_notifier_register(struct notif
+ {
+ atomic_notifier_chain_register(&idle_notifier, n);
+ }
+-EXPORT_SYMBOL_GPL(idle_notifier_register);
+-
+-void idle_notifier_unregister(struct notifier_block *n)
+-{
+- atomic_notifier_chain_unregister(&idle_notifier, n);
+-}
+-EXPORT_SYMBOL(idle_notifier_unregister);
+
+ void enter_idle(void)
+ {
+@@ -116,7 +107,7 @@ void exit_idle(void)
+ * to poll the ->need_resched flag instead of waiting for the
+ * cross-CPU IPI to arrive. Use this option with caution.
+ */
+-static void poll_idle (void)
++static void poll_idle(void)
+ {
+ local_irq_enable();
+ cpu_relax();
+@@ -131,10 +122,19 @@ static void xen_idle(void)
+ */
+ smp_mb();
+ local_irq_disable();
+- if (!need_resched())
+- safe_halt();
+- else
+- local_irq_enable();
++ if (!need_resched()) {
++ ktime_t t0, t1;
++ u64 t0n, t1n;
++
++ t0 = ktime_get();
++ t0n = ktime_to_ns(t0);
++ safe_halt(); /* enables interrupts racelessly */
++ local_irq_disable();
++ t1 = ktime_get();
++ t1n = ktime_to_ns(t1);
++ sched_clock_idle_wakeup_event(t1n - t0n);
++ }
++ local_irq_enable();
+ current_thread_info()->status |= TS_POLLING;
+ }
+
+@@ -161,19 +161,15 @@ static inline void play_dead(void)
+ * low exit latency (ie sit in a loop waiting for
+ * somebody to say that they'd like to reschedule)
+ */
+-void cpu_idle (void)
++void cpu_idle(void)
+ {
+ current_thread_info()->status |= TS_POLLING;
+ /* endless idle loop with no priority at all */
+ while (1) {
++ tick_nohz_stop_sched_tick();
+ while (!need_resched()) {
+ void (*idle)(void);
+
+- if (__get_cpu_var(cpu_idle_state))
+- __get_cpu_var(cpu_idle_state) = 0;
+-
+- tick_nohz_stop_sched_tick();
+-
+ rmb();
+ idle = xen_idle; /* no alternatives */
+ if (cpu_is_offline(smp_processor_id()))
+@@ -203,49 +199,27 @@ static void do_nothing(void *unused)
+ {
+ }
+
++/*
++ * cpu_idle_wait - Used to ensure that all the CPUs discard old value of
++ * pm_idle and update to new pm_idle value. Required while changing pm_idle
++ * handler on SMP systems.
++ *
++ * Caller must have changed pm_idle to the new value before the call. Old
++ * pm_idle value will not be used by any CPU after the return of this function.
++ */
+ void cpu_idle_wait(void)
+ {
+- unsigned int cpu, this_cpu = get_cpu();
+- cpumask_t map, tmp = current->cpus_allowed;
+-
+- set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
+- put_cpu();
+-
+- cpus_clear(map);
+- for_each_online_cpu(cpu) {
+- per_cpu(cpu_idle_state, cpu) = 1;
+- cpu_set(cpu, map);
+- }
+-
+- __get_cpu_var(cpu_idle_state) = 0;
+-
+- wmb();
+- do {
+- ssleep(1);
+- for_each_online_cpu(cpu) {
+- if (cpu_isset(cpu, map) &&
+- !per_cpu(cpu_idle_state, cpu))
+- cpu_clear(cpu, map);
+- }
+- cpus_and(map, map, cpu_online_map);
+- /*
+- * We waited 1 sec, if a CPU still did not call idle
+- * it may be because it is in idle and not waking up
+- * because it has nothing to do.
+- * Give all the remaining CPUS a kick.
+- */
+- smp_call_function_mask(map, do_nothing, 0, 0);
+- } while (!cpus_empty(map));
+-
+- set_cpus_allowed(current, tmp);
++ smp_mb();
++ /* kick all the CPUs so that they exit out of pm_idle */
++ smp_call_function(do_nothing, NULL, 0, 1);
+ }
+ EXPORT_SYMBOL_GPL(cpu_idle_wait);
+
+-void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
++void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
+ {
+ }
+
+-static int __init idle_setup (char *str)
++static int __init idle_setup(char *str)
+ {
+ if (!strcmp(str, "poll")) {
+ printk("using polling idle threads.\n");
+@@ -260,13 +234,13 @@ static int __init idle_setup (char *str)
+ }
+ early_param("idle", idle_setup);
+
+-/* Prints also some state that isn't saved in the pt_regs */
++/* Prints also some state that isn't saved in the pt_regs */
+ void __show_regs(struct pt_regs * regs)
+ {
+ unsigned long fs, gs, shadowgs;
+ unsigned long d0, d1, d2, d3, d6, d7;
+- unsigned int fsindex,gsindex;
+- unsigned int ds,cs,es;
++ unsigned int fsindex, gsindex;
++ unsigned int ds, cs, es;
+
+ printk("\n");
+ print_modules();
+@@ -275,16 +249,16 @@ void __show_regs(struct pt_regs * regs)
+ init_utsname()->release,
+ (int)strcspn(init_utsname()->version, " "),
+ init_utsname()->version);
+- printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
+- printk_address(regs->rip);
+- printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp,
+- regs->eflags);
++ printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
++ printk_address(regs->ip, 1);
++ printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->sp,
++ regs->flags);
+ printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
+- regs->rax, regs->rbx, regs->rcx);
++ regs->ax, regs->bx, regs->cx);
+ printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
+- regs->rdx, regs->rsi, regs->rdi);
++ regs->dx, regs->si, regs->di);
+ printk("RBP: %016lx R08: %016lx R09: %016lx\n",
+- regs->rbp, regs->r8, regs->r9);
++ regs->bp, regs->r8, regs->r9);
+ printk("R10: %016lx R11: %016lx R12: %016lx\n",
+ regs->r10, regs->r11, regs->r12);
+ printk("R13: %016lx R14: %016lx R15: %016lx\n",
+@@ -318,7 +292,7 @@ void show_regs(struct pt_regs *regs)
+ {
+ printk("CPU %d:", smp_processor_id());
+ __show_regs(regs);
+- show_trace(NULL, regs, (void *)(regs + 1));
++ show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
+ }
+
+ /*
+@@ -329,7 +303,7 @@ void exit_thread(void)
+ struct task_struct *me = current;
+ struct thread_struct *t = &me->thread;
+
+- if (me->thread.io_bitmap_ptr) {
++ if (me->thread.io_bitmap_ptr) {
+ #ifndef CONFIG_X86_NO_TSS
+ struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
+ #endif
+@@ -382,7 +356,7 @@ void flush_thread(void)
+ tsk->thread.debugreg3 = 0;
+ tsk->thread.debugreg6 = 0;
+ tsk->thread.debugreg7 = 0;
+- memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
++ memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
+ /*
+ * Forget coprocessor state..
+ */
+@@ -405,26 +379,21 @@ void release_thread(struct task_struct *
+
+ static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
+ {
+- struct user_desc ud = {
++ struct user_desc ud = {
+ .base_addr = addr,
+ .limit = 0xfffff,
+ .seg_32bit = 1,
+ .limit_in_pages = 1,
+ .useable = 1,
+ };
+- struct n_desc_struct *desc = (void *)t->thread.tls_array;
++ struct desc_struct *desc = t->thread.tls_array;
+ desc += tls;
+- desc->a = LDT_entry_a(&ud);
+- desc->b = LDT_entry_b(&ud);
++ fill_ldt(desc, &ud);
+ }
+
+ static inline u32 read_32bit_tls(struct task_struct *t, int tls)
+ {
+- struct desc_struct *desc = (void *)t->thread.tls_array;
+- desc += tls;
+- return desc->base0 |
+- (((u32)desc->base1) << 16) |
+- (((u32)desc->base2) << 24);
++ return get_desc_base(&t->thread.tls_array[tls]);
+ }
+
+ /*
+@@ -436,7 +405,7 @@ void prepare_to_copy(struct task_struct
+ unlazy_fpu(tsk);
+ }
+
+-int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
++int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
+ unsigned long unused,
+ struct task_struct * p, struct pt_regs * regs)
+ {
+@@ -448,14 +417,14 @@ int copy_thread(int nr, unsigned long cl
+ (THREAD_SIZE + task_stack_page(p))) - 1;
+ *childregs = *regs;
+
+- childregs->rax = 0;
+- childregs->rsp = rsp;
+- if (rsp == ~0UL)
+- childregs->rsp = (unsigned long)childregs;
+-
+- p->thread.rsp = (unsigned long) childregs;
+- p->thread.rsp0 = (unsigned long) (childregs+1);
+- p->thread.userrsp = me->thread.userrsp;
++ childregs->ax = 0;
++ childregs->sp = sp;
++ if (sp == ~0UL)
++ childregs->sp = (unsigned long)childregs;
++
++ p->thread.sp = (unsigned long) childregs;
++ p->thread.sp0 = (unsigned long) (childregs+1);
++ p->thread.usersp = me->thread.usersp;
+
+ set_tsk_thread_flag(p, TIF_FORK);
+
+@@ -476,7 +445,7 @@ int copy_thread(int nr, unsigned long cl
+ memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
+ IO_BITMAP_BYTES);
+ set_tsk_thread_flag(p, TIF_IO_BITMAP);
+- }
++ }
+
+ /*
+ * Set a new TLS for the child thread?
+@@ -484,7 +453,8 @@ int copy_thread(int nr, unsigned long cl
+ if (clone_flags & CLONE_SETTLS) {
+ #ifdef CONFIG_IA32_EMULATION
+ if (test_thread_flag(TIF_IA32))
+- err = ia32_child_tls(p, childregs);
++ err = do_set_thread_area(p, -1,
++ (struct user_desc __user *)childregs->si, 0);
+ else
+ #endif
+ err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
+@@ -502,26 +472,32 @@ out:
+ return err;
+ }
+
+-static inline void __save_init_fpu( struct task_struct *tsk )
+-{
+- asm volatile( "rex64 ; fxsave %0 ; fnclex"
+- : "=m" (tsk->thread.i387.fxsave));
+- tsk->thread_info->status &= ~TS_USEDFPU;
+-}
+-
+ /*
+ * This special macro can be used to load a debugging register
+ */
+-#define loaddebug(thread,r) set_debugreg(thread->debugreg ## r, r)
++#define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r)
+
+ static inline void __switch_to_xtra(struct task_struct *prev_p,
+- struct task_struct *next_p)
++ struct task_struct *next_p)
+ {
+ struct thread_struct *prev, *next;
++ unsigned long debugctl;
+
+ prev = &prev_p->thread,
+ next = &next_p->thread;
+
++ debugctl = prev->debugctlmsr;
++ if (next->ds_area_msr != prev->ds_area_msr) {
++ /* we clear debugctl to make sure DS
++ * is not in use when we change it */
++ debugctl = 0;
++ wrmsrl(MSR_IA32_DEBUGCTLMSR, 0);
++ wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr);
++ }
++
++ if (next->debugctlmsr != debugctl)
++ wrmsrl(MSR_IA32_DEBUGCTLMSR, next->debugctlmsr);
++
+ if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
+ loaddebug(next, 0);
+ loaddebug(next, 1);
+@@ -531,12 +507,20 @@ static inline void __switch_to_xtra(stru
+ loaddebug(next, 6);
+ loaddebug(next, 7);
+ }
++
++#ifdef X86_BTS
++ if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
++ ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
++
++ if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
++ ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
++#endif
+ }
+
+ /*
+ * switch_to(x,y) should switch tasks from x to y.
+ *
+- * This could still be optimized:
++ * This could still be optimized:
+ * - fold all the options into a flag word and test it with a single test.
+ * - could test fs/gs bitsliced
+ *
+@@ -547,7 +531,7 @@ __switch_to(struct task_struct *prev_p,
+ {
+ struct thread_struct *prev = &prev_p->thread,
+ *next = &next_p->thread;
+- int cpu = smp_processor_id();
++ int cpu = smp_processor_id();
+ #ifndef CONFIG_X86_NO_TSS
+ struct tss_struct *tss = &per_cpu(init_tss, cpu);
+ #endif
+@@ -581,11 +565,12 @@ __switch_to(struct task_struct *prev_p,
+ prev_p->fpu_counter = 0;
+
+ /*
+- * Reload esp0, LDT and the page table pointer:
++ * Reload sp0.
++ * This is load_sp0(tss, next) with a multicall.
+ */
+ mcl->op = __HYPERVISOR_stack_switch;
+ mcl->args[0] = __KERNEL_DS;
+- mcl->args[1] = next->rsp0;
++ mcl->args[1] = next->sp0;
+ mcl++;
+
+ /*
+@@ -593,11 +578,12 @@ __switch_to(struct task_struct *prev_p,
+ * This is load_TLS(next, cpu) with multicalls.
+ */
+ #define C(i) do { \
+- if (unlikely(next->tls_array[i] != prev->tls_array[i])) { \
++ if (unlikely(next->tls_array[i].a != prev->tls_array[i].a || \
++ next->tls_array[i].b != prev->tls_array[i].b)) { \
+ mcl->op = __HYPERVISOR_update_descriptor; \
+ mcl->args[0] = virt_to_machine( \
+- &cpu_gdt(cpu)[GDT_ENTRY_TLS_MIN + i]); \
+- mcl->args[1] = next->tls_array[i]; \
++ &get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]);\
++ mcl->args[1] = *(u64 *)&next->tls_array[i]; \
+ mcl++; \
+ } \
+ } while (0)
+@@ -605,7 +591,7 @@ __switch_to(struct task_struct *prev_p,
+ #undef C
+
+ if (unlikely(prev->iopl != next->iopl)) {
+- iopl_op.iopl = (next->iopl == 0) ? 1 : next->iopl;
++ iopl_op.iopl = (next->iopl == 0) ? 1 : (next->iopl >> 12) & 3;
+ #if CONFIG_XEN_COMPAT > 0x030002
+ mcl->op = __HYPERVISOR_physdev_op;
+ mcl->args[0] = PHYSDEVOP_set_iopl;
+@@ -669,8 +655,8 @@ __switch_to(struct task_struct *prev_p,
+ /*
+ * Switch the PDA context.
+ */
+- prev->userrsp = read_pda(oldrsp);
+- write_pda(oldrsp, next->userrsp);
++ prev->usersp = read_pda(oldrsp);
++ write_pda(oldrsp, next->usersp);
+ write_pda(pcurrent, next_p);
+ write_pda(kernelstack,
+ (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
+@@ -687,7 +673,8 @@ __switch_to(struct task_struct *prev_p,
+ /*
+ * Now maybe reload the debug registers
+ */
+- if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW))
++ if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
++ task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
+ __switch_to_xtra(prev_p, next_p);
+
+ /* If the task has used fpu the last 5 timeslices, just do a full
+@@ -702,23 +689,18 @@ __switch_to(struct task_struct *prev_p,
+ /*
+ * sys_execve() executes a new program.
+ */
+-asmlinkage
++asmlinkage
+ long sys_execve(char __user *name, char __user * __user *argv,
+- char __user * __user *envp, struct pt_regs regs)
++ char __user * __user *envp, struct pt_regs *regs)
+ {
+ long error;
+ char * filename;
+
+ filename = getname(name);
+ error = PTR_ERR(filename);
+- if (IS_ERR(filename))
++ if (IS_ERR(filename))
+ return error;
+-	error = do_execve(filename, argv, envp, &regs);
+- if (error == 0) {
+- task_lock(current);
+- current->ptrace &= ~PT_DTRACE;
+- task_unlock(current);
+- }
++ error = do_execve(filename, argv, envp, regs);
+ putname(filename);
+ return error;
+ }
+@@ -728,18 +710,18 @@ void set_personality_64bit(void)
+ /* inherit personality from parent */
+
+ /* Make sure to be in 64bit mode */
+- clear_thread_flag(TIF_IA32);
++ clear_thread_flag(TIF_IA32);
+
+ /* TBD: overwrites user setup. Should have two bits.
+ But 64bit processes have always behaved this way,
+ so it's not too bad. The main problem is just that
+- 32bit childs are affected again. */
++ 32bit childs are affected again. */
+ current->personality &= ~READ_IMPLIES_EXEC;
+ }
+
+ asmlinkage long sys_fork(struct pt_regs *regs)
+ {
+- return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
++ return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
+ }
+
+ asmlinkage long
+@@ -747,7 +729,7 @@ sys_clone(unsigned long clone_flags, uns
+ void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
+ {
+ if (!newsp)
+- newsp = regs->rsp;
++ newsp = regs->sp;
+ return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
+ }
+
+@@ -763,29 +745,29 @@ sys_clone(unsigned long clone_flags, uns
+ */
+ asmlinkage long sys_vfork(struct pt_regs *regs)
+ {
+- return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0,
++ return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
+ NULL, NULL);
+ }
+
+ unsigned long get_wchan(struct task_struct *p)
+ {
+ unsigned long stack;
+- u64 fp,rip;
++ u64 fp,ip;
+ int count = 0;
+
+ if (!p || p == current || p->state==TASK_RUNNING)
+ return 0;
+ stack = (unsigned long)task_stack_page(p);
+- if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE)
++ if (p->thread.sp < stack || p->thread.sp > stack+THREAD_SIZE)
+ return 0;
+- fp = *(u64 *)(p->thread.rsp);
++ fp = *(u64 *)(p->thread.sp);
+ do {
+ if (fp < (unsigned long)stack ||
+ fp > (unsigned long)stack+THREAD_SIZE)
+ return 0;
+- rip = *(u64 *)(fp+8);
+- if (!in_sched_functions(rip))
+- return rip;
++ ip = *(u64 *)(fp+8);
++ if (!in_sched_functions(ip))
++ return ip;
+ fp = *(u64 *)fp;
+ } while (count++ < 16);
+ return 0;
+@@ -827,19 +809,19 @@ long do_arch_prctl(struct task_struct *t
+ /* Not strictly needed for fs, but do it for symmetry
+ with gs */
+ if (addr >= TASK_SIZE_OF(task))
+- return -EPERM;
++ return -EPERM;
+ cpu = get_cpu();
+- /* handle small bases via the GDT because that's faster to
++ /* handle small bases via the GDT because that's faster to
+ switch. */
+- if (addr <= 0xffffffff) {
++ if (addr <= 0xffffffff) {
+ set_32bit_tls(task, FS_TLS, addr);
+- if (doit) {
+- load_TLS(&task->thread, cpu);
++ if (doit) {
++ load_TLS(&task->thread, cpu);
+ asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
+ }
+ task->thread.fsindex = FS_TLS_SEL;
+ task->thread.fs = 0;
+- } else {
++ } else {
+ task->thread.fsindex = 0;
+ task->thread.fs = addr;
+ if (doit) {
+@@ -852,24 +834,24 @@ long do_arch_prctl(struct task_struct *t
+ }
+ put_cpu();
+ break;
+- case ARCH_GET_FS: {
+- unsigned long base;
++ case ARCH_GET_FS: {
++ unsigned long base;
+ if (task->thread.fsindex == FS_TLS_SEL)
+ base = read_32bit_tls(task, FS_TLS);
+ else if (doit)
+ rdmsrl(MSR_FS_BASE, base);
+ else
+ base = task->thread.fs;
+- ret = put_user(base, (unsigned long __user *)addr);
+- break;
++ ret = put_user(base, (unsigned long __user *)addr);
++ break;
+ }
+- case ARCH_GET_GS: {
++ case ARCH_GET_GS: {
+ unsigned long base;
+ unsigned gsindex;
+ if (task->thread.gsindex == GS_TLS_SEL)
+ base = read_32bit_tls(task, GS_TLS);
+ else if (doit) {
+- asm("movl %%gs,%0" : "=r" (gsindex));
++ asm("movl %%gs,%0" : "=r" (gsindex));
+ if (gsindex)
+ rdmsrl(MSR_KERNEL_GS_BASE, base);
+ else
+@@ -877,40 +859,21 @@ long do_arch_prctl(struct task_struct *t
+ }
+ else
+ base = task->thread.gs;
+- ret = put_user(base, (unsigned long __user *)addr);
++ ret = put_user(base, (unsigned long __user *)addr);
+ break;
+ }
+
+ default:
+ ret = -EINVAL;
+ break;
+- }
++ }
+
+- return ret;
+-}
++ return ret;
++}
+
+ long sys_arch_prctl(int code, unsigned long addr)
+ {
+ return do_arch_prctl(current, code, addr);
+-}
+-
+-/*
+- * Capture the user space registers if the task is not running (in user space)
+- */
+-int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
+-{
+- struct pt_regs *pp, ptregs;
+-
+- pp = task_pt_regs(tsk);
+-
+- ptregs = *pp;
+- ptregs.cs &= 0xffff;
+- ptregs.ss &= 0xffff;
+-
+- elf_core_copy_regs(regs, &ptregs);
+-
+- boot_option_idle_override = 1;
+- return 1;
+ }
+
+ unsigned long arch_align_stack(unsigned long sp)
+@@ -919,3 +882,9 @@ unsigned long arch_align_stack(unsigned
+ sp -= get_random_int() % 8192;
+ return sp & ~0xf;
+ }
++
++unsigned long arch_randomize_brk(struct mm_struct *mm)
++{
++ unsigned long range_end = mm->brk + 0x02000000;
++ return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
++}
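+
+For illustration only: the arch_randomize_brk() hunk above picks a page-aligned
+address in [brk, brk + 32 MiB) and falls back to brk when randomize_range()
+returns 0. A minimal user-space sketch of that fallback logic follows;
+mock_randomize_range() is a hypothetical stand-in for the kernel helper, with
+plain rand() replacing the kernel entropy source.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+
+#define PAGE_MASK (~0xfffUL)
+
+/* crude stand-in for the kernel's randomize_range(): returns a
+ * page-aligned address in [start, end - len), or 0 when the range
+ * is empty so that the caller can fall back to start */
+static unsigned long mock_randomize_range(unsigned long start,
+					  unsigned long end,
+					  unsigned long len)
+{
+	if (end <= start + len)
+		return 0;
+	return PAGE_MASK & (start + rand() % (end - len - start));
+}
+
+int main(void)
+{
+	unsigned long brk = 0x700000000000UL;		/* page-aligned brk */
+	unsigned long range_end = brk + 0x02000000;	/* brk + 32 MiB */
+	unsigned long r;
+
+	srand((unsigned)time(NULL));
+	r = mock_randomize_range(brk, range_end, 0);
+	printf("brk 0x%lx -> randomized brk 0x%lx\n", brk, r ? r : brk);
+	return 0;
+}
+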
+--- sle11-2009-06-29.orig/arch/x86/kernel/quirks-xen.c 2009-02-16 16:18:36.000000000 +0100
++++ sle11-2009-06-29/arch/x86/kernel/quirks-xen.c 2009-03-16 16:33:40.000000000 +0100
+@@ -9,7 +9,7 @@
+ static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
+ {
+ u8 config, rev;
+- u32 word;
++ u16 word;
+
+ /* BIOS may enable hardware IRQ balancing for
+ * E7520/E7320/E7525(revision ID 0x9 and below)
+@@ -24,14 +24,17 @@ static void __devinit quirk_intel_irqbal
+ pci_read_config_byte(dev, 0xf4, &config);
+ pci_write_config_byte(dev, 0xf4, config|0x2);
+
+- /* read xTPR register */
+- raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word);
++ /*
++ * read xTPR register. We may not have a pci_dev for device 8
++ * because it might be hidden until the above write.
++ */
++ pci_bus_read_config_word(dev->bus, PCI_DEVFN(8, 0), 0x4c, &word);
+
+ if (!(word & (1 << 13))) {
+ struct xen_platform_op op;
+
+- printk(KERN_INFO "Intel E7520/7320/7525 detected. "
+- "Disabling irq balancing and affinity\n");
++ dev_info(&dev->dev, "Intel E7520/7320/7525 detected; "
++ "disabling irq balancing and affinity\n");
+ op.cmd = XENPF_platform_quirk;
+ op.u.platform_quirk.quirk_id = QUIRK_NOIRQBALANCING;
+ WARN_ON(HYPERVISOR_platform_op(&op));
+@@ -102,14 +105,16 @@ static void ich_force_enable_hpet(struct
+ pci_read_config_dword(dev, 0xF0, &rcba);
+ rcba &= 0xFFFFC000;
+ if (rcba == 0) {
+- printk(KERN_DEBUG "RCBA disabled. Cannot force enable HPET\n");
++ dev_printk(KERN_DEBUG, &dev->dev, "RCBA disabled; "
++ "cannot force enable HPET\n");
+ return;
+ }
+
+ /* use bits 31:14, 16 kB aligned */
+ rcba_base = ioremap_nocache(rcba, 0x4000);
+ if (rcba_base == NULL) {
+- printk(KERN_DEBUG "ioremap failed. Cannot force enable HPET\n");
++ dev_printk(KERN_DEBUG, &dev->dev, "ioremap failed; "
++ "cannot force enable HPET\n");
+ return;
+ }
+
+@@ -120,8 +125,8 @@ static void ich_force_enable_hpet(struct
+ /* HPET is enabled in HPTC. Just not reported by BIOS */
+ val = val & 0x3;
+ force_hpet_address = 0xFED00000 | (val << 12);
+- printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n",
+- force_hpet_address);
++ dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at "
++ "0x%lx\n", force_hpet_address);
+ iounmap(rcba_base);
+ return;
+ }
+@@ -140,11 +145,12 @@ static void ich_force_enable_hpet(struct
+ if (err) {
+ force_hpet_address = 0;
+ iounmap(rcba_base);
+- printk(KERN_DEBUG "Failed to force enable HPET\n");
++ dev_printk(KERN_DEBUG, &dev->dev,
++ "Failed to force enable HPET\n");
+ } else {
+ force_hpet_resume_type = ICH_FORCE_HPET_RESUME;
+- printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n",
+- force_hpet_address);
++ dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at "
++ "0x%lx\n", force_hpet_address);
+ }
+ }
+
+@@ -160,6 +166,8 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_I
+ ich_force_enable_hpet);
+ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_1,
+ ich_force_enable_hpet);
++DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH9_7,
++ ich_force_enable_hpet);
+
+
+ static struct pci_dev *cached_dev;
+@@ -204,8 +212,8 @@ static void old_ich_force_enable_hpet(st
+ if (val & 0x4) {
+ val &= 0x3;
+ force_hpet_address = 0xFED00000 | (val << 12);
+- printk(KERN_DEBUG "HPET at base address 0x%lx\n",
+- force_hpet_address);
++ dev_printk(KERN_DEBUG, &dev->dev, "HPET at 0x%lx\n",
++ force_hpet_address);
+ return;
+ }
+
+@@ -225,14 +233,14 @@ static void old_ich_force_enable_hpet(st
+ /* HPET is enabled in HPTC. Just not reported by BIOS */
+ val &= 0x3;
+ force_hpet_address = 0xFED00000 | (val << 12);
+- printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n",
+- force_hpet_address);
++ dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at "
++ "0x%lx\n", force_hpet_address);
+ cached_dev = dev;
+ force_hpet_resume_type = OLD_ICH_FORCE_HPET_RESUME;
+ return;
+ }
+
+- printk(KERN_DEBUG "Failed to force enable HPET\n");
++ dev_printk(KERN_DEBUG, &dev->dev, "Failed to force enable HPET\n");
+ }
+
+ /*
+@@ -290,8 +298,8 @@ static void vt8237_force_enable_hpet(str
+ */
+ if (val & 0x80) {
+ force_hpet_address = (val & ~0x3ff);
+- printk(KERN_DEBUG "HPET at base address 0x%lx\n",
+- force_hpet_address);
++ dev_printk(KERN_DEBUG, &dev->dev, "HPET at 0x%lx\n",
++ force_hpet_address);
+ return;
+ }
+
+@@ -305,14 +313,14 @@ static void vt8237_force_enable_hpet(str
+ pci_read_config_dword(dev, 0x68, &val);
+ if (val & 0x80) {
+ force_hpet_address = (val & ~0x3ff);
+- printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n",
+- force_hpet_address);
++ dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at "
++ "0x%lx\n", force_hpet_address);
+ cached_dev = dev;
+ force_hpet_resume_type = VT8237_FORCE_HPET_RESUME;
+ return;
+ }
+
+- printk(KERN_DEBUG "Failed to force enable HPET\n");
++ dev_printk(KERN_DEBUG, &dev->dev, "Failed to force enable HPET\n");
+ }
+
+ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235,
+@@ -340,7 +348,7 @@ static void nvidia_force_enable_hpet(str
+ pci_read_config_dword(dev, 0x44, &val);
+ force_hpet_address = val & 0xfffffffe;
+ force_hpet_resume_type = NVIDIA_FORCE_HPET_RESUME;
+- printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n",
++ dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n",
+ force_hpet_address);
+ cached_dev = dev;
+ return;
+@@ -353,6 +361,8 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_N
+ nvidia_force_enable_hpet);
+
+ /* LPC bridges */
++DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0260,
++ nvidia_force_enable_hpet);
+ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0360,
+ nvidia_force_enable_hpet);
+ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0361,
+@@ -373,19 +383,19 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_N
+ void force_hpet_resume(void)
+ {
+ switch (force_hpet_resume_type) {
+- case ICH_FORCE_HPET_RESUME:
+- return ich_force_hpet_resume();
+-
+- case OLD_ICH_FORCE_HPET_RESUME:
+- return old_ich_force_hpet_resume();
+-
+- case VT8237_FORCE_HPET_RESUME:
+- return vt8237_force_hpet_resume();
+-
+- case NVIDIA_FORCE_HPET_RESUME:
+- return nvidia_force_hpet_resume();
+-
+- default:
++ case ICH_FORCE_HPET_RESUME:
++ ich_force_hpet_resume();
++ return;
++ case OLD_ICH_FORCE_HPET_RESUME:
++ old_ich_force_hpet_resume();
++ return;
++ case VT8237_FORCE_HPET_RESUME:
++ vt8237_force_hpet_resume();
++ return;
++ case NVIDIA_FORCE_HPET_RESUME:
++ nvidia_force_hpet_resume();
++ return;
++ default:
+ break;
+ }
+ }
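+
+As an aside: every ICH force-enable path above forms the HPET base as
+0xFED00000 | (val << 12), where val is the two-bit address-select field read
+from the HPTC register. A stand-alone snippet enumerating the four possible
+bases that expression can produce:
+
+#include <stdio.h>
+
+int main(void)
+{
+	unsigned long val;
+
+	for (val = 0; val < 4; val++)	/* HPTC address-select field */
+		printf("select %lu -> HPET at 0x%lx\n",
+		       val, 0xFED00000UL | (val << 12));
+	return 0;
+}
+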
+--- sle11-2009-06-29.orig/arch/x86/kernel/rtc.c 2009-06-29 15:14:52.000000000 +0200
++++ sle11-2009-06-29/arch/x86/kernel/rtc.c 2009-03-16 16:33:40.000000000 +0100
+@@ -181,6 +181,10 @@ unsigned long read_persistent_clock(void
+ {
+ unsigned long retval, flags;
+
++#ifdef CONFIG_XEN
++ if (!is_initial_xendomain())
++ return xen_read_persistent_clock();
++#endif
+ spin_lock_irqsave(&rtc_lock, flags);
+ retval = get_wallclock();
+ spin_unlock_irqrestore(&rtc_lock, flags);
+@@ -190,6 +194,10 @@ unsigned long read_persistent_clock(void
+
+ int update_persistent_clock(struct timespec now)
+ {
++#ifdef CONFIG_XEN
++ if (xen_update_persistent_clock() < 0 || xen_independent_wallclock())
++ return 0;
++#endif
+ return set_rtc_mmss(now.tv_sec);
+ }
+
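+
+A compressed sketch (illustrative, not from the patch) of the wallclock policy
+the rtc.c hunks above introduce: an unprivileged Xen domain reads the
+hypervisor-provided clock, dom0 keeps using the CMOS RTC, and set_rtc_mmss()
+runs only when the Xen update path declines. The Xen helpers named in the
+hunks are mocked here as plain booleans.
+
+#include <stdio.h>
+#include <stdbool.h>
+
+static bool initial_xendomain;		/* false: unprivileged domU */
+static bool independent_wallclock;
+
+static const char *read_clock_source(void)
+{
+	if (!initial_xendomain)
+		return "xen_read_persistent_clock()";
+	return "get_wallclock() via CMOS RTC";
+}
+
+static bool update_reaches_rtc(bool xen_update_failed)
+{
+	if (xen_update_failed || independent_wallclock)
+		return false;		/* mirrors the early return 0 */
+	return true;			/* falls through to set_rtc_mmss() */
+}
+
+int main(void)
+{
+	printf("read:   %s\n", read_clock_source());
+	printf("update: %s\n", update_reaches_rtc(false) ? "RTC" : "skipped");
+	return 0;
+}
+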
+--- sle11-2009-06-29.orig/arch/x86/kernel/setup64-xen.c 2009-02-16 16:18:36.000000000 +0100
++++ sle11-2009-06-29/arch/x86/kernel/setup64-xen.c 2009-03-16 16:33:40.000000000 +0100
+@@ -31,7 +31,11 @@
+ #include <asm/hypervisor.h>
+ #endif
+
++#ifndef CONFIG_DEBUG_BOOT_PARAMS
+ struct boot_params __initdata boot_params;
++#else
++struct boot_params boot_params;
++#endif
+
+ cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
+
+@@ -47,6 +51,7 @@ char boot_cpu_stack[IRQSTACKSIZE] __attr
+
+ unsigned long __supported_pte_mask __read_mostly = ~0UL;
+ EXPORT_SYMBOL(__supported_pte_mask);
++
+ static int do_not_nx __cpuinitdata = 0;
+
+ /* noexec=on|off
+@@ -90,6 +95,45 @@ static int __init nonx32_setup(char *str
+ __setup("noexec32=", nonx32_setup);
+
+ /*
++ * Copy data used in early init routines from the initial arrays to the
++ * per cpu data areas. These arrays then become expendable and the
++ * *_early_ptr's are zeroed indicating that the static arrays are gone.
++ */
++static void __init setup_per_cpu_maps(void)
++{
++#ifndef CONFIG_XEN
++ int cpu;
++
++ for_each_possible_cpu(cpu) {
++#ifdef CONFIG_SMP
++ if (per_cpu_offset(cpu)) {
++#endif
++ per_cpu(x86_cpu_to_apicid, cpu) =
++ x86_cpu_to_apicid_init[cpu];
++ per_cpu(x86_bios_cpu_apicid, cpu) =
++ x86_bios_cpu_apicid_init[cpu];
++#ifdef CONFIG_NUMA
++ per_cpu(x86_cpu_to_node_map, cpu) =
++ x86_cpu_to_node_map_init[cpu];
++#endif
++#ifdef CONFIG_SMP
++ }
++ else
++ printk(KERN_NOTICE "per_cpu_offset zero for cpu %d\n",
++ cpu);
++#endif
++ }
++
++ /* indicate the early static arrays will soon be gone */
++ x86_cpu_to_apicid_early_ptr = NULL;
++ x86_bios_cpu_apicid_early_ptr = NULL;
++#ifdef CONFIG_NUMA
++ x86_cpu_to_node_map_early_ptr = NULL;
++#endif
++#endif
++}
++
++/*
+ * Great future plan:
+ * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
+ * Always point %gs to its beginning
+@@ -109,19 +153,24 @@ void __init setup_per_cpu_areas(void)
+ printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n", size);
+ for_each_cpu_mask (i, cpu_possible_map) {
+ char *ptr;
++#ifndef CONFIG_NEED_MULTIPLE_NODES
++ ptr = alloc_bootmem_pages(size);
++#else
++ int node = early_cpu_to_node(i);
+
+- if (!NODE_DATA(cpu_to_node(i))) {
+- printk("cpu with no node %d, num_online_nodes %d\n",
+- i, num_online_nodes());
++ if (!node_online(node) || !NODE_DATA(node))
+ ptr = alloc_bootmem_pages(size);
+- } else {
+- ptr = alloc_bootmem_pages_node(NODE_DATA(cpu_to_node(i)), size);
+- }
++ else
++ ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
++#endif
+ if (!ptr)
+ panic("Cannot allocate cpu data for CPU %d\n", i);
+ cpu_pda(i)->data_offset = ptr - __per_cpu_start;
+ memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
+ }
++
++ /* setup percpu data maps early */
++ setup_per_cpu_maps();
+ }
+
+ #ifdef CONFIG_XEN
+@@ -224,7 +273,8 @@ void syscall_init(void)
+ wrmsrl(MSR_CSTAR, ignore_sysret);
+
+ /* Flags to clear on syscall */
+- wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000);
++ wrmsrl(MSR_SYSCALL_MASK,
++ X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
+ #endif
+ #ifdef CONFIG_IA32_EMULATION
+ syscall32_cpu_init ();
+@@ -303,7 +353,7 @@ void __cpuinit cpu_init (void)
+ */
+ #ifndef CONFIG_XEN
+ if (cpu)
+- memcpy(cpu_gdt(cpu), cpu_gdt_table, GDT_SIZE);
++ memcpy(get_cpu_gdt_table(cpu), cpu_gdt_table, GDT_SIZE);
+ #endif
+
+ cpu_gdt_descr[cpu].size = GDT_SIZE;
+@@ -334,10 +384,10 @@ void __cpuinit cpu_init (void)
+ v, cpu);
+ }
+ estacks += PAGE_SIZE << order[v];
+- orig_ist->ist[v] = t->ist[v] = (unsigned long)estacks;
++ orig_ist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks;
+ }
+
+- t->io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
++ t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
+ /*
+ * <= is required because the CPU will access up to
+ * 8 bits beyond the end of the IO permission bitmap.
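+
+A quick check, for illustration: the syscall_init() hunk above replaces the
+magic EF_TF|EF_DF|EF_IE|0x3000 with named X86_EFLAGS_* constants; both spell
+the same architectural mask. The bit values below are the standard x86 EFLAGS
+positions, so the expression evaluates to 0x3700:
+
+#include <stdio.h>
+
+#define X86_EFLAGS_TF	0x00000100	/* trap flag */
+#define X86_EFLAGS_IF	0x00000200	/* interrupt enable flag */
+#define X86_EFLAGS_DF	0x00000400	/* direction flag */
+#define X86_EFLAGS_IOPL	0x00003000	/* I/O privilege level field */
+
+int main(void)
+{
+	unsigned long mask = X86_EFLAGS_TF | X86_EFLAGS_DF |
+			     X86_EFLAGS_IF | X86_EFLAGS_IOPL;
+
+	printf("MSR_SYSCALL_MASK = 0x%lx\n", mask);	/* prints 0x3700 */
+	return 0;
+}
+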
+--- sle11-2009-06-29.orig/arch/x86/kernel/setup_32-xen.c 2009-02-16 16:18:36.000000000 +0100
++++ sle11-2009-06-29/arch/x86/kernel/setup_32-xen.c 2009-03-16 16:33:40.000000000 +0100
+@@ -47,9 +47,12 @@
+ #include <linux/crash_dump.h>
+ #include <linux/dmi.h>
+ #include <linux/pfn.h>
++#include <linux/pci.h>
++#include <linux/init_ohci1394_dma.h>
+
+ #include <video/edid.h>
+
++#include <asm/mtrr.h>
+ #include <asm/apic.h>
+ #include <asm/e820.h>
+ #include <asm/mpspec.h>
+@@ -79,14 +82,83 @@ static struct notifier_block xen_panic_b
+ xen_panic_event, NULL, 0 /* try to go last */
+ };
+
+-int disable_pse __cpuinitdata = 0;
+-
+ /*
+ * Machine setup..
+ */
+-extern struct resource code_resource;
+-extern struct resource data_resource;
+-extern struct resource bss_resource;
++static struct resource data_resource = {
++ .name = "Kernel data",
++ .start = 0,
++ .end = 0,
++ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
++};
++
++static struct resource code_resource = {
++ .name = "Kernel code",
++ .start = 0,
++ .end = 0,
++ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
++};
++
++static struct resource bss_resource = {
++ .name = "Kernel bss",
++ .start = 0,
++ .end = 0,
++ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
++};
++
++static struct resource video_ram_resource = {
++ .name = "Video RAM area",
++ .start = 0xa0000,
++ .end = 0xbffff,
++ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
++};
++
++static struct resource standard_io_resources[] = { {
++ .name = "dma1",
++ .start = 0x0000,
++ .end = 0x001f,
++ .flags = IORESOURCE_BUSY | IORESOURCE_IO
++}, {
++ .name = "pic1",
++ .start = 0x0020,
++ .end = 0x0021,
++ .flags = IORESOURCE_BUSY | IORESOURCE_IO
++}, {
++ .name = "timer0",
++ .start = 0x0040,
++ .end = 0x0043,
++ .flags = IORESOURCE_BUSY | IORESOURCE_IO
++}, {
++ .name = "timer1",
++ .start = 0x0050,
++ .end = 0x0053,
++ .flags = IORESOURCE_BUSY | IORESOURCE_IO
++}, {
++ .name = "keyboard",
++ .start = 0x0060,
++ .end = 0x006f,
++ .flags = IORESOURCE_BUSY | IORESOURCE_IO
++}, {
++ .name = "dma page reg",
++ .start = 0x0080,
++ .end = 0x008f,
++ .flags = IORESOURCE_BUSY | IORESOURCE_IO
++}, {
++ .name = "pic2",
++ .start = 0x00a0,
++ .end = 0x00a1,
++ .flags = IORESOURCE_BUSY | IORESOURCE_IO
++}, {
++ .name = "dma2",
++ .start = 0x00c0,
++ .end = 0x00df,
++ .flags = IORESOURCE_BUSY | IORESOURCE_IO
++}, {
++ .name = "fpu",
++ .start = 0x00f0,
++ .end = 0x00ff,
++ .flags = IORESOURCE_BUSY | IORESOURCE_IO
++} };
+
+ /* cpu data as detected by the assembly code in head.S */
+ struct cpuinfo_x86 new_cpu_data __cpuinitdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
+@@ -94,13 +166,16 @@ struct cpuinfo_x86 new_cpu_data __cpuini
+ struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
+ EXPORT_SYMBOL(boot_cpu_data);
+
++#ifndef CONFIG_X86_PAE
+ unsigned long mmu_cr4_features;
++#else
++unsigned long mmu_cr4_features = X86_CR4_PAE;
++#endif
+
+ /* for MCA, but anyone else can use it if they want */
+ unsigned int machine_id;
+ unsigned int machine_submodel_id;
+ unsigned int BIOS_revision;
+-unsigned int mca_pentium_flag;
+
+ /* Boot loader ID as an integer, for the benefit of proc_dointvec */
+ int bootloader_type;
+@@ -131,13 +206,17 @@ extern int root_mountflags;
+
+ unsigned long saved_videomode;
+
+-#define RAMDISK_IMAGE_START_MASK 0x07FF
++#define RAMDISK_IMAGE_START_MASK 0x07FF
+ #define RAMDISK_PROMPT_FLAG 0x8000
+-#define RAMDISK_LOAD_FLAG 0x4000
++#define RAMDISK_LOAD_FLAG 0x4000
+
+ static char __initdata command_line[COMMAND_LINE_SIZE];
+
++#ifndef CONFIG_DEBUG_BOOT_PARAMS
+ struct boot_params __initdata boot_params;
++#else
++struct boot_params boot_params;
++#endif
+
+ /*
+ * Point at the empty zero page to start with. We map the real shared_info
+@@ -198,8 +277,7 @@ static int __init parse_mem(char *arg)
+ return -EINVAL;
+
+ if (strcmp(arg, "nopentium") == 0) {
+- clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
+- disable_pse = 1;
++ setup_clear_cpu_cap(X86_FEATURE_PSE);
+ } else {
+ /* If the user specifies memory size, we
+ * limit the BIOS-provided memory map to
+@@ -208,7 +286,7 @@ static int __init parse_mem(char *arg)
+ * trim the existing memory map.
+ */
+ unsigned long long mem_size;
+-
++
+ mem_size = memparse(arg, &arg);
+ limit_regions(mem_size);
+ user_defined_memmap = 1;
+@@ -350,7 +428,7 @@ static void __init reserve_ebda_region(v
+ unsigned int addr;
+ addr = get_bios_ebda();
+ if (addr)
+- reserve_bootmem(addr, PAGE_SIZE);
++ reserve_bootmem(addr, PAGE_SIZE, BOOTMEM_DEFAULT);
+ }
+ #endif
+
+@@ -365,8 +443,6 @@ static unsigned long __init setup_memory
+ min_low_pfn = PFN_UP(__pa(xen_start_info->pt_base)) +
+ xen_start_info->nr_pt_frames;
+
+- find_max_pfn();
+-
+ max_low_pfn = find_max_low_pfn();
+
+ #ifdef CONFIG_HIGHMEM
+@@ -447,7 +523,8 @@ static void __init reserve_crashkernel(v
+ (unsigned long)(total_mem >> 20));
+ crashk_res.start = crash_base;
+ crashk_res.end = crash_base + crash_size - 1;
+- reserve_bootmem(crash_base, crash_size);
++ reserve_bootmem(crash_base, crash_size,
++ BOOTMEM_DEFAULT);
+ } else
+ printk(KERN_INFO "crashkernel reservation failed - "
+ "you have to specify a base address\n");
+@@ -461,6 +538,99 @@ static inline void __init reserve_crashk
+ {}
+ #endif
+
++#ifdef CONFIG_BLK_DEV_INITRD
++
++static bool do_relocate_initrd = false;
++
++static void __init reserve_initrd(void)
++{
++ unsigned long ramdisk_image = __pa(xen_start_info->mod_start);
++ unsigned long ramdisk_size = xen_start_info->mod_len;
++ unsigned long ramdisk_end = ramdisk_image + ramdisk_size;
++ unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
++ unsigned long ramdisk_here;
++
++ initrd_start = 0;
++
++ if (!xen_start_info->mod_start || !ramdisk_size)
++ return; /* No initrd provided by bootloader */
++
++ if (ramdisk_end < ramdisk_image) {
++ printk(KERN_ERR "initrd wraps around end of memory, "
++ "disabling initrd\n");
++ return;
++ }
++ if (ramdisk_size >= end_of_lowmem/2) {
++ printk(KERN_ERR "initrd too large to handle, "
++ "disabling initrd\n");
++ return;
++ }
++ if (ramdisk_end <= end_of_lowmem) {
++ /* All in lowmem, easy case */
++ reserve_bootmem(ramdisk_image, ramdisk_size, BOOTMEM_DEFAULT);
++ initrd_start = ramdisk_image + PAGE_OFFSET;
++ initrd_end = initrd_start+ramdisk_size;
++ return;
++ }
++
++ /* We need to move the initrd down into lowmem */
++ ramdisk_here = (end_of_lowmem - ramdisk_size) & PAGE_MASK;
++
++ /* Note: this includes all the lowmem currently occupied by
++ the initrd, we rely on that fact to keep the data intact. */
++ reserve_bootmem(ramdisk_here, ramdisk_size, BOOTMEM_DEFAULT);
++ initrd_start = ramdisk_here + PAGE_OFFSET;
++ initrd_end = initrd_start + ramdisk_size;
++
++ do_relocate_initrd = true;
++}
++
++#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
++
++static void __init relocate_initrd(void)
++{
++ unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
++ unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
++ unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
++ unsigned long ramdisk_here;
++ unsigned long slop, clen, mapaddr;
++ char *p, *q;
++
++ if (!do_relocate_initrd)
++ return;
++
++ ramdisk_here = initrd_start - PAGE_OFFSET;
++
++ q = (char *)initrd_start;
++
++ /* Copy any lowmem portion of the initrd */
++ if (ramdisk_image < end_of_lowmem) {
++ clen = end_of_lowmem - ramdisk_image;
++ p = (char *)__va(ramdisk_image);
++ memcpy(q, p, clen);
++ q += clen;
++ ramdisk_image += clen;
++ ramdisk_size -= clen;
++ }
++
++ /* Copy the highmem portion of the initrd */
++ while (ramdisk_size) {
++ slop = ramdisk_image & ~PAGE_MASK;
++ clen = ramdisk_size;
++ if (clen > MAX_MAP_CHUNK-slop)
++ clen = MAX_MAP_CHUNK-slop;
++ mapaddr = ramdisk_image & PAGE_MASK;
++ p = early_ioremap(mapaddr, clen+slop);
++ memcpy(q, p+slop, clen);
++ early_iounmap(p, clen+slop);
++ q += clen;
++ ramdisk_image += clen;
++ ramdisk_size -= clen;
++ }
++}
++
++#endif /* CONFIG_BLK_DEV_INITRD */
++
+ void __init setup_bootmem_allocator(void)
+ {
+ unsigned long bootmap_size;
+@@ -478,14 +648,15 @@ void __init setup_bootmem_allocator(void
+ * bootmem allocator with an invalid RAM area.
+ */
+ reserve_bootmem(__pa_symbol(_text), (PFN_PHYS(min_low_pfn) +
+- bootmap_size + PAGE_SIZE-1) - __pa_symbol(_text));
++ bootmap_size + PAGE_SIZE-1) - __pa_symbol(_text),
++ BOOTMEM_DEFAULT);
+
+ #ifndef CONFIG_XEN
+ /*
+ * reserve physical page 0 - it's a special BIOS page on many boxes,
+ * enabling clean reboots, SMP operation, laptop functions.
+ */
+- reserve_bootmem(0, PAGE_SIZE);
++ reserve_bootmem(0, PAGE_SIZE, BOOTMEM_DEFAULT);
+
+ /* reserve EBDA region, it's a 4K region */
+ reserve_ebda_region();
+@@ -495,7 +666,7 @@ void __init setup_bootmem_allocator(void
+ unless you have no PS/2 mouse plugged in. */
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
+ boot_cpu_data.x86 == 6)
+- reserve_bootmem(0xa0000 - 4096, 4096);
++ reserve_bootmem(0xa0000 - 4096, 4096, BOOTMEM_DEFAULT);
+
+ #ifdef CONFIG_SMP
+ /*
+@@ -503,7 +674,7 @@ void __init setup_bootmem_allocator(void
+ * FIXME: Don't need the extra page at 4K, but need to fix
+ * trampoline before removing it. (see the GDT stuff)
+ */
+- reserve_bootmem(PAGE_SIZE, PAGE_SIZE);
++ reserve_bootmem(PAGE_SIZE, PAGE_SIZE, BOOTMEM_DEFAULT);
+ #endif
+ #ifdef CONFIG_ACPI_SLEEP
+ /*
+@@ -511,29 +682,12 @@ void __init setup_bootmem_allocator(void
+ */
+ acpi_reserve_bootmem();
+ #endif
+- numa_kva_reserve();
+ #endif /* !CONFIG_XEN */
+
+ #ifdef CONFIG_BLK_DEV_INITRD
+- if (xen_start_info->mod_start) {
+- unsigned long ramdisk_image = __pa(xen_start_info->mod_start);
+- unsigned long ramdisk_size = xen_start_info->mod_len;
+- unsigned long ramdisk_end = ramdisk_image + ramdisk_size;
+- unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
+-
+- if (ramdisk_end <= end_of_lowmem) {
+- /*reserve_bootmem(ramdisk_image, ramdisk_size);*/
+- initrd_start = ramdisk_image + PAGE_OFFSET;
+- initrd_end = initrd_start+ramdisk_size;
+- initrd_below_start_ok = 1;
+- } else {
+- printk(KERN_ERR "initrd extends beyond end of memory "
+- "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
+- ramdisk_end, end_of_lowmem);
+- initrd_start = 0;
+- }
+- }
++ reserve_initrd();
+ #endif
++ numa_kva_reserve();
+ reserve_crashkernel();
+ }
+
+@@ -600,20 +754,14 @@ void __init setup_arch(char **cmdline_p)
+ memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
+ pre_setup_arch_hook();
+ early_cpu_init();
++ early_ioremap_init();
+ #ifdef CONFIG_SMP
+ prefill_possible_map();
+ #endif
+
+- /*
+- * FIXME: This isn't an official loader_type right
+- * now but does currently work with elilo.
+- * If we were configured as an EFI kernel, check to make
+- * sure that we were loaded correctly from elilo and that
+- * the system table is valid. If not, then initialize normally.
+- */
+ #ifdef CONFIG_EFI
+- if ((boot_params.hdr.type_of_loader == 0x50) &&
+- boot_params.efi_info.efi_systab)
++ if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
++ "EL32", 4))
+ efi_enabled = 1;
+ #endif
+
+@@ -653,12 +801,9 @@ void __init setup_arch(char **cmdline_p)
+ #endif
+
+ ARCH_SETUP
+- if (efi_enabled)
+- efi_init();
+- else {
+- printk(KERN_INFO "BIOS-provided physical RAM map:\n");
+- print_memory_map(memory_setup());
+- }
++
++ printk(KERN_INFO "BIOS-provided physical RAM map:\n");
++ print_memory_map(memory_setup());
+
+ copy_edd();
+
+@@ -691,6 +836,17 @@ void __init setup_arch(char **cmdline_p)
+ strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
+ *cmdline_p = command_line;
+
++ if (efi_enabled)
++ efi_init();
++
++ /* update e820 for memory not covered by WB MTRRs */
++ find_max_pfn();
++ mtrr_bp_init();
++#ifndef CONFIG_XEN
++ if (mtrr_trim_uncached_memory(max_pfn))
++ find_max_pfn();
++#endif
++
+ max_low_pfn = setup_memory();
+
+ #ifdef CONFIG_VMI
+@@ -715,6 +871,16 @@ void __init setup_arch(char **cmdline_p)
+ smp_alloc_memory(); /* AP processor realmode stacks in low memory*/
+ #endif
+ paging_init();
++
++ /*
++ * NOTE: On x86-32, only from this point on, fixmaps are ready for use.
++ */
++
++#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
++ if (init_ohci1394_dma_early)
++ init_ohci1394_dma_on_all_controllers();
++#endif
++
+ remapped_pgdat_init();
+ sparse_init();
+ zone_sizes_init();
+@@ -800,16 +966,20 @@ void __init setup_arch(char **cmdline_p)
+ * NOTE: at this point the bootmem allocator is fully available.
+ */
+
++#ifdef CONFIG_BLK_DEV_INITRD
++ relocate_initrd();
++#endif
++
+ paravirt_post_allocator_init();
+
+ if (is_initial_xendomain())
+ dmi_scan_machine();
+
++ io_delay_init();
++
+ #ifdef CONFIG_X86_GENERICARCH
+ generic_apic_probe();
+-#endif
+- if (efi_enabled)
+- efi_map_memmap();
++#endif
+
+ set_iopl.iopl = 1;
+ WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
+@@ -827,7 +997,7 @@ void __init setup_arch(char **cmdline_p)
+ acpi_boot_table_init();
+ #endif
+
+-#if defined(CONFIG_PCI) && !defined(CONFIG_XEN)
++#ifndef CONFIG_XEN
+ early_quirks();
+ #endif
+
+@@ -873,3 +1043,30 @@ xen_panic_event(struct notifier_block *t
+ /* we're never actually going to get here... */
+ return NOTIFY_DONE;
+ }
++
++/*
++ * Request address space for all standard resources
++ *
++ * This is called just before pcibios_init(), which is also a
++ * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
++ */
++static int __init request_standard_resources(void)
++{
++ int i;
++
++ /* Nothing to do if not running in dom0. */
++ if (!is_initial_xendomain())
++ return 0;
++
++ printk(KERN_INFO "Setting up standard PCI resources\n");
++ init_iomem_resources(&code_resource, &data_resource, &bss_resource);
++
++ request_resource(&iomem_resource, &video_ram_resource);
++
++ /* request I/O space for devices used on all i[345]86 PCs */
++ for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
++ request_resource(&ioport_resource, &standard_io_resources[i]);
++ return 0;
++}
++
++subsys_initcall(request_standard_resources);
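+
+To illustrate the copy loop: the relocate_initrd() code added above moves the
+highmem part of the ramdisk through a small fixmap window, at most
+MAX_MAP_CHUNK bytes per pass and offset by the sub-page "slop" of the source
+address. A user-space analogue of that chunking, with memcpy() standing in for
+the early_ioremap()/early_iounmap() window (buffer sizes are arbitrary):
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define PAGE_SIZE	4096UL
+#define PAGE_MASK	(~(PAGE_SIZE - 1))
+#define MAX_MAP_CHUNK	(16 * PAGE_SIZE)	/* illustrative window size */
+
+int main(void)
+{
+	unsigned long size = 100000, image = 300;	/* unaligned source */
+	char *src = malloc(image + size), *dst = malloc(size), *q = dst;
+
+	memset(src + image, 0xAB, size);
+	while (size) {
+		unsigned long slop = image & ~PAGE_MASK;
+		unsigned long clen = size;
+
+		if (clen > MAX_MAP_CHUNK - slop)
+			clen = MAX_MAP_CHUNK - slop;
+		/* "map" the page-aligned window, copy from the slop offset */
+		memcpy(q, src + (image & PAGE_MASK) + slop, clen);
+		q += clen;
+		image += clen;
+		size -= clen;
+	}
+	printf("copied; last byte = 0x%02x\n", (unsigned char)dst[99999]);
+	free(src);
+	free(dst);
+	return 0;
+}
+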
+--- sle11-2009-06-29.orig/arch/x86/kernel/setup_64-xen.c 2009-02-16 16:18:36.000000000 +0100
++++ sle11-2009-06-29/arch/x86/kernel/setup_64-xen.c 2009-03-16 16:33:40.000000000 +0100
+@@ -15,7 +15,6 @@
+ #include <linux/ptrace.h>
+ #include <linux/slab.h>
+ #include <linux/user.h>
+-#include <linux/a.out.h>
+ #include <linux/screen_info.h>
+ #include <linux/ioport.h>
+ #include <linux/delay.h>
+@@ -30,6 +29,7 @@
+ #include <linux/crash_dump.h>
+ #include <linux/root_dev.h>
+ #include <linux/pci.h>
++#include <linux/efi.h>
+ #include <linux/acpi.h>
+ #include <linux/kallsyms.h>
+ #include <linux/edd.h>
+@@ -39,10 +39,13 @@
+ #include <linux/dmi.h>
+ #include <linux/dma-mapping.h>
+ #include <linux/ctype.h>
++#include <linux/uaccess.h>
++#include <linux/init_ohci1394_dma.h>
+
+ #include <asm/mtrr.h>
+ #include <asm/uaccess.h>
+ #include <asm/system.h>
++#include <asm/vsyscall.h>
+ #include <asm/io.h>
+ #include <asm/smp.h>
+ #include <asm/msr.h>
+@@ -50,6 +53,7 @@
+ #include <video/edid.h>
+ #include <asm/e820.h>
+ #include <asm/dma.h>
++#include <asm/gart.h>
+ #include <asm/mpspec.h>
+ #include <asm/mmu_context.h>
+ #include <asm/proto.h>
+@@ -59,6 +63,9 @@
+ #include <asm/sections.h>
+ #include <asm/dmi.h>
+ #include <asm/cacheflush.h>
++#include <asm/mce.h>
++#include <asm/ds.h>
++#include <asm/topology.h>
+ #ifdef CONFIG_XEN
+ #include <linux/percpu.h>
+ #include <xen/interface/physdev.h>
+@@ -108,6 +115,8 @@ EXPORT_SYMBOL(xen_start_info);
+ struct cpuinfo_x86 boot_cpu_data __read_mostly;
+ EXPORT_SYMBOL(boot_cpu_data);
+
++__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
++
+ unsigned long mmu_cr4_features;
+
+ /* Boot loader ID as an integer, for the benefit of proc_dointvec */
+@@ -117,7 +126,7 @@ unsigned long saved_video_mode;
+
+ int force_mwait __cpuinitdata;
+
+-/*
++/*
+ * Early DMI memory
+ */
+ int dmi_alloc_index;
+@@ -163,25 +172,27 @@ struct resource standard_io_resources[]
+
+ #define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM)
+
+-struct resource data_resource = {
++static struct resource data_resource = {
+ .name = "Kernel data",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_RAM,
+ };
+-struct resource code_resource = {
++static struct resource code_resource = {
+ .name = "Kernel code",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_RAM,
+ };
+-struct resource bss_resource = {
++static struct resource bss_resource = {
+ .name = "Kernel bss",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_RAM,
+ };
+
++static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c);
++
+ #ifdef CONFIG_PROC_VMCORE
+ /* elfcorehdr= specifies the location of elf core header
+ * stored by the crashed kernel. This option will be passed
+@@ -205,9 +216,10 @@ contig_initmem_init(unsigned long start_
+ unsigned long bootmap_size, bootmap;
+
+ bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
+- bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size);
++ bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size,
++ PAGE_SIZE);
+ if (bootmap == -1L)
+- panic("Cannot find bootmem map of size %ld\n",bootmap_size);
++ panic("Cannot find bootmem map of size %ld\n", bootmap_size);
+ bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
+ e820_register_active_regions(0, start_pfn, end_pfn);
+ #ifdef CONFIG_XEN
+@@ -215,8 +227,8 @@ contig_initmem_init(unsigned long start_
+ #else
+ free_bootmem_with_active_regions(0, end_pfn);
+ #endif
+- reserve_bootmem(bootmap, bootmap_size);
+-}
++ reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
++}
+ #endif
+
+ #if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
+@@ -249,27 +261,35 @@ static inline void copy_edd(void)
+ #ifndef CONFIG_XEN
+ static void __init reserve_crashkernel(void)
+ {
+- unsigned long long free_mem;
++ unsigned long long total_mem;
+ unsigned long long crash_size, crash_base;
+ int ret;
+
+- free_mem = ((unsigned long long)max_low_pfn - min_low_pfn) << PAGE_SHIFT;
++ total_mem = ((unsigned long long)max_low_pfn - min_low_pfn) << PAGE_SHIFT;
+
+- ret = parse_crashkernel(boot_command_line, free_mem,
++ ret = parse_crashkernel(boot_command_line, total_mem,
+ &crash_size, &crash_base);
+ if (ret == 0 && crash_size) {
+- if (crash_base > 0) {
+- printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
+- "for crashkernel (System RAM: %ldMB)\n",
+- (unsigned long)(crash_size >> 20),
+- (unsigned long)(crash_base >> 20),
+- (unsigned long)(free_mem >> 20));
+- crashk_res.start = crash_base;
+- crashk_res.end = crash_base + crash_size - 1;
+- reserve_bootmem(crash_base, crash_size);
+- } else
++ if (crash_base <= 0) {
+ printk(KERN_INFO "crashkernel reservation failed - "
+ "you have to specify a base address\n");
++ return;
++ }
++
++ if (reserve_bootmem(crash_base, crash_size,
++ BOOTMEM_EXCLUSIVE) < 0) {
++ printk(KERN_INFO "crashkernel reservation failed - "
++ "memory is in use\n");
++ return;
++ }
++
++ printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
++ "for crashkernel (System RAM: %ldMB)\n",
++ (unsigned long)(crash_size >> 20),
++ (unsigned long)(crash_base >> 20),
++ (unsigned long)(total_mem >> 20));
++ crashk_res.start = crash_base;
++ crashk_res.end = crash_base + crash_size - 1;
+ }
+ }
+ #else
+@@ -280,37 +300,21 @@ static inline void __init reserve_crashk
+ {}
+ #endif
+
+-#ifndef CONFIG_XEN
+-#define EBDA_ADDR_POINTER 0x40E
+-
+-unsigned __initdata ebda_addr;
+-unsigned __initdata ebda_size;
+-
+-static void discover_ebda(void)
++/* Overridden in paravirt.c if CONFIG_PARAVIRT */
++void __attribute__((weak)) __init memory_setup(void)
+ {
+- /*
+- * there is a real-mode segmented pointer pointing to the
+- * 4K EBDA area at 0x40E
+- */
+- ebda_addr = *(unsigned short *)__va(EBDA_ADDR_POINTER);
+- ebda_addr <<= 4;
+-
+- ebda_size = *(unsigned short *)__va(ebda_addr);
+-
+- /* Round EBDA up to pages */
+- if (ebda_size == 0)
+- ebda_size = 1;
+- ebda_size <<= 10;
+- ebda_size = round_up(ebda_size + (ebda_addr & ~PAGE_MASK), PAGE_SIZE);
+- if (ebda_size > 64*1024)
+- ebda_size = 64*1024;
++ machine_specific_memory_setup();
+ }
+-#else
+-#define discover_ebda() ((void)0)
+-#endif
+
++/*
++ * setup_arch - architecture-specific boot-time initializations
++ *
++ * Note: On x86_64, fixmaps are ready for use even before this is called.
++ */
+ void __init setup_arch(char **cmdline_p)
+ {
++ unsigned i;
++
+ #ifdef CONFIG_XEN
+ extern struct e820map machine_e820;
+
+@@ -319,6 +323,11 @@ void __init setup_arch(char **cmdline_p)
+ /* Register a call for panic conditions. */
+ atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
+
++ WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
++ VMASST_TYPE_writable_pagetables));
++
++ early_ioremap_init();
++
+ ROOT_DEV = MKDEV(RAMDISK_MAJOR,0);
+ screen_info = boot_params.screen_info;
+
+@@ -335,11 +344,6 @@ void __init setup_arch(char **cmdline_p)
+ screen_info.orig_video_isVGA = 0;
+
+ copy_edid();
+-
+- WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
+- VMASST_TYPE_writable_pagetables));
+-
+- ARCH_SETUP
+ #else
+ printk(KERN_INFO "Command line: %s\n", boot_command_line);
+
+@@ -355,7 +359,15 @@ void __init setup_arch(char **cmdline_p)
+ rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
+ rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
+ #endif
+- setup_memory_region();
++#ifdef CONFIG_EFI
++ if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
++ "EL64", 4))
++ efi_enabled = 1;
++#endif
++
++ ARCH_SETUP
++
++ memory_setup();
+ copy_edd();
+
+ if (!boot_params.hdr.root_flags)
+@@ -379,28 +391,51 @@ void __init setup_arch(char **cmdline_p)
+
+ parse_early_param();
+
++#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
++ if (init_ohci1394_dma_early)
++ init_ohci1394_dma_on_all_controllers();
++#endif
++
+ finish_e820_parsing();
+
++ early_gart_iommu_check();
++
+ e820_register_active_regions(0, 0, -1UL);
+ /*
+ * partially used pages are not usable - thus
+ * we are rounding upwards:
+ */
+ end_pfn = e820_end_of_ram();
++ /* update e820 for memory not covered by WB MTRRs */
++ mtrr_bp_init();
++#ifndef CONFIG_XEN
++ if (mtrr_trim_uncached_memory(end_pfn)) {
++ e820_register_active_regions(0, 0, -1UL);
++ end_pfn = e820_end_of_ram();
++ }
++#endif
++
+ num_physpages = end_pfn;
++ max_mapnr = end_pfn;
+
+ check_efer();
+
+- discover_ebda();
+-
+ init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT));
++ if (efi_enabled)
++ efi_init();
+
+ if (is_initial_xendomain())
+ dmi_scan_machine();
+
++ io_delay_init();
++
+ #if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
+- /* setup to use the static apicid table during kernel startup */
+- x86_cpu_to_apicid_ptr = (void *)&x86_cpu_to_apicid_init;
++ /* setup to use the early static init tables during kernel startup */
++ x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init;
++ x86_bios_cpu_apicid_early_ptr = (void *)x86_bios_cpu_apicid_init;
++#ifdef CONFIG_NUMA
++ x86_cpu_to_node_map_early_ptr = (void *)x86_cpu_to_node_map_init;
++#endif
+ #endif
+
+ /* How many end-of-memory variables you have, grandma! */
+@@ -419,54 +454,25 @@ void __init setup_arch(char **cmdline_p)
+ #endif
+
+ #ifdef CONFIG_NUMA
+- numa_initmem_init(0, end_pfn);
++ numa_initmem_init(0, end_pfn);
+ #else
+ contig_initmem_init(0, end_pfn);
+ #endif
+
+-#ifdef CONFIG_XEN
+- /*
+- * Reserve kernel, physmap, start info, initial page tables, and
+- * direct mapping.
+- */
+- reserve_bootmem_generic(__pa_symbol(&_text),
+- (table_end << PAGE_SHIFT) - __pa_symbol(&_text));
+-#else
+- /* Reserve direct mapping */
+- reserve_bootmem_generic(table_start << PAGE_SHIFT,
+- (table_end - table_start) << PAGE_SHIFT);
+-
+- /* reserve kernel */
+- reserve_bootmem_generic(__pa_symbol(&_text),
+- __pa_symbol(&_end) - __pa_symbol(&_text));
++ early_res_to_bootmem();
+
++#ifndef CONFIG_XEN
++#ifdef CONFIG_ACPI_SLEEP
+ /*
+- * reserve physical page 0 - it's a special BIOS page on many boxes,
+- * enabling clean reboots, SMP operation, laptop functions.
++ * Reserve low memory region for sleep support.
+ */
+- reserve_bootmem_generic(0, PAGE_SIZE);
+-
+- /* reserve ebda region */
+- if (ebda_addr)
+- reserve_bootmem_generic(ebda_addr, ebda_size);
+-#ifdef CONFIG_NUMA
+- /* reserve nodemap region */
+- if (nodemap_addr)
+- reserve_bootmem_generic(nodemap_addr, nodemap_size);
++ acpi_reserve_bootmem();
+ #endif
+
+-#ifdef CONFIG_SMP
+- /* Reserve SMP trampoline */
+- reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, 2*PAGE_SIZE);
+-#endif
++ if (efi_enabled)
++ efi_reserve_bootmem();
+ #endif
+
+-#ifdef CONFIG_ACPI_SLEEP
+- /*
+- * Reserve low memory region for sleep support.
+- */
+- acpi_reserve_bootmem();
+-#endif
+ #ifdef CONFIG_BLK_DEV_INITRD
+ #ifdef CONFIG_XEN
+ if (xen_start_info->mod_start) {
+@@ -490,6 +496,8 @@ void __init setup_arch(char **cmdline_p)
+ initrd_below_start_ok = 1;
+ #endif
+ } else {
++ /* Assumes everything on node 0 */
++ free_bootmem(ramdisk_image, ramdisk_size);
+ printk(KERN_ERR "initrd extends beyond end of memory "
+ "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
+ ramdisk_end, end_of_mem);
+@@ -499,10 +507,11 @@ void __init setup_arch(char **cmdline_p)
+ #endif
+ reserve_crashkernel();
+ paging_init();
++ map_vsyscall();
+ #ifdef CONFIG_X86_LOCAL_APIC
+ /*
+- * Find and reserve possible boot-time SMP configuration:
+- */
++ * Find and reserve possible boot-time SMP configuration:
++ */
+ find_smp_config();
+ #endif
+ #ifdef CONFIG_XEN
+@@ -590,16 +599,10 @@ void __init setup_arch(char **cmdline_p)
+ #endif
+ #endif
+
+-#if defined(CONFIG_PCI) && !defined(CONFIG_XEN)
++#ifndef CONFIG_XEN
+ early_quirks();
+ #endif
+
+- /*
+- * set this early, so we dont allocate cpu0
+- * if MADT list doesnt list BSP first
+- * mpparse.c/MP_processor_info() allocates logical cpu numbers.
+- */
+- cpu_set(0, cpu_present_map);
+ #ifdef CONFIG_ACPI
+ /*
+ * Initialize the ACPI boot-time table parser (gets the RSDP and SDT).
+@@ -623,6 +626,7 @@ void __init setup_arch(char **cmdline_p)
+ get_smp_config();
+ #ifndef CONFIG_XEN
+ init_apic_mappings();
++ ioapic_init_mappings();
+ #endif
+ #endif
+ #if defined(CONFIG_XEN) && defined(CONFIG_SMP) && !defined(CONFIG_HOTPLUG_CPU)
+@@ -634,18 +638,17 @@ void __init setup_arch(char **cmdline_p)
+ */
+ #ifdef CONFIG_XEN
+ if (is_initial_xendomain())
+- e820_reserve_resources(machine_e820.map, machine_e820.nr_map);
++ e820_reserve_resources(machine_e820.map, machine_e820.nr_map,
++ &code_resource, &data_resource, &bss_resource);
+ #else
+- e820_reserve_resources(e820.map, e820.nr_map);
++ e820_reserve_resources(e820.map, e820.nr_map,
++ &code_resource, &data_resource, &bss_resource);
+ e820_mark_nosave_regions();
+ #endif
+
+- {
+- unsigned i;
+ /* request I/O space for devices used on all i[345]86 PCs */
+ for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
+ request_resource(&ioport_resource, &standard_io_resources[i]);
+- }
+
+ #ifdef CONFIG_XEN
+ if (is_initial_xendomain())
+@@ -679,7 +682,8 @@ void __init setup_arch(char **cmdline_p)
+
+ #ifdef CONFIG_VT
+ #if defined(CONFIG_VGA_CONSOLE)
+- conswitchp = &vga_con;
++ if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
++ conswitchp = &vga_con;
+ #elif defined(CONFIG_DUMMY_CONSOLE)
+ conswitchp = &dummy_con;
+ #endif
+@@ -723,9 +727,10 @@ static void __cpuinit display_cacheinfo(
+
+ if (n >= 0x80000005) {
+ cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
+- printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
+- edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
+- c->x86_cache_size=(ecx>>24)+(edx>>24);
++ printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), "
++ "D cache %dK (%d bytes/line)\n",
++ edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
++ c->x86_cache_size = (ecx>>24) + (edx>>24);
+ /* On K8 L1 TLB is inclusive, so don't count it */
+ c->x86_tlbsize = 0;
+ }
+@@ -739,27 +744,25 @@ static void __cpuinit display_cacheinfo(
+ printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
+ c->x86_cache_size, ecx & 0xFF);
+ }
+-
+- if (n >= 0x80000007)
+- cpuid(0x80000007, &dummy, &dummy, &dummy, &c->x86_power);
+ if (n >= 0x80000008) {
+- cpuid(0x80000008, &eax, &dummy, &dummy, &dummy);
++ cpuid(0x80000008, &eax, &dummy, &dummy, &dummy);
+ c->x86_virt_bits = (eax >> 8) & 0xff;
+ c->x86_phys_bits = eax & 0xff;
+ }
+ }
+
+ #ifdef CONFIG_NUMA
+-static int nearby_node(int apicid)
++static int __cpuinit nearby_node(int apicid)
+ {
+- int i;
++ int i, node;
++
+ for (i = apicid - 1; i >= 0; i--) {
+- int node = apicid_to_node[i];
++ node = apicid_to_node[i];
+ if (node != NUMA_NO_NODE && node_online(node))
+ return node;
+ }
+ for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
+- int node = apicid_to_node[i];
++ node = apicid_to_node[i];
+ if (node != NUMA_NO_NODE && node_online(node))
+ return node;
+ }
+@@ -771,7 +774,7 @@ static int nearby_node(int apicid)
+ * On a AMD dual core setup the lower bits of the APIC id distingush the cores.
+ * Assumes number of cores is a power of two.
+ */
+-static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
++static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
+ {
+ #ifdef CONFIG_SMP
+ unsigned bits;
+@@ -780,7 +783,54 @@ static void __init amd_detect_cmp(struct
+ int node = 0;
+ unsigned apicid = hard_smp_processor_id();
+ #endif
+- unsigned ecx = cpuid_ecx(0x80000008);
++ bits = c->x86_coreid_bits;
++
++ /* Low order bits define the core id (index of core in socket) */
++ c->cpu_core_id = c->phys_proc_id & ((1 << bits)-1);
++ /* Convert the APIC ID into the socket ID */
++ c->phys_proc_id = phys_pkg_id(bits);
++
++#ifdef CONFIG_NUMA
++ node = c->phys_proc_id;
++ if (apicid_to_node[apicid] != NUMA_NO_NODE)
++ node = apicid_to_node[apicid];
++ if (!node_online(node)) {
++ /* Two possibilities here:
++ - The CPU is missing memory and no node was created.
++ In that case try picking one from a nearby CPU
++ - The APIC IDs differ from the HyperTransport node IDs
++ which the K8 northbridge parsing fills in.
++ Assume they are all increased by a constant offset,
++ but in the same order as the HT nodeids.
++ If that doesn't result in a usable node fall back to the
++ path for the previous case. */
++
++ int ht_nodeid = apicid - (cpu_data(0).phys_proc_id << bits);
++
++ if (ht_nodeid >= 0 &&
++ apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
++ node = apicid_to_node[ht_nodeid];
++ /* Pick a nearby node */
++ if (!node_online(node))
++ node = nearby_node(apicid);
++ }
++ numa_set_node(cpu, node);
++
++ printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
++#endif
++#endif
++}
++
++static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c)
++{
++#ifdef CONFIG_SMP
++ unsigned bits, ecx;
++
++ /* Multi core CPU? */
++ if (c->extended_cpuid_level < 0x80000008)
++ return;
++
++ ecx = cpuid_ecx(0x80000008);
+
+ c->x86_max_cores = (ecx & 0xff) + 1;
+
+@@ -793,37 +843,8 @@ static void __init amd_detect_cmp(struct
+ bits++;
+ }
+
+- /* Low order bits define the core id (index of core in socket) */
+- c->cpu_core_id = c->phys_proc_id & ((1 << bits)-1);
+- /* Convert the APIC ID into the socket ID */
+- c->phys_proc_id = phys_pkg_id(bits);
+-
+-#ifdef CONFIG_NUMA
+- node = c->phys_proc_id;
+- if (apicid_to_node[apicid] != NUMA_NO_NODE)
+- node = apicid_to_node[apicid];
+- if (!node_online(node)) {
+- /* Two possibilities here:
+- - The CPU is missing memory and no node was created.
+- In that case try picking one from a nearby CPU
+- - The APIC IDs differ from the HyperTransport node IDs
+- which the K8 northbridge parsing fills in.
+- Assume they are all increased by a constant offset,
+- but in the same order as the HT nodeids.
+- If that doesn't result in a usable node fall back to the
+- path for the previous case. */
+- int ht_nodeid = apicid - (cpu_data(0).phys_proc_id << bits);
+- if (ht_nodeid >= 0 &&
+- apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
+- node = apicid_to_node[ht_nodeid];
+- /* Pick a nearby node */
+- if (!node_online(node))
+- node = nearby_node(apicid);
+- }
+- numa_set_node(cpu, node);
++ c->x86_coreid_bits = bits;
+
+- printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
+-#endif
+ #endif
+ }
+
+@@ -840,8 +861,8 @@ static void __init amd_detect_cmp(struct
+ /* AMD systems with C1E don't have a working lAPIC timer. Check for that. */
+ static __cpuinit int amd_apic_timer_broken(void)
+ {
+- u32 lo, hi;
+- u32 eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
++ u32 lo, hi, eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
++
+ switch (eax & CPUID_XFAM) {
+ case CPUID_XFAM_K8:
+ if ((eax & CPUID_XMOD) < CPUID_XMOD_REV_F)
+@@ -860,6 +881,15 @@ static __cpuinit int amd_apic_timer_brok
+ }
+ #endif
+
++static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
++{
++ early_init_amd_mc(c);
++
++ /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
++ if (c->x86_power & (1<<8))
++ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
++}
++
+ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
+ {
+ unsigned level;
+@@ -870,7 +900,7 @@ static void __cpuinit init_amd(struct cp
+ /*
+ * Disable TLB flush filter by setting HWCR.FFDIS on K8
+ * bit 6 of msr C001_0015
+- *
++ *
+ * Errata 63 for SH-B3 steppings
+ * Errata 122 for all steppings (F+ have it disabled by default)
+ */
+@@ -883,35 +913,32 @@ static void __cpuinit init_amd(struct cp
+
+ /* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
+ 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
+- clear_bit(0*32+31, &c->x86_capability);
+-
++ clear_bit(0*32+31, (unsigned long *)&c->x86_capability);
++
+ /* On C+ stepping K8 rep microcode works well for copy/memset */
+ level = cpuid_eax(1);
+- if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58))
+- set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
++ if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) ||
++ level >= 0x0f58))
++ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+ if (c->x86 == 0x10 || c->x86 == 0x11)
+- set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
++ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+
+ /* Enable workaround for FXSAVE leak */
+ if (c->x86 >= 6)
+- set_bit(X86_FEATURE_FXSAVE_LEAK, &c->x86_capability);
++ set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
+
+ level = get_model_name(c);
+ if (!level) {
+- switch (c->x86) {
++ switch (c->x86) {
+ case 15:
+ /* Should distinguish Models here, but this is only
+ a fallback anyways. */
+ strcpy(c->x86_model_id, "Hammer");
+- break;
+- }
+- }
++ break;
++ }
++ }
+ display_cacheinfo(c);
+
+- /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
+- if (c->x86_power & (1<<8))
+- set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
+-
+ /* Multi core CPU? */
+ if (c->extended_cpuid_level >= 0x80000008)
+ amd_detect_cmp(c);
+@@ -923,14 +950,10 @@ static void __cpuinit init_amd(struct cp
+ num_cache_leaves = 3;
+
+ if (c->x86 == 0xf || c->x86 == 0x10 || c->x86 == 0x11)
+- set_bit(X86_FEATURE_K8, &c->x86_capability);
+-
+- /* RDTSC can be speculated around */
+- clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
++ set_cpu_cap(c, X86_FEATURE_K8);
+
+- /* Family 10 doesn't support C states in MWAIT so don't use it */
+- if (c->x86 == 0x10 && !force_mwait)
+- clear_bit(X86_FEATURE_MWAIT, &c->x86_capability);
++ /* MFENCE stops RDTSC speculation */
++ set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
+
+ #ifndef CONFIG_XEN
+ if (amd_apic_timer_broken())
+@@ -938,28 +961,29 @@ static void __cpuinit init_amd(struct cp
+ #endif
+ }
+
+-static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
++void __cpuinit detect_ht(struct cpuinfo_x86 *c)
+ {
+ #ifdef CONFIG_SMP
+- u32 eax, ebx, ecx, edx;
+- int index_msb, core_bits;
++ u32 eax, ebx, ecx, edx;
++ int index_msb, core_bits;
+
+ cpuid(1, &eax, &ebx, &ecx, &edx);
+
+
+ if (!cpu_has(c, X86_FEATURE_HT))
+ return;
+- if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
++ if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
+ goto out;
+
+ smp_num_siblings = (ebx & 0xff0000) >> 16;
+
+ if (smp_num_siblings == 1) {
+ printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
+- } else if (smp_num_siblings > 1 ) {
++ } else if (smp_num_siblings > 1) {
+
+ if (smp_num_siblings > NR_CPUS) {
+- printk(KERN_WARNING "CPU: Unsupported number of the siblings %d", smp_num_siblings);
++ printk(KERN_WARNING "CPU: Unsupported number of "
++ "siblings %d", smp_num_siblings);
+ smp_num_siblings = 1;
+ return;
+ }
+@@ -969,7 +993,7 @@ static void __cpuinit detect_ht(struct c
+
+ smp_num_siblings = smp_num_siblings / c->x86_max_cores;
+
+- index_msb = get_count_order(smp_num_siblings) ;
++ index_msb = get_count_order(smp_num_siblings);
+
+ core_bits = get_count_order(c->x86_max_cores);
+
+@@ -978,8 +1002,10 @@ static void __cpuinit detect_ht(struct c
+ }
+ out:
+ if ((c->x86_max_cores * smp_num_siblings) > 1) {
+- printk(KERN_INFO "CPU: Physical Processor ID: %d\n", c->phys_proc_id);
+- printk(KERN_INFO "CPU: Processor Core ID: %d\n", c->cpu_core_id);
++ printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
++ c->phys_proc_id);
++ printk(KERN_INFO "CPU: Processor Core ID: %d\n",
++ c->cpu_core_id);
+ }
+
+ #endif
+@@ -1003,7 +1029,7 @@ static int __cpuinit intel_num_cpu_cores
+ return 1;
+ }
+
+-static void srat_detect_node(void)
++static void __cpuinit srat_detect_node(void)
+ {
+ #ifdef CONFIG_NUMA
+ unsigned node;
+@@ -1013,7 +1039,7 @@ static void srat_detect_node(void)
+ /* Don't do the funky fallback heuristics the AMD version employs
+ for now. */
+ node = apicid_to_node[apicid];
+- if (node == NUMA_NO_NODE)
++ if (node == NUMA_NO_NODE || !node_online(node))
+ node = first_node(node_online_map);
+ numa_set_node(cpu, node);
+
+@@ -1021,28 +1047,39 @@ static void srat_detect_node(void)
+ #endif
+ }
+
++static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
++{
++ if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
++ (c->x86 == 0x6 && c->x86_model >= 0x0e))
++ set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
++}
++
+ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
+ {
+ /* Cache sizes */
+ unsigned n;
+
+ init_intel_cacheinfo(c);
+- if (c->cpuid_level > 9 ) {
++ if (c->cpuid_level > 9) {
+ unsigned eax = cpuid_eax(10);
+ /* Check for version and the number of counters */
+ if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
+- set_bit(X86_FEATURE_ARCH_PERFMON, &c->x86_capability);
++ set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
+ }
+
+ if (cpu_has_ds) {
+ unsigned int l1, l2;
+ rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
+ if (!(l1 & (1<<11)))
+- set_bit(X86_FEATURE_BTS, c->x86_capability);
++ set_cpu_cap(c, X86_FEATURE_BTS);
+ if (!(l1 & (1<<12)))
+- set_bit(X86_FEATURE_PEBS, c->x86_capability);
++ set_cpu_cap(c, X86_FEATURE_PEBS);
+ }
+
++
++ if (cpu_has_bts)
++ ds_init_intel(c);
++
+ n = c->extended_cpuid_level;
+ if (n >= 0x80000008) {
+ unsigned eax = cpuid_eax(0x80000008);
+@@ -1059,14 +1096,11 @@ static void __cpuinit init_intel(struct
+ c->x86_cache_alignment = c->x86_clflush_size * 2;
+ if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
+ (c->x86 == 0x6 && c->x86_model >= 0x0e))
+- set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
++ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+ if (c->x86 == 6)
+- set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
+- if (c->x86 == 15)
+- set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
+- else
+- clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
+- c->x86_max_cores = intel_num_cpu_cores(c);
++ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
++ set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
++ c->x86_max_cores = intel_num_cpu_cores(c);
+
+ srat_detect_node();
+ }
+@@ -1083,18 +1117,12 @@ static void __cpuinit get_cpu_vendor(str
+ c->x86_vendor = X86_VENDOR_UNKNOWN;
+ }
+
+-struct cpu_model_info {
+- int vendor;
+- int family;
+- char *model_names[16];
+-};
+-
+ /* Do some early cpuid on the boot CPU to get some parameter that are
+ needed before check_bugs. Everything advanced is in identify_cpu
+ below. */
+-void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
++static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
+ {
+- u32 tfms;
++ u32 tfms, xlvl;
+
+ c->loops_per_jiffy = loops_per_jiffy;
+ c->x86_cache_size = -1;
+@@ -1105,6 +1133,7 @@ void __cpuinit early_identify_cpu(struct
+ c->x86_clflush_size = 64;
+ c->x86_cache_alignment = c->x86_clflush_size;
+ c->x86_max_cores = 1;
++ c->x86_coreid_bits = 0;
+ c->extended_cpuid_level = 0;
+ memset(&c->x86_capability, 0, sizeof c->x86_capability);
+
+@@ -1113,7 +1142,7 @@ void __cpuinit early_identify_cpu(struct
+ (unsigned int *)&c->x86_vendor_id[0],
+ (unsigned int *)&c->x86_vendor_id[8],
+ (unsigned int *)&c->x86_vendor_id[4]);
+-
++
+ get_cpu_vendor(c);
+
+ /* Initialize the standard set of capabilities */
+@@ -1131,7 +1160,7 @@ void __cpuinit early_identify_cpu(struct
+ c->x86 += (tfms >> 20) & 0xff;
+ if (c->x86 >= 0x6)
+ c->x86_model += ((tfms >> 16) & 0xF) << 4;
+- if (c->x86_capability[0] & (1<<19))
++ if (c->x86_capability[0] & (1<<19))
+ c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
+ } else {
+ /* Have CPUID level 0 only - unheard of */
+@@ -1141,18 +1170,6 @@ void __cpuinit early_identify_cpu(struct
+ #ifdef CONFIG_SMP
+ c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff;
+ #endif
+-}
+-
+-/*
+- * This does the hard work of actually picking apart the CPU stuff...
+- */
+-void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
+-{
+- int i;
+- u32 xlvl;
+-
+- early_identify_cpu(c);
+-
+ /* AMD-defined flags: level 0x80000001 */
+ xlvl = cpuid_eax(0x80000000);
+ c->extended_cpuid_level = xlvl;
+@@ -1173,6 +1190,30 @@ void __cpuinit identify_cpu(struct cpuin
+ c->x86_capability[2] = cpuid_edx(0x80860001);
+ }
+
++ c->extended_cpuid_level = cpuid_eax(0x80000000);
++ if (c->extended_cpuid_level >= 0x80000007)
++ c->x86_power = cpuid_edx(0x80000007);
++
++ switch (c->x86_vendor) {
++ case X86_VENDOR_AMD:
++ early_init_amd(c);
++ break;
++ case X86_VENDOR_INTEL:
++ early_init_intel(c);
++ break;
++ }
++
++}
++
++/*
++ * This does the hard work of actually picking apart the CPU stuff...
++ */
++void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
++{
++ int i;
++
++ early_identify_cpu(c);
++
+ init_scattered_cpuid_features(c);
+
+ c->apicid = phys_pkg_id(0);
+@@ -1202,8 +1243,7 @@ void __cpuinit identify_cpu(struct cpuin
+ break;
+ }
+
+- select_idle_routine(c);
+- detect_ht(c);
++ detect_ht(c);
+
+ /*
+ * On SMP, boot_cpu_data holds the common feature set between
+@@ -1213,31 +1253,55 @@ void __cpuinit identify_cpu(struct cpuin
+ */
+ if (c != &boot_cpu_data) {
+ /* AND the already accumulated flags with these */
+- for (i = 0 ; i < NCAPINTS ; i++)
++ for (i = 0; i < NCAPINTS; i++)
+ boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
+ }
+
++	/* Clear all flags overridden by options */
++ for (i = 0; i < NCAPINTS; i++)
++ c->x86_capability[i] &= ~cleared_cpu_caps[i];
++
+ #ifdef CONFIG_X86_MCE
+ mcheck_init(c);
+ #endif
++ select_idle_routine(c);
++
+ if (c != &boot_cpu_data)
+ mtrr_ap_init();
+ #ifdef CONFIG_NUMA
+ numa_add_cpu(smp_processor_id());
+ #endif
++
+ }
+-
++
++static __init int setup_noclflush(char *arg)
++{
++ setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
++ return 1;
++}
++__setup("noclflush", setup_noclflush);
+
+ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
+ {
+ if (c->x86_model_id[0])
+- printk("%s", c->x86_model_id);
++ printk(KERN_CONT "%s", c->x86_model_id);
++
++ if (c->x86_mask || c->cpuid_level >= 0)
++ printk(KERN_CONT " stepping %02x\n", c->x86_mask);
++ else
++ printk(KERN_CONT "\n");
++}
+
+- if (c->x86_mask || c->cpuid_level >= 0)
+- printk(" stepping %02x\n", c->x86_mask);
++static __init int setup_disablecpuid(char *arg)
++{
++ int bit;
++ if (get_option(&arg, &bit) && bit < NCAPINTS*32)
++ setup_clear_cpu_cap(bit);
+ else
+- printk("\n");
++ return 0;
++ return 1;
+ }
++__setup("clearcpuid=", setup_disablecpuid);
+
+ /*
+ * Get CPU information for use by the procfs.
+@@ -1246,116 +1310,41 @@ void __cpuinit print_cpu_info(struct cpu
+ static int show_cpuinfo(struct seq_file *m, void *v)
+ {
+ struct cpuinfo_x86 *c = v;
+- int cpu = 0;
+-
+- /*
+- * These flag bits must match the definitions in <asm/cpufeature.h>.
+- * NULL means this bit is undefined or reserved; either way it doesn't
+- * have meaning as far as Linux is concerned. Note that it's important
+- * to realize there is a difference between this table and CPUID -- if
+- * applications want to get the raw CPUID data, they should access
+- * /dev/cpu/<cpu_nr>/cpuid instead.
+- */
+- static const char *const x86_cap_flags[] = {
+- /* Intel-defined */
+- "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce",
+- "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov",
+- "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx",
+- "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe",
+-
+- /* AMD-defined */
+- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+- NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL,
+- NULL, NULL, NULL, NULL, "nx", NULL, "mmxext", NULL,
+- NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm",
+- "3dnowext", "3dnow",
+-
+- /* Transmeta-defined */
+- "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL,
+- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+-
+- /* Other (Linux-defined) */
+- "cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr",
+- NULL, NULL, NULL, NULL,
+- "constant_tsc", "up", NULL, "arch_perfmon",
+- "pebs", "bts", NULL, "sync_rdtsc",
+- "rep_good", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+-
+- /* Intel-defined (#2) */
+- "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est",
+- "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL,
+- NULL, NULL, "dca", "sse4_1", "sse4_2", NULL, NULL, "popcnt",
+- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+-
+- /* VIA/Cyrix/Centaur-defined */
+- NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en",
+- "ace2", "ace2_en", "phe", "phe_en", "pmm", "pmm_en", NULL, NULL,
+- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+-
+- /* AMD-defined (#2) */
+- "lahf_lm", "cmp_legacy", "svm", "extapic",
+- "cr8_legacy", "abm", "sse4a", "misalignsse",
+- "3dnowprefetch", "osvw", "ibs", "sse5",
+- "skinit", "wdt", NULL, NULL,
+- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+-
+- /* Auxiliary (Linux-defined) */
+- "ida", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+- };
+- static const char *const x86_power_flags[] = {
+- "ts", /* temperature sensor */
+- "fid", /* frequency id control */
+- "vid", /* voltage id control */
+- "ttp", /* thermal trip */
+- "tm",
+- "stc",
+- "100mhzsteps",
+- "hwpstate",
+- "", /* tsc invariant mapped to constant_tsc */
+- /* nothing */
+- };
+-
++ int cpu = 0, i;
+
+ #ifdef CONFIG_SMP
+ cpu = c->cpu_index;
+ #endif
+
+- seq_printf(m,"processor\t: %u\n"
+- "vendor_id\t: %s\n"
+- "cpu family\t: %d\n"
+- "model\t\t: %d\n"
+- "model name\t: %s\n",
+- (unsigned)cpu,
+- c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown",
+- c->x86,
+- (int)c->x86_model,
+- c->x86_model_id[0] ? c->x86_model_id : "unknown");
+-
++ seq_printf(m, "processor\t: %u\n"
++ "vendor_id\t: %s\n"
++ "cpu family\t: %d\n"
++ "model\t\t: %d\n"
++ "model name\t: %s\n",
++ (unsigned)cpu,
++ c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown",
++ c->x86,
++ (int)c->x86_model,
++ c->x86_model_id[0] ? c->x86_model_id : "unknown");
++
+ if (c->x86_mask || c->cpuid_level >= 0)
+ seq_printf(m, "stepping\t: %d\n", c->x86_mask);
+ else
+ seq_printf(m, "stepping\t: unknown\n");
+-
+- if (cpu_has(c,X86_FEATURE_TSC)) {
++
++ if (cpu_has(c, X86_FEATURE_TSC)) {
+ unsigned int freq = cpufreq_quick_get((unsigned)cpu);
++
+ if (!freq)
+ freq = cpu_khz;
+ seq_printf(m, "cpu MHz\t\t: %u.%03u\n",
+- freq / 1000, (freq % 1000));
++ freq / 1000, (freq % 1000));
+ }
+
+ /* Cache size */
+- if (c->x86_cache_size >= 0)
++ if (c->x86_cache_size >= 0)
+ seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size);
+-
++
+ #ifdef CONFIG_SMP
+ if (smp_num_siblings * c->x86_max_cores > 1) {
+ seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
+@@ -1364,48 +1353,43 @@ static int show_cpuinfo(struct seq_file
+ seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
+ seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
+ }
+-#endif
++#endif
+
+ seq_printf(m,
+- "fpu\t\t: yes\n"
+- "fpu_exception\t: yes\n"
+- "cpuid level\t: %d\n"
+- "wp\t\t: yes\n"
+- "flags\t\t:",
++ "fpu\t\t: yes\n"
++ "fpu_exception\t: yes\n"
++ "cpuid level\t: %d\n"
++ "wp\t\t: yes\n"
++ "flags\t\t:",
+ c->cpuid_level);
+
+- {
+- int i;
+- for ( i = 0 ; i < 32*NCAPINTS ; i++ )
+- if (cpu_has(c, i) && x86_cap_flags[i] != NULL)
+- seq_printf(m, " %s", x86_cap_flags[i]);
+- }
+-
++ for (i = 0; i < 32*NCAPINTS; i++)
++ if (cpu_has(c, i) && x86_cap_flags[i] != NULL)
++ seq_printf(m, " %s", x86_cap_flags[i]);
++
+ seq_printf(m, "\nbogomips\t: %lu.%02lu\n",
+ c->loops_per_jiffy/(500000/HZ),
+ (c->loops_per_jiffy/(5000/HZ)) % 100);
+
+- if (c->x86_tlbsize > 0)
++ if (c->x86_tlbsize > 0)
+ seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize);
+ seq_printf(m, "clflush size\t: %d\n", c->x86_clflush_size);
+ seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment);
+
+- seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n",
++ seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n",
+ c->x86_phys_bits, c->x86_virt_bits);
+
+ seq_printf(m, "power management:");
+- {
+- unsigned i;
+- for (i = 0; i < 32; i++)
+- if (c->x86_power & (1 << i)) {
+- if (i < ARRAY_SIZE(x86_power_flags) &&
+- x86_power_flags[i])
+- seq_printf(m, "%s%s",
+- x86_power_flags[i][0]?" ":"",
+- x86_power_flags[i]);
+- else
+- seq_printf(m, " [%d]", i);
+- }
++ for (i = 0; i < 32; i++) {
++ if (c->x86_power & (1 << i)) {
++ if (i < ARRAY_SIZE(x86_power_flags) &&
++ x86_power_flags[i])
++ seq_printf(m, "%s%s",
++ x86_power_flags[i][0]?" ":"",
++ x86_power_flags[i]);
++ else
++ seq_printf(m, " [%d]", i);
++ }
+ }
+
+ seq_printf(m, "\n\n");
+@@ -1432,8 +1416,8 @@ static void c_stop(struct seq_file *m, v
+ {
+ }
+
+-struct seq_operations cpuinfo_op = {
+- .start =c_start,
++const struct seq_operations cpuinfo_op = {
++ .start = c_start,
+ .next = c_next,
+ .stop = c_stop,
+	.start = c_start,
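
The setup_64-xen.c changes above introduce cleared_cpu_caps[] together with the "noclflush" and "clearcpuid=" boot options: each option only records a feature bit in a global mask, and identify_cpu() later ANDs every x86_capability word with the complement of that mask. A minimal userspace sketch of the same record-then-clear pattern, assuming a toy two-word capability array (NCAPINTS, clear_cap() and the sample bit number here are illustrative, not the kernel's definitions):

#include <stdio.h>
#include <stdint.h>

#define NCAPINTS 2                      /* toy value; the kernel uses more words */

static uint32_t cleared[NCAPINTS];      /* bits requested off by boot options */

/* analogous to setup_clear_cpu_cap(): remember a bit to clear later */
static void clear_cap(int bit)
{
	cleared[bit / 32] |= 1u << (bit % 32);
}

int main(void)
{
	uint32_t caps[NCAPINTS] = { 0xffffffff, 0x000000ff }; /* detected features */
	int i;

	clear_cap(19);          /* e.g. "noclflush" clears the CLFLSH bit */

	/* the same AND loop identify_cpu() runs over x86_capability[] */
	for (i = 0; i < NCAPINTS; i++)
		caps[i] &= ~cleared[i];

	for (i = 0; i < NCAPINTS; i++)
		printf("word %d: %08x\n", i, (unsigned)caps[i]);
	return 0;
}

Deferring the clear to one place means every consumer of the capability words sees the same view regardless of when the option was parsed.
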
+--- sle11-2009-06-29.orig/arch/x86/kernel/smp_32-xen.c 2009-02-16 16:18:36.000000000 +0100
++++ sle11-2009-06-29/arch/x86/kernel/smp_32-xen.c 2009-03-16 16:33:40.000000000 +0100
+@@ -168,7 +168,7 @@ void __send_IPI_shortcut(unsigned int sh
+ }
+ }
+
+-void fastcall send_IPI_self(int vector)
++void send_IPI_self(int vector)
+ {
+ __send_IPI_shortcut(APIC_DEST_SELF, vector);
+ }
+@@ -224,13 +224,14 @@ static DEFINE_SPINLOCK(tlbstate_lock);
+ * We need to reload %cr3 since the page tables may be going
+ * away from under us..
+ */
+-void leave_mm(unsigned long cpu)
++void leave_mm(int cpu)
+ {
+ if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
+ BUG();
+ cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask);
+ load_cr3(swapper_pg_dir);
+ }
++EXPORT_SYMBOL_GPL(leave_mm);
+
+ /*
+ *
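
The smp_32-xen.c hunk above changes leave_mm() to take an int cpu and exports it. Its role in the lazy-TLB scheme: a CPU that receives a flush IPI for an mm it is only lazily borrowing clears itself out of that mm's cpu_vm_mask, so no further flush IPIs are sent to it, and reloads the kernel page tables. A rough model with a plain bitmask standing in for cpu_vm_mask (the mask value and printout are illustrative only):

#include <stdio.h>

static unsigned long cpu_vm_mask = 0xful;   /* toy: CPUs 0-3 share this mm */

/* a stand-in for leave_mm(): drop out of the mm's flush set */
static void leave_mm(int cpu)
{
	cpu_vm_mask &= ~(1ul << cpu);
	/* the real code also does load_cr3(swapper_pg_dir) here */
	printf("cpu%d left, mask now %lx\n", cpu, cpu_vm_mask);
}

int main(void)
{
	leave_mm(2);    /* cpu 2 will no longer be sent TLB-flush IPIs */
	return 0;
}
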
+--- sle11-2009-06-29.orig/arch/x86/kernel/smp_64-xen.c 2009-02-16 16:18:36.000000000 +0100
++++ sle11-2009-06-29/arch/x86/kernel/smp_64-xen.c 2009-03-16 16:33:40.000000000 +0100
+@@ -33,7 +33,7 @@
+
+ #ifndef CONFIG_XEN
+ /*
+- * Smarter SMP flushing macros.
++ * Smarter SMP flushing macros.
+ * c/o Linus Torvalds.
+ *
+ * These mean you can really definitely utterly forget about
+@@ -41,15 +41,15 @@
+ *
+ * Optimizations Manfred Spraul <manfred@colorfullife.com>
+ *
+- * More scalable flush, from Andi Kleen
++ * More scalable flush, from Andi Kleen
+ *
+- * To avoid global state use 8 different call vectors.
+- * Each CPU uses a specific vector to trigger flushes on other
+- * CPUs. Depending on the received vector the target CPUs look into
++ * To avoid global state use 8 different call vectors.
++ * Each CPU uses a specific vector to trigger flushes on other
++ * CPUs. Depending on the received vector the target CPUs look into
+ * the right per cpu variable for the flush data.
+ *
+- * With more than 8 CPUs they are hashed to the 8 available
+- * vectors. The limited global vector space forces us to this right now.
++ * With more than 8 CPUs they are hashed to the 8 available
++ * vectors. The limited global vector space forces us to this right now.
+ * In future when interrupts are split into per CPU domains this could be
+ * fixed, at the cost of triggering multiple IPIs in some cases.
+ */
+@@ -59,7 +59,6 @@ union smp_flush_state {
+ cpumask_t flush_cpumask;
+ struct mm_struct *flush_mm;
+ unsigned long flush_va;
+-#define FLUSH_ALL -1ULL
+ spinlock_t tlbstate_lock;
+ };
+ char pad[SMP_CACHE_BYTES];
+@@ -71,16 +70,17 @@ union smp_flush_state {
+ static DEFINE_PER_CPU(union smp_flush_state, flush_state);
+
+ /*
+- * We cannot call mmdrop() because we are in interrupt context,
++ * We cannot call mmdrop() because we are in interrupt context,
+ * instead update mm->cpu_vm_mask.
+ */
+-static inline void leave_mm(unsigned long cpu)
++void leave_mm(int cpu)
+ {
+ if (read_pda(mmu_state) == TLBSTATE_OK)
+ BUG();
+ cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask);
+ load_cr3(swapper_pg_dir);
+ }
++EXPORT_SYMBOL_GPL(leave_mm);
+
+ /*
+ *
+@@ -89,25 +89,25 @@ static inline void leave_mm(unsigned lon
+ * 1) switch_mm() either 1a) or 1b)
+ * 1a) thread switch to a different mm
+ * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
+- * Stop ipi delivery for the old mm. This is not synchronized with
+- * the other cpus, but smp_invalidate_interrupt ignore flush ipis
+- * for the wrong mm, and in the worst case we perform a superfluous
+- * tlb flush.
++ * Stop ipi delivery for the old mm. This is not synchronized with
++ * the other cpus, but smp_invalidate_interrupt ignore flush ipis
++ * for the wrong mm, and in the worst case we perform a superfluous
++ * tlb flush.
+ * 1a2) set cpu mmu_state to TLBSTATE_OK
+- * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
++ * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
+ * was in lazy tlb mode.
+ * 1a3) update cpu active_mm
+- * Now cpu0 accepts tlb flushes for the new mm.
++ * Now cpu0 accepts tlb flushes for the new mm.
+ * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
+- * Now the other cpus will send tlb flush ipis.
++ * Now the other cpus will send tlb flush ipis.
+ * 1a4) change cr3.
+ * 1b) thread switch without mm change
+ * cpu active_mm is correct, cpu0 already handles
+ * flush ipis.
+ * 1b1) set cpu mmu_state to TLBSTATE_OK
+ * 1b2) test_and_set the cpu bit in cpu_vm_mask.
+- * Atomically set the bit [other cpus will start sending flush ipis],
+- * and test the bit.
++ * Atomically set the bit [other cpus will start sending flush ipis],
++ * and test the bit.
+ * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
+ * 2) switch %%esp, ie current
+ *
+@@ -141,12 +141,12 @@ asmlinkage void smp_invalidate_interrupt
+ * orig_rax contains the negated interrupt vector.
+ * Use that to determine where the sender put the data.
+ */
+- sender = ~regs->orig_rax - INVALIDATE_TLB_VECTOR_START;
++ sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START;
+ f = &per_cpu(flush_state, sender);
+
+ if (!cpu_isset(cpu, f->flush_cpumask))
+ goto out;
+- /*
++ /*
+ * This was a BUG() but until someone can quote me the
+ * line from the intel manual that guarantees an IPI to
+ * multiple CPUs is retried _only_ on the erroring CPUs
+@@ -154,10 +154,10 @@ asmlinkage void smp_invalidate_interrupt
+ *
+ * BUG();
+ */
+-
++
+ if (f->flush_mm == read_pda(active_mm)) {
+ if (read_pda(mmu_state) == TLBSTATE_OK) {
+- if (f->flush_va == FLUSH_ALL)
++ if (f->flush_va == TLB_FLUSH_ALL)
+ local_flush_tlb();
+ else
+ __flush_tlb_one(f->flush_va);
+@@ -170,19 +170,22 @@ out:
+ add_pda(irq_tlb_count, 1);
+ }
+
+-static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
+- unsigned long va)
++void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
++ unsigned long va)
+ {
+ int sender;
+ union smp_flush_state *f;
++ cpumask_t cpumask = *cpumaskp;
+
+ /* Caller has disabled preemption */
+ sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
+ f = &per_cpu(flush_state, sender);
+
+- /* Could avoid this lock when
+- num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
+- probably not worth checking this for a cache-hot lock. */
++ /*
++ * Could avoid this lock when
++ * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
++ * probably not worth checking this for a cache-hot lock.
++ */
+ spin_lock(&f->tlbstate_lock);
+
+ f->flush_mm = mm;
+@@ -206,14 +209,14 @@ static void flush_tlb_others(cpumask_t c
+ int __cpuinit init_smp_flush(void)
+ {
+ int i;
++
+ for_each_cpu_mask(i, cpu_possible_map) {
+ spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock);
+ }
+ return 0;
+ }
+-
+ core_initcall(init_smp_flush);
+-
++
+ void flush_tlb_current_task(void)
+ {
+ struct mm_struct *mm = current->mm;
+@@ -225,10 +228,9 @@ void flush_tlb_current_task(void)
+
+ local_flush_tlb();
+ if (!cpus_empty(cpu_mask))
+- flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
++ flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
+ preempt_enable();
+ }
+-EXPORT_SYMBOL(flush_tlb_current_task);
+
+ void flush_tlb_mm (struct mm_struct * mm)
+ {
+@@ -245,11 +247,10 @@ void flush_tlb_mm (struct mm_struct * mm
+ leave_mm(smp_processor_id());
+ }
+ if (!cpus_empty(cpu_mask))
+- flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
++ flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
+
+ preempt_enable();
+ }
+-EXPORT_SYMBOL(flush_tlb_mm);
+
+ void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
+ {
+@@ -263,8 +264,8 @@ void flush_tlb_page(struct vm_area_struc
+ if (current->active_mm == mm) {
+ if(current->mm)
+ __flush_tlb_one(va);
+- else
+- leave_mm(smp_processor_id());
++ else
++ leave_mm(smp_processor_id());
+ }
+
+ if (!cpus_empty(cpu_mask))
+@@ -272,7 +273,6 @@ void flush_tlb_page(struct vm_area_struc
+
+ preempt_enable();
+ }
+-EXPORT_SYMBOL(flush_tlb_page);
+
+ static void do_flush_tlb_all(void* info)
+ {
+@@ -330,11 +330,9 @@ void unlock_ipi_call_lock(void)
+ * this function sends a 'generic call function' IPI to all other CPU
+ * of the system defined in the mask.
+ */
+-
+-static int
+-__smp_call_function_mask(cpumask_t mask,
+- void (*func)(void *), void *info,
+- int wait)
++static int __smp_call_function_mask(cpumask_t mask,
++ void (*func)(void *), void *info,
++ int wait)
+ {
+ struct call_data_struct data;
+ cpumask_t allbutself;
+@@ -422,11 +420,10 @@ EXPORT_SYMBOL(smp_call_function_mask);
+ */
+
+ int smp_call_function_single (int cpu, void (*func) (void *info), void *info,
+- int nonatomic, int wait)
++ int nonatomic, int wait)
+ {
+ /* prevent preemption and reschedule on another processor */
+- int ret;
+- int me = get_cpu();
++ int ret, me = get_cpu();
+
+ /* Can deadlock when called with interrupts disabled */
+ WARN_ON(irqs_disabled());
+@@ -476,9 +473,9 @@ static void stop_this_cpu(void *dummy)
+ */
+ cpu_clear(smp_processor_id(), cpu_online_map);
+ disable_all_local_evtchn();
+- for (;;)
++ for (;;)
+ halt();
+-}
++}
+
+ void smp_send_stop(void)
+ {
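
The comment block reflowed at the top of smp_64-xen.c describes how flush requests are spread over eight call vectors: the sender slot is picked as smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS, so on systems with more than eight CPUs several CPUs share a vector and serialize on that vector's tlbstate_lock. The mapping is easy to see in isolation:

#include <stdio.h>

#define NUM_INVALIDATE_TLB_VECTORS 8

int main(void)
{
	int cpu;

	/* CPUs 0..11 hashed onto the 8 vectors; 8..11 collide with 0..3 */
	for (cpu = 0; cpu < 12; cpu++)
		printf("cpu %2d -> flush vector %d\n",
		       cpu, cpu % NUM_INVALIDATE_TLB_VECTORS);
	return 0;
}

The collisions are the cost the comment accepts until interrupts are split into per-CPU domains.
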
+--- sle11-2009-06-29.orig/arch/x86/kernel/time_32-xen.c 2009-03-24 10:12:35.000000000 +0100
++++ sle11-2009-06-29/arch/x86/kernel/time_32-xen.c 2009-03-24 10:12:48.000000000 +0100
+@@ -28,21 +28,9 @@
+ * serialize accesses to xtime/lost_ticks).
+ */
+
+-#include <linux/errno.h>
+-#include <linux/sched.h>
+-#include <linux/kernel.h>
+-#include <linux/param.h>
+-#include <linux/string.h>
+-#include <linux/mm.h>
++#include <linux/init.h>
+ #include <linux/interrupt.h>
+ #include <linux/time.h>
+-#include <linux/delay.h>
+-#include <linux/init.h>
+-#include <linux/smp.h>
+-#include <linux/module.h>
+-#include <linux/sysdev.h>
+-#include <linux/bcd.h>
+-#include <linux/efi.h>
+ #include <linux/mca.h>
+ #include <linux/sysctl.h>
+ #include <linux/percpu.h>
+@@ -50,26 +38,10 @@
+ #include <linux/posix-timers.h>
+ #include <linux/cpufreq.h>
+ #include <linux/clocksource.h>
++#include <linux/sysdev.h>
+
+-#include <asm/io.h>
+-#include <asm/smp.h>
+-#include <asm/irq.h>
+-#include <asm/msr.h>
+ #include <asm/delay.h>
+-#include <asm/mpspec.h>
+-#include <asm/uaccess.h>
+-#include <asm/processor.h>
+-#include <asm/timer.h>
+ #include <asm/time.h>
+-#include <asm/sections.h>
+-
+-#include "mach_time.h"
+-
+-#include <linux/timex.h>
+-
+-#include <asm/hpet.h>
+-
+-#include <asm/arch_hooks.h>
+
+ #include <xen/evtchn.h>
+ #include <xen/sysctl.h>
+@@ -89,9 +61,6 @@ volatile unsigned long __jiffies __secti
+ unsigned int cpu_khz; /* Detected as we calibrate the TSC */
+ EXPORT_SYMBOL(cpu_khz);
+
+-DEFINE_SPINLOCK(rtc_lock);
+-EXPORT_SYMBOL(rtc_lock);
+-
+ /* These are periodically updated in shared_info, and then copied here. */
+ struct shadow_time_info {
+ u64 tsc_timestamp; /* TSC at last update of time vals. */
+@@ -154,6 +123,11 @@ static int __init __independent_wallcloc
+ }
+ __setup("independent_wallclock", __independent_wallclock);
+
++int xen_independent_wallclock(void)
++{
++ return independent_wallclock;
++}
++
+ /* Permitted clock jitter, in nsecs, beyond which a warning will be printed. */
+ static unsigned long permitted_clock_jitter = 10000000UL; /* 10ms */
+ static int __init __permitted_clock_jitter(char *str)
+@@ -223,7 +197,6 @@ static inline u64 get64(volatile u64 *pt
+ return cmpxchg64(ptr, 0, 0);
+ #else
+ return *ptr;
+-#define cmpxchg64 cmpxchg
+ #endif
+ }
+
+@@ -233,7 +206,6 @@ static inline u64 get64_local(volatile u
+ return cmpxchg64_local(ptr, 0, 0);
+ #else
+ return *ptr;
+-#define cmpxchg64_local cmpxchg_local
+ #endif
+ }
+
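
The two hunks above drop the 32-bit fallback #defines from get64()/get64_local(), which read a 64-bit value atomically on 32-bit CPUs via cmpxchg64(ptr, 0, 0): if *ptr happens to be zero the exchange stores zero back (a no-op), and in either case the instruction returns the previous contents as one atomic unit. A userspace sketch of the same trick, using GCC's __sync builtin as a stand-in for the kernel's cmpxchg64 (on 32-bit, compile for a CPU with cmpxchg8b, e.g. -march=i586):

#include <stdio.h>
#include <stdint.h>

static volatile uint64_t shadow = 0x1122334455667788ull;

/* atomic 64-bit snapshot via compare-and-swap with old == new == 0 */
static uint64_t get64(volatile uint64_t *ptr)
{
	return __sync_val_compare_and_swap(ptr, 0, 0);
}

int main(void)
{
	printf("read %016llx\n", (unsigned long long)get64(&shadow));
	return 0;
}
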
+@@ -339,35 +311,6 @@ static inline int time_values_up_to_date
+ return (dst->version == src->version);
+ }
+
+-/*
+- * This is a special lock that is owned by the CPU and holds the index
+- * register we are working with. It is required for NMI access to the
+- * CMOS/RTC registers. See include/asm-i386/mc146818rtc.h for details.
+- */
+-volatile unsigned long cmos_lock = 0;
+-EXPORT_SYMBOL(cmos_lock);
+-
+-/* Routines for accessing the CMOS RAM/RTC. */
+-unsigned char rtc_cmos_read(unsigned char addr)
+-{
+- unsigned char val;
+- lock_cmos_prefix(addr);
+- outb_p(addr, RTC_PORT(0));
+- val = inb_p(RTC_PORT(1));
+- lock_cmos_suffix(addr);
+- return val;
+-}
+-EXPORT_SYMBOL(rtc_cmos_read);
+-
+-void rtc_cmos_write(unsigned char val, unsigned char addr)
+-{
+- lock_cmos_prefix(addr);
+- outb_p(addr, RTC_PORT(0));
+- outb_p(val, RTC_PORT(1));
+- lock_cmos_suffix(addr);
+-}
+-EXPORT_SYMBOL(rtc_cmos_write);
+-
+ static void sync_xen_wallclock(unsigned long dummy);
+ static DEFINE_TIMER(sync_xen_wallclock_timer, sync_xen_wallclock, 0, 0);
+ static void sync_xen_wallclock(unsigned long dummy)
+@@ -376,7 +319,8 @@ static void sync_xen_wallclock(unsigned
+ s64 nsec;
+ struct xen_platform_op op;
+
+- if (!ntp_synced() || independent_wallclock || !is_initial_xendomain())
++ BUG_ON(!is_initial_xendomain());
++ if (!ntp_synced() || independent_wallclock)
+ return;
+
+ write_seqlock_irq(&xtime_lock);
+@@ -399,23 +343,6 @@ static void sync_xen_wallclock(unsigned
+ mod_timer(&sync_xen_wallclock_timer, jiffies + 60*HZ);
+ }
+
+-static int set_rtc_mmss(unsigned long nowtime)
+-{
+- int retval;
+- unsigned long flags;
+-
+- if (independent_wallclock || !is_initial_xendomain())
+- return 0;
+-
+- /* gets recalled with irq locally disabled */
+- /* XXX - does irqsave resolve this? -johnstul */
+- spin_lock_irqsave(&rtc_lock, flags);
+- retval = set_wallclock(nowtime);
+- spin_unlock_irqrestore(&rtc_lock, flags);
+-
+- return retval;
+-}
+-
+ static unsigned long long local_clock(void)
+ {
+ unsigned int cpu = get_cpu();
+@@ -498,28 +425,24 @@ unsigned long profile_pc(struct pt_regs
+
+ #if defined(CONFIG_SMP) || defined(__x86_64__)
+ # ifdef __i386__
+- if (!v8086_mode(regs) && SEGMENT_IS_KERNEL_CODE(regs->xcs)
++ if (!v8086_mode(regs) && SEGMENT_IS_KERNEL_CODE(regs->cs)
+ # else
+ if (!user_mode(regs)
+ # endif
+ && in_lock_functions(pc)) {
+ # ifdef CONFIG_FRAME_POINTER
+-# ifdef __i386__
+- return ((unsigned long *)regs->ebp)[1];
+-# else
+- return ((unsigned long *)regs->rbp)[1];
+-# endif
++ return ((unsigned long *)regs->bp)[1];
+ # else
+ # ifdef __i386__
+-	unsigned long *sp = (unsigned long *)&regs->esp;
++	unsigned long *sp = (unsigned long *)&regs->sp;
+ # else
+- unsigned long *sp = (unsigned long *)regs->rsp;
++ unsigned long *sp = (unsigned long *)regs->sp;
+ # endif
+
+ /* Return address is either directly at stack pointer
+- or above a saved eflags. Eflags has bits 22-31 zero,
++ or above a saved flags. Eflags has bits 22-31 zero,
+ kernel addresses don't. */
+- if (sp[0] >> 22)
++ if (sp[0] >> 22)
+ return sp[0];
+ if (sp[1] >> 22)
+ return sp[1];
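
The profile_pc() hunk above keeps the old discriminator: a saved EFLAGS word has bits 22-31 clear (the architecturally defined flags end at bit 21), while i386 kernel text sits at or above PAGE_OFFSET and therefore always has those bits set. A quick demonstration, where 0xc0100000 is merely an illustrative kernel-text address for the default 3G/1G split:

#include <stdio.h>

int main(void)
{
	unsigned long eflags = 0x00000246;  /* a typical saved EFLAGS value */
	unsigned long ktext  = 0xc0100000;  /* illustrative i386 kernel address */

	/* same test profile_pc uses: anything with bits 22+ set is an address */
	printf("eflags >> 22 = %lu (treated as flags)\n", eflags >> 22);
	printf("ktext  >> 22 = %lu (treated as return address)\n", ktext >> 22);
	return 0;
}
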
+@@ -748,25 +671,32 @@ static void init_missing_ticks_accountin
+ runstate->time[RUNSTATE_offline];
+ }
+
+-/* not static: needed by APM */
+-unsigned long read_persistent_clock(void)
++unsigned long xen_read_persistent_clock(void)
+ {
+- unsigned long retval;
+- unsigned long flags;
+-
+- spin_lock_irqsave(&rtc_lock, flags);
++ const shared_info_t *s = HYPERVISOR_shared_info;
++ u32 version, sec, nsec;
++ u64 delta;
+
+- retval = get_wallclock();
++ do {
++ version = s->wc_version;
++ rmb();
++ sec = s->wc_sec;
++ nsec = s->wc_nsec;
++ rmb();
++ } while ((s->wc_version & 1) | (version ^ s->wc_version));
+
+- spin_unlock_irqrestore(&rtc_lock, flags);
++ delta = local_clock() + (u64)sec * NSEC_PER_SEC + nsec;
++ do_div(delta, NSEC_PER_SEC);
+
+- return retval;
++ return delta;
+ }
+
+-int update_persistent_clock(struct timespec now)
++int xen_update_persistent_clock(void)
+ {
++ if (!is_initial_xendomain())
++ return -1;
+ mod_timer(&sync_xen_wallclock_timer, jiffies + 1);
+- return set_rtc_mmss(now.tv_sec);
++ return 0;
+ }
+
+ extern void (*late_time_init)(void);
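
xen_read_persistent_clock() above treats the shared-info wc_version field as a seqcount: the hypervisor makes it odd while updating the wallclock and even once it is stable, so the reader retries whenever the version was odd or changed between its two samples. A single-threaded sketch of that reader protocol (struct wc and barrier() are stand-ins; a compiler barrier approximates rmb() since the demo has no second CPU):

#include <stdio.h>
#include <stdint.h>

struct wc { volatile uint32_t version; volatile uint32_t sec, nsec; };

#define barrier() __asm__ __volatile__("" ::: "memory")

/* seqcount-style consistent read, mirroring the loop in the patch */
static void wc_read(const struct wc *s, uint32_t *sec, uint32_t *nsec)
{
	uint32_t v;

	do {
		v = s->version;
		barrier();              /* stands in for rmb() */
		*sec  = s->sec;
		*nsec = s->nsec;
		barrier();
	} while ((s->version & 1) | (v ^ s->version));
}

int main(void)
{
	struct wc w = { .version = 2, .sec = 1236000000, .nsec = 500 };
	uint32_t sec, nsec;

	wc_read(&w, &sec, &nsec);
	printf("wallclock %u.%09u\n", sec, nsec);
	return 0;
}
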
+--- sle11-2009-06-29.orig/arch/x86/kernel/traps_32-xen.c 2009-02-16 16:18:36.000000000 +0100
++++ sle11-2009-06-29/arch/x86/kernel/traps_32-xen.c 2009-03-16 16:33:40.000000000 +0100
+@@ -79,7 +79,8 @@ char ignore_fpu_irq = 0;
+ * F0 0F bug workaround.. We have a special link segment
+ * for this.
+ */
+-struct desc_struct idt_table[256] __attribute__((__section__(".data.idt"))) = { {0, 0}, };
++gate_desc idt_table[256]
++ __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, };
+ #endif
+
+ asmlinkage void divide_error(void);
+@@ -109,6 +110,34 @@ asmlinkage void machine_check(void);
+ int kstack_depth_to_print = 24;
+ static unsigned int code_bytes = 64;
+
++void printk_address(unsigned long address, int reliable)
++{
++#ifdef CONFIG_KALLSYMS
++ unsigned long offset = 0, symsize;
++ const char *symname;
++ char *modname;
++ char *delim = ":";
++ char namebuf[128];
++ char reliab[4] = "";
++
++ symname = kallsyms_lookup(address, &symsize, &offset,
++ &modname, namebuf);
++ if (!symname) {
++ printk(" [<%08lx>]\n", address);
++ return;
++ }
++ if (!reliable)
++ strcpy(reliab, "? ");
++
++ if (!modname)
++ modname = delim = "";
++ printk(" [<%08lx>] %s%s%s%s%s+0x%lx/0x%lx\n",
++ address, reliab, delim, modname, delim, symname, offset, symsize);
++#else
++ printk(" [<%08lx>]\n", address);
++#endif
++}
++
+ static inline int valid_stack_ptr(struct thread_info *tinfo, void *p, unsigned size)
+ {
+ return p > (void *)tinfo &&
+@@ -122,48 +151,35 @@ struct stack_frame {
+ };
+
+ static inline unsigned long print_context_stack(struct thread_info *tinfo,
+- unsigned long *stack, unsigned long ebp,
++ unsigned long *stack, unsigned long bp,
+ const struct stacktrace_ops *ops, void *data)
+ {
+-#ifdef CONFIG_FRAME_POINTER
+- struct stack_frame *frame = (struct stack_frame *)ebp;
+- while (valid_stack_ptr(tinfo, frame, sizeof(*frame))) {
+- struct stack_frame *next;
+- unsigned long addr;
++ struct stack_frame *frame = (struct stack_frame *)bp;
+
+- addr = frame->return_address;
+- ops->address(data, addr);
+- /*
+- * break out of recursive entries (such as
+- * end_of_stack_stop_unwind_function). Also,
+- * we can never allow a frame pointer to
+- * move downwards!
+- */
+- next = frame->next_frame;
+- if (next <= frame)
+- break;
+- frame = next;
+- }
+-#else
+ while (valid_stack_ptr(tinfo, stack, sizeof(*stack))) {
+ unsigned long addr;
+
+- addr = *stack++;
+- if (__kernel_text_address(addr))
+- ops->address(data, addr);
++ addr = *stack;
++ if (__kernel_text_address(addr)) {
++ if ((unsigned long) stack == bp + 4) {
++ ops->address(data, addr, 1);
++ frame = frame->next_frame;
++ bp = (unsigned long) frame;
++ } else {
++ ops->address(data, addr, bp == 0);
++ }
++ }
++ stack++;
+ }
+-#endif
+- return ebp;
++ return bp;
+ }
+
+ #define MSG(msg) ops->warning(data, msg)
+
+ void dump_trace(struct task_struct *task, struct pt_regs *regs,
+- unsigned long *stack,
++ unsigned long *stack, unsigned long bp,
+ const struct stacktrace_ops *ops, void *data)
+ {
+- unsigned long ebp = 0;
+-
+ if (!task)
+ task = current;
+
+@@ -171,17 +187,17 @@ void dump_trace(struct task_struct *task
+ unsigned long dummy;
+ stack = &dummy;
+ if (task != current)
+- stack = (unsigned long *)task->thread.esp;
++ stack = (unsigned long *)task->thread.sp;
+ }
+
+ #ifdef CONFIG_FRAME_POINTER
+- if (!ebp) {
++ if (!bp) {
+ if (task == current) {
+- /* Grab ebp right from our regs */
+- asm ("movl %%ebp, %0" : "=r" (ebp) : );
++ /* Grab bp right from our regs */
++ asm ("movl %%ebp, %0" : "=r" (bp) : );
+ } else {
+- /* ebp is the last reg pushed by switch_to */
+- ebp = *(unsigned long *) task->thread.esp;
++ /* bp is the last reg pushed by switch_to */
++ bp = *(unsigned long *) task->thread.sp;
+ }
+ }
+ #endif
+@@ -190,7 +206,7 @@ void dump_trace(struct task_struct *task
+ struct thread_info *context;
+ context = (struct thread_info *)
+ ((unsigned long)stack & (~(THREAD_SIZE - 1)));
+- ebp = print_context_stack(context, stack, ebp, ops, data);
++ bp = print_context_stack(context, stack, bp, ops, data);
+ /* Should be after the line below, but somewhere
+ in early boot context comes out corrupted and we
+ can't reference it -AK */
+@@ -225,9 +241,11 @@ static int print_trace_stack(void *data,
+ /*
+ * Print one address/symbol entries per line.
+ */
+-static void print_trace_address(void *data, unsigned long addr)
++static void print_trace_address(void *data, unsigned long addr, int reliable)
+ {
+ printk("%s [<%08lx>] ", (char *)data, addr);
++ if (!reliable)
++ printk("? ");
+ print_symbol("%s\n", addr);
+ touch_nmi_watchdog();
+ }
+@@ -241,32 +259,32 @@ static const struct stacktrace_ops print
+
+ static void
+ show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
+- unsigned long * stack, char *log_lvl)
++ unsigned long *stack, unsigned long bp, char *log_lvl)
+ {
+- dump_trace(task, regs, stack, &print_trace_ops, log_lvl);
++ dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
+ printk("%s =======================\n", log_lvl);
+ }
+
+ void show_trace(struct task_struct *task, struct pt_regs *regs,
+- unsigned long * stack)
++ unsigned long *stack, unsigned long bp)
+ {
+- show_trace_log_lvl(task, regs, stack, "");
++ show_trace_log_lvl(task, regs, stack, bp, "");
+ }
+
+ static void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
+- unsigned long *esp, char *log_lvl)
++ unsigned long *sp, unsigned long bp, char *log_lvl)
+ {
+ unsigned long *stack;
+ int i;
+
+- if (esp == NULL) {
++ if (sp == NULL) {
+ if (task)
+- esp = (unsigned long*)task->thread.esp;
++ sp = (unsigned long*)task->thread.sp;
+ else
+- esp = (unsigned long *)&esp;
++ sp = (unsigned long *)&sp;
+ }
+
+- stack = esp;
++ stack = sp;
+ for(i = 0; i < kstack_depth_to_print; i++) {
+ if (kstack_end(stack))
+ break;
+@@ -275,13 +293,13 @@ static void show_stack_log_lvl(struct ta
+ printk("%08lx ", *stack++);
+ }
+ printk("\n%sCall Trace:\n", log_lvl);
+- show_trace_log_lvl(task, regs, esp, log_lvl);
++ show_trace_log_lvl(task, regs, sp, bp, log_lvl);
+ }
+
+-void show_stack(struct task_struct *task, unsigned long *esp)
++void show_stack(struct task_struct *task, unsigned long *sp)
+ {
+ printk(" ");
+- show_stack_log_lvl(task, NULL, esp, "");
++ show_stack_log_lvl(task, NULL, sp, 0, "");
+ }
+
+ /*
+@@ -290,13 +308,19 @@ void show_stack(struct task_struct *task
+ void dump_stack(void)
+ {
+ unsigned long stack;
++ unsigned long bp = 0;
++
++#ifdef CONFIG_FRAME_POINTER
++ if (!bp)
++ asm("movl %%ebp, %0" : "=r" (bp):);
++#endif
+
+ printk("Pid: %d, comm: %.20s %s %s %.*s\n",
+ current->pid, current->comm, print_tainted(),
+ init_utsname()->release,
+ (int)strcspn(init_utsname()->version, " "),
+ init_utsname()->version);
+- show_trace(current, NULL, &stack);
++ show_trace(current, NULL, &stack, bp);
+ }
+
+ EXPORT_SYMBOL(dump_stack);
+@@ -315,30 +339,30 @@ void show_registers(struct pt_regs *regs
+ * time of the fault..
+ */
+ if (!user_mode_vm(regs)) {
+- u8 *eip;
++ u8 *ip;
+ unsigned int code_prologue = code_bytes * 43 / 64;
+ unsigned int code_len = code_bytes;
+ unsigned char c;
+
+ printk("\n" KERN_EMERG "Stack: ");
+-	show_stack_log_lvl(NULL, regs, &regs->esp, KERN_EMERG);
++	show_stack_log_lvl(NULL, regs, &regs->sp, 0, KERN_EMERG);
+
+ printk(KERN_EMERG "Code: ");
+
+- eip = (u8 *)regs->eip - code_prologue;
+- if (eip < (u8 *)PAGE_OFFSET ||
+- probe_kernel_address(eip, c)) {
++ ip = (u8 *)regs->ip - code_prologue;
++ if (ip < (u8 *)PAGE_OFFSET ||
++ probe_kernel_address(ip, c)) {
+ /* try starting at EIP */
+- eip = (u8 *)regs->eip;
++ ip = (u8 *)regs->ip;
+ code_len = code_len - code_prologue + 1;
+ }
+- for (i = 0; i < code_len; i++, eip++) {
+- if (eip < (u8 *)PAGE_OFFSET ||
+- probe_kernel_address(eip, c)) {
++ for (i = 0; i < code_len; i++, ip++) {
++ if (ip < (u8 *)PAGE_OFFSET ||
++ probe_kernel_address(ip, c)) {
+ printk(" Bad EIP value.");
+ break;
+ }
+- if (eip == (u8 *)regs->eip)
++ if (ip == (u8 *)regs->ip)
+ printk("<%02x> ", c);
+ else
+ printk("%02x ", c);
+@@ -347,18 +371,57 @@ void show_registers(struct pt_regs *regs
+ printk("\n");
+ }
+
+-int is_valid_bugaddr(unsigned long eip)
++int is_valid_bugaddr(unsigned long ip)
+ {
+ unsigned short ud2;
+
+- if (eip < PAGE_OFFSET)
++ if (ip < PAGE_OFFSET)
+ return 0;
+- if (probe_kernel_address((unsigned short *)eip, ud2))
++ if (probe_kernel_address((unsigned short *)ip, ud2))
+ return 0;
+
+ return ud2 == 0x0b0f;
+ }
+
++static int die_counter;
++
++int __kprobes __die(const char * str, struct pt_regs * regs, long err)
++{
++ unsigned long sp;
++ unsigned short ss;
++
++ printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
++#ifdef CONFIG_PREEMPT
++ printk("PREEMPT ");
++#endif
++#ifdef CONFIG_SMP
++ printk("SMP ");
++#endif
++#ifdef CONFIG_DEBUG_PAGEALLOC
++ printk("DEBUG_PAGEALLOC");
++#endif
++ printk("\n");
++
++ if (notify_die(DIE_OOPS, str, regs, err,
++ current->thread.trap_no, SIGSEGV) !=
++ NOTIFY_STOP) {
++ show_registers(regs);
++ /* Executive summary in case the oops scrolled away */
++		sp = (unsigned long) (&regs->sp);
++ savesegment(ss, ss);
++ if (user_mode(regs)) {
++ sp = regs->sp;
++ ss = regs->ss & 0xffff;
++ }
++ printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
++ print_symbol("%s", regs->ip);
++ printk(" SS:ESP %04x:%08lx\n", ss, sp);
++ return 0;
++ } else {
++ return 1;
++ }
++}
++
+ /*
+ * This is gone through when something in the kernel has done something bad and
+ * is about to be terminated.
+@@ -374,7 +437,6 @@ void die(const char * str, struct pt_reg
+ .lock_owner = -1,
+ .lock_owner_depth = 0
+ };
+- static int die_counter;
+ unsigned long flags;
+
+ oops_enter();
+@@ -390,43 +452,13 @@ void die(const char * str, struct pt_reg
+ raw_local_irq_save(flags);
+
+ if (++die.lock_owner_depth < 3) {
+- unsigned long esp;
+- unsigned short ss;
+-
+- report_bug(regs->eip, regs);
+-
+- printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff,
+- ++die_counter);
+-#ifdef CONFIG_PREEMPT
+- printk("PREEMPT ");
+-#endif
+-#ifdef CONFIG_SMP
+- printk("SMP ");
+-#endif
+-#ifdef CONFIG_DEBUG_PAGEALLOC
+- printk("DEBUG_PAGEALLOC");
+-#endif
+- printk("\n");
++ report_bug(regs->ip, regs);
+
+- if (notify_die(DIE_OOPS, str, regs, err,
+- current->thread.trap_no, SIGSEGV) !=
+- NOTIFY_STOP) {
+- show_registers(regs);
+- /* Executive summary in case the oops scrolled away */
+-			esp = (unsigned long) (&regs->esp);
+- savesegment(ss, ss);
+- if (user_mode(regs)) {
+- esp = regs->esp;
+- ss = regs->xss & 0xffff;
+- }
+- printk(KERN_EMERG "EIP: [<%08lx>] ", regs->eip);
+- print_symbol("%s", regs->eip);
+- printk(" SS:ESP %04x:%08lx\n", ss, esp);
+- }
+- else
++ if (__die(str, regs, err))
+ regs = NULL;
+- } else
++ } else {
+ printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
++ }
+
+ bust_spinlocks(0);
+ die.lock_owner = -1;
+@@ -462,7 +494,7 @@ static void __kprobes do_trap(int trapnr
+ {
+ struct task_struct *tsk = current;
+
+- if (regs->eflags & VM_MASK) {
++ if (regs->flags & VM_MASK) {
+ if (vm86)
+ goto vm86_trap;
+ goto trap_signal;
+@@ -508,7 +540,7 @@ static void __kprobes do_trap(int trapnr
+ }
+
+ #define DO_ERROR(trapnr, signr, str, name) \
+-fastcall void do_##name(struct pt_regs * regs, long error_code) \
++void do_##name(struct pt_regs * regs, long error_code) \
+ { \
+ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
+ == NOTIFY_STOP) \
+@@ -517,7 +549,7 @@ fastcall void do_##name(struct pt_regs *
+ }
+
+ #define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr, irq) \
+-fastcall void do_##name(struct pt_regs * regs, long error_code) \
++void do_##name(struct pt_regs * regs, long error_code) \
+ { \
+ siginfo_t info; \
+ if (irq) \
+@@ -533,7 +565,7 @@ fastcall void do_##name(struct pt_regs *
+ }
+
+ #define DO_VM86_ERROR(trapnr, signr, str, name) \
+-fastcall void do_##name(struct pt_regs * regs, long error_code) \
++void do_##name(struct pt_regs * regs, long error_code) \
+ { \
+ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
+ == NOTIFY_STOP) \
+@@ -542,7 +574,7 @@ fastcall void do_##name(struct pt_regs *
+ }
+
+ #define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
+-fastcall void do_##name(struct pt_regs * regs, long error_code) \
++void do_##name(struct pt_regs * regs, long error_code) \
+ { \
+ siginfo_t info; \
+ info.si_signo = signr; \
+@@ -556,13 +588,13 @@ fastcall void do_##name(struct pt_regs *
+ do_trap(trapnr, signr, str, 1, regs, error_code, &info); \
+ }
+
+-DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->eip)
++DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
+ #ifndef CONFIG_KPROBES
+ DO_VM86_ERROR( 3, SIGTRAP, "int3", int3)
+ #endif
+ DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow)
+ DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds)
+-DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->eip, 0)
++DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0)
+ DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
+ DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
+ DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
+@@ -570,10 +602,10 @@ DO_ERROR(12, SIGBUS, "stack segment", s
+ DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0)
+ DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0, 1)
+
+-fastcall void __kprobes do_general_protection(struct pt_regs * regs,
++void __kprobes do_general_protection(struct pt_regs * regs,
+ long error_code)
+ {
+- if (regs->eflags & VM_MASK)
++ if (regs->flags & VM_MASK)
+ goto gp_in_vm86;
+
+ if (!user_mode(regs))
+@@ -582,11 +614,14 @@ fastcall void __kprobes do_general_prote
+ current->thread.error_code = error_code;
+ current->thread.trap_no = 13;
+ if (show_unhandled_signals && unhandled_signal(current, SIGSEGV) &&
+- printk_ratelimit())
++ printk_ratelimit()) {
+ printk(KERN_INFO
+- "%s[%d] general protection eip:%lx esp:%lx error:%lx\n",
++ "%s[%d] general protection ip:%lx sp:%lx error:%lx",
+ current->comm, task_pid_nr(current),
+- regs->eip, regs->esp, error_code);
++ regs->ip, regs->sp, error_code);
++ print_vma_addr(" in ", regs->ip);
++ printk("\n");
++ }
+
+ force_sig(SIGSEGV, current);
+ return;
+@@ -675,8 +710,8 @@ void __kprobes die_nmi(struct pt_regs *r
+ */
+ bust_spinlocks(1);
+ printk(KERN_EMERG "%s", msg);
+- printk(" on CPU%d, eip %08lx, registers:\n",
+- smp_processor_id(), regs->eip);
++ printk(" on CPU%d, ip %08lx, registers:\n",
++ smp_processor_id(), regs->ip);
+ show_registers(regs);
+ console_silent();
+ spin_unlock(&nmi_print_lock);
+@@ -733,7 +768,7 @@ static __kprobes void default_do_nmi(str
+
+ static int ignore_nmis;
+
+-fastcall __kprobes void do_nmi(struct pt_regs * regs, long error_code)
++__kprobes void do_nmi(struct pt_regs * regs, long error_code)
+ {
+ int cpu;
+
+@@ -762,7 +797,7 @@ void restart_nmi(void)
+ }
+
+ #ifdef CONFIG_KPROBES
+-fastcall void __kprobes do_int3(struct pt_regs *regs, long error_code)
++void __kprobes do_int3(struct pt_regs *regs, long error_code)
+ {
+ trace_hardirqs_fixup();
+
+@@ -798,7 +833,7 @@ fastcall void __kprobes do_int3(struct p
+ * find every occurrence of the TF bit that could be saved away even
+ * by user code)
+ */
+-fastcall void __kprobes do_debug(struct pt_regs * regs, long error_code)
++void __kprobes do_debug(struct pt_regs * regs, long error_code)
+ {
+ unsigned int condition;
+ struct task_struct *tsk = current;
+@@ -807,24 +842,30 @@ fastcall void __kprobes do_debug(struct
+
+ get_debugreg(condition, 6);
+
++ /*
++ * The processor cleared BTF, so don't mark that we need it set.
++ */
++ clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR);
++ tsk->thread.debugctlmsr = 0;
++
+ if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
+ SIGTRAP) == NOTIFY_STOP)
+ return;
+ /* It's safe to allow irq's after DR6 has been saved */
+- if (regs->eflags & X86_EFLAGS_IF)
++ if (regs->flags & X86_EFLAGS_IF)
+ local_irq_enable();
+
+ /* Mask out spurious debug traps due to lazy DR7 setting */
+ if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
+- if (!tsk->thread.debugreg[7])
++ if (!tsk->thread.debugreg7)
+ goto clear_dr7;
+ }
+
+- if (regs->eflags & VM_MASK)
++ if (regs->flags & VM_MASK)
+ goto debug_vm86;
+
+ /* Save debug status register where ptrace can see it */
+- tsk->thread.debugreg[6] = condition;
++ tsk->thread.debugreg6 = condition;
+
+ /*
+ * Single-stepping through TF: make sure we ignore any events in
+@@ -856,7 +897,7 @@ debug_vm86:
+
+ clear_TF_reenable:
+ set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
+- regs->eflags &= ~TF_MASK;
++ regs->flags &= ~TF_MASK;
+ return;
+ }
+
+@@ -865,7 +906,7 @@ clear_TF_reenable:
+ * the correct behaviour even in the presence of the asynchronous
+ * IRQ13 behaviour
+ */
+-void math_error(void __user *eip)
++void math_error(void __user *ip)
+ {
+ struct task_struct * task;
+ siginfo_t info;
+@@ -881,7 +922,7 @@ void math_error(void __user *eip)
+ info.si_signo = SIGFPE;
+ info.si_errno = 0;
+ info.si_code = __SI_FAULT;
+- info.si_addr = eip;
++ info.si_addr = ip;
+ /*
+ * (~cwd & swd) will mask out exceptions that are not set to unmasked
+ * status. 0x3f is the exception bits in these regs, 0x200 is the
+@@ -924,13 +965,13 @@ void math_error(void __user *eip)
+ force_sig_info(SIGFPE, &info, task);
+ }
+
+-fastcall void do_coprocessor_error(struct pt_regs * regs, long error_code)
++void do_coprocessor_error(struct pt_regs * regs, long error_code)
+ {
+ ignore_fpu_irq = 1;
+- math_error((void __user *)regs->eip);
++ math_error((void __user *)regs->ip);
+ }
+
+-static void simd_math_error(void __user *eip)
++static void simd_math_error(void __user *ip)
+ {
+ struct task_struct * task;
+ siginfo_t info;
+@@ -946,7 +987,7 @@ static void simd_math_error(void __user
+ info.si_signo = SIGFPE;
+ info.si_errno = 0;
+ info.si_code = __SI_FAULT;
+- info.si_addr = eip;
++ info.si_addr = ip;
+ /*
+ * The SIMD FPU exceptions are handled a little differently, as there
+ * is only a single status/control register. Thus, to determine which
+@@ -978,19 +1019,19 @@ static void simd_math_error(void __user
+ force_sig_info(SIGFPE, &info, task);
+ }
+
+-fastcall void do_simd_coprocessor_error(struct pt_regs * regs,
++void do_simd_coprocessor_error(struct pt_regs * regs,
+ long error_code)
+ {
+ if (cpu_has_xmm) {
+ /* Handle SIMD FPU exceptions on PIII+ processors. */
+ ignore_fpu_irq = 1;
+- simd_math_error((void __user *)regs->eip);
++ simd_math_error((void __user *)regs->ip);
+ } else {
+ /*
+ * Handle strange cache flush from user space exception
+ * in all other cases. This is undocumented behaviour.
+ */
+- if (regs->eflags & VM_MASK) {
++ if (regs->flags & VM_MASK) {
+ handle_vm86_fault((struct kernel_vm86_regs *)regs,
+ error_code);
+ return;
+@@ -1003,7 +1044,7 @@ fastcall void do_simd_coprocessor_error(
+ }
+
+ #ifndef CONFIG_XEN
+-fastcall void do_spurious_interrupt_bug(struct pt_regs * regs,
++void do_spurious_interrupt_bug(struct pt_regs * regs,
+ long error_code)
+ {
+ #if 0
+@@ -1012,7 +1053,7 @@ fastcall void do_spurious_interrupt_bug(
+ #endif
+ }
+
+-fastcall unsigned long patch_espfix_desc(unsigned long uesp,
++unsigned long patch_espfix_desc(unsigned long uesp,
+ unsigned long kesp)
+ {
+ struct desc_struct *gdt = __get_cpu_var(gdt_page).gdt;
+@@ -1072,7 +1113,7 @@ asmlinkage void math_emulate(long arg)
+ * NB. All these are "trap gates" (i.e. events_mask isn't set) except
+ * for those that specify <dpl>|4 in the second field.
+ */
+-static trap_info_t __cpuinitdata trap_table[] = {
++static const trap_info_t __cpuinitconst trap_table[] = {
+ { 0, 0, __KERNEL_CS, (unsigned long)divide_error },
+ { 1, 0|4, __KERNEL_CS, (unsigned long)debug },
+ { 3, 3|4, __KERNEL_CS, (unsigned long)int3 },
+@@ -1105,17 +1146,12 @@ void __init trap_init(void)
+ if (ret)
+ printk("HYPERVISOR_set_trap_table failed: error %d\n", ret);
+
++ /*
++ * Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned.
++ * Generate a build-time error if the alignment is wrong.
++ */
++ BUILD_BUG_ON(offsetof(struct task_struct, thread.i387.fxsave) & 15);
+ if (cpu_has_fxsr) {
+- /*
+- * Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned.
+- * Generates a compile-time "error: zero width for bit-field" if
+- * the alignment is wrong.
+- */
+- struct fxsrAlignAssert {
+- int _:!(offsetof(struct task_struct,
+- thread.i387.fxsave) & 15);
+- };
+-
+ printk(KERN_INFO "Enabling fast FPU save and restore... ");
+ set_in_cr4(X86_CR4_OSFXSR);
+ printk("done.\n");
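
The traps_32-xen.c rework above folds the frame-pointer walk into print_context_stack(): a stack word is marked reliable when it sits directly above a saved frame pointer (stack == bp + 4 on i386; the 64-bit variant later uses bp + 8), and the older CONFIG_FRAME_POINTER loop it replaces additionally refused to follow a next_frame that did not move up the stack. The same chain can be walked in userspace; a sketch assuming GCC with -fno-omit-frame-pointer, where the struct mirrors the patch's struct stack_frame and the downward-motion guard is kept:

#include <stdio.h>

struct stack_frame {
	struct stack_frame *next_frame;
	unsigned long return_address;
};

/* walk our own frame chain, like the reliable path in print_context_stack */
static void backtrace(void)
{
	struct stack_frame *frame = __builtin_frame_address(0);
	int depth;

	for (depth = 0; frame && depth < 8; depth++) {
		printf("  [<%lx>]\n", frame->return_address);
		/* never follow a frame pointer that moves downwards */
		if (frame->next_frame <= frame)
			break;
		frame = frame->next_frame;
	}
}

int main(void)
{
	backtrace();
	return 0;
}

The depth limit plus the monotonicity check keep the walk bounded once it leaves frames the program set up itself.
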
+--- sle11-2009-06-29.orig/arch/x86/kernel/traps_64-xen.c 2009-02-16 16:18:36.000000000 +0100
++++ sle11-2009-06-29/arch/x86/kernel/traps_64-xen.c 2009-03-16 16:33:40.000000000 +0100
+@@ -74,38 +74,41 @@ asmlinkage void alignment_check(void);
+ asmlinkage void machine_check(void);
+ asmlinkage void spurious_interrupt_bug(void);
+
++static unsigned int code_bytes = 64;
++
+ static inline void conditional_sti(struct pt_regs *regs)
+ {
+- if (regs->eflags & X86_EFLAGS_IF)
++ if (regs->flags & X86_EFLAGS_IF)
+ local_irq_enable();
+ }
+
+ static inline void preempt_conditional_sti(struct pt_regs *regs)
+ {
+- preempt_disable();
+- if (regs->eflags & X86_EFLAGS_IF)
++ inc_preempt_count();
++ if (regs->flags & X86_EFLAGS_IF)
+ local_irq_enable();
+ }
+
+ static inline void preempt_conditional_cli(struct pt_regs *regs)
+ {
+- if (regs->eflags & X86_EFLAGS_IF)
++ if (regs->flags & X86_EFLAGS_IF)
+ local_irq_disable();
+ /* Make sure to not schedule here because we could be running
+ on an exception stack. */
+- preempt_enable_no_resched();
++ dec_preempt_count();
+ }
+
+ int kstack_depth_to_print = 12;
+
+-#ifdef CONFIG_KALLSYMS
+-void printk_address(unsigned long address)
++void printk_address(unsigned long address, int reliable)
+ {
++#ifdef CONFIG_KALLSYMS
+ unsigned long offset = 0, symsize;
+ const char *symname;
+ char *modname;
+ char *delim = ":";
+- char namebuf[128];
++ char namebuf[KSYM_NAME_LEN];
++ char reliab[4] = "";
+
+ symname = kallsyms_lookup(address, &symsize, &offset,
+ &modname, namebuf);
+@@ -113,17 +116,17 @@ void printk_address(unsigned long addres
+ printk(" [<%016lx>]\n", address);
+ return;
+ }
++ if (!reliable)
++ strcpy(reliab, "? ");
++
+ if (!modname)
+- modname = delim = "";
+- printk(" [<%016lx>] %s%s%s%s+0x%lx/0x%lx\n",
+- address, delim, modname, delim, symname, offset, symsize);
+-}
++ modname = delim = "";
++ printk(" [<%016lx>] %s%s%s%s%s+0x%lx/0x%lx\n",
++ address, reliab, delim, modname, delim, symname, offset, symsize);
+ #else
+-void printk_address(unsigned long address)
+-{
+ printk(" [<%016lx>]\n", address);
+-}
+ #endif
++}
+
+ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
+ unsigned *usedp, char **idp)
+@@ -210,14 +213,53 @@ static unsigned long *in_exception_stack
+ * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
+ */
+
+-static inline int valid_stack_ptr(struct thread_info *tinfo, void *p)
++static inline int valid_stack_ptr(struct thread_info *tinfo,
++ void *p, unsigned int size, void *end)
+ {
+- void *t = (void *)tinfo;
+- return p > t && p < t + THREAD_SIZE - 3;
++ void *t = tinfo;
++ if (end) {
++ if (p < end && p >= (end-THREAD_SIZE))
++ return 1;
++ else
++ return 0;
++ }
++ return p > t && p < t + THREAD_SIZE - size;
++}
++
++/* The form of the top of the frame on the stack */
++struct stack_frame {
++ struct stack_frame *next_frame;
++ unsigned long return_address;
++};
++
++
++static inline unsigned long print_context_stack(struct thread_info *tinfo,
++ unsigned long *stack, unsigned long bp,
++ const struct stacktrace_ops *ops, void *data,
++ unsigned long *end)
++{
++ struct stack_frame *frame = (struct stack_frame *)bp;
++
++ while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
++ unsigned long addr;
++
++ addr = *stack;
++ if (__kernel_text_address(addr)) {
++ if ((unsigned long) stack == bp + 8) {
++ ops->address(data, addr, 1);
++ frame = frame->next_frame;
++ bp = (unsigned long) frame;
++ } else {
++ ops->address(data, addr, bp == 0);
++ }
++ }
++ stack++;
++ }
++ return bp;
+ }
+
+ void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
+- unsigned long *stack,
++ unsigned long *stack, unsigned long bp,
+ const struct stacktrace_ops *ops, void *data)
+ {
+ const unsigned cpu = get_cpu();
+@@ -227,36 +269,28 @@ void dump_trace(struct task_struct *tsk,
+
+ if (!tsk)
+ tsk = current;
++ tinfo = task_thread_info(tsk);
+
+ if (!stack) {
+ unsigned long dummy;
+ stack = &dummy;
+ if (tsk && tsk != current)
+- stack = (unsigned long *)tsk->thread.rsp;
++ stack = (unsigned long *)tsk->thread.sp;
+ }
+
+- /*
+- * Print function call entries within a stack. 'cond' is the
+- * "end of stackframe" condition, that the 'stack++'
+- * iteration will eventually trigger.
+- */
+-#define HANDLE_STACK(cond) \
+- do while (cond) { \
+- unsigned long addr = *stack++; \
+- /* Use unlocked access here because except for NMIs \
+- we should be already protected against module unloads */ \
+- if (__kernel_text_address(addr)) { \
+- /* \
+- * If the address is either in the text segment of the \
+- * kernel, or in the region which contains vmalloc'ed \
+- * memory, it *may* be the address of a calling \
+- * routine; if so, print it so that someone tracing \
+- * down the cause of the crash will be able to figure \
+- * out the call path that was taken. \
+- */ \
+- ops->address(data, addr); \
+- } \
+- } while (0)
++#ifdef CONFIG_FRAME_POINTER
++ if (!bp) {
++ if (tsk == current) {
++ /* Grab bp right from our regs */
++ asm("movq %%rbp, %0" : "=r" (bp):);
++ } else {
++ /* bp is the last reg pushed by switch_to */
++ bp = *(unsigned long *) tsk->thread.sp;
++ }
++ }
++#endif
++
++
+
+ /*
+ * Print function call entries in all stacks, starting at the
+@@ -272,7 +306,9 @@ void dump_trace(struct task_struct *tsk,
+ if (estack_end) {
+ if (ops->stack(data, id) < 0)
+ break;
+- HANDLE_STACK (stack < estack_end);
++
++ bp = print_context_stack(tinfo, stack, bp, ops,
++ data, estack_end);
+ ops->stack(data, "<EOE>");
+ /*
+ * We link to the next stack via the
+@@ -290,7 +326,8 @@ void dump_trace(struct task_struct *tsk,
+ if (stack >= irqstack && stack < irqstack_end) {
+ if (ops->stack(data, "IRQ") < 0)
+ break;
+- HANDLE_STACK (stack < irqstack_end);
++ bp = print_context_stack(tinfo, stack, bp,
++ ops, data, irqstack_end);
+ /*
+ * We link to the next stack (which would be
+ * the process stack normally) the last
+@@ -308,9 +345,7 @@ void dump_trace(struct task_struct *tsk,
+ /*
+ * This handles the process stack:
+ */
+- tinfo = task_thread_info(tsk);
+- HANDLE_STACK (valid_stack_ptr(tinfo, stack));
+-#undef HANDLE_STACK
++ bp = print_context_stack(tinfo, stack, bp, ops, data, NULL);
+ put_cpu();
+ }
+ EXPORT_SYMBOL(dump_trace);
+@@ -333,10 +368,10 @@ static int print_trace_stack(void *data,
+ return 0;
+ }
+
+-static void print_trace_address(void *data, unsigned long addr)
++static void print_trace_address(void *data, unsigned long addr, int reliable)
+ {
+ touch_nmi_watchdog();
+- printk_address(addr);
++ printk_address(addr, reliable);
+ }
+
+ static const struct stacktrace_ops print_trace_ops = {
+@@ -347,15 +382,17 @@ static const struct stacktrace_ops print
+ };
+
+ void
+-show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack)
++show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack,
++ unsigned long bp)
+ {
+ printk("\nCall Trace:\n");
+- dump_trace(tsk, regs, stack, &print_trace_ops, NULL);
++ dump_trace(tsk, regs, stack, bp, &print_trace_ops, NULL);
+ printk("\n");
+ }
+
+ static void
+-_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *rsp)
++_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *sp,
++ unsigned long bp)
+ {
+ unsigned long *stack;
+ int i;
+@@ -366,14 +403,14 @@ _show_stack(struct task_struct *tsk, str
+ // debugging aid: "show_stack(NULL, NULL);" prints the
+ // back trace for this cpu.
+
+- if (rsp == NULL) {
++ if (sp == NULL) {
+ if (tsk)
+- rsp = (unsigned long *)tsk->thread.rsp;
++ sp = (unsigned long *)tsk->thread.sp;
+ else
+- rsp = (unsigned long *)&rsp;
++ sp = (unsigned long *)&sp;
+ }
+
+- stack = rsp;
++ stack = sp;
+ for(i=0; i < kstack_depth_to_print; i++) {
+ if (stack >= irqstack && stack <= irqstack_end) {
+ if (stack == irqstack_end) {
+@@ -389,12 +426,12 @@ _show_stack(struct task_struct *tsk, str
+ printk(" %016lx", *stack++);
+ touch_nmi_watchdog();
+ }
+- show_trace(tsk, regs, rsp);
++ show_trace(tsk, regs, sp, bp);
+ }
+
+-void show_stack(struct task_struct *tsk, unsigned long * rsp)
++void show_stack(struct task_struct *tsk, unsigned long * sp)
+ {
+- _show_stack(tsk, NULL, rsp);
++ _show_stack(tsk, NULL, sp, 0);
+ }
+
+ /*
+@@ -403,13 +440,19 @@ void show_stack(struct task_struct *tsk,
+ void dump_stack(void)
+ {
+ unsigned long dummy;
++ unsigned long bp = 0;
++
++#ifdef CONFIG_FRAME_POINTER
++ if (!bp)
++ asm("movq %%rbp, %0" : "=r" (bp):);
++#endif
+
+ printk("Pid: %d, comm: %.20s %s %s %.*s\n",
+ current->pid, current->comm, print_tainted(),
+ init_utsname()->release,
+ (int)strcspn(init_utsname()->version, " "),
+ init_utsname()->version);
+- show_trace(NULL, NULL, &dummy);
++ show_trace(NULL, NULL, &dummy, bp);
+ }
+
+ EXPORT_SYMBOL(dump_stack);
+@@ -417,12 +460,15 @@ EXPORT_SYMBOL(dump_stack);
+ void show_registers(struct pt_regs *regs)
+ {
+ int i;
+- int in_kernel = !user_mode(regs);
+- unsigned long rsp;
++ unsigned long sp;
+ const int cpu = smp_processor_id();
+ struct task_struct *cur = cpu_pda(cpu)->pcurrent;
++ u8 *ip;
++ unsigned int code_prologue = code_bytes * 43 / 64;
++ unsigned int code_len = code_bytes;
+
+- rsp = regs->rsp;
++ sp = regs->sp;
++ ip = (u8 *) regs->ip - code_prologue;
+ printk("CPU %d ", cpu);
+ __show_regs(regs);
+ printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
+@@ -432,45 +478,43 @@ void show_registers(struct pt_regs *regs
+ * When in-kernel, we also print out the stack and code at the
+ * time of the fault..
+ */
+- if (in_kernel) {
++ if (!user_mode(regs)) {
++ unsigned char c;
+ printk("Stack: ");
+- _show_stack(NULL, regs, (unsigned long*)rsp);
++ _show_stack(NULL, regs, (unsigned long *)sp, regs->bp);
++ printk("\n");
+
+- printk("\nCode: ");
+- if (regs->rip < PAGE_OFFSET)
+- goto bad;
+-
+- for (i=0; i<20; i++) {
+- unsigned char c;
+- if (__get_user(c, &((unsigned char*)regs->rip)[i])) {
+-bad:
++ printk(KERN_EMERG "Code: ");
++ if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
++ /* try starting at RIP */
++ ip = (u8 *) regs->ip;
++ code_len = code_len - code_prologue + 1;
++ }
++ for (i = 0; i < code_len; i++, ip++) {
++ if (ip < (u8 *)PAGE_OFFSET ||
++ probe_kernel_address(ip, c)) {
+ printk(" Bad RIP value.");
+ break;
+ }
+- printk("%02x ", c);
++ if (ip == (u8 *)regs->ip)
++ printk("<%02x> ", c);
++ else
++ printk("%02x ", c);
+ }
+ }
+ printk("\n");
+ }
+
+-int is_valid_bugaddr(unsigned long rip)
++int is_valid_bugaddr(unsigned long ip)
+ {
+ unsigned short ud2;
+
+- if (__copy_from_user(&ud2, (const void __user *) rip, sizeof(ud2)))
++ if (__copy_from_user(&ud2, (const void __user *) ip, sizeof(ud2)))
+ return 0;
+
+ return ud2 == 0x0b0f;
+ }
+
+-#ifdef CONFIG_BUG
+-void out_of_line_bug(void)
+-{
+- BUG();
+-}
+-EXPORT_SYMBOL(out_of_line_bug);
+-#endif
+-
+ static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
+ static int die_owner = -1;
+ static unsigned int die_nest_count;
+@@ -498,7 +542,7 @@ unsigned __kprobes long oops_begin(void)
+ return flags;
+ }
+
+-void __kprobes oops_end(unsigned long flags)
++void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
+ {
+ die_owner = -1;
+ bust_spinlocks(0);
+@@ -507,12 +551,17 @@ void __kprobes oops_end(unsigned long fl
+ /* Nest count reaches zero, release the lock. */
+ __raw_spin_unlock(&die_lock);
+ raw_local_irq_restore(flags);
++ if (!regs) {
++ oops_exit();
++ return;
++ }
+ if (panic_on_oops)
+ panic("Fatal exception");
+ oops_exit();
++ do_exit(signr);
+ }
+
+-void __kprobes __die(const char * str, struct pt_regs * regs, long err)
++int __kprobes __die(const char * str, struct pt_regs * regs, long err)
+ {
+ static int die_counter;
+ printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter);
+@@ -526,15 +575,17 @@ void __kprobes __die(const char * str, s
+ printk("DEBUG_PAGEALLOC");
+ #endif
+ printk("\n");
+- notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV);
++ if (notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
++ return 1;
+ show_registers(regs);
+ add_taint(TAINT_DIE);
+ /* Executive summary in case the oops scrolled away */
+ printk(KERN_ALERT "RIP ");
+- printk_address(regs->rip);
+- printk(" RSP <%016lx>\n", regs->rsp);
++ printk_address(regs->ip, 1);
++ printk(" RSP <%016lx>\n", regs->sp);
+ if (kexec_should_crash(current))
+ crash_kexec(regs);
++ return 0;
+ }
+
+ void die(const char * str, struct pt_regs * regs, long err)
+@@ -542,11 +593,11 @@ void die(const char * str, struct pt_reg
+ unsigned long flags = oops_begin();
+
+ if (!user_mode(regs))
+- report_bug(regs->rip, regs);
++ report_bug(regs->ip, regs);
+
+- __die(str, regs, err);
+- oops_end(flags);
+- do_exit(SIGSEGV);
++ if (__die(str, regs, err))
++ regs = NULL;
++ oops_end(flags, regs, SIGSEGV);
+ }
+
+ #if defined(CONFIG_X86_LOCAL_APIC) || defined(CONFIG_SYSCTL)
+@@ -564,10 +615,10 @@ void __kprobes die_nmi(char *str, struct
+ crash_kexec(regs);
+ if (do_panic || panic_on_oops)
+ panic("Non maskable interrupt");
+- oops_end(flags);
++ oops_end(flags, NULL, SIGBUS);
+ nmi_exit();
+ local_irq_enable();
+- do_exit(SIGSEGV);
++ do_exit(SIGBUS);
+ }
+ #endif
+
+@@ -592,11 +643,14 @@ static void __kprobes do_trap(int trapnr
+ tsk->thread.trap_no = trapnr;
+
+ if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
+- printk_ratelimit())
++ printk_ratelimit()) {
+ printk(KERN_INFO
+- "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n",
++ "%s[%d] trap %s ip:%lx sp:%lx error:%lx",
+ tsk->comm, tsk->pid, str,
+- regs->rip, regs->rsp, error_code);
++ regs->ip, regs->sp, error_code);
++ print_vma_addr(" in ", regs->ip);
++ printk("\n");
++ }
+
+ if (info)
+ force_sig_info(signr, info, tsk);
+@@ -606,19 +660,12 @@ static void __kprobes do_trap(int trapnr
+ }
+
+
+- /* kernel trap */
+- {
+- const struct exception_table_entry *fixup;
+- fixup = search_exception_tables(regs->rip);
+- if (fixup)
+- regs->rip = fixup->fixup;
+- else {
+- tsk->thread.error_code = error_code;
+- tsk->thread.trap_no = trapnr;
+- die(str, regs, error_code);
+- }
+- return;
++ if (!fixup_exception(regs)) {
++ tsk->thread.error_code = error_code;
++ tsk->thread.trap_no = trapnr;
++ die(str, regs, error_code);
+ }
++ return;
+ }
+
+ #define DO_ERROR(trapnr, signr, str, name) \
+@@ -647,10 +694,10 @@ asmlinkage void do_##name(struct pt_regs
+ do_trap(trapnr, signr, str, regs, error_code, &info); \
+ }
+
+-DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->rip)
++DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
+ DO_ERROR( 4, SIGSEGV, "overflow", overflow)
+ DO_ERROR( 5, SIGSEGV, "bounds", bounds)
+-DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->rip)
++DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip)
+ DO_ERROR( 7, SIGSEGV, "device not available", device_not_available)
+ DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
+ DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
+@@ -698,32 +745,28 @@ asmlinkage void __kprobes do_general_pro
+ tsk->thread.trap_no = 13;
+
+ if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
+- printk_ratelimit())
++ printk_ratelimit()) {
+ printk(KERN_INFO
+- "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n",
++ "%s[%d] general protection ip:%lx sp:%lx error:%lx",
+ tsk->comm, tsk->pid,
+- regs->rip, regs->rsp, error_code);
++ regs->ip, regs->sp, error_code);
++ print_vma_addr(" in ", regs->ip);
++ printk("\n");
++ }
+
+ force_sig(SIGSEGV, tsk);
+ return;
+ }
+
+- /* kernel gp */
+- {
+- const struct exception_table_entry *fixup;
+- fixup = search_exception_tables(regs->rip);
+- if (fixup) {
+- regs->rip = fixup->fixup;
+- return;
+- }
++ if (fixup_exception(regs))
++ return;
+
+- tsk->thread.error_code = error_code;
+- tsk->thread.trap_no = 13;
+- if (notify_die(DIE_GPF, "general protection fault", regs,
+- error_code, 13, SIGSEGV) == NOTIFY_STOP)
+- return;
+- die("general protection fault", regs, error_code);
+- }
++ tsk->thread.error_code = error_code;
++ tsk->thread.trap_no = 13;
++ if (notify_die(DIE_GPF, "general protection fault", regs,
++ error_code, 13, SIGSEGV) == NOTIFY_STOP)
++ return;
++ die("general protection fault", regs, error_code);
+ }
+
+ static __kprobes void
+@@ -833,15 +876,15 @@ asmlinkage __kprobes struct pt_regs *syn
+ {
+ struct pt_regs *regs = eregs;
+ /* Did already sync */
+- if (eregs == (struct pt_regs *)eregs->rsp)
++ if (eregs == (struct pt_regs *)eregs->sp)
+ ;
+ /* Exception from user space */
+ else if (user_mode(eregs))
+ regs = task_pt_regs(current);
+ /* Exception from kernel and interrupts are enabled. Move to
+ kernel process stack. */
+- else if (eregs->eflags & X86_EFLAGS_IF)
+- regs = (struct pt_regs *)(eregs->rsp -= sizeof(struct pt_regs));
++ else if (eregs->flags & X86_EFLAGS_IF)
++ regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs));
+ if (eregs != regs)
+ *regs = *eregs;
+ return regs;
+@@ -859,6 +902,12 @@ asmlinkage void __kprobes do_debug(struc
+
+ get_debugreg(condition, 6);
+
++ /*
++ * The processor cleared BTF, so don't mark that we need it set.
++ */
++ clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR);
++ tsk->thread.debugctlmsr = 0;
++
+ if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
+ SIGTRAP) == NOTIFY_STOP)
+ return;
+@@ -874,27 +923,14 @@ asmlinkage void __kprobes do_debug(struc
+
+ tsk->thread.debugreg6 = condition;
+
+- /* Mask out spurious TF errors due to lazy TF clearing */
++
++ /*
++ * Single-stepping through TF: make sure we ignore any events in
++ * kernel space (but re-enable TF when returning to user mode).
++ */
+ if (condition & DR_STEP) {
+- /*
+- * The TF error should be masked out only if the current
+- * process is not traced and if the TRAP flag has been set
+- * previously by a tracing process (condition detected by
+- * the PT_DTRACE flag); remember that the i386 TRAP flag
+- * can be modified by the process itself in user mode,
+- * allowing programs to debug themselves without the ptrace()
+- * interface.
+- */
+ if (!user_mode(regs))
+ goto clear_TF_reenable;
+- /*
+- * Was the TF flag set by a debugger? If so, clear it now,
+- * so that register information is correct.
+- */
+- if (tsk->ptrace & PT_DTRACE) {
+- regs->eflags &= ~TF_MASK;
+- tsk->ptrace &= ~PT_DTRACE;
+- }
+ }
+
+ /* Ok, finally something we can handle */
+@@ -903,7 +939,7 @@ asmlinkage void __kprobes do_debug(struc
+ info.si_signo = SIGTRAP;
+ info.si_errno = 0;
+ info.si_code = TRAP_BRKPT;
+- info.si_addr = user_mode(regs) ? (void __user *)regs->rip : NULL;
++ info.si_addr = user_mode(regs) ? (void __user *)regs->ip : NULL;
+ force_sig_info(SIGTRAP, &info, tsk);
+
+ clear_dr7:
+@@ -913,18 +949,15 @@ clear_dr7:
+
+ clear_TF_reenable:
+ set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
+- regs->eflags &= ~TF_MASK;
++ regs->flags &= ~X86_EFLAGS_TF;
+ preempt_conditional_cli(regs);
+ }
+
+ static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
+ {
+- const struct exception_table_entry *fixup;
+- fixup = search_exception_tables(regs->rip);
+- if (fixup) {
+- regs->rip = fixup->fixup;
++ if (fixup_exception(regs))
+ return 1;
+- }
++
+ notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE);
+ /* Illegal floating point operation in the kernel */
+ current->thread.trap_no = trapnr;
+@@ -939,7 +972,7 @@ static int kernel_math_error(struct pt_r
+ */
+ asmlinkage void do_coprocessor_error(struct pt_regs *regs)
+ {
+- void __user *rip = (void __user *)(regs->rip);
++ void __user *ip = (void __user *)(regs->ip);
+ struct task_struct * task;
+ siginfo_t info;
+ unsigned short cwd, swd;
+@@ -959,7 +992,7 @@ asmlinkage void do_coprocessor_error(str
+ info.si_signo = SIGFPE;
+ info.si_errno = 0;
+ info.si_code = __SI_FAULT;
+- info.si_addr = rip;
++ info.si_addr = ip;
+ /*
+ * (~cwd & swd) will mask out exceptions that are not set to unmasked
+ * status. 0x3f is the exception bits in these regs, 0x200 is the
+@@ -1008,7 +1041,7 @@ asmlinkage void bad_intr(void)
+
+ asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
+ {
+- void __user *rip = (void __user *)(regs->rip);
++ void __user *ip = (void __user *)(regs->ip);
+ struct task_struct * task;
+ siginfo_t info;
+ unsigned short mxcsr;
+@@ -1028,7 +1061,7 @@ asmlinkage void do_simd_coprocessor_erro
+ info.si_signo = SIGFPE;
+ info.si_errno = 0;
+ info.si_code = __SI_FAULT;
+- info.si_addr = rip;
++ info.si_addr = ip;
+ /*
+ * The SIMD FPU exceptions are handled a little differently, as there
+ * is only a single status/control register. Thus, to determine which
+@@ -1092,13 +1125,14 @@ asmlinkage void math_state_restore(void)
+ task_thread_info(me)->status |= TS_USEDFPU;
+ me->fpu_counter++;
+ }
++EXPORT_SYMBOL_GPL(math_state_restore);
+
+
+ /*
+ * NB. All these are "interrupt gates" (i.e. events_mask is set) because we
+ * specify <dpl>|4 in the second field.
+ */
+-static trap_info_t __cpuinitdata trap_table[] = {
++static const trap_info_t __cpuinitconst trap_table[] = {
+ { 0, 0|4, __KERNEL_CS, (unsigned long)divide_error },
+ { 1, 0|4, __KERNEL_CS, (unsigned long)debug },
+ { 3, 3|4, __KERNEL_CS, (unsigned long)int3 },
+@@ -1169,3 +1203,14 @@ static int __init kstack_setup(char *s)
+ return 0;
+ }
+ early_param("kstack", kstack_setup);
++
++
++static int __init code_bytes_setup(char *s)
++{
++ code_bytes = simple_strtoul(s, NULL, 0);
++ if (code_bytes > 8192)
++ code_bytes = 8192;
++
++ return 1;
++}
++__setup("code_bytes=", code_bytes_setup);
+--- sle11-2009-06-29.orig/arch/x86/kernel/vsyscall_64-xen.c 2009-02-16 16:18:36.000000000 +0100
++++ sle11-2009-06-29/arch/x86/kernel/vsyscall_64-xen.c 2009-03-16 16:33:40.000000000 +0100
+@@ -43,12 +43,7 @@
+ #include <asm/vgtod.h>
+
+ #define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
+-#define __syscall_clobber "r11","rcx","memory"
+-#define __pa_vsymbol(x) \
+- ({unsigned long v; \
+- extern char __vsyscall_0; \
+- asm("" : "=r" (v) : "0" (x)); \
+- ((v - VSYSCALL_START) + __pa_symbol(&__vsyscall_0)); })
++#define __syscall_clobber "r11","cx","memory"
+
+ /*
+ * vsyscall_gtod_data contains data that is :
+@@ -102,7 +97,7 @@ static __always_inline void do_get_tz(st
+ static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
+ {
+ int ret;
+- asm volatile("vsysc2: syscall"
++ asm volatile("syscall"
+ : "=a" (ret)
+ : "0" (__NR_gettimeofday),"D" (tv),"S" (tz)
+ : __syscall_clobber );
+@@ -112,7 +107,7 @@ static __always_inline int gettimeofday(
+ static __always_inline long time_syscall(long *t)
+ {
+ long secs;
+- asm volatile("vsysc1: syscall"
++ asm volatile("syscall"
+ : "=a" (secs)
+ : "0" (__NR_time),"D" (t) : __syscall_clobber);
+ return secs;
+@@ -190,7 +185,7 @@ time_t __vsyscall(1) vtime(time_t *t)
+ long __vsyscall(2)
+ vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
+ {
+- unsigned int dummy, p;
++ unsigned int p;
+ unsigned long j = 0;
+
+ /* Fast cache - only recompute value once per jiffies and avoid
+@@ -205,7 +200,7 @@ vgetcpu(unsigned *cpu, unsigned *node, s
+ p = tcache->blob[1];
+ } else if (__vgetcpu_mode == VGETCPU_RDTSCP) {
+ /* Load per CPU data from RDTSCP */
+- rdtscp(dummy, dummy, p);
++ native_read_tscp(&p);
+ } else {
+ /* Load per CPU data from GDT */
+ asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
+@@ -228,42 +223,11 @@ long __vsyscall(3) venosys_1(void)
+
+ #ifdef CONFIG_SYSCTL
+
+-#define SYSCALL 0x050f
+-#define NOP2 0x9090
+-
+-/*
+- * NOP out syscall in vsyscall page when not needed.
+- */
+-static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
+- void __user *buffer, size_t *lenp, loff_t *ppos)
++static int
++vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
++ void __user *buffer, size_t *lenp, loff_t *ppos)
+ {
+- extern u16 vsysc1, vsysc2;
+- u16 __iomem *map1;
+- u16 __iomem *map2;
+- int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
+- if (!write)
+- return ret;
+- /* gcc has some trouble with __va(__pa()), so just do it this
+- way. */
+- map1 = ioremap(__pa_vsymbol(&vsysc1), 2);
+- if (!map1)
+- return -ENOMEM;
+- map2 = ioremap(__pa_vsymbol(&vsysc2), 2);
+- if (!map2) {
+- ret = -ENOMEM;
+- goto out;
+- }
+- if (!vsyscall_gtod_data.sysctl_enabled) {
+- writew(SYSCALL, map1);
+- writew(SYSCALL, map2);
+- } else {
+- writew(NOP2, map1);
+- writew(NOP2, map2);
+- }
+- iounmap(map2);
+-out:
+- iounmap(map1);
+- return ret;
++ return proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
+ }
+
+ static ctl_table kernel_table2[] = {
+@@ -279,7 +243,6 @@ static ctl_table kernel_root_table2[] =
+ .child = kernel_table2 },
+ {}
+ };
+-
+ #endif
+
+ /* Assume __initcall executes before all user space. Hopefully kmod
+@@ -301,7 +264,7 @@ static void __cpuinit vsyscall_set_cpu(i
+ d |= cpu;
+ d |= (node & 0xf) << 12;
+ d |= (node >> 4) << 48;
+- if (HYPERVISOR_update_descriptor(virt_to_machine(cpu_gdt(cpu)
++ if (HYPERVISOR_update_descriptor(virt_to_machine(get_cpu_gdt_table(cpu)
+ + GDT_ENTRY_PER_CPU),
+ d))
+ BUG();
+@@ -322,7 +285,7 @@ cpu_vsyscall_notifier(struct notifier_bl
+ return NOTIFY_DONE;
+ }
+
+-static void __init map_vsyscall(void)
++void __init map_vsyscall(void)
+ {
+ extern char __vsyscall_0;
+ unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
+@@ -338,7 +301,6 @@ static int __init vsyscall_init(void)
+ BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
+ BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
+ BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu));
+- map_vsyscall();
+ #ifdef CONFIG_XEN
+ vsyscall_gtod_data.sysctl_enabled = 0; /* disable vgettimeofay() */
+ if (boot_cpu_has(X86_FEATURE_RDTSCP))
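The vgetcpu() hunks above read the per-CPU GDT entry's segment limit with
lsl (or use native_read_tscp()); vsyscall_set_cpu() packs the CPU number
into limit bits 0-11, the node's low nibble into bits 12-15, and the
remaining node bits into descriptor bits 48-51, which are limit bits 16-19.
A minimal decode sketch for the value p loaded above (the helper name is
ours, not part of the patch):

	/* Unpack the lsl/rdtscp value the way vsyscall_set_cpu() packed it. */
	static inline void decode_percpu_limit(unsigned int p,
					       unsigned *cpu, unsigned *node)
	{
		if (cpu)
			*cpu = p & 0xfff;	/* d |= cpu */
		if (node)
			*node = p >> 12;	/* d |= (node & 0xf) << 12,
						   d |= (node >> 4) << 48 */
	}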
+--- sle11-2009-06-29.orig/arch/x86/kernel/xen_entry_64.S 2009-06-29 15:14:52.000000000 +0200
++++ /dev/null 1970-01-01 00:00:00.000000000 +0000
+@@ -1,36 +0,0 @@
+-/*
+- * Copied from arch/xen/i386/kernel/entry.S
+- */
+-/* Offsets into shared_info_t. */
+-#define evtchn_upcall_pending /* 0 */
+-#define evtchn_upcall_mask 1
+-
+-#define sizeof_vcpu_shift 6
+-
+-#ifdef CONFIG_SMP
+-//#define preempt_disable(reg) incl threadinfo_preempt_count(reg)
+-//#define preempt_enable(reg) decl threadinfo_preempt_count(reg)
+-#define preempt_disable(reg)
+-#define preempt_enable(reg)
+-#define XEN_GET_VCPU_INFO(reg) preempt_disable(%rbp) ; \
+- movq %gs:pda_cpunumber,reg ; \
+- shl $32, reg ; \
+- shr $32-sizeof_vcpu_shift,reg ; \
+- addq HYPERVISOR_shared_info,reg
+-#define XEN_PUT_VCPU_INFO(reg) preempt_enable(%rbp) ; \
+-#define XEN_PUT_VCPU_INFO_fixup .byte 0xff,0xff,0xff
+-#else
+-#define XEN_GET_VCPU_INFO(reg) movq HYPERVISOR_shared_info,reg
+-#define XEN_PUT_VCPU_INFO(reg)
+-#define XEN_PUT_VCPU_INFO_fixup
+-#endif
+-
+-#define XEN_LOCKED_BLOCK_EVENTS(reg) movb $1,evtchn_upcall_mask(reg)
+-#define XEN_LOCKED_UNBLOCK_EVENTS(reg) movb $0,evtchn_upcall_mask(reg)
+-#define XEN_BLOCK_EVENTS(reg) XEN_GET_VCPU_INFO(reg) ; \
+- XEN_LOCKED_BLOCK_EVENTS(reg) ; \
+- XEN_PUT_VCPU_INFO(reg)
+-#define XEN_UNBLOCK_EVENTS(reg) XEN_GET_VCPU_INFO(reg) ; \
+- XEN_LOCKED_UNBLOCK_EVENTS(reg) ; \
+- XEN_PUT_VCPU_INFO(reg)
+-#define XEN_TEST_PENDING(reg) testb $0xFF,evtchn_upcall_pending(reg)
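The deleted xen_entry_64.S helpers above masked and unmasked event-channel
upcalls in the shared info page; the shl $32 / shr $32-sizeof_vcpu_shift
pair is just cpu * 64, the size of one vcpu_info slot. A rough C rendering
under the usual xen/interface/xen.h layout (illustrative only; the kernel's
real accessors live in the mach-xen headers):

	static inline void xen_block_events(void)	/* XEN_BLOCK_EVENTS */
	{
		vcpu_info_t *v =
			&HYPERVISOR_shared_info->vcpu_info[smp_processor_id()];

		v->evtchn_upcall_mask = 1;
		barrier();
	}

	static inline void xen_unblock_events(void)	/* XEN_UNBLOCK_EVENTS */
	{
		vcpu_info_t *v =
			&HYPERVISOR_shared_info->vcpu_info[smp_processor_id()];

		barrier();
		v->evtchn_upcall_mask = 0;
		/* callers still need to check v->evtchn_upcall_pending */
	}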
+--- sle11-2009-06-29.orig/arch/x86/mach-xen/setup.c 2009-02-16 16:17:21.000000000 +0100
++++ sle11-2009-06-29/arch/x86/mach-xen/setup.c 2009-03-16 16:33:40.000000000 +0100
+@@ -161,15 +161,12 @@ void __init machine_specific_arch_setup(
+
+ /* Do an early initialization of the fixmap area */
+ {
+- extern pte_t swapper_pg_pmd[PTRS_PER_PTE];
++ extern pte_t swapper_pg_fixmap[PTRS_PER_PTE];
+ unsigned long addr = __fix_to_virt(FIX_EARLYCON_MEM_BASE);
+- pgd_t *pgd = (pgd_t *)xen_start_info->pt_base;
+- pud_t *pud = pud_offset(pgd + pgd_index(addr), addr);
++ pud_t *pud = pud_offset(swapper_pg_dir + pgd_index(addr), addr);
+ pmd_t *pmd = pmd_offset(pud, addr);
+
+- swapper_pg_dir = pgd;
+- init_mm.pgd = pgd;
+- make_lowmem_page_readonly(swapper_pg_pmd, XENFEAT_writable_page_tables);
+- set_pmd(pmd, __pmd(__pa_symbol(swapper_pg_pmd) | _PAGE_TABLE));
++ make_lowmem_page_readonly(swapper_pg_fixmap, XENFEAT_writable_page_tables);
++ set_pmd(pmd, __pmd(__pa_symbol(swapper_pg_fixmap) | _PAGE_TABLE));
+ }
+ }
+--- /dev/null 1970-01-01 00:00:00.000000000 +0000
++++ sle11-2009-06-29/arch/x86/mm/fault-xen.c 2009-03-16 16:33:40.000000000 +0100
+@@ -0,0 +1,1025 @@
++/*
++ * Copyright (C) 1995 Linus Torvalds
++ * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
++ */
++
++#include <linux/signal.h>
++#include <linux/sched.h>
++#include <linux/kernel.h>
++#include <linux/errno.h>
++#include <linux/string.h>
++#include <linux/types.h>
++#include <linux/ptrace.h>
++#include <linux/mman.h>
++#include <linux/mm.h>
++#include <linux/smp.h>
++#include <linux/interrupt.h>
++#include <linux/init.h>
++#include <linux/tty.h>
++#include <linux/vt_kern.h> /* For unblank_screen() */
++#include <linux/compiler.h>
++#include <linux/highmem.h>
++#include <linux/bootmem.h> /* for max_low_pfn */
++#include <linux/vmalloc.h>
++#include <linux/module.h>
++#include <linux/kprobes.h>
++#include <linux/uaccess.h>
++#include <linux/kdebug.h>
++
++#include <asm/system.h>
++#include <asm/desc.h>
++#include <asm/segment.h>
++#include <asm/pgalloc.h>
++#include <asm/smp.h>
++#include <asm/tlbflush.h>
++#include <asm/proto.h>
++#include <asm-generic/sections.h>
++
++/*
++ * Page fault error code bits
++ * bit 0 == 0 means no page found, 1 means protection fault
++ * bit 1 == 0 means read, 1 means write
++ * bit 2 == 0 means kernel, 1 means user-mode
++ * bit 3 == 1 means use of reserved bit detected
++ * bit 4 == 1 means fault was an instruction fetch
++ */
++#define PF_PROT (1<<0)
++#define PF_WRITE (1<<1)
++#define PF_USER (1<<2)
++#define PF_RSVD (1<<3)
++#define PF_INSTR (1<<4)
++
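++/*
++ * For example, error_code == (PF_USER|PF_WRITE) == 0x6 is a user-mode
++ * write to a not-present page, while (PF_PROT|PF_INSTR) == 0x11 is an
++ * instruction fetch that hit a protection violation such as an NX page.
++ */
++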
++static inline int notify_page_fault(struct pt_regs *regs)
++{
++#ifdef CONFIG_KPROBES
++ int ret = 0;
++
++ /* kprobe_running() needs smp_processor_id() */
++#ifdef CONFIG_X86_32
++ if (!user_mode_vm(regs)) {
++#else
++ if (!user_mode(regs)) {
++#endif
++ preempt_disable();
++ if (kprobe_running() && kprobe_fault_handler(regs, 14))
++ ret = 1;
++ preempt_enable();
++ }
++
++ return ret;
++#else
++ return 0;
++#endif
++}
++
++/*
++ * X86_32
++ * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
++ * Check that here and ignore it.
++ *
++ * X86_64
++ * Sometimes the CPU reports invalid exceptions on prefetch.
++ * Check that here and ignore it.
++ *
++ * Opcode checker based on code by Richard Brunner
++ */
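++/*
++ * For example, "prefetchnta (%rax)" encodes as 0f 18 00: the loop below
++ * steps over segment-override (0x26/0x2e/0x36/0x3e, 0x64-0x67), REX
++ * (0x40-0x4f, 64-bit only) and lock/rep (0xf0/0xf2/0xf3) prefixes and
++ * then recognizes the 0f 0d / 0f 18 opcode pair.
++ */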
++static int is_prefetch(struct pt_regs *regs, unsigned long addr,
++ unsigned long error_code)
++{
++ unsigned char *instr;
++ int scan_more = 1;
++ int prefetch = 0;
++ unsigned char *max_instr;
++
++ /*
++ * If it was a exec (instruction fetch) fault on NX page, then
++ * do not ignore the fault:
++ */
++ if (error_code & PF_INSTR)
++ return 0;
++
++ instr = (unsigned char *)convert_ip_to_linear(current, regs);
++ max_instr = instr + 15;
++
++ if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
++ return 0;
++
++ while (scan_more && instr < max_instr) {
++ unsigned char opcode;
++ unsigned char instr_hi;
++ unsigned char instr_lo;
++
++ if (probe_kernel_address(instr, opcode))
++ break;
++
++ instr_hi = opcode & 0xf0;
++ instr_lo = opcode & 0x0f;
++ instr++;
++
++ switch (instr_hi) {
++ case 0x20:
++ case 0x30:
++ /*
++ * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
++ * In X86_64 long mode, the CPU will signal invalid
++ * opcode if some of these prefixes are present so
++ * X86_64 will never get here anyway
++ */
++ scan_more = ((instr_lo & 7) == 0x6);
++ break;
++#ifdef CONFIG_X86_64
++ case 0x40:
++ /*
++ * In AMD64 long mode 0x40..0x4F are valid REX prefixes
++ * Need to figure out under what instruction mode the
++ * instruction was issued. Could check the LDT for lm,
++ * but for now it's good enough to assume that long
++ * mode only uses well known segments or kernel.
++ */
++ scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
++ break;
++#endif
++ case 0x60:
++ /* 0x64 thru 0x67 are valid prefixes in all modes. */
++ scan_more = (instr_lo & 0xC) == 0x4;
++ break;
++ case 0xF0:
++ /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
++ scan_more = !instr_lo || (instr_lo>>1) == 1;
++ break;
++ case 0x00:
++ /* Prefetch instruction is 0x0F0D or 0x0F18 */
++ scan_more = 0;
++
++ if (probe_kernel_address(instr, opcode))
++ break;
++ prefetch = (instr_lo == 0xF) &&
++ (opcode == 0x0D || opcode == 0x18);
++ break;
++ default:
++ scan_more = 0;
++ break;
++ }
++ }
++ return prefetch;
++}
++
++static void force_sig_info_fault(int si_signo, int si_code,
++ unsigned long address, struct task_struct *tsk)
++{
++ siginfo_t info;
++
++ info.si_signo = si_signo;
++ info.si_errno = 0;
++ info.si_code = si_code;
++ info.si_addr = (void __user *)address;
++ force_sig_info(si_signo, &info, tsk);
++}
++
++#ifdef CONFIG_X86_64
++static int bad_address(void *p)
++{
++ unsigned long dummy;
++ return probe_kernel_address((unsigned long *)p, dummy);
++}
++#endif
++
++static void dump_pagetable(unsigned long address)
++{
++#ifdef CONFIG_X86_32
++ __typeof__(pte_val(__pte(0))) page;
++
++ page = read_cr3();
++ page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT];
++#ifdef CONFIG_X86_PAE
++ printk("*pdpt = %016Lx ", page);
++ if ((page & _PAGE_PRESENT)
++ && mfn_to_local_pfn(page >> PAGE_SHIFT) < max_low_pfn) {
++ page = mfn_to_pfn(page >> PAGE_SHIFT);
++ page <<= PAGE_SHIFT;
++ page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT)
++ & (PTRS_PER_PMD - 1)];
++ printk(KERN_CONT "*pde = %016Lx ", page);
++ page &= ~_PAGE_NX;
++ }
++#else
++ printk("*pde = %08lx ", page);
++#endif
++
++ /*
++ * We must not directly access the pte in the highpte
++ * case if the page table is located in highmem.
++ * And let's rather not kmap-atomic the pte, just in case
++ * it's allocated already.
++ */
++ if ((page & _PAGE_PRESENT)
++ && mfn_to_local_pfn(page >> PAGE_SHIFT) < max_low_pfn
++ && !(page & _PAGE_PSE)) {
++ page = mfn_to_pfn(page >> PAGE_SHIFT);
++ page <<= PAGE_SHIFT;
++ page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT)
++ & (PTRS_PER_PTE - 1)];
++ printk(KERN_CONT "*pte = %0*Lx ", sizeof(page)*2, (u64)page);
++ }
++
++ printk(KERN_CONT "\n");
++#else /* CONFIG_X86_64 */
++ pgd_t *pgd;
++ pud_t *pud;
++ pmd_t *pmd;
++ pte_t *pte;
++
++ pgd = (pgd_t *)read_cr3();
++
++ pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
++ pgd += pgd_index(address);
++ if (bad_address(pgd)) goto bad;
++ printk("PGD %lx ", pgd_val(*pgd));
++ if (!pgd_present(*pgd)) goto ret;
++
++ pud = pud_offset(pgd, address);
++ if (bad_address(pud)) goto bad;
++ printk(KERN_CONT "PUD %lx ", pud_val(*pud));
++ if (!pud_present(*pud) || pud_large(*pud))
++ goto ret;
++
++ pmd = pmd_offset(pud, address);
++ if (bad_address(pmd)) goto bad;
++ printk(KERN_CONT "PMD %lx ", pmd_val(*pmd));
++ if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;
++
++ pte = pte_offset_kernel(pmd, address);
++ if (bad_address(pte)) goto bad;
++ printk(KERN_CONT "PTE %lx", pte_val(*pte));
++ret:
++ printk(KERN_CONT "\n");
++ return;
++bad:
++ printk("BAD\n");
++#endif
++}
++
++#ifdef CONFIG_X86_32
++static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
++{
++ unsigned index = pgd_index(address);
++ pgd_t *pgd_k;
++ pud_t *pud, *pud_k;
++ pmd_t *pmd, *pmd_k;
++
++ pgd += index;
++ pgd_k = init_mm.pgd + index;
++
++ if (!pgd_present(*pgd_k))
++ return NULL;
++
++ /*
++ * set_pgd(pgd, *pgd_k); here would be useless on PAE
++ * and redundant with the set_pmd() on non-PAE. As would
++ * set_pud.
++ */
++
++ pud = pud_offset(pgd, address);
++ pud_k = pud_offset(pgd_k, address);
++ if (!pud_present(*pud_k))
++ return NULL;
++
++ pmd = pmd_offset(pud, address);
++ pmd_k = pmd_offset(pud_k, address);
++ if (!pmd_present(*pmd_k))
++ return NULL;
++ if (!pmd_present(*pmd)) {
++ bool lazy = x86_read_percpu(xen_lazy_mmu);
++
++ x86_write_percpu(xen_lazy_mmu, false);
++#if CONFIG_XEN_COMPAT > 0x030002
++ set_pmd(pmd, *pmd_k);
++#else
++ /*
++ * When running on older Xen we must launder *pmd_k through
++ * pmd_val() to ensure that _PAGE_PRESENT is correctly set.
++ */
++ set_pmd(pmd, __pmd(pmd_val(*pmd_k)));
++#endif
++ x86_write_percpu(xen_lazy_mmu, lazy);
++ } else
++ BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
++ return pmd_k;
++}
++#endif
++
++#ifdef CONFIG_X86_64
++static const char errata93_warning[] =
++KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
++KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
++KERN_ERR "******* Please consider a BIOS update.\n"
++KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
++#endif
++
++/* Workaround for K8 erratum #93 & buggy BIOS.
++ BIOS SMM functions are required to use a specific workaround
++ to avoid corruption of the 64bit RIP register on C stepping K8.
++ A lot of BIOS that didn't get tested properly miss this.
++ The OS sees this as a page fault with the upper 32bits of RIP cleared.
++ Try to work around it here.
++ Note we only handle faults in kernel here.
++ Does nothing for X86_32
++ */
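++/*
++ * Illustration: a RIP of 0xffffffff80201234 seen as 0x0000000080201234.
++ * The fault address equals regs->ip, or'ing in the upper 32 bits lands
++ * back in the kernel text range, and execution resumes there.
++ */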
++static int is_errata93(struct pt_regs *regs, unsigned long address)
++{
++#ifdef CONFIG_X86_64
++ static int warned;
++ if (address != regs->ip)
++ return 0;
++ if ((address >> 32) != 0)
++ return 0;
++ address |= 0xffffffffUL << 32;
++ if ((address >= (u64)_stext && address <= (u64)_etext) ||
++ (address >= MODULES_VADDR && address <= MODULES_END)) {
++ if (!warned) {
++ printk(errata93_warning);
++ warned = 1;
++ }
++ regs->ip = address;
++ return 1;
++ }
++#endif
++ return 0;
++}
++
++/*
++ * Work around K8 erratum #100: K8 in compat mode occasionally jumps to illegal
++ * addresses >4GB. We catch this in the page fault handler because these
++ * addresses are not reachable. Just detect this case and return. Any code
++ * segment in LDT is compatibility mode.
++ */
++static int is_errata100(struct pt_regs *regs, unsigned long address)
++{
++#ifdef CONFIG_X86_64
++ if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
++ (address >> 32))
++ return 1;
++#endif
++ return 0;
++}
++
++void do_invalid_op(struct pt_regs *, unsigned long);
++
++static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
++{
++#ifdef CONFIG_X86_F00F_BUG
++ unsigned long nr;
++ /*
++ * Pentium F0 0F C7 C8 bug workaround.
++ */
++ if (boot_cpu_data.f00f_bug) {
++ nr = (address - idt_descr.address) >> 3;
++
++ if (nr == 6) {
++ do_invalid_op(regs, 0);
++ return 1;
++ }
++ }
++#endif
++ return 0;
++}
++
++static void show_fault_oops(struct pt_regs *regs, unsigned long error_code,
++ unsigned long address)
++{
++#ifdef CONFIG_X86_32
++ if (!oops_may_print())
++ return;
++#endif
++
++#ifdef CONFIG_X86_PAE
++ if (error_code & PF_INSTR) {
++ unsigned int level;
++ pte_t *pte = lookup_address(address, &level);
++
++ if (pte && pte_present(*pte) && !pte_exec(*pte))
++ printk(KERN_CRIT "kernel tried to execute "
++ "NX-protected page - exploit attempt? "
++ "(uid: %d)\n", current->uid);
++ }
++#endif
++
++ printk(KERN_ALERT "BUG: unable to handle kernel ");
++ if (address < PAGE_SIZE)
++ printk(KERN_CONT "NULL pointer dereference");
++ else
++ printk(KERN_CONT "paging request");
++#ifdef CONFIG_X86_32
++ printk(KERN_CONT " at %08lx\n", address);
++#else
++ printk(KERN_CONT " at %016lx\n", address);
++#endif
++ printk(KERN_ALERT "IP:");
++ printk_address(regs->ip, 1);
++ dump_pagetable(address);
++}
++
++#ifdef CONFIG_X86_64
++static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
++ unsigned long error_code)
++{
++ unsigned long flags = oops_begin();
++ struct task_struct *tsk;
++
++ printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
++ current->comm, address);
++ dump_pagetable(address);
++ tsk = current;
++ tsk->thread.cr2 = address;
++ tsk->thread.trap_no = 14;
++ tsk->thread.error_code = error_code;
++ if (__die("Bad pagetable", regs, error_code))
++ regs = NULL;
++ oops_end(flags, regs, SIGKILL);
++}
++#endif
++
++static int spurious_fault_check(unsigned long error_code, pte_t *pte)
++{
++ if ((error_code & PF_WRITE) && !pte_write(*pte))
++ return 0;
++ if ((error_code & PF_INSTR) && !pte_exec(*pte))
++ return 0;
++
++ return 1;
++}
++
++/*
++ * Handle a spurious fault caused by a stale TLB entry. This allows
++ * us to lazily refresh the TLB when increasing the permissions of a
++ * kernel page (RO -> RW or NX -> X). Doing it eagerly is very
++ * expensive since that implies doing a full cross-processor TLB
++ * flush, even if no stale TLB entries exist on other processors.
++ * There are no security implications to leaving a stale TLB when
++ * increasing the permissions on a page.
++ */
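++/*
++ * Example: a kernel page just flipped from RO to RW may still have a
++ * stale read-only TLB entry on this CPU; the walk below then finds the
++ * PTE already writable and the fault is dismissed so the write retries.
++ */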
++static int spurious_fault(unsigned long address,
++ unsigned long error_code)
++{
++ pgd_t *pgd;
++ pud_t *pud;
++ pmd_t *pmd;
++ pte_t *pte;
++
++ /* Reserved-bit violation or user access to kernel space? */
++ if (error_code & (PF_USER | PF_RSVD))
++ return 0;
++
++ pgd = init_mm.pgd + pgd_index(address);
++ if (!pgd_present(*pgd))
++ return 0;
++
++ pud = pud_offset(pgd, address);
++ if (!pud_present(*pud))
++ return 0;
++
++ if (pud_large(*pud))
++ return spurious_fault_check(error_code, (pte_t *) pud);
++
++ pmd = pmd_offset(pud, address);
++ if (!pmd_present(*pmd))
++ return 0;
++
++ if (pmd_large(*pmd))
++ return spurious_fault_check(error_code, (pte_t *) pmd);
++
++ pte = pte_offset_kernel(pmd, address);
++ if (!pte_present(*pte))
++ return 0;
++
++ return spurious_fault_check(error_code, pte);
++}
++
++/*
++ * X86_32
++ * Handle a fault on the vmalloc or module mapping area
++ *
++ * X86_64
++ * Handle a fault on the vmalloc area
++ *
++ * This assumes no large pages in there.
++ */
++static int vmalloc_fault(unsigned long address)
++{
++#ifdef CONFIG_X86_32
++ unsigned long pgd_paddr;
++ pmd_t *pmd_k;
++ pte_t *pte_k;
++ /*
++ * Synchronize this task's top level page-table
++ * with the 'reference' page table.
++ *
++ * Do _not_ use "current" here. We might be inside
++ * an interrupt in the middle of a task switch..
++ */
++ pgd_paddr = read_cr3();
++ pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
++ if (!pmd_k)
++ return -1;
++ pte_k = pte_offset_kernel(pmd_k, address);
++ if (!pte_present(*pte_k))
++ return -1;
++ return 0;
++#else
++ pgd_t *pgd, *pgd_ref;
++ pud_t *pud, *pud_ref;
++ pmd_t *pmd, *pmd_ref;
++ pte_t *pte, *pte_ref;
++
++ /* Make sure we are in vmalloc area */
++ if (!(address >= VMALLOC_START && address < VMALLOC_END))
++ return -1;
++
++ /* Copy kernel mappings over when needed. This can also
++	   happen within a race in page table update. In the latter
++ case just flush. */
++
++ /* On Xen the line below does not always work. Needs investigating! */
++ /*pgd = pgd_offset(current->mm ?: &init_mm, address);*/
++ pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK);
++ pgd += pgd_index(address);
++ pgd_ref = pgd_offset_k(address);
++ if (pgd_none(*pgd_ref))
++ return -1;
++ if (pgd_none(*pgd))
++ set_pgd(pgd, *pgd_ref);
++ else
++ BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
++
++ /* Below here mismatches are bugs because these lower tables
++ are shared */
++
++ pud = pud_offset(pgd, address);
++ pud_ref = pud_offset(pgd_ref, address);
++ if (pud_none(*pud_ref))
++ return -1;
++ if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
++ BUG();
++ pmd = pmd_offset(pud, address);
++ pmd_ref = pmd_offset(pud_ref, address);
++ if (pmd_none(*pmd_ref))
++ return -1;
++ if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
++ BUG();
++ pte_ref = pte_offset_kernel(pmd_ref, address);
++ if (!pte_present(*pte_ref))
++ return -1;
++ pte = pte_offset_kernel(pmd, address);
++ /* Don't use pte_page here, because the mappings can point
++ outside mem_map, and the NUMA hash lookup cannot handle
++ that. */
++ if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
++ BUG();
++ return 0;
++#endif
++}
++
++int show_unhandled_signals = 1;
++
++/*
++ * This routine handles page faults. It determines the address,
++ * and the problem, and then passes it off to one of the appropriate
++ * routines.
++ */
++#ifdef CONFIG_X86_64
++asmlinkage
++#endif
++void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
++{
++ struct task_struct *tsk;
++ struct mm_struct *mm;
++ struct vm_area_struct *vma;
++ unsigned long address;
++ int write, si_code;
++ int fault;
++#ifdef CONFIG_X86_64
++ unsigned long flags;
++#endif
++
++ /*
++ * We can fault from pretty much anywhere, with unknown IRQ state.
++ */
++ trace_hardirqs_fixup();
++
++ /* Set the "privileged fault" bit to something sane. */
++ if (user_mode_vm(regs))
++ error_code |= PF_USER;
++ else
++ error_code &= ~PF_USER;
++
++ tsk = current;
++ mm = tsk->mm;
++ prefetchw(&mm->mmap_sem);
++
++ /* get the address */
++ address = read_cr2();
++
++ si_code = SEGV_MAPERR;
++
++ if (notify_page_fault(regs))
++ return;
++
++ /*
++ * We fault-in kernel-space virtual memory on-demand. The
++ * 'reference' page table is init_mm.pgd.
++ *
++ * NOTE! We MUST NOT take any locks for this case. We may
++ * be in an interrupt or a critical region, and should
++ * only copy the information from the master page table,
++ * nothing more.
++ *
++ * This verifies that the fault happens in kernel space
++ * (error_code & 4) == 0, and that the fault was not a
++ * protection error (error_code & 9) == 0.
++ */
++#ifdef CONFIG_X86_32
++ if (unlikely(address >= TASK_SIZE)) {
++#else
++ if (unlikely(address >= TASK_SIZE64)) {
++#endif
++ /* Faults in hypervisor area can never be patched up. */
++#if defined(CONFIG_X86_XEN)
++ if (address >= hypervisor_virt_start)
++ goto bad_area_nosemaphore;
++#elif defined(CONFIG_X86_64_XEN)
++ if (address >= HYPERVISOR_VIRT_START
++ && address < HYPERVISOR_VIRT_END)
++ goto bad_area_nosemaphore;
++#endif
++ if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
++ vmalloc_fault(address) >= 0)
++ return;
++
++ /* Can handle a stale RO->RW TLB */
++ if (spurious_fault(address, error_code))
++ return;
++
++ /*
++ * Don't take the mm semaphore here. If we fixup a prefetch
++ * fault we could otherwise deadlock.
++ */
++ goto bad_area_nosemaphore;
++ }
++
++
++#ifdef CONFIG_X86_32
++ /* It's safe to allow irq's after cr2 has been saved and the vmalloc
++ fault has been handled. */
++ if (regs->flags & (X86_EFLAGS_IF|VM_MASK))
++ local_irq_enable();
++
++ /*
++ * If we're in an interrupt, have no user context or are running in an
++ * atomic region then we must not take the fault.
++ */
++ if (in_atomic() || !mm)
++ goto bad_area_nosemaphore;
++#else /* CONFIG_X86_64 */
++ if (likely(regs->flags & X86_EFLAGS_IF))
++ local_irq_enable();
++
++ if (unlikely(error_code & PF_RSVD))
++ pgtable_bad(address, regs, error_code);
++
++ /*
++ * If we're in an interrupt, have no user context or are running in an
++ * atomic region then we must not take the fault.
++ */
++ if (unlikely(in_atomic() || !mm))
++ goto bad_area_nosemaphore;
++
++ /*
++ * User-mode registers count as a user access even for any
++ * potential system fault or CPU buglet.
++ */
++ if (user_mode_vm(regs))
++ error_code |= PF_USER;
++again:
++#endif
++ /* When running in the kernel we expect faults to occur only to
++ * addresses in user space. All other faults represent errors in the
++ * kernel and should generate an OOPS. Unfortunately, in the case of an
++ * erroneous fault occurring in a code path which already holds mmap_sem
++ * we will deadlock attempting to validate the fault against the
++ * address space. Luckily the kernel only validly references user
++ * space from well defined areas of code, which are listed in the
++ * exceptions table.
++ *
++ * As the vast majority of faults will be valid we will only perform
++ * the source reference check when there is a possibility of a deadlock.
++ * Attempt to lock the address space, if we cannot we then validate the
++ * source. If this is invalid we can skip the address space check,
++ * thus avoiding the deadlock.
++ */
++ if (!down_read_trylock(&mm->mmap_sem)) {
++ if ((error_code & PF_USER) == 0 &&
++ !search_exception_tables(regs->ip))
++ goto bad_area_nosemaphore;
++ down_read(&mm->mmap_sem);
++ }
++
++ vma = find_vma(mm, address);
++ if (!vma)
++ goto bad_area;
++ if (vma->vm_start <= address)
++ goto good_area;
++ if (!(vma->vm_flags & VM_GROWSDOWN))
++ goto bad_area;
++ if (error_code & PF_USER) {
++ /*
++ * Accessing the stack below %sp is always a bug.
++ * The large cushion allows instructions like enter
++ * and pusha to work. ("enter $65535,$31" pushes
++ * 32 pointers and then decrements %sp by 65535.)
++ */
++ if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp)
++ goto bad_area;
++ }
++ if (expand_stack(vma, address))
++ goto bad_area;
++/*
++ * Ok, we have a good vm_area for this memory access, so
++ * we can handle it..
++ */
++good_area:
++ si_code = SEGV_ACCERR;
++ write = 0;
++ switch (error_code & (PF_PROT|PF_WRITE)) {
++ default: /* 3: write, present */
++ /* fall through */
++ case PF_WRITE: /* write, not present */
++ if (!(vma->vm_flags & VM_WRITE))
++ goto bad_area;
++ write++;
++ break;
++ case PF_PROT: /* read, present */
++ goto bad_area;
++ case 0: /* read, not present */
++ if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
++ goto bad_area;
++ }
++
++#ifdef CONFIG_X86_32
++survive:
++#endif
++ /*
++ * If for any reason at all we couldn't handle the fault,
++ * make sure we exit gracefully rather than endlessly redo
++ * the fault.
++ */
++ fault = handle_mm_fault(mm, vma, address, write);
++ if (unlikely(fault & VM_FAULT_ERROR)) {
++ if (fault & VM_FAULT_OOM)
++ goto out_of_memory;
++ else if (fault & VM_FAULT_SIGBUS)
++ goto do_sigbus;
++ BUG();
++ }
++ if (fault & VM_FAULT_MAJOR)
++ tsk->maj_flt++;
++ else
++ tsk->min_flt++;
++
++#ifdef CONFIG_X86_32
++ /*
++ * Did it hit the DOS screen memory VA from vm86 mode?
++ */
++ if (v8086_mode(regs)) {
++ unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
++ if (bit < 32)
++ tsk->thread.screen_bitmap |= 1 << bit;
++ }
++#endif
++ up_read(&mm->mmap_sem);
++ return;
++
++/*
++ * Something tried to access memory that isn't in our memory map..
++ * Fix it, but check if it's kernel or user first..
++ */
++bad_area:
++ up_read(&mm->mmap_sem);
++
++bad_area_nosemaphore:
++ /* User mode accesses just cause a SIGSEGV */
++ if (error_code & PF_USER) {
++ /*
++ * It's possible to have interrupts off here.
++ */
++ local_irq_enable();
++
++ /*
++ * Valid to do another page fault here because this one came
++ * from user space.
++ */
++ if (is_prefetch(regs, address, error_code))
++ return;
++
++ if (is_errata100(regs, address))
++ return;
++
++ if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
++ printk_ratelimit()) {
++ printk(
++#ifdef CONFIG_X86_32
++ "%s%s[%d]: segfault at %lx ip %08lx sp %08lx error %lx",
++#else
++ "%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx",
++#endif
++ task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
++ tsk->comm, task_pid_nr(tsk), address, regs->ip,
++ regs->sp, error_code);
++ print_vma_addr(" in ", regs->ip);
++ printk("\n");
++ }
++
++ tsk->thread.cr2 = address;
++ /* Kernel addresses are always protection faults */
++ tsk->thread.error_code = error_code | (address >= TASK_SIZE);
++ tsk->thread.trap_no = 14;
++ force_sig_info_fault(SIGSEGV, si_code, address, tsk);
++ return;
++ }
++
++ if (is_f00f_bug(regs, address))
++ return;
++
++no_context:
++ /* Are we prepared to handle this kernel fault? */
++ if (fixup_exception(regs))
++ return;
++
++ /*
++ * X86_32
++ * Valid to do another page fault here, because if this fault
++ * had been triggered by is_prefetch fixup_exception would have
++ * handled it.
++ *
++ * X86_64
++ * Hall of shame of CPU/BIOS bugs.
++ */
++ if (is_prefetch(regs, address, error_code))
++ return;
++
++ if (is_errata93(regs, address))
++ return;
++
++/*
++ * Oops. The kernel tried to access some bad page. We'll have to
++ * terminate things with extreme prejudice.
++ */
++#ifdef CONFIG_X86_32
++ bust_spinlocks(1);
++#else
++ flags = oops_begin();
++#endif
++
++ show_fault_oops(regs, error_code, address);
++
++ tsk->thread.cr2 = address;
++ tsk->thread.trap_no = 14;
++ tsk->thread.error_code = error_code;
++
++#ifdef CONFIG_X86_32
++ die("Oops", regs, error_code);
++ bust_spinlocks(0);
++ do_exit(SIGKILL);
++#else
++ if (__die("Oops", regs, error_code))
++ regs = NULL;
++ /* Executive summary in case the body of the oops scrolled away */
++ printk(KERN_EMERG "CR2: %016lx\n", address);
++ oops_end(flags, regs, SIGKILL);
++#endif
++
++/*
++ * We ran out of memory, or some other thing happened to us that made
++ * us unable to handle the page fault gracefully.
++ */
++out_of_memory:
++ up_read(&mm->mmap_sem);
++ if (is_global_init(tsk)) {
++ yield();
++#ifdef CONFIG_X86_32
++ down_read(&mm->mmap_sem);
++ goto survive;
++#else
++ goto again;
++#endif
++ }
++
++ printk("VM: killing process %s\n", tsk->comm);
++ if (error_code & PF_USER)
++ do_group_exit(SIGKILL);
++ goto no_context;
++
++do_sigbus:
++ up_read(&mm->mmap_sem);
++
++ /* Kernel mode? Handle exceptions or die */
++ if (!(error_code & PF_USER))
++ goto no_context;
++#ifdef CONFIG_X86_32
++ /* User space => ok to do another page fault */
++ if (is_prefetch(regs, address, error_code))
++ return;
++#endif
++ tsk->thread.cr2 = address;
++ tsk->thread.error_code = error_code;
++ tsk->thread.trap_no = 14;
++ force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
++}
++
++DEFINE_SPINLOCK(pgd_lock);
++LIST_HEAD(pgd_list);
++
++void vmalloc_sync_all(void)
++{
++#ifdef CONFIG_X86_32
++ /*
++ * Note that races in the updates of insync and start aren't
++ * problematic: insync can only get set bits added, and updates to
++ * start are only improving performance (without affecting correctness
++ * if undone).
++ * XEN: To work on PAE, we need to iterate over PMDs rather than PGDs.
++ * This change works just fine with 2-level paging too.
++ */
++#define sync_index(a) ((a) >> PMD_SHIFT)
++ static DECLARE_BITMAP(insync, PTRS_PER_PGD*PTRS_PER_PMD);
++ static unsigned long start = TASK_SIZE;
++ unsigned long address;
++
++ if (SHARED_KERNEL_PMD)
++ return;
++
++ BUILD_BUG_ON(TASK_SIZE & ~PMD_MASK);
++ for (address = start;
++ address < hypervisor_virt_start;
++ address += PMD_SIZE) {
++ if (!test_bit(sync_index(address), insync)) {
++ unsigned long flags;
++ struct page *page;
++
++ spin_lock_irqsave(&pgd_lock, flags);
++ /* XEN: failure path assumes non-empty pgd_list. */
++ if (unlikely(list_empty(&pgd_list))) {
++ spin_unlock_irqrestore(&pgd_lock, flags);
++ return;
++ }
++ list_for_each_entry(page, &pgd_list, lru) {
++ if (!vmalloc_sync_one(page_address(page),
++ address))
++ break;
++ }
++ spin_unlock_irqrestore(&pgd_lock, flags);
++ if (!page)
++ set_bit(sync_index(address), insync);
++ }
++ if (address == start && test_bit(sync_index(address), insync))
++ start = address + PMD_SIZE;
++ }
++#else /* CONFIG_X86_64 */
++ /*
++ * Note that races in the updates of insync and start aren't
++ * problematic: insync can only get set bits added, and updates to
++ * start are only improving performance (without affecting correctness
++ * if undone).
++ */
++ static DECLARE_BITMAP(insync, PTRS_PER_PGD);
++ static unsigned long start = VMALLOC_START & PGDIR_MASK;
++ unsigned long address;
++
++ for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
++ if (!test_bit(pgd_index(address), insync)) {
++ const pgd_t *pgd_ref = pgd_offset_k(address);
++ unsigned long flags;
++ struct page *page;
++
++ if (pgd_none(*pgd_ref))
++ continue;
++ spin_lock_irqsave(&pgd_lock, flags);
++ list_for_each_entry(page, &pgd_list, lru) {
++ pgd_t *pgd;
++ pgd = (pgd_t *)page_address(page) + pgd_index(address);
++ if (pgd_none(*pgd))
++ set_pgd(pgd, *pgd_ref);
++ else
++ BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
++ }
++ spin_unlock_irqrestore(&pgd_lock, flags);
++ set_bit(pgd_index(address), insync);
++ }
++ if (address == start)
++ start = address + PGDIR_SIZE;
++ }
++ /* Check that there is no need to do the same for the modules area. */
++ BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
++ BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
++ (__START_KERNEL & PGDIR_MASK)));
++#endif
++}
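A sizing note for the insync bitmap in vmalloc_sync_all() above: on 32-bit
Xen with PAE the loop advances by PMD_SIZE (2 MiB), so PTRS_PER_PGD *
PTRS_PER_PMD = 4 * 512 = 2048 bits cover the whole 4 GiB address space;
with 2-level paging the folded pmd makes PMD_SIZE 4 MiB and the product
1024 * 1, which still spans 4 GiB as the comment promises. The 64-bit path
keeps upstream's one bit per PGDIR_SIZE (512 GiB) region.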
+--- sle11-2009-06-29.orig/arch/x86/mm/fault_32-xen.c 2009-02-16 16:18:36.000000000 +0100
++++ /dev/null 1970-01-01 00:00:00.000000000 +0000
+@@ -1,757 +0,0 @@
+-/*
+- * linux/arch/i386/mm/fault.c
+- *
+- * Copyright (C) 1995 Linus Torvalds
+- */
+-
+-#include <linux/signal.h>
+-#include <linux/sched.h>
+-#include <linux/kernel.h>
+-#include <linux/errno.h>
+-#include <linux/string.h>
+-#include <linux/types.h>
+-#include <linux/ptrace.h>
+-#include <linux/mman.h>
+-#include <linux/mm.h>
+-#include <linux/smp.h>
+-#include <linux/interrupt.h>
+-#include <linux/init.h>
+-#include <linux/tty.h>
+-#include <linux/vt_kern.h> /* For unblank_screen() */
+-#include <linux/highmem.h>
+-#include <linux/bootmem.h> /* for max_low_pfn */
+-#include <linux/vmalloc.h>
+-#include <linux/module.h>
+-#include <linux/kprobes.h>
+-#include <linux/uaccess.h>
+-#include <linux/kdebug.h>
+-#include <linux/kprobes.h>
+-
+-#include <asm/system.h>
+-#include <asm/desc.h>
+-#include <asm/segment.h>
+-
+-extern void die(const char *,struct pt_regs *,long);
+-
+-#ifdef CONFIG_KPROBES
+-static inline int notify_page_fault(struct pt_regs *regs)
+-{
+- int ret = 0;
+-
+- /* kprobe_running() needs smp_processor_id() */
+- if (!user_mode_vm(regs)) {
+- preempt_disable();
+- if (kprobe_running() && kprobe_fault_handler(regs, 14))
+- ret = 1;
+- preempt_enable();
+- }
+-
+- return ret;
+-}
+-#else
+-static inline int notify_page_fault(struct pt_regs *regs)
+-{
+- return 0;
+-}
+-#endif
+-
+-/*
+- * Return EIP plus the CS segment base. The segment limit is also
+- * adjusted, clamped to the kernel/user address space (whichever is
+- * appropriate), and returned in *eip_limit.
+- *
+- * The segment is checked, because it might have been changed by another
+- * task between the original faulting instruction and here.
+- *
+- * If CS is no longer a valid code segment, or if EIP is beyond the
+- * limit, or if it is a kernel address when CS is not a kernel segment,
+- * then the returned value will be greater than *eip_limit.
+- *
+- * This is slow, but is very rarely executed.
+- */
+-static inline unsigned long get_segment_eip(struct pt_regs *regs,
+- unsigned long *eip_limit)
+-{
+- unsigned long eip = regs->eip;
+- unsigned seg = regs->xcs & 0xffff;
+- u32 seg_ar, seg_limit, base, *desc;
+-
+- /* Unlikely, but must come before segment checks. */
+- if (unlikely(regs->eflags & VM_MASK)) {
+- base = seg << 4;
+- *eip_limit = base + 0xffff;
+- return base + (eip & 0xffff);
+- }
+-
+- /* The standard kernel/user address space limit. */
+- *eip_limit = user_mode(regs) ? USER_DS.seg : KERNEL_DS.seg;
+-
+- /* By far the most common cases. */
+- if (likely(SEGMENT_IS_FLAT_CODE(seg)))
+- return eip;
+-
+- /* Check the segment exists, is within the current LDT/GDT size,
+- that kernel/user (ring 0..3) has the appropriate privilege,
+- that it's a code segment, and get the limit. */
+- __asm__ ("larl %3,%0; lsll %3,%1"
+- : "=&r" (seg_ar), "=r" (seg_limit) : "0" (0), "rm" (seg));
+- if ((~seg_ar & 0x9800) || eip > seg_limit) {
+- *eip_limit = 0;
+- return 1; /* So that returned eip > *eip_limit. */
+- }
+-
+- /* Get the GDT/LDT descriptor base.
+- When you look for races in this code remember that
+- LDT and other horrors are only used in user space. */
+- if (seg & (1<<2)) {
+- /* Must lock the LDT while reading it. */
+-		mutex_lock(&current->mm->context.lock);
+- desc = current->mm->context.ldt;
+- desc = (void *)desc + (seg & ~7);
+- } else {
+- /* Must disable preemption while reading the GDT. */
+- desc = (u32 *)get_cpu_gdt_table(get_cpu());
+- desc = (void *)desc + (seg & ~7);
+- }
+-
+- /* Decode the code segment base from the descriptor */
+- base = get_desc_base((unsigned long *)desc);
+-
+- if (seg & (1<<2)) {
+-		mutex_unlock(&current->mm->context.lock);
+- } else
+- put_cpu();
+-
+- /* Adjust EIP and segment limit, and clamp at the kernel limit.
+- It's legitimate for segments to wrap at 0xffffffff. */
+- seg_limit += base;
+- if (seg_limit < *eip_limit && seg_limit >= base)
+- *eip_limit = seg_limit;
+- return eip + base;
+-}
+-
+-/*
+- * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
+- * Check that here and ignore it.
+- */
+-static int __is_prefetch(struct pt_regs *regs, unsigned long addr)
+-{
+- unsigned long limit;
+- unsigned char *instr = (unsigned char *)get_segment_eip (regs, &limit);
+- int scan_more = 1;
+- int prefetch = 0;
+- int i;
+-
+- for (i = 0; scan_more && i < 15; i++) {
+- unsigned char opcode;
+- unsigned char instr_hi;
+- unsigned char instr_lo;
+-
+- if (instr > (unsigned char *)limit)
+- break;
+- if (probe_kernel_address(instr, opcode))
+- break;
+-
+- instr_hi = opcode & 0xf0;
+- instr_lo = opcode & 0x0f;
+- instr++;
+-
+- switch (instr_hi) {
+- case 0x20:
+- case 0x30:
+- /* Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. */
+- scan_more = ((instr_lo & 7) == 0x6);
+- break;
+-
+- case 0x60:
+- /* 0x64 thru 0x67 are valid prefixes in all modes. */
+- scan_more = (instr_lo & 0xC) == 0x4;
+- break;
+- case 0xF0:
+- /* 0xF0, 0xF2, and 0xF3 are valid prefixes */
+- scan_more = !instr_lo || (instr_lo>>1) == 1;
+- break;
+- case 0x00:
+- /* Prefetch instruction is 0x0F0D or 0x0F18 */
+- scan_more = 0;
+- if (instr > (unsigned char *)limit)
+- break;
+- if (probe_kernel_address(instr, opcode))
+- break;
+- prefetch = (instr_lo == 0xF) &&
+- (opcode == 0x0D || opcode == 0x18);
+- break;
+- default:
+- scan_more = 0;
+- break;
+- }
+- }
+- return prefetch;
+-}
+-
+-static inline int is_prefetch(struct pt_regs *regs, unsigned long addr,
+- unsigned long error_code)
+-{
+- if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
+- boot_cpu_data.x86 >= 6)) {
+- /* Catch an obscure case of prefetch inside an NX page. */
+- if (nx_enabled && (error_code & 16))
+- return 0;
+- return __is_prefetch(regs, addr);
+- }
+- return 0;
+-}
+-
+-static noinline void force_sig_info_fault(int si_signo, int si_code,
+- unsigned long address, struct task_struct *tsk)
+-{
+- siginfo_t info;
+-
+- info.si_signo = si_signo;
+- info.si_errno = 0;
+- info.si_code = si_code;
+- info.si_addr = (void __user *)address;
+- force_sig_info(si_signo, &info, tsk);
+-}
+-
+-fastcall void do_invalid_op(struct pt_regs *, unsigned long);
+-
+-#ifdef CONFIG_X86_PAE
+-static void dump_fault_path(unsigned long address)
+-{
+- unsigned long *p, page;
+- unsigned long mfn;
+-
+- page = read_cr3();
+- p = (unsigned long *)__va(page);
+- p += (address >> 30) * 2;
+- printk(KERN_ALERT "%08lx -> *pde = %08lx:%08lx\n", page, p[1], p[0]);
+- if (p[0] & _PAGE_PRESENT) {
+- mfn = (p[0] >> PAGE_SHIFT) | (p[1] << 20);
+- page = mfn_to_pfn(mfn) << PAGE_SHIFT;
+- p = (unsigned long *)__va(page);
+- address &= 0x3fffffff;
+- p += (address >> 21) * 2;
+- printk(KERN_ALERT "%08lx -> *pme = %08lx:%08lx\n",
+- page, p[1], p[0]);
+- mfn = (p[0] >> PAGE_SHIFT) | (p[1] << 20);
+-#ifdef CONFIG_HIGHPTE
+- if (mfn_to_pfn(mfn) >= highstart_pfn)
+- return;
+-#endif
+- if ((p[0] & _PAGE_PRESENT) && !(p[0] & _PAGE_PSE)) {
+- page = mfn_to_pfn(mfn) << PAGE_SHIFT;
+- p = (unsigned long *) __va(page);
+- address &= 0x001fffff;
+- p += (address >> 12) * 2;
+- printk(KERN_ALERT "%08lx -> *pte = %08lx:%08lx\n",
+- page, p[1], p[0]);
+- }
+- }
+-}
+-#else
+-static void dump_fault_path(unsigned long address)
+-{
+- unsigned long page;
+-
+- page = read_cr3();
+- page = ((unsigned long *) __va(page))[address >> PGDIR_SHIFT];
+- printk(KERN_ALERT "*pde = ma %08lx pa %08lx\n", page,
+- machine_to_phys(page));
+- /*
+- * We must not directly access the pte in the highpte
+- * case if the page table is located in highmem.
+- * And lets rather not kmap-atomic the pte, just in case
+- * it's allocated already.
+- */
+- if ((machine_to_phys(page) >> PAGE_SHIFT) < max_low_pfn
+- && (page & _PAGE_PRESENT)
+- && !(page & _PAGE_PSE)) {
+- page = machine_to_phys(page & PAGE_MASK);
+- page = ((unsigned long *) __va(page))[(address >> PAGE_SHIFT)
+- & (PTRS_PER_PTE - 1)];
+- printk(KERN_ALERT "*pte = ma %08lx pa %08lx\n", page,
+- machine_to_phys(page));
+- }
+-}
+-#endif
+-
+-static int spurious_fault(struct pt_regs *regs,
+- unsigned long address,
+- unsigned long error_code)
+-{
+- pgd_t *pgd;
+- pud_t *pud;
+- pmd_t *pmd;
+- pte_t *pte;
+-
+- /* Reserved-bit violation or user access to kernel space? */
+- if (error_code & 0x0c)
+- return 0;
+-
+- pgd = init_mm.pgd + pgd_index(address);
+- if (!pgd_present(*pgd))
+- return 0;
+-
+- pud = pud_offset(pgd, address);
+- if (!pud_present(*pud))
+- return 0;
+-
+- pmd = pmd_offset(pud, address);
+- if (!pmd_present(*pmd))
+- return 0;
+-
+- pte = pte_offset_kernel(pmd, address);
+- if (!pte_present(*pte))
+- return 0;
+- if ((error_code & 0x02) && !pte_write(*pte))
+- return 0;
+-#ifdef CONFIG_X86_PAE
+- if ((error_code & 0x10) && (__pte_val(*pte) & _PAGE_NX))
+- return 0;
+-#endif
+-
+- return 1;
+-}
+-
+-static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
+-{
+- unsigned index = pgd_index(address);
+- pgd_t *pgd_k;
+- pud_t *pud, *pud_k;
+- pmd_t *pmd, *pmd_k;
+-
+- pgd += index;
+- pgd_k = init_mm.pgd + index;
+-
+- if (!pgd_present(*pgd_k))
+- return NULL;
+-
+- /*
+- * set_pgd(pgd, *pgd_k); here would be useless on PAE
+- * and redundant with the set_pmd() on non-PAE. As would
+- * set_pud.
+- */
+-
+- pud = pud_offset(pgd, address);
+- pud_k = pud_offset(pgd_k, address);
+- if (!pud_present(*pud_k))
+- return NULL;
+-
+- pmd = pmd_offset(pud, address);
+- pmd_k = pmd_offset(pud_k, address);
+- if (!pmd_present(*pmd_k))
+- return NULL;
+- if (!pmd_present(*pmd)) {
+- bool lazy = x86_read_percpu(xen_lazy_mmu);
+-
+- x86_write_percpu(xen_lazy_mmu, false);
+-#if CONFIG_XEN_COMPAT > 0x030002
+- set_pmd(pmd, *pmd_k);
+-#else
+- /*
+- * When running on older Xen we must launder *pmd_k through
+- * pmd_val() to ensure that _PAGE_PRESENT is correctly set.
+- */
+- set_pmd(pmd, __pmd(pmd_val(*pmd_k)));
+-#endif
+- x86_write_percpu(xen_lazy_mmu, lazy);
+- } else
+- BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
+- return pmd_k;
+-}
+-
+-/*
+- * Handle a fault on the vmalloc or module mapping area
+- *
+- * This assumes no large pages in there.
+- */
+-static inline int vmalloc_fault(unsigned long address)
+-{
+- unsigned long pgd_paddr;
+- pmd_t *pmd_k;
+- pte_t *pte_k;
+- /*
+- * Synchronize this task's top level page-table
+- * with the 'reference' page table.
+- *
+- * Do _not_ use "current" here. We might be inside
+- * an interrupt in the middle of a task switch..
+- */
+- pgd_paddr = read_cr3();
+- pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
+- if (!pmd_k)
+- return -1;
+- pte_k = pte_offset_kernel(pmd_k, address);
+- if (!pte_present(*pte_k))
+- return -1;
+- return 0;
+-}
+-
+-int show_unhandled_signals = 1;
+-
+-/*
+- * This routine handles page faults. It determines the address,
+- * and the problem, and then passes it off to one of the appropriate
+- * routines.
+- *
+- * error_code:
+- * bit 0 == 0 means no page found, 1 means protection fault
+- * bit 1 == 0 means read, 1 means write
+- * bit 2 == 0 means kernel, 1 means user-mode
+- * bit 3 == 1 means use of reserved bit detected
+- * bit 4 == 1 means fault was an instruction fetch
+- */
+-fastcall void __kprobes do_page_fault(struct pt_regs *regs,
+- unsigned long error_code)
+-{
+- struct task_struct *tsk;
+- struct mm_struct *mm;
+- struct vm_area_struct * vma;
+- unsigned long address;
+- int write, si_code;
+- int fault;
+-
+- /*
+- * We can fault from pretty much anywhere, with unknown IRQ state.
+- */
+- trace_hardirqs_fixup();
+-
+- /* get the address */
+- address = read_cr2();
+-
+- /* Set the "privileged fault" bit to something sane. */
+- error_code &= ~4;
+- error_code |= (regs->xcs & 2) << 1;
+- if (regs->eflags & X86_EFLAGS_VM)
+- error_code |= 4;
+-
+- tsk = current;
+-
+- si_code = SEGV_MAPERR;
+-
+- /*
+- * We fault-in kernel-space virtual memory on-demand. The
+- * 'reference' page table is init_mm.pgd.
+- *
+- * NOTE! We MUST NOT take any locks for this case. We may
+- * be in an interrupt or a critical region, and should
+- * only copy the information from the master page table,
+- * nothing more.
+- *
+- * This verifies that the fault happens in kernel space
+- * (error_code & 4) == 0, and that the fault was not a
+- * protection error (error_code & 9) == 0.
+- */
+- if (unlikely(address >= TASK_SIZE)) {
+-#ifdef CONFIG_XEN
+- /* Faults in hypervisor area can never be patched up. */
+- if (address >= hypervisor_virt_start)
+- goto bad_area_nosemaphore;
+-#endif
+- if (!(error_code & 0x0000000d) && vmalloc_fault(address) >= 0)
+- return;
+- /* Can take a spurious fault if mapping changes R/O -> R/W. */
+- if (spurious_fault(regs, address, error_code))
+- return;
+- if (notify_page_fault(regs))
+- return;
+- /*
+- * Don't take the mm semaphore here. If we fixup a prefetch
+- * fault we could otherwise deadlock.
+- */
+- goto bad_area_nosemaphore;
+- }
+-
+- if (notify_page_fault(regs))
+- return;
+-
+- /* It's safe to allow irq's after cr2 has been saved and the vmalloc
+- fault has been handled. */
+- if (regs->eflags & (X86_EFLAGS_IF|VM_MASK))
+- local_irq_enable();
+-
+- mm = tsk->mm;
+-
+- /*
+- * If we're in an interrupt, have no user context or are running in an
+- * atomic region then we must not take the fault..
+- */
+- if (in_atomic() || !mm)
+- goto bad_area_nosemaphore;
+-
+- /* When running in the kernel we expect faults to occur only to
+- * addresses in user space. All other faults represent errors in the
+- * kernel and should generate an OOPS. Unfortunately, in the case of an
+- * erroneous fault occurring in a code path which already holds mmap_sem
+- * we will deadlock attempting to validate the fault against the
+- * address space. Luckily the kernel only validly references user
+- * space from well defined areas of code, which are listed in the
+- * exceptions table.
+- *
+- * As the vast majority of faults will be valid we will only perform
+- * the source reference check when there is a possibility of a deadlock.
+- * Attempt to lock the address space, if we cannot we then validate the
+- * source. If this is invalid we can skip the address space check,
+- * thus avoiding the deadlock.
+- */
+- if (!down_read_trylock(&mm->mmap_sem)) {
+- if ((error_code & 4) == 0 &&
+- !search_exception_tables(regs->eip))
+- goto bad_area_nosemaphore;
+- down_read(&mm->mmap_sem);
+- }
+-
+- vma = find_vma(mm, address);
+- if (!vma)
+- goto bad_area;
+- if (vma->vm_start <= address)
+- goto good_area;
+- if (!(vma->vm_flags & VM_GROWSDOWN))
+- goto bad_area;
+- if (error_code & 4) {
+- /*
+- * Accessing the stack below %esp is always a bug.
+- * The large cushion allows instructions like enter
+- * and pusha to work. ("enter $65535,$31" pushes
+- * 32 pointers and then decrements %esp by 65535.)
+- */
+- if (address + 65536 + 32 * sizeof(unsigned long) < regs->esp)
+- goto bad_area;
+- }
+- if (expand_stack(vma, address))
+- goto bad_area;
+-/*
+- * Ok, we have a good vm_area for this memory access, so
+- * we can handle it..
+- */
+-good_area:
+- si_code = SEGV_ACCERR;
+- write = 0;
+- switch (error_code & 3) {
+- default: /* 3: write, present */
+- /* fall through */
+- case 2: /* write, not present */
+- if (!(vma->vm_flags & VM_WRITE))
+- goto bad_area;
+- write++;
+- break;
+- case 1: /* read, present */
+- goto bad_area;
+- case 0: /* read, not present */
+- if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
+- goto bad_area;
+- }
+-
+- survive:
+- /*
+- * If for any reason at all we couldn't handle the fault,
+- * make sure we exit gracefully rather than endlessly redo
+- * the fault.
+- */
+- fault = handle_mm_fault(mm, vma, address, write);
+- if (unlikely(fault & VM_FAULT_ERROR)) {
+- if (fault & VM_FAULT_OOM)
+- goto out_of_memory;
+- else if (fault & VM_FAULT_SIGBUS)
+- goto do_sigbus;
+- BUG();
+- }
+- if (fault & VM_FAULT_MAJOR)
+- tsk->maj_flt++;
+- else
+- tsk->min_flt++;
+-
+- /*
+- * Did it hit the DOS screen memory VA from vm86 mode?
+- */
+- if (regs->eflags & VM_MASK) {
+- unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
+- if (bit < 32)
+- tsk->thread.screen_bitmap |= 1 << bit;
+- }
+- up_read(&mm->mmap_sem);
+- return;
+-
+-/*
+- * Something tried to access memory that isn't in our memory map..
+- * Fix it, but check if it's kernel or user first..
+- */
+-bad_area:
+- up_read(&mm->mmap_sem);
+-
+-bad_area_nosemaphore:
+- /* User mode accesses just cause a SIGSEGV */
+- if (error_code & 4) {
+- /*
+- * It's possible to have interrupts off here.
+- */
+- local_irq_enable();
+-
+- /*
+- * Valid to do another page fault here because this one came
+- * from user space.
+- */
+- if (is_prefetch(regs, address, error_code))
+- return;
+-
+- if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
+- printk_ratelimit()) {
+- printk("%s%s[%d]: segfault at %08lx eip %08lx "
+- "esp %08lx error %lx\n",
+- task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
+- tsk->comm, task_pid_nr(tsk), address, regs->eip,
+- regs->esp, error_code);
+- }
+- tsk->thread.cr2 = address;
+- /* Kernel addresses are always protection faults */
+- tsk->thread.error_code = error_code | (address >= TASK_SIZE);
+- tsk->thread.trap_no = 14;
+- force_sig_info_fault(SIGSEGV, si_code, address, tsk);
+- return;
+- }
+-
+-#ifdef CONFIG_X86_F00F_BUG
+- /*
+- * Pentium F0 0F C7 C8 bug workaround.
+- */
+- if (boot_cpu_data.f00f_bug) {
+- unsigned long nr;
+-
+- nr = (address - idt_descr.address) >> 3;
+-
+- if (nr == 6) {
+- do_invalid_op(regs, 0);
+- return;
+- }
+- }
+-#endif
+-
+-no_context:
+- /* Are we prepared to handle this kernel fault? */
+- if (fixup_exception(regs))
+- return;
+-
+- /*
+- * Valid to do another page fault here, because if this fault
+- * had been triggered by is_prefetch fixup_exception would have
+- * handled it.
+- */
+- if (is_prefetch(regs, address, error_code))
+- return;
+-
+-/*
+- * Oops. The kernel tried to access some bad page. We'll have to
+- * terminate things with extreme prejudice.
+- */
+-
+- bust_spinlocks(1);
+-
+- if (oops_may_print()) {
+-#ifdef CONFIG_X86_PAE
+- if (error_code & 16) {
+- pte_t *pte = lookup_address(address);
+-
+- if (pte && pte_present(*pte) && !pte_exec_kernel(*pte))
+- printk(KERN_CRIT "kernel tried to execute "
+- "NX-protected page - exploit attempt? "
+- "(uid: %d)\n", current->uid);
+- }
+-#endif
+- if (address < PAGE_SIZE)
+- printk(KERN_ALERT "BUG: unable to handle kernel NULL "
+- "pointer dereference");
+- else
+- printk(KERN_ALERT "BUG: unable to handle kernel paging"
+- " request");
+- printk(" at virtual address %08lx\n",address);
+- printk(KERN_ALERT "printing eip: %08lx\n", regs->eip);
+- dump_fault_path(address);
+- }
+- tsk->thread.cr2 = address;
+- tsk->thread.trap_no = 14;
+- tsk->thread.error_code = error_code;
+- die("Oops", regs, error_code);
+- bust_spinlocks(0);
+- do_exit(SIGKILL);
+-
+-/*
+- * We ran out of memory, or some other thing happened to us that made
+- * us unable to handle the page fault gracefully.
+- */
+-out_of_memory:
+- up_read(&mm->mmap_sem);
+- if (is_global_init(tsk)) {
+- yield();
+- down_read(&mm->mmap_sem);
+- goto survive;
+- }
+- printk("VM: killing process %s\n", tsk->comm);
+- if (error_code & 4)
+- do_group_exit(SIGKILL);
+- goto no_context;
+-
+-do_sigbus:
+- up_read(&mm->mmap_sem);
+-
+- /* Kernel mode? Handle exceptions or die */
+- if (!(error_code & 4))
+- goto no_context;
+-
+- /* User space => ok to do another page fault */
+- if (is_prefetch(regs, address, error_code))
+- return;
+-
+- tsk->thread.cr2 = address;
+- tsk->thread.error_code = error_code;
+- tsk->thread.trap_no = 14;
+- force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
+-}
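
For reference, the error_code bits tested throughout do_page_fault() above decode as in the following standalone sketch. It is illustrative only and not part of the patch; the PF_* names are local to the example (the 32-bit file uses raw masks).

#include <stdio.h>

#define PF_PROT  (1 << 0)  /* 0: no page found, 1: protection fault */
#define PF_WRITE (1 << 1)  /* 0: read access,   1: write access */
#define PF_USER  (1 << 2)  /* 0: kernel mode,   1: user mode */
#define PF_RSVD  (1 << 3)  /* reserved bit detected in a PTE */
#define PF_INSTR (1 << 4)  /* fault on an instruction fetch */

static void decode(unsigned long ec)
{
	printf("%#lx: %s %s in %s mode%s%s\n", ec,
	       ec & PF_PROT ? "protection fault on" : "missing page for",
	       ec & PF_WRITE ? "write" : "read",
	       ec & PF_USER ? "user" : "kernel",
	       ec & PF_RSVD ? ", reserved bit set" : "",
	       ec & PF_INSTR ? ", instruction fetch" : "");
}

int main(void)
{
	decode(0x6);	/* user write to a not-present page */
	/*
	 * The mask 0x0d (PF_PROT|PF_USER|PF_RSVD) used before calling
	 * vmalloc_fault() above accepts only kernel-mode, not-present,
	 * non-reserved faults -- the only kind vmalloc syncing can fix.
	 */
	decode(0x0);	/* kernel read of a not-present page */
	return 0;
}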
+-
+-void vmalloc_sync_all(void)
+-{
+- /*
+- * Note that races in the updates of insync and start aren't
+- * problematic: insync can only get set bits added, and updates to
+- * start are only improving performance (without affecting correctness
+- * if undone).
+- * XEN: To work on PAE, we need to iterate over PMDs rather than PGDs.
+- * This change works just fine with 2-level paging too.
+- */
+-#define sync_index(a) ((a) >> PMD_SHIFT)
+- static DECLARE_BITMAP(insync, PTRS_PER_PGD*PTRS_PER_PMD);
+- static unsigned long start = TASK_SIZE;
+- unsigned long address;
+-
+- if (SHARED_KERNEL_PMD)
+- return;
+-
+- BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
+- for (address = start;
+- address >= TASK_SIZE && address < hypervisor_virt_start;
+- address += 1UL << PMD_SHIFT) {
+- if (!test_bit(sync_index(address), insync)) {
+- unsigned long flags;
+- struct page *page;
+-
+- spin_lock_irqsave(&pgd_lock, flags);
+- /* XEN: failure path assumes non-empty pgd_list. */
+- if (unlikely(!pgd_list)) {
+- spin_unlock_irqrestore(&pgd_lock, flags);
+- return;
+- }
+- for (page = pgd_list; page; page =
+- (struct page *)page->index)
+- if (!vmalloc_sync_one(page_address(page),
+- address)) {
+- BUG_ON(page != pgd_list);
+- break;
+- }
+- spin_unlock_irqrestore(&pgd_lock, flags);
+- if (!page)
+- set_bit(sync_index(address), insync);
+- }
+- if (address == start && test_bit(sync_index(address), insync))
+- start = address + (1UL << PMD_SHIFT);
+- }
+-}
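
The sync_index() bookkeeping above tracks one "insync" bit per PMD-sized region rather than per PGD entry, which is what lets the same loop cover both PAE and 2-level paging. A standalone sketch of the arithmetic; the PMD_SHIFT and TASK_SIZE values are assumptions for a PAE kernel with the usual 3G/1G split, not taken from the patch.

#include <stdio.h>

#define PMD_SHIFT	21			/* 2 MiB per PMD entry (PAE) */
#define TASK_SIZE	0xC0000000UL		/* 3G/1G split */
#define sync_index(a)	((a) >> PMD_SHIFT)

int main(void)
{
	unsigned long addr = TASK_SIZE;

	/* Each bitmap bit covers one 2 MiB region of kernel space: */
	printf("first kernel index: %lu\n", sync_index(addr));	/* 1536 */
	printf("indices per step  : %lu\n",
	       sync_index(addr + (1UL << PMD_SHIFT)) - sync_index(addr));
	return 0;
}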
+--- sle11-2009-06-29.orig/arch/x86/mm/fault_64-xen.c 2009-02-16 16:18:36.000000000 +0100
++++ /dev/null 1970-01-01 00:00:00.000000000 +0000
+@@ -1,686 +0,0 @@
+-/*
+- * linux/arch/x86-64/mm/fault.c
+- *
+- * Copyright (C) 1995 Linus Torvalds
+- * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
+- */
+-
+-#include <linux/signal.h>
+-#include <linux/sched.h>
+-#include <linux/kernel.h>
+-#include <linux/errno.h>
+-#include <linux/string.h>
+-#include <linux/types.h>
+-#include <linux/ptrace.h>
+-#include <linux/mman.h>
+-#include <linux/mm.h>
+-#include <linux/smp.h>
+-#include <linux/interrupt.h>
+-#include <linux/init.h>
+-#include <linux/tty.h>
+-#include <linux/vt_kern.h> /* For unblank_screen() */
+-#include <linux/compiler.h>
+-#include <linux/vmalloc.h>
+-#include <linux/module.h>
+-#include <linux/kprobes.h>
+-#include <linux/uaccess.h>
+-#include <linux/kdebug.h>
+-#include <linux/kprobes.h>
+-
+-#include <asm/system.h>
+-#include <asm/pgalloc.h>
+-#include <asm/smp.h>
+-#include <asm/tlbflush.h>
+-#include <asm/proto.h>
+-#include <asm-generic/sections.h>
+-
+-/* Page fault error code bits */
+-#define PF_PROT (1<<0) /* or no page found */
+-#define PF_WRITE (1<<1)
+-#define PF_USER (1<<2)
+-#define PF_RSVD (1<<3)
+-#define PF_INSTR (1<<4)
+-
+-#ifdef CONFIG_KPROBES
+-static inline int notify_page_fault(struct pt_regs *regs)
+-{
+- int ret = 0;
+-
+- /* kprobe_running() needs smp_processor_id() */
+- if (!user_mode(regs)) {
+- preempt_disable();
+- if (kprobe_running() && kprobe_fault_handler(regs, 14))
+- ret = 1;
+- preempt_enable();
+- }
+-
+- return ret;
+-}
+-#else
+-static inline int notify_page_fault(struct pt_regs *regs)
+-{
+- return 0;
+-}
+-#endif
+-
+-/* Sometimes the CPU reports invalid exceptions on prefetch.
+- Check that here and ignore it.
+- Opcode checker based on code by Richard Brunner */
+-static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
+- unsigned long error_code)
+-{
+- unsigned char *instr;
+- int scan_more = 1;
+- int prefetch = 0;
+- unsigned char *max_instr;
+-
+- /* If it was an exec fault, ignore */
+- if (error_code & PF_INSTR)
+- return 0;
+-
+- instr = (unsigned char __user *)convert_rip_to_linear(current, regs);
+- max_instr = instr + 15;
+-
+- if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
+- return 0;
+-
+- while (scan_more && instr < max_instr) {
+- unsigned char opcode;
+- unsigned char instr_hi;
+- unsigned char instr_lo;
+-
+- if (probe_kernel_address(instr, opcode))
+- break;
+-
+- instr_hi = opcode & 0xf0;
+- instr_lo = opcode & 0x0f;
+- instr++;
+-
+- switch (instr_hi) {
+- case 0x20:
+- case 0x30:
+- /* Values 0x26,0x2E,0x36,0x3E are valid x86
+- prefixes. In long mode, the CPU will signal
+- invalid opcode if some of these prefixes are
+- present so we will never get here anyway */
+- scan_more = ((instr_lo & 7) == 0x6);
+- break;
+-
+- case 0x40:
+- /* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes
+- Need to figure out under what instruction mode the
+- instruction was issued ... */
+- /* Could check the LDT for lm, but for now it's good
+- enough to assume that long mode only uses well known
+- segments or kernel. */
+- scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
+- break;
+-
+- case 0x60:
+- /* 0x64 thru 0x67 are valid prefixes in all modes. */
+- scan_more = (instr_lo & 0xC) == 0x4;
+- break;
+- case 0xF0:
+- /* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */
+- scan_more = !instr_lo || (instr_lo>>1) == 1;
+- break;
+- case 0x00:
+- /* Prefetch instruction is 0x0F0D or 0x0F18 */
+- scan_more = 0;
+- if (probe_kernel_address(instr, opcode))
+- break;
+- prefetch = (instr_lo == 0xF) &&
+- (opcode == 0x0D || opcode == 0x18);
+- break;
+- default:
+- scan_more = 0;
+- break;
+- }
+- }
+- return prefetch;
+-}
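
The scan in is_prefetch() above only ever skips instruction prefixes until it either gives up or finds the 0x0F 0x0D / 0x0F 0x18 (AMD prefetch) escape. A standalone sketch of just that byte-pattern logic, with no user-copy, faulting, or CPU-mode handling; the sample encodings are assumptions of the example.

#include <stdio.h>

static int looks_like_prefetch(const unsigned char *instr, int len)
{
	int i;

	for (i = 0; i < len; i++) {
		unsigned char op = instr[i];
		unsigned char hi = op & 0xf0, lo = op & 0x0f;

		switch (hi) {
		case 0x20: case 0x30:	/* 0x26,0x2E,0x36,0x3E prefixes */
			if ((lo & 7) != 6)
				return 0;
			continue;
		case 0x40:		/* REX prefixes (long mode only) */
			continue;
		case 0x60:		/* 0x64..0x67 prefixes */
			if ((lo & 0xc) != 0x4)
				return 0;
			continue;
		case 0xf0:		/* 0xF0, 0xF2, 0xF3 prefixes */
			if (lo && (lo >> 1) != 1)
				return 0;
			continue;
		case 0x00:		/* possible 0x0F escape byte */
			return lo == 0xf && i + 1 < len &&
			       (instr[i + 1] == 0x0d || instr[i + 1] == 0x18);
		default:
			return 0;
		}
	}
	return 0;
}

int main(void)
{
	unsigned char amd3d[] = { 0x0f, 0x0d, 0x08 };	/* prefetchw (%eax) */
	unsigned char mov[]   = { 0x89, 0xd8 };		/* mov %ebx,%eax */

	printf("%d %d\n", looks_like_prefetch(amd3d, 3),
	       looks_like_prefetch(mov, 2));		/* prints "1 0" */
	return 0;
}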
+-
+-static int bad_address(void *p)
+-{
+- unsigned long dummy;
+- return probe_kernel_address((unsigned long *)p, dummy);
+-}
+-
+-void dump_pagetable(unsigned long address)
+-{
+- pgd_t *pgd;
+- pud_t *pud;
+- pmd_t *pmd;
+- pte_t *pte;
+-
+- pgd = (pgd_t *)read_cr3();
+-
+- pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
+- pgd += pgd_index(address);
+- if (bad_address(pgd)) goto bad;
+- printk("PGD %lx ", pgd_val(*pgd));
+- if (!pgd_present(*pgd)) goto ret;
+-
+- pud = pud_offset(pgd, address);
+- if (bad_address(pud)) goto bad;
+- printk("PUD %lx ", pud_val(*pud));
+- if (!pud_present(*pud)) goto ret;
+-
+- pmd = pmd_offset(pud, address);
+- if (bad_address(pmd)) goto bad;
+- printk("PMD %lx ", pmd_val(*pmd));
+- if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;
+-
+- pte = pte_offset_kernel(pmd, address);
+- if (bad_address(pte)) goto bad;
+- printk("PTE %lx", pte_val(*pte));
+-ret:
+- printk("\n");
+- return;
+-bad:
+- printk("BAD\n");
+-}
+-
+-static const char errata93_warning[] =
+-KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
+-KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
+-KERN_ERR "******* Please consider a BIOS update.\n"
+-KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
+-
+-/* Workaround for K8 erratum #93 & buggy BIOS.
+- BIOS SMM functions are required to use a specific workaround
+- to avoid corruption of the 64bit RIP register on C stepping K8.
+- A lot of BIOS that didn't get tested properly miss this.
+- The OS sees this as a page fault with the upper 32bits of RIP cleared.
+- Try to work around it here.
+- Note we only handle faults in kernel here. */
+-
+-static int is_errata93(struct pt_regs *regs, unsigned long address)
+-{
+- static int warned;
+- if (address != regs->rip)
+- return 0;
+- if ((address >> 32) != 0)
+- return 0;
+- address |= 0xffffffffUL << 32;
+- if ((address >= (u64)_stext && address <= (u64)_etext) ||
+- (address >= MODULES_VADDR && address <= MODULES_END)) {
+- if (!warned) {
+- printk(errata93_warning);
+- warned = 1;
+- }
+- regs->rip = address;
+- return 1;
+- }
+- return 0;
+-}
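
The recovery in is_errata93() above is plain sign extension: the erratum clears the upper 32 bits of RIP, so a kernel-text address can be rebuilt by ORing the upper bits back in and checking that the result lands in kernel text or the module range. A standalone sketch of the arithmetic (assumes 64-bit long; the sample address is an assumption of the example):

#include <stdio.h>

int main(void)
{
	unsigned long rip  = 0xffffffff8010be60UL;	/* example kernel text */
	unsigned long seen = rip & 0xffffffffUL;	/* what the CPU reports */

	if ((seen >> 32) == 0)
		seen |= 0xffffffffUL << 32;		/* the workaround */

	printf("%#lx -> %#lx (%s)\n", rip & 0xffffffffUL, seen,
	       seen == rip ? "recovered" : "mismatch");
	return 0;
}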
+-
+-static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
+- unsigned long error_code)
+-{
+- unsigned long flags = oops_begin();
+- struct task_struct *tsk;
+-
+- printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
+- current->comm, address);
+- dump_pagetable(address);
+- tsk = current;
+- tsk->thread.cr2 = address;
+- tsk->thread.trap_no = 14;
+- tsk->thread.error_code = error_code;
+- __die("Bad pagetable", regs, error_code);
+- oops_end(flags);
+- do_exit(SIGKILL);
+-}
+-
+-/*
+- * Handle a fault on the vmalloc area
+- *
+- * This assumes no large pages in there.
+- */
+-static int vmalloc_fault(unsigned long address)
+-{
+- pgd_t *pgd, *pgd_ref;
+- pud_t *pud, *pud_ref;
+- pmd_t *pmd, *pmd_ref;
+- pte_t *pte, *pte_ref;
+-
+- /* Copy kernel mappings over when needed. This can also
+- happen within a race in page table update. In the latter
+- case just flush. */
+-
+- /* On Xen the line below does not always work. Needs investigating! */
+- /*pgd = pgd_offset(current->mm ?: &init_mm, address);*/
+- pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK);
+- pgd += pgd_index(address);
+- pgd_ref = pgd_offset_k(address);
+- if (pgd_none(*pgd_ref))
+- return -1;
+- if (pgd_none(*pgd))
+- set_pgd(pgd, *pgd_ref);
+- else
+- BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
+-
+- /* Below here mismatches are bugs because these lower tables
+- are shared */
+-
+- pud = pud_offset(pgd, address);
+- pud_ref = pud_offset(pgd_ref, address);
+- if (pud_none(*pud_ref))
+- return -1;
+- if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
+- BUG();
+- pmd = pmd_offset(pud, address);
+- pmd_ref = pmd_offset(pud_ref, address);
+- if (pmd_none(*pmd_ref))
+- return -1;
+- if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
+- BUG();
+- pte_ref = pte_offset_kernel(pmd_ref, address);
+- if (!pte_present(*pte_ref))
+- return -1;
+- pte = pte_offset_kernel(pmd, address);
+- /* Don't use pte_page here, because the mappings can point
+- outside mem_map, and the NUMA hash lookup cannot handle
+- that. */
+- if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
+- BUG();
+- return 0;
+-}
+-
+-int show_unhandled_signals = 1;
+-
+-
+-#define MEM_VERBOSE 1
+-
+-#ifdef MEM_VERBOSE
+-#define MEM_LOG(_f, _a...) \
+- printk("fault.c:[%d]-> " _f "\n", \
+- __LINE__ , ## _a )
+-#else
+-#define MEM_LOG(_f, _a...) ((void)0)
+-#endif
+-
+-static int spurious_fault(struct pt_regs *regs,
+- unsigned long address,
+- unsigned long error_code)
+-{
+- pgd_t *pgd;
+- pud_t *pud;
+- pmd_t *pmd;
+- pte_t *pte;
+-
+-#ifdef CONFIG_XEN
+- /* Faults in hypervisor area are never spurious. */
+- if ((address >= HYPERVISOR_VIRT_START) &&
+- (address < HYPERVISOR_VIRT_END))
+- return 0;
+-#endif
+-
+- /* Reserved-bit violation or user access to kernel space? */
+- if (error_code & (PF_RSVD|PF_USER))
+- return 0;
+-
+- pgd = init_mm.pgd + pgd_index(address);
+- if (!pgd_present(*pgd))
+- return 0;
+-
+- pud = pud_offset(pgd, address);
+- if (!pud_present(*pud))
+- return 0;
+-
+- pmd = pmd_offset(pud, address);
+- if (!pmd_present(*pmd))
+- return 0;
+-
+- pte = pte_offset_kernel(pmd, address);
+- if (!pte_present(*pte))
+- return 0;
+- if ((error_code & PF_WRITE) && !pte_write(*pte))
+- return 0;
+- if ((error_code & PF_INSTR) && (__pte_val(*pte) & _PAGE_NX))
+- return 0;
+-
+- return 1;
+-}
+-
+-/*
+- * This routine handles page faults. It determines the address,
+- * and the problem, and then passes it off to one of the appropriate
+- * routines.
+- */
+-asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
+- unsigned long error_code)
+-{
+- struct task_struct *tsk;
+- struct mm_struct *mm;
+- struct vm_area_struct * vma;
+- unsigned long address;
+- const struct exception_table_entry *fixup;
+- int write, fault;
+- unsigned long flags;
+- siginfo_t info;
+-
+- if (!user_mode(regs))
+- error_code &= ~PF_USER; /* means kernel */
+-
+- /*
+- * We can fault from pretty much anywhere, with unknown IRQ state.
+- */
+- trace_hardirqs_fixup();
+-
+- tsk = current;
+- mm = tsk->mm;
+- prefetchw(&mm->mmap_sem);
+-
+- /* get the address */
+- address = read_cr2();
+-
+- info.si_code = SEGV_MAPERR;
+-
+-
+- /*
+- * We fault-in kernel-space virtual memory on-demand. The
+- * 'reference' page table is init_mm.pgd.
+- *
+- * NOTE! We MUST NOT take any locks for this case. We may
+- * be in an interrupt or a critical region, and should
+- * only copy the information from the master page table,
+- * nothing more.
+- *
+- * This verifies that the fault happens in kernel space
+- * (error_code & 4) == 0, and that the fault was not a
+- * protection error (error_code & 9) == 0.
+- */
+- if (unlikely(address >= TASK_SIZE64)) {
+- /*
+- * Don't check for the module range here: its PML4
+- * is always initialized because it's shared with the main
+- * kernel text. Only vmalloc may need PML4 syncups.
+- */
+- if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
+- ((address >= VMALLOC_START && address < VMALLOC_END))) {
+- if (vmalloc_fault(address) >= 0)
+- return;
+- }
+- /* Can take a spurious fault if mapping changes R/O -> R/W. */
+- if (spurious_fault(regs, address, error_code))
+- return;
+- if (notify_page_fault(regs))
+- return;
+- /*
+- * Don't take the mm semaphore here. If we fixup a prefetch
+- * fault we could otherwise deadlock.
+- */
+- goto bad_area_nosemaphore;
+- }
+-
+- if (notify_page_fault(regs))
+- return;
+-
+- if (likely(regs->eflags & X86_EFLAGS_IF))
+- local_irq_enable();
+-
+- if (unlikely(error_code & PF_RSVD))
+- pgtable_bad(address, regs, error_code);
+-
+- /*
+- * If we're in an interrupt or have no user
+- * context, we must not take the fault..
+- */
+- if (unlikely(in_atomic() || !mm))
+- goto bad_area_nosemaphore;
+-
+- /*
+- * User-mode registers count as a user access even for any
+- * potential system fault or CPU buglet.
+- */
+- if (user_mode_vm(regs))
+- error_code |= PF_USER;
+-
+- again:
+- /* When running in the kernel we expect faults to occur only to
+- * addresses in user space. All other faults represent errors in the
+- * kernel and should generate an OOPS. Unfortunately, in the case of an
+- * erroneous fault occurring in a code path which already holds mmap_sem
+- * we will deadlock attempting to validate the fault against the
+- * address space. Luckily the kernel only validly references user
+- * space from well defined areas of code, which are listed in the
+- * exceptions table.
+- *
+- * As the vast majority of faults will be valid we will only perform
+- * the source reference check when there is a possibility of a deadlock.
+- * Attempt to lock the address space, if we cannot we then validate the
+- * source. If this is invalid we can skip the address space check,
+- * thus avoiding the deadlock.
+- */
+- if (!down_read_trylock(&mm->mmap_sem)) {
+- if ((error_code & PF_USER) == 0 &&
+- !search_exception_tables(regs->rip))
+- goto bad_area_nosemaphore;
+- down_read(&mm->mmap_sem);
+- }
+-
+- vma = find_vma(mm, address);
+- if (!vma)
+- goto bad_area;
+- if (likely(vma->vm_start <= address))
+- goto good_area;
+- if (!(vma->vm_flags & VM_GROWSDOWN))
+- goto bad_area;
+- if (error_code & 4) {
+- /* Allow userspace just enough access below the stack pointer
+- * to let the 'enter' instruction work.
+- */
+- if (address + 65536 + 32 * sizeof(unsigned long) < regs->rsp)
+- goto bad_area;
+- }
+- if (expand_stack(vma, address))
+- goto bad_area;
+-/*
+- * Ok, we have a good vm_area for this memory access, so
+- * we can handle it..
+- */
+-good_area:
+- info.si_code = SEGV_ACCERR;
+- write = 0;
+- switch (error_code & (PF_PROT|PF_WRITE)) {
+- default: /* 3: write, present */
+- /* fall through */
+- case PF_WRITE: /* write, not present */
+- if (!(vma->vm_flags & VM_WRITE))
+- goto bad_area;
+- write++;
+- break;
+- case PF_PROT: /* read, present */
+- goto bad_area;
+- case 0: /* read, not present */
+- if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
+- goto bad_area;
+- }
+-
+- /*
+- * If for any reason at all we couldn't handle the fault,
+- * make sure we exit gracefully rather than endlessly redo
+- * the fault.
+- */
+- fault = handle_mm_fault(mm, vma, address, write);
+- if (unlikely(fault & VM_FAULT_ERROR)) {
+- if (fault & VM_FAULT_OOM)
+- goto out_of_memory;
+- else if (fault & VM_FAULT_SIGBUS)
+- goto do_sigbus;
+- BUG();
+- }
+- if (fault & VM_FAULT_MAJOR)
+- tsk->maj_flt++;
+- else
+- tsk->min_flt++;
+- up_read(&mm->mmap_sem);
+- return;
+-
+-/*
+- * Something tried to access memory that isn't in our memory map..
+- * Fix it, but check if it's kernel or user first..
+- */
+-bad_area:
+- up_read(&mm->mmap_sem);
+-
+-bad_area_nosemaphore:
+- /* User mode accesses just cause a SIGSEGV */
+- if (error_code & PF_USER) {
+-
+- /*
+- * It's possible to have interrupts off here.
+- */
+- local_irq_enable();
+-
+- if (is_prefetch(regs, address, error_code))
+- return;
+-
+- /* Work around K8 erratum #100: K8 in compat mode
+- occasionally jumps to illegal addresses >4GB. We
+- catch this here in the page fault handler because
+- these addresses are not reachable. Just detect this
+- case and return. Any code segment in LDT is
+- compatibility mode. */
+- if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
+- (address >> 32))
+- return;
+-
+- if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
+- printk_ratelimit()) {
+- printk(
+- "%s%s[%d]: segfault at %lx rip %lx rsp %lx error %lx\n",
+- tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
+- tsk->comm, tsk->pid, address, regs->rip,
+- regs->rsp, error_code);
+- }
+-
+- tsk->thread.cr2 = address;
+- /* Kernel addresses are always protection faults */
+- tsk->thread.error_code = error_code | (address >= TASK_SIZE);
+- tsk->thread.trap_no = 14;
+- info.si_signo = SIGSEGV;
+- info.si_errno = 0;
+- /* info.si_code has been set above */
+- info.si_addr = (void __user *)address;
+- force_sig_info(SIGSEGV, &info, tsk);
+- return;
+- }
+-
+-no_context:
+-
+- /* Are we prepared to handle this kernel fault? */
+- fixup = search_exception_tables(regs->rip);
+- if (fixup) {
+- regs->rip = fixup->fixup;
+- return;
+- }
+-
+- /*
+- * Hall of shame of CPU/BIOS bugs.
+- */
+-
+- if (is_prefetch(regs, address, error_code))
+- return;
+-
+- if (is_errata93(regs, address))
+- return;
+-
+-/*
+- * Oops. The kernel tried to access some bad page. We'll have to
+- * terminate things with extreme prejudice.
+- */
+-
+- flags = oops_begin();
+-
+- if (address < PAGE_SIZE)
+- printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
+- else
+- printk(KERN_ALERT "Unable to handle kernel paging request");
+- printk(" at %016lx RIP: \n" KERN_ALERT,address);
+- printk_address(regs->rip);
+- dump_pagetable(address);
+- tsk->thread.cr2 = address;
+- tsk->thread.trap_no = 14;
+- tsk->thread.error_code = error_code;
+- __die("Oops", regs, error_code);
+- /* Executive summary in case the body of the oops scrolled away */
+- printk(KERN_EMERG "CR2: %016lx\n", address);
+- oops_end(flags);
+- do_exit(SIGKILL);
+-
+-/*
+- * We ran out of memory, or some other thing happened to us that made
+- * us unable to handle the page fault gracefully.
+- */
+-out_of_memory:
+- up_read(&mm->mmap_sem);
+- if (is_global_init(current)) {
+- yield();
+- goto again;
+- }
+- printk("VM: killing process %s\n", tsk->comm);
+- if (error_code & 4)
+- do_group_exit(SIGKILL);
+- goto no_context;
+-
+-do_sigbus:
+- up_read(&mm->mmap_sem);
+-
+- /* Kernel mode? Handle exceptions or die */
+- if (!(error_code & PF_USER))
+- goto no_context;
+-
+- tsk->thread.cr2 = address;
+- tsk->thread.error_code = error_code;
+- tsk->thread.trap_no = 14;
+- info.si_signo = SIGBUS;
+- info.si_errno = 0;
+- info.si_code = BUS_ADRERR;
+- info.si_addr = (void __user *)address;
+- force_sig_info(SIGBUS, &info, tsk);
+- return;
+-}
+-
+-DEFINE_SPINLOCK(pgd_lock);
+-LIST_HEAD(pgd_list);
+-
+-void vmalloc_sync_all(void)
+-{
+- /* Note that races in the updates of insync and start aren't
+- problematic:
+- insync can only get set bits added, and updates to start are only
+- improving performance (without affecting correctness if undone). */
+- static DECLARE_BITMAP(insync, PTRS_PER_PGD);
+- static unsigned long start = VMALLOC_START & PGDIR_MASK;
+- unsigned long address;
+-
+- for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
+- if (!test_bit(pgd_index(address), insync)) {
+- const pgd_t *pgd_ref = pgd_offset_k(address);
+- struct page *page;
+-
+- if (pgd_none(*pgd_ref))
+- continue;
+- spin_lock(&pgd_lock);
+- list_for_each_entry(page, &pgd_list, lru) {
+- pgd_t *pgd;
+- pgd = (pgd_t *)page_address(page) + pgd_index(address);
+- if (pgd_none(*pgd))
+- set_pgd(pgd, *pgd_ref);
+- else
+- BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
+- }
+- spin_unlock(&pgd_lock);
+- set_bit(pgd_index(address), insync);
+- }
+- if (address == start)
+- start = address + PGDIR_SIZE;
+- }
+- /* Check that there is no need to do the same for the modules area. */
+- BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
+- BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
+- (__START_KERNEL & PGDIR_MASK)));
+-}
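
vmalloc_sync_all() above propagates missing kernel top-level entries from the reference table (init_mm.pgd) into every per-process table on pgd_list, and treats an already-present entry that disagrees as a bug, since everything below the top level is shared. A standalone model of that pattern; the table size, types, and values are assumptions of the sketch.

#include <stdio.h>
#include <stdint.h>

#define ENTRIES 8

static uint64_t reference[ENTRIES] = { 0, 0, 0, 0, 0x1000, 0x2000, 0, 0 };
static uint64_t process_a[ENTRIES];		/* stale: misses 4 and 5 */
static uint64_t process_b[ENTRIES] = { [4] = 0x1000 };

static void sync_one(uint64_t *pgd, int idx)
{
	if (!pgd[idx])
		pgd[idx] = reference[idx];	/* set_pgd(pgd, *pgd_ref) */
	else if (pgd[idx] != reference[idx])
		fprintf(stderr, "BUG: shared entry %d diverged\n", idx);
}

int main(void)
{
	int idx;

	for (idx = 0; idx < ENTRIES; idx++) {
		if (!reference[idx])
			continue;		/* pgd_none(*pgd_ref) */
		sync_one(process_a, idx);
		sync_one(process_b, idx);
	}
	printf("a[4]=%#lx b[5]=%#lx\n",
	       (unsigned long)process_a[4], (unsigned long)process_b[5]);
	return 0;
}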
+--- sle11-2009-06-29.orig/arch/x86/mm/highmem_32-xen.c 2009-02-16 16:17:21.000000000 +0100
++++ sle11-2009-06-29/arch/x86/mm/highmem_32-xen.c 2009-03-16 16:33:40.000000000 +0100
+@@ -18,6 +18,49 @@ void kunmap(struct page *page)
+ kunmap_high(page);
+ }
+
++static void debug_kmap_atomic_prot(enum km_type type)
++{
++#ifdef CONFIG_DEBUG_HIGHMEM
++ static unsigned warn_count = 10;
++
++ if (unlikely(warn_count == 0))
++ return;
++
++ if (unlikely(in_interrupt())) {
++ if (in_irq()) {
++ if (type != KM_IRQ0 && type != KM_IRQ1 &&
++ type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ &&
++ type != KM_BOUNCE_READ) {
++ WARN_ON(1);
++ warn_count--;
++ }
++ } else if (!irqs_disabled()) { /* softirq */
++ if (type != KM_IRQ0 && type != KM_IRQ1 &&
++ type != KM_SOFTIRQ0 && type != KM_SOFTIRQ1 &&
++ type != KM_SKB_SUNRPC_DATA &&
++ type != KM_SKB_DATA_SOFTIRQ &&
++ type != KM_BOUNCE_READ) {
++ WARN_ON(1);
++ warn_count--;
++ }
++ }
++ }
++
++ if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ ||
++ type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ) {
++ if (!irqs_disabled()) {
++ WARN_ON(1);
++ warn_count--;
++ }
++ } else if (type == KM_SOFTIRQ0 || type == KM_SOFTIRQ1) {
++ if (irq_count() == 0 && !irqs_disabled()) {
++ WARN_ON(1);
++ warn_count--;
++ }
++ }
++#endif
++}
++
+ /*
+ * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because
+ * no global lock is needed and because the kmap code must perform a global TLB
+@@ -37,6 +80,8 @@ void *kmap_atomic_prot(struct page *page
+ if (!PageHighMem(page))
+ return page_address(page);
+
++ debug_kmap_atomic_prot(type);
++
+ idx = type + KM_TYPE_NR*smp_processor_id();
+ vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
+ BUG_ON(!pte_none(*(kmap_pte-idx)));
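
The debug checks added above guard the per-CPU fixmap windows that kmap_atomic_prot() hands out: each (km_type, cpu) pair owns a private slot, which is why no global lock is needed and why using the wrong type from IRQ or softirq context is worth warning about. A standalone sketch of the slot arithmetic; the km_type values and KM_TYPE_NR are assumed here for illustration and do not match the real enum exactly.

#include <stdio.h>

enum km_type { KM_BOUNCE_READ, KM_USER0 = 5, KM_USER1, KM_IRQ0 = 9,
	       KM_IRQ1, KM_TYPE_NR = 16 };

int main(void)
{
	int cpu, type = KM_USER0;

	/* Each CPU gets its own window per km_type, so no locking: */
	for (cpu = 0; cpu < 4; cpu++)
		printf("cpu%d KM_USER0 -> fixmap slot %d\n",
		       cpu, type + KM_TYPE_NR * cpu);
	return 0;
}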
+--- sle11-2009-06-29.orig/arch/x86/mm/hypervisor.c 2009-05-06 10:23:43.000000000 +0200
++++ sle11-2009-06-29/arch/x86/mm/hypervisor.c 2009-05-14 11:18:39.000000000 +0200
+@@ -869,15 +869,11 @@ int xen_limit_pages_to_max_mfn(
+ }
+ EXPORT_SYMBOL_GPL(xen_limit_pages_to_max_mfn);
+
+-#ifdef __i386__
+-int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b)
++int write_ldt_entry(struct desc_struct *ldt, int entry, const void *desc)
+ {
+- __u32 *lp = (__u32 *)((char *)ldt + entry * 8);
+- maddr_t mach_lp = arbitrary_virt_to_machine(lp);
+- return HYPERVISOR_update_descriptor(
+- mach_lp, (u64)entry_a | ((u64)entry_b<<32));
++ maddr_t mach_lp = arbitrary_virt_to_machine(ldt + entry);
++ return HYPERVISOR_update_descriptor(mach_lp, *(const u64*)desc);
+ }
+-#endif
+
+ #define MAX_BATCHED_FULL_PTES 32
+
+--- sle11-2009-06-29.orig/arch/x86/mm/init_32-xen.c 2009-02-16 16:18:36.000000000 +0100
++++ sle11-2009-06-29/arch/x86/mm/init_32-xen.c 2009-03-16 16:33:40.000000000 +0100
+@@ -27,13 +27,13 @@
+ #include <linux/bootmem.h>
+ #include <linux/slab.h>
+ #include <linux/proc_fs.h>
+-#include <linux/efi.h>
+ #include <linux/memory_hotplug.h>
+ #include <linux/initrd.h>
+ #include <linux/cpumask.h>
+ #include <linux/dma-mapping.h>
+ #include <linux/scatterlist.h>
+
++#include <asm/asm.h>
+ #include <asm/processor.h>
+ #include <asm/system.h>
+ #include <asm/uaccess.h>
+@@ -42,18 +42,22 @@
+ #include <asm/fixmap.h>
+ #include <asm/e820.h>
+ #include <asm/apic.h>
++#include <asm/bugs.h>
+ #include <asm/tlb.h>
+ #include <asm/tlbflush.h>
++#include <asm/pgalloc.h>
+ #include <asm/sections.h>
+ #include <asm/hypervisor.h>
+ #include <asm/swiotlb.h>
++#include <asm/setup.h>
++#include <asm/cacheflush.h>
+
+ unsigned int __VMALLOC_RESERVE = 128 << 20;
+
+ DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
+ unsigned long highstart_pfn, highend_pfn;
+
+-static int noinline do_test_wp_bit(void);
++static noinline int do_test_wp_bit(void);
+
+ /*
+ * Creates a middle page table and puts a pointer to it in the
+@@ -64,17 +68,16 @@ static pmd_t * __init one_md_table_init(
+ {
+ pud_t *pud;
+ pmd_t *pmd_table;
+-
++
+ #ifdef CONFIG_X86_PAE
+ if (!(__pgd_val(*pgd) & _PAGE_PRESENT)) {
+ pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
+
+- paravirt_alloc_pd(__pa(pmd_table) >> PAGE_SHIFT);
++ paravirt_alloc_pd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
+ make_lowmem_page_readonly(pmd_table, XENFEAT_writable_page_tables);
+ set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
+ pud = pud_offset(pgd, 0);
+- if (pmd_table != pmd_offset(pud, 0))
+- BUG();
++ BUG_ON(pmd_table != pmd_offset(pud, 0));
+ }
+ #endif
+ pud = pud_offset(pgd, 0);
+@@ -85,7 +88,7 @@ static pmd_t * __init one_md_table_init(
+
+ /*
+ * Create a page table and place a pointer to it in a middle page
+- * directory entry.
++ * directory entry:
+ */
+ static pte_t * __init one_page_table_init(pmd_t *pmd)
+ {
+@@ -99,9 +102,10 @@ static pte_t * __init one_page_table_ini
+ #ifdef CONFIG_DEBUG_PAGEALLOC
+ page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
+ #endif
+- if (!page_table)
++ if (!page_table) {
+ page_table =
+ (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE);
++ }
+
+ paravirt_alloc_pt(&init_mm, __pa(page_table) >> PAGE_SHIFT);
+ make_lowmem_page_readonly(page_table,
+@@ -114,22 +118,21 @@ static pte_t * __init one_page_table_ini
+ }
+
+ /*
+- * This function initializes a certain range of kernel virtual memory
++ * This function initializes a certain range of kernel virtual memory
+ * with new bootmem page tables, everywhere page tables are missing in
+ * the given range.
+- */
+-
+-/*
+- * NOTE: The pagetables are allocated contiguous on the physical space
+- * so we can cache the place of the first one and move around without
++ *
++ * NOTE: The pagetables are allocated contiguous on the physical space
++ * so we can cache the place of the first one and move around without
+ * checking the pgd every time.
+ */
+-static void __init page_table_range_init (unsigned long start, unsigned long end, pgd_t *pgd_base)
++static void __init
++page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
+ {
+- pgd_t *pgd;
+- pmd_t *pmd;
+ int pgd_idx, pmd_idx;
+ unsigned long vaddr;
++ pgd_t *pgd;
++ pmd_t *pmd;
+
+ vaddr = start;
+ pgd_idx = pgd_index(vaddr);
+@@ -139,7 +142,8 @@ static void __init page_table_range_init
+ for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) {
+ pmd = one_md_table_init(pgd);
+ pmd = pmd + pmd_index(vaddr);
+- for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) {
++ for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
++ pmd++, pmd_idx++) {
+ if (vaddr < hypervisor_virt_start)
+ one_page_table_init(pmd);
+
+@@ -157,17 +161,17 @@ static inline int is_kernel_text(unsigne
+ }
+
+ /*
+- * This maps the physical memory to kernel virtual address space, a total
+- * of max_low_pfn pages, by creating page tables starting from address
+- * PAGE_OFFSET.
++ * This maps the physical memory to kernel virtual address space, a total
++ * of max_low_pfn pages, by creating page tables starting from address
++ * PAGE_OFFSET:
+ */
+ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
+ {
++ int pgd_idx, pmd_idx, pte_ofs;
+ unsigned long pfn;
+ pgd_t *pgd;
+ pmd_t *pmd;
+ pte_t *pte;
+- int pgd_idx, pmd_idx, pte_ofs;
+
+ unsigned long max_ram_pfn = xen_start_info->nr_pages;
+ if (max_ram_pfn > max_low_pfn)
+@@ -195,36 +199,49 @@ static void __init kernel_physical_mappi
+ if (pfn >= max_low_pfn)
+ continue;
+ pmd += pmd_idx;
+- for (; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) {
+- unsigned int address = pfn * PAGE_SIZE + PAGE_OFFSET;
+- if (address >= hypervisor_virt_start)
++ for (; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn;
++ pmd++, pmd_idx++) {
++ unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET;
++
++ if (addr >= hypervisor_virt_start)
+ continue;
+
+- /* Map with big pages if possible, otherwise create normal page tables. */
++ /*
++ * Map with big pages if possible, otherwise
++ * create normal page tables:
++ */
+ if (cpu_has_pse) {
+- unsigned int address2 = (pfn + PTRS_PER_PTE - 1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1;
+- if (is_kernel_text(address) || is_kernel_text(address2))
+- set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC));
+- else
+- set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE));
++ unsigned int addr2;
++ pgprot_t prot = PAGE_KERNEL_LARGE;
++
++ addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE +
++ PAGE_OFFSET + PAGE_SIZE-1;
++
++ if (is_kernel_text(addr) ||
++ is_kernel_text(addr2))
++ prot = PAGE_KERNEL_LARGE_EXEC;
++
++ set_pmd(pmd, pfn_pmd(pfn, prot));
+
+ pfn += PTRS_PER_PTE;
+- } else {
+- pte = one_page_table_init(pmd);
++ continue;
++ }
++ pte = one_page_table_init(pmd);
++
++ for (pte += pte_ofs;
++ pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn;
++ pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) {
++ pgprot_t prot = PAGE_KERNEL;
++
++ /* XEN: Only map initial RAM allocation. */
++ if ((pfn >= max_ram_pfn) || pte_present(*pte))
++ continue;
++ if (is_kernel_text(addr))
++ prot = PAGE_KERNEL_EXEC;
+
+- for (pte += pte_ofs;
+- pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn;
+- pte++, pfn++, pte_ofs++, address += PAGE_SIZE) {
+- /* XEN: Only map initial RAM allocation. */
+- if ((pfn >= max_ram_pfn) || pte_present(*pte))
+- continue;
+- if (is_kernel_text(address))
+- set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC));
+- else
+- set_pte(pte, pfn_pte(pfn, PAGE_KERNEL));
+- }
+- pte_ofs = 0;
++ set_pte(pte, pfn_pte(pfn, prot));
+ }
++ pte_ofs = 0;
+ }
+ pmd_idx = 0;
+ }
+@@ -245,57 +262,23 @@ static inline int page_kills_ppro(unsign
+
+ #endif
+
+-int page_is_ram(unsigned long pagenr)
+-{
+- int i;
+- unsigned long addr, end;
+-
+- if (efi_enabled) {
+- efi_memory_desc_t *md;
+- void *p;
+-
+- for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
+- md = p;
+- if (!is_available_memory(md))
+- continue;
+- addr = (md->phys_addr+PAGE_SIZE-1) >> PAGE_SHIFT;
+- end = (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) >> PAGE_SHIFT;
+-
+- if ((pagenr >= addr) && (pagenr < end))
+- return 1;
+- }
+- return 0;
+- }
+-
+- for (i = 0; i < e820.nr_map; i++) {
+-
+- if (e820.map[i].type != E820_RAM) /* not usable memory */
+- continue;
+- /*
+- * !!!FIXME!!! Some BIOSen report areas as RAM that
+- * are not. Notably the 640->1Mb area. We need a sanity
+- * check here.
+- */
+- addr = (e820.map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT;
+- end = (e820.map[i].addr+e820.map[i].size) >> PAGE_SHIFT;
+- if ((pagenr >= addr) && (pagenr < end))
+- return 1;
+- }
+- return 0;
+-}
+-
+ #ifdef CONFIG_HIGHMEM
+ pte_t *kmap_pte;
+ pgprot_t kmap_prot;
+
+-#define kmap_get_fixmap_pte(vaddr) \
+- pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), vaddr), (vaddr)), (vaddr))
++static inline pte_t *kmap_get_fixmap_pte(unsigned long vaddr)
++{
++ return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr),
++ vaddr), vaddr), vaddr);
++}
+
+ static void __init kmap_init(void)
+ {
+ unsigned long kmap_vstart;
+
+- /* cache the first kmap pte */
++ /*
++ * Cache the first kmap pte:
++ */
+ kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN);
+ kmap_pte = kmap_get_fixmap_pte(kmap_vstart);
+
+@@ -304,11 +287,11 @@ static void __init kmap_init(void)
+
+ static void __init permanent_kmaps_init(pgd_t *pgd_base)
+ {
++ unsigned long vaddr;
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+- unsigned long vaddr;
+
+ vaddr = PKMAP_BASE;
+ page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base);
+@@ -317,7 +300,7 @@ static void __init permanent_kmaps_init(
+ pud = pud_offset(pgd, vaddr);
+ pmd = pmd_offset(pud, vaddr);
+ pte = pte_offset_kernel(pmd, vaddr);
+- pkmap_page_table = pte;
++ pkmap_page_table = pte;
+ }
+
+ static void __meminit free_new_highpage(struct page *page, int pfn)
+@@ -337,7 +320,8 @@ void __init add_one_highpage_init(struct
+ SetPageReserved(page);
+ }
+
+-static int __meminit add_one_highpage_hotplug(struct page *page, unsigned long pfn)
++static int __meminit
++add_one_highpage_hotplug(struct page *page, unsigned long pfn)
+ {
+ free_new_highpage(page, pfn);
+ totalram_pages++;
+@@ -345,6 +329,7 @@ static int __meminit add_one_highpage_ho
+ max_mapnr = max(pfn, max_mapnr);
+ #endif
+ num_physpages++;
++
+ return 0;
+ }
+
+@@ -352,7 +337,7 @@ static int __meminit add_one_highpage_ho
+ * Not currently handling the NUMA case.
+ * Assuming single node and all memory that
+ * has been added dynamically that would be
+- * onlined here is in HIGHMEM
++ * onlined here is in HIGHMEM.
+ */
+ void __meminit online_page(struct page *page)
+ {
+@@ -360,13 +345,11 @@ void __meminit online_page(struct page *
+ add_one_highpage_hotplug(page, page_to_pfn(page));
+ }
+
+-
+-#ifdef CONFIG_NUMA
+-extern void set_highmem_pages_init(int);
+-#else
++#ifndef CONFIG_NUMA
+ static void __init set_highmem_pages_init(int bad_ppro)
+ {
+ int pfn;
++
+ for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) {
+ /*
+ * Holes under sparsemem might not have mem_map[]:
+@@ -376,23 +359,18 @@ static void __init set_highmem_pages_ini
+ }
+ totalram_pages += totalhigh_pages;
+ }
+-#endif /* CONFIG_FLATMEM */
++#endif /* !CONFIG_NUMA */
+
+ #else
+-#define kmap_init() do { } while (0)
+-#define permanent_kmaps_init(pgd_base) do { } while (0)
+-#define set_highmem_pages_init(bad_ppro) do { } while (0)
++# define kmap_init() do { } while (0)
++# define permanent_kmaps_init(pgd_base) do { } while (0)
++# define set_highmem_pages_init(bad_ppro) do { } while (0)
+ #endif /* CONFIG_HIGHMEM */
+
+-unsigned long long __PAGE_KERNEL = _PAGE_KERNEL;
++pteval_t __PAGE_KERNEL = _PAGE_KERNEL;
+ EXPORT_SYMBOL(__PAGE_KERNEL);
+-unsigned long long __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
+
+-#ifdef CONFIG_NUMA
+-extern void __init remap_numa_kva(void);
+-#else
+-#define remap_numa_kva() do {} while (0)
+-#endif
++pteval_t __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
+
+ pgd_t *swapper_pg_dir;
+
+@@ -410,9 +388,8 @@ static void __init xen_pagetable_setup_d
+ * the boot process.
+ *
+ * If we're booting on native hardware, this will be a pagetable
+- * constructed in arch/i386/kernel/head.S, and not running in PAE mode
+- * (even if we'll end up running in PAE). The root of the pagetable
+- * will be swapper_pg_dir.
++ * constructed in arch/x86/kernel/head_32.S. The root of the
++ * pagetable will be swapper_pg_dir.
+ *
+ * If we're booting paravirtualized under a hypervisor, then there are
+ * more options: we may already be running PAE, and the pagetable may
+@@ -424,10 +401,10 @@ static void __init xen_pagetable_setup_d
+ * be partially populated, and so it avoids stomping on any existing
+ * mappings.
+ */
+-static void __init pagetable_init (void)
++static void __init pagetable_init(void)
+ {
+- unsigned long vaddr, end;
+ pgd_t *pgd_base = (pgd_t *)xen_start_info->pt_base;
++ unsigned long vaddr, end;
+
+ xen_pagetable_setup_start(pgd_base);
+
+@@ -449,34 +426,36 @@ static void __init pagetable_init (void)
+ * Fixed mappings, only the page table structure has to be
+ * created - mappings will be set by set_fixmap():
+ */
++ early_ioremap_clear();
+ vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
+ end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
+ page_table_range_init(vaddr, end, pgd_base);
++ early_ioremap_reset();
+
+ permanent_kmaps_init(pgd_base);
+
+ xen_pagetable_setup_done(pgd_base);
+ }
+
+-#if defined(CONFIG_HIBERNATION) || defined(CONFIG_ACPI)
++#if defined(CONFIG_ACPI_SLEEP) && !defined(CONFIG_XEN)
+ /*
+- * Swap suspend & friends need this for resume because things like the intel-agp
++ * ACPI suspend needs this for resume, because things like the intel-agp
+ * driver might have split up a kernel 4MB mapping.
+ */
+-char __nosavedata swsusp_pg_dir[PAGE_SIZE]
+- __attribute__ ((aligned (PAGE_SIZE)));
++char swsusp_pg_dir[PAGE_SIZE]
++ __attribute__ ((aligned(PAGE_SIZE)));
+
+ static inline void save_pg_dir(void)
+ {
+ memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE);
+ }
+-#else
++#else /* !CONFIG_ACPI_SLEEP */
+ static inline void save_pg_dir(void)
+ {
+ }
+-#endif
++#endif /* !CONFIG_ACPI_SLEEP */
+
+-void zap_low_mappings (void)
++void zap_low_mappings(void)
+ {
+ int i;
+
+@@ -488,22 +467,24 @@ void zap_low_mappings (void)
+ * Note that "pgd_clear()" doesn't do it for
+ * us, because pgd_clear() is a no-op on i386.
+ */
+- for (i = 0; i < USER_PTRS_PER_PGD; i++)
++ for (i = 0; i < USER_PTRS_PER_PGD; i++) {
+ #if defined(CONFIG_X86_PAE) && !defined(CONFIG_XEN)
+ set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page)));
+ #else
+ set_pgd(swapper_pg_dir+i, __pgd(0));
+ #endif
++ }
+ flush_tlb_all();
+ }
+
+-int nx_enabled = 0;
++int nx_enabled;
++
++pteval_t __supported_pte_mask __read_mostly = ~_PAGE_NX;
++EXPORT_SYMBOL_GPL(__supported_pte_mask);
+
+ #ifdef CONFIG_X86_PAE
+
+-static int disable_nx __initdata = 0;
+-u64 __supported_pte_mask __read_mostly = ~_PAGE_NX;
+-EXPORT_SYMBOL_GPL(__supported_pte_mask);
++static int disable_nx __initdata;
+
+ /*
+ * noexec = on|off
+@@ -520,11 +501,14 @@ static int __init noexec_setup(char *str
+ __supported_pte_mask |= _PAGE_NX;
+ disable_nx = 0;
+ }
+- } else if (!strcmp(str,"off")) {
+- disable_nx = 1;
+- __supported_pte_mask &= ~_PAGE_NX;
+- } else
+- return -EINVAL;
++ } else {
++ if (!strcmp(str, "off")) {
++ disable_nx = 1;
++ __supported_pte_mask &= ~_PAGE_NX;
++ } else {
++ return -EINVAL;
++ }
++ }
+
+ return 0;
+ }
+@@ -536,6 +520,7 @@ static void __init set_nx(void)
+
+ if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
+ cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
++
+ if ((v[3] & (1 << 20)) && !disable_nx) {
+ rdmsr(MSR_EFER, l, h);
+ l |= EFER_NX;
+@@ -545,35 +530,6 @@ static void __init set_nx(void)
+ }
+ }
+ }
+-
+-/*
+- * Enables/disables executability of a given kernel page and
+- * returns the previous setting.
+- */
+-int __init set_kernel_exec(unsigned long vaddr, int enable)
+-{
+- pte_t *pte;
+- int ret = 1;
+-
+- if (!nx_enabled)
+- goto out;
+-
+- pte = lookup_address(vaddr);
+- BUG_ON(!pte);
+-
+- if (!pte_exec_kernel(*pte))
+- ret = 0;
+-
+- if (enable)
+- pte->pte_high &= ~(1 << (_PAGE_BIT_NX - 32));
+- else
+- pte->pte_high |= 1 << (_PAGE_BIT_NX - 32);
+- pte_update_defer(&init_mm, vaddr, pte);
+- __flush_tlb_all();
+-out:
+- return ret;
+-}
+-
+ #endif
+
+ /*
+@@ -590,21 +546,10 @@ void __init paging_init(void)
+ #ifdef CONFIG_X86_PAE
+ set_nx();
+ if (nx_enabled)
+- printk("NX (Execute Disable) protection: active\n");
++ printk(KERN_INFO "NX (Execute Disable) protection: active\n");
+ #endif
+-
+ pagetable_init();
+
+-#if defined(CONFIG_X86_PAE) && !defined(CONFIG_XEN)
+- /*
+- * We will bail out later - printk doesn't work right now so
+- * the user would just see a hanging kernel.
+- * when running as xen domain we are already in PAE mode at
+- * this point.
+- */
+- if (cpu_has_pae)
+- set_in_cr4(X86_CR4_PAE);
+-#endif
+ __flush_tlb_all();
+
+ kmap_init();
+@@ -631,10 +576,10 @@ void __init paging_init(void)
+ * used to involve black magic jumps to work around some nasty CPU bugs,
+ * but fortunately the switch to using exceptions got rid of all that.
+ */
+-
+ static void __init test_wp_bit(void)
+ {
+- printk("Checking if this processor honours the WP bit even in supervisor mode... ");
++ printk(KERN_INFO
++ "Checking if this processor honours the WP bit even in supervisor mode...");
+
+ /* Any page-aligned address will do, the test is non-destructive */
+ __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_READONLY);
+@@ -642,23 +587,22 @@ static void __init test_wp_bit(void)
+ clear_fixmap(FIX_WP_TEST);
+
+ if (!boot_cpu_data.wp_works_ok) {
+- printk("No.\n");
++ printk(KERN_CONT "No.\n");
+ #ifdef CONFIG_X86_WP_WORKS_OK
+- panic("This kernel doesn't support CPU's with broken WP. Recompile it for a 386!");
++ panic(
++ "This kernel doesn't support CPU's with broken WP. Recompile it for a 386!");
+ #endif
+ } else {
+- printk("Ok.\n");
++ printk(KERN_CONT "Ok.\n");
+ }
+ }
+
+-static struct kcore_list kcore_mem, kcore_vmalloc;
++static struct kcore_list kcore_mem, kcore_vmalloc;
+
+ void __init mem_init(void)
+ {
+- extern int ppro_with_ram_bug(void);
+ int codesize, reservedpages, datasize, initsize;
+- int tmp;
+- int bad_ppro;
++ int tmp, bad_ppro;
+ unsigned long pfn;
+
+ #if defined(CONFIG_SWIOTLB)
+@@ -668,19 +612,19 @@ void __init mem_init(void)
+ #ifdef CONFIG_FLATMEM
+ BUG_ON(!mem_map);
+ #endif
+-
+ bad_ppro = ppro_with_ram_bug();
+
+ #ifdef CONFIG_HIGHMEM
+ /* check that fixmap and pkmap do not overlap */
+- if (PKMAP_BASE+LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
+- printk(KERN_ERR "fixmap and kmap areas overlap - this will crash\n");
++ if (PKMAP_BASE + LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
++ printk(KERN_ERR
++ "fixmap and kmap areas overlap - this will crash\n");
+ printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n",
+- PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, FIXADDR_START);
++ PKMAP_BASE, PKMAP_BASE + LAST_PKMAP*PAGE_SIZE,
++ FIXADDR_START);
+ BUG();
+ }
+ #endif
+-
+ /* this will put all low memory onto the freelists */
+ totalram_pages += free_all_bootmem();
+ /* XEN: init and count low-mem pages outside initial allocation. */
+@@ -693,7 +637,7 @@ void __init mem_init(void)
+ reservedpages = 0;
+ for (tmp = 0; tmp < max_low_pfn; tmp++)
+ /*
+- * Only count reserved RAM pages
++ * Only count reserved RAM pages:
+ */
+ if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
+ reservedpages++;
+@@ -704,11 +648,12 @@ void __init mem_init(void)
+ datasize = (unsigned long) &_edata - (unsigned long) &_etext;
+ initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
+
+- kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
+- kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
++ kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
++ kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
+ VMALLOC_END-VMALLOC_START);
+
+- printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %ldk highmem)\n",
++ printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, "
++ "%dk reserved, %dk data, %dk init, %ldk highmem)\n",
+ (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
+ num_physpages << (PAGE_SHIFT-10),
+ codesize >> 10,
+@@ -719,54 +664,53 @@ void __init mem_init(void)
+ );
+
+ #if 1 /* double-sanity-check paranoia */
+- printk("virtual kernel memory layout:\n"
+- " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
++ printk(KERN_INFO "virtual kernel memory layout:\n"
++ " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
+ #ifdef CONFIG_HIGHMEM
+- " pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
++ " pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
+ #endif
+- " vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n"
+- " lowmem : 0x%08lx - 0x%08lx (%4ld MB)\n"
+- " .init : 0x%08lx - 0x%08lx (%4ld kB)\n"
+- " .data : 0x%08lx - 0x%08lx (%4ld kB)\n"
+- " .text : 0x%08lx - 0x%08lx (%4ld kB)\n",
+- FIXADDR_START, FIXADDR_TOP,
+- (FIXADDR_TOP - FIXADDR_START) >> 10,
++ " vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n"
++ " lowmem : 0x%08lx - 0x%08lx (%4ld MB)\n"
++ " .init : 0x%08lx - 0x%08lx (%4ld kB)\n"
++ " .data : 0x%08lx - 0x%08lx (%4ld kB)\n"
++ " .text : 0x%08lx - 0x%08lx (%4ld kB)\n",
++ FIXADDR_START, FIXADDR_TOP,
++ (FIXADDR_TOP - FIXADDR_START) >> 10,
+
+ #ifdef CONFIG_HIGHMEM
+- PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
+- (LAST_PKMAP*PAGE_SIZE) >> 10,
++ PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
++ (LAST_PKMAP*PAGE_SIZE) >> 10,
+ #endif
+
+- VMALLOC_START, VMALLOC_END,
+- (VMALLOC_END - VMALLOC_START) >> 20,
++ VMALLOC_START, VMALLOC_END,
++ (VMALLOC_END - VMALLOC_START) >> 20,
+
+- (unsigned long)__va(0), (unsigned long)high_memory,
+- ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20,
++ (unsigned long)__va(0), (unsigned long)high_memory,
++ ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20,
+
+- (unsigned long)&__init_begin, (unsigned long)&__init_end,
+- ((unsigned long)&__init_end - (unsigned long)&__init_begin) >> 10,
++ (unsigned long)&__init_begin, (unsigned long)&__init_end,
++ ((unsigned long)&__init_end -
++ (unsigned long)&__init_begin) >> 10,
+
+- (unsigned long)&_etext, (unsigned long)&_edata,
+- ((unsigned long)&_edata - (unsigned long)&_etext) >> 10,
++ (unsigned long)&_etext, (unsigned long)&_edata,
++ ((unsigned long)&_edata - (unsigned long)&_etext) >> 10,
+
+- (unsigned long)&_text, (unsigned long)&_etext,
+- ((unsigned long)&_etext - (unsigned long)&_text) >> 10);
++ (unsigned long)&_text, (unsigned long)&_etext,
++ ((unsigned long)&_etext - (unsigned long)&_text) >> 10);
+
+ #ifdef CONFIG_HIGHMEM
+- BUG_ON(PKMAP_BASE+LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
+- BUG_ON(VMALLOC_END > PKMAP_BASE);
++ BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
++ BUG_ON(VMALLOC_END > PKMAP_BASE);
+ #endif
+- BUG_ON(VMALLOC_START > VMALLOC_END);
+- BUG_ON((unsigned long)high_memory > VMALLOC_START);
++ BUG_ON(VMALLOC_START > VMALLOC_END);
++ BUG_ON((unsigned long)high_memory > VMALLOC_START);
+ #endif /* double-sanity-check paranoia */
+
+-#ifdef CONFIG_X86_PAE
+- if (!cpu_has_pae)
+- panic("cannot execute a PAE-enabled kernel on a PAE-less CPU!");
+-#endif
+ if (boot_cpu_data.wp_works_ok < 0)
+ test_wp_bit();
+
++ cpa_init();
++
+ /*
+ * Subtle. SMP is doing it's boot stuff late (because it has to
+ * fork idle threads) - but it also needs low mappings for the
+@@ -790,49 +734,35 @@ int arch_add_memory(int nid, u64 start,
+
+ return __add_pages(zone, start_pfn, nr_pages);
+ }
+-
+ #endif
+
+-struct kmem_cache *pmd_cache;
+-
+-void __init pgtable_cache_init(void)
+-{
+- if (PTRS_PER_PMD > 1)
+- pmd_cache = kmem_cache_create("pmd",
+- PTRS_PER_PMD*sizeof(pmd_t),
+- PTRS_PER_PMD*sizeof(pmd_t),
+- SLAB_PANIC,
+- pmd_ctor);
+-}
+-
+ /*
+ * This function cannot be __init, since exceptions don't work in that
+ * section. Put this after the callers, so that it cannot be inlined.
+ */
+-static int noinline do_test_wp_bit(void)
++static noinline int do_test_wp_bit(void)
+ {
+ char tmp_reg;
+ int flag;
+
+ __asm__ __volatile__(
+- " movb %0,%1 \n"
+- "1: movb %1,%0 \n"
+- " xorl %2,%2 \n"
++ " movb %0, %1 \n"
++ "1: movb %1, %0 \n"
++ " xorl %2, %2 \n"
+ "2: \n"
+- ".section __ex_table,\"a\"\n"
+- " .align 4 \n"
+- " .long 1b,2b \n"
+- ".previous \n"
++ _ASM_EXTABLE(1b,2b)
+ :"=m" (*(char *)fix_to_virt(FIX_WP_TEST)),
+ "=q" (tmp_reg),
+ "=r" (flag)
+ :"2" (1)
+ :"memory");
+-
++
+ return flag;
+ }
+
+ #ifdef CONFIG_DEBUG_RODATA
++const int rodata_test_data = 0xC3;
++EXPORT_SYMBOL_GPL(rodata_test_data);
+
+ void mark_rodata_ro(void)
+ {
+@@ -845,32 +775,58 @@ void mark_rodata_ro(void)
+ if (num_possible_cpus() <= 1)
+ #endif
+ {
+- change_page_attr(virt_to_page(start),
+- size >> PAGE_SHIFT, PAGE_KERNEL_RX);
+- printk("Write protecting the kernel text: %luk\n", size >> 10);
++ set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
++ printk(KERN_INFO "Write protecting the kernel text: %luk\n",
++ size >> 10);
++
++#ifdef CONFIG_CPA_DEBUG
++ printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n",
++ start, start+size);
++ set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT);
++
++ printk(KERN_INFO "Testing CPA: write protecting again\n");
++ set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
++#endif
+ }
+ #endif
+ start += size;
+ size = (unsigned long)__end_rodata - start;
+- change_page_attr(virt_to_page(start),
+- size >> PAGE_SHIFT, PAGE_KERNEL_RO);
+- printk("Write protecting the kernel read-only data: %luk\n",
+- size >> 10);
++ set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
++ printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
++ size >> 10);
++ rodata_test();
++
++#ifdef CONFIG_CPA_DEBUG
++ printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, start + size);
++ set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT);
+
+- /*
+- * change_page_attr() requires a global_flush_tlb() call after it.
+- * We do this after the printk so that if something went wrong in the
+- * change, the printk gets out at least to give a better debug hint
+- * of who is the culprit.
+- */
+- global_flush_tlb();
++ printk(KERN_INFO "Testing CPA: write protecting again\n");
++ set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
++#endif
+ }
+ #endif
+
+ void free_init_pages(char *what, unsigned long begin, unsigned long end)
+ {
++#ifdef CONFIG_DEBUG_PAGEALLOC
++ /*
++ * If debugging page accesses, do not free this memory but
++ * mark it not present - any buggy init-section access will
++ * create a kernel page fault:
++ */
++ printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
++ begin, PAGE_ALIGN(end));
++ set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
++#else
+ unsigned long addr;
+
++ /*
++ * We just marked the kernel text read only above, now that
++ * we are going to free part of that, we need to make that
++ * writeable first.
++ */
++ set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);
++
+ for (addr = begin; addr < end; addr += PAGE_SIZE) {
+ ClearPageReserved(virt_to_page(addr));
+ init_page_count(virt_to_page(addr));
+@@ -879,6 +835,7 @@ void free_init_pages(char *what, unsigne
+ totalram_pages++;
+ }
+ printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
++#endif
+ }
+
+ void free_initmem(void)
+@@ -894,4 +851,3 @@ void free_initrd_mem(unsigned long start
+ free_init_pages("initrd memory", start, end);
+ }
+ #endif
+-
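
The init_32-xen.c hunks above follow 2.6.25's CPA rework: the
change_page_attr()/global_flush_tlb() pairing gives way to
set_pages_ro()/set_pages_rw(), which flush internally, and mark_rodata_ro()
gains CONFIG_CPA_DEBUG self-tests. A minimal sketch of the old-to-new
translation, assuming 2.6.25's <asm/cacheflush.h> helpers
(protect_kernel_text() itself is a hypothetical wrapper, not in the patch):

	#include <linux/mm.h>		/* virt_to_page() */
	#include <asm/cacheflush.h>	/* set_pages_ro() */

	static void protect_kernel_text(unsigned long start, unsigned long size)
	{
		unsigned long npages = size >> PAGE_SHIFT;

		/* 2.6.24 and earlier: change the attribute, then flush
		 * explicitly, or stale TLB entries keep the old mapping:
		 *	change_page_attr(virt_to_page(start), npages, PAGE_KERNEL_RX);
		 *	global_flush_tlb();
		 */

		/* 2.6.25: one call; the CPA core flushes as needed. */
		set_pages_ro(virt_to_page(start), npages);
	}
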
+--- sle11-2009-06-29.orig/arch/x86/mm/init_64-xen.c 2009-02-16 16:18:36.000000000 +0100
++++ sle11-2009-06-29/arch/x86/mm/init_64-xen.c 2009-03-16 16:33:40.000000000 +0100
+@@ -46,14 +46,13 @@
+ #include <asm/proto.h>
+ #include <asm/smp.h>
+ #include <asm/sections.h>
++#include <asm/kdebug.h>
++#include <asm/numa.h>
++#include <asm/cacheflush.h>
+
+ #include <xen/features.h>
+
+-#ifndef Dprintk
+-#define Dprintk(x...)
+-#endif
+-
+-const struct dma_mapping_ops* dma_ops;
++const struct dma_mapping_ops *dma_ops;
+ EXPORT_SYMBOL(dma_ops);
+
+ #if CONFIG_XEN_COMPAT <= 0x030002
+@@ -80,7 +79,21 @@ extern pte_t level1_fixmap_pgt[PTRS_PER_
+ (((mfn_to_pfn((addr) >> PAGE_SHIFT)) << PAGE_SHIFT) + \
+ __START_KERNEL_map)))
+
+-static void __meminit early_make_page_readonly(void *va, unsigned int feature)
++pmd_t *__init early_get_pmd(unsigned long va)
++{
++ unsigned long addr;
++ unsigned long *page = (unsigned long *)init_level4_pgt;
++
++ addr = page[pgd_index(va)];
++ addr_to_page(addr, page);
++
++ addr = page[pud_index(va)];
++ addr_to_page(addr, page);
++
++ return (pmd_t *)&page[pmd_index(va)];
++}
++
++void __meminit early_make_page_readonly(void *va, unsigned int feature)
+ {
+ unsigned long addr, _va = (unsigned long)va;
+ pte_t pte, *ptep;
+@@ -107,76 +120,6 @@ static void __meminit early_make_page_re
+ BUG();
+ }
+
+-static void __make_page_readonly(void *va)
+-{
+- pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep;
+- unsigned long addr = (unsigned long) va;
+-
+- pgd = pgd_offset_k(addr);
+- pud = pud_offset(pgd, addr);
+- pmd = pmd_offset(pud, addr);
+- ptep = pte_offset_kernel(pmd, addr);
+-
+- pte.pte = ptep->pte & ~_PAGE_RW;
+- if (HYPERVISOR_update_va_mapping(addr, pte, 0))
+- xen_l1_entry_update(ptep, pte); /* fallback */
+-
+- if ((addr >= VMALLOC_START) && (addr < VMALLOC_END))
+- __make_page_readonly(__va(pte_pfn(pte) << PAGE_SHIFT));
+-}
+-
+-static void __make_page_writable(void *va)
+-{
+- pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep;
+- unsigned long addr = (unsigned long) va;
+-
+- pgd = pgd_offset_k(addr);
+- pud = pud_offset(pgd, addr);
+- pmd = pmd_offset(pud, addr);
+- ptep = pte_offset_kernel(pmd, addr);
+-
+- pte.pte = ptep->pte | _PAGE_RW;
+- if (HYPERVISOR_update_va_mapping(addr, pte, 0))
+- xen_l1_entry_update(ptep, pte); /* fallback */
+-
+- if ((addr >= VMALLOC_START) && (addr < VMALLOC_END))
+- __make_page_writable(__va(pte_pfn(pte) << PAGE_SHIFT));
+-}
+-
+-void make_page_readonly(void *va, unsigned int feature)
+-{
+- if (!xen_feature(feature))
+- __make_page_readonly(va);
+-}
+-
+-void make_page_writable(void *va, unsigned int feature)
+-{
+- if (!xen_feature(feature))
+- __make_page_writable(va);
+-}
+-
+-void make_pages_readonly(void *va, unsigned nr, unsigned int feature)
+-{
+- if (xen_feature(feature))
+- return;
+-
+- while (nr-- != 0) {
+- __make_page_readonly(va);
+- va = (void*)((unsigned long)va + PAGE_SIZE);
+- }
+-}
+-
+-void make_pages_writable(void *va, unsigned nr, unsigned int feature)
+-{
+- if (xen_feature(feature))
+- return;
+-
+- while (nr-- != 0) {
+- __make_page_writable(va);
+- va = (void*)((unsigned long)va + PAGE_SIZE);
+- }
+-}
+-
+ /*
+ * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the
+ * physical space so we can cache the place of the first one and move
+@@ -187,22 +130,26 @@ void show_mem(void)
+ {
+ long i, total = 0, reserved = 0;
+ long shared = 0, cached = 0;
+- pg_data_t *pgdat;
+ struct page *page;
++ pg_data_t *pgdat;
+
+ printk(KERN_INFO "Mem-info:\n");
+ show_free_areas();
+- printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
++ printk(KERN_INFO "Free swap: %6ldkB\n",
++ nr_swap_pages << (PAGE_SHIFT-10));
+
+ for_each_online_pgdat(pgdat) {
+- for (i = 0; i < pgdat->node_spanned_pages; ++i) {
+- /* this loop can take a while with 256 GB and 4k pages
+- so update the NMI watchdog */
+- if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) {
++ for (i = 0; i < pgdat->node_spanned_pages; ++i) {
++ /*
++ * This loop can take a while with 256 GB and
++ * 4k pages so defer the NMI watchdog:
++ */
++ if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
+ touch_nmi_watchdog();
+- }
++
+ if (!pfn_valid(pgdat->node_start_pfn + i))
+ continue;
++
+ page = pfn_to_page(pgdat->node_start_pfn + i);
+ total++;
+ if (PageReserved(page))
+@@ -211,58 +158,67 @@ void show_mem(void)
+ cached++;
+ else if (page_count(page))
+ shared += page_count(page) - 1;
+- }
++ }
+ }
+- printk(KERN_INFO "%lu pages of RAM\n", total);
+- printk(KERN_INFO "%lu reserved pages\n",reserved);
+- printk(KERN_INFO "%lu pages shared\n",shared);
+- printk(KERN_INFO "%lu pages swap cached\n",cached);
++ printk(KERN_INFO "%lu pages of RAM\n", total);
++ printk(KERN_INFO "%lu reserved pages\n", reserved);
++ printk(KERN_INFO "%lu pages shared\n", shared);
++ printk(KERN_INFO "%lu pages swap cached\n", cached);
+ }
+
++static unsigned long __meminitdata table_start;
++static unsigned long __meminitdata table_end;
+
+ static __init void *spp_getpage(void)
+-{
++{
+ void *ptr;
++
+ if (after_bootmem)
+- ptr = (void *) get_zeroed_page(GFP_ATOMIC);
++ ptr = (void *) get_zeroed_page(GFP_ATOMIC);
+ else if (start_pfn < table_end) {
+ ptr = __va(start_pfn << PAGE_SHIFT);
+ start_pfn++;
+ memset(ptr, 0, PAGE_SIZE);
+ } else
+ ptr = alloc_bootmem_pages(PAGE_SIZE);
+- if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
+- panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem?"after bootmem":"");
+
+- Dprintk("spp_getpage %p\n", ptr);
++ if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) {
++ panic("set_pte_phys: cannot allocate page data %s\n",
++ after_bootmem ? "after bootmem" : "");
++ }
++
++ pr_debug("spp_getpage %p\n", ptr);
++
+ return ptr;
+-}
++}
+
+ #define pgd_offset_u(address) (__user_pgd(init_level4_pgt) + pgd_index(address))
+ #define pud_offset_u(address) (level3_user_pgt + pud_index(address))
+
+-static __init void set_pte_phys(unsigned long vaddr,
+- unsigned long phys, pgprot_t prot, int user_mode)
++static __init void
++set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot, int user_mode)
+ {
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte, new_pte;
+
+- Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);
++ pr_debug("set_pte_phys %lx to %lx\n", vaddr, phys);
+
+ pgd = (user_mode ? pgd_offset_u(vaddr) : pgd_offset_k(vaddr));
+ if (pgd_none(*pgd)) {
+- printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
++ printk(KERN_ERR
++ "PGD FIXMAP MISSING, it should be setup in head.S!\n");
+ return;
+ }
+ pud = (user_mode ? pud_offset_u(vaddr) : pud_offset(pgd, vaddr));
+ if (pud_none(*pud)) {
+- pmd = (pmd_t *) spp_getpage();
++ pmd = (pmd_t *) spp_getpage();
+ make_page_readonly(pmd, XENFEAT_writable_page_tables);
+ set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
+ if (pmd != pmd_offset(pud, 0)) {
+- printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0));
++ printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
++ pmd, pmd_offset(pud, 0));
+ return;
+ }
+ }
+@@ -272,7 +228,7 @@ static __init void set_pte_phys(unsigned
+ make_page_readonly(pte, XENFEAT_writable_page_tables);
+ set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
+ if (pte != pte_offset_kernel(pmd, 0)) {
+- printk("PAGETABLE BUG #02!\n");
++ printk(KERN_ERR "PAGETABLE BUG #02!\n");
+ return;
+ }
+ }
+@@ -294,30 +250,30 @@ static __init void set_pte_phys(unsigned
+ __flush_tlb_one(vaddr);
+ }
+
+-static __init void set_pte_phys_ma(unsigned long vaddr,
+- unsigned long phys, pgprot_t prot)
++static __init void
++set_pte_phys_ma(unsigned long vaddr, unsigned long phys, pgprot_t prot)
+ {
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte, new_pte;
+
+- Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);
++ pr_debug("set_pte_phys_ma %lx to %lx\n", vaddr, phys);
+
+ pgd = pgd_offset_k(vaddr);
+ if (pgd_none(*pgd)) {
+- printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
++ printk(KERN_ERR
++ "PGD FIXMAP MISSING, it should be setup in head.S!\n");
+ return;
+ }
+ pud = pud_offset(pgd, vaddr);
+ if (pud_none(*pud)) {
+-
+- pmd = (pmd_t *) spp_getpage();
++ pmd = (pmd_t *) spp_getpage();
+ make_page_readonly(pmd, XENFEAT_writable_page_tables);
+ set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
+ if (pmd != pmd_offset(pud, 0)) {
+- printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0));
+- return;
++ printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
++ pmd, pmd_offset(pud, 0));
+ }
+ }
+ pmd = pmd_offset(pud, vaddr);
+@@ -326,7 +282,7 @@ static __init void set_pte_phys_ma(unsig
+ make_page_readonly(pte, XENFEAT_writable_page_tables);
+ set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
+ if (pte != pte_offset_kernel(pmd, 0)) {
+- printk("PAGETABLE BUG #02!\n");
++ printk(KERN_ERR "PAGETABLE BUG #02!\n");
+ return;
+ }
+ }
+@@ -350,14 +306,44 @@ static __init void set_pte_phys_ma(unsig
+ __flush_tlb_one(vaddr);
+ }
+
++#ifndef CONFIG_XEN
++/*
++ * The head.S code sets up the kernel high mapping:
++ *
++ * from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text)
++ *
++ * phys_addr holds the negative offset to the kernel, which is added
++ * to the compile time generated pmds. This results in invalid pmds up
++ * to the point where we hit the physaddr 0 mapping.
++ *
++ * We limit the mappings to the region from _text to _end. _end is
++ * rounded up to the 2MB boundary. This catches the invalid pmds as
++ * well, as they are located before _text:
++ */
++void __init cleanup_highmap(void)
++{
++ unsigned long vaddr = __START_KERNEL_map;
++ unsigned long end = round_up((unsigned long)_end, PMD_SIZE) - 1;
++ pmd_t *pmd = level2_kernel_pgt;
++ pmd_t *last_pmd = pmd + PTRS_PER_PMD;
++
++ for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) {
++ if (!pmd_present(*pmd))
++ continue;
++ if (vaddr < (unsigned long) _text || vaddr > end)
++ set_pmd(pmd, __pmd(0));
++ }
++}
++#endif
++
+ /* NOTE: this is meant to be run only at boot */
+-void __init
+-__set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
++void __init
++__set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
+ {
+ unsigned long address = __fix_to_virt(idx);
+
+ if (idx >= __end_of_fixed_addresses) {
+- printk("Invalid __set_fixmap\n");
++ printk(KERN_ERR "Invalid __set_fixmap\n");
+ return;
+ }
+ switch (idx) {
+@@ -375,16 +361,14 @@ __set_fixmap (enum fixed_addresses idx,
+ }
+ }
+
+-unsigned long __meminitdata table_start, table_end;
+-
+ static __meminit void *alloc_static_page(unsigned long *phys)
+ {
+ unsigned long va = (start_pfn << PAGE_SHIFT) + __START_KERNEL_map;
+
+ if (after_bootmem) {
+ void *adr = (void *)get_zeroed_page(GFP_ATOMIC);
+-
+ *phys = __pa(adr);
++
+ return adr;
+ }
+
+@@ -396,7 +380,7 @@ static __meminit void *alloc_static_page
+
+ #define PTE_SIZE PAGE_SIZE
+
+-static inline int make_readonly(unsigned long paddr)
++static inline int __meminit make_readonly(unsigned long paddr)
+ {
+ extern char __vsyscall_0;
+ int readonly = 0;
+@@ -430,33 +414,38 @@ static inline int make_readonly(unsigned
+ /* Must run before zap_low_mappings */
+ __meminit void *early_ioremap(unsigned long addr, unsigned long size)
+ {
+- unsigned long vaddr;
+ pmd_t *pmd, *last_pmd;
++ unsigned long vaddr;
+ int i, pmds;
+
+ pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
+ vaddr = __START_KERNEL_map;
+ pmd = level2_kernel_pgt;
+ last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1;
++
+ for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) {
+ for (i = 0; i < pmds; i++) {
+ if (pmd_present(pmd[i]))
+- goto next;
++ goto continue_outer_loop;
+ }
+ vaddr += addr & ~PMD_MASK;
+ addr &= PMD_MASK;
++
+ for (i = 0; i < pmds; i++, addr += PMD_SIZE)
+- set_pmd(pmd + i,__pmd(addr | _KERNPG_TABLE | _PAGE_PSE));
+- __flush_tlb();
++ set_pmd(pmd+i, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
++ __flush_tlb_all();
++
+ return (void *)vaddr;
+- next:
++continue_outer_loop:
+ ;
+ }
+ printk("early_ioremap(0x%lx, %lu) failed\n", addr, size);
+ return NULL;
+ }
+
+-/* To avoid virtual aliases later */
++/*
++ * To avoid virtual aliases later:
++ */
+ __meminit void early_iounmap(void *addr, unsigned long size)
+ {
+ unsigned long vaddr;
+@@ -466,9 +455,11 @@ __meminit void early_iounmap(void *addr,
+ vaddr = (unsigned long)addr;
+ pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
+ pmd = level2_kernel_pgt + pmd_index(vaddr);
++
+ for (i = 0; i < pmds; i++)
+ pmd_clear(pmd + i);
+- __flush_tlb();
++
++ __flush_tlb_all();
+ }
+ #endif
+
+@@ -517,18 +508,19 @@ phys_pmd_init(pmd_t *pmd_page, unsigned
+ static void __meminit
+ phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
+ {
+- pmd_t *pmd = pmd_offset(pud,0);
++ pmd_t *pmd = pmd_offset(pud, 0);
+ spin_lock(&init_mm.page_table_lock);
+ phys_pmd_init(pmd, address, end);
+ spin_unlock(&init_mm.page_table_lock);
+ __flush_tlb_all();
+ }
+
+-static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
+-{
++static void __meminit
++phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
++{
+ int i = pud_index(addr);
+
+- for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE ) {
++ for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) {
+ unsigned long pmd_phys;
+ pud_t *pud = pud_page + pud_index(addr);
+ pmd_t *pmd;
+@@ -550,8 +542,8 @@ static void __meminit phys_pud_init(pud_
+
+ early_make_page_readonly(pmd, XENFEAT_writable_page_tables);
+ }
+- __flush_tlb();
+-}
++ __flush_tlb_all();
++}
+
+ void __init xen_init_pt(void)
+ {
+@@ -632,6 +624,7 @@ void __init xen_init_pt(void)
+ static void __init extend_init_mapping(unsigned long tables_space)
+ {
+ unsigned long va = __START_KERNEL_map;
++ unsigned long start = start_pfn;
+ unsigned long phys, addr, *pte_page;
+ pmd_t *pmd;
+ pte_t *pte, new_pte;
+@@ -682,6 +675,10 @@ static void __init extend_init_mapping(u
+ BUG();
+ va += PAGE_SIZE;
+ }
++
++ if (start_pfn > start)
++ reserve_early(start << PAGE_SHIFT,
++ start_pfn << PAGE_SHIFT, "INITMAP");
+ }
+
+ static void __init find_early_table_space(unsigned long end)
+@@ -706,7 +703,7 @@ static void __init find_early_table_spac
+ (table_start << PAGE_SHIFT) + tables);
+ }
+
+-static void xen_finish_init_mapping(void)
++static void __init xen_finish_init_mapping(void)
+ {
+ unsigned long i, start, end;
+
+@@ -738,13 +735,6 @@ static void xen_finish_init_mapping(void
+ /* Allocate pte's for initial fixmaps from 'start_pfn' allocator. */
+ table_end = ~0UL;
+
+- /*
+- * Prefetch pte's for the bt_ioremap() area. It gets used before the
+- * boot-time allocator is online, so allocate-on-demand would fail.
+- */
+- for (i = FIX_BTMAP_END; i <= FIX_BTMAP_BEGIN; i++)
+- __set_fixmap(i, 0, __pgprot(0));
+-
+ /* Switch to the real shared_info page, and clear the dummy page. */
+ set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
+ HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
+@@ -764,20 +754,23 @@ static void xen_finish_init_mapping(void
+ table_end = start_pfn;
+ }
+
+-/* Setup the direct mapping of the physical memory at PAGE_OFFSET.
+- This runs before bootmem is initialized and gets pages directly from the
+- physical memory. To access them they are temporarily mapped. */
++/*
++ * Setup the direct mapping of the physical memory at PAGE_OFFSET.
++ * This runs before bootmem is initialized and gets pages directly from
++ * the physical memory. To access them they are temporarily mapped.
++ */
+ void __init_refok init_memory_mapping(unsigned long start, unsigned long end)
+-{
++{
+ unsigned long next;
+
+- Dprintk("init_memory_mapping\n");
++ pr_debug("init_memory_mapping\n");
+
+- /*
++ /*
+ * Find space for the kernel direct mapping tables.
+- * Later we should allocate these tables in the local node of the memory
+- * mapped. Unfortunately this is done currently before the nodes are
+- * discovered.
++ *
++ * Later we should allocate these tables in the local node of the
++ * memory mapped. Unfortunately this is done currently before the
++ * nodes are discovered.
+ */
+ if (!after_bootmem)
+ find_early_table_space(end);
+@@ -786,8 +779,8 @@ void __init_refok init_memory_mapping(un
+ end = (unsigned long)__va(end);
+
+ for (; start < end; start = next) {
+- unsigned long pud_phys;
+ pgd_t *pgd = pgd_offset_k(start);
++ unsigned long pud_phys;
+ pud_t *pud;
+
+ if (after_bootmem)
+@@ -795,8 +788,8 @@ void __init_refok init_memory_mapping(un
+ else
+ pud = alloc_static_page(&pud_phys);
+ next = start + PGDIR_SIZE;
+- if (next > end)
+- next = end;
++ if (next > end)
++ next = end;
+ phys_pud_init(pud, __pa(start), __pa(next));
+ if (!after_bootmem) {
+ early_make_page_readonly(pud, XENFEAT_writable_page_tables);
+@@ -810,12 +803,17 @@ void __init_refok init_memory_mapping(un
+ }
+
+ __flush_tlb_all();
++
++ if (!after_bootmem)
++ reserve_early(table_start << PAGE_SHIFT,
++ table_end << PAGE_SHIFT, "PGTABLE");
+ }
+
+ #ifndef CONFIG_NUMA
+ void __init paging_init(void)
+ {
+ unsigned long max_zone_pfns[MAX_NR_ZONES];
++
+ memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
+ max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
+ max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
+@@ -829,40 +827,6 @@ void __init paging_init(void)
+ }
+ #endif
+
+-/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
+- from the CPU leading to inconsistent cache lines. address and size
+- must be aligned to 2MB boundaries.
+- Does nothing when the mapping doesn't exist. */
+-void __init clear_kernel_mapping(unsigned long address, unsigned long size)
+-{
+- unsigned long end = address + size;
+-
+- BUG_ON(address & ~LARGE_PAGE_MASK);
+- BUG_ON(size & ~LARGE_PAGE_MASK);
+-
+- for (; address < end; address += LARGE_PAGE_SIZE) {
+- pgd_t *pgd = pgd_offset_k(address);
+- pud_t *pud;
+- pmd_t *pmd;
+- if (pgd_none(*pgd))
+- continue;
+- pud = pud_offset(pgd, address);
+- if (pud_none(*pud))
+- continue;
+- pmd = pmd_offset(pud, address);
+- if (!pmd || pmd_none(*pmd))
+- continue;
+- if (0 == (__pmd_val(*pmd) & _PAGE_PSE)) {
+- /* Could handle this, but it should not happen currently. */
+- printk(KERN_ERR
+- "clear_kernel_mapping: mapping has been split. will leak memory\n");
+- pmd_ERROR(*pmd);
+- }
+- set_pmd(pmd, __pmd(0));
+- }
+- __flush_tlb_all();
+-}
+-
+ /*
+ * Memory hotplug specific functions
+ */
+@@ -888,16 +852,12 @@ int arch_add_memory(int nid, u64 start,
+ unsigned long nr_pages = size >> PAGE_SHIFT;
+ int ret;
+
+- init_memory_mapping(start, (start + size -1));
++ init_memory_mapping(start, start + size-1);
+
+ ret = __add_pages(zone, start_pfn, nr_pages);
+- if (ret)
+- goto error;
++ WARN_ON(1);
+
+ return ret;
+-error:
+- printk("%s: Problem encountered in __add_pages!\n", __func__);
+- return ret;
+ }
+ EXPORT_SYMBOL_GPL(arch_add_memory);
+
+@@ -911,36 +871,8 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to
+
+ #endif /* CONFIG_MEMORY_HOTPLUG */
+
+-#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
+-/*
+- * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance,
+- * just online the pages.
+- */
+-int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages)
+-{
+- int err = -EIO;
+- unsigned long pfn;
+- unsigned long total = 0, mem = 0;
+- for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
+- if (pfn_valid(pfn)) {
+- online_page(pfn_to_page(pfn));
+- err = 0;
+- mem++;
+- }
+- total++;
+- }
+- if (!err) {
+- z->spanned_pages += total;
+- z->present_pages += mem;
+- z->zone_pgdat->node_spanned_pages += total;
+- z->zone_pgdat->node_present_pages += mem;
+- }
+- return err;
+-}
+-#endif
+-
+-static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
+- kcore_vsyscall;
++static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel,
++ kcore_modules, kcore_vsyscall;
+
+ void __init mem_init(void)
+ {
+@@ -949,8 +881,7 @@ void __init mem_init(void)
+
+ pci_iommu_alloc();
+
+- /* clear the zero-page */
+- memset(empty_zero_page, 0, PAGE_SIZE);
++	/* clear_bss() already cleared the empty_zero_page */
+
+ reservedpages = 0;
+
+@@ -968,7 +899,6 @@ void __init mem_init(void)
+ }
+ reservedpages = end_pfn - totalram_pages -
+ absent_pages_in_range(0, end_pfn);
+-
+ after_bootmem = 1;
+
+ codesize = (unsigned long) &_etext - (unsigned long) &_text;
+@@ -976,46 +906,64 @@ void __init mem_init(void)
+ initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
+
+ /* Register memory areas for /proc/kcore */
+- kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
+- kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
++ kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
++ kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
+ VMALLOC_END-VMALLOC_START);
+ kclist_add(&kcore_kernel, &_stext, _end - _stext);
+ kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
+- kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
++ kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
+ VSYSCALL_END - VSYSCALL_START);
+
+- printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n",
++ printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
++ "%ldk reserved, %ldk data, %ldk init)\n",
+ (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
+ end_pfn << (PAGE_SHIFT-10),
+ codesize >> 10,
+ reservedpages << (PAGE_SHIFT-10),
+ datasize >> 10,
+ initsize >> 10);
++
++ cpa_init();
+ }
+
+ void free_init_pages(char *what, unsigned long begin, unsigned long end)
+ {
+- unsigned long addr;
++ unsigned long addr = begin;
+
+- if (begin >= end)
++ if (addr >= end)
+ return;
+
++ /*
++	 * If debugging page accesses then do not free these pages but
++	 * mark them not present - any buggy init-section access will
++ * create a kernel page fault:
++ */
++#ifdef CONFIG_DEBUG_PAGEALLOC
++ printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
++ begin, PAGE_ALIGN(end));
++ set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
++#else
+ printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
+- for (addr = begin; addr < end; addr += PAGE_SIZE) {
++
++ for (; addr < end; addr += PAGE_SIZE) {
+ ClearPageReserved(virt_to_page(addr));
+ init_page_count(virt_to_page(addr));
+ memset((void *)(addr & ~(PAGE_SIZE-1)),
+ POISON_FREE_INITMEM, PAGE_SIZE);
+ if (addr >= __START_KERNEL_map) {
+ /* make_readonly() reports all kernel addresses. */
+- __make_page_writable(__va(__pa(addr)));
+- change_page_attr_addr(addr, 1, __pgprot(0));
++ if (HYPERVISOR_update_va_mapping((unsigned long)__va(__pa(addr)),
++ pfn_pte(__pa(addr) >> PAGE_SHIFT,
++ PAGE_KERNEL),
++ 0))
++ BUG();
++ if (HYPERVISOR_update_va_mapping(addr, __pte(0), 0))
++ BUG();
+ }
+ free_page(addr);
+ totalram_pages++;
+ }
+- if (addr > __START_KERNEL_map)
+- global_flush_tlb();
++#endif
+ }
+
+ void free_initmem(void)
+@@ -1026,6 +974,8 @@ void free_initmem(void)
+ }
+
+ #ifdef CONFIG_DEBUG_RODATA
++const int rodata_test_data = 0xC3;
++EXPORT_SYMBOL_GPL(rodata_test_data);
+
+ void mark_rodata_ro(void)
+ {
+@@ -1047,18 +997,27 @@ void mark_rodata_ro(void)
+ if (end <= start)
+ return;
+
+- change_page_attr_addr(start, (end - start) >> PAGE_SHIFT, PAGE_KERNEL_RO);
+
+ printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
+ (end - start) >> 10);
++ set_memory_ro(start, (end - start) >> PAGE_SHIFT);
+
+ /*
+- * change_page_attr_addr() requires a global_flush_tlb() call after it.
+- * We do this after the printk so that if something went wrong in the
+- * change, the printk gets out at least to give a better debug hint
+- * of who is the culprit.
++ * The rodata section (but not the kernel text!) should also be
++ * not-executable.
+ */
+- global_flush_tlb();
++ start = ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
++ set_memory_nx(start, (end - start) >> PAGE_SHIFT);
++
++ rodata_test();
++
++#ifdef CONFIG_CPA_DEBUG
++ printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end);
++ set_memory_rw(start, (end-start) >> PAGE_SHIFT);
++
++ printk(KERN_INFO "Testing CPA: again\n");
++ set_memory_ro(start, (end-start) >> PAGE_SHIFT);
++#endif
+ }
+ #endif
+
+@@ -1069,17 +1028,21 @@ void free_initrd_mem(unsigned long start
+ }
+ #endif
+
+-void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
+-{
++void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
++{
+ #ifdef CONFIG_NUMA
+ int nid = phys_to_nid(phys);
+ #endif
+ unsigned long pfn = phys >> PAGE_SHIFT;
++
+ if (pfn >= end_pfn) {
+- /* This can happen with kdump kernels when accessing firmware
+- tables. */
++ /*
++ * This can happen with kdump kernels when accessing
++ * firmware tables:
++ */
+ if (pfn < end_pfn_map)
+ return;
++
+ printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
+ phys, len);
+ return;
+@@ -1087,9 +1050,9 @@ void __init reserve_bootmem_generic(unsi
+
+ /* Should check here against the e820 map to avoid double free */
+ #ifdef CONFIG_NUMA
+- reserve_bootmem_node(NODE_DATA(nid), phys, len);
+-#else
+- reserve_bootmem(phys, len);
++ reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT);
++#else
++ reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
+ #endif
+ #ifndef CONFIG_XEN
+ if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
+@@ -1099,46 +1062,49 @@ void __init reserve_bootmem_generic(unsi
+ #endif
+ }
+
+-int kern_addr_valid(unsigned long addr)
+-{
++int kern_addr_valid(unsigned long addr)
++{
+ unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
+- pgd_t *pgd;
+- pud_t *pud;
+- pmd_t *pmd;
+- pte_t *pte;
++ pgd_t *pgd;
++ pud_t *pud;
++ pmd_t *pmd;
++ pte_t *pte;
+
+ if (above != 0 && above != -1UL)
+- return 0;
+-
++ return 0;
++
+ pgd = pgd_offset_k(addr);
+ if (pgd_none(*pgd))
+ return 0;
+
+ pud = pud_offset(pgd, addr);
+ if (pud_none(*pud))
+- return 0;
++ return 0;
+
+ pmd = pmd_offset(pud, addr);
+ if (pmd_none(*pmd))
+ return 0;
++
+ if (pmd_large(*pmd))
+ return pfn_valid(pmd_pfn(*pmd));
+
+ pte = pte_offset_kernel(pmd, addr);
+ if (pte_none(*pte))
+ return 0;
++
+ return pfn_valid(pte_pfn(*pte));
+ }
+
+-/* A pseudo VMA to allow ptrace access for the vsyscall page. This only
+- covers the 64bit vsyscall page now. 32bit has a real VMA now and does
+- not need special handling anymore. */
+-
++/*
++ * A pseudo VMA to allow ptrace access for the vsyscall page. This only
++ * covers the 64bit vsyscall page now. 32bit has a real VMA now and does
++ * not need special handling anymore:
++ */
+ static struct vm_area_struct gate_vma = {
+- .vm_start = VSYSCALL_START,
+- .vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES << PAGE_SHIFT),
+- .vm_page_prot = PAGE_READONLY_EXEC,
+- .vm_flags = VM_READ | VM_EXEC
++ .vm_start = VSYSCALL_START,
++ .vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES * PAGE_SIZE),
++ .vm_page_prot = PAGE_READONLY_EXEC,
++ .vm_flags = VM_READ | VM_EXEC
+ };
+
+ struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
+@@ -1153,14 +1119,17 @@ struct vm_area_struct *get_gate_vma(stru
+ int in_gate_area(struct task_struct *task, unsigned long addr)
+ {
+ struct vm_area_struct *vma = get_gate_vma(task);
++
+ if (!vma)
+ return 0;
++
+ return (addr >= vma->vm_start) && (addr < vma->vm_end);
+ }
+
+-/* Use this when you have no reliable task/vma, typically from interrupt
+- * context. It is less reliable than using the task's vma and may give
+- * false positives.
++/*
++ * Use this when you have no reliable task/vma, typically from interrupt
++ * context. It is less reliable than using the task's vma and may give
++ * false positives:
+ */
+ int in_gate_area_no_task(unsigned long addr)
+ {
+@@ -1180,8 +1149,8 @@ const char *arch_vma_name(struct vm_area
+ /*
+ * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
+ */
+-int __meminit vmemmap_populate(struct page *start_page,
+- unsigned long size, int node)
++int __meminit
++vmemmap_populate(struct page *start_page, unsigned long size, int node)
+ {
+ unsigned long addr = (unsigned long)start_page;
+ unsigned long end = (unsigned long)(start_page + size);
+@@ -1196,6 +1165,7 @@ int __meminit vmemmap_populate(struct pa
+ pgd = vmemmap_pgd_populate(addr, node);
+ if (!pgd)
+ return -ENOMEM;
++
+ pud = vmemmap_pud_populate(pgd, addr, node);
+ if (!pud)
+ return -ENOMEM;
+@@ -1203,20 +1173,22 @@ int __meminit vmemmap_populate(struct pa
+ pmd = pmd_offset(pud, addr);
+ if (pmd_none(*pmd)) {
+ pte_t entry;
+- void *p = vmemmap_alloc_block(PMD_SIZE, node);
++ void *p;
++
++ p = vmemmap_alloc_block(PMD_SIZE, node);
+ if (!p)
+ return -ENOMEM;
+
+- entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
+- mk_pte_huge(entry);
+- set_pmd(pmd, __pmd(pte_val(entry)));
++ entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
++ PAGE_KERNEL_LARGE);
++ set_pmd(pmd, __pmd_ma(__pte_val(entry)));
+
+ printk(KERN_DEBUG " [%lx-%lx] PMD ->%p on node %d\n",
+ addr, addr + PMD_SIZE - 1, p, node);
+- } else
++ } else {
+ vmemmap_verify((pte_t *)pmd, node, addr, next);
++ }
+ }
+-
+ return 0;
+ }
+ #endif
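
The reworked free_init_pages() in init_64-xen.c above decides at compile
time between unmapping and freeing, and on Xen replaces the old
__make_page_writable()/change_page_attr_addr() pair with direct
hypercalls. A condensed sketch of the per-page step, using only names
from the patch (the wrapper function itself is hypothetical):

	static void __free_one_init_page(unsigned long addr)
	{
	#ifdef CONFIG_DEBUG_PAGEALLOC
		/* Keep the page, but make any late init-section access fault. */
		set_memory_np(addr, 1);
	#else
		ClearPageReserved(virt_to_page(addr));
		init_page_count(virt_to_page(addr));
		memset((void *)(addr & ~(PAGE_SIZE-1)),
		       POISON_FREE_INITMEM, PAGE_SIZE);
		if (addr >= __START_KERNEL_map) {
			/* Restore write access through the 1:1 alias, then
			 * unmap the high alias, both via the hypervisor. */
			if (HYPERVISOR_update_va_mapping(
					(unsigned long)__va(__pa(addr)),
					pfn_pte(__pa(addr) >> PAGE_SHIFT,
						PAGE_KERNEL), 0))
				BUG();
			if (HYPERVISOR_update_va_mapping(addr, __pte(0), 0))
				BUG();
		}
		free_page(addr);
		totalram_pages++;
	#endif
	}
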
+--- /dev/null 1970-01-01 00:00:00.000000000 +0000
++++ sle11-2009-06-29/arch/x86/mm/ioremap-xen.c 2009-03-16 16:33:40.000000000 +0100
+@@ -0,0 +1,687 @@
++/*
++ * Re-map IO memory to kernel address space so that we can access it.
++ * This is needed for high PCI addresses that aren't mapped in the
++ * 640k-1MB IO memory area on PC's
++ *
++ * (C) Copyright 1995 1996 Linus Torvalds
++ */
++
++#include <linux/bootmem.h>
++#include <linux/init.h>
++#include <linux/io.h>
++#include <linux/module.h>
++#include <linux/pfn.h>
++#include <linux/slab.h>
++#include <linux/vmalloc.h>
++
++#include <asm/cacheflush.h>
++#include <asm/e820.h>
++#include <asm/fixmap.h>
++#include <asm/pgtable.h>
++#include <asm/tlbflush.h>
++#include <asm/pgalloc.h>
++
++enum ioremap_mode {
++ IOR_MODE_UNCACHED,
++ IOR_MODE_CACHED,
++};
++
++#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
++
++unsigned long __phys_addr(unsigned long x)
++{
++ if (x >= __START_KERNEL_map)
++ return x - __START_KERNEL_map + phys_base;
++ return x - PAGE_OFFSET;
++}
++EXPORT_SYMBOL(__phys_addr);
++
++#endif
++
++static int direct_remap_area_pte_fn(pte_t *pte,
++ struct page *pmd_page,
++ unsigned long address,
++ void *data)
++{
++ mmu_update_t **v = (mmu_update_t **)data;
++
++ BUG_ON(!pte_none(*pte));
++
++ (*v)->ptr = ((u64)pfn_to_mfn(page_to_pfn(pmd_page)) <<
++ PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK);
++ (*v)++;
++
++ return 0;
++}
++
++static int __direct_remap_pfn_range(struct mm_struct *mm,
++ unsigned long address,
++ unsigned long mfn,
++ unsigned long size,
++ pgprot_t prot,
++ domid_t domid)
++{
++ int rc;
++ unsigned long i, start_address;
++ mmu_update_t *u, *v, *w;
++
++ u = v = w = (mmu_update_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
++ if (u == NULL)
++ return -ENOMEM;
++
++ start_address = address;
++
++ flush_cache_all();
++
++ for (i = 0; i < size; i += PAGE_SIZE) {
++ if ((v - u) == (PAGE_SIZE / sizeof(mmu_update_t))) {
++ /* Flush a full batch after filling in the PTE ptrs. */
++ rc = apply_to_page_range(mm, start_address,
++ address - start_address,
++ direct_remap_area_pte_fn, &w);
++ if (rc)
++ goto out;
++ rc = -EFAULT;
++ if (HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0)
++ goto out;
++ v = w = u;
++ start_address = address;
++ }
++
++ /*
++ * Fill in the machine address: PTE ptr is done later by
++ * apply_to_page_range().
++ */
++ v->val = __pte_val(pfn_pte_ma(mfn, prot)) | _PAGE_IO;
++
++ mfn++;
++ address += PAGE_SIZE;
++ v++;
++ }
++
++ if (v != u) {
++ /* Final batch. */
++ rc = apply_to_page_range(mm, start_address,
++ address - start_address,
++ direct_remap_area_pte_fn, &w);
++ if (rc)
++ goto out;
++ rc = -EFAULT;
++ if (unlikely(HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0))
++ goto out;
++ }
++
++ rc = 0;
++
++ out:
++ flush_tlb_all();
++
++ free_page((unsigned long)u);
++
++ return rc;
++}
++
++int direct_remap_pfn_range(struct vm_area_struct *vma,
++ unsigned long address,
++ unsigned long mfn,
++ unsigned long size,
++ pgprot_t prot,
++ domid_t domid)
++{
++ if (xen_feature(XENFEAT_auto_translated_physmap))
++ return remap_pfn_range(vma, address, mfn, size, prot);
++
++ if (domid == DOMID_SELF)
++ return -EINVAL;
++
++ vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
++
++ vma->vm_mm->context.has_foreign_mappings = 1;
++
++ return __direct_remap_pfn_range(
++ vma->vm_mm, address, mfn, size, prot, domid);
++}
++EXPORT_SYMBOL(direct_remap_pfn_range);
++
++int direct_kernel_remap_pfn_range(unsigned long address,
++ unsigned long mfn,
++ unsigned long size,
++ pgprot_t prot,
++ domid_t domid)
++{
++ return __direct_remap_pfn_range(
++ &init_mm, address, mfn, size, prot, domid);
++}
++EXPORT_SYMBOL(direct_kernel_remap_pfn_range);
++
++static int lookup_pte_fn(
++ pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
++{
++ uint64_t *ptep = (uint64_t *)data;
++ if (ptep)
++ *ptep = ((uint64_t)pfn_to_mfn(page_to_pfn(pmd_page)) <<
++ PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK);
++ return 0;
++}
++
++int create_lookup_pte_addr(struct mm_struct *mm,
++ unsigned long address,
++ uint64_t *ptep)
++{
++ return apply_to_page_range(mm, address, PAGE_SIZE,
++ lookup_pte_fn, ptep);
++}
++
++EXPORT_SYMBOL(create_lookup_pte_addr);
++
++static int noop_fn(
++ pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
++{
++ return 0;
++}
++
++int touch_pte_range(struct mm_struct *mm,
++ unsigned long address,
++ unsigned long size)
++{
++ return apply_to_page_range(mm, address, size, noop_fn, NULL);
++}
++
++EXPORT_SYMBOL(touch_pte_range);
++
++#ifdef CONFIG_X86_32
++int page_is_ram(unsigned long pagenr)
++{
++ unsigned long addr, end;
++ int i;
++
++#ifndef CONFIG_XEN
++ /*
++	 * A special case is the first 4Kb of memory:
++	 * this is a BIOS-owned area, not kernel RAM, but generally
++ * not listed as such in the E820 table.
++ */
++ if (pagenr == 0)
++ return 0;
++
++ /*
++ * Second special case: Some BIOSen report the PC BIOS
++ * area (640->1Mb) as ram even though it is not.
++ */
++ if (pagenr >= (BIOS_BEGIN >> PAGE_SHIFT) &&
++ pagenr < (BIOS_END >> PAGE_SHIFT))
++ return 0;
++#endif
++
++ for (i = 0; i < e820.nr_map; i++) {
++ /*
++ * Not usable memory:
++ */
++ if (e820.map[i].type != E820_RAM)
++ continue;
++ addr = (e820.map[i].addr + PAGE_SIZE-1) >> PAGE_SHIFT;
++ end = (e820.map[i].addr + e820.map[i].size) >> PAGE_SHIFT;
++
++
++ if ((pagenr >= addr) && (pagenr < end))
++ return 1;
++ }
++ return 0;
++}
++#endif
++
++/*
++ * Fix up the linear direct mapping of the kernel to avoid cache attribute
++ * conflicts.
++ */
++static int ioremap_change_attr(unsigned long vaddr, unsigned long size,
++ enum ioremap_mode mode)
++{
++ unsigned long nrpages = size >> PAGE_SHIFT;
++ int err;
++
++ switch (mode) {
++ case IOR_MODE_UNCACHED:
++ default:
++ err = set_memory_uc(vaddr, nrpages);
++ break;
++ case IOR_MODE_CACHED:
++ err = set_memory_wb(vaddr, nrpages);
++ break;
++ }
++
++ return err;
++}
++
++/*
++ * Remap an arbitrary physical address space into the kernel virtual
++ * address space. Needed when the kernel wants to access high addresses
++ * directly.
++ *
++ * NOTE! We need to allow non-page-aligned mappings too: we will obviously
++ * have to convert them into an offset in a page-aligned mapping, but the
++ * caller shouldn't need to know that small detail.
++ */
++static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size,
++ enum ioremap_mode mode)
++{
++ unsigned long mfn, offset, last_addr, vaddr;
++ struct vm_struct *area;
++ pgprot_t prot;
++ domid_t domid = DOMID_IO;
++
++ /* Don't allow wraparound or zero size */
++ last_addr = phys_addr + size - 1;
++ if (!size || last_addr < phys_addr)
++ return NULL;
++
++ /*
++ * Don't remap the low PCI/ISA area, it's always mapped..
++ */
++ if (is_initial_xendomain() && last_addr < ISA_END_ADDRESS)
++ return (__force void __iomem *)isa_bus_to_virt((unsigned long)phys_addr);
++
++ /*
++ * Don't allow anybody to remap normal RAM that we're using..
++ */
++ for (mfn = PFN_DOWN(phys_addr); mfn < PFN_UP(last_addr); mfn++) {
++ unsigned long pfn = mfn_to_local_pfn(mfn);
++
++ if (pfn >= max_pfn)
++ continue;
++
++ domid = DOMID_SELF;
++
++ if (pfn >= max_pfn_mapped) /* bogus */
++ continue;
++
++ if (pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn)))
++ return NULL;
++ }
++
++ switch (mode) {
++ case IOR_MODE_UNCACHED:
++ default:
++ /*
++ * FIXME: we will use UC MINUS for now, as video fb drivers
++ * depend on it. Upcoming ioremap_wc() will fix this behavior.
++ */
++ prot = PAGE_KERNEL_UC_MINUS;
++ break;
++ case IOR_MODE_CACHED:
++ prot = PAGE_KERNEL;
++ break;
++ }
++
++ /*
++ * Mappings have to be page-aligned
++ */
++ offset = phys_addr & ~PAGE_MASK;
++ phys_addr &= PAGE_MASK;
++ size = PAGE_ALIGN(last_addr+1) - phys_addr;
++
++ /*
++ * Ok, go for it..
++ */
++ area = get_vm_area(size, VM_IOREMAP | (mode << 20));
++ if (!area)
++ return NULL;
++ area->phys_addr = phys_addr;
++ vaddr = (unsigned long) area->addr;
++ if (__direct_remap_pfn_range(&init_mm, vaddr, PFN_DOWN(phys_addr),
++ size, prot, domid)) {
++ free_vm_area(area);
++ return NULL;
++ }
++
++ if (ioremap_change_attr(vaddr, size, mode) < 0) {
++ iounmap((void __iomem *) vaddr);
++ return NULL;
++ }
++
++ return (void __iomem *) (vaddr + offset);
++}
++
++/**
++ * ioremap_nocache - map bus memory into CPU space
++ * @offset: bus address of the memory
++ * @size: size of the resource to map
++ *
++ * ioremap_nocache performs a platform specific sequence of operations to
++ * make bus memory CPU accessible via the readb/readw/readl/writeb/
++ * writew/writel functions and the other mmio helpers. The returned
++ * address is not guaranteed to be usable directly as a virtual
++ * address.
++ *
++ * This version of ioremap ensures that the memory is marked uncachable
++ * on the CPU as well as honouring existing caching rules from things like
++ * the PCI bus. Note that there are other caches and buffers on many
++ * busses. In particular driver authors should read up on PCI writes
++ *
++ * It's useful if some control registers are in such an area and
++ * write combining or read caching is not desirable:
++ *
++ * Must be freed with iounmap.
++ */
++void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size)
++{
++ return __ioremap(phys_addr, size, IOR_MODE_UNCACHED);
++}
++EXPORT_SYMBOL(ioremap_nocache);
++
++void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
++{
++ return __ioremap(phys_addr, size, IOR_MODE_CACHED);
++}
++EXPORT_SYMBOL(ioremap_cache);
++
++/**
++ * iounmap - Free an IO remapping
++ * @addr: virtual address from ioremap_*
++ *
++ * Caller must ensure there is only one unmapping for the same pointer.
++ */
++void iounmap(volatile void __iomem *addr)
++{
++ struct vm_struct *p, *o;
++
++ if ((void __force *)addr <= high_memory)
++ return;
++
++ /*
++ * __ioremap special-cases the PCI/ISA range by not instantiating a
++ * vm_area and by simply returning an address into the kernel mapping
++ * of ISA space. So handle that here.
++ */
++ if ((unsigned long)addr >= fix_to_virt(FIX_ISAMAP_BEGIN))
++ return;
++
++ addr = (volatile void __iomem *)
++ (PAGE_MASK & (unsigned long __force)addr);
++
++ /* Use the vm area unlocked, assuming the caller
++ ensures there isn't another iounmap for the same address
++ in parallel. Reuse of the virtual address is prevented by
++ leaving it in the global lists until we're done with it.
++ cpa takes care of the direct mappings. */
++ read_lock(&vmlist_lock);
++ for (p = vmlist; p; p = p->next) {
++ if (p->addr == addr)
++ break;
++ }
++ read_unlock(&vmlist_lock);
++
++ if (!p) {
++ printk(KERN_ERR "iounmap: bad address %p\n", addr);
++ dump_stack();
++ return;
++ }
++
++ if ((p->flags >> 20) != IOR_MODE_CACHED) {
++ unsigned long n = get_vm_area_size(p) >> PAGE_SHIFT;
++ unsigned long mfn = p->phys_addr;
++ unsigned long va = (unsigned long)addr;
++
++ for (; n > 0; n--, mfn++, va += PAGE_SIZE)
++ if (mfn_to_local_pfn(mfn) < max_pfn)
++ set_memory_wb(va, 1);
++ }
++
++ /* Finally remove it */
++ o = remove_vm_area((void *)addr);
++ BUG_ON(p != o || o == NULL);
++ kfree(p);
++}
++EXPORT_SYMBOL(iounmap);
++
++int __initdata early_ioremap_debug;
++
++static int __init early_ioremap_debug_setup(char *str)
++{
++ early_ioremap_debug = 1;
++
++ return 0;
++}
++early_param("early_ioremap_debug", early_ioremap_debug_setup);
++
++static __initdata int after_paging_init;
++static __initdata pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)]
++ __attribute__((aligned(PAGE_SIZE)));
++
++#ifdef CONFIG_X86_32
++static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
++{
++ /* Don't assume we're using swapper_pg_dir at this point */
++ pgd_t *base = __va(read_cr3());
++ pgd_t *pgd = &base[pgd_index(addr)];
++ pud_t *pud = pud_offset(pgd, addr);
++ pmd_t *pmd = pmd_offset(pud, addr);
++
++ return pmd;
++}
++#else
++#define early_ioremap_pmd early_get_pmd
++#define make_lowmem_page_readonly early_make_page_readonly
++#define make_lowmem_page_writable make_page_writable
++#endif
++
++static inline pte_t * __init early_ioremap_pte(unsigned long addr)
++{
++ return &bm_pte[pte_index(addr)];
++}
++
++void __init early_ioremap_init(void)
++{
++ pmd_t *pmd;
++
++ if (early_ioremap_debug)
++ printk(KERN_INFO "early_ioremap_init()\n");
++
++ pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
++ memset(bm_pte, 0, sizeof(bm_pte));
++ make_lowmem_page_readonly(bm_pte, XENFEAT_writable_page_tables);
++ pmd_populate_kernel(&init_mm, pmd, bm_pte);
++
++ /*
++ * The boot-ioremap range spans multiple pmds, for which
++ * we are not prepared:
++ */
++ if (pmd != early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END))) {
++ WARN_ON(1);
++ printk(KERN_WARNING "pmd %p != %p\n",
++ pmd, early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END)));
++ printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_BEGIN): %08lx\n",
++ fix_to_virt(FIX_BTMAP_BEGIN));
++ printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_END): %08lx\n",
++ fix_to_virt(FIX_BTMAP_END));
++
++ printk(KERN_WARNING "FIX_BTMAP_END: %d\n", FIX_BTMAP_END);
++ printk(KERN_WARNING "FIX_BTMAP_BEGIN: %d\n",
++ FIX_BTMAP_BEGIN);
++ }
++}
++
++#ifdef CONFIG_X86_32
++void __init early_ioremap_clear(void)
++{
++ pmd_t *pmd;
++
++ if (early_ioremap_debug)
++ printk(KERN_INFO "early_ioremap_clear()\n");
++
++ pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
++ pmd_clear(pmd);
++ make_lowmem_page_writable(bm_pte, XENFEAT_writable_page_tables);
++ /* paravirt_release_pt(__pa(bm_pte) >> PAGE_SHIFT); */
++ __flush_tlb_all();
++}
++
++void __init early_ioremap_reset(void)
++{
++ enum fixed_addresses idx;
++ unsigned long addr, phys;
++ pte_t *pte;
++
++ after_paging_init = 1;
++ for (idx = FIX_BTMAP_BEGIN; idx >= FIX_BTMAP_END; idx--) {
++ addr = fix_to_virt(idx);
++ pte = early_ioremap_pte(addr);
++ if (pte_present(*pte)) {
++ phys = __pte_val(*pte) & PAGE_MASK;
++ set_fixmap(idx, phys);
++ }
++ }
++}
++#endif /* CONFIG_X86_32 */
++
++static void __init __early_set_fixmap(enum fixed_addresses idx,
++ unsigned long phys, pgprot_t flags)
++{
++ unsigned long addr = __fix_to_virt(idx);
++ pte_t *pte;
++
++ if (idx >= __end_of_fixed_addresses) {
++ BUG();
++ return;
++ }
++ pte = early_ioremap_pte(addr);
++ if (pgprot_val(flags))
++ set_pte(pte, pfn_pte_ma(phys >> PAGE_SHIFT, flags));
++ else
++ pte_clear(NULL, addr, pte);
++ __flush_tlb_one(addr);
++}
++
++static inline void __init early_set_fixmap(enum fixed_addresses idx,
++ unsigned long phys)
++{
++ if (after_paging_init)
++ set_fixmap(idx, phys);
++ else
++ __early_set_fixmap(idx, phys, PAGE_KERNEL);
++}
++
++static inline void __init early_clear_fixmap(enum fixed_addresses idx)
++{
++ if (after_paging_init)
++ clear_fixmap(idx);
++ else
++ __early_set_fixmap(idx, 0, __pgprot(0));
++}
++
++
++int __initdata early_ioremap_nested;
++
++static int __init check_early_ioremap_leak(void)
++{
++ if (!early_ioremap_nested)
++ return 0;
++
++ printk(KERN_WARNING
++ "Debug warning: early ioremap leak of %d areas detected.\n",
++ early_ioremap_nested);
++ printk(KERN_WARNING
++ "please boot with early_ioremap_debug and report the dmesg.\n");
++ WARN_ON(1);
++
++ return 1;
++}
++late_initcall(check_early_ioremap_leak);
++
++void __init *early_ioremap(unsigned long phys_addr, unsigned long size)
++{
++ unsigned long offset, last_addr;
++ unsigned int nrpages, nesting;
++ enum fixed_addresses idx0, idx;
++
++ WARN_ON(system_state != SYSTEM_BOOTING);
++
++ nesting = early_ioremap_nested;
++ if (early_ioremap_debug) {
++ printk(KERN_INFO "early_ioremap(%08lx, %08lx) [%d] => ",
++ phys_addr, size, nesting);
++ dump_stack();
++ }
++
++ /* Don't allow wraparound or zero size */
++ last_addr = phys_addr + size - 1;
++ if (!size || last_addr < phys_addr) {
++ WARN_ON(1);
++ return NULL;
++ }
++
++ if (nesting >= FIX_BTMAPS_NESTING) {
++ WARN_ON(1);
++ return NULL;
++ }
++ early_ioremap_nested++;
++ /*
++ * Mappings have to be page-aligned
++ */
++ offset = phys_addr & ~PAGE_MASK;
++ phys_addr &= PAGE_MASK;
++ size = PAGE_ALIGN(last_addr) - phys_addr;
++
++ /*
++ * Mappings have to fit in the FIX_BTMAP area.
++ */
++ nrpages = size >> PAGE_SHIFT;
++ if (nrpages > NR_FIX_BTMAPS) {
++ WARN_ON(1);
++ return NULL;
++ }
++
++ /*
++ * Ok, go for it..
++ */
++ idx0 = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*nesting;
++ idx = idx0;
++ while (nrpages > 0) {
++ early_set_fixmap(idx, phys_addr);
++ phys_addr += PAGE_SIZE;
++ --idx;
++ --nrpages;
++ }
++ if (early_ioremap_debug)
++ printk(KERN_CONT "%08lx + %08lx\n", offset, fix_to_virt(idx0));
++
++ return (void *) (offset + fix_to_virt(idx0));
++}
++
++void __init early_iounmap(void *addr, unsigned long size)
++{
++ unsigned long virt_addr;
++ unsigned long offset;
++ unsigned int nrpages;
++ enum fixed_addresses idx;
++ unsigned int nesting;
++
++ nesting = --early_ioremap_nested;
++ WARN_ON(nesting < 0);
++
++ if (early_ioremap_debug) {
++ printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr,
++ size, nesting);
++ dump_stack();
++ }
++
++ virt_addr = (unsigned long)addr;
++ if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)) {
++ WARN_ON(1);
++ return;
++ }
++ offset = virt_addr & ~PAGE_MASK;
++ nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT;
++
++ idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*nesting;
++ while (nrpages > 0) {
++ early_clear_fixmap(idx);
++ --idx;
++ --nrpages;
++ }
++}
++
++void __this_fixmap_does_not_exist(void)
++{
++ WARN_ON(1);
++}
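
The new ioremap-xen.c above also unifies the boot-time mapper:
early_ioremap() hands out fixmap slots from FIX_BTMAP_BEGIN, supports up
to FIX_BTMAPS_NESTING stacked mappings, and check_early_ioremap_leak()
warns at late_initcall time if a mapping was never undone. A minimal
usage sketch, assuming a caller-supplied physical address and length
(scan_boot_table() is hypothetical):

	static int __init scan_boot_table(unsigned long phys, unsigned long len)
	{
		void *va = early_ioremap(phys, len);	/* boot-time only */

		if (!va)
			return -ENOMEM;
		/* ... parse the firmware table through 'va' here ... */
		early_iounmap(va, len);	/* must balance the map exactly */
		return 0;
	}
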
+--- sle11-2009-06-29.orig/arch/x86/mm/ioremap_32-xen.c 2009-02-16 16:17:21.000000000 +0100
++++ /dev/null 1970-01-01 00:00:00.000000000 +0000
+@@ -1,445 +0,0 @@
+-/*
+- * arch/i386/mm/ioremap.c
+- *
+- * Re-map IO memory to kernel address space so that we can access it.
+- * This is needed for high PCI addresses that aren't mapped in the
+- * 640k-1MB IO memory area on PC's
+- *
+- * (C) Copyright 1995 1996 Linus Torvalds
+- */
+-
+-#include <linux/vmalloc.h>
+-#include <linux/init.h>
+-#include <linux/slab.h>
+-#include <linux/module.h>
+-#include <linux/io.h>
+-#include <linux/sched.h>
+-#include <asm/fixmap.h>
+-#include <asm/cacheflush.h>
+-#include <asm/tlbflush.h>
+-#include <asm/pgtable.h>
+-#include <asm/pgalloc.h>
+-
+-#define ISA_START_ADDRESS 0x0
+-#define ISA_END_ADDRESS 0x100000
+-
+-static int direct_remap_area_pte_fn(pte_t *pte,
+- struct page *pmd_page,
+- unsigned long address,
+- void *data)
+-{
+- mmu_update_t **v = (mmu_update_t **)data;
+-
+- BUG_ON(!pte_none(*pte));
+-
+- (*v)->ptr = ((u64)pfn_to_mfn(page_to_pfn(pmd_page)) <<
+- PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK);
+- (*v)++;
+-
+- return 0;
+-}
+-
+-static int __direct_remap_pfn_range(struct mm_struct *mm,
+- unsigned long address,
+- unsigned long mfn,
+- unsigned long size,
+- pgprot_t prot,
+- domid_t domid)
+-{
+- int rc;
+- unsigned long i, start_address;
+- mmu_update_t *u, *v, *w;
+-
+- u = v = w = (mmu_update_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
+- if (u == NULL)
+- return -ENOMEM;
+-
+- start_address = address;
+-
+- flush_cache_all();
+-
+- for (i = 0; i < size; i += PAGE_SIZE) {
+- if ((v - u) == (PAGE_SIZE / sizeof(mmu_update_t))) {
+- /* Flush a full batch after filling in the PTE ptrs. */
+- rc = apply_to_page_range(mm, start_address,
+- address - start_address,
+- direct_remap_area_pte_fn, &w);
+- if (rc)
+- goto out;
+- rc = -EFAULT;
+- if (HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0)
+- goto out;
+- v = w = u;
+- start_address = address;
+- }
+-
+- /*
+- * Fill in the machine address: PTE ptr is done later by
+- * apply_to_page_range().
+- */
+- v->val = __pte_val(pfn_pte_ma(mfn, prot)) | _PAGE_IO;
+-
+- mfn++;
+- address += PAGE_SIZE;
+- v++;
+- }
+-
+- if (v != u) {
+- /* Final batch. */
+- rc = apply_to_page_range(mm, start_address,
+- address - start_address,
+- direct_remap_area_pte_fn, &w);
+- if (rc)
+- goto out;
+- rc = -EFAULT;
+- if (unlikely(HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0))
+- goto out;
+- }
+-
+- rc = 0;
+-
+- out:
+- flush_tlb_all();
+-
+- free_page((unsigned long)u);
+-
+- return rc;
+-}
+-
+-int direct_remap_pfn_range(struct vm_area_struct *vma,
+- unsigned long address,
+- unsigned long mfn,
+- unsigned long size,
+- pgprot_t prot,
+- domid_t domid)
+-{
+- if (xen_feature(XENFEAT_auto_translated_physmap))
+- return remap_pfn_range(vma, address, mfn, size, prot);
+-
+- if (domid == DOMID_SELF)
+- return -EINVAL;
+-
+- vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
+-
+- vma->vm_mm->context.has_foreign_mappings = 1;
+-
+- return __direct_remap_pfn_range(
+- vma->vm_mm, address, mfn, size, prot, domid);
+-}
+-EXPORT_SYMBOL(direct_remap_pfn_range);
+-
+-int direct_kernel_remap_pfn_range(unsigned long address,
+- unsigned long mfn,
+- unsigned long size,
+- pgprot_t prot,
+- domid_t domid)
+-{
+- return __direct_remap_pfn_range(
+- &init_mm, address, mfn, size, prot, domid);
+-}
+-EXPORT_SYMBOL(direct_kernel_remap_pfn_range);
+-
+-static int lookup_pte_fn(
+- pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
+-{
+- uint64_t *ptep = (uint64_t *)data;
+- if (ptep)
+- *ptep = ((uint64_t)pfn_to_mfn(page_to_pfn(pmd_page)) <<
+- PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK);
+- return 0;
+-}
+-
+-int create_lookup_pte_addr(struct mm_struct *mm,
+- unsigned long address,
+- uint64_t *ptep)
+-{
+- return apply_to_page_range(mm, address, PAGE_SIZE,
+- lookup_pte_fn, ptep);
+-}
+-
+-EXPORT_SYMBOL(create_lookup_pte_addr);
+-
+-static int noop_fn(
+- pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
+-{
+- return 0;
+-}
+-
+-int touch_pte_range(struct mm_struct *mm,
+- unsigned long address,
+- unsigned long size)
+-{
+- return apply_to_page_range(mm, address, size, noop_fn, NULL);
+-}
+-
+-EXPORT_SYMBOL(touch_pte_range);
+-
+-/*
+- * Does @address reside within a non-highmem page that is local to this virtual
+- * machine (i.e., not an I/O page, nor a memory page belonging to another VM).
+- * See the comment that accompanies mfn_to_local_pfn() in page.h to understand
+- * why this works.
+- */
+-static inline int is_local_lowmem(unsigned long address)
+-{
+- extern unsigned long max_low_pfn;
+- return (mfn_to_local_pfn(address >> PAGE_SHIFT) < max_low_pfn);
+-}
+-
+-/*
+- * Generic mapping function (not visible outside):
+- */
+-
+-/*
+- * Remap an arbitrary physical address space into the kernel virtual
+- * address space. Needed when the kernel wants to access high addresses
+- * directly.
+- *
+- * NOTE! We need to allow non-page-aligned mappings too: we will obviously
+- * have to convert them into an offset in a page-aligned mapping, but the
+- * caller shouldn't need to know that small detail.
+- */
+-void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags)
+-{
+- void __iomem * addr;
+- struct vm_struct * area;
+- unsigned long offset, last_addr;
+- pgprot_t prot;
+- domid_t domid = DOMID_IO;
+-
+- /* Don't allow wraparound or zero size */
+- last_addr = phys_addr + size - 1;
+- if (!size || last_addr < phys_addr)
+- return NULL;
+-
+- /*
+- * Don't remap the low PCI/ISA area, it's always mapped..
+- */
+- if (is_initial_xendomain() &&
+- phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
+- return (void __iomem *) isa_bus_to_virt(phys_addr);
+-
+- /*
+- * Don't allow anybody to remap normal RAM that we're using..
+- */
+- if (is_local_lowmem(phys_addr)) {
+- char *t_addr, *t_end;
+- struct page *page;
+-
+- t_addr = bus_to_virt(phys_addr);
+- t_end = t_addr + (size - 1);
+-
+- for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++)
+- if(!PageReserved(page))
+- return NULL;
+-
+- domid = DOMID_SELF;
+- }
+-
+- prot = __pgprot(_KERNPG_TABLE | flags);
+-
+- /*
+- * Mappings have to be page-aligned
+- */
+- offset = phys_addr & ~PAGE_MASK;
+- phys_addr &= PAGE_MASK;
+- size = PAGE_ALIGN(last_addr+1) - phys_addr;
+-
+- /*
+- * Ok, go for it..
+- */
+- area = get_vm_area(size, VM_IOREMAP | (flags << 20));
+- if (!area)
+- return NULL;
+- area->phys_addr = phys_addr;
+- addr = (void __iomem *) area->addr;
+- if (__direct_remap_pfn_range(&init_mm, (unsigned long)addr,
+- phys_addr>>PAGE_SHIFT,
+- size, prot, domid)) {
+- vunmap((void __force *) addr);
+- return NULL;
+- }
+- return (void __iomem *) (offset + (char __iomem *)addr);
+-}
+-EXPORT_SYMBOL(__ioremap);
+-
+-/**
+- * ioremap_nocache - map bus memory into CPU space
+- * @offset: bus address of the memory
+- * @size: size of the resource to map
+- *
+- * ioremap_nocache performs a platform specific sequence of operations to
+- * make bus memory CPU accessible via the readb/readw/readl/writeb/
+- * writew/writel functions and the other mmio helpers. The returned
+- * address is not guaranteed to be usable directly as a virtual
+- * address.
+- *
+- * This version of ioremap ensures that the memory is marked uncachable
+- * on the CPU as well as honouring existing caching rules from things like
+- * the PCI bus. Note that there are other caches and buffers on many
+- * busses. In particular driver authors should read up on PCI writes
+- *
+- * It's useful if some control registers are in such an area and
+- * write combining or read caching is not desirable:
+- *
+- * Must be freed with iounmap.
+- */
+-
+-void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size)
+-{
+- unsigned long last_addr;
+- void __iomem *p = __ioremap(phys_addr, size, _PAGE_PCD);
+- if (!p)
+- return p;
+-
+- /* Guaranteed to be > phys_addr, as per __ioremap() */
+- last_addr = phys_addr + size - 1;
+-
+- if (is_local_lowmem(last_addr)) {
+- struct page *ppage = virt_to_page(bus_to_virt(phys_addr));
+- unsigned long npages;
+-
+- phys_addr &= PAGE_MASK;
+-
+- /* This might overflow and become zero.. */
+- last_addr = PAGE_ALIGN(last_addr);
+-
+- /* .. but that's ok, because modulo-2**n arithmetic will make
+- * the page-aligned "last - first" come out right.
+- */
+- npages = (last_addr - phys_addr) >> PAGE_SHIFT;
+-
+- if (change_page_attr(ppage, npages, PAGE_KERNEL_NOCACHE) < 0) {
+- iounmap(p);
+- p = NULL;
+- }
+- global_flush_tlb();
+- }
+-
+- return p;
+-}
+-EXPORT_SYMBOL(ioremap_nocache);
+-
+-/**
+- * iounmap - Free a IO remapping
+- * @addr: virtual address from ioremap_*
+- *
+- * Caller must ensure there is only one unmapping for the same pointer.
+- */
+-void iounmap(volatile void __iomem *addr)
+-{
+- struct vm_struct *p, *o;
+-
+- if ((void __force *)addr <= high_memory)
+- return;
+-
+- /*
+- * __ioremap special-cases the PCI/ISA range by not instantiating a
+- * vm_area and by simply returning an address into the kernel mapping
+- * of ISA space. So handle that here.
+- */
+- if ((unsigned long) addr >= fix_to_virt(FIX_ISAMAP_BEGIN))
+- return;
+-
+- addr = (volatile void __iomem *)(PAGE_MASK & (unsigned long __force)addr);
+-
+- /* Use the vm area unlocked, assuming the caller
+- ensures there isn't another iounmap for the same address
+- in parallel. Reuse of the virtual address is prevented by
+- leaving it in the global lists until we're done with it.
+- cpa takes care of the direct mappings. */
+- read_lock(&vmlist_lock);
+- for (p = vmlist; p; p = p->next) {
+- if (p->addr == addr)
+- break;
+- }
+- read_unlock(&vmlist_lock);
+-
+- if (!p) {
+- printk("iounmap: bad address %p\n", addr);
+- dump_stack();
+- return;
+- }
+-
+- /* Reset the direct mapping. Can block */
+- if ((p->flags >> 20) && is_local_lowmem(p->phys_addr)) {
+- change_page_attr(virt_to_page(bus_to_virt(p->phys_addr)),
+- get_vm_area_size(p) >> PAGE_SHIFT,
+- PAGE_KERNEL);
+- global_flush_tlb();
+- }
+-
+- /* Finally remove it */
+- o = remove_vm_area((void *)addr);
+- BUG_ON(p != o || o == NULL);
+- kfree(p);
+-}
+-EXPORT_SYMBOL(iounmap);
+-
+-void __init *bt_ioremap(unsigned long phys_addr, unsigned long size)
+-{
+- unsigned long offset, last_addr;
+- unsigned int nrpages;
+- enum fixed_addresses idx;
+-
+- /* Don't allow wraparound or zero size */
+- last_addr = phys_addr + size - 1;
+- if (!size || last_addr < phys_addr)
+- return NULL;
+-
+- /*
+- * Don't remap the low PCI/ISA area; it's always mapped.
+- */
+- if (is_initial_xendomain() &&
+- phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
+- return isa_bus_to_virt(phys_addr);
+-
+- /*
+- * Mappings have to be page-aligned
+- */
+- offset = phys_addr & ~PAGE_MASK;
+- phys_addr &= PAGE_MASK;
+- size = PAGE_ALIGN(last_addr) - phys_addr;
+-
+- /*
+- * Mappings have to fit in the FIX_BTMAP area.
+- */
+- nrpages = size >> PAGE_SHIFT;
+- if (nrpages > NR_FIX_BTMAPS)
+- return NULL;
+-
+- /*
+- * Ok, go for it..
+- */
+- idx = FIX_BTMAP_BEGIN;
+- while (nrpages > 0) {
+- set_fixmap(idx, phys_addr);
+- phys_addr += PAGE_SIZE;
+- --idx;
+- --nrpages;
+- }
+- return (void*) (offset + fix_to_virt(FIX_BTMAP_BEGIN));
+-}
+-
+-void __init bt_iounmap(void *addr, unsigned long size)
+-{
+- unsigned long virt_addr;
+- unsigned long offset;
+- unsigned int nrpages;
+- enum fixed_addresses idx;
+-
+- virt_addr = (unsigned long)addr;
+- if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN))
+- return;
+- if (virt_addr >= fix_to_virt(FIX_ISAMAP_BEGIN))
+- return;
+- offset = virt_addr & ~PAGE_MASK;
+- nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT;
+-
+- idx = FIX_BTMAP_BEGIN;
+- while (nrpages > 0) {
+- clear_fixmap(idx);
+- --idx;
+- --nrpages;
+- }
+-}
+--- /dev/null 1970-01-01 00:00:00.000000000 +0000
++++ sle11-2009-06-29/arch/x86/mm/pageattr-xen.c 2009-03-16 16:37:14.000000000 +0100
+@@ -0,0 +1,1413 @@
++/*
++ * Copyright 2002 Andi Kleen, SuSE Labs.
++ * Thanks to Ben LaHaise for precious feedback.
++ */
++#include <linux/highmem.h>
++#include <linux/bootmem.h>
++#include <linux/module.h>
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/mm.h>
++#include <linux/interrupt.h>
++
++#include <asm/e820.h>
++#include <asm/processor.h>
++#include <asm/tlbflush.h>
++#include <asm/sections.h>
++#include <asm/uaccess.h>
++#include <asm/pgalloc.h>
++#include <asm/proto.h>
++#include <asm/mmu_context.h>
++
++#ifndef CONFIG_X86_64
++#define TASK_SIZE64 TASK_SIZE
++#endif
++
++static void _pin_lock(struct mm_struct *mm, int lock) {
++ if (lock)
++ spin_lock(&mm->page_table_lock);
++#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
++ /* While mm->page_table_lock protects us against insertions and
++ * removals of higher level page table pages, it doesn't protect
++ * against updates of pte-s. Such updates, however, require the
++ * pte pages to be in consistent state (unpinned+writable or
++ * pinned+readonly). The pinning and attribute changes, however,
++ * cannot be done atomically, which is why such updates must be
++ * prevented from happening concurrently.
++ * Note that no pte lock can ever elsewhere be acquired nesting
++ * with an already acquired one in the same mm, or with the mm's
++ * page_table_lock already acquired, as that would break in the
++ * non-split case (where all these are actually resolving to the
++ * one page_table_lock). Thus acquiring all of them here is not
++ * going to result in deadlocks, and the order of acquires
++ * doesn't matter.
++ */
++ {
++ pgd_t *pgd = mm->pgd;
++ unsigned g;
++
++ for (g = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
++ pud_t *pud;
++ unsigned u;
++
++ if (pgd_none(*pgd))
++ continue;
++ pud = pud_offset(pgd, 0);
++ for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
++ pmd_t *pmd;
++ unsigned m;
++
++ if (pud_none(*pud))
++ continue;
++ pmd = pmd_offset(pud, 0);
++ for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
++ spinlock_t *ptl;
++
++ if (pmd_none(*pmd))
++ continue;
++ ptl = pte_lockptr(0, pmd);
++ if (lock)
++ spin_lock(ptl);
++ else
++ spin_unlock(ptl);
++ }
++ }
++ }
++ }
++#endif
++ if (!lock)
++ spin_unlock(&mm->page_table_lock);
++}
++#define pin_lock(mm) _pin_lock(mm, 1)
++#define pin_unlock(mm) _pin_lock(mm, 0)
++
++#define PIN_BATCH sizeof(void *)
++static DEFINE_PER_CPU(multicall_entry_t[PIN_BATCH], pb_mcl);
++
++static inline unsigned int pgd_walk_set_prot(struct page *page, pgprot_t flags,
++ unsigned int cpu, unsigned int seq)
++{
++ unsigned long pfn = page_to_pfn(page);
++
++ if (PageHighMem(page)) {
++ if (pgprot_val(flags) & _PAGE_RW)
++ ClearPagePinned(page);
++ else
++ SetPagePinned(page);
++ } else {
++ MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
++ (unsigned long)__va(pfn << PAGE_SHIFT),
++ pfn_pte(pfn, flags), 0);
++ if (unlikely(++seq == PIN_BATCH)) {
++ if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
++ PIN_BATCH, NULL)))
++ BUG();
++ seq = 0;
++ }
++ }
++
++ return seq;
++}
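++
++/*
++ * A rough illustration of the batching above: with PIN_BATCH ==
++ * sizeof(void *), i.e. 8 entries on 64-bit, every 8th low-memory page
++ * table page flushes one batched multicall, so a walk over N such
++ * pages issues roughly N/8 hypercalls instead of N.
++ */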
++
++static void pgd_walk(pgd_t *pgd_base, pgprot_t flags)
++{
++ pgd_t *pgd = pgd_base;
++ pud_t *pud;
++ pmd_t *pmd;
++ int g,u,m;
++ unsigned int cpu, seq;
++ multicall_entry_t *mcl;
++
++ if (xen_feature(XENFEAT_auto_translated_physmap))
++ return;
++
++ cpu = get_cpu();
++
++ /*
++ * Cannot iterate up to USER_PTRS_PER_PGD on x86-64 as these pagetables
++ * may not be the 'current' task's pagetables (e.g., current may be
++ * 32-bit, but the pagetables may be for a 64-bit task).
++ * Subtracting 1 from TASK_SIZE64 means the loop limit is correct
++ * regardless of whether TASK_SIZE64 is a multiple of PGDIR_SIZE.
++ */
++ for (g = 0, seq = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
++ if (pgd_none(*pgd))
++ continue;
++ pud = pud_offset(pgd, 0);
++ if (PTRS_PER_PUD > 1) /* not folded */
++ seq = pgd_walk_set_prot(virt_to_page(pud),flags,cpu,seq);
++ for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
++ if (pud_none(*pud))
++ continue;
++ pmd = pmd_offset(pud, 0);
++ if (PTRS_PER_PMD > 1) /* not folded */
++ seq = pgd_walk_set_prot(virt_to_page(pmd),flags,cpu,seq);
++ for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
++ if (pmd_none(*pmd))
++ continue;
++ seq = pgd_walk_set_prot(pmd_page(*pmd),flags,cpu,seq);
++ }
++ }
++ }
++
++ mcl = per_cpu(pb_mcl, cpu);
++#ifdef CONFIG_X86_64
++ if (unlikely(seq > PIN_BATCH - 2)) {
++ if (unlikely(HYPERVISOR_multicall_check(mcl, seq, NULL)))
++ BUG();
++ seq = 0;
++ }
++ MULTI_update_va_mapping(mcl + seq,
++ (unsigned long)__user_pgd(pgd_base),
++ pfn_pte(virt_to_phys(__user_pgd(pgd_base))>>PAGE_SHIFT, flags),
++ 0);
++ MULTI_update_va_mapping(mcl + seq + 1,
++ (unsigned long)pgd_base,
++ pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
++ UVMF_TLB_FLUSH);
++ if (unlikely(HYPERVISOR_multicall_check(mcl, seq + 2, NULL)))
++ BUG();
++#else
++ if (likely(seq != 0)) {
++ MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
++ (unsigned long)pgd_base,
++ pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
++ UVMF_TLB_FLUSH);
++ if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
++ seq + 1, NULL)))
++ BUG();
++ } else if(HYPERVISOR_update_va_mapping((unsigned long)pgd_base,
++ pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
++ UVMF_TLB_FLUSH))
++ BUG();
++#endif
++
++ put_cpu();
++}
++
++static void __pgd_pin(pgd_t *pgd)
++{
++ pgd_walk(pgd, PAGE_KERNEL_RO);
++ kmap_flush_unused();
++ xen_pgd_pin(__pa(pgd)); /* kernel */
++#ifdef CONFIG_X86_64
++ xen_pgd_pin(__pa(__user_pgd(pgd))); /* user */
++#endif
++ SetPagePinned(virt_to_page(pgd));
++}
++
++static void __pgd_unpin(pgd_t *pgd)
++{
++ xen_pgd_unpin(__pa(pgd));
++#ifdef CONFIG_X86_64
++ xen_pgd_unpin(__pa(__user_pgd(pgd)));
++#endif
++ pgd_walk(pgd, PAGE_KERNEL);
++ ClearPagePinned(virt_to_page(pgd));
++}
++
++void pgd_test_and_unpin(pgd_t *pgd)
++{
++ if (PagePinned(virt_to_page(pgd)))
++ __pgd_unpin(pgd);
++}
++
++void mm_pin(struct mm_struct *mm)
++{
++ if (xen_feature(XENFEAT_writable_page_tables))
++ return;
++
++ pin_lock(mm);
++ __pgd_pin(mm->pgd);
++ pin_unlock(mm);
++}
++
++void mm_unpin(struct mm_struct *mm)
++{
++ if (xen_feature(XENFEAT_writable_page_tables))
++ return;
++
++ pin_lock(mm);
++ __pgd_unpin(mm->pgd);
++ pin_unlock(mm);
++}
++
++void mm_pin_all(void)
++{
++ struct page *page;
++ unsigned long flags;
++
++ if (xen_feature(XENFEAT_writable_page_tables))
++ return;
++
++ /*
++ * Allow uninterrupted access to the pgd_list. Also protects
++ * __pgd_pin() by disabling preemption.
++ * All other CPUs must be at a safe point (e.g., in stop_machine
++ * or offlined entirely).
++ */
++ spin_lock_irqsave(&pgd_lock, flags);
++ list_for_each_entry(page, &pgd_list, lru) {
++ if (!PagePinned(page))
++ __pgd_pin((pgd_t *)page_address(page));
++ }
++ spin_unlock_irqrestore(&pgd_lock, flags);
++}
++
++void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
++{
++ if (!PagePinned(virt_to_page(mm->pgd)))
++ mm_pin(mm);
++}
++
++void arch_exit_mmap(struct mm_struct *mm)
++{
++ struct task_struct *tsk = current;
++
++ task_lock(tsk);
++
++ /*
++ * We aggressively remove the defunct pgd from cr3. We execute unmap_vmas()
++ * *much* faster this way, as doing no TLB flushes means bigger wrpt batches.
++ */
++ if (tsk->active_mm == mm) {
++ tsk->active_mm = &init_mm;
++ atomic_inc(&init_mm.mm_count);
++
++ switch_mm(mm, &init_mm, tsk);
++
++ atomic_dec(&mm->mm_count);
++ BUG_ON(atomic_read(&mm->mm_count) == 0);
++ }
++
++ task_unlock(tsk);
++
++ if (PagePinned(virt_to_page(mm->pgd))
++ && atomic_read(&mm->mm_count) == 1
++ && !mm->context.has_foreign_mappings)
++ mm_unpin(mm);
++}
++
++static void _pte_free(struct page *page, unsigned int order)
++{
++ BUG_ON(order);
++ __pte_free(page);
++}
++
++pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
++{
++ struct page *pte;
++
++#ifdef CONFIG_HIGHPTE
++ pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
++#else
++ pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
++#endif
++ if (pte) {
++ pgtable_page_ctor(pte);
++ SetPageForeign(pte, _pte_free);
++ init_page_count(pte);
++ }
++ return pte;
++}
++
++void __pte_free(pgtable_t pte)
++{
++ if (!PageHighMem(pte)) {
++ unsigned long va = (unsigned long)page_address(pte);
++ unsigned int level;
++ pte_t *ptep = lookup_address(va, &level);
++
++ BUG_ON(!ptep || level != PG_LEVEL_4K || !pte_present(*ptep));
++ if (!pte_write(*ptep)
++ && HYPERVISOR_update_va_mapping(va,
++ mk_pte(pte, PAGE_KERNEL),
++ 0))
++ BUG();
++ } else
++#ifdef CONFIG_HIGHPTE
++ ClearPagePinned(pte);
++#else
++ BUG();
++#endif
++
++ ClearPageForeign(pte);
++ init_page_count(pte);
++ pgtable_page_dtor(pte);
++ __free_page(pte);
++}
++
++#if PAGETABLE_LEVELS >= 3
++static void _pmd_free(struct page *page, unsigned int order)
++{
++ BUG_ON(order);
++ __pmd_free(page);
++}
++
++pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address)
++{
++ struct page *pmd;
++
++ pmd = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
++ if (!pmd)
++ return NULL;
++ SetPageForeign(pmd, _pmd_free);
++ init_page_count(pmd);
++ return page_address(pmd);
++}
++
++void __pmd_free(pgtable_t pmd)
++{
++ unsigned long va = (unsigned long)page_address(pmd);
++ unsigned int level;
++ pte_t *ptep = lookup_address(va, &level);
++
++ BUG_ON(!ptep || level != PG_LEVEL_4K || !pte_present(*ptep));
++ if (!pte_write(*ptep)
++ && HYPERVISOR_update_va_mapping(va, mk_pte(pmd, PAGE_KERNEL), 0))
++ BUG();
++
++ ClearPageForeign(pmd);
++ init_page_count(pmd);
++ __free_page(pmd);
++}
++#endif
++
++/* blktap and gntdev need this, as otherwise they would implicitly (and
++ * needlessly, as they never use it) reference init_mm. */
++pte_t xen_ptep_get_and_clear_full(struct vm_area_struct *vma,
++ unsigned long addr, pte_t *ptep, int full)
++{
++ return ptep_get_and_clear_full(vma->vm_mm, addr, ptep, full);
++}
++EXPORT_SYMBOL_GPL(xen_ptep_get_and_clear_full);
++
++/*
++ * The current flushing context - we pass it instead of 5 arguments:
++ */
++struct cpa_data {
++ unsigned long vaddr;
++ pgprot_t mask_set;
++ pgprot_t mask_clr;
++ int numpages;
++ int flushtlb;
++ unsigned long pfn;
++};
++
++#ifdef CONFIG_X86_64
++
++static inline unsigned long highmap_start_pfn(void)
++{
++ return __pa(_text) >> PAGE_SHIFT;
++}
++
++static inline unsigned long highmap_end_pfn(void)
++{
++ return __pa(round_up((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT;
++}
++
++#endif
++
++#ifdef CONFIG_DEBUG_PAGEALLOC
++# define debug_pagealloc 1
++#else
++# define debug_pagealloc 0
++#endif
++
++static inline int
++within(unsigned long addr, unsigned long start, unsigned long end)
++{
++ return addr >= start && addr < end;
++}
++
++/*
++ * Flushing functions
++ */
++
++/**
++ * clflush_cache_range - flush a cache range with clflush
++ * @addr: virtual start address
++ * @size: number of bytes to flush
++ *
++ * clflush is an unordered instruction which needs fencing with mfence
++ * to avoid ordering issues.
++ */
++void clflush_cache_range(void *vaddr, unsigned int size)
++{
++ void *vend = vaddr + size - 1;
++
++ mb();
++
++ for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size)
++ clflush(vaddr);
++ /*
++ * Flush any possible final partial cacheline:
++ */
++ clflush(vend);
++
++ mb();
++}
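++
++/*
++ * A minimal usage sketch (hypothetical caller, not taken from this
++ * patch): after changing the memory type of one page, flush exactly
++ * that page's cachelines:
++ *
++ *	clflush_cache_range(page_address(page), PAGE_SIZE);
++ */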
++
++static void __cpa_flush_all(void *arg)
++{
++ unsigned long cache = (unsigned long)arg;
++
++ /*
++ * Flush all to work around Errata in early athlons regarding
++ * large page flushing.
++ */
++ __flush_tlb_all();
++
++ if (cache && boot_cpu_data.x86_model >= 4)
++ wbinvd();
++}
++
++static void cpa_flush_all(unsigned long cache)
++{
++ BUG_ON(irqs_disabled());
++
++ on_each_cpu(__cpa_flush_all, (void *) cache, 1, 1);
++}
++
++static void __cpa_flush_range(void *arg)
++{
++ /*
++ * We could optimize that further and do individual per-page
++ * TLB invalidates for a low number of pages. Caveat: we must
++ * flush the high aliases on 64-bit as well.
++ */
++ __flush_tlb_all();
++}
++
++static void cpa_flush_range(unsigned long start, int numpages, int cache)
++{
++ unsigned int i, level;
++ unsigned long addr;
++
++ BUG_ON(irqs_disabled());
++ WARN_ON(PAGE_ALIGN(start) != start);
++
++ on_each_cpu(__cpa_flush_range, NULL, 1, 1);
++
++ if (!cache)
++ return;
++
++ /*
++ * We only need to flush on one CPU;
++ * clflush is a MESI-coherent instruction that
++ * will cause all other CPUs to flush the same
++ * cachelines:
++ */
++ for (i = 0, addr = start; i < numpages; i++, addr += PAGE_SIZE) {
++ pte_t *pte = lookup_address(addr, &level);
++
++ /*
++ * Only flush present addresses:
++ */
++ if (pte && (__pte_val(*pte) & _PAGE_PRESENT))
++ clflush_cache_range((void *) addr, PAGE_SIZE);
++ }
++}
++
++/*
++ * Certain areas of memory on x86 require very specific protection flags,
++ * for example the BIOS area or kernel text. Callers don't always get this
++ * right (again, ioremap() on BIOS memory is not uncommon), so this function
++ * checks and fixes these known static required protection bits.
++ */
++static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
++ unsigned long pfn)
++{
++ pgprot_t forbidden = __pgprot(0);
++
++#ifndef CONFIG_XEN
++ /*
++ * The BIOS area between 640k and 1Mb needs to be executable for
++ * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support.
++ */
++ if (within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
++ pgprot_val(forbidden) |= _PAGE_NX;
++#endif
++
++ /*
++ * The kernel text needs to be executable for obvious reasons.
++ * This does not cover __inittext since that is gone later on. On
++ * 64-bit we do not enforce !NX on the low mapping.
++ */
++ if (within(address, (unsigned long)_text, (unsigned long)_etext))
++ pgprot_val(forbidden) |= _PAGE_NX;
++
++ /*
++ * The .rodata section needs to be read-only. Using the pfn
++ * catches all aliases.
++ */
++ if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT,
++ __pa((unsigned long)__end_rodata) >> PAGE_SHIFT))
++ pgprot_val(forbidden) |= _PAGE_RW;
++
++ prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
++
++ return prot;
++}
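++
++/*
++ * Worked example of the fixups above: a hypothetical request to make
++ * a .rodata page writable, static_protections(PAGE_KERNEL, addr, pfn)
++ * with pfn inside __start_rodata..__end_rodata, comes back with
++ * _PAGE_RW stripped, so the page stays read-only no matter what the
++ * caller asked for.
++ */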
++
++/*
++ * Lookup the page table entry for a virtual address. Return a pointer
++ * to the entry and the level of the mapping.
++ *
++ * Note: We return pud and pmd either when the entry is marked large
++ * or when the present bit is not set. Otherwise we would return a
++ * pointer to a nonexistent mapping.
++ */
++pte_t *lookup_address(unsigned long address, unsigned int *level)
++{
++ pgd_t *pgd = pgd_offset_k(address);
++ pud_t *pud;
++ pmd_t *pmd;
++
++ *level = PG_LEVEL_NONE;
++
++ if (pgd_none(*pgd))
++ return NULL;
++
++ pud = pud_offset(pgd, address);
++ if (pud_none(*pud))
++ return NULL;
++
++ *level = PG_LEVEL_1G;
++ if (pud_large(*pud) || !pud_present(*pud))
++ return (pte_t *)pud;
++
++ pmd = pmd_offset(pud, address);
++ if (pmd_none(*pmd))
++ return NULL;
++
++ *level = PG_LEVEL_2M;
++ if (pmd_large(*pmd) || !pmd_present(*pmd))
++ return (pte_t *)pmd;
++
++ *level = PG_LEVEL_4K;
++
++ return pte_offset_kernel(pmd, address);
++}
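++
++/*
++ * A minimal usage sketch (hypothetical caller): find the PTE backing
++ * a kernel virtual address and check whether a 2M mapping covers it:
++ *
++ *	unsigned int level;
++ *	pte_t *pte = lookup_address(addr, &level);
++ *
++ *	if (pte && level == PG_LEVEL_2M)
++ *		...;	(addr is covered by a large page)
++ */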
++
++/*
++ * Set the new pmd in all the pgds we know about:
++ */
++static void __set_pmd_pte(pte_t *kpte, unsigned long address,
++ unsigned int level, pte_t pte)
++{
++ /* change init_mm */
++ switch(level) {
++ case PG_LEVEL_2M:
++ xen_l2_entry_update((pmd_t *)kpte, __pmd_ma(__pte_val(pte)));
++ break;
++#ifdef CONFIG_X86_64
++ case PG_LEVEL_1G:
++ xen_l3_entry_update((pud_t *)kpte, __pud_ma(__pte_val(pte)));
++ break;
++#endif
++ default:
++ BUG();
++ }
++#ifdef CONFIG_X86_32
++ if (!SHARED_KERNEL_PMD) {
++ struct page *page;
++
++ list_for_each_entry(page, &pgd_list, lru) {
++ pgd_t *pgd;
++ pud_t *pud;
++ pmd_t *pmd;
++
++ pgd = (pgd_t *)page_address(page) + pgd_index(address);
++ pud = pud_offset(pgd, address);
++ pmd = pmd_offset(pud, address);
++ xen_l2_entry_update(pmd, __pmd_ma(__pte_val(pte)));
++ }
++ }
++#endif
++}
++
++static int
++try_preserve_large_page(pte_t *kpte, unsigned long address,
++ struct cpa_data *cpa)
++{
++ unsigned long nextpage_addr, numpages, pmask, psize, flags, addr, pfn;
++ pte_t new_pte, old_pte, *tmp;
++ pgprot_t old_prot, new_prot;
++ int i, do_split = 1;
++ unsigned int level;
++
++ spin_lock_irqsave(&pgd_lock, flags);
++ /*
++ * Check for races; another CPU might have split this page
++ * up already:
++ */
++ tmp = lookup_address(address, &level);
++ if (tmp != kpte)
++ goto out_unlock;
++
++ switch (level) {
++ case PG_LEVEL_2M:
++ psize = PMD_PAGE_SIZE;
++ pmask = PMD_PAGE_MASK;
++ break;
++#ifdef CONFIG_X86_64
++ case PG_LEVEL_1G:
++ psize = PUD_PAGE_SIZE;
++ pmask = PUD_PAGE_MASK;
++ break;
++#endif
++ default:
++ do_split = -EINVAL;
++ goto out_unlock;
++ }
++
++ /*
++ * Calculate the number of pages that fit into this large
++ * page, starting at address:
++ */
++ nextpage_addr = (address + psize) & pmask;
++ numpages = (nextpage_addr - address) >> PAGE_SHIFT;
++ if (numpages < cpa->numpages)
++ cpa->numpages = numpages;
++
++ /*
++ * We are safe now. Check whether the new pgprot is the same:
++ */
++ old_pte = *kpte;
++ old_prot = new_prot = pte_pgprot(old_pte);
++
++ pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
++ pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
++
++ /*
++ * old_pte points to the large page base address. So we need
++ * to add the offset of the virtual address:
++ */
++ pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT);
++ cpa->pfn = pfn;
++
++ new_prot = static_protections(new_prot, address, pfn);
++
++ /*
++ * We need to check the full range to see whether
++ * static_protections() requires a different pgprot for one of
++ * the pages in the range we try to preserve:
++ */
++ if (pfn < max_mapnr) {
++ addr = address + PAGE_SIZE;
++ for (i = 1; i < cpa->numpages && ++pfn < max_mapnr;
++ i++, addr += PAGE_SIZE) {
++ pgprot_t chk_prot = static_protections(new_prot, addr, pfn);
++
++ if (pgprot_val(chk_prot) != pgprot_val(new_prot))
++ goto out_unlock;
++ }
++ }
++
++ /*
++ * If there are no changes, return. cpa->numpages has been
++ * updated above:
++ */
++ if (pgprot_val(new_prot) == pgprot_val(old_prot)) {
++ do_split = 0;
++ goto out_unlock;
++ }
++
++ /*
++ * We need to change the attributes. Check whether we can
++ * change the large page in one go. We request a split when
++ * the address is not aligned or the number of pages is
++ * smaller than the number of pages in the large page. Note
++ * that we limited the number of possible pages already to
++ * the number of pages in the large page.
++ */
++ if (address == (nextpage_addr - psize) && cpa->numpages == numpages) {
++ /*
++ * The address is aligned and the number of pages
++ * covers the full page.
++ */
++ new_pte = pfn_pte_ma(__pte_mfn(old_pte), canon_pgprot(new_prot));
++ __set_pmd_pte(kpte, address, level, new_pte);
++ cpa->flushtlb = 1;
++ do_split = 0;
++ }
++
++out_unlock:
++ spin_unlock_irqrestore(&pgd_lock, flags);
++
++ return do_split;
++}
++
++static LIST_HEAD(page_pool);
++static unsigned long pool_size, pool_pages, pool_low;
++static unsigned long pool_used, pool_failed;
++
++static void cpa_fill_pool(struct page **ret)
++{
++ gfp_t gfp = GFP_KERNEL;
++ unsigned long flags;
++ struct page *p;
++
++ /*
++ * Avoid recursion (on debug-pagealloc) and also signal
++ * our priority to get to these pagetables:
++ */
++ if (current->flags & PF_MEMALLOC)
++ return;
++ current->flags |= PF_MEMALLOC;
++
++ /*
++ * Allocate atomically from atomic contexts:
++ */
++ if (in_atomic() || irqs_disabled() || debug_pagealloc)
++ gfp = GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN;
++
++ while (pool_pages < pool_size || (ret && !*ret)) {
++ p = alloc_pages(gfp, 0);
++ if (!p) {
++ pool_failed++;
++ break;
++ }
++ /*
++ * If the call site needs a page right now, provide it:
++ */
++ if (ret && !*ret) {
++ *ret = p;
++ continue;
++ }
++ spin_lock_irqsave(&pgd_lock, flags);
++ list_add(&p->lru, &page_pool);
++ pool_pages++;
++ spin_unlock_irqrestore(&pgd_lock, flags);
++ }
++
++ current->flags &= ~PF_MEMALLOC;
++}
++
++#define SHIFT_MB (20 - PAGE_SHIFT)
++#define ROUND_MB_GB ((1 << 10) - 1)
++#define SHIFT_MB_GB 10
++#define POOL_PAGES_PER_GB 16
++
++void __init cpa_init(void)
++{
++ struct sysinfo si;
++ unsigned long gb;
++
++ si_meminfo(&si);
++ /*
++ * Calculate the number of pool pages:
++ *
++ * Convert totalram (nr of pages) to MiB and round to the next
++ * GiB. Shift MiB to GiB and multiply the result by
++ * POOL_PAGES_PER_GB:
++ */
++ if (debug_pagealloc) {
++ gb = ((si.totalram >> SHIFT_MB) + ROUND_MB_GB) >> SHIFT_MB_GB;
++ pool_size = POOL_PAGES_PER_GB * gb;
++ } else {
++ pool_size = 1;
++ }
++ pool_low = pool_size;
++
++ cpa_fill_pool(NULL);
++ printk(KERN_DEBUG
++ "CPA: page pool initialized %lu of %lu pages preallocated\n",
++ pool_pages, pool_size);
++}
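++
++/*
++ * Worked example of the sizing above, assuming debug_pagealloc, 4k
++ * pages and 4 GiB of RAM (si.totalram == 1048576):
++ *
++ *	1048576 >> SHIFT_MB			= 4096 MiB
++ *	(4096 + ROUND_MB_GB) >> SHIFT_MB_GB	= 4 GiB
++ *	pool_size = POOL_PAGES_PER_GB * 4	= 64 pages
++ */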
++
++static int split_large_page(pte_t *kpte, unsigned long address)
++{
++ unsigned long flags, mfn, mfninc = 1;
++ unsigned int i, level;
++ pte_t *pbase, *tmp;
++ pgprot_t ref_prot;
++ struct page *base;
++
++ /*
++ * Get a page from the pool. The pool list is protected by the
++ * pgd_lock, which we have to take anyway for the split
++ * operation:
++ */
++ spin_lock_irqsave(&pgd_lock, flags);
++ if (list_empty(&page_pool)) {
++ spin_unlock_irqrestore(&pgd_lock, flags);
++ base = NULL;
++ cpa_fill_pool(&base);
++ if (!base)
++ return -ENOMEM;
++ spin_lock_irqsave(&pgd_lock, flags);
++ } else {
++ base = list_first_entry(&page_pool, struct page, lru);
++ list_del(&base->lru);
++ pool_pages--;
++
++ if (pool_pages < pool_low)
++ pool_low = pool_pages;
++ }
++
++ /*
++ * Check for races; another CPU might have split this page
++ * up for us already:
++ */
++ tmp = lookup_address(address, &level);
++ if (tmp != kpte)
++ goto out_unlock;
++
++ pbase = (pte_t *)page_address(base);
++#ifdef CONFIG_X86_32
++ paravirt_alloc_pt(&init_mm, page_to_pfn(base));
++#endif
++ ref_prot = pte_pgprot(pte_clrhuge(*kpte));
++
++#ifdef CONFIG_X86_64
++ if (level == PG_LEVEL_1G) {
++ mfninc = PMD_PAGE_SIZE >> PAGE_SHIFT;
++ pgprot_val(ref_prot) |= _PAGE_PSE;
++ }
++#endif
++
++ /*
++ * Get the target mfn from the original entry:
++ */
++ mfn = __pte_mfn(*kpte);
++ for (i = 0; i < PTRS_PER_PTE; i++, mfn += mfninc)
++ set_pte(&pbase[i], pfn_pte_ma(mfn, ref_prot));
++
++ /*
++ * Install the new, split-up pagetable. Important details here:
++ *
++ * On Intel the NX bit of all levels must be cleared to make a
++ * page executable. See section 4.13.2 of the Intel 64 and IA-32
++ * Architectures Software Developer's Manual.
++ *
++ * Mark the entry present. The current mapping might be
++ * set to not present, which we preserved above.
++ */
++ if (!xen_feature(XENFEAT_writable_page_tables) &&
++ HYPERVISOR_update_va_mapping((unsigned long)pbase,
++ mk_pte(base, PAGE_KERNEL_RO), 0))
++ BUG();
++ ref_prot = pte_pgprot(pte_mkexec(pte_clrhuge(*kpte)));
++ pgprot_val(ref_prot) |= _PAGE_PRESENT;
++ __set_pmd_pte(kpte, address, level, mk_pte(base, ref_prot));
++ base = NULL;
++
++out_unlock:
++ /*
++ * If we dropped out via the lookup_address check under
++ * pgd_lock then stick the page back into the pool:
++ */
++ if (base) {
++ list_add(&base->lru, &page_pool);
++ pool_pages++;
++ } else
++ pool_used++;
++ spin_unlock_irqrestore(&pgd_lock, flags);
++
++ return 0;
++}
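++
++/*
++ * Illustration of the mfn stepping above: splitting a 2M page yields
++ * PTRS_PER_PTE (512) 4k entries with mfninc == 1, while splitting a
++ * 1G page yields 512 2M entries, so mfninc == PMD_PAGE_SIZE >>
++ * PAGE_SHIFT == 512 and _PAGE_PSE stays set in ref_prot.
++ */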
++
++static int __change_page_attr(struct cpa_data *cpa, int primary)
++{
++ unsigned long address = cpa->vaddr;
++ int do_split, err;
++ unsigned int level;
++ pte_t *kpte, old_pte;
++
++repeat:
++ kpte = lookup_address(address, &level);
++ if (!kpte)
++ return primary ? -EINVAL : 0;
++
++ old_pte = *kpte;
++ if (!__pte_val(old_pte)) {
++ if (!primary)
++ return 0;
++ printk(KERN_WARNING "CPA: called for zero pte. "
++ "vaddr = %lx cpa->vaddr = %lx\n", address,
++ cpa->vaddr);
++ WARN_ON(1);
++ return -EINVAL;
++ }
++
++ if (level == PG_LEVEL_4K) {
++ pte_t new_pte;
++ pgprot_t new_prot = pte_pgprot(old_pte);
++ unsigned long mfn = __pte_mfn(old_pte);
++
++ pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
++ pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
++
++ new_prot = static_protections(new_prot, address,
++ mfn_to_local_pfn(mfn));
++
++ /*
++ * We need to keep the mfn from the existing PTE;
++ * after all, we're only going to change its attributes,
++ * not the memory it points to.
++ */
++ new_pte = pfn_pte_ma(mfn, canon_pgprot(new_prot));
++ cpa->pfn = mfn_to_local_pfn(mfn);
++ /*
++ * Do we really change anything?
++ */
++ if (__pte_val(old_pte) != __pte_val(new_pte)) {
++ set_pte_atomic(kpte, new_pte);
++ cpa->flushtlb = 1;
++ }
++ cpa->numpages = 1;
++ return 0;
++ }
++
++ /*
++ * Check, whether we can keep the large page intact
++ * and just change the pte:
++ */
++ do_split = try_preserve_large_page(kpte, address, cpa);
++ /*
++ * When the range fits into the existing large page,
++ * return. cpa->numpages and cpa->flushtlb have been updated in
++ * try_preserve_large_page():
++ */
++ if (do_split <= 0)
++ return do_split;
++
++ /*
++ * We have to split the large page:
++ */
++ err = split_large_page(kpte, address);
++ if (!err) {
++ cpa->flushtlb = 1;
++ goto repeat;
++ }
++
++ return err;
++}
++
++static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias);
++
++static int cpa_process_alias(struct cpa_data *cpa)
++{
++ struct cpa_data alias_cpa;
++ int ret = 0;
++
++ if (cpa->pfn > max_pfn_mapped)
++ return 0;
++
++ /*
++ * No need to redo when the primary call touched the direct
++ * mapping already:
++ */
++ if (!within(cpa->vaddr, PAGE_OFFSET,
++ PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) {
++
++ alias_cpa = *cpa;
++ alias_cpa.vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);
++
++ ret = __change_page_attr_set_clr(&alias_cpa, 0);
++ }
++
++#ifdef CONFIG_X86_64
++ if (ret)
++ return ret;
++ /*
++ * No need to redo when the primary call touched the high
++ * mapping already:
++ */
++ if (within(cpa->vaddr, (unsigned long) _text, (unsigned long) _end))
++ return 0;
++
++ /*
++ * If the physical address is inside the kernel map, we need
++ * to touch the high mapped kernel as well:
++ */
++ if (!within(cpa->pfn, highmap_start_pfn(), highmap_end_pfn()))
++ return 0;
++
++ alias_cpa = *cpa;
++ alias_cpa.vaddr =
++ (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map;
++
++ /*
++ * The high mapping range is imprecise, so ignore the return value.
++ */
++ __change_page_attr_set_clr(&alias_cpa, 0);
++#endif
++ return ret;
++}
++
++static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
++{
++ int ret, numpages = cpa->numpages;
++
++ while (numpages) {
++ /*
++ * Store the remaining nr of pages for the large page
++ * preservation check.
++ */
++ cpa->numpages = numpages;
++
++ ret = __change_page_attr(cpa, checkalias);
++ if (ret)
++ return ret;
++
++ if (checkalias) {
++ ret = cpa_process_alias(cpa);
++ if (ret)
++ return ret;
++ }
++
++ /*
++ * Adjust the number of pages with the result of the
++ * CPA operation. Either a large page has been
++ * preserved or a single page update happened.
++ */
++ BUG_ON(cpa->numpages > numpages);
++ numpages -= cpa->numpages;
++ cpa->vaddr += cpa->numpages * PAGE_SIZE;
++ }
++ return 0;
++}
++
++static inline int cache_attr(pgprot_t attr)
++{
++ return pgprot_val(attr) &
++ (_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD);
++}
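++
++/*
++ * Illustration: cache_attr() is nonzero exactly when a caching-
++ * relevant bit is being set. set_memory_uc() below sets _PAGE_PCD and
++ * therefore needs a cache flush, while set_memory_nx() only touches
++ * _PAGE_NX and gets away with a TLB flush alone.
++ */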
++
++static int change_page_attr_set_clr(unsigned long addr, int numpages,
++ pgprot_t mask_set, pgprot_t mask_clr)
++{
++ struct cpa_data cpa;
++ int ret, cache, checkalias;
++
++ /*
++ * Check if we are requested to change an unsupported
++ * feature:
++ */
++ mask_set = canon_pgprot(mask_set);
++ mask_clr = canon_pgprot(mask_clr);
++ if (!pgprot_val(mask_set) && !pgprot_val(mask_clr))
++ return 0;
++
++ /* Ensure we are PAGE_SIZE aligned */
++ if (addr & ~PAGE_MASK) {
++ addr &= PAGE_MASK;
++ /*
++ * People should not be passing in unaligned addresses:
++ */
++ WARN_ON_ONCE(1);
++ }
++
++ cpa.vaddr = addr;
++ cpa.numpages = numpages;
++ cpa.mask_set = mask_set;
++ cpa.mask_clr = mask_clr;
++ cpa.flushtlb = 0;
++
++ /* No alias checking for _NX bit modifications */
++ checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
++
++ ret = __change_page_attr_set_clr(&cpa, checkalias);
++
++ /*
++ * Check whether we really changed something:
++ */
++ if (!cpa.flushtlb)
++ goto out;
++
++ /*
++ * No need to flush when we did not set any of the caching
++ * attributes:
++ */
++ cache = cache_attr(mask_set);
++
++ /*
++ * On success we use clflush, when the CPU supports it, to
++ * avoid wbinvd. If the CPU does not support clflush, and in
++ * the error case, we fall back to cpa_flush_all() (which uses
++ * wbinvd):
++ */
++ if (!ret && cpu_has_clflush)
++ cpa_flush_range(addr, numpages, cache);
++ else
++ cpa_flush_all(cache);
++
++out:
++ cpa_fill_pool(NULL);
++
++ return ret;
++}
++
++static inline int change_page_attr_set(unsigned long addr, int numpages,
++ pgprot_t mask)
++{
++ return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0));
++}
++
++static inline int change_page_attr_clear(unsigned long addr, int numpages,
++ pgprot_t mask)
++{
++ return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask);
++}
++
++int set_memory_uc(unsigned long addr, int numpages)
++{
++ return change_page_attr_set(addr, numpages,
++ __pgprot(_PAGE_PCD));
++}
++EXPORT_SYMBOL(set_memory_uc);
++
++int set_memory_wb(unsigned long addr, int numpages)
++{
++ return change_page_attr_clear(addr, numpages,
++ __pgprot(_PAGE_PCD | _PAGE_PWT));
++}
++EXPORT_SYMBOL(set_memory_wb);
++
++int set_memory_x(unsigned long addr, int numpages)
++{
++ return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_NX));
++}
++EXPORT_SYMBOL(set_memory_x);
++
++int set_memory_nx(unsigned long addr, int numpages)
++{
++ return change_page_attr_set(addr, numpages, __pgprot(_PAGE_NX));
++}
++EXPORT_SYMBOL(set_memory_nx);
++
++int set_memory_ro(unsigned long addr, int numpages)
++{
++ return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_RW));
++}
++
++int set_memory_rw(unsigned long addr, int numpages)
++{
++ return change_page_attr_set(addr, numpages, __pgprot(_PAGE_RW));
++}
++
++int set_memory_np(unsigned long addr, int numpages)
++{
++ return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_PRESENT));
++}
++
++int set_pages_uc(struct page *page, int numpages)
++{
++ unsigned long addr = (unsigned long)page_address(page);
++
++ return set_memory_uc(addr, numpages);
++}
++EXPORT_SYMBOL(set_pages_uc);
++
++int set_pages_wb(struct page *page, int numpages)
++{
++ unsigned long addr = (unsigned long)page_address(page);
++
++ return set_memory_wb(addr, numpages);
++}
++EXPORT_SYMBOL(set_pages_wb);
++
++int set_pages_x(struct page *page, int numpages)
++{
++ unsigned long addr = (unsigned long)page_address(page);
++
++ return set_memory_x(addr, numpages);
++}
++EXPORT_SYMBOL(set_pages_x);
++
++int set_pages_nx(struct page *page, int numpages)
++{
++ unsigned long addr = (unsigned long)page_address(page);
++
++ return set_memory_nx(addr, numpages);
++}
++EXPORT_SYMBOL(set_pages_nx);
++
++int set_pages_ro(struct page *page, int numpages)
++{
++ unsigned long addr = (unsigned long)page_address(page);
++
++ return set_memory_ro(addr, numpages);
++}
++
++int set_pages_rw(struct page *page, int numpages)
++{
++ unsigned long addr = (unsigned long)page_address(page);
++
++ return set_memory_rw(addr, numpages);
++}
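++
++/*
++ * Typical (hypothetical) use of the set_memory_* helpers above: mark
++ * a driver's descriptor ring uncached while the device owns it, and
++ * restore write-back caching on teardown:
++ *
++ *	set_memory_uc((unsigned long)ring, ring_pages);
++ *	...
++ *	set_memory_wb((unsigned long)ring, ring_pages);
++ */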
++
++#ifdef CONFIG_DEBUG_PAGEALLOC
++
++static int __set_pages_p(struct page *page, int numpages)
++{
++ struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page),
++ .numpages = numpages,
++ .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
++ .mask_clr = __pgprot(0)};
++
++ return __change_page_attr_set_clr(&cpa, 1);
++}
++
++static int __set_pages_np(struct page *page, int numpages)
++{
++ struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page),
++ .numpages = numpages,
++ .mask_set = __pgprot(0),
++ .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW)};
++
++ return __change_page_attr_set_clr(&cpa, 1);
++}
++
++void kernel_map_pages(struct page *page, int numpages, int enable)
++{
++ if (PageHighMem(page))
++ return;
++ if (!enable) {
++ debug_check_no_locks_freed(page_address(page),
++ numpages * PAGE_SIZE);
++ }
++
++ /*
++ * If the page allocator is not up yet, do not call c_p_a():
++ */
++ if (!debug_pagealloc_enabled)
++ return;
++
++ /*
++ * The return value is ignored as the calls cannot fail.
++ * Large pages are kept enabled at boot time, and are
++ * split up quickly with DEBUG_PAGEALLOC. If a split
++ * fails here (due to temporary memory shortage), no damage
++ * is done because we just keep the large page intact up
++ * to the next attempt, when it will likely be split up:
++ */
++ if (enable)
++ __set_pages_p(page, numpages);
++ else
++ __set_pages_np(page, numpages);
++
++ /*
++ * We should perform an IPI and flush all TLBs,
++ * but that can deadlock, so flush only the current CPU:
++ */
++ __flush_tlb_all();
++
++ /*
++ * Try to refill the page pool here. We can do this only after
++ * the tlb flush.
++ */
++ cpa_fill_pool(NULL);
++}
++
++#ifdef CONFIG_HIBERNATION
++
++bool kernel_page_present(struct page *page)
++{
++ unsigned int level;
++ pte_t *pte;
++
++ if (PageHighMem(page))
++ return false;
++
++ pte = lookup_address((unsigned long)page_address(page), &level);
++ return (__pte_val(*pte) & _PAGE_PRESENT);
++}
++
++#endif /* CONFIG_HIBERNATION */
++
++#endif /* CONFIG_DEBUG_PAGEALLOC */
++
++static inline int in_secondary_range(unsigned long va)
++{
++#ifdef CONFIG_X86_64
++ return va >= VMALLOC_START && va < VMALLOC_END;
++#else
++ return va >= (unsigned long)high_memory;
++#endif
++}
++
++static void __make_page_readonly(unsigned long va)
++{
++ pte_t *pte;
++ unsigned int level;
++
++ pte = lookup_address(va, &level);
++ BUG_ON(!pte || level != PG_LEVEL_4K);
++ if (HYPERVISOR_update_va_mapping(va, pte_wrprotect(*pte), 0))
++ BUG();
++ if (in_secondary_range(va)) {
++ unsigned long pfn = pte_pfn(*pte);
++
++#ifdef CONFIG_HIGHMEM
++ if (pfn >= highstart_pfn)
++ kmap_flush_unused(); /* flush stale writable kmaps */
++ else
++#endif
++ __make_page_readonly((unsigned long)__va(pfn << PAGE_SHIFT));
++ }
++}
++
++static void __make_page_writable(unsigned long va)
++{
++ pte_t *pte;
++ unsigned int level;
++
++ pte = lookup_address(va, &level);
++ BUG_ON(!pte || level != PG_LEVEL_4K);
++ if (HYPERVISOR_update_va_mapping(va, pte_mkwrite(*pte), 0))
++ BUG();
++ if (in_secondary_range(va)) {
++ unsigned long pfn = pte_pfn(*pte);
++
++#ifdef CONFIG_HIGHMEM
++ if (pfn < highstart_pfn)
++#endif
++ __make_page_writable((unsigned long)__va(pfn << PAGE_SHIFT));
++ }
++}
++
++void make_page_readonly(void *va, unsigned int feature)
++{
++ if (!xen_feature(feature))
++ __make_page_readonly((unsigned long)va);
++}
++
++void make_page_writable(void *va, unsigned int feature)
++{
++ if (!xen_feature(feature))
++ __make_page_writable((unsigned long)va);
++}
++
++void make_pages_readonly(void *va, unsigned int nr, unsigned int feature)
++{
++ unsigned long addr;
++
++ if (xen_feature(feature))
++ return;
++
++ for (addr = (unsigned long)va; nr--; addr += PAGE_SIZE)
++ __make_page_readonly(addr);
++}
++
++void make_pages_writable(void *va, unsigned int nr, unsigned int feature)
++{
++ unsigned long addr;
++
++ if (xen_feature(feature))
++ return;
++
++ for (addr = (unsigned long)va; nr--; addr += PAGE_SIZE)
++ __make_page_writable(addr);
++}
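++
++/*
++ * Usage sketch (hypothetical caller): before handing a freshly
++ * allocated page table page to the hypervisor, drop the kernel's
++ * writable mapping of it, unless the guest has writable page tables
++ * anyway:
++ *
++ *	make_page_readonly(page_address(page),
++ *			   XENFEAT_writable_page_tables);
++ */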
++
++/*
++ * The testcases use internal knowledge of the implementation that shouldn't
++ * be exposed to the rest of the kernel. Include them directly here.
++ */
++#ifdef CONFIG_CPA_DEBUG
++#include "pageattr-test.c"
++#endif
+--- sle11-2009-06-29.orig/arch/x86/mm/pageattr_64-xen.c 2009-02-16 16:18:36.000000000 +0100
++++ /dev/null 1970-01-01 00:00:00.000000000 +0000
+@@ -1,542 +0,0 @@
+-/*
+- * Copyright 2002 Andi Kleen, SuSE Labs.
+- * Thanks to Ben LaHaise for precious feedback.
+- */
+-
+-#include <linux/mm.h>
+-#include <linux/sched.h>
+-#include <linux/highmem.h>
+-#include <linux/module.h>
+-#include <linux/slab.h>
+-#include <asm/uaccess.h>
+-#include <asm/processor.h>
+-#include <asm/tlbflush.h>
+-#include <asm/io.h>
+-
+-#ifdef CONFIG_XEN
+-#include <asm/pgalloc.h>
+-#include <asm/mmu_context.h>
+-
+-static void _pin_lock(struct mm_struct *mm, int lock) {
+- if (lock)
+- spin_lock(&mm->page_table_lock);
+-#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+- /* While mm->page_table_lock protects us against insertions and
+- * removals of higher level page table pages, it doesn't protect
+- * against updates of pte-s. Such updates, however, require the
+- * pte pages to be in consistent state (unpinned+writable or
+- * pinned+readonly). The pinning and attribute changes, however,
+- * cannot be done atomically, which is why such updates must be
+- * prevented from happening concurrently.
+- * Note that no pte lock can ever elsewhere be acquired nesting
+- * with an already acquired one in the same mm, or with the mm's
+- * page_table_lock already acquired, as that would break in the
+- * non-split case (where all these are actually resolving to the
+- * one page_table_lock). Thus acquiring all of them here is not
+- * going to result in deadlocks, and the order of acquires
+- * doesn't matter.
+- */
+- {
+- pgd_t *pgd = mm->pgd;
+- unsigned g;
+-
+- for (g = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
+- pud_t *pud;
+- unsigned u;
+-
+- if (pgd_none(*pgd))
+- continue;
+- pud = pud_offset(pgd, 0);
+- for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
+- pmd_t *pmd;
+- unsigned m;
+-
+- if (pud_none(*pud))
+- continue;
+- pmd = pmd_offset(pud, 0);
+- for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
+- spinlock_t *ptl;
+-
+- if (pmd_none(*pmd))
+- continue;
+- ptl = pte_lockptr(0, pmd);
+- if (lock)
+- spin_lock(ptl);
+- else
+- spin_unlock(ptl);
+- }
+- }
+- }
+- }
+-#endif
+- if (!lock)
+- spin_unlock(&mm->page_table_lock);
+-}
+-#define pin_lock(mm) _pin_lock(mm, 1)
+-#define pin_unlock(mm) _pin_lock(mm, 0)
+-
+-#define PIN_BATCH 8
+-static DEFINE_PER_CPU(multicall_entry_t[PIN_BATCH], pb_mcl);
+-
+-static inline unsigned int pgd_walk_set_prot(void *pt, pgprot_t flags,
+- unsigned int cpu, unsigned int seq)
+-{
+- struct page *page = virt_to_page(pt);
+- unsigned long pfn = page_to_pfn(page);
+-
+- MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
+- (unsigned long)__va(pfn << PAGE_SHIFT),
+- pfn_pte(pfn, flags), 0);
+- if (unlikely(++seq == PIN_BATCH)) {
+- if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
+- PIN_BATCH, NULL)))
+- BUG();
+- seq = 0;
+- }
+-
+- return seq;
+-}
+-
+-static void pgd_walk(pgd_t *pgd_base, pgprot_t flags)
+-{
+- pgd_t *pgd = pgd_base;
+- pud_t *pud;
+- pmd_t *pmd;
+- pte_t *pte;
+- int g,u,m;
+- unsigned int cpu, seq;
+- multicall_entry_t *mcl;
+-
+- cpu = get_cpu();
+-
+- /*
+- * Cannot iterate up to USER_PTRS_PER_PGD as these pagetables may not
+- * be the 'current' task's pagetables (e.g., current may be 32-bit,
+- * but the pagetables may be for a 64-bit task).
+- * Subtracting 1 from TASK_SIZE64 means the loop limit is correct
+- * regardless of whether TASK_SIZE64 is a multiple of PGDIR_SIZE.
+- */
+- for (g = 0, seq = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
+- if (pgd_none(*pgd))
+- continue;
+- pud = pud_offset(pgd, 0);
+- if (PTRS_PER_PUD > 1) /* not folded */
+- seq = pgd_walk_set_prot(pud,flags,cpu,seq);
+- for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
+- if (pud_none(*pud))
+- continue;
+- pmd = pmd_offset(pud, 0);
+- if (PTRS_PER_PMD > 1) /* not folded */
+- seq = pgd_walk_set_prot(pmd,flags,cpu,seq);
+- for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
+- if (pmd_none(*pmd))
+- continue;
+- pte = pte_offset_kernel(pmd,0);
+- seq = pgd_walk_set_prot(pte,flags,cpu,seq);
+- }
+- }
+- }
+-
+- mcl = per_cpu(pb_mcl, cpu);
+- if (unlikely(seq > PIN_BATCH - 2)) {
+- if (unlikely(HYPERVISOR_multicall_check(mcl, seq, NULL)))
+- BUG();
+- seq = 0;
+- }
+- MULTI_update_va_mapping(mcl + seq,
+- (unsigned long)__user_pgd(pgd_base),
+- pfn_pte(virt_to_phys(__user_pgd(pgd_base))>>PAGE_SHIFT, flags),
+- 0);
+- MULTI_update_va_mapping(mcl + seq + 1,
+- (unsigned long)pgd_base,
+- pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
+- UVMF_TLB_FLUSH);
+- if (unlikely(HYPERVISOR_multicall_check(mcl, seq + 2, NULL)))
+- BUG();
+-
+- put_cpu();
+-}
+-
+-static void __pgd_pin(pgd_t *pgd)
+-{
+- pgd_walk(pgd, PAGE_KERNEL_RO);
+- xen_pgd_pin(__pa(pgd)); /* kernel */
+- xen_pgd_pin(__pa(__user_pgd(pgd))); /* user */
+- SetPagePinned(virt_to_page(pgd));
+-}
+-
+-static void __pgd_unpin(pgd_t *pgd)
+-{
+- xen_pgd_unpin(__pa(pgd));
+- xen_pgd_unpin(__pa(__user_pgd(pgd)));
+- pgd_walk(pgd, PAGE_KERNEL);
+- ClearPagePinned(virt_to_page(pgd));
+-}
+-
+-void pgd_test_and_unpin(pgd_t *pgd)
+-{
+- if (PagePinned(virt_to_page(pgd)))
+- __pgd_unpin(pgd);
+-}
+-
+-void mm_pin(struct mm_struct *mm)
+-{
+- if (xen_feature(XENFEAT_writable_page_tables))
+- return;
+-
+- pin_lock(mm);
+- __pgd_pin(mm->pgd);
+- pin_unlock(mm);
+-}
+-
+-void mm_unpin(struct mm_struct *mm)
+-{
+- if (xen_feature(XENFEAT_writable_page_tables))
+- return;
+-
+- pin_lock(mm);
+- __pgd_unpin(mm->pgd);
+- pin_unlock(mm);
+-}
+-
+-void mm_pin_all(void)
+-{
+- struct page *page;
+- unsigned long flags;
+-
+- if (xen_feature(XENFEAT_writable_page_tables))
+- return;
+-
+- /*
+- * Allow uninterrupted access to the pgd_list. Also protects
+- * __pgd_pin() by disabling preemption.
+- * All other CPUs must be at a safe point (e.g., in stop_machine
+- * or offlined entirely).
+- */
+- spin_lock_irqsave(&pgd_lock, flags);
+- list_for_each_entry(page, &pgd_list, lru) {
+- if (!PagePinned(page))
+- __pgd_pin((pgd_t *)page_address(page));
+- }
+- spin_unlock_irqrestore(&pgd_lock, flags);
+-}
+-
+-void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
+-{
+- if (!PagePinned(virt_to_page(mm->pgd)))
+- mm_pin(mm);
+-}
+-
+-void arch_exit_mmap(struct mm_struct *mm)
+-{
+- struct task_struct *tsk = current;
+-
+- task_lock(tsk);
+-
+- /*
+- * We aggressively remove the defunct pgd from cr3. We execute unmap_vmas()
+- * *much* faster this way, as doing no TLB flushes means bigger wrpt batches.
+- */
+- if (tsk->active_mm == mm) {
+- tsk->active_mm = &init_mm;
+- atomic_inc(&init_mm.mm_count);
+-
+- switch_mm(mm, &init_mm, tsk);
+-
+- atomic_dec(&mm->mm_count);
+- BUG_ON(atomic_read(&mm->mm_count) == 0);
+- }
+-
+- task_unlock(tsk);
+-
+- if (PagePinned(virt_to_page(mm->pgd))
+- && (atomic_read(&mm->mm_count) == 1)
+- && !mm->context.has_foreign_mappings)
+- mm_unpin(mm);
+-}
+-
+-static void _pte_free(struct page *page, unsigned int order)
+-{
+- BUG_ON(order);
+- pte_free(page);
+-}
+-
+-struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
+-{
+- struct page *pte;
+-
+- pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
+- if (pte) {
+- SetPageForeign(pte, _pte_free);
+- init_page_count(pte);
+- }
+- return pte;
+-}
+-
+-void pte_free(struct page *pte)
+-{
+- unsigned long va = (unsigned long)__va(page_to_pfn(pte)<<PAGE_SHIFT);
+-
+- if (!pte_write(*virt_to_ptep(va)))
+- if (HYPERVISOR_update_va_mapping(
+- va, pfn_pte(page_to_pfn(pte), PAGE_KERNEL), 0))
+- BUG();
+-
+- ClearPageForeign(pte);
+- init_page_count(pte);
+-
+- __free_page(pte);
+-}
+-#endif /* CONFIG_XEN */
+-
+-pte_t *lookup_address(unsigned long address)
+-{
+- pgd_t *pgd = pgd_offset_k(address);
+- pud_t *pud;
+- pmd_t *pmd;
+- pte_t *pte;
+- if (pgd_none(*pgd))
+- return NULL;
+- pud = pud_offset(pgd, address);
+- if (!pud_present(*pud))
+- return NULL;
+- pmd = pmd_offset(pud, address);
+- if (!pmd_present(*pmd))
+- return NULL;
+- if (pmd_large(*pmd))
+- return (pte_t *)pmd;
+- pte = pte_offset_kernel(pmd, address);
+- if (pte && !pte_present(*pte))
+- pte = NULL;
+- return pte;
+-}
+-
+-static struct page *split_large_page(unsigned long address, pgprot_t prot,
+- pgprot_t ref_prot)
+-{
+- int i;
+- unsigned long addr;
+- struct page *base = alloc_pages(GFP_KERNEL, 0);
+- pte_t *pbase;
+- if (!base)
+- return NULL;
+- /*
+- * page_private is used to track the number of entries in
+- * the page table page that have non-standard attributes.
+- */
+- SetPagePrivate(base);
+- page_private(base) = 0;
+-
+- address = __pa(address);
+- addr = address & LARGE_PAGE_MASK;
+- pbase = (pte_t *)page_address(base);
+- for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
+- pbase[i] = pfn_pte(addr >> PAGE_SHIFT,
+- addr == address ? prot : ref_prot);
+- }
+- return base;
+-}
+-
+-void clflush_cache_range(void *adr, int size)
+-{
+- int i;
+- for (i = 0; i < size; i += boot_cpu_data.x86_clflush_size)
+- clflush(adr+i);
+-}
+-
+-static void flush_kernel_map(void *arg)
+-{
+- struct list_head *l = (struct list_head *)arg;
+- struct page *pg;
+-
+- /* When clflush is available, always use it because it is
+- much cheaper than WBINVD. */
+- /* clflush is still broken. Disable for now. */
+- if (1 || !cpu_has_clflush)
+- asm volatile("wbinvd" ::: "memory");
+- else list_for_each_entry(pg, l, lru) {
+- void *adr = page_address(pg);
+- clflush_cache_range(adr, PAGE_SIZE);
+- }
+- __flush_tlb_all();
+-}
+-
+-static inline void flush_map(struct list_head *l)
+-{
+- on_each_cpu(flush_kernel_map, l, 1, 1);
+-}
+-
+-static LIST_HEAD(deferred_pages); /* protected by init_mm.mmap_sem */
+-
+-static inline void save_page(struct page *fpage)
+-{
+- if (!test_and_set_bit(PG_arch_1, &fpage->flags))
+- list_add(&fpage->lru, &deferred_pages);
+-}
+-
+-/*
+- * No more special protections in this 2/4MB area - revert to a
+- * large page again.
+- */
+-static void revert_page(unsigned long address, pgprot_t ref_prot)
+-{
+- pgd_t *pgd;
+- pud_t *pud;
+- pmd_t *pmd;
+- pte_t large_pte;
+- unsigned long pfn;
+-
+- pgd = pgd_offset_k(address);
+- BUG_ON(pgd_none(*pgd));
+- pud = pud_offset(pgd,address);
+- BUG_ON(pud_none(*pud));
+- pmd = pmd_offset(pud, address);
+- BUG_ON(__pmd_val(*pmd) & _PAGE_PSE);
+- pfn = (__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT;
+- large_pte = pfn_pte(pfn, ref_prot);
+- large_pte = pte_mkhuge(large_pte);
+- set_pte((pte_t *)pmd, large_pte);
+-}
+-
+-static int
+-__change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
+- pgprot_t ref_prot)
+-{
+- pte_t *kpte;
+- struct page *kpte_page;
+- pgprot_t ref_prot2;
+-
+- kpte = lookup_address(address);
+- if (!kpte) return 0;
+- kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK);
+- BUG_ON(PageLRU(kpte_page));
+- BUG_ON(PageCompound(kpte_page));
+- if (pgprot_val(prot) != pgprot_val(ref_prot)) {
+- if (!pte_huge(*kpte)) {
+- set_pte(kpte, pfn_pte(pfn, prot));
+- } else {
+- /*
+- * split_large_page will take the reference for this
+- * change_page_attr on the split page.
+- */
+- struct page *split;
+- ref_prot2 = pte_pgprot(pte_clrhuge(*kpte));
+- split = split_large_page(address, prot, ref_prot2);
+- if (!split)
+- return -ENOMEM;
+- pgprot_val(ref_prot2) &= ~_PAGE_NX;
+- set_pte(kpte, mk_pte(split, ref_prot2));
+- kpte_page = split;
+- }
+- page_private(kpte_page)++;
+- } else if (!pte_huge(*kpte)) {
+- set_pte(kpte, pfn_pte(pfn, ref_prot));
+- BUG_ON(page_private(kpte_page) == 0);
+- page_private(kpte_page)--;
+- } else
+- BUG();
+-
+- /* on x86-64 the direct mapping set at boot is not using 4k pages */
+- /*
+- * ..., but the XEN guest kernels (currently) do:
+- * If the pte was reserved, it means it was created at boot
+- * time (not via split_large_page) and in turn we must not
+- * replace it with a large page.
+- */
+-#ifndef CONFIG_XEN
+- BUG_ON(PageReserved(kpte_page));
+-#else
+- if (PageReserved(kpte_page))
+- return 0;
+-#endif
+-
+- save_page(kpte_page);
+- if (page_private(kpte_page) == 0)
+- revert_page(address, ref_prot);
+- return 0;
+-}
+-
+-/*
+- * Change the page attributes of a page in the linear mapping.
+- *
+- * This should be used when a page is mapped with a different caching policy
+- * than write-back somewhere - some CPUs do not like it when mappings with
+- * different caching policies exist. This changes the page attributes of the
+- * in-kernel linear mapping too.
+- *
+- * The caller needs to ensure that there are no conflicting mappings elsewhere.
+- * This function only deals with the kernel linear map.
+- *
+- * Caller must call global_flush_tlb() after this.
+- */
+-int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot)
+-{
+- int err = 0, kernel_map = 0;
+- int i;
+-
+- if (address >= __START_KERNEL_map
+- && address < __START_KERNEL_map + KERNEL_TEXT_SIZE) {
+- address = (unsigned long)__va(__pa(address));
+- kernel_map = 1;
+- }
+-
+- down_write(&init_mm.mmap_sem);
+- for (i = 0; i < numpages; i++, address += PAGE_SIZE) {
+- unsigned long pfn = __pa(address) >> PAGE_SHIFT;
+-
+- if (!kernel_map || pte_present(pfn_pte(0, prot))) {
+- err = __change_page_attr(address, pfn, prot, PAGE_KERNEL);
+- if (err)
+- break;
+- }
+- /* Handle kernel mapping too which aliases part of the
+- * lowmem */
+- if (__pa(address) < KERNEL_TEXT_SIZE) {
+- unsigned long addr2;
+- pgprot_t prot2;
+- addr2 = __START_KERNEL_map + __pa(address);
+- /* Make sure the kernel mappings stay executable */
+- prot2 = pte_pgprot(pte_mkexec(pfn_pte(0, prot)));
+- err = __change_page_attr(addr2, pfn, prot2,
+- PAGE_KERNEL_EXEC);
+- }
+- }
+- up_write(&init_mm.mmap_sem);
+- return err;
+-}
+-
+-/* Don't call this for MMIO areas that may not have a mem_map entry */
+-int change_page_attr(struct page *page, int numpages, pgprot_t prot)
+-{
+- unsigned long addr = (unsigned long)page_address(page);
+- return change_page_attr_addr(addr, numpages, prot);
+-}
+-
+-void global_flush_tlb(void)
+-{
+- struct page *pg, *next;
+- struct list_head l;
+-
+- /*
+- * Write-protect the semaphore, to exclude two contexts
+- * doing a list_replace_init() call in parallel and to
+- * exclude new additions to the deferred_pages list:
+- */
+- down_write(&init_mm.mmap_sem);
+- list_replace_init(&deferred_pages, &l);
+- up_write(&init_mm.mmap_sem);
+-
+- flush_map(&l);
+-
+- list_for_each_entry_safe(pg, next, &l, lru) {
+- list_del(&pg->lru);
+- clear_bit(PG_arch_1, &pg->flags);
+- if (page_private(pg) != 0)
+- continue;
+- ClearPagePrivate(pg);
+- __free_page(pg);
+- }
+-}
+-
+-EXPORT_SYMBOL(change_page_attr);
+-EXPORT_SYMBOL(global_flush_tlb);
+--- sle11-2009-06-29.orig/arch/x86/mm/pgtable_32-xen.c 2009-02-16 16:18:36.000000000 +0100
++++ sle11-2009-06-29/arch/x86/mm/pgtable_32-xen.c 2009-03-16 16:33:40.000000000 +0100
+@@ -29,8 +29,6 @@
+ #include <xen/features.h>
+ #include <asm/hypervisor.h>
+
+-static void pgd_test_and_unpin(pgd_t *pgd);
+-
+ void show_mem(void)
+ {
+ int total = 0, reserved = 0;
+@@ -167,53 +165,6 @@ pte_t *pte_alloc_one_kernel(struct mm_st
+ return pte;
+ }
+
+-static void _pte_free(struct page *page, unsigned int order)
+-{
+- BUG_ON(order);
+- pte_free(page);
+-}
+-
+-struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
+-{
+- struct page *pte;
+-
+-#ifdef CONFIG_HIGHPTE
+- pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
+-#else
+- pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
+-#endif
+- if (pte) {
+- SetPageForeign(pte, _pte_free);
+- init_page_count(pte);
+- }
+- return pte;
+-}
+-
+-void pte_free(struct page *pte)
+-{
+- unsigned long pfn = page_to_pfn(pte);
+-
+- if (!PageHighMem(pte)) {
+- unsigned long va = (unsigned long)__va(pfn << PAGE_SHIFT);
+-
+- if (!pte_write(*virt_to_ptep(va)))
+- if (HYPERVISOR_update_va_mapping(
+- va, pfn_pte(pfn, PAGE_KERNEL), 0))
+- BUG();
+- } else
+- ClearPagePinned(pte);
+-
+- ClearPageForeign(pte);
+- init_page_count(pte);
+-
+- __free_page(pte);
+-}
+-
+-void pmd_ctor(struct kmem_cache *cache, void *pmd)
+-{
+- memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
+-}
+-
+ /*
+ * List of all pgd's needed for non-PAE so it can invalidate entries
+ * in both cached and uncached pgd's; not needed for PAE since the
+@@ -224,224 +175,191 @@ void pmd_ctor(struct kmem_cache *cache,
+ * vmalloc faults work because attached pagetables are never freed.
+ * -- wli
+ */
+-DEFINE_SPINLOCK(pgd_lock);
+-struct page *pgd_list;
+-
+ static inline void pgd_list_add(pgd_t *pgd)
+ {
+ struct page *page = virt_to_page(pgd);
+- page->index = (unsigned long)pgd_list;
+- if (pgd_list)
+- set_page_private(pgd_list, (unsigned long)&page->index);
+- pgd_list = page;
+- set_page_private(page, (unsigned long)&pgd_list);
++
++ list_add(&page->lru, &pgd_list);
+ }
+
+ static inline void pgd_list_del(pgd_t *pgd)
+ {
+- struct page *next, **pprev, *page = virt_to_page(pgd);
+- next = (struct page *)page->index;
+- pprev = (struct page **)page_private(page);
+- *pprev = next;
+- if (next)
+- set_page_private(next, (unsigned long)pprev);
+-}
++ struct page *page = virt_to_page(pgd);
+
++ list_del(&page->lru);
++}
+
++#define UNSHARED_PTRS_PER_PGD \
++ (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD)
+
+-#if (PTRS_PER_PMD == 1)
+-/* Non-PAE pgd constructor */
+-static void pgd_ctor(void *pgd)
++static void pgd_ctor(void *p)
+ {
++ pgd_t *pgd = p;
+ unsigned long flags;
+
+- /* !PAE, no pagetable sharing */
++ pgd_test_and_unpin(pgd);
++
++ /* Clear usermode parts of PGD */
+ memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
+
+ spin_lock_irqsave(&pgd_lock, flags);
+
+- /* must happen under lock */
+- clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
+- swapper_pg_dir + USER_PTRS_PER_PGD,
+- KERNEL_PGD_PTRS);
+-
+- paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
+- __pa(swapper_pg_dir) >> PAGE_SHIFT,
+- USER_PTRS_PER_PGD,
+- KERNEL_PGD_PTRS);
+- pgd_list_add(pgd);
+- spin_unlock_irqrestore(&pgd_lock, flags);
+-}
+-#else /* PTRS_PER_PMD > 1 */
+-/* PAE pgd constructor */
+-static void pgd_ctor(void *pgd)
+-{
+- /* PAE, kernel PMD may be shared */
+-
+- if (SHARED_KERNEL_PMD) {
+- clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
++ /* If the pgd points to a shared pagetable level (either the
++ ptes in non-PAE, or shared PMD in PAE), then just copy the
++ references from swapper_pg_dir. */
++ if (PAGETABLE_LEVELS == 2 ||
++ (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD)) {
++ clone_pgd_range(pgd + USER_PTRS_PER_PGD,
+ swapper_pg_dir + USER_PTRS_PER_PGD,
+ KERNEL_PGD_PTRS);
+- } else {
+- memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
++ paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
++ __pa(swapper_pg_dir) >> PAGE_SHIFT,
++ USER_PTRS_PER_PGD,
++ KERNEL_PGD_PTRS);
+ }
++
++ /* list required to sync kernel mapping updates */
++ if (PAGETABLE_LEVELS == 2)
++ pgd_list_add(pgd);
++
++ spin_unlock_irqrestore(&pgd_lock, flags);
+ }
+-#endif /* PTRS_PER_PMD */
+
+ static void pgd_dtor(void *pgd)
+ {
+ unsigned long flags; /* can be called from interrupt context */
+
+- if (SHARED_KERNEL_PMD)
+- return;
+-
+- paravirt_release_pd(__pa(pgd) >> PAGE_SHIFT);
+- spin_lock_irqsave(&pgd_lock, flags);
+- pgd_list_del(pgd);
+- spin_unlock_irqrestore(&pgd_lock, flags);
++ if (!SHARED_KERNEL_PMD) {
++ spin_lock_irqsave(&pgd_lock, flags);
++ pgd_list_del(pgd);
++ spin_unlock_irqrestore(&pgd_lock, flags);
++ }
+
+ pgd_test_and_unpin(pgd);
+ }
+
+-#define UNSHARED_PTRS_PER_PGD \
+- (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD)
+-
+-/* If we allocate a pmd for part of the kernel address space, then
+- make sure its initialized with the appropriate kernel mappings.
+- Otherwise use a cached zeroed pmd. */
+-static pmd_t *pmd_cache_alloc(int idx)
++#ifdef CONFIG_X86_PAE
++/*
++ * Mop up any pmd pages which may still be attached to the pgd.
++ * Normally they will be freed by munmap/exit_mmap, but any pmd we
++ * preallocate which never got a corresponding vma will need to be
++ * freed manually.
++ */
++static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
+ {
+- pmd_t *pmd;
++ int i;
+
+- if (idx >= USER_PTRS_PER_PGD) {
+- pmd = (pmd_t *)__get_free_page(GFP_KERNEL);
++ for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) {
++ pgd_t pgd = pgdp[i];
+
+-#ifndef CONFIG_XEN
+- if (pmd)
+- memcpy(pmd,
+- (void *)pgd_page_vaddr(swapper_pg_dir[idx]),
+- sizeof(pmd_t) * PTRS_PER_PMD);
+-#endif
+- } else
+- pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
++ if (__pgd_val(pgd) != 0) {
++ pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
+
+- return pmd;
+-}
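++ /* Clear the pgd slot before freeing the pmd page it referenced. */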
++ pgdp[i] = xen_make_pgd(0);
+
+-static void pmd_cache_free(pmd_t *pmd, int idx)
+-{
+- if (idx >= USER_PTRS_PER_PGD) {
+- make_lowmem_page_writable(pmd, XENFEAT_writable_page_tables);
+- memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
+- free_page((unsigned long)pmd);
+- } else
+- kmem_cache_free(pmd_cache, pmd);
++ paravirt_release_pd(pgd_val(pgd) >> PAGE_SHIFT);
++ pmd_free(mm, pmd);
++ }
++ }
+ }
+
+-pgd_t *pgd_alloc(struct mm_struct *mm)
++/*
++ * In PAE mode, we need to do a cr3 reload (=tlb flush) when
++ * updating the top-level pagetable entries to guarantee the
++ * processor notices the update. Since this is expensive, and
++ * all 4 top-level entries are used almost immediately in a
++ * new process's life, we just pre-populate them here.
++ *
++ * Also, if we're in a paravirt environment where the kernel pmd is
++ * not shared between pagetables (!SHARED_KERNEL_PMD), we allocate
++ * and initialize the kernel pmds here.
++ */
++static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
+ {
++ pud_t *pud;
++ pmd_t *pmds[UNSHARED_PTRS_PER_PGD];
++ unsigned long addr, flags;
+ int i;
+- pgd_t *pgd = quicklist_alloc(0, GFP_KERNEL, pgd_ctor);
+- pmd_t **pmds = NULL;
+- unsigned long flags;
+-
+- pgd_test_and_unpin(pgd);
+-
+- if (PTRS_PER_PMD == 1 || !pgd)
+- return pgd;
+-
+-#ifdef CONFIG_XEN
+- if (!SHARED_KERNEL_PMD) {
+- /*
+- * We can race save/restore (if we sleep during a GFP_KERNEL memory
+- * allocation). We therefore store virtual addresses of pmds as they
+- * do not change across save/restore, and poke the machine addresses
+- * into the pgdir under the pgd_lock.
+- */
+- pmds = kmalloc(PTRS_PER_PGD * sizeof(pmd_t *), GFP_KERNEL);
+- if (!pmds) {
+- quicklist_free(0, pgd_dtor, pgd);
+- return NULL;
+- }
+- }
+-#endif
+
+- /* Allocate pmds, remember virtual addresses. */
+- for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) {
+- pmd_t *pmd = pmd_cache_alloc(i);
+-
+- if (!pmd)
++ /*
++ * We can race save/restore (if we sleep during a GFP_KERNEL memory
++ * allocation). We therefore store virtual addresses of pmds as they
++ * do not change across save/restore, and poke the machine addresses
++ * into the pgdir under the pgd_lock.
++ */
++ for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD; i++, addr += PUD_SIZE) {
++ pmds[i] = pmd_alloc_one(mm, addr);
++ if (!pmds[i])
+ goto out_oom;
+-
+- paravirt_alloc_pd(__pa(pmd) >> PAGE_SHIFT);
+- if (pmds)
+- pmds[i] = pmd;
+- else
+- set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
+ }
+
+-#ifdef CONFIG_XEN
+- if (SHARED_KERNEL_PMD)
+- return pgd;
+-
+ spin_lock_irqsave(&pgd_lock, flags);
+
+ /* Protect against save/restore: move below 4GB under pgd_lock. */
+- if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)) {
+- int rc = xen_create_contiguous_region(
+- (unsigned long)pgd, 0, 32);
+- if (rc) {
+- spin_unlock_irqrestore(&pgd_lock, flags);
+- goto out_oom;
+- }
++ if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)
++ && xen_create_contiguous_region((unsigned long)pgd, 0, 32)) {
++ spin_unlock_irqrestore(&pgd_lock, flags);
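++ /* also reached from the pmd_alloc_one() failure path above */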
++out_oom:
++ while (i--)
++ pmd_free(mm, pmds[i]);
++ return 0;
+ }
+
+ /* Copy kernel pmd contents and write-protect the new pmds. */
+- for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
+- memcpy(pmds[i],
+- (void *)pgd_page_vaddr(swapper_pg_dir[i]),
+- sizeof(pmd_t) * PTRS_PER_PMD);
+- make_lowmem_page_readonly(
+- pmds[i], XENFEAT_writable_page_tables);
+- }
++ pud = pud_offset(pgd, 0);
++ for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD;
++ i++, pud++, addr += PUD_SIZE) {
++ if (i >= USER_PTRS_PER_PGD) {
++ memcpy(pmds[i],
++ (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
++ sizeof(pmd_t) * PTRS_PER_PMD);
++ make_lowmem_page_readonly(
++ pmds[i], XENFEAT_writable_page_tables);
++ }
+
+- /* It is safe to poke machine addresses of pmds under the pmd_lock. */
+- for (i = 0; i < PTRS_PER_PGD; i++)
+- set_pgd(&pgd[i], __pgd(1 + __pa(pmds[i])));
++ /* It is safe to poke machine addresses of pmds under the pgd_lock. */
++ pud_populate(mm, pud, pmds[i]);
++ }
+
+- /* Ensure this pgd gets picked up and pinned on save/restore. */
++ /* List required to sync kernel mapping updates and
++ * to pin/unpin on save/restore. */
+ pgd_list_add(pgd);
+
+ spin_unlock_irqrestore(&pgd_lock, flags);
+
+- kfree(pmds);
+-#endif
++ return 1;
++}
++#else /* !CONFIG_X86_PAE */
++/* No need to prepopulate any pagetable entries in non-PAE modes. */
++static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
++{
++ return 1;
++}
+
+- return pgd;
++static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
++{
++}
++#endif /* CONFIG_X86_PAE */
+
+-out_oom:
+- if (!pmds) {
+- for (i--; i >= 0; i--) {
+- pgd_t pgdent = pgd[i];
+- void* pmd = (void *)__va(pgd_val(pgdent)-1);
+- paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
+- pmd_cache_free(pmd, i);
+- }
+- } else {
+- for (i--; i >= 0; i--) {
+- paravirt_release_pd(__pa(pmds[i]) >> PAGE_SHIFT);
+- pmd_cache_free(pmds[i], i);
+- }
+- kfree(pmds);
++pgd_t *pgd_alloc(struct mm_struct *mm)
++{
++ pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
++
++ /* so that alloc_pd can use it */
++ mm->pgd = pgd;
++ if (pgd)
++ pgd_ctor(pgd);
++
++ if (pgd && !pgd_prepopulate_pmd(mm, pgd)) {
++ free_page((unsigned long)pgd);
++ pgd = NULL;
+ }
+- quicklist_free(0, pgd_dtor, pgd);
+- return NULL;
++
++ return pgd;
+ }
+
+-void pgd_free(pgd_t *pgd)
++void pgd_free(struct mm_struct *mm, pgd_t *pgd)
+ {
+- int i;
+-
+ /*
+ * After this the pgd should not be pinned for the duration of this
+ * function's execution. We should never sleep and thus never race:
+@@ -450,39 +368,43 @@ void pgd_free(pgd_t *pgd)
+ * 2. The machine addresses in PGD entries will not become invalid
+ * due to a concurrent save/restore.
+ */
+- pgd_test_and_unpin(pgd);
++ pgd_dtor(pgd);
+
+- /* in the PAE case user pgd entries are overwritten before usage */
+- if (PTRS_PER_PMD > 1) {
+- for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) {
+- pgd_t pgdent = pgd[i];
+- void* pmd = (void *)__va(pgd_val(pgdent)-1);
+- paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
+- pmd_cache_free(pmd, i);
+- }
++ if (PTRS_PER_PMD > 1 && !xen_feature(XENFEAT_pae_pgdir_above_4gb))
++ xen_destroy_contiguous_region((unsigned long)pgd, 0);
+
+- if (!xen_feature(XENFEAT_pae_pgdir_above_4gb))
+- xen_destroy_contiguous_region((unsigned long)pgd, 0);
+- }
++ pgd_mop_up_pmds(mm, pgd);
++ free_page((unsigned long)pgd);
++}
+
+- /* in the non-PAE case, free_pgtables() clears user pgd entries */
+- quicklist_free(0, pgd_dtor, pgd);
++void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
++{
++ pgtable_page_dtor(pte);
++ paravirt_release_pt(page_to_pfn(pte));
++ tlb_remove_page(tlb, pte);
+ }
+
+-void check_pgt_cache(void)
++#ifdef CONFIG_X86_PAE
++
++void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
+ {
+- quicklist_trim(0, pgd_dtor, 25, 16);
++ paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
++ tlb_remove_page(tlb, virt_to_page(pmd));
+ }
+
++#endif
++
+ void make_lowmem_page_readonly(void *va, unsigned int feature)
+ {
+ pte_t *pte;
++ unsigned int level;
+ int rc;
+
+ if (xen_feature(feature))
+ return;
+
+- pte = virt_to_ptep(va);
++ pte = lookup_address((unsigned long)va, &level);
++ BUG_ON(!pte || level != PG_LEVEL_4K || !pte_present(*pte));
+ rc = HYPERVISOR_update_va_mapping(
+ (unsigned long)va, pte_wrprotect(*pte), 0);
+ BUG_ON(rc);
+@@ -491,313 +413,15 @@ void make_lowmem_page_readonly(void *va,
+ void make_lowmem_page_writable(void *va, unsigned int feature)
+ {
+ pte_t *pte;
++ unsigned int level;
+ int rc;
+
+ if (xen_feature(feature))
+ return;
+
+- pte = virt_to_ptep(va);
++ pte = lookup_address((unsigned long)va, &level);
++ BUG_ON(!pte || level != PG_LEVEL_4K || !pte_present(*pte));
+ rc = HYPERVISOR_update_va_mapping(
+ (unsigned long)va, pte_mkwrite(*pte), 0);
+ BUG_ON(rc);
+ }
+-
+-void make_page_readonly(void *va, unsigned int feature)
+-{
+- pte_t *pte;
+- int rc;
+-
+- if (xen_feature(feature))
+- return;
+-
+- pte = virt_to_ptep(va);
+- rc = HYPERVISOR_update_va_mapping(
+- (unsigned long)va, pte_wrprotect(*pte), 0);
+- if (rc) /* fallback? */
+- xen_l1_entry_update(pte, pte_wrprotect(*pte));
+- if ((unsigned long)va >= (unsigned long)high_memory) {
+- unsigned long pfn = pte_pfn(*pte);
+-#ifdef CONFIG_HIGHMEM
+- if (pfn >= highstart_pfn)
+- kmap_flush_unused(); /* flush stale writable kmaps */
+- else
+-#endif
+- make_lowmem_page_readonly(
+- phys_to_virt(pfn << PAGE_SHIFT), feature);
+- }
+-}
+-
+-void make_page_writable(void *va, unsigned int feature)
+-{
+- pte_t *pte;
+- int rc;
+-
+- if (xen_feature(feature))
+- return;
+-
+- pte = virt_to_ptep(va);
+- rc = HYPERVISOR_update_va_mapping(
+- (unsigned long)va, pte_mkwrite(*pte), 0);
+- if (rc) /* fallback? */
+- xen_l1_entry_update(pte, pte_mkwrite(*pte));
+- if ((unsigned long)va >= (unsigned long)high_memory) {
+- unsigned long pfn = pte_pfn(*pte);
+-#ifdef CONFIG_HIGHMEM
+- if (pfn < highstart_pfn)
+-#endif
+- make_lowmem_page_writable(
+- phys_to_virt(pfn << PAGE_SHIFT), feature);
+- }
+-}
+-
+-void make_pages_readonly(void *va, unsigned int nr, unsigned int feature)
+-{
+- if (xen_feature(feature))
+- return;
+-
+- while (nr-- != 0) {
+- make_page_readonly(va, feature);
+- va = (void *)((unsigned long)va + PAGE_SIZE);
+- }
+-}
+-
+-void make_pages_writable(void *va, unsigned int nr, unsigned int feature)
+-{
+- if (xen_feature(feature))
+- return;
+-
+- while (nr-- != 0) {
+- make_page_writable(va, feature);
+- va = (void *)((unsigned long)va + PAGE_SIZE);
+- }
+-}
+-
+-static void _pin_lock(struct mm_struct *mm, int lock) {
+- if (lock)
+- spin_lock(&mm->page_table_lock);
+-#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+- /* While mm->page_table_lock protects us against insertions and
+- * removals of higher level page table pages, it doesn't protect
+- * against updates of pte-s. Such updates, however, require the
+- * pte pages to be in consistent state (unpinned+writable or
+- * pinned+readonly). The pinning and attribute changes, however
+- * cannot be done atomically, which is why such updates must be
+- * prevented from happening concurrently.
+- * Note that no pte lock can ever elsewhere be acquired nesting
+- * with an already acquired one in the same mm, or with the mm's
+- * page_table_lock already acquired, as that would break in the
+- * non-split case (where all these are actually resolving to the
+- * one page_table_lock). Thus acquiring all of them here is not
+- * going to result in dead locks, and the order of acquires
+- * doesn't matter.
+- */
+- {
+- pgd_t *pgd = mm->pgd;
+- unsigned g;
+-
+- for (g = 0; g < USER_PTRS_PER_PGD; g++, pgd++) {
+- pud_t *pud;
+- unsigned u;
+-
+- if (pgd_none(*pgd))
+- continue;
+- pud = pud_offset(pgd, 0);
+- for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
+- pmd_t *pmd;
+- unsigned m;
+-
+- if (pud_none(*pud))
+- continue;
+- pmd = pmd_offset(pud, 0);
+- for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
+- spinlock_t *ptl;
+-
+- if (pmd_none(*pmd))
+- continue;
+- ptl = pte_lockptr(0, pmd);
+- if (lock)
+- spin_lock(ptl);
+- else
+- spin_unlock(ptl);
+- }
+- }
+- }
+- }
+-#endif
+- if (!lock)
+- spin_unlock(&mm->page_table_lock);
+-}
+-#define pin_lock(mm) _pin_lock(mm, 1)
+-#define pin_unlock(mm) _pin_lock(mm, 0)
+-
+-#define PIN_BATCH 4
+-static DEFINE_PER_CPU(multicall_entry_t[PIN_BATCH], pb_mcl);
+-
+-static inline unsigned int pgd_walk_set_prot(struct page *page, pgprot_t flags,
+- unsigned int cpu, unsigned seq)
+-{
+- unsigned long pfn = page_to_pfn(page);
+-
+- if (PageHighMem(page)) {
+- if (pgprot_val(flags) & _PAGE_RW)
+- ClearPagePinned(page);
+- else
+- SetPagePinned(page);
+- } else {
+- MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
+- (unsigned long)__va(pfn << PAGE_SHIFT),
+- pfn_pte(pfn, flags), 0);
+- if (unlikely(++seq == PIN_BATCH)) {
+- if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
+- PIN_BATCH, NULL)))
+- BUG();
+- seq = 0;
+- }
+- }
+-
+- return seq;
+-}
+-
+-static void pgd_walk(pgd_t *pgd_base, pgprot_t flags)
+-{
+- pgd_t *pgd = pgd_base;
+- pud_t *pud;
+- pmd_t *pmd;
+- int g, u, m;
+- unsigned int cpu, seq;
+-
+- if (xen_feature(XENFEAT_auto_translated_physmap))
+- return;
+-
+- cpu = get_cpu();
+-
+- for (g = 0, seq = 0; g < USER_PTRS_PER_PGD; g++, pgd++) {
+- if (pgd_none(*pgd))
+- continue;
+- pud = pud_offset(pgd, 0);
+- if (PTRS_PER_PUD > 1) /* not folded */
+- seq = pgd_walk_set_prot(virt_to_page(pud),flags,cpu,seq);
+- for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
+- if (pud_none(*pud))
+- continue;
+- pmd = pmd_offset(pud, 0);
+- if (PTRS_PER_PMD > 1) /* not folded */
+- seq = pgd_walk_set_prot(virt_to_page(pmd),flags,cpu,seq);
+- for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
+- if (pmd_none(*pmd))
+- continue;
+- seq = pgd_walk_set_prot(pmd_page(*pmd),flags,cpu,seq);
+- }
+- }
+- }
+-
+- if (likely(seq != 0)) {
+- MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
+- (unsigned long)pgd_base,
+- pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
+- UVMF_TLB_FLUSH);
+- if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
+- seq + 1, NULL)))
+- BUG();
+- } else if(HYPERVISOR_update_va_mapping((unsigned long)pgd_base,
+- pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
+- UVMF_TLB_FLUSH))
+- BUG();
+-
+- put_cpu();
+-}
+-
+-static void __pgd_pin(pgd_t *pgd)
+-{
+- pgd_walk(pgd, PAGE_KERNEL_RO);
+- kmap_flush_unused();
+- xen_pgd_pin(__pa(pgd));
+- SetPagePinned(virt_to_page(pgd));
+-}
+-
+-static void __pgd_unpin(pgd_t *pgd)
+-{
+- xen_pgd_unpin(__pa(pgd));
+- pgd_walk(pgd, PAGE_KERNEL);
+- ClearPagePinned(virt_to_page(pgd));
+-}
+-
+-static void pgd_test_and_unpin(pgd_t *pgd)
+-{
+- if (PagePinned(virt_to_page(pgd)))
+- __pgd_unpin(pgd);
+-}
+-
+-void mm_pin(struct mm_struct *mm)
+-{
+- if (xen_feature(XENFEAT_writable_page_tables))
+- return;
+- pin_lock(mm);
+- __pgd_pin(mm->pgd);
+- pin_unlock(mm);
+-}
+-
+-void mm_unpin(struct mm_struct *mm)
+-{
+- if (xen_feature(XENFEAT_writable_page_tables))
+- return;
+- pin_lock(mm);
+- __pgd_unpin(mm->pgd);
+- pin_unlock(mm);
+-}
+-
+-void mm_pin_all(void)
+-{
+- struct page *page;
+- unsigned long flags;
+-
+- if (xen_feature(XENFEAT_writable_page_tables))
+- return;
+-
+- /*
+- * Allow uninterrupted access to the pgd_list. Also protects
+- * __pgd_pin() by disabling preemption.
+- * All other CPUs must be at a safe point (e.g., in stop_machine
+- * or offlined entirely).
+- */
+- spin_lock_irqsave(&pgd_lock, flags);
+- for (page = pgd_list; page; page = (struct page *)page->index) {
+- if (!PagePinned(page))
+- __pgd_pin((pgd_t *)page_address(page));
+- }
+- spin_unlock_irqrestore(&pgd_lock, flags);
+-}
+-
+-void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
+-{
+- if (!PagePinned(virt_to_page(mm->pgd)))
+- mm_pin(mm);
+-}
+-
+-void arch_exit_mmap(struct mm_struct *mm)
+-{
+- struct task_struct *tsk = current;
+-
+- task_lock(tsk);
+-
+- /*
+- * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
+- * *much* faster this way, as no tlb flushes means bigger wrpt batches.
+- */
+- if (tsk->active_mm == mm) {
+- tsk->active_mm = &init_mm;
+- atomic_inc(&init_mm.mm_count);
+-
+- switch_mm(mm, &init_mm, tsk);
+-
+- atomic_dec(&mm->mm_count);
+- BUG_ON(atomic_read(&mm->mm_count) == 0);
+- }
+-
+- task_unlock(tsk);
+-
+- if (PagePinned(virt_to_page(mm->pgd)) &&
+- (atomic_read(&mm->mm_count) == 1) &&
+- !mm->context.has_foreign_mappings)
+- mm_unpin(mm);
+-}
+--- sle11-2009-06-29.orig/arch/x86/pci/irq-xen.c 2009-02-16 16:18:36.000000000 +0100
++++ sle11-2009-06-29/arch/x86/pci/irq-xen.c 2009-03-16 16:33:40.000000000 +0100
+@@ -204,6 +204,7 @@ static int pirq_ali_get(struct pci_dev *
+ {
+ static const unsigned char irqmap[16] = { 0, 9, 3, 10, 4, 5, 7, 6, 1, 11, 0, 12, 0, 14, 0, 15 };
+
++ WARN_ON_ONCE(pirq >= 16);
+ return irqmap[read_config_nybble(router, 0x48, pirq-1)];
+ }
+
+@@ -211,7 +212,8 @@ static int pirq_ali_set(struct pci_dev *
+ {
+ static const unsigned char irqmap[16] = { 0, 8, 0, 2, 4, 5, 7, 6, 0, 1, 3, 9, 11, 0, 13, 15 };
+ unsigned int val = irqmap[irq];
+-
++
++ WARN_ON_ONCE(pirq >= 16);
+ if (val) {
+ write_config_nybble(router, 0x48, pirq-1, val);
+ return 1;
+@@ -261,12 +263,16 @@ static int pirq_via_set(struct pci_dev *
+ static int pirq_via586_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
+ {
+ static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 };
++
++ WARN_ON_ONCE(pirq >= 5);
+ return read_config_nybble(router, 0x55, pirqmap[pirq-1]);
+ }
+
+ static int pirq_via586_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
+ {
+ static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 };
++
++ WARN_ON_ONCE(pirq >= 5);
+ write_config_nybble(router, 0x55, pirqmap[pirq-1], irq);
+ return 1;
+ }
+@@ -279,12 +285,16 @@ static int pirq_via586_set(struct pci_de
+ static int pirq_ite_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
+ {
+ static const unsigned char pirqmap[4] = { 1, 0, 2, 3 };
++
++ WARN_ON_ONCE(pirq >= 4);
+ return read_config_nybble(router,0x43, pirqmap[pirq-1]);
+ }
+
+ static int pirq_ite_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
+ {
+ static const unsigned char pirqmap[4] = { 1, 0, 2, 3 };
++
++ WARN_ON_ONCE(pirq >= 4);
+ write_config_nybble(router, 0x43, pirqmap[pirq-1], irq);
+ return 1;
+ }
+@@ -423,6 +433,7 @@ static int pirq_sis_set(struct pci_dev *
+
+ static int pirq_vlsi_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
+ {
++ WARN_ON_ONCE(pirq >= 9);
+ if (pirq > 8) {
+ printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
+ return 0;
+@@ -432,6 +443,7 @@ static int pirq_vlsi_get(struct pci_dev
+
+ static int pirq_vlsi_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
+ {
++ WARN_ON_ONCE(pirq >= 9);
+ if (pirq > 8) {
+ printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
+ return 0;
+@@ -453,14 +465,14 @@ static int pirq_vlsi_set(struct pci_dev
+ */
+ static int pirq_serverworks_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
+ {
+- outb_p(pirq, 0xc00);
++ outb(pirq, 0xc00);
+ return inb(0xc01) & 0xf;
+ }
+
+ static int pirq_serverworks_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
+ {
+- outb_p(pirq, 0xc00);
+- outb_p(irq, 0xc01);
++ outb(pirq, 0xc00);
++ outb(irq, 0xc01);
+ return 1;
+ }
+
+@@ -575,6 +587,10 @@ static __init int intel_router_probe(str
+ case PCI_DEVICE_ID_INTEL_ICH9_4:
+ case PCI_DEVICE_ID_INTEL_ICH9_5:
+ case PCI_DEVICE_ID_INTEL_TOLAPAI_0:
++ case PCI_DEVICE_ID_INTEL_ICH10_0:
++ case PCI_DEVICE_ID_INTEL_ICH10_1:
++ case PCI_DEVICE_ID_INTEL_ICH10_2:
++ case PCI_DEVICE_ID_INTEL_ICH10_3:
+ r->name = "PIIX/ICH";
+ r->get = pirq_piix_get;
+ r->set = pirq_piix_set;
+--- sle11-2009-06-29.orig/arch/x86/vdso/Makefile 2008-11-25 12:35:54.000000000 +0100
++++ sle11-2009-06-29/arch/x86/vdso/Makefile 2009-03-16 16:33:40.000000000 +0100
+@@ -66,6 +66,7 @@ vdso32.so-$(VDSO32-y) += int80
+ vdso32.so-$(CONFIG_COMPAT) += syscall
+ vdso32.so-$(VDSO32-y) += sysenter
+ xen-vdso32-$(subst 1,$(CONFIG_COMPAT),$(shell expr $(CONFIG_XEN_COMPAT)0 '<' 0x0302000)) += int80
++xen-vdso32-$(CONFIG_X86_32) += syscall
+ vdso32.so-$(CONFIG_XEN) += $(xen-vdso32-y)
+
+ vdso32-images = $(vdso32.so-y:%=vdso32-%.so)
+--- sle11-2009-06-29.orig/arch/x86/vdso/vdso32/syscall.S 2009-06-29 15:14:52.000000000 +0200
++++ sle11-2009-06-29/arch/x86/vdso/vdso32/syscall.S 2009-03-16 16:33:40.000000000 +0100
+@@ -19,8 +19,10 @@ __kernel_vsyscall:
+ .Lpush_ebp:
+ movl %ecx, %ebp
+ syscall
++#ifndef CONFIG_XEN
+ movl $__USER32_DS, %ecx
+ movl %ecx, %ss
++#endif
+ movl %ebp, %ecx
+ popl %ebp
+ .Lpop_ebp:
+--- sle11-2009-06-29.orig/arch/x86/vdso/vdso32.S 2009-06-29 15:14:52.000000000 +0200
++++ sle11-2009-06-29/arch/x86/vdso/vdso32.S 2009-03-16 16:33:40.000000000 +0100
+@@ -19,4 +19,16 @@ vdso32_sysenter_start:
+ .incbin "arch/x86/vdso/vdso32-sysenter.so"
+ vdso32_sysenter_end:
+
++#if defined(CONFIG_X86_64_XEN) && CONFIG_XEN_COMPAT < 0x030200
++ .globl vdso32_int80_start, vdso32_int80_end
++vdso32_int80_start:
++ .incbin "arch/x86/vdso/vdso32-int80.so"
++vdso32_int80_end:
++#elif defined(CONFIG_X86_XEN)
++ .globl vdso32_syscall_start, vdso32_syscall_end
++vdso32_syscall_start:
++ .incbin "arch/x86/vdso/vdso32-syscall.so"
++vdso32_syscall_end:
++#endif
++
+ __FINIT
+--- sle11-2009-06-29.orig/arch/x86/vdso/vdso32-setup.c 2008-11-25 12:35:53.000000000 +0100
++++ sle11-2009-06-29/arch/x86/vdso/vdso32-setup.c 2009-03-16 16:33:40.000000000 +0100
+@@ -26,10 +26,6 @@
+ #include <asm/vdso.h>
+ #include <asm/proto.h>
+
+-#ifdef CONFIG_XEN
+-#include <xen/interface/callback.h>
+-#endif
+-
+ enum {
+ VDSO_DISABLED = 0,
+ VDSO_ENABLED = 1,
+@@ -229,7 +225,6 @@ static inline void map_compat_vdso(int m
+
+ void enable_sep_cpu(void)
+ {
+-#ifndef CONFIG_XEN
+ int cpu = get_cpu();
+ struct tss_struct *tss = &per_cpu(init_tss, cpu);
+
+@@ -244,35 +239,6 @@ void enable_sep_cpu(void)
+ wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.sp1, 0);
+ wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) ia32_sysenter_target, 0);
+ put_cpu();
+-#else
+- extern asmlinkage void ia32pv_sysenter_target(void);
+- static struct callback_register sysenter = {
+- .type = CALLBACKTYPE_sysenter,
+- .address = { __KERNEL_CS, (unsigned long)ia32pv_sysenter_target },
+- };
+-
+- if (!boot_cpu_has(X86_FEATURE_SEP))
+- return;
+-
+- get_cpu();
+-
+- if (xen_feature(XENFEAT_supervisor_mode_kernel))
+- sysenter.address.eip = (unsigned long)ia32_sysenter_target;
+-
+- switch (HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter)) {
+- case 0:
+- break;
+-#if CONFIG_XEN_COMPAT < 0x030200
+- case -ENOSYS:
+- sysenter.type = CALLBACKTYPE_sysenter_deprecated;
+- if (HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) == 0)
+- break;
+-#endif
+- default:
+- clear_bit(X86_FEATURE_SEP, boot_cpu_data.x86_capability);
+- break;
+- }
+-#endif
+ }
+
+ static struct vm_area_struct gate_vma;
+--- /dev/null 1970-01-01 00:00:00.000000000 +0000
++++ sle11-2009-06-29/arch/x86/vdso/vdso32-setup-xen.c 2009-03-16 16:33:40.000000000 +0100
+@@ -0,0 +1,506 @@
++/*
++ * (C) Copyright 2002 Linus Torvalds
++ * Portions based on the vdso-randomization code from exec-shield:
++ * Copyright(C) 2005-2006, Red Hat, Inc., Ingo Molnar
++ *
++ * This file contains the needed initializations to support sysenter.
++ */
++
++#include <linux/init.h>
++#include <linux/smp.h>
++#include <linux/thread_info.h>
++#include <linux/sched.h>
++#include <linux/gfp.h>
++#include <linux/string.h>
++#include <linux/elf.h>
++#include <linux/mm.h>
++#include <linux/err.h>
++#include <linux/module.h>
++
++#include <asm/cpufeature.h>
++#include <asm/msr.h>
++#include <asm/pgtable.h>
++#include <asm/unistd.h>
++#include <asm/elf.h>
++#include <asm/tlbflush.h>
++#include <asm/vdso.h>
++#include <asm/proto.h>
++
++#include <xen/interface/callback.h>
++
++enum {
++ VDSO_DISABLED = 0,
++ VDSO_ENABLED = 1,
++ VDSO_COMPAT = 2,
++};
++
++#ifdef CONFIG_COMPAT_VDSO
++#define VDSO_DEFAULT VDSO_COMPAT
++#else
++#define VDSO_DEFAULT VDSO_ENABLED
++#endif
++
++#ifdef CONFIG_X86_64
++#define vdso_enabled sysctl_vsyscall32
++#define arch_setup_additional_pages syscall32_setup_pages
++#endif
++
++/*
++ * This is the difference between the prelinked addresses in the vDSO images
++ * and the VDSO_HIGH_BASE address where CONFIG_COMPAT_VDSO places the vDSO
++ * in the user address space.
++ */
++#define VDSO_ADDR_ADJUST (VDSO_HIGH_BASE - (unsigned long)VDSO32_PRELINK)
++
++/*
++ * Should the kernel map a VDSO page into processes and pass its
++ * address down to glibc upon exec()?
++ */
++unsigned int __read_mostly vdso_enabled = VDSO_DEFAULT;
++
++static int __init vdso_setup(char *s)
++{
++ vdso_enabled = simple_strtoul(s, NULL, 0);
++
++ return 1;
++}
++
++/*
++ * For consistency, the argument vdso32=[012] affects the 32-bit vDSO
++ * behavior on both 64-bit and 32-bit kernels.
++ * On 32-bit kernels, vdso=[012] means the same thing.
++ */
++__setup("vdso32=", vdso_setup);
++
++#ifdef CONFIG_X86_32
++__setup_param("vdso=", vdso32_setup, vdso_setup, 0);
++
++EXPORT_SYMBOL_GPL(vdso_enabled);
++#endif
++
++static __init void reloc_symtab(Elf32_Ehdr *ehdr,
++ unsigned offset, unsigned size)
++{
++ Elf32_Sym *sym = (void *)ehdr + offset;
++ unsigned nsym = size / sizeof(*sym);
++ unsigned i;
++
++ for(i = 0; i < nsym; i++, sym++) {
++ if (sym->st_shndx == SHN_UNDEF ||
++ sym->st_shndx == SHN_ABS)
++ continue; /* skip */
++
++ if (sym->st_shndx > SHN_LORESERVE) {
++ printk(KERN_INFO "VDSO: unexpected st_shndx %x\n",
++ sym->st_shndx);
++ continue;
++ }
++
++ switch(ELF_ST_TYPE(sym->st_info)) {
++ case STT_OBJECT:
++ case STT_FUNC:
++ case STT_SECTION:
++ case STT_FILE:
++ sym->st_value += VDSO_ADDR_ADJUST;
++ }
++ }
++}
++
++static __init void reloc_dyn(Elf32_Ehdr *ehdr, unsigned offset)
++{
++ Elf32_Dyn *dyn = (void *)ehdr + offset;
++
++ for(; dyn->d_tag != DT_NULL; dyn++)
++ switch(dyn->d_tag) {
++ case DT_PLTGOT:
++ case DT_HASH:
++ case DT_STRTAB:
++ case DT_SYMTAB:
++ case DT_RELA:
++ case DT_INIT:
++ case DT_FINI:
++ case DT_REL:
++ case DT_DEBUG:
++ case DT_JMPREL:
++ case DT_VERSYM:
++ case DT_VERDEF:
++ case DT_VERNEED:
++ case DT_ADDRRNGLO ... DT_ADDRRNGHI:
++ /* definitely pointers needing relocation */
++ dyn->d_un.d_ptr += VDSO_ADDR_ADJUST;
++ break;
++
++ case DT_ENCODING ... OLD_DT_LOOS-1:
++ case DT_LOOS ... DT_HIOS-1:
++ /* Tags above DT_ENCODING are pointers if
++ they're even */
++ if (dyn->d_tag >= DT_ENCODING &&
++ (dyn->d_tag & 1) == 0)
++ dyn->d_un.d_ptr += VDSO_ADDR_ADJUST;
++ break;
++
++ case DT_VERDEFNUM:
++ case DT_VERNEEDNUM:
++ case DT_FLAGS_1:
++ case DT_RELACOUNT:
++ case DT_RELCOUNT:
++ case DT_VALRNGLO ... DT_VALRNGHI:
++ /* definitely not pointers */
++ break;
++
++ case OLD_DT_LOOS ... DT_LOOS-1:
++ case DT_HIOS ... DT_VALRNGLO-1:
++ default:
++ if (dyn->d_tag > DT_ENCODING)
++ printk(KERN_INFO "VDSO: unexpected DT_tag %x\n",
++ dyn->d_tag);
++ break;
++ }
++}
++
++static __init void relocate_vdso(Elf32_Ehdr *ehdr)
++{
++ Elf32_Phdr *phdr;
++ Elf32_Shdr *shdr;
++ int i;
++
++ BUG_ON(memcmp(ehdr->e_ident, ELFMAG, 4) != 0 ||
++ !elf_check_arch_ia32(ehdr) ||
++ ehdr->e_type != ET_DYN);
++
++ ehdr->e_entry += VDSO_ADDR_ADJUST;
++
++ /* rebase phdrs */
++ phdr = (void *)ehdr + ehdr->e_phoff;
++ for (i = 0; i < ehdr->e_phnum; i++) {
++ phdr[i].p_vaddr += VDSO_ADDR_ADJUST;
++
++ /* relocate dynamic stuff */
++ if (phdr[i].p_type == PT_DYNAMIC)
++ reloc_dyn(ehdr, phdr[i].p_offset);
++ }
++
++ /* rebase sections */
++ shdr = (void *)ehdr + ehdr->e_shoff;
++ for(i = 0; i < ehdr->e_shnum; i++) {
++ if (!(shdr[i].sh_flags & SHF_ALLOC))
++ continue;
++
++ shdr[i].sh_addr += VDSO_ADDR_ADJUST;
++
++ if (shdr[i].sh_type == SHT_SYMTAB ||
++ shdr[i].sh_type == SHT_DYNSYM)
++ reloc_symtab(ehdr, shdr[i].sh_offset,
++ shdr[i].sh_size);
++ }
++}
++
++/*
++ * These symbols are defined by vdso32.S to mark the bounds
++ * of the ELF DSO images included therein.
++ */
++extern const char vdso32_default_start, vdso32_default_end;
++extern const char vdso32_sysenter_start, vdso32_sysenter_end;
++static struct page *vdso32_pages[1];
++
++#ifdef CONFIG_X86_64
++
++#if CONFIG_XEN_COMPAT < 0x030200
++static int use_int80 = 1;
++#endif
++static int use_sysenter __read_mostly = -1;
++
++#define vdso32_sysenter() (use_sysenter > 0)
++
++/* May not be __init: called during resume */
++void syscall32_cpu_init(void)
++{
++ static const struct callback_register cstar = {
++ .type = CALLBACKTYPE_syscall32,
++ .address = (unsigned long)ia32_cstar_target
++ };
++ static const struct callback_register sysenter = {
++ .type = CALLBACKTYPE_sysenter,
++ .address = (unsigned long)ia32_sysenter_target
++ };
++
++ if ((HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) < 0) ||
++ (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) < 0))
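++ /* If either registration fails, fall back to the int80 vDSO when
++ * pre-3.2 hypervisor compatibility is configured; otherwise the
++ * failure is fatal. */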
++#if CONFIG_XEN_COMPAT < 0x030200
++ return;
++ use_int80 = 0;
++#else
++ BUG();
++#endif
++
++ if (use_sysenter < 0)
++ use_sysenter = (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL);
++}
++
++#define compat_uses_vma 1
++
++static inline void map_compat_vdso(int map)
++{
++}
++
++#else /* CONFIG_X86_32 */
++
++#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SEP))
++
++extern asmlinkage void ia32pv_cstar_target(void);
++static const struct callback_register __cpuinitconst cstar = {
++ .type = CALLBACKTYPE_syscall32,
++ .address = { __KERNEL_CS, (unsigned long)ia32pv_cstar_target },
++};
++
++void __cpuinit enable_sep_cpu(void)
++{
++ extern asmlinkage void ia32pv_sysenter_target(void);
++ static struct callback_register __cpuinitdata sysenter = {
++ .type = CALLBACKTYPE_sysenter,
++ .address = { __KERNEL_CS, (unsigned long)ia32pv_sysenter_target },
++ };
++
++ if (boot_cpu_has(X86_FEATURE_SYSCALL)) {
++ if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) != 0)
++ BUG();
++ return;
++ }
++
++ if (!boot_cpu_has(X86_FEATURE_SEP))
++ return;
++
++ if (xen_feature(XENFEAT_supervisor_mode_kernel))
++ sysenter.address.eip = (unsigned long)ia32_sysenter_target;
++
++ switch (HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter)) {
++ case 0:
++ break;
++#if CONFIG_XEN_COMPAT < 0x030200
++ case -ENOSYS:
++ sysenter.type = CALLBACKTYPE_sysenter_deprecated;
++ if (HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) == 0)
++ break;
++#endif
++ default:
++ setup_clear_cpu_cap(X86_FEATURE_SEP);
++ break;
++ }
++}
++
++static struct vm_area_struct gate_vma;
++
++static int __init gate_vma_init(void)
++{
++ gate_vma.vm_mm = NULL;
++ gate_vma.vm_start = FIXADDR_USER_START;
++ gate_vma.vm_end = FIXADDR_USER_END;
++ gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
++ gate_vma.vm_page_prot = __P101;
++ /*
++ * Make sure the vDSO gets into every core dump.
++ * Dumping its contents makes post-mortem fully interpretable later
++ * without matching up the same kernel and hardware config to see
++ * what PC values meant.
++ */
++ gate_vma.vm_flags |= VM_ALWAYSDUMP;
++ return 0;
++}
++
++#define compat_uses_vma 0
++
++static void map_compat_vdso(int map)
++{
++ static int vdso_mapped;
++
++ if (map == vdso_mapped)
++ return;
++
++ vdso_mapped = map;
++
++ __set_fixmap(FIX_VDSO, page_to_pfn(vdso32_pages[0]) << PAGE_SHIFT,
++ map ? PAGE_READONLY_EXEC : PAGE_NONE);
++
++ /* flush stray tlbs */
++ flush_tlb_all();
++}
++
++#endif /* CONFIG_X86_64 */
++
++int __init sysenter_setup(void)
++{
++ void *syscall_page = (void *)get_zeroed_page(GFP_ATOMIC);
++ const void *vsyscall;
++ size_t vsyscall_len;
++
++ vdso32_pages[0] = virt_to_page(syscall_page);
++
++#ifdef CONFIG_X86_32
++ gate_vma_init();
++
++ printk("Compat vDSO mapped to %08lx.\n", __fix_to_virt(FIX_VDSO));
++#endif
++
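++ /* Select the vDSO image to copy: int80 for old 64-bit hypervisors,
++ * syscall where CPU and hypervisor support it on 32-bit, otherwise
++ * the sysenter or default (int80) image. */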
++#if defined(CONFIG_X86_64) && CONFIG_XEN_COMPAT < 0x030200
++ if (use_int80) {
++ extern const char vdso32_int80_start, vdso32_int80_end;
++
++ vsyscall = &vdso32_int80_start;
++ vsyscall_len = &vdso32_int80_end - &vdso32_int80_start;
++ } else
++#elif defined(CONFIG_X86_32)
++ if (boot_cpu_has(X86_FEATURE_SYSCALL)
++ && (boot_cpu_data.x86_vendor != X86_VENDOR_AMD
++ || HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) != 0))
++ setup_clear_cpu_cap(X86_FEATURE_SYSCALL);
++ barrier(); /* until clear_bit()'s constraints are correct ... */
++ if (boot_cpu_has(X86_FEATURE_SYSCALL)) {
++ extern const char vdso32_syscall_start, vdso32_syscall_end;
++
++ vsyscall = &vdso32_syscall_start;
++ vsyscall_len = &vdso32_syscall_end - &vdso32_syscall_start;
++ } else
++#endif
++ if (!vdso32_sysenter()) {
++ vsyscall = &vdso32_default_start;
++ vsyscall_len = &vdso32_default_end - &vdso32_default_start;
++ } else {
++ vsyscall = &vdso32_sysenter_start;
++ vsyscall_len = &vdso32_sysenter_end - &vdso32_sysenter_start;
++ }
++
++ memcpy(syscall_page, vsyscall, vsyscall_len);
++ relocate_vdso(syscall_page);
++
++ return 0;
++}
++
++/* Setup a VMA at program startup for the vsyscall page */
++int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
++{
++ struct mm_struct *mm = current->mm;
++ unsigned long addr;
++ int ret = 0;
++ bool compat;
++
++ down_write(&mm->mmap_sem);
++
++ /* Test compat mode once here, in case someone
++ changes it via sysctl */
++ compat = (vdso_enabled == VDSO_COMPAT);
++
++ map_compat_vdso(compat);
++
++ if (compat)
++ addr = VDSO_HIGH_BASE;
++ else {
++ addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0);
++ if (IS_ERR_VALUE(addr)) {
++ ret = addr;
++ goto up_fail;
++ }
++ }
++
++ if (compat_uses_vma || !compat) {
++ /*
++ * MAYWRITE to allow gdb to COW and set breakpoints
++ *
++ * Make sure the vDSO gets into every core dump.
++ * Dumping its contents makes post-mortem fully
++ * interpretable later without matching up the same
++ * kernel and hardware config to see what PC values
++ * meant.
++ */
++ ret = install_special_mapping(mm, addr, PAGE_SIZE,
++ VM_READ|VM_EXEC|
++ VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
++ VM_ALWAYSDUMP,
++ vdso32_pages);
++
++ if (ret)
++ goto up_fail;
++ }
++
++ current->mm->context.vdso = (void *)addr;
++ current_thread_info()->sysenter_return =
++ VDSO32_SYMBOL(addr, SYSENTER_RETURN);
++
++ up_fail:
++ up_write(&mm->mmap_sem);
++
++ return ret;
++}
++
++#ifdef CONFIG_X86_64
++
++/*
++ * This must be done early in case we have an initrd containing 32-bit
++ * binaries (e.g., hotplug). This could be pushed upstream.
++ */
++core_initcall(sysenter_setup);
++
++#ifdef CONFIG_SYSCTL
++/* Register vsyscall32 into the ABI table */
++#include <linux/sysctl.h>
++
++static ctl_table abi_table2[] = {
++ {
++ .procname = "vsyscall32",
++ .data = &sysctl_vsyscall32,
++ .maxlen = sizeof(int),
++ .mode = 0644,
++ .proc_handler = proc_dointvec
++ },
++ {}
++};
++
++static ctl_table abi_root_table2[] = {
++ {
++ .ctl_name = CTL_ABI,
++ .procname = "abi",
++ .mode = 0555,
++ .child = abi_table2
++ },
++ {}
++};
++
++static __init int ia32_binfmt_init(void)
++{
++ register_sysctl_table(abi_root_table2);
++ return 0;
++}
++__initcall(ia32_binfmt_init);
++#endif
++
++#else /* CONFIG_X86_32 */
++
++const char *arch_vma_name(struct vm_area_struct *vma)
++{
++ if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
++ return "[vdso]";
++ return NULL;
++}
++
++struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
++{
++ struct mm_struct *mm = tsk->mm;
++
++ /* Check to see if this task was created in compat vdso mode */
++ if (mm && mm->context.vdso == (void *)VDSO_HIGH_BASE)
++ return &gate_vma;
++ return NULL;
++}
++
++int in_gate_area(struct task_struct *task, unsigned long addr)
++{
++ const struct vm_area_struct *vma = get_gate_vma(task);
++
++ return vma && addr >= vma->vm_start && addr < vma->vm_end;
++}
++
++int in_gate_area_no_task(unsigned long addr)
++{
++ return 0;
++}
++
++#endif /* CONFIG_X86_64 */
+--- sle11-2009-06-29.orig/drivers/pci/msi-xen.c 2009-02-16 16:18:36.000000000 +0100
++++ sle11-2009-06-29/drivers/pci/msi-xen.c 2009-03-16 16:33:40.000000000 +0100
+@@ -43,6 +43,53 @@ struct msi_pirq_entry {
+ int entry_nr;
+ };
+
++/* Arch hooks */
++
++int __attribute__ ((weak))
++arch_msi_check_device(struct pci_dev *dev, int nvec, int type)
++{
++ return 0;
++}
++
++#ifndef CONFIG_XEN
++int __attribute__ ((weak))
++arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *entry)
++{
++ return 0;
++}
++
++int __attribute__ ((weak))
++arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
++{
++ struct msi_desc *entry;
++ int ret;
++
++ list_for_each_entry(entry, &dev->msi_list, list) {
++ ret = arch_setup_msi_irq(dev, entry);
++ if (ret)
++ return ret;
++ }
++
++ return 0;
++}
++
++void __attribute__ ((weak)) arch_teardown_msi_irq(unsigned int irq)
++{
++ return;
++}
++
++void __attribute__ ((weak))
++arch_teardown_msi_irqs(struct pci_dev *dev)
++{
++ struct msi_desc *entry;
++
++ list_for_each_entry(entry, &dev->msi_list, list) {
++ if (entry->irq != 0)
++ arch_teardown_msi_irq(entry->irq);
++ }
++}
++#endif
++
+ static void msi_set_enable(struct pci_dev *dev, int enable)
+ {
+ int pos;
+@@ -270,7 +317,6 @@ static void pci_intx_for_msi(struct pci_
+ pci_intx(dev, enable);
+ }
+
+-#ifdef CONFIG_PM
+ static void __pci_restore_msi_state(struct pci_dev *dev)
+ {
+ int pirq;
+@@ -328,7 +374,7 @@ void pci_restore_msi_state(struct pci_de
+ __pci_restore_msi_state(dev);
+ __pci_restore_msix_state(dev);
+ }
+-#endif /* CONFIG_PM */
++EXPORT_SYMBOL_GPL(pci_restore_msi_state);
+
+ /**
+ * msi_capability_init - configure device's MSI capability structure
+@@ -755,51 +801,3 @@ void pci_msi_init_pci_dev(struct pci_dev
+ INIT_LIST_HEAD(&dev->msi_list);
+ #endif
+ }
+-
+-
+-/* Arch hooks */
+-
+-int __attribute__ ((weak))
+-arch_msi_check_device(struct pci_dev* dev, int nvec, int type)
+-{
+- return 0;
+-}
+-
+-#ifndef CONFIG_XEN
+-int __attribute__ ((weak))
+-arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *entry)
+-{
+- return 0;
+-}
+-
+-int __attribute__ ((weak))
+-arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
+-{
+- struct msi_desc *entry;
+- int ret;
+-
+- list_for_each_entry(entry, &dev->msi_list, list) {
+- ret = arch_setup_msi_irq(dev, entry);
+- if (ret)
+- return ret;
+- }
+-
+- return 0;
+-}
+-
+-void __attribute__ ((weak)) arch_teardown_msi_irq(unsigned int irq)
+-{
+- return;
+-}
+-
+-void __attribute__ ((weak))
+-arch_teardown_msi_irqs(struct pci_dev *dev)
+-{
+- struct msi_desc *entry;
+-
+- list_for_each_entry(entry, &dev->msi_list, list) {
+- if (entry->irq != 0)
+- arch_teardown_msi_irq(entry->irq);
+- }
+-}
+-#endif
+--- sle11-2009-06-29.orig/drivers/pci/pci.c 2009-06-29 15:14:52.000000000 +0200
++++ sle11-2009-06-29/drivers/pci/pci.c 2009-03-16 16:33:40.000000000 +0100
+@@ -353,7 +353,12 @@ pci_find_parent_resource(const struct pc
+ * Restore the BAR values for a given device, so as to make it
+ * accessible by its driver.
+ */
++#ifndef CONFIG_XEN
+ static void
++#else
++EXPORT_SYMBOL_GPL(pci_restore_bars);
++void
++#endif
+ pci_restore_bars(struct pci_dev *dev)
+ {
+ int i, numres;
+--- sle11-2009-06-29.orig/drivers/xen/balloon/sysfs.c 2009-03-04 11:25:55.000000000 +0100
++++ sle11-2009-06-29/drivers/xen/balloon/sysfs.c 2009-06-29 15:29:24.000000000 +0200
+@@ -104,7 +104,7 @@ static struct attribute_group balloon_in
+ };
+
+ static struct sysdev_class balloon_sysdev_class = {
+- set_kset_name(BALLOON_CLASS_NAME),
++ .name = BALLOON_CLASS_NAME,
+ };
+
+ static struct sys_device balloon_sysdev;
+--- sle11-2009-06-29.orig/drivers/xen/blkback/blkback.c 2009-02-16 16:18:36.000000000 +0100
++++ sle11-2009-06-29/drivers/xen/blkback/blkback.c 2009-03-16 16:33:40.000000000 +0100
+@@ -148,7 +148,7 @@ static void unplug_queue(blkif_t *blkif)
+ return;
+ if (blkif->plug->unplug_fn)
+ blkif->plug->unplug_fn(blkif->plug);
+- blk_put_queue(blkif->plug);
++ kobject_put(&blkif->plug->kobj);
+ blkif->plug = NULL;
+ }
+
+@@ -159,7 +159,8 @@ static void plug_queue(blkif_t *blkif, s
+ if (q == blkif->plug)
+ return;
+ unplug_queue(blkif);
+- blk_get_queue(q);
++ WARN_ON(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags));
++ kobject_get(&q->kobj);
+ blkif->plug = q;
+ }
+
+--- sle11-2009-06-29.orig/drivers/xen/blkfront/blkfront.c 2009-02-16 16:18:36.000000000 +0100
++++ sle11-2009-06-29/drivers/xen/blkfront/blkfront.c 2009-03-24 10:12:53.000000000 +0100
+@@ -713,7 +713,6 @@ static irqreturn_t blkif_int(int irq, vo
+ RING_IDX i, rp;
+ unsigned long flags;
+ struct blkfront_info *info = (struct blkfront_info *)dev_id;
+- int uptodate;
+
+ spin_lock_irqsave(&blkif_io_lock, flags);
+
+@@ -738,13 +737,13 @@ static irqreturn_t blkif_int(int irq, vo
+
+ ADD_ID_TO_FREELIST(info, id);
+
+- uptodate = (bret->status == BLKIF_RSP_OKAY);
++ ret = bret->status == BLKIF_RSP_OKAY ? 0 : -EIO;
+ switch (bret->operation) {
+ case BLKIF_OP_WRITE_BARRIER:
+ if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
+ printk("blkfront: %s: write barrier op failed\n",
+ info->gd->disk_name);
+- uptodate = -EOPNOTSUPP;
++ ret = -EOPNOTSUPP;
+ info->feature_barrier = 0;
+ xlvbd_barrier(info);
+ }
+@@ -755,10 +754,8 @@ static irqreturn_t blkif_int(int irq, vo
+ DPRINTK("Bad return from blkdev data "
+ "request: %x\n", bret->status);
+
+- ret = end_that_request_first(req, uptodate,
+- req->hard_nr_sectors);
++ ret = __blk_end_request(req, ret, blk_rq_bytes(req));
+ BUG_ON(ret);
+- end_that_request_last(req, uptodate);
+ break;
+ default:
+ BUG();
+--- sle11-2009-06-29.orig/drivers/xen/blktap/blktap.c 2009-04-20 11:37:50.000000000 +0200
++++ sle11-2009-06-29/drivers/xen/blktap/blktap.c 2009-04-20 11:38:54.000000000 +0200
+@@ -331,8 +331,8 @@ static pte_t blktap_clear_pte(struct vm_
+ * if vm_file is NULL (meaning mmap failed and we have nothing to do)
+ */
+ if (uvaddr < uvstart || vma->vm_file == NULL)
+- return ptep_get_and_clear_full(vma->vm_mm, uvaddr,
+- ptep, is_fullmm);
++ return xen_ptep_get_and_clear_full(vma, uvaddr, ptep,
++ is_fullmm);
+
+ info = vma->vm_file->private_data;
+ priv = vma->vm_private_data;
+@@ -379,8 +379,8 @@ static pte_t blktap_clear_pte(struct vm_
+ BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap));
+
+ /* USING SHADOW PAGE TABLES. */
+- copy = ptep_get_and_clear_full(vma->vm_mm, uvaddr, ptep,
+- is_fullmm);
++ copy = xen_ptep_get_and_clear_full(vma, uvaddr, ptep,
++ is_fullmm);
+ }
+
+ if (count) {
+--- sle11-2009-06-29.orig/drivers/xen/core/Makefile 2009-06-29 15:14:52.000000000 +0200
++++ sle11-2009-06-29/drivers/xen/core/Makefile 2009-03-16 16:33:40.000000000 +0100
+@@ -10,5 +10,6 @@ obj-$(CONFIG_SYS_HYPERVISOR) += hypervis
+ obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o
+ obj-$(CONFIG_XEN_SYSFS) += xen_sysfs.o
+ obj-$(CONFIG_XEN_SMPBOOT) += smpboot.o
++obj-$(CONFIG_X86_SMP) += spinlock.o
+ obj-$(CONFIG_KEXEC) += machine_kexec.o
+ obj-$(CONFIG_XEN_XENCOMM) += xencomm.o
+--- sle11-2009-06-29.orig/drivers/xen/core/evtchn.c 2009-03-04 11:25:55.000000000 +0100
++++ sle11-2009-06-29/drivers/xen/core/evtchn.c 2009-03-16 16:33:40.000000000 +0100
+@@ -194,7 +194,7 @@ static inline unsigned int cpu_from_evtc
+
+ /* Upcall to generic IRQ layer. */
+ #ifdef CONFIG_X86
+-extern fastcall unsigned int do_IRQ(struct pt_regs *regs);
++extern unsigned int do_IRQ(struct pt_regs *regs);
+ void __init xen_init_IRQ(void);
+ void __init init_IRQ(void)
+ {
+@@ -203,13 +203,11 @@ void __init init_IRQ(void)
+ }
+ #if defined (__i386__)
+ static inline void exit_idle(void) {}
+-#define IRQ_REG orig_eax
+ #elif defined (__x86_64__)
+ #include <asm/idle.h>
+-#define IRQ_REG orig_rax
+ #endif
+ #define do_IRQ(irq, regs) do { \
+- (regs)->IRQ_REG = ~(irq); \
++ (regs)->orig_ax = ~(irq); \
+ do_IRQ((regs)); \
+ } while (0)
+ #endif
+@@ -670,13 +668,12 @@ static void set_affinity_irq(unsigned in
+ int resend_irq_on_evtchn(unsigned int irq)
+ {
+ int masked, evtchn = evtchn_from_irq(irq);
+- shared_info_t *s = HYPERVISOR_shared_info;
+
+ if (!VALID_EVTCHN(evtchn))
+ return 1;
+
+ masked = test_and_set_evtchn_mask(evtchn);
+- synch_set_bit(evtchn, s->evtchn_pending);
++ set_evtchn(evtchn);
+ if (!masked)
+ unmask_evtchn(evtchn);
+
+@@ -969,6 +966,43 @@ void disable_all_local_evtchn(void)
+ synch_set_bit(i, &s->evtchn_mask[0]);
+ }
+
++/* Clear an irq's pending state, in preparation for polling on it. */
++void xen_clear_irq_pending(int irq)
++{
++ int evtchn = evtchn_from_irq(irq);
++
++ if (VALID_EVTCHN(evtchn))
++ clear_evtchn(evtchn);
++}
++
++/* Set an irq's pending state, to avoid blocking on it. */
++void xen_set_irq_pending(int irq)
++{
++ int evtchn = evtchn_from_irq(irq);
++
++ if (VALID_EVTCHN(evtchn))
++ set_evtchn(evtchn);
++}
++
++/* Test an irq's pending state. */
++int xen_test_irq_pending(int irq)
++{
++ int evtchn = evtchn_from_irq(irq);
++
++ return VALID_EVTCHN(evtchn) && test_evtchn(evtchn);
++}
++
++/* Poll waiting for an irq to become pending. In the usual case, the
++ irq will be disabled so it won't deliver an interrupt. */
++void xen_poll_irq(int irq)
++{
++ evtchn_port_t evtchn = evtchn_from_irq(irq);
++
++ if (VALID_EVTCHN(evtchn)
++ && HYPERVISOR_poll_no_timeout(&evtchn, 1))
++ BUG();
++}
++
+ static void restore_cpu_virqs(unsigned int cpu)
+ {
+ struct evtchn_bind_virq bind_virq;
+@@ -1022,8 +1056,8 @@ static void restore_cpu_ipis(unsigned in
+ bind_evtchn_to_cpu(evtchn, cpu);
+
+ /* Ready for use. */
+- unmask_evtchn(evtchn);
+-
++ if (!(irq_desc[irq].status & IRQ_DISABLED))
++ unmask_evtchn(evtchn);
+ }
+ }
+
+--- sle11-2009-06-29.orig/drivers/xen/core/hypervisor_sysfs.c 2008-12-15 11:27:22.000000000 +0100
++++ sle11-2009-06-29/drivers/xen/core/hypervisor_sysfs.c 2009-03-16 16:33:40.000000000 +0100
+@@ -50,7 +50,7 @@ static int __init hypervisor_subsys_init
+ if (!is_running_on_xen())
+ return -ENODEV;
+
+- hypervisor_subsys.kobj.ktype = &hyp_sysfs_kobj_type;
++ hypervisor_kobj->ktype = &hyp_sysfs_kobj_type;
+ return 0;
+ }
+
+--- sle11-2009-06-29.orig/drivers/xen/core/smpboot.c 2009-02-16 16:18:36.000000000 +0100
++++ sle11-2009-06-29/drivers/xen/core/smpboot.c 2009-03-16 16:33:40.000000000 +0100
+@@ -135,6 +135,10 @@ static int __cpuinit xen_smp_intr_init(u
+ goto fail;
+ per_cpu(callfunc_irq, cpu) = rc;
+
++ rc = xen_spinlock_init(cpu);
++ if (rc < 0)
++ goto fail;
++
+ if ((cpu != 0) && ((rc = local_setup_timer(cpu)) != 0))
+ goto fail;
+
+@@ -145,6 +149,7 @@ static int __cpuinit xen_smp_intr_init(u
+ unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
+ if (per_cpu(callfunc_irq, cpu) >= 0)
+ unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
++ xen_spinlock_cleanup(cpu);
+ return rc;
+ }
+
+@@ -156,6 +161,7 @@ static void xen_smp_intr_exit(unsigned i
+
+ unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
+ unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
++ xen_spinlock_cleanup(cpu);
+ }
+ #endif
+
+@@ -208,36 +214,25 @@ static void __cpuinit cpu_initialize_con
+ smp_trap_init(ctxt.trap_ctxt);
+
+ ctxt.ldt_ents = 0;
+- ctxt.gdt_ents = GDT_SIZE / 8;
+-
+-#ifdef __i386__
+ ctxt.gdt_frames[0] = virt_to_mfn(get_cpu_gdt_table(cpu));
++ ctxt.gdt_ents = GDT_SIZE / 8;
+
+ ctxt.user_regs.cs = __KERNEL_CS;
+- ctxt.user_regs.esp = idle->thread.esp0 - sizeof(struct pt_regs);
++ ctxt.user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
+
+ ctxt.kernel_ss = __KERNEL_DS;
+- ctxt.kernel_sp = idle->thread.esp0;
++ ctxt.kernel_sp = idle->thread.sp0;
+
+- ctxt.event_callback_cs = __KERNEL_CS;
+ ctxt.event_callback_eip = (unsigned long)hypervisor_callback;
+- ctxt.failsafe_callback_cs = __KERNEL_CS;
+ ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;
++#ifdef __i386__
++ ctxt.event_callback_cs = __KERNEL_CS;
++ ctxt.failsafe_callback_cs = __KERNEL_CS;
+
+ ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
+
+ ctxt.user_regs.fs = __KERNEL_PERCPU;
+ #else /* __x86_64__ */
+- ctxt.gdt_frames[0] = virt_to_mfn(cpu_gdt_descr[cpu].address);
+-
+- ctxt.user_regs.cs = __KERNEL_CS;
+- ctxt.user_regs.esp = idle->thread.rsp0 - sizeof(struct pt_regs);
+-
+- ctxt.kernel_ss = __KERNEL_DS;
+- ctxt.kernel_sp = idle->thread.rsp0;
+-
+- ctxt.event_callback_eip = (unsigned long)hypervisor_callback;
+- ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;
+ ctxt.syscall_callback_eip = (unsigned long)system_call;
+
+ ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(init_level4_pgt));
+--- /dev/null 1970-01-01 00:00:00.000000000 +0000
++++ sle11-2009-06-29/drivers/xen/core/spinlock.c 2009-03-16 16:33:40.000000000 +0100
+@@ -0,0 +1,161 @@
++/*
++ * Xen spinlock functions
++ *
++ * See arch/x86/xen/smp.c for copyright and credits for derived
++ * portions of this file.
++ */
++
++#include <linux/init.h>
++#include <linux/irq.h>
++#include <linux/kernel.h>
++#include <linux/kernel_stat.h>
++#include <linux/module.h>
++#include <xen/evtchn.h>
++
++extern irqreturn_t smp_reschedule_interrupt(int, void *);
++
++static DEFINE_PER_CPU(int, spinlock_irq) = -1;
++static char spinlock_name[NR_CPUS][15];
++
++struct spinning {
++ raw_spinlock_t *lock;
++ unsigned int ticket;
++ struct spinning *prev;
++};
++static DEFINE_PER_CPU(struct spinning *, spinning);
++/*
++ * Protect removal of objects: addition can be done locklessly, and even
++ * removal itself doesn't need protection - what needs to be prevented is
++ * removed objects going out of scope (as they're allocated on the stack).
++ */
++static DEFINE_PER_CPU(raw_rwlock_t, spinning_rm_lock) = __RAW_RW_LOCK_UNLOCKED;
++
++int __cpuinit xen_spinlock_init(unsigned int cpu)
++{
++ int rc;
++
++ sprintf(spinlock_name[cpu], "spinlock%u", cpu);
++ rc = bind_ipi_to_irqhandler(SPIN_UNLOCK_VECTOR,
++ cpu,
++ smp_reschedule_interrupt,
++ IRQF_DISABLED|IRQF_NOBALANCING,
++ spinlock_name[cpu],
++ NULL);
++ if (rc < 0)
++ return rc;
++
++ disable_irq(rc); /* make sure it's never delivered */
++ per_cpu(spinlock_irq, cpu) = rc;
++
++ return 0;
++}
++
++void __cpuinit xen_spinlock_cleanup(unsigned int cpu)
++{
++ if (per_cpu(spinlock_irq, cpu) >= 0)
++ unbind_from_irqhandler(per_cpu(spinlock_irq, cpu), NULL);
++ per_cpu(spinlock_irq, cpu) = -1;
++}
++
++int xen_spin_wait(raw_spinlock_t *lock, unsigned int token)
++{
++ int rc = 0, irq = __get_cpu_var(spinlock_irq);
++ raw_rwlock_t *rm_lock;
++ unsigned long flags;
++ struct spinning spinning;
++
++ /* If the kicker interrupt is not initialized yet, just spin. */
++ if (unlikely(irq < 0) || unlikely(!cpu_online(smp_processor_id())))
++ return 0;
++
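++ /* The low TICKET_SHIFT bits of ->slock hold the ticket currently
++ * being served; keep only our own ticket number for comparison. */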
++ token >>= TICKET_SHIFT;
++
++ /* announce we're spinning */
++ spinning.ticket = token;
++ spinning.lock = lock;
++ spinning.prev = __get_cpu_var(spinning);
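++ /* Publish the entry only after it is fully initialized;
++ * paired with the smp_rmb() in xen_spin_kick(). */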
++ smp_wmb();
++ __get_cpu_var(spinning) = &spinning;
++
++ /* clear pending */
++ xen_clear_irq_pending(irq);
++
++ do {
++ /* Check again to make sure it didn't become free while
++ * we weren't looking. */
++ if ((lock->slock & ((1U << TICKET_SHIFT) - 1)) == token) {
++ /* If we interrupted another spinlock while it was
++ * blocking, make sure it doesn't block (again)
++ * without rechecking the lock. */
++ if (spinning.prev)
++ xen_set_irq_pending(irq);
++ rc = 1;
++ break;
++ }
++
++ /* block until irq becomes pending */
++ xen_poll_irq(irq);
++ } while (!xen_test_irq_pending(irq));
++
++ /* Leave the irq pending so that any interrupted blocker will
++ * re-check. */
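++ /* Account the kick as an interrupt unless the lock came free
++ * without our having to block. */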
++ kstat_this_cpu.irqs[irq] += !rc;
++
++ /* announce we're done */
++ __get_cpu_var(spinning) = spinning.prev;
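++ /* Taking the write side of the rm_lock drains any xen_spin_kick()
++ * still reading our on-stack entry, so that it cannot be referenced
++ * once it goes out of scope. */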
++ rm_lock = &__get_cpu_var(spinning_rm_lock);
++ raw_local_irq_save(flags);
++ __raw_write_lock(rm_lock);
++ __raw_write_unlock(rm_lock);
++ raw_local_irq_restore(flags);
++
++ return rc;
++}
++EXPORT_SYMBOL(xen_spin_wait);
++
++unsigned int xen_spin_adjust(raw_spinlock_t *lock, unsigned int token)
++{
++ return token; /* TODO */
++}
++EXPORT_SYMBOL(xen_spin_adjust);
++
++int xen_spin_wait_flags(raw_spinlock_t *lock, unsigned int *token,
++ unsigned int flags)
++{
++ return xen_spin_wait(lock, *token); /* TODO */
++}
++EXPORT_SYMBOL(xen_spin_wait_flags);
++
++void xen_spin_kick(raw_spinlock_t *lock, unsigned int token)
++{
++ unsigned int cpu;
++
++ token &= (1U << TICKET_SHIFT) - 1;
++ for_each_online_cpu(cpu) {
++ raw_rwlock_t *rm_lock;
++ unsigned long flags;
++ struct spinning *spinning;
++
++ if (cpu == raw_smp_processor_id())
++ continue;
++
++ rm_lock = &per_cpu(spinning_rm_lock, cpu);
++ raw_local_irq_save(flags);
++ __raw_read_lock(rm_lock);
++
++ spinning = per_cpu(spinning, cpu);
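++ /* paired with the smp_wmb() in xen_spin_wait() */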
++ smp_rmb();
++ if (spinning
++ && (spinning->lock != lock || spinning->ticket != token))
++ spinning = NULL;
++
++ __raw_read_unlock(rm_lock);
++ raw_local_irq_restore(flags);
++
++ if (unlikely(spinning)) {
++ notify_remote_via_irq(per_cpu(spinlock_irq, cpu));
++ return;
++ }
++ }
++}
++EXPORT_SYMBOL(xen_spin_kick);
+--- sle11-2009-06-29.orig/drivers/xen/core/xen_sysfs.c 2008-12-15 11:27:22.000000000 +0100
++++ sle11-2009-06-29/drivers/xen/core/xen_sysfs.c 2009-03-16 16:33:40.000000000 +0100
+@@ -29,12 +29,12 @@ HYPERVISOR_ATTR_RO(type);
+
+ static int __init xen_sysfs_type_init(void)
+ {
+- return sysfs_create_file(&hypervisor_subsys.kobj, &type_attr.attr);
++ return sysfs_create_file(hypervisor_kobj, &type_attr.attr);
+ }
+
+ static void xen_sysfs_type_destroy(void)
+ {
+- sysfs_remove_file(&hypervisor_subsys.kobj, &type_attr.attr);
++ sysfs_remove_file(hypervisor_kobj, &type_attr.attr);
+ }
+
+ /* xen version attributes */
+@@ -90,13 +90,12 @@ static struct attribute_group version_gr
+
+ static int __init xen_sysfs_version_init(void)
+ {
+- return sysfs_create_group(&hypervisor_subsys.kobj,
+- &version_group);
++ return sysfs_create_group(hypervisor_kobj, &version_group);
+ }
+
+ static void xen_sysfs_version_destroy(void)
+ {
+- sysfs_remove_group(&hypervisor_subsys.kobj, &version_group);
++ sysfs_remove_group(hypervisor_kobj, &version_group);
+ }
+
+ /* UUID */
+@@ -126,12 +125,12 @@ HYPERVISOR_ATTR_RO(uuid);
+
+ static int __init xen_sysfs_uuid_init(void)
+ {
+- return sysfs_create_file(&hypervisor_subsys.kobj, &uuid_attr.attr);
++ return sysfs_create_file(hypervisor_kobj, &uuid_attr.attr);
+ }
+
+ static void xen_sysfs_uuid_destroy(void)
+ {
+- sysfs_remove_file(&hypervisor_subsys.kobj, &uuid_attr.attr);
++ sysfs_remove_file(hypervisor_kobj, &uuid_attr.attr);
+ }
+
+ /* xen compilation attributes */
+@@ -204,14 +203,12 @@ static struct attribute_group xen_compil
+
+ int __init static xen_compilation_init(void)
+ {
+- return sysfs_create_group(&hypervisor_subsys.kobj,
+- &xen_compilation_group);
++ return sysfs_create_group(hypervisor_kobj, &xen_compilation_group);
+ }
+
+ static void xen_compilation_destroy(void)
+ {
+- sysfs_remove_group(&hypervisor_subsys.kobj,
+- &xen_compilation_group);
++ sysfs_remove_group(hypervisor_kobj, &xen_compilation_group);
+ }
+
+ /* xen properties info */
+@@ -325,14 +322,12 @@ static struct attribute_group xen_proper
+
+ static int __init xen_properties_init(void)
+ {
+- return sysfs_create_group(&hypervisor_subsys.kobj,
+- &xen_properties_group);
++ return sysfs_create_group(hypervisor_kobj, &xen_properties_group);
+ }
+
+ static void xen_properties_destroy(void)
+ {
+- sysfs_remove_group(&hypervisor_subsys.kobj,
+- &xen_properties_group);
++ sysfs_remove_group(hypervisor_kobj, &xen_properties_group);
+ }
+
+ #ifdef CONFIG_KEXEC
+@@ -350,13 +345,12 @@ HYPERVISOR_ATTR_RO(vmcoreinfo);
+
+ static int __init xen_sysfs_vmcoreinfo_init(void)
+ {
+- return sysfs_create_file(&hypervisor_subsys.kobj,
+- &vmcoreinfo_attr.attr);
++ return sysfs_create_file(hypervisor_kobj, &vmcoreinfo_attr.attr);
+ }
+
+ static void xen_sysfs_vmcoreinfo_destroy(void)
+ {
+- sysfs_remove_file(&hypervisor_subsys.kobj, &vmcoreinfo_attr.attr);
++ sysfs_remove_file(hypervisor_kobj, &vmcoreinfo_attr.attr);
+ }
+
+ #endif
+--- sle11-2009-06-29.orig/drivers/xen/gntdev/gntdev.c 2009-03-04 11:28:34.000000000 +0100
++++ sle11-2009-06-29/drivers/xen/gntdev/gntdev.c 2009-03-16 16:33:40.000000000 +0100
+@@ -782,7 +782,7 @@ static pte_t gntdev_clear_pte(struct vm_
+ op.status);
+ } else {
+ /* USING SHADOW PAGE TABLES. */
+- copy = ptep_get_and_clear_full(vma->vm_mm, addr, ptep, is_fullmm);
++ copy = xen_ptep_get_and_clear_full(vma, addr, ptep, is_fullmm);
+ }
+
+ /* Finally, we unmap the grant from kernel space. */
+@@ -810,7 +810,7 @@ static pte_t gntdev_clear_pte(struct vm_
+ >> PAGE_SHIFT, INVALID_P2M_ENTRY);
+
+ } else {
+- copy = ptep_get_and_clear_full(vma->vm_mm, addr, ptep, is_fullmm);
++ copy = xen_ptep_get_and_clear_full(vma, addr, ptep, is_fullmm);
+ }
+
+ return copy;
+--- sle11-2009-06-29.orig/drivers/xen/scsifront/scsifront.c 2009-02-16 16:18:36.000000000 +0100
++++ sle11-2009-06-29/drivers/xen/scsifront/scsifront.c 2009-03-16 16:33:40.000000000 +0100
+@@ -260,19 +260,19 @@ static int map_data_for_request(struct v
+ return -ENOMEM;
+ }
+
+- if (sc->use_sg) {
++ if (scsi_bufflen(sc)) {
+ /* quoted scsi_lib.c/scsi_req_map_sg . */
+- struct scatterlist *sg, *sgl = (struct scatterlist *)sc->request_buffer;
+- unsigned int data_len = sc->request_bufflen;
++ struct scatterlist *sg, *sgl = scsi_sglist(sc);
++ unsigned int data_len = scsi_bufflen(sc);
+
+- nr_pages = (sc->request_bufflen + sgl->offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
++ nr_pages = (data_len + sgl->offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ if (nr_pages > VSCSIIF_SG_TABLESIZE) {
+ printk(KERN_ERR "scsifront: Unable to map request_buffer for command!\n");
+ ref_cnt = (-E2BIG);
+ goto big_to_sg;
+ }
+
+- for_each_sg (sgl, sg, sc->use_sg, i) {
++ for_each_sg (sgl, sg, scsi_sg_count(sc), i) {
+ page = sg_page(sg);
+ off = sg->offset;
+ len = sg->length;
+@@ -306,45 +306,6 @@ static int map_data_for_request(struct v
+ ref_cnt++;
+ }
+ }
+- } else if (sc->request_bufflen) {
+- unsigned long end = ((unsigned long)sc->request_buffer
+- + sc->request_bufflen + PAGE_SIZE - 1) >> PAGE_SHIFT;
+- unsigned long start = (unsigned long)sc->request_buffer >> PAGE_SHIFT;
+-
+- page = virt_to_page(sc->request_buffer);
+- nr_pages = end - start;
+- len = sc->request_bufflen;
+-
+- if (nr_pages > VSCSIIF_SG_TABLESIZE) {
+- ref_cnt = (-E2BIG);
+- goto big_to_sg;
+- }
+-
+- buffer_pfn = page_to_phys(page) >> PAGE_SHIFT;
+-
+- off = offset_in_page((unsigned long)sc->request_buffer);
+- for (i = 0; i < nr_pages; i++) {
+- bytes = PAGE_SIZE - off;
+-
+- if (bytes > len)
+- bytes = len;
+-
+- ref = gnttab_claim_grant_reference(&gref_head);
+- BUG_ON(ref == -ENOSPC);
+-
+- gnttab_grant_foreign_access_ref(ref, info->dev->otherend_id,
+- buffer_pfn, write);
+-
+- info->shadow[id].gref[i] = ref;
+- ring_req->seg[i].gref = ref;
+- ring_req->seg[i].offset = (uint16_t)off;
+- ring_req->seg[i].length = (uint16_t)bytes;
+-
+- buffer_pfn++;
+- len -= bytes;
+- off = 0;
+- ref_cnt++;
+- }
+ }
+
+ big_to_sg:
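+
+The scsifront conversion replaces the removed request_buffer /
+request_bufflen / use_sg triple with the scsi data accessors
+(scsi_sglist(), scsi_bufflen(), scsi_sg_count()); since the accessors went
+in, every data-carrying command arrives as a scatterlist, which is why the
+single-buffer else-branch could be deleted outright. The idiom as a small
+self-contained sketch (function name hypothetical):
+
+	#include <linux/scatterlist.h>
+	#include <scsi/scsi_cmnd.h>
+
+	static unsigned int count_command_bytes(struct scsi_cmnd *sc)
+	{
+		struct scatterlist *sg;
+		unsigned int total = 0;
+		int i;
+
+		/* no-data commands have scsi_sg_count() == 0: loop is skipped */
+		for_each_sg(scsi_sglist(sc), sg, scsi_sg_count(sc), i)
+			total += sg->length;	/* sg_page()/sg->offset locate each chunk */
+
+		return total;	/* equals scsi_bufflen(sc) for data commands */
+	}
+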
+--- sle11-2009-06-29.orig/drivers/xen/xenoprof/xenoprofile.c 2009-03-11 15:39:38.000000000 +0100
++++ sle11-2009-06-29/drivers/xen/xenoprof/xenoprofile.c 2009-03-16 16:33:40.000000000 +0100
+@@ -78,7 +78,7 @@ static int xenoprof_resume(struct sys_de
+
+
+ static struct sysdev_class oprofile_sysclass = {
+- set_kset_name("oprofile"),
++ .name = "oprofile",
+ .resume = xenoprof_resume,
+ .suspend = xenoprof_suspend
+ };
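+
+The xenoprofile hunk tracks the 2.6.25 sysdev cleanup: struct sysdev_class
+now carries a plain "name" string instead of an embedded,
+set_kset_name()-initialized kset. New code simply declares the class as
+below (class name hypothetical):
+
+	#include <linux/sysdev.h>
+
+	static struct sysdev_class example_sysclass = {
+		.name = "example",	/* was: set_kset_name("example") */
+	};
+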
+--- sle11-2009-06-29.orig/include/asm-x86/e820.h 2009-06-29 15:14:52.000000000 +0200
++++ sle11-2009-06-29/include/asm-x86/e820.h 2009-03-16 16:33:40.000000000 +0100
+@@ -127,7 +127,11 @@ extern char *memory_setup(void);
+ #endif /* __KERNEL__ */
+ #endif /* __ASSEMBLY__ */
+
++#ifndef CONFIG_XEN
+ #define ISA_START_ADDRESS 0xa0000
++#else
++#define ISA_START_ADDRESS 0
++#endif
+ #define ISA_END_ADDRESS 0x100000
+ #define is_ISA_range(s, e) ((s) >= ISA_START_ADDRESS && (e) < ISA_END_ADDRESS)
+
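+
+Rationale for the e820.h hunk: a Xen guest's pseudophysical address space
+has no VGA/BIOS hole starting at 0xa0000, so ISA_START_ADDRESS drops to 0
+and is_ISA_range() degenerates to a "below 1 MiB" test. A trivial sketch of
+the effect (function name hypothetical):
+
+	#include <asm/e820.h>
+
+	static int addr_range_is_isa(unsigned long start, unsigned long end)
+	{
+		/* native: [0xa0000, 0x100000); Xen: [0, 0x100000) */
+		return is_ISA_range(start, end);
+	}
+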
+--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/agp.h 2009-02-16 16:18:36.000000000 +0100
++++ sle11-2009-06-29/include/asm-x86/mach-xen/asm/agp.h 2009-03-16 16:33:40.000000000 +0100
+@@ -13,18 +13,13 @@
+ * page. This avoids data corruption on some CPUs.
+ */
+
+-/*
+- * Caller's responsibility to call global_flush_tlb() for performance
+- * reasons
+- */
+ #define map_page_into_agp(page) ( \
+ xen_create_contiguous_region((unsigned long)page_address(page), 0, 32) \
+- ?: change_page_attr(page, 1, PAGE_KERNEL_NOCACHE))
++ ?: set_pages_uc(page, 1))
+ #define unmap_page_from_agp(page) ( \
+ xen_destroy_contiguous_region((unsigned long)page_address(page), 0), \
+ /* only a fallback: xen_destroy_contiguous_region uses PAGE_KERNEL */ \
+- change_page_attr(page, 1, PAGE_KERNEL))
+-#define flush_agp_mappings() global_flush_tlb()
++ set_pages_wb(page, 1))
+
+ /*
+ * Could use CLFLUSH here if the cpu supports it. But then it would
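+
+The agp.h hunk follows the 2.6.25 pageattr rework: change_page_attr() plus
+a deferred global_flush_tlb() gave way to set_pages_uc()/set_pages_wb(),
+which flush on their own, hence the deleted flush_agp_mappings() and the
+dropped "caller's responsibility" comment. Usage sketch (function name
+hypothetical):
+
+	#include <asm/cacheflush.h>
+
+	static int example_make_uncached(struct page *page)
+	{
+		/* flushes TLBs/caches itself; no global_flush_tlb() */
+		return set_pages_uc(page, 1);
+	}
+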
+--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/desc.h 2009-02-16 16:18:36.000000000 +0100
++++ sle11-2009-06-29/include/asm-x86/mach-xen/asm/desc.h 2009-03-16 16:33:40.000000000 +0100
+@@ -1,5 +1,404 @@
++#ifndef _ASM_DESC_H_
++#define _ASM_DESC_H_
++
++#ifndef __ASSEMBLY__
++#include <asm/desc_defs.h>
++#include <asm/ldt.h>
++#include <asm/mmu.h>
++#include <linux/smp.h>
++
++static inline void fill_ldt(struct desc_struct *desc,
++ const struct user_desc *info)
++{
++ desc->limit0 = info->limit & 0x0ffff;
++ desc->base0 = info->base_addr & 0x0000ffff;
++
++ desc->base1 = (info->base_addr & 0x00ff0000) >> 16;
++ desc->type = (info->read_exec_only ^ 1) << 1;
++ desc->type |= info->contents << 2;
++ desc->s = 1;
++ desc->dpl = 0x3;
++ desc->p = info->seg_not_present ^ 1;
++ desc->limit = (info->limit & 0xf0000) >> 16;
++ desc->avl = info->useable;
++ desc->d = info->seg_32bit;
++ desc->g = info->limit_in_pages;
++ desc->base2 = (info->base_addr & 0xff000000) >> 24;
++}
++
++#ifndef CONFIG_X86_NO_IDT
++extern struct desc_ptr idt_descr;
++extern gate_desc idt_table[];
++#endif
++
++#ifdef CONFIG_X86_64
++extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
++extern struct desc_ptr cpu_gdt_descr[];
++/* the cpu gdt accessor */
++#define get_cpu_gdt_table(x) ((struct desc_struct *)cpu_gdt_descr[x].address)
++
++static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func,
++ unsigned dpl, unsigned ist, unsigned seg)
++{
++ gate->offset_low = PTR_LOW(func);
++ gate->segment = __KERNEL_CS;
++ gate->ist = ist;
++ gate->p = 1;
++ gate->dpl = dpl;
++ gate->zero0 = 0;
++ gate->zero1 = 0;
++ gate->type = type;
++ gate->offset_middle = PTR_MIDDLE(func);
++ gate->offset_high = PTR_HIGH(func);
++}
++
++#else
++struct gdt_page {
++ struct desc_struct gdt[GDT_ENTRIES];
++} __attribute__((aligned(PAGE_SIZE)));
++DECLARE_PER_CPU(struct gdt_page, gdt_page);
++
++static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
++{
++ return per_cpu(gdt_page, cpu).gdt;
++}
++
++static inline void pack_gate(gate_desc *gate, unsigned char type,
++ unsigned long base, unsigned dpl, unsigned flags, unsigned short seg)
++
++{
++ gate->a = (seg << 16) | (base & 0xffff);
++ gate->b = (base & 0xffff0000) |
++ (((0x80 | type | (dpl << 5)) & 0xff) << 8);
++}
++
++#endif
++
++static inline int desc_empty(const void *ptr)
++{
++ const u32 *desc = ptr;
++ return !(desc[0] | desc[1]);
++}
++
++#ifndef CONFIG_XEN
++#define load_TR_desc() native_load_tr_desc()
++#define load_gdt(dtr) native_load_gdt(dtr)
++#define load_idt(dtr) native_load_idt(dtr)
++#define load_tr(tr) __asm__ __volatile("ltr %0"::"m" (tr))
++#define load_ldt(ldt) __asm__ __volatile("lldt %0"::"m" (ldt))
++
++#define store_gdt(dtr) native_store_gdt(dtr)
++#define store_idt(dtr) native_store_idt(dtr)
++#define store_tr(tr) (tr = native_store_tr())
++#define store_ldt(ldt) __asm__ ("sldt %0":"=m" (ldt))
++
++#define load_TLS(t, cpu) native_load_tls(t, cpu)
++#define set_ldt native_set_ldt
++
++#define write_ldt_entry(dt, entry, desc) \
++ native_write_ldt_entry(dt, entry, desc)
++#define write_gdt_entry(dt, entry, desc, type) \
++ native_write_gdt_entry(dt, entry, desc, type)
++#define write_idt_entry(dt, entry, g) native_write_idt_entry(dt, entry, g)
++
++static inline void native_write_idt_entry(gate_desc *idt, int entry,
++ const gate_desc *gate)
++{
++ memcpy(&idt[entry], gate, sizeof(*gate));
++}
++
++static inline void native_write_ldt_entry(struct desc_struct *ldt, int entry,
++ const void *desc)
++{
++ memcpy(&ldt[entry], desc, 8);
++}
++
++static inline void native_write_gdt_entry(struct desc_struct *gdt, int entry,
++ const void *desc, int type)
++{
++ unsigned int size;
++ switch (type) {
++ case DESC_TSS:
++ size = sizeof(tss_desc);
++ break;
++ case DESC_LDT:
++ size = sizeof(ldt_desc);
++ break;
++ default:
++ size = sizeof(struct desc_struct);
++ break;
++ }
++ memcpy(&gdt[entry], desc, size);
++}
++#endif
++
++static inline void pack_descriptor(struct desc_struct *desc, unsigned long base,
++ unsigned long limit, unsigned char type,
++ unsigned char flags)
++{
++ desc->a = ((base & 0xffff) << 16) | (limit & 0xffff);
++ desc->b = (base & 0xff000000) | ((base & 0xff0000) >> 16) |
++ (limit & 0x000f0000) | ((type & 0xff) << 8) |
++ ((flags & 0xf) << 20);
++ desc->p = 1;
++}
++
++
++#ifndef CONFIG_XEN
++static inline void set_tssldt_descriptor(void *d, unsigned long addr,
++ unsigned type, unsigned size)
++{
++#ifdef CONFIG_X86_64
++ struct ldttss_desc64 *desc = d;
++ memset(desc, 0, sizeof(*desc));
++ desc->limit0 = size & 0xFFFF;
++ desc->base0 = PTR_LOW(addr);
++ desc->base1 = PTR_MIDDLE(addr) & 0xFF;
++ desc->type = type;
++ desc->p = 1;
++ desc->limit1 = (size >> 16) & 0xF;
++ desc->base2 = (PTR_MIDDLE(addr) >> 8) & 0xFF;
++ desc->base3 = PTR_HIGH(addr);
++#else
++
++ pack_descriptor((struct desc_struct *)d, addr, size, 0x80 | type, 0);
++#endif
++}
++
++static inline void __set_tss_desc(unsigned cpu, unsigned int entry, void *addr)
++{
++ struct desc_struct *d = get_cpu_gdt_table(cpu);
++ tss_desc tss;
++
++ /*
++ * sizeof(unsigned long) coming from an extra "long" at the end
++ * of the iobitmap. See tss_struct definition in processor.h
++ *
++ * -1? seg base+limit should be pointing to the address of the
++ * last valid byte
++ */
++ set_tssldt_descriptor(&tss, (unsigned long)addr, DESC_TSS,
++ IO_BITMAP_OFFSET + IO_BITMAP_BYTES + sizeof(unsigned long) - 1);
++ write_gdt_entry(d, entry, &tss, DESC_TSS);
++}
++
++#define set_tss_desc(cpu, addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr)
++
++static inline void native_set_ldt(const void *addr, unsigned int entries)
++{
++ if (likely(entries == 0))
++ __asm__ __volatile__("lldt %w0"::"q" (0));
++ else {
++ unsigned cpu = smp_processor_id();
++ ldt_desc ldt;
++
++ set_tssldt_descriptor(&ldt, (unsigned long)addr,
++ DESC_LDT, entries * sizeof(ldt) - 1);
++ write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT,
++ &ldt, DESC_LDT);
++ __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
++ }
++}
++
++static inline void native_load_tr_desc(void)
++{
++ asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
++}
++
++static inline void native_load_gdt(const struct desc_ptr *dtr)
++{
++ asm volatile("lgdt %0"::"m" (*dtr));
++}
++
++static inline void native_load_idt(const struct desc_ptr *dtr)
++{
++ asm volatile("lidt %0"::"m" (*dtr));
++}
++
++static inline void native_store_gdt(struct desc_ptr *dtr)
++{
++ asm volatile("sgdt %0":"=m" (*dtr));
++}
++
++static inline void native_store_idt(struct desc_ptr *dtr)
++{
++ asm volatile("sidt %0":"=m" (*dtr));
++}
++
++static inline unsigned long native_store_tr(void)
++{
++ unsigned long tr;
++ asm volatile("str %0":"=r" (tr));
++ return tr;
++}
++
++static inline void native_load_tls(struct thread_struct *t, unsigned int cpu)
++{
++ unsigned int i;
++ struct desc_struct *gdt = get_cpu_gdt_table(cpu);
++
++ for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
++ gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i];
++}
++#else
++#define load_TLS(t, cpu) xen_load_tls(t, cpu)
++#define set_ldt xen_set_ldt
++
++extern int write_ldt_entry(struct desc_struct *ldt, int entry,
++ const void *desc);
++extern int write_gdt_entry(struct desc_struct *gdt, int entry,
++ const void *desc, int type);
++
++static inline void xen_load_tls(struct thread_struct *t, unsigned int cpu)
++{
++ unsigned int i;
++ struct desc_struct *gdt = get_cpu_gdt_table(cpu) + GDT_ENTRY_TLS_MIN;
++
++ for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
++ if (HYPERVISOR_update_descriptor(virt_to_machine(&gdt[i]),
++ *(u64 *)&t->tls_array[i]))
++ BUG();
++}
++#endif
++
++#define _LDT_empty(info) (\
++ (info)->base_addr == 0 && \
++ (info)->limit == 0 && \
++ (info)->contents == 0 && \
++ (info)->read_exec_only == 1 && \
++ (info)->seg_32bit == 0 && \
++ (info)->limit_in_pages == 0 && \
++ (info)->seg_not_present == 1 && \
++ (info)->useable == 0)
++
++#ifdef CONFIG_X86_64
++#define LDT_empty(info) (_LDT_empty(info) && ((info)->lm == 0))
++#else
++#define LDT_empty(info) (_LDT_empty(info))
++#endif
++
++static inline void clear_LDT(void)
++{
++ set_ldt(NULL, 0);
++}
++
++/*
++ * load one particular LDT into the current CPU
++ */
++static inline void load_LDT_nolock(mm_context_t *pc)
++{
++ set_ldt(pc->ldt, pc->size);
++}
++
++static inline void load_LDT(mm_context_t *pc)
++{
++ preempt_disable();
++ load_LDT_nolock(pc);
++ preempt_enable();
++}
++
++static inline unsigned long get_desc_base(const struct desc_struct *desc)
++{
++ return desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24);
++}
++
++static inline unsigned long get_desc_limit(const struct desc_struct *desc)
++{
++ return desc->limit0 | (desc->limit << 16);
++}
++
++#ifndef CONFIG_X86_NO_IDT
++static inline void _set_gate(int gate, unsigned type, void *addr,
++ unsigned dpl, unsigned ist, unsigned seg)
++{
++ gate_desc s;
++ pack_gate(&s, type, (unsigned long)addr, dpl, ist, seg);
++ /*
++ * does not need to be atomic because it is only done once at
++ * setup time
++ */
++ write_idt_entry(idt_table, gate, &s);
++}
++
++/*
++ * This needs to use 'idt_table' rather than 'idt', and
++ * thus use the _nonmapped_ version of the IDT, as the
++ * Pentium F0 0F bugfix can have resulted in the mapped
++ * IDT being write-protected.
++ */
++static inline void set_intr_gate(unsigned int n, void *addr)
++{
++ BUG_ON((unsigned)n > 0xFF);
++ _set_gate(n, GATE_INTERRUPT, addr, 0, 0, __KERNEL_CS);
++}
++
++/*
++ * This routine sets up an interrupt gate at descriptor privilege level (DPL) 3.
++ */
++static inline void set_system_intr_gate(unsigned int n, void *addr)
++{
++ BUG_ON((unsigned)n > 0xFF);
++ _set_gate(n, GATE_INTERRUPT, addr, 0x3, 0, __KERNEL_CS);
++}
++
++static inline void set_trap_gate(unsigned int n, void *addr)
++{
++ BUG_ON((unsigned)n > 0xFF);
++ _set_gate(n, GATE_TRAP, addr, 0, 0, __KERNEL_CS);
++}
++
++static inline void set_system_gate(unsigned int n, void *addr)
++{
++ BUG_ON((unsigned)n > 0xFF);
+ #ifdef CONFIG_X86_32
+-# include "desc_32.h"
++ _set_gate(n, GATE_TRAP, addr, 0x3, 0, __KERNEL_CS);
++#else
++ _set_gate(n, GATE_INTERRUPT, addr, 0x3, 0, __KERNEL_CS);
++#endif
++}
++
++static inline void set_task_gate(unsigned int n, unsigned int gdt_entry)
++{
++ BUG_ON((unsigned)n > 0xFF);
++ _set_gate(n, GATE_TASK, (void *)0, 0, 0, (gdt_entry<<3));
++}
++
++static inline void set_intr_gate_ist(int n, void *addr, unsigned ist)
++{
++ BUG_ON((unsigned)n > 0xFF);
++ _set_gate(n, GATE_INTERRUPT, addr, 0, ist, __KERNEL_CS);
++}
++
++static inline void set_system_gate_ist(int n, void *addr, unsigned ist)
++{
++ BUG_ON((unsigned)n > 0xFF);
++ _set_gate(n, GATE_INTERRUPT, addr, 0x3, ist, __KERNEL_CS);
++}
++#endif
++
+ #else
+-# include "desc_64.h"
++/*
++ * GET_DESC_BASE reads the descriptor base of the specified segment.
++ *
++ * Args:
++ * idx - descriptor index
++ * gdt - GDT pointer
++ * base - 32bit register to which the base will be written
++ * lo_w - lo word of the "base" register
++ * lo_b - lo byte of the "base" register
++ * hi_b - hi byte of the low word of the "base" register
++ *
++ * Example:
++ * GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah)
++ * Will read the base address of GDT_ENTRY_ESPFIX_SS and put it into %eax.
++ */
++#define GET_DESC_BASE(idx, gdt, base, lo_w, lo_b, hi_b) \
++ movb idx*8+4(gdt), lo_b; \
++ movb idx*8+7(gdt), hi_b; \
++ shll $16, base; \
++ movw idx*8+2(gdt), lo_w;
++
++
++#endif /* __ASSEMBLY__ */
++
+ #endif
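+
+The unified desc.h above replaces the old LDT_entry_a()/LDT_entry_b()
+bit-shifting (still visible in the files deleted below) with fill_ldt(),
+which packs a struct user_desc directly into a struct desc_struct. A sketch
+with made-up segment values:
+
+	#include <asm/desc.h>
+
+	static void example_fill_entry(struct desc_struct *d)
+	{
+		struct user_desc info = {
+			.entry_number	= 0,
+			.base_addr	= 0x1000,	/* hypothetical base */
+			.limit		= 0xfffff,
+			.seg_32bit	= 1,
+			.limit_in_pages	= 1,
+			.useable	= 1,
+		};
+
+		fill_ldt(d, &info);	/* note: DPL is forced to 3 */
+	}
+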
+--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/desc_32.h 2008-12-15 11:27:22.000000000 +0100
++++ /dev/null 1970-01-01 00:00:00.000000000 +0000
+@@ -1,262 +0,0 @@
+-#ifndef __ARCH_DESC_H
+-#define __ARCH_DESC_H
+-
+-#include <asm/ldt.h>
+-#include <asm/segment.h>
+-
+-#ifndef __ASSEMBLY__
+-
+-#include <linux/preempt.h>
+-#include <linux/smp.h>
+-
+-#include <asm/mmu.h>
+-
+-struct Xgt_desc_struct {
+- unsigned short size;
+- unsigned long address __attribute__((packed));
+- unsigned short pad;
+-} __attribute__ ((packed));
+-
+-struct gdt_page
+-{
+- struct desc_struct gdt[GDT_ENTRIES];
+-} __attribute__((aligned(PAGE_SIZE)));
+-DECLARE_PER_CPU(struct gdt_page, gdt_page);
+-
+-static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
+-{
+- return per_cpu(gdt_page, cpu).gdt;
+-}
+-
+-extern struct Xgt_desc_struct idt_descr;
+-extern struct desc_struct idt_table[];
+-extern void set_intr_gate(unsigned int irq, void * addr);
+-
+-static inline void pack_descriptor(__u32 *a, __u32 *b,
+- unsigned long base, unsigned long limit, unsigned char type, unsigned char flags)
+-{
+- *a = ((base & 0xffff) << 16) | (limit & 0xffff);
+- *b = (base & 0xff000000) | ((base & 0xff0000) >> 16) |
+- (limit & 0x000f0000) | ((type & 0xff) << 8) | ((flags & 0xf) << 20);
+-}
+-
+-static inline void pack_gate(__u32 *a, __u32 *b,
+- unsigned long base, unsigned short seg, unsigned char type, unsigned char flags)
+-{
+- *a = (seg << 16) | (base & 0xffff);
+- *b = (base & 0xffff0000) | ((type & 0xff) << 8) | (flags & 0xff);
+-}
+-
+-#define DESCTYPE_LDT 0x82 /* present, system, DPL-0, LDT */
+-#define DESCTYPE_TSS 0x89 /* present, system, DPL-0, 32-bit TSS */
+-#define DESCTYPE_TASK 0x85 /* present, system, DPL-0, task gate */
+-#define DESCTYPE_INT 0x8e /* present, system, DPL-0, interrupt gate */
+-#define DESCTYPE_TRAP 0x8f /* present, system, DPL-0, trap gate */
+-#define DESCTYPE_DPL3 0x60 /* DPL-3 */
+-#define DESCTYPE_S 0x10 /* !system */
+-
+-#ifndef CONFIG_XEN
+-#define load_TR_desc() native_load_tr_desc()
+-#define load_gdt(dtr) native_load_gdt(dtr)
+-#define load_idt(dtr) native_load_idt(dtr)
+-#define load_tr(tr) __asm__ __volatile("ltr %0"::"m" (tr))
+-#define load_ldt(ldt) __asm__ __volatile("lldt %0"::"m" (ldt))
+-
+-#define store_gdt(dtr) native_store_gdt(dtr)
+-#define store_idt(dtr) native_store_idt(dtr)
+-#define store_tr(tr) (tr = native_store_tr())
+-#define store_ldt(ldt) __asm__ ("sldt %0":"=m" (ldt))
+-
+-#define load_TLS(t, cpu) native_load_tls(t, cpu)
+-#define set_ldt native_set_ldt
+-
+-#define write_ldt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
+-#define write_gdt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
+-#define write_idt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
+-
+-static inline void write_dt_entry(struct desc_struct *dt,
+- int entry, u32 entry_low, u32 entry_high)
+-{
+- dt[entry].a = entry_low;
+- dt[entry].b = entry_high;
+-}
+-
+-static inline void native_set_ldt(const void *addr, unsigned int entries)
+-{
+- if (likely(entries == 0))
+- __asm__ __volatile__("lldt %w0"::"q" (0));
+- else {
+- unsigned cpu = smp_processor_id();
+- __u32 a, b;
+-
+- pack_descriptor(&a, &b, (unsigned long)addr,
+- entries * sizeof(struct desc_struct) - 1,
+- DESCTYPE_LDT, 0);
+- write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, a, b);
+- __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
+- }
+-}
+-
+-
+-static inline void native_load_tr_desc(void)
+-{
+- asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
+-}
+-
+-static inline void native_load_gdt(const struct Xgt_desc_struct *dtr)
+-{
+- asm volatile("lgdt %0"::"m" (*dtr));
+-}
+-
+-static inline void native_load_idt(const struct Xgt_desc_struct *dtr)
+-{
+- asm volatile("lidt %0"::"m" (*dtr));
+-}
+-
+-static inline void native_store_gdt(struct Xgt_desc_struct *dtr)
+-{
+- asm ("sgdt %0":"=m" (*dtr));
+-}
+-
+-static inline void native_store_idt(struct Xgt_desc_struct *dtr)
+-{
+- asm ("sidt %0":"=m" (*dtr));
+-}
+-
+-static inline unsigned long native_store_tr(void)
+-{
+- unsigned long tr;
+- asm ("str %0":"=r" (tr));
+- return tr;
+-}
+-
+-static inline void native_load_tls(struct thread_struct *t, unsigned int cpu)
+-{
+- unsigned int i;
+- struct desc_struct *gdt = get_cpu_gdt_table(cpu);
+-
+- for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
+- gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i];
+-}
+-#else
+-#define load_TLS(t, cpu) xen_load_tls(t, cpu)
+-#define set_ldt xen_set_ldt
+-
+-extern int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b);
+-extern int write_gdt_entry(void *gdt, int entry, __u32 entry_a, __u32 entry_b);
+-
+-static inline void xen_load_tls(struct thread_struct *t, unsigned int cpu)
+-{
+- unsigned int i;
+- struct desc_struct *gdt = get_cpu_gdt_table(cpu) + GDT_ENTRY_TLS_MIN;
+-
+- for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
+- if (HYPERVISOR_update_descriptor(virt_to_machine(&gdt[i]),
+- *(u64 *)&t->tls_array[i]))
+- BUG();
+-}
+-#endif
+-
+-#ifndef CONFIG_X86_NO_IDT
+-static inline void _set_gate(int gate, unsigned int type, void *addr, unsigned short seg)
+-{
+- __u32 a, b;
+- pack_gate(&a, &b, (unsigned long)addr, seg, type, 0);
+- write_idt_entry(idt_table, gate, a, b);
+-}
+-#endif
+-
+-#ifndef CONFIG_X86_NO_TSS
+-static inline void __set_tss_desc(unsigned int cpu, unsigned int entry, const void *addr)
+-{
+- __u32 a, b;
+- pack_descriptor(&a, &b, (unsigned long)addr,
+- offsetof(struct tss_struct, __cacheline_filler) - 1,
+- DESCTYPE_TSS, 0);
+- write_gdt_entry(get_cpu_gdt_table(cpu), entry, a, b);
+-}
+-#endif
+-
+-
+-#define set_tss_desc(cpu,addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr)
+-
+-#define LDT_entry_a(info) \
+- ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff))
+-
+-#define LDT_entry_b(info) \
+- (((info)->base_addr & 0xff000000) | \
+- (((info)->base_addr & 0x00ff0000) >> 16) | \
+- ((info)->limit & 0xf0000) | \
+- (((info)->read_exec_only ^ 1) << 9) | \
+- ((info)->contents << 10) | \
+- (((info)->seg_not_present ^ 1) << 15) | \
+- ((info)->seg_32bit << 22) | \
+- ((info)->limit_in_pages << 23) | \
+- ((info)->useable << 20) | \
+- 0x7000)
+-
+-#define LDT_empty(info) (\
+- (info)->base_addr == 0 && \
+- (info)->limit == 0 && \
+- (info)->contents == 0 && \
+- (info)->read_exec_only == 1 && \
+- (info)->seg_32bit == 0 && \
+- (info)->limit_in_pages == 0 && \
+- (info)->seg_not_present == 1 && \
+- (info)->useable == 0 )
+-
+-static inline void clear_LDT(void)
+-{
+- set_ldt(NULL, 0);
+-}
+-
+-/*
+- * load one particular LDT into the current CPU
+- */
+-static inline void load_LDT_nolock(mm_context_t *pc)
+-{
+- set_ldt(pc->ldt, pc->size);
+-}
+-
+-static inline void load_LDT(mm_context_t *pc)
+-{
+- preempt_disable();
+- load_LDT_nolock(pc);
+- preempt_enable();
+-}
+-
+-static inline unsigned long get_desc_base(unsigned long *desc)
+-{
+- unsigned long base;
+- base = ((desc[0] >> 16) & 0x0000ffff) |
+- ((desc[1] << 16) & 0x00ff0000) |
+- (desc[1] & 0xff000000);
+- return base;
+-}
+-
+-#else /* __ASSEMBLY__ */
+-
+-/*
+- * GET_DESC_BASE reads the descriptor base of the specified segment.
+- *
+- * Args:
+- * idx - descriptor index
+- * gdt - GDT pointer
+- * base - 32bit register to which the base will be written
+- * lo_w - lo word of the "base" register
+- * lo_b - lo byte of the "base" register
+- * hi_b - hi byte of the low word of the "base" register
+- *
+- * Example:
+- * GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah)
+- * Will read the base address of GDT_ENTRY_ESPFIX_SS and put it into %eax.
+- */
+-#define GET_DESC_BASE(idx, gdt, base, lo_w, lo_b, hi_b) \
+- movb idx*8+4(gdt), lo_b; \
+- movb idx*8+7(gdt), hi_b; \
+- shll $16, base; \
+- movw idx*8+2(gdt), lo_w;
+-
+-#endif /* !__ASSEMBLY__ */
+-
+-#endif
+--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/desc_64.h 2009-02-16 16:18:36.000000000 +0100
++++ /dev/null 1970-01-01 00:00:00.000000000 +0000
+@@ -1,228 +0,0 @@
+-/* Written 2000 by Andi Kleen */
+-#ifndef __ARCH_DESC_H
+-#define __ARCH_DESC_H
+-
+-#include <linux/threads.h>
+-#include <asm/ldt.h>
+-
+-#ifndef __ASSEMBLY__
+-
+-#include <linux/string.h>
+-#include <linux/smp.h>
+-#include <asm/desc_defs.h>
+-
+-#include <asm/segment.h>
+-#include <asm/mmu.h>
+-
+-extern struct desc_ptr idt_descr, cpu_gdt_descr[NR_CPUS];
+-
+-extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
+-
+-#define load_TR_desc() asm volatile("ltr %w0"::"r" (GDT_ENTRY_TSS*8))
+-#define load_LDT_desc() asm volatile("lldt %w0"::"r" (GDT_ENTRY_LDT*8))
+-
+-static inline void clear_LDT(void)
+-{
+- int cpu = get_cpu();
+-
+- /*
+- * NB. We load the default_ldt for lcall7/27 handling on demand, as
+- * it slows down context switching. Noone uses it anyway.
+- */
+- cpu = cpu; /* XXX avoid compiler warning */
+- xen_set_ldt(NULL, 0);
+- put_cpu();
+-}
+-
+-#ifndef CONFIG_X86_NO_TSS
+-static inline unsigned long __store_tr(void)
+-{
+- unsigned long tr;
+-
+- asm volatile ("str %w0":"=r" (tr));
+- return tr;
+-}
+-
+-#define store_tr(tr) (tr) = __store_tr()
+-#endif
+-
+-/*
+- * This is the ldt that every process will get unless we need
+- * something other than this.
+- */
+-extern struct desc_struct default_ldt[];
+-#ifndef CONFIG_X86_NO_IDT
+-extern struct gate_struct idt_table[];
+-#endif
+-extern struct desc_ptr cpu_gdt_descr[];
+-
+-/* the cpu gdt accessor */
+-#define cpu_gdt(_cpu) ((struct desc_struct *)cpu_gdt_descr[_cpu].address)
+-
+-#ifndef CONFIG_XEN
+-static inline void load_gdt(const struct desc_ptr *ptr)
+-{
+- asm volatile("lgdt %w0"::"m" (*ptr));
+-}
+-
+-static inline void store_gdt(struct desc_ptr *ptr)
+-{
+- asm("sgdt %w0":"=m" (*ptr));
+-}
+-#endif
+-
+-static inline void _set_gate(void *adr, unsigned type, unsigned long func, unsigned dpl, unsigned ist)
+-{
+- struct gate_struct s;
+- s.offset_low = PTR_LOW(func);
+- s.segment = __KERNEL_CS;
+- s.ist = ist;
+- s.p = 1;
+- s.dpl = dpl;
+- s.zero0 = 0;
+- s.zero1 = 0;
+- s.type = type;
+- s.offset_middle = PTR_MIDDLE(func);
+- s.offset_high = PTR_HIGH(func);
+- /* does not need to be atomic because it is only done once at setup time */
+- memcpy(adr, &s, 16);
+-}
+-
+-#ifndef CONFIG_X86_NO_IDT
+-static inline void set_intr_gate(int nr, void *func)
+-{
+- BUG_ON((unsigned)nr > 0xFF);
+- _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 0, 0);
+-}
+-
+-static inline void set_intr_gate_ist(int nr, void *func, unsigned ist)
+-{
+- BUG_ON((unsigned)nr > 0xFF);
+- _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 0, ist);
+-}
+-
+-static inline void set_system_gate(int nr, void *func)
+-{
+- BUG_ON((unsigned)nr > 0xFF);
+- _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 3, 0);
+-}
+-
+-static inline void set_system_gate_ist(int nr, void *func, unsigned ist)
+-{
+- _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 3, ist);
+-}
+-
+-static inline void load_idt(const struct desc_ptr *ptr)
+-{
+- asm volatile("lidt %w0"::"m" (*ptr));
+-}
+-
+-static inline void store_idt(struct desc_ptr *dtr)
+-{
+- asm("sidt %w0":"=m" (*dtr));
+-}
+-#endif
+-
+-static inline void set_tssldt_descriptor(void *ptr, unsigned long tss, unsigned type,
+- unsigned size)
+-{
+- struct ldttss_desc d;
+- memset(&d,0,sizeof(d));
+- d.limit0 = size & 0xFFFF;
+- d.base0 = PTR_LOW(tss);
+- d.base1 = PTR_MIDDLE(tss) & 0xFF;
+- d.type = type;
+- d.p = 1;
+- d.limit1 = (size >> 16) & 0xF;
+- d.base2 = (PTR_MIDDLE(tss) >> 8) & 0xFF;
+- d.base3 = PTR_HIGH(tss);
+- memcpy(ptr, &d, 16);
+-}
+-
+-#ifndef CONFIG_X86_NO_TSS
+-static inline void set_tss_desc(unsigned cpu, void *addr)
+-{
+- /*
+- * sizeof(unsigned long) coming from an extra "long" at the end
+- * of the iobitmap. See tss_struct definition in processor.h
+- *
+- * -1? seg base+limit should be pointing to the address of the
+- * last valid byte
+- */
+- set_tssldt_descriptor(&cpu_gdt(cpu)[GDT_ENTRY_TSS],
+- (unsigned long)addr, DESC_TSS,
+- IO_BITMAP_OFFSET + IO_BITMAP_BYTES + sizeof(unsigned long) - 1);
+-}
+-#endif
+-
+-static inline void set_ldt_desc(unsigned cpu, void *addr, int size)
+-{
+- set_tssldt_descriptor(&cpu_gdt(cpu)[GDT_ENTRY_LDT], (unsigned long)addr,
+- DESC_LDT, size * 8 - 1);
+-}
+-
+-#define LDT_entry_a(info) \
+- ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff))
+-/* Don't allow setting of the lm bit. It is useless anyways because
+- 64bit system calls require __USER_CS. */
+-#define LDT_entry_b(info) \
+- (((info)->base_addr & 0xff000000) | \
+- (((info)->base_addr & 0x00ff0000) >> 16) | \
+- ((info)->limit & 0xf0000) | \
+- (((info)->read_exec_only ^ 1) << 9) | \
+- ((info)->contents << 10) | \
+- (((info)->seg_not_present ^ 1) << 15) | \
+- ((info)->seg_32bit << 22) | \
+- ((info)->limit_in_pages << 23) | \
+- ((info)->useable << 20) | \
+- /* ((info)->lm << 21) | */ \
+- 0x7000)
+-
+-#define LDT_empty(info) (\
+- (info)->base_addr == 0 && \
+- (info)->limit == 0 && \
+- (info)->contents == 0 && \
+- (info)->read_exec_only == 1 && \
+- (info)->seg_32bit == 0 && \
+- (info)->limit_in_pages == 0 && \
+- (info)->seg_not_present == 1 && \
+- (info)->useable == 0 && \
+- (info)->lm == 0)
+-
+-static inline void load_TLS(struct thread_struct *t, unsigned int cpu)
+-{
+- unsigned int i;
+- u64 *gdt = (u64 *)(cpu_gdt(cpu) + GDT_ENTRY_TLS_MIN);
+-
+- for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
+- if (HYPERVISOR_update_descriptor(virt_to_machine(&gdt[i]),
+- t->tls_array[i]))
+- BUG();
+-}
+-
+-/*
+- * load one particular LDT into the current CPU
+- */
+-static inline void load_LDT_nolock (mm_context_t *pc, int cpu)
+-{
+- void *segments = pc->ldt;
+- int count = pc->size;
+-
+- if (likely(!count))
+- segments = NULL;
+-
+- xen_set_ldt(segments, count);
+-}
+-
+-static inline void load_LDT(mm_context_t *pc)
+-{
+- int cpu = get_cpu();
+- load_LDT_nolock(pc, cpu);
+- put_cpu();
+-}
+-
+-extern struct desc_ptr idt_descr;
+-
+-#endif /* !__ASSEMBLY__ */
+-
+-#endif
+--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/dma-mapping_32.h 2009-02-16 16:18:36.000000000 +0100
++++ sle11-2009-06-29/include/asm-x86/mach-xen/asm/dma-mapping_32.h 2009-03-16 16:33:40.000000000 +0100
+@@ -84,23 +84,13 @@ dma_sync_single_range_for_device(struct
+ dma_sync_single_for_device(dev, dma_handle+offset, size, direction);
+ }
+
+-static inline void
++extern void
+ dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems,
+- enum dma_data_direction direction)
+-{
+- if (swiotlb)
+- swiotlb_sync_sg_for_cpu(dev,sg,nelems,direction);
+- flush_write_buffers();
+-}
++ enum dma_data_direction direction);
+
+-static inline void
++extern void
+ dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems,
+- enum dma_data_direction direction)
+-{
+- if (swiotlb)
+- swiotlb_sync_sg_for_device(dev,sg,nelems,direction);
+- flush_write_buffers();
+-}
++ enum dma_data_direction direction);
+
+ extern int
+ dma_mapping_error(dma_addr_t dma_addr);
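+
+The two scatterlist sync helpers are un-inlined here; going by the deleted
+inline bodies, the new out-of-line definitions (presumably in one of the
+Xen dma-mapping .c files) keep the same shape, roughly:
+
+	void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
+				 int nelems, enum dma_data_direction direction)
+	{
+		if (swiotlb)
+			swiotlb_sync_sg_for_cpu(dev, sg, nelems, direction);
+		flush_write_buffers();
+	}
+
+and symmetrically for dma_sync_sg_for_device() with
+swiotlb_sync_sg_for_device().
+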
+--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/fixmap_32.h 2009-02-16 16:17:21.000000000 +0100
++++ sle11-2009-06-29/include/asm-x86/mach-xen/asm/fixmap_32.h 2009-03-16 16:33:40.000000000 +0100
+@@ -64,7 +64,7 @@ enum fixed_addresses {
+ #endif
+ #ifdef CONFIG_X86_VISWS_APIC
+ FIX_CO_CPU, /* Cobalt timer */
+- FIX_CO_APIC, /* Cobalt APIC Redirection Table */
++ FIX_CO_APIC, /* Cobalt APIC Redirection Table */
+ FIX_LI_PCIA, /* Lithium PCI Bridge A */
+ FIX_LI_PCIB, /* Lithium PCI Bridge B */
+ #endif
+@@ -73,7 +73,7 @@ enum fixed_addresses {
+ #endif
+ #ifdef CONFIG_X86_CYCLONE_TIMER
+ FIX_CYCLONE_TIMER, /*cyclone timer register*/
+-#endif
++#endif
+ #ifdef CONFIG_HIGHMEM
+ FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
+ FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
+@@ -93,11 +93,23 @@ enum fixed_addresses {
+ FIX_ISAMAP_END,
+ FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
+ __end_of_permanent_fixed_addresses,
+- /* temporary boot-time mappings, used before ioremap() is functional */
+-#define NR_FIX_BTMAPS 16
+- FIX_BTMAP_END = __end_of_permanent_fixed_addresses,
+- FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS - 1,
++ /*
++ * 256 temporary boot-time mappings, used by early_ioremap(),
++ * before ioremap() is functional.
++ *
++ * We round it up to the next 512 pages boundary so that we
++ * can have a single pgd entry and a single pte table:
++ */
++#define NR_FIX_BTMAPS 64
++#define FIX_BTMAPS_NESTING 4
++ FIX_BTMAP_END =
++ __end_of_permanent_fixed_addresses + 512 -
++ (__end_of_permanent_fixed_addresses & 511),
++ FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_NESTING - 1,
+ FIX_WP_TEST,
++#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
++ FIX_OHCI1394_BASE,
++#endif
+ __end_of_fixed_addresses
+ };
+
+--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/fixmap_64.h 2009-02-16 16:17:21.000000000 +0100
++++ sle11-2009-06-29/include/asm-x86/mach-xen/asm/fixmap_64.h 2009-03-16 16:33:40.000000000 +0100
+@@ -15,6 +15,7 @@
+ #include <asm/apicdef.h>
+ #include <asm/page.h>
+ #include <asm/vsyscall.h>
++#include <asm/efi.h>
+ #include <asm/acpi.h>
+
+ /*
+@@ -46,6 +47,10 @@ enum fixed_addresses {
+ FIX_IO_APIC_BASE_0,
+ FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1,
+ #endif
++#ifdef CONFIG_EFI
++ FIX_EFI_IO_MAP_LAST_PAGE,
++ FIX_EFI_IO_MAP_FIRST_PAGE = FIX_EFI_IO_MAP_LAST_PAGE+MAX_EFI_IO_PAGES-1,
++#endif
+ #ifdef CONFIG_ACPI
+ FIX_ACPI_BEGIN,
+ FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
+@@ -55,10 +60,22 @@ enum fixed_addresses {
+ FIX_ISAMAP_END,
+ FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
+ __end_of_permanent_fixed_addresses,
+- /* temporary boot-time mappings, used before ioremap() is functional */
+-#define NR_FIX_BTMAPS 16
+- FIX_BTMAP_END = __end_of_permanent_fixed_addresses,
+- FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS - 1,
++ /*
++ * 256 temporary boot-time mappings, used by early_ioremap(),
++ * before ioremap() is functional.
++ *
++ * We round it up to the next 512 pages boundary so that we
++ * can have a single pgd entry and a single pte table:
++ */
++#define NR_FIX_BTMAPS 64
++#define FIX_BTMAPS_NESTING 4
++ FIX_BTMAP_END =
++ __end_of_permanent_fixed_addresses + 512 -
++ (__end_of_permanent_fixed_addresses & 511),
++ FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_NESTING - 1,
++#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
++ FIX_OHCI1394_BASE,
++#endif
+ __end_of_fixed_addresses
+ };
+
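+
+The arithmetic behind the enlarged boot-time mapping window in both fixmap
+hunks: NR_FIX_BTMAPS (64) slots times FIX_BTMAPS_NESTING (4) nesting levels
+yields the 256 early_ioremap() pages the comment mentions, and the
+"+ 512 - (x & 511)" expression rounds FIX_BTMAP_END up to the next 512-page
+boundary so that the whole window fits a single pgd entry and a single pte
+table.
+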
+--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/highmem.h 2009-02-16 16:17:21.000000000 +0100
++++ sle11-2009-06-29/include/asm-x86/mach-xen/asm/highmem.h 2009-03-16 16:33:40.000000000 +0100
+@@ -37,11 +37,6 @@ extern pte_t *pkmap_page_table;
+ * easily, subsequent pte tables have to be allocated in one physical
+ * chunk of RAM.
+ */
+-#ifdef CONFIG_X86_PAE
+-#define LAST_PKMAP 512
+-#else
+-#define LAST_PKMAP 1024
+-#endif
+ /*
+ * Ordering is:
+ *
+@@ -57,13 +52,12 @@ extern pte_t *pkmap_page_table;
+ * VMALLOC_START
+ * high_memory
+ */
+-#define PKMAP_BASE ( (FIXADDR_BOOT_START - PAGE_SIZE*(LAST_PKMAP + 1)) & PMD_MASK )
+ #define LAST_PKMAP_MASK (LAST_PKMAP-1)
+ #define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT)
+ #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT))
+
+-extern void * FASTCALL(kmap_high(struct page *page));
+-extern void FASTCALL(kunmap_high(struct page *page));
++extern void *kmap_high(struct page *page);
++extern void kunmap_high(struct page *page);
+
+ void *kmap(struct page *page);
+ void kunmap(struct page *page);
+--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/hypervisor.h 2009-02-16 16:18:36.000000000 +0100
++++ sle11-2009-06-29/include/asm-x86/mach-xen/asm/hypervisor.h 2009-03-16 16:33:40.000000000 +0100
+@@ -264,6 +264,25 @@ HYPERVISOR_poll(
+ return rc;
+ }
+
++static inline int __must_check
++HYPERVISOR_poll_no_timeout(
++ evtchn_port_t *ports, unsigned int nr_ports)
++{
++ int rc;
++ struct sched_poll sched_poll = {
++ .nr_ports = nr_ports
++ };
++ set_xen_guest_handle(sched_poll.ports, ports);
++
++ rc = HYPERVISOR_sched_op(SCHEDOP_poll, &sched_poll);
++#if CONFIG_XEN_COMPAT <= 0x030002
++ if (rc == -ENOSYS)
++ rc = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0);
++#endif
++
++ return rc;
++}
++
+ #ifdef CONFIG_XEN
+
+ static inline void
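+
+HYPERVISOR_poll_no_timeout(), added above, is HYPERVISOR_poll() minus the
+timeout field: sched_poll.timeout stays zero, so the VCPU blocks until one
+of the ports has an event pending. Calling sketch (function name
+hypothetical):
+
+	static void example_wait_for_port(evtchn_port_t port)
+	{
+		/* __must_check: polling a valid local port should not fail */
+		if (HYPERVISOR_poll_no_timeout(&port, 1))
+			BUG();
+	}
+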
+--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/irqflags.h 2009-02-16 16:18:36.000000000 +0100
++++ sle11-2009-06-29/include/asm-x86/mach-xen/asm/irqflags.h 2009-03-16 16:33:40.000000000 +0100
+@@ -1,5 +1,247 @@
+-#ifdef CONFIG_X86_32
+-# include "irqflags_32.h"
++#ifndef _X86_IRQFLAGS_H_
++#define _X86_IRQFLAGS_H_
++
++#include <asm/processor-flags.h>
++
++#ifndef __ASSEMBLY__
++/*
++ * The use of 'barrier' in the following reflects their use as local-lock
++ * operations. Reentrancy must be prevented (e.g., __cli()) /before/ following
++ * critical operations are executed. All critical operations must complete
++ * /before/ reentrancy is permitted (e.g., __sti()). Alpha architecture also
++ * includes these barriers, for example.
++ */
++
++#define xen_save_fl(void) (current_vcpu_info()->evtchn_upcall_mask)
++
++#define xen_restore_fl(f) \
++do { \
++ vcpu_info_t *_vcpu; \
++ barrier(); \
++ _vcpu = current_vcpu_info(); \
++ if ((_vcpu->evtchn_upcall_mask = (f)) == 0) { \
++ barrier(); /* unmask then check (avoid races) */\
++ if (unlikely(_vcpu->evtchn_upcall_pending)) \
++ force_evtchn_callback(); \
++ } \
++} while (0)
++
++#define xen_irq_disable() \
++do { \
++ current_vcpu_info()->evtchn_upcall_mask = 1; \
++ barrier(); \
++} while (0)
++
++#define xen_irq_enable() \
++do { \
++ vcpu_info_t *_vcpu; \
++ barrier(); \
++ _vcpu = current_vcpu_info(); \
++ _vcpu->evtchn_upcall_mask = 0; \
++ barrier(); /* unmask then check (avoid races) */ \
++ if (unlikely(_vcpu->evtchn_upcall_pending)) \
++ force_evtchn_callback(); \
++} while (0)
++
++void xen_safe_halt(void);
++
++void xen_halt(void);
++
++#define __raw_local_save_flags() xen_save_fl()
++
++#define raw_local_irq_restore(flags) xen_restore_fl(flags)
++
++#define raw_local_irq_disable() xen_irq_disable()
++
++#define raw_local_irq_enable() xen_irq_enable()
++
++/*
++ * Used in the idle loop; sti takes one instruction cycle
++ * to complete:
++ */
++static inline void raw_safe_halt(void)
++{
++ xen_safe_halt();
++}
++
++/*
++ * Used when interrupts are already enabled or to
++ * shutdown the processor:
++ */
++static inline void halt(void)
++{
++ xen_halt();
++}
++
++/*
++ * For spinlocks, etc:
++ */
++#define __raw_local_irq_save() \
++({ \
++ unsigned long flags = __raw_local_save_flags(); \
++ \
++ raw_local_irq_disable(); \
++ \
++ flags; \
++})
+ #else
+-# include "irqflags_64.h"
++
++/* Offsets into shared_info_t. */
++#define evtchn_upcall_pending /* 0 */
++#define evtchn_upcall_mask 1
++
++#define sizeof_vcpu_shift 6
++
++#ifdef CONFIG_X86_64
++# define __REG_si %rsi
++# define __CPU_num %gs:pda_cpunumber
++#else
++# define __REG_si %esi
++# define __CPU_num TI_cpu(%ebp)
++#endif
++
++#ifdef CONFIG_SMP
++#define GET_VCPU_INFO movl __CPU_num,%esi ; \
++ shl $sizeof_vcpu_shift,%esi ; \
++ add HYPERVISOR_shared_info,__REG_si
++#else
++#define GET_VCPU_INFO mov HYPERVISOR_shared_info,__REG_si
++#endif
++
++#define __DISABLE_INTERRUPTS movb $1,evtchn_upcall_mask(__REG_si)
++#define __ENABLE_INTERRUPTS movb $0,evtchn_upcall_mask(__REG_si)
++#define __TEST_PENDING testb $0xFF,evtchn_upcall_pending(__REG_si)
++#define DISABLE_INTERRUPTS(clb) GET_VCPU_INFO ; \
++ __DISABLE_INTERRUPTS
++#define ENABLE_INTERRUPTS(clb) GET_VCPU_INFO ; \
++ __ENABLE_INTERRUPTS
++
++#ifndef CONFIG_X86_64
++#define INTERRUPT_RETURN iret
++#define ENABLE_INTERRUPTS_SYSCALL_RET __ENABLE_INTERRUPTS ; \
++sysexit_scrit: /**** START OF SYSEXIT CRITICAL REGION ****/ ; \
++ __TEST_PENDING ; \
++ jnz 14f /* process more events if necessary... */ ; \
++ movl PT_ESI(%esp), %esi ; \
++ sysexit ; \
++14: __DISABLE_INTERRUPTS ; \
++ TRACE_IRQS_OFF ; \
++sysexit_ecrit: /**** END OF SYSEXIT CRITICAL REGION ****/ ; \
++ push %esp ; \
++ call evtchn_do_upcall ; \
++ add $4,%esp ; \
++ jmp ret_from_intr
++#endif
++
++
++#endif /* __ASSEMBLY__ */
++
++#ifndef __ASSEMBLY__
++#define raw_local_save_flags(flags) \
++ do { (flags) = __raw_local_save_flags(); } while (0)
++
++#define raw_local_irq_save(flags) \
++ do { (flags) = __raw_local_irq_save(); } while (0)
++
++static inline int raw_irqs_disabled_flags(unsigned long flags)
++{
++ return (flags != 0);
++}
++
++#define raw_irqs_disabled() \
++({ \
++ unsigned long flags = __raw_local_save_flags(); \
++ \
++ raw_irqs_disabled_flags(flags); \
++})
++
++/*
++ * makes the traced hardirq state match the machine state
++ *
++ * should be a rarely used function, only in places where it's
++ * otherwise impossible to know the irq state, like in traps.
++ */
++static inline void trace_hardirqs_fixup_flags(unsigned long flags)
++{
++ if (raw_irqs_disabled_flags(flags))
++ trace_hardirqs_off();
++ else
++ trace_hardirqs_on();
++}
++
++#define trace_hardirqs_fixup() \
++ trace_hardirqs_fixup_flags(__raw_local_save_flags())
++
++#else
++
++#ifdef CONFIG_X86_64
++/*
++ * Currently paravirt can't handle swapgs nicely when we
++ * don't have a stack we can rely on (such as a user space
++ * stack). So we either find a way around these or just fault
++ * and emulate if a guest tries to call swapgs directly.
++ *
++ * Either way, this is a good way to document that we don't
++ * have a reliable stack. x86_64 only.
++ */
++#define SWAPGS_UNSAFE_STACK swapgs
++#define ARCH_TRACE_IRQS_ON call trace_hardirqs_on_thunk
++#define ARCH_TRACE_IRQS_OFF call trace_hardirqs_off_thunk
++#define ARCH_LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk
++#define ARCH_LOCKDEP_SYS_EXIT_IRQ \
++ TRACE_IRQS_ON; \
++ ENABLE_INTERRUPTS(CLBR_NONE); \
++ SAVE_REST; \
++ LOCKDEP_SYS_EXIT; \
++ RESTORE_REST; \
++ __DISABLE_INTERRUPTS; \
++ TRACE_IRQS_OFF;
++
++#else
++#define ARCH_TRACE_IRQS_ON \
++ pushl %eax; \
++ pushl %ecx; \
++ pushl %edx; \
++ call trace_hardirqs_on; \
++ popl %edx; \
++ popl %ecx; \
++ popl %eax;
++
++#define ARCH_TRACE_IRQS_OFF \
++ pushl %eax; \
++ pushl %ecx; \
++ pushl %edx; \
++ call trace_hardirqs_off; \
++ popl %edx; \
++ popl %ecx; \
++ popl %eax;
++
++#define ARCH_LOCKDEP_SYS_EXIT \
++ pushl %eax; \
++ pushl %ecx; \
++ pushl %edx; \
++ call lockdep_sys_exit; \
++ popl %edx; \
++ popl %ecx; \
++ popl %eax;
++
++#define ARCH_LOCKDEP_SYS_EXIT_IRQ
++#endif
++
++#ifdef CONFIG_TRACE_IRQFLAGS
++# define TRACE_IRQS_ON ARCH_TRACE_IRQS_ON
++# define TRACE_IRQS_OFF ARCH_TRACE_IRQS_OFF
++#else
++# define TRACE_IRQS_ON
++# define TRACE_IRQS_OFF
++#endif
++#ifdef CONFIG_DEBUG_LOCK_ALLOC
++# define LOCKDEP_SYS_EXIT ARCH_LOCKDEP_SYS_EXIT
++# define LOCKDEP_SYS_EXIT_IRQ ARCH_LOCKDEP_SYS_EXIT_IRQ
++# else
++# define LOCKDEP_SYS_EXIT
++# define LOCKDEP_SYS_EXIT_IRQ
++# endif
++
++#endif /* __ASSEMBLY__ */
+ #endif
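+
+Note on the consolidated irqflags.h: under Xen the saved "flags" word is
+the VCPU's evtchn_upcall_mask (non-zero = event delivery masked), not
+EFLAGS.IF, but the usual save/restore pattern is unchanged:
+
+	static void example_critical_section(void)
+	{
+		unsigned long flags;
+
+		raw_local_irq_save(flags);	/* sets evtchn_upcall_mask */
+		/* no event upcalls can be delivered here */
+		raw_local_irq_restore(flags);	/* may force_evtchn_callback() */
+	}
+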
+--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/irqflags_32.h 2009-02-16 16:18:36.000000000 +0100
++++ /dev/null 1970-01-01 00:00:00.000000000 +0000
+@@ -1,212 +0,0 @@
+-/*
+- * include/asm-i386/irqflags.h
+- *
+- * IRQ flags handling
+- *
+- * This file gets included from lowlevel asm headers too, to provide
+- * wrapped versions of the local_irq_*() APIs, based on the
+- * raw_local_irq_*() functions from the lowlevel headers.
+- */
+-#ifndef _ASM_IRQFLAGS_H
+-#define _ASM_IRQFLAGS_H
+-
+-#ifndef __ASSEMBLY__
+-#define xen_save_fl(void) (current_vcpu_info()->evtchn_upcall_mask)
+-
+-#define xen_restore_fl(f) \
+-do { \
+- vcpu_info_t *_vcpu; \
+- barrier(); \
+- _vcpu = current_vcpu_info(); \
+- if ((_vcpu->evtchn_upcall_mask = (f)) == 0) { \
+- barrier(); /* unmask then check (avoid races) */\
+- if (unlikely(_vcpu->evtchn_upcall_pending)) \
+- force_evtchn_callback(); \
+- } \
+-} while (0)
+-
+-#define xen_irq_disable() \
+-do { \
+- current_vcpu_info()->evtchn_upcall_mask = 1; \
+- barrier(); \
+-} while (0)
+-
+-#define xen_irq_enable() \
+-do { \
+- vcpu_info_t *_vcpu; \
+- barrier(); \
+- _vcpu = current_vcpu_info(); \
+- _vcpu->evtchn_upcall_mask = 0; \
+- barrier(); /* unmask then check (avoid races) */ \
+- if (unlikely(_vcpu->evtchn_upcall_pending)) \
+- force_evtchn_callback(); \
+-} while (0)
+-
+-void xen_safe_halt(void);
+-
+-void xen_halt(void);
+-
+-/*
+- * The use of 'barrier' in the following reflects their use as local-lock
+- * operations. Reentrancy must be prevented (e.g., __cli()) /before/ following
+- * critical operations are executed. All critical operations must complete
+- * /before/ reentrancy is permitted (e.g., __sti()). Alpha architecture also
+- * includes these barriers, for example.
+- */
+-
+-#define __raw_local_save_flags() xen_save_fl()
+-
+-#define raw_local_irq_restore(flags) xen_restore_fl(flags)
+-
+-#define raw_local_irq_disable() xen_irq_disable()
+-
+-#define raw_local_irq_enable() xen_irq_enable()
+-
+-/*
+- * Used in the idle loop; sti takes one instruction cycle
+- * to complete:
+- */
+-static inline void raw_safe_halt(void)
+-{
+- xen_safe_halt();
+-}
+-
+-/*
+- * Used when interrupts are already enabled or to
+- * shutdown the processor:
+- */
+-static inline void halt(void)
+-{
+- xen_halt();
+-}
+-
+-/*
+- * For spinlocks, etc:
+- */
+-#define __raw_local_irq_save() \
+-({ \
+- unsigned long flags = __raw_local_save_flags(); \
+- \
+- raw_local_irq_disable(); \
+- \
+- flags; \
+-})
+-
+-#else
+-/* Offsets into shared_info_t. */
+-#define evtchn_upcall_pending /* 0 */
+-#define evtchn_upcall_mask 1
+-
+-#define sizeof_vcpu_shift 6
+-
+-#ifdef CONFIG_SMP
+-#define GET_VCPU_INFO movl TI_cpu(%ebp),%esi ; \
+- shl $sizeof_vcpu_shift,%esi ; \
+- addl HYPERVISOR_shared_info,%esi
+-#else
+-#define GET_VCPU_INFO movl HYPERVISOR_shared_info,%esi
+-#endif
+-
+-#define __DISABLE_INTERRUPTS movb $1,evtchn_upcall_mask(%esi)
+-#define __ENABLE_INTERRUPTS movb $0,evtchn_upcall_mask(%esi)
+-#define __TEST_PENDING testb $0xFF,evtchn_upcall_pending(%esi)
+-#define DISABLE_INTERRUPTS(clb) GET_VCPU_INFO ; \
+- __DISABLE_INTERRUPTS
+-#define ENABLE_INTERRUPTS(clb) GET_VCPU_INFO ; \
+- __ENABLE_INTERRUPTS
+-#define ENABLE_INTERRUPTS_SYSEXIT __ENABLE_INTERRUPTS ; \
+-sysexit_scrit: /**** START OF SYSEXIT CRITICAL REGION ****/ ; \
+- __TEST_PENDING ; \
+- jnz 14f /* process more events if necessary... */ ; \
+- movl PT_ESI(%esp), %esi ; \
+- sysexit ; \
+-14: __DISABLE_INTERRUPTS ; \
+- TRACE_IRQS_OFF ; \
+-sysexit_ecrit: /**** END OF SYSEXIT CRITICAL REGION ****/ ; \
+- push %esp ; \
+- call evtchn_do_upcall ; \
+- add $4,%esp ; \
+- jmp ret_from_intr
+-#define INTERRUPT_RETURN iret
+-#endif /* __ASSEMBLY__ */
+-
+-#ifndef __ASSEMBLY__
+-#define raw_local_save_flags(flags) \
+- do { (flags) = __raw_local_save_flags(); } while (0)
+-
+-#define raw_local_irq_save(flags) \
+- do { (flags) = __raw_local_irq_save(); } while (0)
+-
+-static inline int raw_irqs_disabled_flags(unsigned long flags)
+-{
+- return (flags != 0);
+-}
+-
+-#define raw_irqs_disabled() \
+-({ \
+- unsigned long flags = __raw_local_save_flags(); \
+- \
+- raw_irqs_disabled_flags(flags); \
+-})
+-
+-/*
+- * makes the traced hardirq state match with the machine state
+- *
+- * should be a rarely used function, only in places where its
+- * otherwise impossible to know the irq state, like in traps.
+- */
+-static inline void trace_hardirqs_fixup_flags(unsigned long flags)
+-{
+- if (raw_irqs_disabled_flags(flags))
+- trace_hardirqs_off();
+- else
+- trace_hardirqs_on();
+-}
+-
+-#define trace_hardirqs_fixup() \
+- trace_hardirqs_fixup_flags(__raw_local_save_flags())
+-#endif /* __ASSEMBLY__ */
+-
+-/*
+- * Do the CPU's IRQ-state tracing from assembly code. We call a
+- * C function, so save all the C-clobbered registers:
+- */
+-#ifdef CONFIG_TRACE_IRQFLAGS
+-
+-# define TRACE_IRQS_ON \
+- pushl %eax; \
+- pushl %ecx; \
+- pushl %edx; \
+- call trace_hardirqs_on; \
+- popl %edx; \
+- popl %ecx; \
+- popl %eax;
+-
+-# define TRACE_IRQS_OFF \
+- pushl %eax; \
+- pushl %ecx; \
+- pushl %edx; \
+- call trace_hardirqs_off; \
+- popl %edx; \
+- popl %ecx; \
+- popl %eax;
+-
+-#else
+-# define TRACE_IRQS_ON
+-# define TRACE_IRQS_OFF
+-#endif
+-
+-#ifdef CONFIG_DEBUG_LOCK_ALLOC
+-# define LOCKDEP_SYS_EXIT \
+- pushl %eax; \
+- pushl %ecx; \
+- pushl %edx; \
+- call lockdep_sys_exit; \
+- popl %edx; \
+- popl %ecx; \
+- popl %eax;
+-#else
+-# define LOCKDEP_SYS_EXIT
+-#endif
+-
+-#endif
+--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/irqflags_64.h 2009-02-16 16:18:36.000000000 +0100
++++ /dev/null 1970-01-01 00:00:00.000000000 +0000
+@@ -1,178 +0,0 @@
+-/*
+- * include/asm-x86_64/irqflags.h
+- *
+- * IRQ flags handling
+- *
+- * This file gets included from lowlevel asm headers too, to provide
+- * wrapped versions of the local_irq_*() APIs, based on the
+- * raw_local_irq_*() functions from the lowlevel headers.
+- */
+-#ifndef _ASM_IRQFLAGS_H
+-#define _ASM_IRQFLAGS_H
+-#include <asm/processor-flags.h>
+-
+-#ifndef __ASSEMBLY__
+-/*
+- * Interrupt control:
+- */
+-
+-/*
+- * The use of 'barrier' in the following reflects their use as local-lock
+- * operations. Reentrancy must be prevented (e.g., __cli()) /before/ following
+- * critical operations are executed. All critical operations must complete
+- * /before/ reentrancy is permitted (e.g., __sti()). Alpha architecture also
+- * includes these barriers, for example.
+- */
+-
+-#define __raw_local_save_flags() (current_vcpu_info()->evtchn_upcall_mask)
+-
+-#define raw_local_save_flags(flags) \
+- do { (flags) = __raw_local_save_flags(); } while (0)
+-
+-#define raw_local_irq_restore(x) \
+-do { \
+- vcpu_info_t *_vcpu; \
+- barrier(); \
+- _vcpu = current_vcpu_info(); \
+- if ((_vcpu->evtchn_upcall_mask = (x)) == 0) { \
+- barrier(); /* unmask then check (avoid races) */ \
+- if ( unlikely(_vcpu->evtchn_upcall_pending) ) \
+- force_evtchn_callback(); \
+- } \
+-} while (0)
+-
+-#ifdef CONFIG_X86_VSMP
+-
+-/*
+- * Interrupt control for the VSMP architecture:
+- */
+-
+-static inline void raw_local_irq_disable(void)
+-{
+- unsigned long flags = __raw_local_save_flags();
+-
+- raw_local_irq_restore((flags & ~X86_EFLAGS_IF) | X86_EFLAGS_AC);
+-}
+-
+-static inline void raw_local_irq_enable(void)
+-{
+- unsigned long flags = __raw_local_save_flags();
+-
+- raw_local_irq_restore((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC));
+-}
+-
+-static inline int raw_irqs_disabled_flags(unsigned long flags)
+-{
+- return !(flags & X86_EFLAGS_IF) || (flags & X86_EFLAGS_AC);
+-}
+-
+-#else /* CONFIG_X86_VSMP */
+-
+-#define raw_local_irq_disable() \
+-do { \
+- current_vcpu_info()->evtchn_upcall_mask = 1; \
+- barrier(); \
+-} while (0)
+-
+-#define raw_local_irq_enable() \
+-do { \
+- vcpu_info_t *_vcpu; \
+- barrier(); \
+- _vcpu = current_vcpu_info(); \
+- _vcpu->evtchn_upcall_mask = 0; \
+- barrier(); /* unmask then check (avoid races) */ \
+- if ( unlikely(_vcpu->evtchn_upcall_pending) ) \
+- force_evtchn_callback(); \
+-} while (0)
+-
+-static inline int raw_irqs_disabled_flags(unsigned long flags)
+-{
+- return (flags != 0);
+-}
+-
+-#endif
+-
+-/*
+- * For spinlocks, etc.:
+- */
+-
+-#define __raw_local_irq_save() \
+-({ \
+- unsigned long flags = __raw_local_save_flags(); \
+- \
+- raw_local_irq_disable(); \
+- \
+- flags; \
+-})
+-
+-#define raw_local_irq_save(flags) \
+- do { (flags) = __raw_local_irq_save(); } while (0)
+-
+-#define raw_irqs_disabled() \
+-({ \
+- unsigned long flags = __raw_local_save_flags(); \
+- \
+- raw_irqs_disabled_flags(flags); \
+-})
+-
+-/*
+- * makes the traced hardirq state match with the machine state
+- *
+- * should be a rarely used function, only in places where its
+- * otherwise impossible to know the irq state, like in traps.
+- */
+-static inline void trace_hardirqs_fixup_flags(unsigned long flags)
+-{
+- if (raw_irqs_disabled_flags(flags))
+- trace_hardirqs_off();
+- else
+- trace_hardirqs_on();
+-}
+-
+-#define trace_hardirqs_fixup() \
+- trace_hardirqs_fixup_flags(__raw_local_save_flags())
+-/*
+- * Used in the idle loop; sti takes one instruction cycle
+- * to complete:
+- */
+-void xen_safe_halt(void);
+-static inline void raw_safe_halt(void)
+-{
+- xen_safe_halt();
+-}
+-
+-/*
+- * Used when interrupts are already enabled or to
+- * shutdown the processor:
+- */
+-void xen_halt(void);
+-static inline void halt(void)
+-{
+- xen_halt();
+-}
+-
+-#else /* __ASSEMBLY__: */
+-# ifdef CONFIG_TRACE_IRQFLAGS
+-# define TRACE_IRQS_ON call trace_hardirqs_on_thunk
+-# define TRACE_IRQS_OFF call trace_hardirqs_off_thunk
+-# else
+-# define TRACE_IRQS_ON
+-# define TRACE_IRQS_OFF
+-# endif
+-# ifdef CONFIG_DEBUG_LOCK_ALLOC
+-# define LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk
+-# define LOCKDEP_SYS_EXIT_IRQ \
+- TRACE_IRQS_ON; \
+- sti; \
+- SAVE_REST; \
+- LOCKDEP_SYS_EXIT; \
+- RESTORE_REST; \
+- cli; \
+- TRACE_IRQS_OFF;
+-# else
+-# define LOCKDEP_SYS_EXIT
+-# define LOCKDEP_SYS_EXIT_IRQ
+-# endif
+-#endif
+-
+-#endif
+--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/maddr_32.h 2009-02-16 16:17:21.000000000 +0100
++++ sle11-2009-06-29/include/asm-x86/mach-xen/asm/maddr_32.h 2009-03-16 16:33:40.000000000 +0100
+@@ -1,6 +1,7 @@
+ #ifndef _I386_MADDR_H
+ #define _I386_MADDR_H
+
++#include <asm/bug.h>
+ #include <xen/features.h>
+ #include <xen/interface/xen.h>
+
+@@ -151,25 +152,9 @@ static inline paddr_t pte_machine_to_phy
+ phys = (phys << PAGE_SHIFT) | (machine & ~PHYSICAL_PAGE_MASK);
+ return phys;
+ }
+-#endif
+-
+-#ifdef CONFIG_X86_PAE
+-#define __pte_ma(x) ((pte_t) { (x), (maddr_t)(x) >> 32 } )
+-extern unsigned long long __supported_pte_mask;
+-static inline pte_t pfn_pte_ma(unsigned long page_nr, pgprot_t pgprot)
+-{
+- pte_t pte;
+-
+- pte.pte_high = (page_nr >> (32 - PAGE_SHIFT)) | \
+- (pgprot_val(pgprot) >> 32);
+- pte.pte_high &= (__supported_pte_mask >> 32);
+- pte.pte_low = ((page_nr << PAGE_SHIFT) | pgprot_val(pgprot)) & \
+- __supported_pte_mask;
+- return pte;
+-}
+ #else
+-#define __pte_ma(x) ((pte_t) { (x) } )
+-#define pfn_pte_ma(pfn, prot) __pte_ma(((pfn) << PAGE_SHIFT) | pgprot_val(prot))
++#define pte_phys_to_machine phys_to_machine
++#define pte_machine_to_phys machine_to_phys
+ #endif
+
+ #else /* !CONFIG_XEN */
+--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/maddr_64.h 2009-06-29 15:14:52.000000000 +0200
++++ sle11-2009-06-29/include/asm-x86/mach-xen/asm/maddr_64.h 2009-03-16 16:33:40.000000000 +0100
+@@ -1,6 +1,7 @@
+ #ifndef _X86_64_MADDR_H
+ #define _X86_64_MADDR_H
+
++#include <asm/bug.h>
+ #include <xen/features.h>
+ #include <xen/interface/xen.h>
+
+@@ -16,6 +17,7 @@ typedef unsigned long maddr_t;
+ #ifdef CONFIG_XEN
+
+ extern unsigned long *phys_to_machine_mapping;
++extern unsigned long max_mapnr;
+
+ #undef machine_to_phys_mapping
+ extern unsigned long *machine_to_phys_mapping;
+@@ -25,7 +27,7 @@ static inline unsigned long pfn_to_mfn(u
+ {
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return pfn;
+- BUG_ON(end_pfn && pfn >= end_pfn);
++ BUG_ON(max_mapnr && pfn >= max_mapnr);
+ return phys_to_machine_mapping[pfn] & ~FOREIGN_FRAME_BIT;
+ }
+
+@@ -33,7 +35,7 @@ static inline int phys_to_machine_mappin
+ {
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return 1;
+- BUG_ON(end_pfn && pfn >= end_pfn);
++ BUG_ON(max_mapnr && pfn >= max_mapnr);
+ return (phys_to_machine_mapping[pfn] != INVALID_P2M_ENTRY);
+ }
+
+@@ -45,7 +47,7 @@ static inline unsigned long mfn_to_pfn(u
+ return mfn;
+
+ if (unlikely((mfn >> machine_to_phys_order) != 0))
+- return end_pfn;
++ return max_mapnr;
+
+ /* The array access can fail (e.g., device space beyond end of RAM). */
+ asm (
+@@ -60,7 +62,7 @@ static inline unsigned long mfn_to_pfn(u
+ " .quad 1b,3b\n"
+ ".previous"
+ : "=r" (pfn)
+- : "m" (machine_to_phys_mapping[mfn]), "m" (end_pfn) );
++ : "m" (machine_to_phys_mapping[mfn]), "m" (max_mapnr) );
+
+ return pfn;
+ }
+@@ -88,16 +90,16 @@ static inline unsigned long mfn_to_pfn(u
+ static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
+ {
+ unsigned long pfn = mfn_to_pfn(mfn);
+- if ((pfn < end_pfn)
++ if ((pfn < max_mapnr)
+ && !xen_feature(XENFEAT_auto_translated_physmap)
+ && (phys_to_machine_mapping[pfn] != mfn))
+- return end_pfn; /* force !pfn_valid() */
++ return max_mapnr; /* force !pfn_valid() */
+ return pfn;
+ }
+
+ static inline void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
+ {
+- BUG_ON(end_pfn && pfn >= end_pfn);
++ BUG_ON(max_mapnr && pfn >= max_mapnr);
+ if (xen_feature(XENFEAT_auto_translated_physmap)) {
+ BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
+ return;
+@@ -135,9 +137,6 @@ static inline paddr_t pte_machine_to_phy
+ return phys;
+ }
+
+-#define __pte_ma(x) ((pte_t) { (x) } )
+-#define pfn_pte_ma(pfn, prot) __pte_ma((((pfn) << PAGE_SHIFT) | pgprot_val(prot)) & __supported_pte_mask)
+-
+ #else /* !CONFIG_XEN */
+
+ #define pfn_to_mfn(pfn) (pfn)
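+
+With end_pfn gone from this interface, max_mapnr now bounds every p2m
+lookup above, and mfn_to_local_pfn() keeps returning it as the "not ours"
+sentinel. A round-trip sketch (function name hypothetical):
+
+	static int example_pfn_is_local(unsigned long pfn)
+	{
+		unsigned long mfn = pfn_to_mfn(pfn);
+
+		/* foreign or ballooned-out pages map back to max_mapnr */
+		return mfn_to_local_pfn(mfn) < max_mapnr;
+	}
+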
+--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/mmu_context_32.h 2009-02-16 16:17:21.000000000 +0100
++++ sle11-2009-06-29/include/asm-x86/mach-xen/asm/mmu_context_32.h 2009-03-16 16:33:40.000000000 +0100
+@@ -51,8 +51,6 @@ static inline void __prepare_arch_switch
+ : : "r" (0) );
+ }
+
+-void leave_mm(unsigned long cpu);
+-
+ static inline void switch_mm(struct mm_struct *prev,
+ struct mm_struct *next,
+ struct task_struct *tsk)
+--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/mmu_context_64.h 2009-02-16 16:17:21.000000000 +0100
++++ sle11-2009-06-29/include/asm-x86/mach-xen/asm/mmu_context_64.h 2009-03-16 16:33:40.000000000 +0100
+@@ -62,12 +62,6 @@ extern void mm_pin(struct mm_struct *mm)
+ extern void mm_unpin(struct mm_struct *mm);
+ void mm_pin_all(void);
+
+-static inline void load_cr3(pgd_t *pgd)
+-{
+- asm volatile("movq %0,%%cr3" :: "r" (phys_to_machine(__pa(pgd))) :
+- "memory");
+-}
+-
+ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
+ struct task_struct *tsk)
+ {
+@@ -97,7 +91,7 @@ static inline void switch_mm(struct mm_s
+ op++;
+
+ if (unlikely(next->context.ldt != prev->context.ldt)) {
+- /* load_LDT_nolock(&next->context, cpu) */
++ /* load_LDT_nolock(&next->context) */
+ op->cmd = MMUEXT_SET_LDT;
+ op->arg1.linear_addr = (unsigned long)next->context.ldt;
+ op->arg2.nr_ents = next->context.size;
+@@ -110,7 +104,7 @@ static inline void switch_mm(struct mm_s
+ else {
+ write_pda(mmu_state, TLBSTATE_OK);
+ if (read_pda(active_mm) != next)
+- out_of_line_bug();
++ BUG();
+ if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
+ /* We were in lazy tlb mode and leave_mm disabled
+ * tlb flush IPI delivery. We must reload CR3
+@@ -118,7 +112,7 @@ static inline void switch_mm(struct mm_s
+ */
+ load_cr3(next->pgd);
+ xen_new_user_pt(__pa(__user_pgd(next->pgd)));
+- load_LDT_nolock(&next->context, cpu);
++ load_LDT_nolock(&next->context);
+ }
+ }
+ #endif
+--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/page.h 2009-02-16 16:18:36.000000000 +0100
++++ sle11-2009-06-29/include/asm-x86/mach-xen/asm/page.h 2009-03-16 16:33:40.000000000 +0100
+@@ -1,13 +1,231 @@
++#ifndef _ASM_X86_PAGE_H
++#define _ASM_X86_PAGE_H
++
++#include <linux/const.h>
++
++/* PAGE_SHIFT determines the page size */
++#define PAGE_SHIFT 12
++#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
++#define PAGE_MASK (~(PAGE_SIZE-1))
++
+ #ifdef __KERNEL__
+-# ifdef CONFIG_X86_32
+-# include "page_32.h"
+-# else
+-# include "page_64.h"
+-# endif
++
++/*
++ * Need to repeat this here in order to not include pgtable.h (which in turn
++ * depends on definitions made here), but to be able to use the symbolic values
++ * below. The preprocessor will warn if the two definitions aren't identical.
++ */
++#define _PAGE_BIT_PRESENT 0
++#define _PAGE_PRESENT (_AC(1, L)<<_PAGE_BIT_PRESENT)
++#define _PAGE_BIT_IO 9
++#define _PAGE_IO (_AC(1, L)<<_PAGE_BIT_IO)
++
++#define PHYSICAL_PAGE_MASK (~(_AT(phys_addr_t, PAGE_SIZE) - 1) & __PHYSICAL_MASK)
++#define PTE_MASK _AT(pteval_t, PHYSICAL_PAGE_MASK)
++
++#define PMD_PAGE_SIZE (_AC(1, UL) << PMD_SHIFT)
++#define PMD_PAGE_MASK (~(PMD_PAGE_SIZE-1))
++
++#define HPAGE_SHIFT PMD_SHIFT
++#define HPAGE_SIZE (_AC(1,UL) << HPAGE_SHIFT)
++#define HPAGE_MASK (~(HPAGE_SIZE - 1))
++#define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
++
++/* to align the pointer to the (next) page boundary */
++#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK)
++
++#define __PHYSICAL_MASK _AT(phys_addr_t, (_AC(1,ULL) << __PHYSICAL_MASK_SHIFT) - 1)
++#define __VIRTUAL_MASK ((_AC(1,UL) << __VIRTUAL_MASK_SHIFT) - 1)
++
++#ifndef __ASSEMBLY__
++#include <linux/types.h>
++#endif
++
++#ifdef CONFIG_X86_64
++#include <asm/page_64.h>
++#define max_pfn_mapped end_pfn_map
++#else
++#include <asm/page_32.h>
++#define max_pfn_mapped max_low_pfn
++#endif /* CONFIG_X86_64 */
++
++#define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET)
++
++#define VM_DATA_DEFAULT_FLAGS \
++ (((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \
++ VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
++
++
++#ifndef __ASSEMBLY__
++
++extern int page_is_ram(unsigned long pagenr);
++
++struct page;
++
++static inline void clear_user_page(void *page, unsigned long vaddr,
++ struct page *pg)
++{
++ clear_page(page);
++}
++
++static inline void copy_user_page(void *to, void *from, unsigned long vaddr,
++ struct page *topage)
++{
++ copy_page(to, from);
++}
++
++#define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \
++ alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr)
++#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
++
++typedef struct { pgprotval_t pgprot; } pgprot_t;
++
++#define pgprot_val(x) ((x).pgprot)
++#define __pgprot(x) ((pgprot_t) { (x) } )
++
++#include <asm/maddr.h>
++
++typedef struct { pgdval_t pgd; } pgd_t;
++
++#define __pgd_ma(x) ((pgd_t) { (x) } )
++static inline pgd_t xen_make_pgd(pgdval_t val)
++{
++ if (val & _PAGE_PRESENT)
++ val = pte_phys_to_machine(val);
++ return (pgd_t) { val };
++}
++
++#define __pgd_val(x) ((x).pgd)
++static inline pgdval_t xen_pgd_val(pgd_t pgd)
++{
++ pgdval_t ret = __pgd_val(pgd);
++#if PAGETABLE_LEVELS == 2 && CONFIG_XEN_COMPAT <= 0x030002
++ if (ret)
++ ret = machine_to_phys(ret) | _PAGE_PRESENT;
++#else
++ if (ret & _PAGE_PRESENT)
++ ret = pte_machine_to_phys(ret);
++#endif
++ return ret;
++}
++
++#if PAGETABLE_LEVELS >= 3
++#if PAGETABLE_LEVELS == 4
++typedef struct { pudval_t pud; } pud_t;
++
++#define __pud_ma(x) ((pud_t) { (x) } )
++static inline pud_t xen_make_pud(pudval_t val)
++{
++ if (val & _PAGE_PRESENT)
++ val = pte_phys_to_machine(val);
++ return (pud_t) { val };
++}
++
++#define __pud_val(x) ((x).pud)
++static inline pudval_t xen_pud_val(pud_t pud)
++{
++ pudval_t ret = __pud_val(pud);
++ if (ret & _PAGE_PRESENT)
++ ret = pte_machine_to_phys(ret);
++ return ret;
++}
++#else /* PAGETABLE_LEVELS == 3 */
++#include <asm-generic/pgtable-nopud.h>
++
++#define __pud_val(x) __pgd_val((x).pgd)
++static inline pudval_t xen_pud_val(pud_t pud)
++{
++ return xen_pgd_val(pud.pgd);
++}
++#endif /* PAGETABLE_LEVELS == 4 */
++
++typedef struct { pmdval_t pmd; } pmd_t;
++
++#define __pmd_ma(x) ((pmd_t) { (x) } )
++static inline pmd_t xen_make_pmd(pmdval_t val)
++{
++ if (val & _PAGE_PRESENT)
++ val = pte_phys_to_machine(val);
++ return (pmd_t) { val };
++}
++
++#define __pmd_val(x) ((x).pmd)
++static inline pmdval_t xen_pmd_val(pmd_t pmd)
++{
++ pmdval_t ret = __pmd_val(pmd);
++#if CONFIG_XEN_COMPAT <= 0x030002
++ if (ret)
++ ret = pte_machine_to_phys(ret) | _PAGE_PRESENT;
+ #else
+-# ifdef __i386__
+-# include "page_32.h"
+-# else
+-# include "page_64.h"
+-# endif
++ if (ret & _PAGE_PRESENT)
++ ret = pte_machine_to_phys(ret);
++#endif
++ return ret;
++}
++#else /* PAGETABLE_LEVELS == 2 */
++#include <asm-generic/pgtable-nopmd.h>
++
++#define __pmd_ma(x) ((pmd_t) { .pud.pgd = __pgd_ma(x) } )
++#define __pmd_val(x) __pgd_val((x).pud.pgd)
++static inline pmdval_t xen_pmd_val(pmd_t pmd)
++{
++ return xen_pgd_val(pmd.pud.pgd);
++}
++#endif /* PAGETABLE_LEVELS >= 3 */
++
++#define __pte_ma(x) ((pte_t) { .pte = (x) } )
++static inline pte_t xen_make_pte(pteval_t val)
++{
++ if ((val & (_PAGE_PRESENT|_PAGE_IO)) == _PAGE_PRESENT)
++ val = pte_phys_to_machine(val);
++ return (pte_t) { .pte = val };
++}
++
++#define __pte_val(x) ((x).pte)
++static inline pteval_t xen_pte_val(pte_t pte)
++{
++ pteval_t ret = __pte_val(pte);
++ if ((pte.pte_low & (_PAGE_PRESENT|_PAGE_IO)) == _PAGE_PRESENT)
++ ret = pte_machine_to_phys(ret);
++ return ret;
++}
++
++#define pgd_val(x) xen_pgd_val(x)
++#define __pgd(x) xen_make_pgd(x)
++
++#ifndef __PAGETABLE_PUD_FOLDED
++#define pud_val(x) xen_pud_val(x)
++#define __pud(x) xen_make_pud(x)
++#endif
++
++#ifndef __PAGETABLE_PMD_FOLDED
++#define pmd_val(x) xen_pmd_val(x)
++#define __pmd(x) xen_make_pmd(x)
+ #endif
++
++#define pte_val(x) xen_pte_val(x)
++#define __pte(x) xen_make_pte(x)
++
++#define __pa(x) __phys_addr((unsigned long)(x))
++/* __pa_symbol should be used for C visible symbols.
++ This seems to be the official gcc blessed way to do such arithmetic. */
++#define __pa_symbol(x) __pa(__phys_reloc_hide((unsigned long)(x)))
++
++#define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET))
++
++#define __boot_va(x) __va(x)
++#define __boot_pa(x) __pa(x)
++
++#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
++#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT)
++#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
++
++#endif /* __ASSEMBLY__ */
++
++#include <asm-generic/memory_model.h>
++#include <asm-generic/page.h>
++
++#define __HAVE_ARCH_GATE_AREA 1
++
++#endif /* __KERNEL__ */
++#endif /* _ASM_X86_PAGE_H */
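
Every xen_make_*/xen_*_val pair introduced above follows one rule: translate between pseudo-physical and machine addresses only for ordinarily present entries, and leave _PAGE_IO (I/O or foreign) frames untouched, since those have no pseudo-physical address to translate to. A self-contained sketch of that round trip; the linear +100-frame offset is a made-up stand-in for pte_phys_to_machine()/pte_machine_to_phys():

#include <stdio.h>

typedef unsigned long pteval_t;

#define PAGE_SHIFT    12
#define _PAGE_PRESENT (1UL << 0)
#define _PAGE_IO      (1UL << 9)

/* Toy translation: pretend physical frame n is machine frame n + 100. */
static pteval_t pte_phys_to_machine(pteval_t v) { return v + (100UL << PAGE_SHIFT); }
static pteval_t pte_machine_to_phys(pteval_t v) { return v - (100UL << PAGE_SHIFT); }

static pteval_t make_pte(pteval_t val)          /* cf. xen_make_pte() */
{
        if ((val & (_PAGE_PRESENT | _PAGE_IO)) == _PAGE_PRESENT)
                val = pte_phys_to_machine(val);
        return val;
}

static pteval_t pte_val(pteval_t pte)           /* cf. xen_pte_val() */
{
        if ((pte & (_PAGE_PRESENT | _PAGE_IO)) == _PAGE_PRESENT)
                pte = pte_machine_to_phys(pte);
        return pte;
}

int main(void)
{
        pteval_t phys = (5UL << PAGE_SHIFT) | _PAGE_PRESENT;
        pteval_t io   = (7UL << PAGE_SHIFT) | _PAGE_PRESENT | _PAGE_IO;

        printf("%#lx -> %#lx -> %#lx\n",
               phys, make_pte(phys), pte_val(make_pte(phys)));
        printf("%#lx -> %#lx (untouched)\n", io, make_pte(io));
        return 0;
}
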
+--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/page_64.h 2009-02-16 16:18:36.000000000 +0100
++++ sle11-2009-06-29/include/asm-x86/mach-xen/asm/page_64.h 2009-03-16 16:33:40.000000000 +0100
+@@ -1,37 +1,9 @@
+ #ifndef _X86_64_PAGE_H
+ #define _X86_64_PAGE_H
+
+-/* #include <linux/string.h> */
+-#ifndef __ASSEMBLY__
+-#include <linux/kernel.h>
+-#include <linux/types.h>
+-#include <asm/bug.h>
+-#endif
+-#include <linux/const.h>
+-#include <xen/interface/xen.h>
+-
+-/*
+- * Need to repeat this here in order to not include pgtable.h (which in turn
+- * depends on definitions made here), but to be able to use the symbolic
+- * below. The preprocessor will warn if the two definitions aren't identical.
+- */
+-#define _PAGE_PRESENT 0x001
+-#define _PAGE_IO 0x200
+-
+-/* PAGE_SHIFT determines the page size */
+-#define PAGE_SHIFT 12
+-#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
+-#define PAGE_MASK (~(PAGE_SIZE-1))
+-
+-/* See Documentation/x86_64/mm.txt for a description of the memory map. */
+-#define __PHYSICAL_MASK_SHIFT 46
+-#define __PHYSICAL_MASK ((_AC(1,UL) << __PHYSICAL_MASK_SHIFT) - 1)
+-#define __VIRTUAL_MASK_SHIFT 48
+-#define __VIRTUAL_MASK ((_AC(1,UL) << __VIRTUAL_MASK_SHIFT) - 1)
+-
+-#define PHYSICAL_PAGE_MASK (~(PAGE_SIZE-1) & __PHYSICAL_MASK)
++#define PAGETABLE_LEVELS 4
+
+-#define THREAD_ORDER 1
++#define THREAD_ORDER 1
+ #define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER)
+ #define CURRENT_MASK (~(THREAD_SIZE-1))
+
+@@ -51,106 +23,10 @@
+ #define MCE_STACK 5
+ #define N_EXCEPTION_STACKS 5 /* hw limit: 7 */
+
+-#define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1))
+-#define LARGE_PAGE_SIZE (_AC(1,UL) << PMD_SHIFT)
+-
+-#define HPAGE_SHIFT PMD_SHIFT
+-#define HPAGE_SIZE (_AC(1,UL) << HPAGE_SHIFT)
+-#define HPAGE_MASK (~(HPAGE_SIZE - 1))
+-#define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
+-
+-#ifdef __KERNEL__
+-#ifndef __ASSEMBLY__
+-
+-extern unsigned long end_pfn;
+-
+-#include <asm/maddr.h>
+-
+-void clear_page(void *);
+-void copy_page(void *, void *);
+-
+-#define clear_user_page(page, vaddr, pg) clear_page(page)
+-#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)
+-
+-#define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \
+- alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr)
+-#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
+-
+-/*
+- * These are used to make use of C type-checking..
+- */
+-typedef struct { unsigned long pte; } pte_t;
+-typedef struct { unsigned long pmd; } pmd_t;
+-typedef struct { unsigned long pud; } pud_t;
+-typedef struct { unsigned long pgd; } pgd_t;
+-#define PTE_MASK PHYSICAL_PAGE_MASK
+-
+-typedef struct { unsigned long pgprot; } pgprot_t;
+-
+-#define __pte_val(x) ((x).pte)
+-#define pte_val(x) ((__pte_val(x) & (_PAGE_PRESENT|_PAGE_IO)) \
+- == _PAGE_PRESENT ? \
+- pte_machine_to_phys(__pte_val(x)) : \
+- __pte_val(x))
+-
+-#define __pmd_val(x) ((x).pmd)
+-static inline unsigned long pmd_val(pmd_t x)
+-{
+- unsigned long ret = __pmd_val(x);
+-#if CONFIG_XEN_COMPAT <= 0x030002
+- if (ret) ret = pte_machine_to_phys(ret) | _PAGE_PRESENT;
+-#else
+- if (ret & _PAGE_PRESENT) ret = pte_machine_to_phys(ret);
+-#endif
+- return ret;
+-}
+-
+-#define __pud_val(x) ((x).pud)
+-static inline unsigned long pud_val(pud_t x)
+-{
+- unsigned long ret = __pud_val(x);
+- if (ret & _PAGE_PRESENT) ret = pte_machine_to_phys(ret);
+- return ret;
+-}
+-
+-#define __pgd_val(x) ((x).pgd)
+-static inline unsigned long pgd_val(pgd_t x)
+-{
+- unsigned long ret = __pgd_val(x);
+- if (ret & _PAGE_PRESENT) ret = pte_machine_to_phys(ret);
+- return ret;
+-}
+-
+-#define pgprot_val(x) ((x).pgprot)
+-
+-static inline pte_t __pte(unsigned long x)
+-{
+- if ((x & (_PAGE_PRESENT|_PAGE_IO)) == _PAGE_PRESENT)
+- x = pte_phys_to_machine(x);
+- return ((pte_t) { (x) });
+-}
+-
+-static inline pmd_t __pmd(unsigned long x)
+-{
+- if (x & _PAGE_PRESENT) x = pte_phys_to_machine(x);
+- return ((pmd_t) { (x) });
+-}
+-
+-static inline pud_t __pud(unsigned long x)
+-{
+- if (x & _PAGE_PRESENT) x = pte_phys_to_machine(x);
+- return ((pud_t) { (x) });
+-}
+-
+-static inline pgd_t __pgd(unsigned long x)
+-{
+- if (x & _PAGE_PRESENT) x = pte_phys_to_machine(x);
+- return ((pgd_t) { (x) });
+-}
+-
+-#define __pgprot(x) ((pgprot_t) { (x) } )
++#define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT)
++#define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1))
+
+-#endif /* !__ASSEMBLY__ */
++#define __PAGE_OFFSET _AC(0xffff880000000000, UL)
+
+ #define __PHYSICAL_START CONFIG_PHYSICAL_START
+ #define __KERNEL_ALIGN 0x200000
+@@ -166,52 +42,58 @@ static inline pgd_t __pgd(unsigned long
+
+ #define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START)
+ #define __START_KERNEL_map _AC(0xffffffff80000000, UL)
+-#define __PAGE_OFFSET _AC(0xffff880000000000, UL)
+
+ #if CONFIG_XEN_COMPAT <= 0x030002
+ #undef LOAD_OFFSET
+ #define LOAD_OFFSET 0
+ #endif
+
+-/* to align the pointer to the (next) page boundary */
+-#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK)
+-
+-#define KERNEL_TEXT_SIZE (40*1024*1024)
+-#define KERNEL_TEXT_START _AC(0xffffffff80000000, UL)
++/* See Documentation/x86_64/mm.txt for a description of the memory map. */
++#define __PHYSICAL_MASK_SHIFT 46
++#define __VIRTUAL_MASK_SHIFT 48
+
+-#define PAGE_OFFSET __PAGE_OFFSET
++/*
++ * Kernel image size is limited to 128 MB (see level2_kernel_pgt in
++ * arch/x86/kernel/head_64.S), and it is mapped here:
++ */
++#define KERNEL_IMAGE_SIZE (128*1024*1024)
++#define KERNEL_IMAGE_START _AC(0xffffffff80000000, UL)
+
+ #ifndef __ASSEMBLY__
++void clear_page(void *page);
++void copy_page(void *to, void *from);
++
++extern unsigned long end_pfn;
++extern unsigned long end_pfn_map;
++
+ static inline unsigned long __phys_addr(unsigned long x)
+ {
+- return x - (x >= __START_KERNEL_map ? __START_KERNEL_map : PAGE_OFFSET);
++ return x - (x >= __START_KERNEL_map ? __START_KERNEL_map : __PAGE_OFFSET);
+ }
+-#endif
+
+-#define __pa(x) __phys_addr((unsigned long)(x))
+-#define __pa_symbol(x) __phys_addr((unsigned long)(x))
++#define __phys_reloc_hide(x) (x)
+
+-#define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET))
+-#define __boot_va(x) __va(x)
+-#define __boot_pa(x) __pa(x)
+-#ifdef CONFIG_FLATMEM
+-#define pfn_valid(pfn) ((pfn) < end_pfn)
+-#endif
++/*
++ * These are used to make use of C type-checking..
++ */
++typedef unsigned long pteval_t;
++typedef unsigned long pmdval_t;
++typedef unsigned long pudval_t;
++typedef unsigned long pgdval_t;
++typedef unsigned long pgprotval_t;
++typedef unsigned long phys_addr_t;
+
+-#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
+-#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
+-#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT)
+-
+-#define VM_DATA_DEFAULT_FLAGS \
+- (((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \
+- VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
++typedef struct page *pgtable_t;
++
++typedef union { pteval_t pte; unsigned int pte_low; } pte_t;
+
+-#define __HAVE_ARCH_GATE_AREA 1
+ #define vmemmap ((struct page *)VMEMMAP_START)
+
+-#include <asm-generic/memory_model.h>
+-#include <asm-generic/page.h>
++#endif /* !__ASSEMBLY__ */
++
++#ifdef CONFIG_FLATMEM
++#define pfn_valid(pfn) ((pfn) < max_mapnr)
++#endif
+
+-#endif /* __KERNEL__ */
+
+ #endif /* _X86_64_PAGE_H */
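
The reshuffled __phys_addr() keeps its original logic: addresses at or above __START_KERNEL_map come from the kernel-text mapping, anything lower comes from the direct map at __PAGE_OFFSET, and each just has its base subtracted. A compilable sketch using the constants from this header (the sample addresses are illustrative):

#include <stdio.h>

#define __START_KERNEL_map 0xffffffff80000000UL
#define __PAGE_OFFSET      0xffff880000000000UL

static unsigned long __phys_addr(unsigned long x)
{
        /* Kernel text vs. direct map: subtract whichever base applies. */
        return x - (x >= __START_KERNEL_map ? __START_KERNEL_map : __PAGE_OFFSET);
}

int main(void)
{
        printf("%#lx\n", __phys_addr(0xffffffff80200000UL)); /* 0x200000 */
        printf("%#lx\n", __phys_addr(0xffff880001234000UL)); /* 0x1234000 */
        return 0;
}
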
+--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/pci.h 2009-02-16 16:18:36.000000000 +0100
++++ sle11-2009-06-29/include/asm-x86/mach-xen/asm/pci.h 2009-03-16 16:33:40.000000000 +0100
+@@ -71,6 +71,7 @@ extern int pci_mmap_page_range(struct pc
+
+
+ #ifdef CONFIG_PCI
++extern void early_quirks(void);
+ static inline void pci_dma_burst_advice(struct pci_dev *pdev,
+ enum pci_dma_burst_strategy *strat,
+ unsigned long *strategy_parameter)
+@@ -78,9 +79,10 @@ static inline void pci_dma_burst_advice(
+ *strat = PCI_DMA_BURST_INFINITY;
+ *strategy_parameter = ~0UL;
+ }
++#else
++static inline void early_quirks(void) { }
+ #endif
+
+-
+ #endif /* __KERNEL__ */
+
+ #ifdef CONFIG_X86_32
+@@ -95,6 +97,19 @@ static inline void pci_dma_burst_advice(
+ /* generic pci stuff */
+ #include <asm-generic/pci.h>
+
++#ifdef CONFIG_NUMA
++/* Returns the node based on pci bus */
++static inline int __pcibus_to_node(struct pci_bus *bus)
++{
++ struct pci_sysdata *sd = bus->sysdata;
++
++ return sd->node;
++}
+
++static inline cpumask_t __pcibus_to_cpumask(struct pci_bus *bus)
++{
++ return node_to_cpumask(__pcibus_to_node(bus));
++}
++#endif
+
+ #endif
+--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/pgalloc_32.h 2009-02-16 16:17:21.000000000 +0100
++++ sle11-2009-06-29/include/asm-x86/mach-xen/asm/pgalloc_32.h 2009-03-16 16:33:40.000000000 +0100
+@@ -3,69 +3,109 @@
+
+ #include <linux/threads.h>
+ #include <linux/mm.h> /* for struct page */
++#include <linux/pagemap.h>
++#include <asm/tlb.h>
++#include <asm-generic/tlb.h>
+ #include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */
+
+ #define paravirt_alloc_pt(mm, pfn) do { } while (0)
+-#define paravirt_alloc_pd(pfn) do { } while (0)
+-#define paravirt_alloc_pd(pfn) do { } while (0)
++#define paravirt_alloc_pd(mm, pfn) do { } while (0)
+ #define paravirt_alloc_pd_clone(pfn, clonepfn, start, count) do { } while (0)
+ #define paravirt_release_pt(pfn) do { } while (0)
+ #define paravirt_release_pd(pfn) do { } while (0)
+
+-#define pmd_populate_kernel(mm, pmd, pte) \
+-do { \
+- paravirt_alloc_pt(mm, __pa(pte) >> PAGE_SHIFT); \
+- set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte))); \
+-} while (0)
+-
+-#define pmd_populate(mm, pmd, pte) \
+-do { \
+- unsigned long pfn = page_to_pfn(pte); \
+- paravirt_alloc_pt(mm, pfn); \
+- if (PagePinned(virt_to_page((mm)->pgd))) { \
+- if (!PageHighMem(pte)) \
+- BUG_ON(HYPERVISOR_update_va_mapping( \
+- (unsigned long)__va(pfn << PAGE_SHIFT), \
+- pfn_pte(pfn, PAGE_KERNEL_RO), 0)); \
+- else if (!test_and_set_bit(PG_pinned, &pte->flags)) \
+- kmap_flush_unused(); \
+- set_pmd(pmd, \
+- __pmd(_PAGE_TABLE + ((paddr_t)pfn << PAGE_SHIFT))); \
+- } else \
+- *(pmd) = __pmd(_PAGE_TABLE + ((paddr_t)pfn << PAGE_SHIFT)); \
+-} while (0)
++static inline void pmd_populate_kernel(struct mm_struct *mm,
++ pmd_t *pmd, pte_t *pte)
++{
++ paravirt_alloc_pt(mm, __pa(pte) >> PAGE_SHIFT);
++ set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE));
++}
++
++static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte)
++{
++ unsigned long pfn = page_to_pfn(pte);
++
++ paravirt_alloc_pt(mm, pfn);
++ if (PagePinned(virt_to_page(mm->pgd))) {
++ if (!PageHighMem(pte))
++ BUG_ON(HYPERVISOR_update_va_mapping(
++ (unsigned long)__va(pfn << PAGE_SHIFT),
++ pfn_pte(pfn, PAGE_KERNEL_RO), 0));
++ else if (!test_and_set_bit(PG_pinned, &pte->flags))
++ kmap_flush_unused();
++ set_pmd(pmd, __pmd(((pmdval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE));
++ } else
++ *pmd = __pmd(((pmdval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE);
++}
++#define pmd_pgtable(pmd) pmd_page(pmd)
+
+ /*
+ * Allocate and free page tables.
+ */
++extern void pgd_test_and_unpin(pgd_t *);
+ extern pgd_t *pgd_alloc(struct mm_struct *);
+-extern void pgd_free(pgd_t *pgd);
++extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
+
+ extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long);
+-extern struct page *pte_alloc_one(struct mm_struct *, unsigned long);
++extern pgtable_t pte_alloc_one(struct mm_struct *, unsigned long);
+
+-static inline void pte_free_kernel(pte_t *pte)
++static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
+ {
+ make_lowmem_page_writable(pte, XENFEAT_writable_page_tables);
+ free_page((unsigned long)pte);
+ }
+
+-extern void pte_free(struct page *pte);
++extern void __pte_free(pgtable_t);
++static inline void pte_free(struct mm_struct *mm, pgtable_t pte)
++{
++ __pte_free(pte);
++}
++
+
+-#define __pte_free_tlb(tlb,pte) \
+-do { \
+- paravirt_release_pt(page_to_pfn(pte)); \
+- tlb_remove_page((tlb),(pte)); \
+-} while (0)
++extern void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte);
+
+ #ifdef CONFIG_X86_PAE
+ /*
+ * In the PAE case we free the pmds as part of the pgd.
+ */
+-#define pmd_alloc_one(mm, addr) ({ BUG(); ((pmd_t *)2); })
+-#define pmd_free(x) do { } while (0)
+-#define __pmd_free_tlb(tlb,x) do { } while (0)
+-#define pud_populate(mm, pmd, pte) BUG()
+-#endif
++extern pmd_t *pmd_alloc_one(struct mm_struct *, unsigned long);
++
++extern void __pmd_free(pgtable_t);
++static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
++{
++ BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
++ __pmd_free(virt_to_page(pmd));
++}
++
++extern void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd);
++
++static inline void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
++{
++ struct page *page = virt_to_page(pmd);
++ unsigned long pfn = page_to_pfn(page);
++
++ paravirt_alloc_pd(mm, pfn);
++
++ /* Note: almost everything apart from _PAGE_PRESENT is
++ reserved at the pmd (PDPT) level. */
++ if (PagePinned(virt_to_page(mm->pgd))) {
++ BUG_ON(PageHighMem(page));
++ BUG_ON(HYPERVISOR_update_va_mapping(
++ (unsigned long)__va(pfn << PAGE_SHIFT),
++ pfn_pte(pfn, PAGE_KERNEL_RO), 0));
++ set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
++ } else
++ *pudp = __pud(__pa(pmd) | _PAGE_PRESENT);
++
++ /*
++ * According to Intel App note "TLBs, Paging-Structure Caches,
++ * and Their Invalidation", April 2007, document 317080-001,
++ * section 8.1: in PAE mode we explicitly have to flush the
++ * TLB via cr3 if the top-level pgd is changed...
++ */
++ if (mm == current->active_mm)
++ xen_tlb_flush();
++}
++#endif /* CONFIG_X86_PAE */
+
+ #endif /* _I386_PGALLOC_H */
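
The inlined pmd_populate() above encodes the pinning rule: once an mm's pgd has been pinned (registered with Xen as a page table), every page linked beneath it must first be made read-only in the kernel's writable mapping, or the hypervisor will refuse the link. A toy model of just that branch structure; a boolean stands in for the real HYPERVISOR_update_va_mapping() call, and the highmem handling is left out:

#include <stdbool.h>
#include <stdio.h>

struct pt_page { bool readonly; };

static void pmd_populate(bool pgd_pinned, struct pt_page *pte_page)
{
        if (pgd_pinned)
                pte_page->readonly = true;  /* update_va_mapping(..., RO) */
        /* now safe to hook the page in (set_pmd() in the real code) */
        printf("linked pte page, readonly=%d\n", pte_page->readonly);
}

int main(void)
{
        struct pt_page a = { false }, b = { false };

        pmd_populate(true, &a);   /* pinned mm: page made RO first */
        pmd_populate(false, &b);  /* unpinned mm: stays writable until pin time */
        return 0;
}
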
+--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/pgalloc_64.h 2009-02-16 16:18:36.000000000 +0100
++++ sle11-2009-06-29/include/asm-x86/mach-xen/asm/pgalloc_64.h 2009-03-16 16:33:40.000000000 +0100
+@@ -6,30 +6,13 @@
+ #include <linux/mm.h>
+ #include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */
+
+-#include <xen/features.h>
+-void make_page_readonly(void *va, unsigned int feature);
+-void make_page_writable(void *va, unsigned int feature);
+-void make_pages_readonly(void *va, unsigned int nr, unsigned int feature);
+-void make_pages_writable(void *va, unsigned int nr, unsigned int feature);
++pmd_t *early_get_pmd(unsigned long va);
++void early_make_page_readonly(void *va, unsigned int feature);
+
+ #define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD)
+
+-static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte)
+-{
+- set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte)));
+-}
+-
+-static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte)
+-{
+- if (unlikely(PagePinned(virt_to_page((mm)->pgd)))) {
+- BUG_ON(HYPERVISOR_update_va_mapping(
+- (unsigned long)__va(page_to_pfn(pte) << PAGE_SHIFT),
+- pfn_pte(page_to_pfn(pte), PAGE_KERNEL_RO), 0));
+- set_pmd(pmd, __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT)));
+- } else {
+- *(pmd) = __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT));
+- }
+-}
++#define pmd_populate_kernel(mm, pmd, pte) \
++ set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte)))
+
+ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
+ {
+@@ -63,53 +46,58 @@ static inline void pgd_populate(struct m
+ }
+ }
+
+-extern struct page *pte_alloc_one(struct mm_struct *mm, unsigned long addr);
+-extern void pte_free(struct page *pte);
++#define pmd_pgtable(pmd) pmd_page(pmd)
+
+-static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
++static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte)
+ {
+- struct page *pg;
+-
+- pg = pte_alloc_one(mm, addr);
+- return pg ? page_address(pg) : NULL;
++ if (unlikely(PagePinned(virt_to_page((mm)->pgd)))) {
++ BUG_ON(HYPERVISOR_update_va_mapping(
++ (unsigned long)__va(page_to_pfn(pte) << PAGE_SHIFT),
++ pfn_pte(page_to_pfn(pte), PAGE_KERNEL_RO), 0));
++ set_pmd(pmd, __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT)));
++ } else {
++ *(pmd) = __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT));
++ }
+ }
+
+-static inline void pmd_free(pmd_t *pmd)
++extern void __pmd_free(pgtable_t);
++static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
+ {
+ BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
+- pte_free(virt_to_page(pmd));
++ __pmd_free(virt_to_page(pmd));
+ }
+
++extern pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr);
++
+ static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
+ {
+- struct page *pg;
+-
+- pg = pte_alloc_one(mm, addr);
+- return pg ? page_address(pg) : NULL;
++ return (pud_t *)pmd_alloc_one(mm, addr);
+ }
+
+-static inline void pud_free(pud_t *pud)
++static inline void pud_free(struct mm_struct *mm, pud_t *pud)
+ {
+ BUG_ON((unsigned long)pud & (PAGE_SIZE-1));
+- pte_free(virt_to_page(pud));
++ __pmd_free(virt_to_page(pud));
+ }
+
+ static inline void pgd_list_add(pgd_t *pgd)
+ {
+ struct page *page = virt_to_page(pgd);
++ unsigned long flags;
+
+- spin_lock(&pgd_lock);
++ spin_lock_irqsave(&pgd_lock, flags);
+ list_add(&page->lru, &pgd_list);
+- spin_unlock(&pgd_lock);
++ spin_unlock_irqrestore(&pgd_lock, flags);
+ }
+
+ static inline void pgd_list_del(pgd_t *pgd)
+ {
+ struct page *page = virt_to_page(pgd);
++ unsigned long flags;
+
+- spin_lock(&pgd_lock);
++ spin_lock_irqsave(&pgd_lock, flags);
+ list_del(&page->lru);
+- spin_unlock(&pgd_lock);
++ spin_unlock_irqrestore(&pgd_lock, flags);
+ }
+
+ extern void pgd_test_and_unpin(pgd_t *);
+@@ -145,7 +133,7 @@ static inline pgd_t *pgd_alloc(struct mm
+ return pgd;
+ }
+
+-static inline void pgd_free(pgd_t *pgd)
++static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
+ {
+ pgd_test_and_unpin(pgd);
+ pgd_list_del(pgd);
+@@ -161,17 +149,30 @@ static inline pte_t *pte_alloc_one_kerne
+ return pte;
+ }
+
++extern pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long addr);
++
+ /* Should really implement gc for free page table pages. This could be
+ done with a reference count in struct page. */
+
+-static inline void pte_free_kernel(pte_t *pte)
++static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
+ {
+ BUG_ON((unsigned long)pte & (PAGE_SIZE-1));
+ make_page_writable(pte, XENFEAT_writable_page_tables);
+ free_page((unsigned long)pte);
+ }
+
+-#define __pte_free_tlb(tlb,pte) tlb_remove_page((tlb),(pte))
++extern void __pte_free(pgtable_t);
++static inline void pte_free(struct mm_struct *mm, pgtable_t pte)
++{
++ __pte_free(pte);
++}
++
++#define __pte_free_tlb(tlb,pte) \
++do { \
++ pgtable_page_dtor((pte)); \
++ tlb_remove_page((tlb), (pte)); \
++} while (0)
++
+ #define __pmd_free_tlb(tlb,x) tlb_remove_page((tlb),virt_to_page(x))
+ #define __pud_free_tlb(tlb,x) tlb_remove_page((tlb),virt_to_page(x))
+
+--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/pgtable.h 2009-02-16 16:18:36.000000000 +0100
++++ sle11-2009-06-29/include/asm-x86/mach-xen/asm/pgtable.h 2009-03-16 16:33:40.000000000 +0100
+@@ -1,5 +1,467 @@
++#ifndef _ASM_X86_PGTABLE_H
++#define _ASM_X86_PGTABLE_H
++
++#define USER_PTRS_PER_PGD ((TASK_SIZE-1)/PGDIR_SIZE+1)
++#define FIRST_USER_ADDRESS 0
++
++#define _PAGE_BIT_PRESENT 0
++#define _PAGE_BIT_RW 1
++#define _PAGE_BIT_USER 2
++#define _PAGE_BIT_PWT 3
++#define _PAGE_BIT_PCD 4
++#define _PAGE_BIT_ACCESSED 5
++#define _PAGE_BIT_DIRTY 6
++#define _PAGE_BIT_FILE 6
++#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */
++#define _PAGE_BIT_PAT 7 /* on 4KB pages */
++#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
++#define _PAGE_BIT_IO 9 /* Mapped page is I/O or foreign and
++ * has no associated page struct. */
++#define _PAGE_BIT_UNUSED2 10 /* available for programmer */
++#define _PAGE_BIT_UNUSED3 11
++#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
++#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
++
++/*
++ * Note: we use _AC(1, L) instead of _AC(1, UL) so that we get a
++ * sign-extended value on 32-bit with all 1's in the upper word,
++ * which preserves the upper pte values on 64-bit ptes:
++ */
++#define _PAGE_PRESENT (_AC(1, L)<<_PAGE_BIT_PRESENT)
++#define _PAGE_RW (_AC(1, L)<<_PAGE_BIT_RW)
++#define _PAGE_USER (_AC(1, L)<<_PAGE_BIT_USER)
++#define _PAGE_PWT (_AC(1, L)<<_PAGE_BIT_PWT)
++#define _PAGE_PCD (_AC(1, L)<<_PAGE_BIT_PCD)
++#define _PAGE_ACCESSED (_AC(1, L)<<_PAGE_BIT_ACCESSED)
++#define _PAGE_DIRTY (_AC(1, L)<<_PAGE_BIT_DIRTY)
++#define _PAGE_PSE (_AC(1, L)<<_PAGE_BIT_PSE) /* 2MB page */
++#define _PAGE_GLOBAL (_AC(1, L)<<_PAGE_BIT_GLOBAL) /* Global TLB entry */
++#define _PAGE_IO (_AC(1, L)<<_PAGE_BIT_IO)
++#define _PAGE_UNUSED2 (_AC(1, L)<<_PAGE_BIT_UNUSED2)
++#define _PAGE_UNUSED3 (_AC(1, L)<<_PAGE_BIT_UNUSED3)
++#define _PAGE_PAT (_AC(1, L)<<_PAGE_BIT_PAT)
++#define _PAGE_PAT_LARGE (_AC(1, L)<<_PAGE_BIT_PAT_LARGE)
++
++#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
++#define _PAGE_NX (_AC(1, ULL) << _PAGE_BIT_NX)
++#else
++#define _PAGE_NX 0
++#endif
++
++/* If _PAGE_PRESENT is clear, we use these: */
++#define _PAGE_FILE _PAGE_DIRTY /* nonlinear file mapping, saved PTE; unset:swap */
++#define _PAGE_PROTNONE _PAGE_PSE /* if the user mapped it with PROT_NONE;
++ pte_present gives true */
++
++#ifndef __ASSEMBLY__
++#if defined(CONFIG_X86_64) && CONFIG_XEN_COMPAT <= 0x030002
++extern unsigned int __kernel_page_user;
++#else
++#define __kernel_page_user 0
++#endif
++#endif
++
++#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
++#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY | __kernel_page_user)
++
++#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_IO)
++
++#define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
++#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
++
++#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
++#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
++#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
++#define PAGE_COPY PAGE_COPY_NOEXEC
++#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
++#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
++
++#ifdef CONFIG_X86_32
++#define _PAGE_KERNEL_EXEC \
++ (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED)
++#define _PAGE_KERNEL (_PAGE_KERNEL_EXEC | _PAGE_NX)
++
++#ifndef __ASSEMBLY__
++extern pteval_t __PAGE_KERNEL, __PAGE_KERNEL_EXEC;
++#endif /* __ASSEMBLY__ */
++#else
++#define __PAGE_KERNEL_EXEC \
++ (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | __kernel_page_user)
++#define __PAGE_KERNEL (__PAGE_KERNEL_EXEC | _PAGE_NX)
++#endif
++
++#define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW)
++#define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW)
++#define __PAGE_KERNEL_EXEC_NOCACHE (__PAGE_KERNEL_EXEC | _PAGE_PCD | _PAGE_PWT)
++#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD | _PAGE_PWT)
++#define __PAGE_KERNEL_UC_MINUS (__PAGE_KERNEL | _PAGE_PCD)
++#define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER)
++#define __PAGE_KERNEL_VSYSCALL_NOCACHE (__PAGE_KERNEL_VSYSCALL | _PAGE_PCD | _PAGE_PWT)
++#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE)
++#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE)
++
++/*
++ * We don't support GLOBAL pages in xenolinux64
++ */
++#define MAKE_GLOBAL(x) __pgprot((x))
++
++#define PAGE_KERNEL MAKE_GLOBAL(__PAGE_KERNEL)
++#define PAGE_KERNEL_RO MAKE_GLOBAL(__PAGE_KERNEL_RO)
++#define PAGE_KERNEL_EXEC MAKE_GLOBAL(__PAGE_KERNEL_EXEC)
++#define PAGE_KERNEL_RX MAKE_GLOBAL(__PAGE_KERNEL_RX)
++#define PAGE_KERNEL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_NOCACHE)
++#define PAGE_KERNEL_UC_MINUS MAKE_GLOBAL(__PAGE_KERNEL_UC_MINUS)
++#define PAGE_KERNEL_EXEC_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_EXEC_NOCACHE)
++#define PAGE_KERNEL_LARGE MAKE_GLOBAL(__PAGE_KERNEL_LARGE)
++#define PAGE_KERNEL_LARGE_EXEC MAKE_GLOBAL(__PAGE_KERNEL_LARGE_EXEC)
++#define PAGE_KERNEL_VSYSCALL MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL)
++#define PAGE_KERNEL_VSYSCALL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL_NOCACHE)
++
++/* xwr */
++#define __P000 PAGE_NONE
++#define __P001 PAGE_READONLY
++#define __P010 PAGE_COPY
++#define __P011 PAGE_COPY
++#define __P100 PAGE_READONLY_EXEC
++#define __P101 PAGE_READONLY_EXEC
++#define __P110 PAGE_COPY_EXEC
++#define __P111 PAGE_COPY_EXEC
++
++#define __S000 PAGE_NONE
++#define __S001 PAGE_READONLY
++#define __S010 PAGE_SHARED
++#define __S011 PAGE_SHARED
++#define __S100 PAGE_READONLY_EXEC
++#define __S101 PAGE_READONLY_EXEC
++#define __S110 PAGE_SHARED_EXEC
++#define __S111 PAGE_SHARED_EXEC
++
++#ifndef __ASSEMBLY__
++
++/*
++ * ZERO_PAGE is a global shared page that is always zero: used
++ * for zero-mapped memory areas etc..
++ */
++extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)];
++#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
++
++extern spinlock_t pgd_lock;
++extern struct list_head pgd_list;
++
++/*
++ * The following only work if pte_present() is true.
++ * Undefined behaviour if not..
++ */
++static inline int pte_dirty(pte_t pte) { return __pte_val(pte) & _PAGE_DIRTY; }
++static inline int pte_young(pte_t pte) { return __pte_val(pte) & _PAGE_ACCESSED; }
++static inline int pte_write(pte_t pte) { return __pte_val(pte) & _PAGE_RW; }
++static inline int pte_file(pte_t pte) { return __pte_val(pte) & _PAGE_FILE; }
++static inline int pte_huge(pte_t pte) { return __pte_val(pte) & _PAGE_PSE; }
++static inline int pte_global(pte_t pte) { return 0; }
++static inline int pte_exec(pte_t pte) { return !(__pte_val(pte) & _PAGE_NX); }
++
++static inline int pmd_large(pmd_t pte) {
++ return (__pmd_val(pte) & (_PAGE_PSE|_PAGE_PRESENT)) ==
++ (_PAGE_PSE|_PAGE_PRESENT);
++}
++
++static inline pte_t pte_mkclean(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_DIRTY); }
++static inline pte_t pte_mkold(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_ACCESSED); }
++static inline pte_t pte_wrprotect(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_RW); }
++static inline pte_t pte_mkexec(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_NX); }
++static inline pte_t pte_mkdirty(pte_t pte) { return __pte_ma(__pte_val(pte) | _PAGE_DIRTY); }
++static inline pte_t pte_mkyoung(pte_t pte) { return __pte_ma(__pte_val(pte) | _PAGE_ACCESSED); }
++static inline pte_t pte_mkwrite(pte_t pte) { return __pte_ma(__pte_val(pte) | _PAGE_RW); }
++static inline pte_t pte_mkhuge(pte_t pte) { return __pte_ma(__pte_val(pte) | _PAGE_PSE); }
++static inline pte_t pte_clrhuge(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_PSE); }
++static inline pte_t pte_mkglobal(pte_t pte) { return pte; }
++static inline pte_t pte_clrglobal(pte_t pte) { return pte; }
++
++extern pteval_t __supported_pte_mask;
++
++static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
++{
++ return __pte((((phys_addr_t)page_nr << PAGE_SHIFT) |
++ pgprot_val(pgprot)) & __supported_pte_mask);
++}
++
++static inline pte_t pfn_pte_ma(unsigned long page_nr, pgprot_t pgprot)
++{
++ return __pte_ma((((phys_addr_t)page_nr << PAGE_SHIFT) |
++ pgprot_val(pgprot)) & __supported_pte_mask);
++}
++
++static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
++{
++ return __pmd((((phys_addr_t)page_nr << PAGE_SHIFT) |
++ pgprot_val(pgprot)) & __supported_pte_mask);
++}
++
++static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
++{
++ pteval_t val = pte_val(pte);
++
++ val &= _PAGE_CHG_MASK;
++ val |= pgprot_val(newprot) & __supported_pte_mask;
++
++ return __pte(val);
++}
++
++#define pte_pgprot(x) __pgprot(pte_val(x) & (0xfff | _PAGE_NX))
++
++#define canon_pgprot(p) __pgprot(pgprot_val(p) & __supported_pte_mask)
++
++#define set_pte(ptep, pte) xen_set_pte(ptep, pte)
++#define set_pte_at(mm, addr, ptep, pte) xen_set_pte_at(mm, addr, ptep, pte)
++
++#define set_pte_atomic(ptep, pte) \
++ xen_set_pte_atomic(ptep, pte)
++
++#define set_pmd(pmdp, pmd) xen_set_pmd(pmdp, pmd)
++
++#ifndef __PAGETABLE_PUD_FOLDED
++#define set_pgd(pgdp, pgd) xen_set_pgd(pgdp, pgd)
++#define pgd_clear(pgd) xen_pgd_clear(pgd)
++#endif
++
++#ifndef set_pud
++# define set_pud(pudp, pud) xen_set_pud(pudp, pud)
++#endif
++
++#ifndef __PAGETABLE_PMD_FOLDED
++#define pud_clear(pud) xen_pud_clear(pud)
++#endif
++
++#define pte_clear(mm, addr, ptep) xen_pte_clear(mm, addr, ptep)
++#define pmd_clear(pmd) xen_pmd_clear(pmd)
++
++#define pte_update(mm, addr, ptep) do { } while (0)
++#define pte_update_defer(mm, addr, ptep) do { } while (0)
++
++#endif /* __ASSEMBLY__ */
++
+ #ifdef CONFIG_X86_32
+ # include "pgtable_32.h"
+ #else
+ # include "pgtable_64.h"
+ #endif
++
++#ifndef __ASSEMBLY__
++
++enum {
++ PG_LEVEL_NONE,
++ PG_LEVEL_4K,
++ PG_LEVEL_2M,
++ PG_LEVEL_1G,
++};
++
++/*
++ * Helper function that returns the kernel pagetable entry controlling
++ * the virtual address 'address'. NULL means no pagetable entry present.
++ * NOTE: the return type is pte_t but if the pmd is PSE then we return it
++ * as a pte too.
++ */
++extern pte_t *lookup_address(unsigned long address, unsigned int *level);
++
++/* local pte updates need not use xchg for locking */
++static inline pte_t xen_local_ptep_get_and_clear(pte_t *ptep, pte_t res)
++{
++ xen_set_pte(ptep, __pte(0));
++ return res;
++}
++
++static inline void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
++ pte_t *ptep , pte_t pte)
++{
++ if ((mm != current->mm && mm != &init_mm) ||
++ HYPERVISOR_update_va_mapping(addr, pte, 0))
++ xen_set_pte(ptep, pte);
++}
++
++static inline void xen_pte_clear(struct mm_struct *mm, unsigned long addr,
++ pte_t *ptep)
++{
++ if ((mm != current->mm && mm != &init_mm)
++ || HYPERVISOR_update_va_mapping(addr, __pte(0), 0))
++ __xen_pte_clear(ptep);
++}
++
++#ifndef CONFIG_PARAVIRT
++/*
++ * Rules for using pte_update - it must be called after any PTE update which
++ * has not been done using the set_pte / clear_pte interfaces. It is used by
++ * shadow mode hypervisors to resynchronize the shadow page tables. Kernel PTE
++ * updates should either be sets, clears, or set_pte_atomic for P->P
++ * transitions, which means this hook should only be called for user PTEs.
++ * This hook implies a P->P protection or access change has taken place, which
++ * requires a subsequent TLB flush. The notification can optionally be delayed
++ * until the TLB flush event by using the pte_update_defer form of the
++ * interface, but care must be taken to assure that the flush happens while
++ * still holding the same page table lock so that the shadow and primary pages
++ * do not become out of sync on SMP.
++ */
++#define pte_update(mm, addr, ptep) do { } while (0)
++#define pte_update_defer(mm, addr, ptep) do { } while (0)
++#endif
++
++/*
++ * We only update the dirty/accessed state if we set
++ * the dirty bit by hand in the kernel, since the hardware
++ * will do the accessed bit for us, and we don't want to
++ * race with other CPU's that might be updating the dirty
++ * bit at the same time.
++ */
++#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
++#define ptep_set_access_flags(vma, address, ptep, entry, dirty) \
++({ \
++ int __changed = !pte_same(*(ptep), entry); \
++ if (__changed && (dirty)) { \
++ if ( likely((vma)->vm_mm == current->mm) ) { \
++ BUG_ON(HYPERVISOR_update_va_mapping(address, \
++ entry, \
++ (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
++ UVMF_INVLPG|UVMF_MULTI)); \
++ } else { \
++ xen_l1_entry_update(ptep, entry); \
++ flush_tlb_page(vma, address); \
++ } \
++ } \
++ __changed; \
++})
++
++#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
++#define ptep_test_and_clear_young(vma, addr, ptep) ({ \
++ int __ret = 0; \
++ if (pte_young(*(ptep))) \
++ __ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, \
++ &(ptep)->pte); \
++ if (__ret) \
++ pte_update((vma)->vm_mm, addr, ptep); \
++ __ret; \
++})
++
++#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
++#define ptep_clear_flush_young(vma, address, ptep) \
++({ \
++ pte_t __pte = *(ptep); \
++ int __young = pte_young(__pte); \
++ __pte = pte_mkold(__pte); \
++ if (PagePinned(virt_to_page((vma)->vm_mm->pgd))) \
++ (void)ptep_set_access_flags(vma, address, ptep, __pte, __young); \
++ else if (__young) \
++ (ptep)->pte_low = __pte.pte_low; \
++ __young; \
++})
++
++#define __HAVE_ARCH_PTEP_CLEAR_FLUSH
++#define ptep_clear_flush(vma, addr, ptep) \
++({ \
++ pte_t *__ptep = (ptep); \
++ pte_t __res = *__ptep; \
++ if (!pte_none(__res) && \
++ ((vma)->vm_mm != current->mm || \
++ HYPERVISOR_update_va_mapping(addr, __pte(0), \
++ (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
++ UVMF_INVLPG|UVMF_MULTI))) { \
++ __xen_pte_clear(__ptep); \
++ flush_tlb_page(vma, addr); \
++ } \
++ __res; \
++})
++
++#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
++static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
++{
++ pte_t pte = *ptep;
++ if (!pte_none(pte)
++ && (mm != &init_mm
++ || HYPERVISOR_update_va_mapping(addr, __pte(0), 0))) {
++ pte = xen_ptep_get_and_clear(ptep, pte);
++ pte_update(mm, addr, ptep);
++ }
++ return pte;
++}
++
++#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
++#define ptep_get_and_clear_full(mm, addr, ptep, full) \
++ ((full) ? ({ \
++ pte_t *__ptep = (ptep); \
++ pte_t __res = *__ptep; \
++ if (!PagePinned(virt_to_page((mm)->pgd))) \
++ __xen_pte_clear(__ptep); \
++ else if (!pte_none(__res)) \
++ xen_l1_entry_update(__ptep, __pte(0)); \
++ __res; \
++ }) : \
++ ptep_get_and_clear(mm, addr, ptep))
++
++pte_t xen_ptep_get_and_clear_full(struct vm_area_struct *, unsigned long, pte_t *, int);
++
++#define __HAVE_ARCH_PTEP_SET_WRPROTECT
++static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
++{
++ pte_t pte = *ptep;
++ if (pte_write(pte))
++ set_pte_at(mm, addr, ptep, pte_wrprotect(pte));
++}
++
++#define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) \
++ xen_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable)
++
++#define arbitrary_virt_to_machine(va) \
++({ \
++ unsigned int __lvl; \
++ pte_t *__ptep = lookup_address((unsigned long)(va), &__lvl); \
++ BUG_ON(!__ptep || __lvl != PG_LEVEL_4K || !pte_present(*__ptep));\
++ (((maddr_t)pte_mfn(*__ptep) << PAGE_SHIFT) \
++ | ((unsigned long)(va) & (PAGE_SIZE - 1))); \
++})
++
++#ifdef CONFIG_HIGHPTE
++#include <asm/io.h>
++struct page *kmap_atomic_to_page(void *);
++#define ptep_to_machine(ptep) \
++({ \
++ pte_t *__ptep = (ptep); \
++ page_to_phys(kmap_atomic_to_page(__ptep)) \
++ | ((unsigned long)__ptep & (PAGE_SIZE - 1)); \
++})
++#else
++#define ptep_to_machine(ptep) virt_to_machine(ptep)
++#endif
++
++#include <asm-generic/pgtable.h>
++
++#include <xen/features.h>
++void make_page_readonly(void *va, unsigned int feature);
++void make_page_writable(void *va, unsigned int feature);
++void make_pages_readonly(void *va, unsigned int nr, unsigned int feature);
++void make_pages_writable(void *va, unsigned int nr, unsigned int feature);
++
++struct vm_area_struct;
++
++int direct_remap_pfn_range(struct vm_area_struct *vma,
++ unsigned long address,
++ unsigned long mfn,
++ unsigned long size,
++ pgprot_t prot,
++ domid_t domid);
++int direct_kernel_remap_pfn_range(unsigned long address,
++ unsigned long mfn,
++ unsigned long size,
++ pgprot_t prot,
++ domid_t domid);
++int create_lookup_pte_addr(struct mm_struct *mm,
++ unsigned long address,
++ uint64_t *ptep);
++int touch_pte_range(struct mm_struct *mm,
++ unsigned long address,
++ unsigned long size);
++
++int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
++ unsigned long addr, unsigned long end, pgprot_t newprot,
++ int dirty_accountable);
++
++#endif /* __ASSEMBLY__ */
++
++#endif /* _ASM_X86_PGTABLE_H */
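
pte_modify() above implements a simple contract: the frame number and the sticky bits collected in _PAGE_CHG_MASK survive, and everything else is replaced by the new protection. A standalone sketch (the __supported_pte_mask clamp is omitted, and the bit values mirror the definitions earlier in this header):

#include <stdio.h>

typedef unsigned long pteval_t;

#define PTE_MASK        0x000ffffffffff000UL    /* frame-number bits */
#define _PAGE_PRESENT   (1UL << 0)
#define _PAGE_RW        (1UL << 1)
#define _PAGE_ACCESSED  (1UL << 5)
#define _PAGE_DIRTY     (1UL << 6)
#define _PAGE_IO        (1UL << 9)
#define _PAGE_CHG_MASK  (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_IO)

static pteval_t pte_modify(pteval_t val, pteval_t newprot)
{
        val &= _PAGE_CHG_MASK;  /* keep frame number + sticky bits */
        val |= newprot;         /* apply the new protection */
        return val;
}

int main(void)
{
        pteval_t pte = 0x1234000UL | _PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY;

        /* Write-protect: RW is dropped, DIRTY and the frame survive. */
        printf("%#lx -> %#lx\n", pte, pte_modify(pte, _PAGE_PRESENT));
        return 0;
}
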
+--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/pgtable-3level.h 2009-02-16 16:17:21.000000000 +0100
++++ sle11-2009-06-29/include/asm-x86/mach-xen/asm/pgtable-3level.h 2009-03-16 16:33:40.000000000 +0100
+@@ -18,16 +18,18 @@
+ printk("%s:%d: bad pgd %p(%016Lx pfn %08Lx).\n", __FILE__, __LINE__, \
+ &(e), __pgd_val(e), (pgd_val(e) & PTE_MASK) >> PAGE_SHIFT)
+
+-#define pud_none(pud) 0
+-#define pud_bad(pud) 0
+-#define pud_present(pud) 1
+
+-/*
+- * All present pages with !NX bit are kernel-executable:
+- */
+-static inline int pte_exec_kernel(pte_t pte)
++static inline int pud_none(pud_t pud)
++{
++ return __pud_val(pud) == 0;
++}
++static inline int pud_bad(pud_t pud)
+ {
+- return !(__pte_val(pte) & _PAGE_NX);
++ return (__pud_val(pud) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER)) != 0;
++}
++static inline int pud_present(pud_t pud)
++{
++ return __pud_val(pud) & _PAGE_PRESENT;
+ }
+
+ /* Rules for using set_pte: the pte being assigned *must* be
+@@ -44,14 +46,6 @@ static inline void xen_set_pte(pte_t *pt
+ ptep->pte_low = pte.pte_low;
+ }
+
+-static inline void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
+- pte_t *ptep , pte_t pte)
+-{
+- if ((mm != current->mm && mm != &init_mm) ||
+- HYPERVISOR_update_va_mapping(addr, pte, 0))
+- xen_set_pte(ptep, pte);
+-}
+-
+ static inline void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
+ {
+ set_64bit((unsigned long long *)(ptep),__pte_val(pte));
+@@ -70,14 +64,11 @@ static inline void xen_set_pud(pud_t *pu
+ * entry, so clear the bottom half first and enforce ordering with a compiler
+ * barrier.
+ */
+-static inline void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
++static inline void __xen_pte_clear(pte_t *ptep)
+ {
+- if ((mm != current->mm && mm != &init_mm)
+- || HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) {
+- ptep->pte_low = 0;
+- smp_wmb();
+- ptep->pte_high = 0;
+- }
++ ptep->pte_low = 0;
++ smp_wmb();
++ ptep->pte_high = 0;
+ }
+
+ static inline void xen_pmd_clear(pmd_t *pmd)
+@@ -85,21 +76,25 @@ static inline void xen_pmd_clear(pmd_t *
+ xen_l2_entry_update(pmd, __pmd(0));
+ }
+
+-#define set_pte(ptep, pte) xen_set_pte(ptep, pte)
+-#define set_pte_at(mm, addr, ptep, pte) xen_set_pte_at(mm, addr, ptep, pte)
+-#define set_pte_atomic(ptep, pte) xen_set_pte_atomic(ptep, pte)
+-#define set_pmd(pmdp, pmd) xen_set_pmd(pmdp, pmd)
+-#define set_pud(pudp, pud) xen_set_pud(pudp, pud)
+-#define pte_clear(mm, addr, ptep) xen_pte_clear(mm, addr, ptep)
+-#define pmd_clear(pmd) xen_pmd_clear(pmd)
++static inline void pud_clear(pud_t *pudp)
++{
++ pgdval_t pgd;
++
++ set_pud(pudp, __pud(0));
+
+-/*
+- * Pentium-II erratum A13: in PAE mode we explicitly have to flush
+- * the TLB via cr3 if the top-level pgd is changed...
+- * We do not let the generic code free and clear pgd entries due to
+- * this erratum.
+- */
+-static inline void pud_clear (pud_t * pud) { }
++ /*
++ * According to Intel App note "TLBs, Paging-Structure Caches,
++ * and Their Invalidation", April 2007, document 317080-001,
++ * section 8.1: in PAE mode we explicitly have to flush the
++ * TLB via cr3 if the top-level pgd is changed...
++ *
++ * Make sure the pud entry we're updating is within the
++ * current pgd to avoid unnecessary TLB flushes.
++ */
++ pgd = read_cr3();
++ if (__pa(pudp) >= pgd && __pa(pudp) < (pgd + sizeof(pgd_t)*PTRS_PER_PGD))
++ xen_tlb_flush();
++}
+
+ #define pud_page(pud) \
+ ((struct page *) __va(pud_val(pud) & PAGE_MASK))
+@@ -128,24 +123,6 @@ static inline pte_t xen_ptep_get_and_cle
+ #define xen_ptep_get_and_clear(xp, pte) xen_local_ptep_get_and_clear(xp, pte)
+ #endif
+
+-#define __HAVE_ARCH_PTEP_CLEAR_FLUSH
+-#define ptep_clear_flush(vma, addr, ptep) \
+-({ \
+- pte_t *__ptep = (ptep); \
+- pte_t __res = *__ptep; \
+- if (!pte_none(__res) && \
+- ((vma)->vm_mm != current->mm || \
+- HYPERVISOR_update_va_mapping(addr, __pte(0), \
+- (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
+- UVMF_INVLPG|UVMF_MULTI))) { \
+- __ptep->pte_low = 0; \
+- smp_wmb(); \
+- __ptep->pte_high = 0; \
+- flush_tlb_page(vma, addr); \
+- } \
+- __res; \
+-})
+-
+ #define __HAVE_ARCH_PTE_SAME
+ static inline int pte_same(pte_t a, pte_t b)
+ {
+@@ -168,26 +145,12 @@ static inline int pte_none(pte_t pte)
+ mfn_to_local_pfn(__pte_mfn(_pte)) : \
+ __pte_mfn(_pte))
+
+-extern unsigned long long __supported_pte_mask;
+-
+-static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
+-{
+- return __pte((((unsigned long long)page_nr << PAGE_SHIFT) |
+- pgprot_val(pgprot)) & __supported_pte_mask);
+-}
+-
+-static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
+-{
+- return __pmd((((unsigned long long)page_nr << PAGE_SHIFT) |
+- pgprot_val(pgprot)) & __supported_pte_mask);
+-}
+-
+ /*
+ * Bits 0, 6 and 7 are taken in the low part of the pte,
+ * put the 32 bits of offset into the high part.
+ */
+ #define pte_to_pgoff(pte) ((pte).pte_high)
+-#define pgoff_to_pte(off) ((pte_t) { _PAGE_FILE, (off) })
++#define pgoff_to_pte(off) ((pte_t) { { .pte_low = _PAGE_FILE, .pte_high = (off) } })
+ #define PTE_FILE_MAX_BITS 32
+
+ /* Encode and de-code a swap entry */
+@@ -195,8 +158,6 @@ static inline pmd_t pfn_pmd(unsigned lon
+ #define __swp_offset(x) ((x).val >> 5)
+ #define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) << 5})
+ #define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high })
+-#define __swp_entry_to_pte(x) ((pte_t){ 0, (x).val })
+-
+-#define __pmd_free_tlb(tlb, x) do { } while (0)
++#define __swp_entry_to_pte(x) ((pte_t){ { .pte_high = (x).val } })
+
+ #endif /* _I386_PGTABLE_3LEVEL_H */
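
The new __xen_pte_clear() above is the ordering-sensitive part of PAE: a pte spans two 32-bit words and the hardware walker may read them at any instant, so the half holding _PAGE_PRESENT is cleared first and a write barrier separates the two stores, leaving no window where "present" pairs with a half-cleared high word. A small C11 model of the same discipline; the release fence stands in for smp_wmb(), and a real concurrent reader would need a matching acquire:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct pae_pte { _Atomic uint32_t low, high; };

static void pte_clear(struct pae_pte *p)
{
        /* Clear the half containing _PAGE_PRESENT first... */
        atomic_store_explicit(&p->low, 0, memory_order_relaxed);
        /* ...then fence (smp_wmb() in the kernel) before the high half,
         * so no observer sees a present low half with a torn high word. */
        atomic_thread_fence(memory_order_release);
        atomic_store_explicit(&p->high, 0, memory_order_relaxed);
}

int main(void)
{
        struct pae_pte pte;

        atomic_init(&pte.low, 0x1063);    /* present pte: flags in low half */
        atomic_init(&pte.high, 0x12345);  /* upper pfn bits */
        pte_clear(&pte);
        printf("%#x %#x\n", (unsigned)atomic_load(&pte.low),
               (unsigned)atomic_load(&pte.high));
        return 0;
}
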
+--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/pgtable_32.h 2009-02-16 16:18:36.000000000 +0100
++++ sle11-2009-06-29/include/asm-x86/mach-xen/asm/pgtable_32.h 2009-03-16 16:33:40.000000000 +0100
+@@ -1,8 +1,6 @@
+ #ifndef _I386_PGTABLE_H
+ #define _I386_PGTABLE_H
+
+-#include <asm/hypervisor.h>
+-
+ /*
+ * The Linux memory management assumes a three-level page table setup. On
+ * the i386, we use that, but "fold" the mid level into the top-level page
+@@ -25,20 +23,10 @@
+
+ struct vm_area_struct;
+
+-/*
+- * ZERO_PAGE is a global shared page that is always zero: used
+- * for zero-mapped memory areas etc..
+- */
+-#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
+-extern unsigned long empty_zero_page[1024];
+ extern pgd_t *swapper_pg_dir;
+-extern struct kmem_cache *pmd_cache;
+-extern spinlock_t pgd_lock;
+-extern struct page *pgd_list;
+-void check_pgt_cache(void);
+
+-void pmd_ctor(struct kmem_cache *, void *);
+-void pgtable_cache_init(void);
++static inline void pgtable_cache_init(void) { }
++static inline void check_pgt_cache(void) { }
+ void paging_init(void);
+
+
+@@ -58,16 +46,9 @@ void paging_init(void);
+ #define PGDIR_SIZE (1UL << PGDIR_SHIFT)
+ #define PGDIR_MASK (~(PGDIR_SIZE-1))
+
+-#define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE)
+-#define FIRST_USER_ADDRESS 0
+-
+ #define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT)
+ #define KERNEL_PGD_PTRS (PTRS_PER_PGD-USER_PGD_PTRS)
+
+-#define TWOLEVEL_PGDIR_SHIFT 22
+-#define BOOT_USER_PGD_PTRS (__PAGE_OFFSET >> TWOLEVEL_PGDIR_SHIFT)
+-#define BOOT_KERNEL_PGD_PTRS (1024-BOOT_USER_PGD_PTRS)
+-
+ /* Just any arbitrary offset to the start of the vmalloc VM area: the
+ * current 8MB value just means that there will be an 8MB "hole" after the
+ * physical memory until the kernel virtual memory starts. That means that
+@@ -78,121 +59,19 @@ void paging_init(void);
+ #define VMALLOC_OFFSET (8*1024*1024)
+ #define VMALLOC_START (((unsigned long) high_memory + \
+ 2*VMALLOC_OFFSET-1) & ~(VMALLOC_OFFSET-1))
+-#ifdef CONFIG_HIGHMEM
+-# define VMALLOC_END (PKMAP_BASE-2*PAGE_SIZE)
+-#else
+-# define VMALLOC_END (FIXADDR_START-2*PAGE_SIZE)
+-#endif
+-
+-/*
+- * _PAGE_PSE set in the page directory entry just means that
+- * the page directory entry points directly to a 4MB-aligned block of
+- * memory.
+- */
+-#define _PAGE_BIT_PRESENT 0
+-#define _PAGE_BIT_RW 1
+-#define _PAGE_BIT_USER 2
+-#define _PAGE_BIT_PWT 3
+-#define _PAGE_BIT_PCD 4
+-#define _PAGE_BIT_ACCESSED 5
+-#define _PAGE_BIT_DIRTY 6
+-#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page, Pentium+, if present.. */
+-#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
+-/*#define _PAGE_BIT_UNUSED1 9*/ /* available for programmer */
+-#define _PAGE_BIT_UNUSED2 10
+-#define _PAGE_BIT_UNUSED3 11
+-#define _PAGE_BIT_NX 63
+-
+-#define _PAGE_PRESENT 0x001
+-#define _PAGE_RW 0x002
+-#define _PAGE_USER 0x004
+-#define _PAGE_PWT 0x008
+-#define _PAGE_PCD 0x010
+-#define _PAGE_ACCESSED 0x020
+-#define _PAGE_DIRTY 0x040
+-#define _PAGE_PSE 0x080 /* 4 MB (or 2MB) page, Pentium+, if present.. */
+-#define _PAGE_GLOBAL 0x100 /* Global TLB entry PPro+ */
+-/*#define _PAGE_UNUSED1 0x200*/ /* available for programmer */
+-#define _PAGE_UNUSED2 0x400
+-#define _PAGE_UNUSED3 0x800
+-
+-/* If _PAGE_PRESENT is clear, we use these: */
+-#define _PAGE_FILE 0x040 /* nonlinear file mapping, saved PTE; unset:swap */
+-#define _PAGE_PROTNONE 0x080 /* if the user mapped it with PROT_NONE;
+- pte_present gives true */
+ #ifdef CONFIG_X86_PAE
+-#define _PAGE_NX (1ULL<<_PAGE_BIT_NX)
++#define LAST_PKMAP 512
+ #else
+-#define _PAGE_NX 0
++#define LAST_PKMAP 1024
+ #endif
+
+-/* Mapped page is I/O or foreign and has no associated page struct. */
+-#define _PAGE_IO 0x200
++#define PKMAP_BASE ((FIXADDR_BOOT_START - PAGE_SIZE*(LAST_PKMAP + 1)) & PMD_MASK)
+
+-#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
+-#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+-#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_IO)
+-
+-#define PAGE_NONE \
+- __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
+-#define PAGE_SHARED \
+- __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
+-
+-#define PAGE_SHARED_EXEC \
+- __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
+-#define PAGE_COPY_NOEXEC \
+- __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
+-#define PAGE_COPY_EXEC \
+- __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
+-#define PAGE_COPY \
+- PAGE_COPY_NOEXEC
+-#define PAGE_READONLY \
+- __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
+-#define PAGE_READONLY_EXEC \
+- __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
+-
+-#define _PAGE_KERNEL \
+- (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX)
+-#define _PAGE_KERNEL_EXEC \
+- (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED)
+-
+-extern unsigned long long __PAGE_KERNEL, __PAGE_KERNEL_EXEC;
+-#define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW)
+-#define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW)
+-#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD)
+-#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE)
+-#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE)
+-
+-#define PAGE_KERNEL __pgprot(__PAGE_KERNEL)
+-#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO)
+-#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC)
+-#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX)
+-#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE)
+-#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE)
+-#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC)
+-
+-/*
+- * The i386 can't do page protection for execute, and considers that
+- * the same are read. Also, write permissions imply read permissions.
+- * This is the closest we can get..
+- */
+-#define __P000 PAGE_NONE
+-#define __P001 PAGE_READONLY
+-#define __P010 PAGE_COPY
+-#define __P011 PAGE_COPY
+-#define __P100 PAGE_READONLY_EXEC
+-#define __P101 PAGE_READONLY_EXEC
+-#define __P110 PAGE_COPY_EXEC
+-#define __P111 PAGE_COPY_EXEC
+-
+-#define __S000 PAGE_NONE
+-#define __S001 PAGE_READONLY
+-#define __S010 PAGE_SHARED
+-#define __S011 PAGE_SHARED
+-#define __S100 PAGE_READONLY_EXEC
+-#define __S101 PAGE_READONLY_EXEC
+-#define __S110 PAGE_SHARED_EXEC
+-#define __S111 PAGE_SHARED_EXEC
++#ifdef CONFIG_HIGHMEM
++# define VMALLOC_END (PKMAP_BASE-2*PAGE_SIZE)
++#else
++# define VMALLOC_END (FIXADDR_START-2*PAGE_SIZE)
++#endif
+
+ /*
+ * Define this if things work differently on an i386 and an i486:
+@@ -221,28 +100,6 @@ extern unsigned long pg0[];
+
+ #define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))
+
+-/*
+- * The following only work if pte_present() is true.
+- * Undefined behaviour if not..
+- */
+-static inline int pte_dirty(pte_t pte) { return (pte).pte_low & _PAGE_DIRTY; }
+-static inline int pte_young(pte_t pte) { return (pte).pte_low & _PAGE_ACCESSED; }
+-static inline int pte_write(pte_t pte) { return (pte).pte_low & _PAGE_RW; }
+-static inline int pte_huge(pte_t pte) { return (pte).pte_low & _PAGE_PSE; }
+-
+-/*
+- * The following only works if pte_present() is not true.
+- */
+-static inline int pte_file(pte_t pte) { return (pte).pte_low & _PAGE_FILE; }
+-
+-static inline pte_t pte_mkclean(pte_t pte) { (pte).pte_low &= ~_PAGE_DIRTY; return pte; }
+-static inline pte_t pte_mkold(pte_t pte) { (pte).pte_low &= ~_PAGE_ACCESSED; return pte; }
+-static inline pte_t pte_wrprotect(pte_t pte) { (pte).pte_low &= ~_PAGE_RW; return pte; }
+-static inline pte_t pte_mkdirty(pte_t pte) { (pte).pte_low |= _PAGE_DIRTY; return pte; }
+-static inline pte_t pte_mkyoung(pte_t pte) { (pte).pte_low |= _PAGE_ACCESSED; return pte; }
+-static inline pte_t pte_mkwrite(pte_t pte) { (pte).pte_low |= _PAGE_RW; return pte; }
+-static inline pte_t pte_mkhuge(pte_t pte) { (pte).pte_low |= _PAGE_PSE; return pte; }
+-
+ #ifdef CONFIG_X86_PAE
+ # include <asm/pgtable-3level.h>
+ #else
+@@ -250,111 +107,6 @@ static inline pte_t pte_mkhuge(pte_t pte
+ #endif
+
+ /*
+- * Rules for using pte_update - it must be called after any PTE update which
+- * has not been done using the set_pte / clear_pte interfaces. It is used by
+- * shadow mode hypervisors to resynchronize the shadow page tables. Kernel PTE
+- * updates should either be sets, clears, or set_pte_atomic for P->P
+- * transitions, which means this hook should only be called for user PTEs.
+- * This hook implies a P->P protection or access change has taken place, which
+- * requires a subsequent TLB flush. The notification can optionally be delayed
+- * until the TLB flush event by using the pte_update_defer form of the
+- * interface, but care must be taken to assure that the flush happens while
+- * still holding the same page table lock so that the shadow and primary pages
+- * do not become out of sync on SMP.
+- */
+-#define pte_update(mm, addr, ptep) do { } while (0)
+-#define pte_update_defer(mm, addr, ptep) do { } while (0)
+-
+-/* local pte updates need not use xchg for locking */
+-static inline pte_t xen_local_ptep_get_and_clear(pte_t *ptep, pte_t res)
+-{
+- xen_set_pte(ptep, __pte(0));
+- return res;
+-}
+-
+-/*
+- * We only update the dirty/accessed state if we set
+- * the dirty bit by hand in the kernel, since the hardware
+- * will do the accessed bit for us, and we don't want to
+- * race with other CPU's that might be updating the dirty
+- * bit at the same time.
+- */
+-#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
+-#define ptep_set_access_flags(vma, address, ptep, entry, dirty) \
+-({ \
+- int __changed = !pte_same(*(ptep), entry); \
+- if (__changed && (dirty)) { \
+- if ( likely((vma)->vm_mm == current->mm) ) { \
+- BUG_ON(HYPERVISOR_update_va_mapping(address, \
+- entry, \
+- (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
+- UVMF_INVLPG|UVMF_MULTI)); \
+- } else { \
+- xen_l1_entry_update(ptep, entry); \
+- flush_tlb_page(vma, address); \
+- } \
+- } \
+- __changed; \
+-})
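As the removed macro shows, ptep_set_access_flags() takes one of two paths: for the current mm a single HYPERVISOR_update_va_mapping() call both updates the PTE and flushes every CPU in cpu_vm_mask, while for a foreign mm it falls back to xen_l1_entry_update() plus an explicit flush_tlb_page(). A hedged sketch of a typical caller (the function name is hypothetical):

static int soften_fault(struct vm_area_struct *vma, unsigned long address,
                        pte_t *ptep)
{
        pte_t entry = pte_mkdirty(pte_mkyoung(*ptep));

        /* returns non-zero (and flushes) only if the PTE changed and the
         * dirty bit is being set by hand, per the comment above */
        return ptep_set_access_flags(vma, address, ptep, entry, 1);
}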
+-
+-#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
+-#define ptep_test_and_clear_young(vma, addr, ptep) ({ \
+- int __ret = 0; \
+- if (pte_young(*(ptep))) \
+- __ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, \
+- &(ptep)->pte_low); \
+- if (__ret) \
+- pte_update((vma)->vm_mm, addr, ptep); \
+- __ret; \
+-})
+-
+-#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
+-#define ptep_clear_flush_young(vma, address, ptep) \
+-({ \
+- pte_t __pte = *(ptep); \
+- int __young = pte_young(__pte); \
+- __pte = pte_mkold(__pte); \
+- if (PagePinned(virt_to_page((vma)->vm_mm->pgd))) \
+- (void)ptep_set_access_flags(vma, address, ptep, __pte, __young); \
+- else if (__young) \
+- (ptep)->pte_low = __pte.pte_low; \
+- __young; \
+-})
+-
+-#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
+-static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+-{
+- pte_t pte = *ptep;
+- if (!pte_none(pte)
+- && (mm != &init_mm
+- || HYPERVISOR_update_va_mapping(addr, __pte(0), 0))) {
+- pte = xen_ptep_get_and_clear(ptep, pte);
+- pte_update(mm, addr, ptep);
+- }
+- return pte;
+-}
+-
+-#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
+-#define ptep_get_and_clear_full(mm, addr, ptep, full) \
+- ((full) ? ({ \
+- pte_t __res = *(ptep); \
+- if (PagePinned(virt_to_page((mm)->pgd))) \
+- xen_l1_entry_update(ptep, __pte(0)); \
+- else \
+- *(ptep) = __pte(0); \
+- __res; \
+- }) : \
+- ptep_get_and_clear(mm, addr, ptep))
+-
+-#define __HAVE_ARCH_PTEP_SET_WRPROTECT
+-static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+-{
+- pte_t pte = *ptep;
+- if (pte_write(pte))
+- set_pte_at(mm, addr, ptep, pte_wrprotect(pte));
+-}
+-
+-/*
+ * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
+ *
+ * dst - pointer to pgd range anywhere on a pgd page
+@@ -383,26 +135,6 @@ static inline void clone_pgd_range(pgd_t
+
+ #define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
+
+-static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
+-{
+- /*
+- * Since this might change the present bit (which controls whether
+- * a pte_t object has undergone p2m translation), we must use
+- * pte_val() on the input pte and __pte() for the return value.
+- */
+- paddr_t pteval = pte_val(pte);
+-
+- pteval &= _PAGE_CHG_MASK;
+- pteval |= pgprot_val(newprot);
+-#ifdef CONFIG_X86_PAE
+- pteval &= __supported_pte_mask;
+-#endif
+- return __pte(pteval);
+-}
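The pte_modify() being removed here keeps the frame and accessed/dirty bits (_PAGE_CHG_MASK) and replaces only the protection bits, going through pte_val()/__pte() because the present bit controls p2m translation. The masking itself can be checked in plain userspace C (flag values copied from this header; PTE_MASK simplified for illustration):

#include <stdio.h>

#define _PAGE_PRESENT  0x001ULL
#define _PAGE_RW       0x002ULL
#define _PAGE_ACCESSED 0x020ULL
#define _PAGE_DIRTY    0x040ULL
#define PTE_MASK       (~0xfffULL)      /* frame bits, simplified */
#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY)

static unsigned long long modify(unsigned long long pteval,
                                 unsigned long long newprot)
{
        return (pteval & _PAGE_CHG_MASK) | newprot;
}

int main(void)
{
        unsigned long long pte = 0x12345000ULL | _PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY;
        /* make the page read-only: frame and dirty bit survive, RW is gone */
        printf("%#llx -> %#llx\n", pte, modify(pte, _PAGE_PRESENT));
        return 0;
}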
+-
+-#define pmd_large(pmd) \
+-((__pmd_val(pmd) & (_PAGE_PSE|_PAGE_PRESENT)) == (_PAGE_PSE|_PAGE_PRESENT))
+-
+ /*
+ * the pgd page can be thought of an array like this: pgd_t[PTRS_PER_PGD]
+ *
+@@ -424,6 +156,8 @@ static inline pte_t pte_modify(pte_t pte
+ */
+ #define pgd_offset_k(address) pgd_offset(&init_mm, address)
+
++static inline int pud_large(pud_t pud) { return 0; }
++
+ /*
+ * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD]
+ *
+@@ -449,26 +183,6 @@ static inline pte_t pte_modify(pte_t pte
+ #define pmd_page_vaddr(pmd) \
+ ((unsigned long) __va(pmd_val(pmd) & PAGE_MASK))
+
+-/*
+- * Helper function that returns the kernel pagetable entry controlling
+- * the virtual address 'address'. NULL means no pagetable entry present.
+- * NOTE: the return type is pte_t but if the pmd is PSE then we return it
+- * as a pte too.
+- */
+-extern pte_t *lookup_address(unsigned long address);
+-
+-/*
+- * Make a given kernel text page executable/non-executable.
+- * Returns the previous executability setting of that page (which
+- * is used to restore the previous state). Used by the SMP bootup code.
+- * NOTE: this is an __init function for security reasons.
+- */
+-#ifdef CONFIG_X86_PAE
+- extern int set_kernel_exec(unsigned long vaddr, int enable);
+-#else
+- static inline int set_kernel_exec(unsigned long vaddr, int enable) { return 0;}
+-#endif
+-
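The removed comment explains the set_kernel_exec() contract: it flips the executability of one kernel text page and hands back the previous setting so it can be restored, which is how the SMP bootup code briefly makes the trampoline page executable. A hypothetical caller, following that contract:

static __init void exec_window(unsigned long vaddr)
{
        int was_exec = set_kernel_exec(vaddr, 1);  /* returns previous state */

        /* ... run code at vaddr, e.g. start an AP via the trampoline ... */

        set_kernel_exec(vaddr, was_exec);          /* restore, don't assume 0 */
}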
+ #if defined(CONFIG_HIGHPTE)
+ #define pte_offset_map(dir, address) \
+ ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE0) + pte_index(address))
+@@ -496,72 +210,22 @@ extern pte_t *lookup_address(unsigned lo
+ */
+ #define update_mmu_cache(vma,address,pte) do { } while (0)
+
+-#include <xen/features.h>
+ void make_lowmem_page_readonly(void *va, unsigned int feature);
+ void make_lowmem_page_writable(void *va, unsigned int feature);
+-void make_page_readonly(void *va, unsigned int feature);
+-void make_page_writable(void *va, unsigned int feature);
+-void make_pages_readonly(void *va, unsigned int nr, unsigned int feature);
+-void make_pages_writable(void *va, unsigned int nr, unsigned int feature);
+-
+-#define virt_to_ptep(va) \
+-({ \
+- pte_t *__ptep = lookup_address((unsigned long)(va)); \
+- BUG_ON(!__ptep || !pte_present(*__ptep)); \
+- __ptep; \
+-})
+-
+-#define arbitrary_virt_to_machine(va) \
+- (((maddr_t)pte_mfn(*virt_to_ptep(va)) << PAGE_SHIFT) \
+- | ((unsigned long)(va) & (PAGE_SIZE - 1)))
+-
+-#ifdef CONFIG_HIGHPTE
+-#include <asm/io.h>
+-struct page *kmap_atomic_to_page(void *);
+-#define ptep_to_machine(ptep) \
+-({ \
+- pte_t *__ptep = (ptep); \
+- page_to_phys(kmap_atomic_to_page(__ptep)) \
+- | ((unsigned long)__ptep & (PAGE_SIZE - 1)); \
+-})
+-#else
+-#define ptep_to_machine(ptep) virt_to_machine(ptep)
+-#endif
+
+ #endif /* !__ASSEMBLY__ */
+
++/*
++ * kern_addr_valid() is (1) for FLATMEM and (0) for
++ * SPARSEMEM and DISCONTIGMEM
++ */
+ #ifdef CONFIG_FLATMEM
+ #define kern_addr_valid(addr) (1)
+-#endif /* CONFIG_FLATMEM */
+-
+-int direct_remap_pfn_range(struct vm_area_struct *vma,
+- unsigned long address,
+- unsigned long mfn,
+- unsigned long size,
+- pgprot_t prot,
+- domid_t domid);
+-int direct_kernel_remap_pfn_range(unsigned long address,
+- unsigned long mfn,
+- unsigned long size,
+- pgprot_t prot,
+- domid_t domid);
+-int create_lookup_pte_addr(struct mm_struct *mm,
+- unsigned long address,
+- uint64_t *ptep);
+-int touch_pte_range(struct mm_struct *mm,
+- unsigned long address,
+- unsigned long size);
+-
+-int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
+- unsigned long addr, unsigned long end, pgprot_t newprot,
+- int dirty_accountable);
+-
+-#define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) \
+- xen_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable)
++#else
++#define kern_addr_valid(kaddr) (0)
++#endif
+
+ #define io_remap_pfn_range(vma,from,pfn,size,prot) \
+ direct_remap_pfn_range(vma,from,pfn,size,prot,DOMID_IO)
+
+-#include <asm-generic/pgtable.h>
+-
+ #endif /* _I386_PGTABLE_H */
+--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/pgtable_64.h 2009-02-16 16:18:36.000000000 +0100
++++ sle11-2009-06-29/include/asm-x86/mach-xen/asm/pgtable_64.h 2009-03-16 16:33:40.000000000 +0100
+@@ -13,49 +13,26 @@
+ #include <linux/threads.h>
+ #include <linux/sched.h>
+ #include <asm/pda.h>
+-#ifdef CONFIG_XEN
+-#include <asm/hypervisor.h>
+
++#ifdef CONFIG_XEN
+ extern pud_t level3_user_pgt[512];
+
+ extern void xen_init_pt(void);
+-
+-extern pte_t *lookup_address(unsigned long address);
+-
+-#define virt_to_ptep(va) \
+-({ \
+- pte_t *__ptep = lookup_address((unsigned long)(va)); \
+- BUG_ON(!__ptep || !pte_present(*__ptep)); \
+- __ptep; \
+-})
+-
+-#define arbitrary_virt_to_machine(va) \
+- (((maddr_t)pte_mfn(*virt_to_ptep(va)) << PAGE_SHIFT) \
+- | ((unsigned long)(va) & (PAGE_SIZE - 1)))
+-
+-#define ptep_to_machine(ptep) virt_to_machine(ptep)
+ #endif
+
+ extern pud_t level3_kernel_pgt[512];
+ extern pud_t level3_ident_pgt[512];
+ extern pmd_t level2_kernel_pgt[512];
+ extern pgd_t init_level4_pgt[];
+-extern unsigned long __supported_pte_mask;
+
+ #define swapper_pg_dir init_level4_pgt
+
+ extern void paging_init(void);
+-extern void clear_kernel_mapping(unsigned long addr, unsigned long size);
+-
+-/*
+- * ZERO_PAGE is a global shared page that is always zero: used
+- * for zero-mapped memory areas etc..
+- */
+-extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)];
+-#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
+
+ #endif /* !__ASSEMBLY__ */
+
++#define SHARED_KERNEL_PMD 1
++
+ /*
+ * PGDIR_SHIFT determines what a top-level page table entry can map
+ */
+@@ -98,31 +75,63 @@ extern unsigned long empty_zero_page[PAG
+ #define pgd_none(x) (!__pgd_val(x))
+ #define pud_none(x) (!__pud_val(x))
+
+-static inline void set_pte(pte_t *dst, pte_t val)
++struct mm_struct;
++
++#define __xen_pte_clear(ptep) xen_set_pte(ptep, __pte(0))
++
++static inline void xen_set_pte(pte_t *ptep, pte_t pte)
++{
++ *ptep = pte;
++}
++
++static inline void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
+ {
+- *dst = val;
++ xen_set_pte(ptep, pte);
+ }
+
+-#define set_pmd(pmdptr, pmdval) xen_l2_entry_update(pmdptr, (pmdval))
+-#define set_pud(pudptr, pudval) xen_l3_entry_update(pudptr, (pudval))
+-#define set_pgd(pgdptr, pgdval) xen_l4_entry_update(pgdptr, (pgdval))
++#ifdef CONFIG_SMP
++static inline pte_t xen_ptep_get_and_clear(pte_t *xp, pte_t ret)
++{
++ return __pte_ma(xchg(&xp->pte, 0));
++}
++#else
++#define xen_ptep_get_and_clear(xp, pte) xen_local_ptep_get_and_clear(xp, pte)
++#endif
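On SMP the clear has to be a single atomic exchange: between a plain read of *ptep and the store of zero, another CPU (or the hardware page walker) could set the accessed/dirty bits, and that update would be lost. The same property can be demonstrated in userspace with the GCC atomic builtin:

#include <stdio.h>

static unsigned long pte_word;   /* stands in for a live PTE */

static unsigned long get_and_clear(unsigned long *p)
{
        /* one indivisible read-and-zero, like xchg(&xp->pte, 0) above */
        return __atomic_exchange_n(p, 0UL, __ATOMIC_SEQ_CST);
}

int main(void)
{
        unsigned long old;

        pte_word = 0x12345067UL;
        old = get_and_clear(&pte_word);
        printf("old pte %#lx, now %#lx\n", old, pte_word);
        return 0;
}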
+
+-static inline void pud_clear (pud_t * pud)
++static inline void xen_set_pmd(pmd_t *pmdp, pmd_t pmd)
+ {
+- set_pud(pud, __pud(0));
++ xen_l2_entry_update(pmdp, pmd);
++}
++
++static inline void xen_pmd_clear(pmd_t *pmd)
++{
++ xen_set_pmd(pmd, xen_make_pmd(0));
++}
++
++static inline void xen_set_pud(pud_t *pudp, pud_t pud)
++{
++ xen_l3_entry_update(pudp, pud);
++}
++
++static inline void xen_pud_clear(pud_t *pud)
++{
++ xen_set_pud(pud, xen_make_pud(0));
+ }
+
+ #define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD)
+
+-static inline void pgd_clear (pgd_t * pgd)
++static inline void xen_set_pgd(pgd_t *pgdp, pgd_t pgd)
+ {
+- set_pgd(pgd, __pgd(0));
+- set_pgd(__user_pgd(pgd), __pgd(0));
++ xen_l4_entry_update(pgdp, pgd);
+ }
+
+-#define pte_same(a, b) ((a).pte == (b).pte)
++static inline void xen_pgd_clear(pgd_t * pgd)
++{
++ xen_set_pgd(pgd, xen_make_pgd(0));
++ xen_set_pgd(__user_pgd(pgd), xen_make_pgd(0));
++}
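Note that xen_pgd_clear() writes two entries: on x86-64 Xen the user half of the address space has its own top-level table one page above the kernel one, which is what __user_pgd() computes. A toy check of that offset (the 8-byte entry size is assumed here):

#include <stdio.h>

#define PTRS_PER_PGD 512

int main(void)
{
        unsigned long pgd  = 0x1000;                   /* pretend kernel pgd */
        unsigned long user = pgd + PTRS_PER_PGD * 8;   /* __user_pgd() equivalent */
        printf("user pgd sits %lu bytes (one page) above the kernel pgd\n",
               user - pgd);
        return 0;
}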
+
+-#define pte_pgprot(a) (__pgprot((a).pte & ~PHYSICAL_PAGE_MASK))
++#define pte_same(a, b) ((a).pte == (b).pte)
+
+ #endif /* !__ASSEMBLY__ */
+
+@@ -133,8 +142,6 @@ static inline void pgd_clear (pgd_t * pg
+ #define PGDIR_SIZE (_AC(1,UL) << PGDIR_SHIFT)
+ #define PGDIR_MASK (~(PGDIR_SIZE-1))
+
+-#define USER_PTRS_PER_PGD ((TASK_SIZE-1)/PGDIR_SIZE+1)
+-#define FIRST_USER_ADDRESS 0
+
+ #define MAXMEM _AC(0x3fffffffffff, UL)
+ #define VMALLOC_START _AC(0xffffc20000000000, UL)
+@@ -144,105 +151,6 @@ static inline void pgd_clear (pgd_t * pg
+ #define MODULES_END _AC(0xfffffffffff00000, UL)
+ #define MODULES_LEN (MODULES_END - MODULES_VADDR)
+
+-#define _PAGE_BIT_PRESENT 0
+-#define _PAGE_BIT_RW 1
+-#define _PAGE_BIT_USER 2
+-#define _PAGE_BIT_PWT 3
+-#define _PAGE_BIT_PCD 4
+-#define _PAGE_BIT_ACCESSED 5
+-#define _PAGE_BIT_DIRTY 6
+-#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */
+-#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
+-#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
+-
+-#define _PAGE_PRESENT 0x001
+-#define _PAGE_RW 0x002
+-#define _PAGE_USER 0x004
+-#define _PAGE_PWT 0x008
+-#define _PAGE_PCD 0x010
+-#define _PAGE_ACCESSED 0x020
+-#define _PAGE_DIRTY 0x040
+-#define _PAGE_PSE 0x080 /* 2MB page */
+-#define _PAGE_FILE 0x040 /* nonlinear file mapping, saved PTE; unset:swap */
+-#define _PAGE_GLOBAL 0x100 /* Global TLB entry */
+-
+-#define _PAGE_PROTNONE 0x080 /* If not present */
+-#define _PAGE_NX (_AC(1,UL)<<_PAGE_BIT_NX)
+-
+-/* Mapped page is I/O or foreign and has no associated page struct. */
+-#define _PAGE_IO 0x200
+-
+-#ifndef __ASSEMBLY__
+-#if CONFIG_XEN_COMPAT <= 0x030002
+-extern unsigned int __kernel_page_user;
+-#else
+-#define __kernel_page_user 0
+-#endif
+-#endif
+-
+-#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
+-#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY | __kernel_page_user)
+-
+-#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_IO)
+-
+-#define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
+-#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
+-#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
+-#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
+-#define PAGE_COPY PAGE_COPY_NOEXEC
+-#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
+-#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
+-#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
+-#define __PAGE_KERNEL \
+- (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX | __kernel_page_user)
+-#define __PAGE_KERNEL_EXEC \
+- (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | __kernel_page_user)
+-#define __PAGE_KERNEL_NOCACHE \
+- (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_PCD | _PAGE_ACCESSED | _PAGE_NX | __kernel_page_user)
+-#define __PAGE_KERNEL_RO \
+- (_PAGE_PRESENT | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX | __kernel_page_user)
+-#define __PAGE_KERNEL_VSYSCALL \
+- (_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
+-#define __PAGE_KERNEL_VSYSCALL_NOCACHE \
+- (_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_PCD)
+-#define __PAGE_KERNEL_LARGE \
+- (__PAGE_KERNEL | _PAGE_PSE)
+-#define __PAGE_KERNEL_LARGE_EXEC \
+- (__PAGE_KERNEL_EXEC | _PAGE_PSE)
+-
+-/*
+- * We don't support GLOBAL page in xenolinux64
+- */
+-#define MAKE_GLOBAL(x) __pgprot((x))
+-
+-#define PAGE_KERNEL MAKE_GLOBAL(__PAGE_KERNEL)
+-#define PAGE_KERNEL_EXEC MAKE_GLOBAL(__PAGE_KERNEL_EXEC)
+-#define PAGE_KERNEL_RO MAKE_GLOBAL(__PAGE_KERNEL_RO)
+-#define PAGE_KERNEL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_NOCACHE)
+-#define PAGE_KERNEL_VSYSCALL32 __pgprot(__PAGE_KERNEL_VSYSCALL)
+-#define PAGE_KERNEL_VSYSCALL MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL)
+-#define PAGE_KERNEL_LARGE MAKE_GLOBAL(__PAGE_KERNEL_LARGE)
+-#define PAGE_KERNEL_VSYSCALL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL_NOCACHE)
+-
+-/* xwr */
+-#define __P000 PAGE_NONE
+-#define __P001 PAGE_READONLY
+-#define __P010 PAGE_COPY
+-#define __P011 PAGE_COPY
+-#define __P100 PAGE_READONLY_EXEC
+-#define __P101 PAGE_READONLY_EXEC
+-#define __P110 PAGE_COPY_EXEC
+-#define __P111 PAGE_COPY_EXEC
+-
+-#define __S000 PAGE_NONE
+-#define __S001 PAGE_READONLY
+-#define __S010 PAGE_SHARED
+-#define __S011 PAGE_SHARED
+-#define __S100 PAGE_READONLY_EXEC
+-#define __S101 PAGE_READONLY_EXEC
+-#define __S110 PAGE_SHARED_EXEC
+-#define __S111 PAGE_SHARED_EXEC
+-
+ #ifndef __ASSEMBLY__
+
+ static inline unsigned long pgd_bad(pgd_t pgd)
+@@ -260,119 +168,26 @@ static inline unsigned long pmd_bad(pmd_
+ return __pmd_val(pmd) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER);
+ }
+
+-#define set_pte_at(_mm,addr,ptep,pteval) do { \
+- if (((_mm) != current->mm && (_mm) != &init_mm) || \
+- HYPERVISOR_update_va_mapping((addr), (pteval), 0)) \
+- set_pte((ptep), (pteval)); \
+-} while (0)
+-
+ #define pte_none(x) (!(x).pte)
+ #define pte_present(x) ((x).pte & (_PAGE_PRESENT | _PAGE_PROTNONE))
+-#define pte_clear(mm,addr,xp) do { set_pte_at(mm, addr, xp, __pte(0)); } while (0)
+
+-#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))
++#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT)) /* FIXME: is this right? */
+
+ #define __pte_mfn(_pte) (((_pte).pte & PTE_MASK) >> PAGE_SHIFT)
+ #define pte_mfn(_pte) ((_pte).pte & _PAGE_PRESENT ? \
+ __pte_mfn(_pte) : pfn_to_mfn(__pte_mfn(_pte)))
+-#define pte_pfn(_pte) ((_pte).pte & _PAGE_IO ? end_pfn : \
++#define pte_pfn(_pte) ((_pte).pte & _PAGE_IO ? max_mapnr : \
+ (_pte).pte & _PAGE_PRESENT ? \
+ mfn_to_local_pfn(__pte_mfn(_pte)) : \
+ __pte_mfn(_pte))
+
+ #define pte_page(x) pfn_to_page(pte_pfn(x))
+
+-static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
+-{
+- unsigned long pte = page_nr << PAGE_SHIFT;
+- pte |= pgprot_val(pgprot);
+- pte &= __supported_pte_mask;
+- return __pte(pte);
+-}
+-
+-static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+-{
+- pte_t pte = *ptep;
+- if (!pte_none(pte)) {
+- if ((mm != &init_mm) ||
+- HYPERVISOR_update_va_mapping(addr, __pte(0), 0))
+- pte = __pte_ma(xchg(&ptep->pte, 0));
+- }
+- return pte;
+-}
+-
+-static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long addr, pte_t *ptep, int full)
+-{
+- if (full) {
+- pte_t pte = *ptep;
+- if (PagePinned(virt_to_page(mm->pgd)))
+- xen_l1_entry_update(ptep, __pte(0));
+- else
+- *ptep = __pte(0);
+- return pte;
+- }
+- return ptep_get_and_clear(mm, addr, ptep);
+-}
+-
+-#define ptep_clear_flush(vma, addr, ptep) \
+-({ \
+- pte_t *__ptep = (ptep); \
+- pte_t __res = *__ptep; \
+- if (!pte_none(__res) && \
+- ((vma)->vm_mm != current->mm || \
+- HYPERVISOR_update_va_mapping(addr, __pte(0), \
+- (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
+- UVMF_INVLPG|UVMF_MULTI))) { \
+- __ptep->pte = 0; \
+- flush_tlb_page(vma, addr); \
+- } \
+- __res; \
+-})
+-
+-/*
+- * The following only work if pte_present() is true.
+- * Undefined behaviour if not..
+- */
+-#define __LARGE_PTE (_PAGE_PSE|_PAGE_PRESENT)
+-static inline int pte_dirty(pte_t pte) { return __pte_val(pte) & _PAGE_DIRTY; }
+-static inline int pte_young(pte_t pte) { return __pte_val(pte) & _PAGE_ACCESSED; }
+-static inline int pte_write(pte_t pte) { return __pte_val(pte) & _PAGE_RW; }
+-static inline int pte_file(pte_t pte) { return __pte_val(pte) & _PAGE_FILE; }
+-static inline int pte_huge(pte_t pte) { return __pte_val(pte) & _PAGE_PSE; }
+-
+-static inline pte_t pte_mkclean(pte_t pte) { __pte_val(pte) &= ~_PAGE_DIRTY; return pte; }
+-static inline pte_t pte_mkold(pte_t pte) { __pte_val(pte) &= ~_PAGE_ACCESSED; return pte; }
+-static inline pte_t pte_wrprotect(pte_t pte) { __pte_val(pte) &= ~_PAGE_RW; return pte; }
+-static inline pte_t pte_mkexec(pte_t pte) { __pte_val(pte) &= ~_PAGE_NX; return pte; }
+-static inline pte_t pte_mkdirty(pte_t pte) { __pte_val(pte) |= _PAGE_DIRTY; return pte; }
+-static inline pte_t pte_mkyoung(pte_t pte) { __pte_val(pte) |= _PAGE_ACCESSED; return pte; }
+-static inline pte_t pte_mkwrite(pte_t pte) { __pte_val(pte) |= _PAGE_RW; return pte; }
+-static inline pte_t pte_mkhuge(pte_t pte) { __pte_val(pte) |= _PAGE_PSE; return pte; }
+-static inline pte_t pte_clrhuge(pte_t pte) { __pte_val(pte) &= ~_PAGE_PSE; return pte; }
+-
+-static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
+-{
+- if (!pte_young(*ptep))
+- return 0;
+- return test_and_clear_bit(_PAGE_BIT_ACCESSED, &ptep->pte);
+-}
+-
+-static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+-{
+- pte_t pte = *ptep;
+- if (pte_write(pte))
+- set_pte_at(mm, addr, ptep, pte_wrprotect(pte));
+-}
+-
+ /*
+ * Macro to mark a page protection value as "uncacheable".
+ */
+ #define pgprot_noncached(prot) (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT))
+
+-static inline int pmd_large(pmd_t pte) {
+- return (__pmd_val(pte) & __LARGE_PTE) == __LARGE_PTE;
+-}
+-
+
+ /*
+ * Conversion functions: convert a page and protection to a page entry,
+@@ -388,6 +203,7 @@ static inline int pmd_large(pmd_t pte) {
+ #define pgd_offset(mm, addr) ((mm)->pgd + pgd_index(addr))
+ #define pgd_offset_k(address) (init_level4_pgt + pgd_index(address))
+ #define pgd_present(pgd) (__pgd_val(pgd) & _PAGE_PRESENT)
++static inline int pgd_large(pgd_t pgd) { return 0; }
+ #define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE)
+
+ /* PUD - Level3 access */
+@@ -398,6 +214,12 @@ static inline int pmd_large(pmd_t pte) {
+ #define pud_offset(pgd, address) ((pud_t *) pgd_page_vaddr(*(pgd)) + pud_index(address))
+ #define pud_present(pud) (__pud_val(pud) & _PAGE_PRESENT)
+
++static inline int pud_large(pud_t pte)
++{
++ return (__pud_val(pte) & (_PAGE_PSE|_PAGE_PRESENT)) ==
++ (_PAGE_PSE|_PAGE_PRESENT);
++}
++
+ /* PMD - Level 2 access */
+ #define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val(pmd) & PTE_MASK))
+ #define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
+@@ -413,36 +235,18 @@ static inline int pmd_large(pmd_t pte) {
+ #else
+ #define pmd_present(x) (__pmd_val(x) & _PAGE_PRESENT)
+ #endif
+-#define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0)
+ #define pfn_pmd(nr,prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val(prot)))
+ #define pmd_pfn(x) ((pmd_val(x) & __PHYSICAL_MASK) >> PAGE_SHIFT)
+
+ #define pte_to_pgoff(pte) ((__pte_val(pte) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT)
+-#define pgoff_to_pte(off) ((pte_t) { ((off) << PAGE_SHIFT) | _PAGE_FILE })
++#define pgoff_to_pte(off) ((pte_t) { .pte = ((off) << PAGE_SHIFT) | _PAGE_FILE })
+ #define PTE_FILE_MAX_BITS __PHYSICAL_MASK_SHIFT
+
+ /* PTE - Level 1 access. */
+
+ /* page, protection -> pte */
+ #define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
+-#define mk_pte_huge(entry) (__pte_val(entry) |= _PAGE_PRESENT | _PAGE_PSE)
+
+-/* Change flags of a PTE */
+-static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
+-{
+- /*
+- * Since this might change the present bit (which controls whether
+- * a pte_t object has undergone p2m translation), we must use
+- * pte_val() on the input pte and __pte() for the return value.
+- */
+- unsigned long pteval = pte_val(pte);
+-
+- pteval &= _PAGE_CHG_MASK;
+- pteval |= pgprot_val(newprot);
+- pteval &= __supported_pte_mask;
+- return __pte(pteval);
+-}
+-
+ #define pte_index(address) \
+ (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
+ #define pte_offset_kernel(dir, address) ((pte_t *) pmd_page_vaddr(*(dir)) + \
+@@ -456,101 +260,21 @@ static inline pte_t pte_modify(pte_t pte
+
+ #define update_mmu_cache(vma,address,pte) do { } while (0)
+
+-/*
+- * Rules for using ptep_establish: the pte MUST be a user pte, and
+- * must be a present->present transition.
+- */
+-#define __HAVE_ARCH_PTEP_ESTABLISH
+-#define ptep_establish(vma, address, ptep, pteval) \
+- do { \
+- if ( likely((vma)->vm_mm == current->mm) ) { \
+- BUG_ON(HYPERVISOR_update_va_mapping(address, \
+- pteval, \
+- (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
+- UVMF_INVLPG|UVMF_MULTI)); \
+- } else { \
+- xen_l1_entry_update(ptep, pteval); \
+- flush_tlb_page(vma, address); \
+- } \
+- } while (0)
+-
+-/* We only update the dirty/accessed state if we set
+- * the dirty bit by hand in the kernel, since the hardware
+- * will do the accessed bit for us, and we don't want to
+- * race with other CPU's that might be updating the dirty
+- * bit at the same time. */
+-#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
+-#define ptep_set_access_flags(vma, address, ptep, entry, dirty) \
+-({ \
+- int __changed = !pte_same(*(ptep), entry); \
+- if (__changed && (dirty)) \
+- ptep_establish(vma, address, ptep, entry); \
+- __changed; \
+-})
+-
+-#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
+-#define ptep_clear_flush_young(vma, address, ptep) \
+-({ \
+- pte_t __pte = *(ptep); \
+- int __young = pte_young(__pte); \
+- __pte = pte_mkold(__pte); \
+- if (PagePinned(virt_to_page((vma)->vm_mm->pgd))) \
+- (void)ptep_set_access_flags(vma, address, ptep, __pte, __young); \
+- else if (__young) \
+- set_pte(ptep, __pte); \
+- __young; \
+-})
+-
+ /* Encode and de-code a swap entry */
+ #define __swp_type(x) (((x).val >> 1) & 0x3f)
+ #define __swp_offset(x) ((x).val >> 8)
+ #define __swp_entry(type, offset) ((swp_entry_t) { ((type) << 1) | ((offset) << 8) })
+ #define __pte_to_swp_entry(pte) ((swp_entry_t) { __pte_val(pte) })
+-#define __swp_entry_to_pte(x) ((pte_t) { (x).val })
+-
+-extern spinlock_t pgd_lock;
+-extern struct list_head pgd_list;
++#define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val })
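The swap encoding above keeps bit 0 (the present bit) clear, stores the swap type in bits 1..6, and the offset from bit 8 upward. A self-contained round trip using the same shifts:

#include <stdio.h>

#define SWP_ENTRY(type, off) (((unsigned long)(type) << 1) | ((unsigned long)(off) << 8))
#define SWP_TYPE(v)          (((v) >> 1) & 0x3f)
#define SWP_OFFSET(v)        ((v) >> 8)

int main(void)
{
        unsigned long v = SWP_ENTRY(3, 0x1234);
        printf("val=%#lx type=%lu offset=%#lx\n", v, SWP_TYPE(v), SWP_OFFSET(v));
        return 0;
}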
+
+ extern int kern_addr_valid(unsigned long addr);
+-
+-#define DOMID_LOCAL (0xFFFFU)
+-
+-struct vm_area_struct;
+-
+-int direct_remap_pfn_range(struct vm_area_struct *vma,
+- unsigned long address,
+- unsigned long mfn,
+- unsigned long size,
+- pgprot_t prot,
+- domid_t domid);
+-
+-int direct_kernel_remap_pfn_range(unsigned long address,
+- unsigned long mfn,
+- unsigned long size,
+- pgprot_t prot,
+- domid_t domid);
+-
+-int create_lookup_pte_addr(struct mm_struct *mm,
+- unsigned long address,
+- uint64_t *ptep);
+-
+-int touch_pte_range(struct mm_struct *mm,
+- unsigned long address,
+- unsigned long size);
+-
+-int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
+- unsigned long addr, unsigned long end, pgprot_t newprot,
+- int dirty_accountable);
+-
+-#define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) \
+- xen_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable)
+-
+-pte_t *lookup_address(unsigned long addr);
++extern void cleanup_highmap(void);
+
+ #define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
+ direct_remap_pfn_range(vma,vaddr,pfn,size,prot,DOMID_IO)
+
+ #define HAVE_ARCH_UNMAPPED_AREA
++#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
+
+ #define pgtable_cache_init() do { } while (0)
+ #define check_pgt_cache() do { } while (0)
+@@ -563,13 +287,7 @@ pte_t *lookup_address(unsigned long addr
+ #define kc_offset_to_vaddr(o) \
+ (((o) & (1UL << (__VIRTUAL_MASK_SHIFT-1))) ? ((o) | (~__VIRTUAL_MASK)) : (o))
+
+-#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
+-#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
+-#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
+-#define __HAVE_ARCH_PTEP_CLEAR_FLUSH
+-#define __HAVE_ARCH_PTEP_SET_WRPROTECT
+ #define __HAVE_ARCH_PTE_SAME
+-#include <asm-generic/pgtable.h>
+ #endif /* !__ASSEMBLY__ */
+
+ #endif /* _X86_64_PGTABLE_H */
+--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/processor.h 2009-02-16 16:18:36.000000000 +0100
++++ sle11-2009-06-29/include/asm-x86/mach-xen/asm/processor.h 2009-03-16 16:33:40.000000000 +0100
+@@ -1,5 +1,793 @@
++#ifndef __ASM_X86_PROCESSOR_H
++#define __ASM_X86_PROCESSOR_H
++
++#include <asm/processor-flags.h>
++
++/* migration helpers, for KVM - will be removed in 2.6.25: */
++#include <asm/vm86.h>
++#define Xgt_desc_struct desc_ptr
++
++/* Forward declaration, a strange C thing */
++struct task_struct;
++struct mm_struct;
++
++#include <asm/vm86.h>
++#include <asm/math_emu.h>
++#include <asm/segment.h>
++#include <asm/types.h>
++#include <asm/sigcontext.h>
++#include <asm/current.h>
++#include <asm/cpufeature.h>
++#include <asm/system.h>
++#include <asm/page.h>
++#include <asm/percpu.h>
++#include <asm/msr.h>
++#include <asm/desc_defs.h>
++#include <asm/nops.h>
++#include <linux/personality.h>
++#include <linux/cpumask.h>
++#include <linux/cache.h>
++#include <linux/threads.h>
++#include <linux/init.h>
++#include <xen/interface/physdev.h>
++
++/*
++ * Default implementation of macro that returns current
++ * instruction pointer ("program counter").
++ */
++static inline void *current_text_addr(void)
++{
++ void *pc;
++ asm volatile("mov $1f,%0\n1:":"=r" (pc));
++ return pc;
++}
++
++#ifdef CONFIG_X86_VSMP
++#define ARCH_MIN_TASKALIGN (1 << INTERNODE_CACHE_SHIFT)
++#define ARCH_MIN_MMSTRUCT_ALIGN (1 << INTERNODE_CACHE_SHIFT)
++#else
++#define ARCH_MIN_TASKALIGN 16
++#define ARCH_MIN_MMSTRUCT_ALIGN 0
++#endif
++
++/*
++ * CPU type and hardware bug flags. Kept separately for each CPU.
++ * Members of this structure are referenced in head.S, so think twice
++ * before touching them. [mj]
++ */
++
++struct cpuinfo_x86 {
++ __u8 x86; /* CPU family */
++ __u8 x86_vendor; /* CPU vendor */
++ __u8 x86_model;
++ __u8 x86_mask;
++#ifdef CONFIG_X86_32
++ char wp_works_ok; /* It doesn't on 386's */
++ char hlt_works_ok; /* Problems on some 486Dx4's and old 386's */
++ char hard_math;
++ char rfu;
++ char fdiv_bug;
++ char f00f_bug;
++ char coma_bug;
++ char pad0;
++#else
++	/* number of 4K pages in DTLB/ITLB combined */
++ int x86_tlbsize;
++ __u8 x86_virt_bits, x86_phys_bits;
++ /* cpuid returned core id bits */
++ __u8 x86_coreid_bits;
++ /* Max extended CPUID function supported */
++ __u32 extended_cpuid_level;
++#endif
++ int cpuid_level; /* Maximum supported CPUID level, -1=no CPUID */
++ __u32 x86_capability[NCAPINTS];
++ char x86_vendor_id[16];
++ char x86_model_id[64];
++	int	x86_cache_size;  /* in KB - valid for CPUs which support this
++ call */
++ int x86_cache_alignment; /* In bytes */
++ int x86_power;
++ unsigned long loops_per_jiffy;
++#ifdef CONFIG_SMP
++ cpumask_t llc_shared_map; /* cpus sharing the last level cache */
++#endif
++ u16 x86_max_cores; /* cpuid returned max cores value */
++ u16 apicid;
++ u16 x86_clflush_size;
++#ifdef CONFIG_SMP
++ u16 booted_cores; /* number of cores as seen by OS */
++ u16 phys_proc_id; /* Physical processor id. */
++ u16 cpu_core_id; /* Core id */
++ u16 cpu_index; /* index into per_cpu list */
++#endif
++} __attribute__((__aligned__(SMP_CACHE_BYTES)));
++
++#define X86_VENDOR_INTEL 0
++#define X86_VENDOR_CYRIX 1
++#define X86_VENDOR_AMD 2
++#define X86_VENDOR_UMC 3
++#define X86_VENDOR_NEXGEN 4
++#define X86_VENDOR_CENTAUR 5
++#define X86_VENDOR_TRANSMETA 7
++#define X86_VENDOR_NSC 8
++#define X86_VENDOR_NUM 9
++#define X86_VENDOR_UNKNOWN 0xff
++
++/*
++ * capabilities of CPUs
++ */
++extern struct cpuinfo_x86 boot_cpu_data;
++extern struct cpuinfo_x86 new_cpu_data;
++extern __u32 cleared_cpu_caps[NCAPINTS];
++
++#ifdef CONFIG_SMP
++DECLARE_PER_CPU(struct cpuinfo_x86, cpu_info);
++#define cpu_data(cpu) per_cpu(cpu_info, cpu)
++#define current_cpu_data cpu_data(smp_processor_id())
++#else
++#define cpu_data(cpu) boot_cpu_data
++#define current_cpu_data boot_cpu_data
++#endif
++
++void cpu_detect(struct cpuinfo_x86 *c);
++
++extern void identify_cpu(struct cpuinfo_x86 *);
++extern void identify_boot_cpu(void);
++extern void identify_secondary_cpu(struct cpuinfo_x86 *);
++extern void print_cpu_info(struct cpuinfo_x86 *);
++extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
++extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
++extern unsigned short num_cache_leaves;
++
++#if defined(CONFIG_X86_HT) || defined(CONFIG_X86_64)
++extern void detect_ht(struct cpuinfo_x86 *c);
++#else
++static inline void detect_ht(struct cpuinfo_x86 *c) {}
++#endif
++
++static inline void xen_cpuid(unsigned int *eax, unsigned int *ebx,
++ unsigned int *ecx, unsigned int *edx)
++{
++ /* ecx is often an input as well as an output. */
++ __asm__(XEN_CPUID
++ : "=a" (*eax),
++ "=b" (*ebx),
++ "=c" (*ecx),
++ "=d" (*edx)
++ : "0" (*eax), "2" (*ecx));
++}
++
++static inline void load_cr3(pgd_t *pgdir)
++{
++ write_cr3(__pa(pgdir));
++}
++
++#ifndef CONFIG_X86_NO_TSS
++#ifdef CONFIG_X86_32
++/* This is the TSS defined by the hardware. */
++struct x86_hw_tss {
++ unsigned short back_link, __blh;
++ unsigned long sp0;
++ unsigned short ss0, __ss0h;
++ unsigned long sp1;
++ unsigned short ss1, __ss1h; /* ss1 caches MSR_IA32_SYSENTER_CS */
++ unsigned long sp2;
++ unsigned short ss2, __ss2h;
++ unsigned long __cr3;
++ unsigned long ip;
++ unsigned long flags;
++ unsigned long ax, cx, dx, bx;
++ unsigned long sp, bp, si, di;
++ unsigned short es, __esh;
++ unsigned short cs, __csh;
++ unsigned short ss, __ssh;
++ unsigned short ds, __dsh;
++ unsigned short fs, __fsh;
++ unsigned short gs, __gsh;
++ unsigned short ldt, __ldth;
++ unsigned short trace, io_bitmap_base;
++} __attribute__((packed));
++extern struct tss_struct doublefault_tss;
++#else
++struct x86_hw_tss {
++ u32 reserved1;
++ u64 sp0;
++ u64 sp1;
++ u64 sp2;
++ u64 reserved2;
++ u64 ist[7];
++ u32 reserved3;
++ u32 reserved4;
++ u16 reserved5;
++ u16 io_bitmap_base;
++} __attribute__((packed)) ____cacheline_aligned;
++#endif
++#endif /* CONFIG_X86_NO_TSS */
++
++/*
++ * Size of io_bitmap.
++ */
++#define IO_BITMAP_BITS 65536
++#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
++#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
++#define IO_BITMAP_OFFSET offsetof(struct tss_struct, io_bitmap)
++#define INVALID_IO_BITMAP_OFFSET 0x8000
++#define INVALID_IO_BITMAP_OFFSET_LAZY 0x9000
++
++#ifndef CONFIG_X86_NO_TSS
++struct tss_struct {
++ struct x86_hw_tss x86_tss;
++
++ /*
++ * The extra 1 is there because the CPU will access an
++ * additional byte beyond the end of the IO permission
++ * bitmap. The extra byte must be all 1 bits, and must
++ * be within the limit.
++ */
++ unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
++ /*
++ * Cache the current maximum and the last task that used the bitmap:
++ */
++ unsigned long io_bitmap_max;
++ struct thread_struct *io_bitmap_owner;
++ /*
++ * pads the TSS to be cacheline-aligned (size is 0x100)
++ */
++ unsigned long __cacheline_filler[35];
++ /*
++ * .. and then another 0x100 bytes for emergency kernel stack
++ */
++ unsigned long stack[64];
++} __attribute__((packed));
++
++DECLARE_PER_CPU(struct tss_struct, init_tss);
++
++/* Save the original ist values for checking stack pointers during debugging */
++struct orig_ist {
++ unsigned long ist[7];
++};
++#endif /* CONFIG_X86_NO_TSS */
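The extra all-ones byte after the IO bitmap matters because the CPU reads one byte past the bit of the highest port checked; making it 0xff guarantees that out-of-range accesses always fault. The lookup the hardware performs is just a bit test, sketched here in userspace:

#include <stdio.h>

#define IO_BITMAP_BITS 65536
static unsigned char bitmap[IO_BITMAP_BITS / 8 + 1];   /* note the + 1 */

static int port_allowed(unsigned port)
{
        /* a clear bit means the port is permitted */
        return !((bitmap[port / 8] >> (port % 8)) & 1);
}

int main(void)
{
        bitmap[sizeof(bitmap) - 1] = 0xff;     /* the mandatory trailing byte */
        bitmap[0x60 / 8] |= 1 << (0x60 % 8);   /* deny port 0x60 */
        printf("port 0x60 allowed: %d, port 0x80 allowed: %d\n",
               port_allowed(0x60), port_allowed(0x80));
        return 0;
}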
++
++#define MXCSR_DEFAULT 0x1f80
++
++struct i387_fsave_struct {
++ u32 cwd;
++ u32 swd;
++ u32 twd;
++ u32 fip;
++ u32 fcs;
++ u32 foo;
++ u32 fos;
++ u32 st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */
++ u32 status; /* software status information */
++};
++
++struct i387_fxsave_struct {
++ u16 cwd;
++ u16 swd;
++ u16 twd;
++ u16 fop;
++ union {
++ struct {
++ u64 rip;
++ u64 rdp;
++ };
++ struct {
++ u32 fip;
++ u32 fcs;
++ u32 foo;
++ u32 fos;
++ };
++ };
++ u32 mxcsr;
++ u32 mxcsr_mask;
++ u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
++ u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */
++ u32 padding[24];
++} __attribute__((aligned(16)));
++
++struct i387_soft_struct {
++ u32 cwd;
++ u32 swd;
++ u32 twd;
++ u32 fip;
++ u32 fcs;
++ u32 foo;
++ u32 fos;
++ u32 st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */
++ u8 ftop, changed, lookahead, no_update, rm, alimit;
++ struct info *info;
++ u32 entry_eip;
++};
++
++union i387_union {
++ struct i387_fsave_struct fsave;
++ struct i387_fxsave_struct fxsave;
++ struct i387_soft_struct soft;
++};
++
++#ifdef CONFIG_X86_32
++DECLARE_PER_CPU(u8, cpu_llc_id);
++#elif !defined(CONFIG_X86_NO_TSS)
++DECLARE_PER_CPU(struct orig_ist, orig_ist);
++#endif
++
++extern void print_cpu_info(struct cpuinfo_x86 *);
++extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
++extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
++extern unsigned short num_cache_leaves;
++
++struct thread_struct {
++/* cached TLS descriptors. */
++ struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
++ unsigned long sp0;
++ unsigned long sp;
++#ifdef CONFIG_X86_32
++ unsigned long sysenter_cs;
++#else
++ unsigned long usersp; /* Copy from PDA */
++ unsigned short es, ds, fsindex, gsindex;
++#endif
++ unsigned long ip;
++ unsigned long fs;
++ unsigned long gs;
++/* Hardware debugging registers */
++ unsigned long debugreg0;
++ unsigned long debugreg1;
++ unsigned long debugreg2;
++ unsigned long debugreg3;
++ unsigned long debugreg6;
++ unsigned long debugreg7;
++/* fault info */
++ unsigned long cr2, trap_no, error_code;
++/* floating point info */
++	union i387_union	i387 __attribute__((aligned(16)));
++#ifdef CONFIG_X86_32
++/* virtual 86 mode info */
++ struct vm86_struct __user *vm86_info;
++ unsigned long screen_bitmap;
++ unsigned long v86flags, v86mask, saved_sp0;
++ unsigned int saved_fs, saved_gs;
++#endif
++/* IO permissions */
++ unsigned long *io_bitmap_ptr;
++ unsigned long iopl;
++/* max allowed port in the bitmap, in bytes: */
++ unsigned io_bitmap_max;
++/* MSR_IA32_DEBUGCTLMSR value to switch in if TIF_DEBUGCTLMSR is set. */
++ unsigned long debugctlmsr;
++/* Debug Store - if not 0, points to a DS Save Area configuration;
++ * goes into MSR_IA32_DS_AREA */
++ unsigned long ds_area_msr;
++};
++
++static inline unsigned long xen_get_debugreg(int regno)
++{
++ return HYPERVISOR_get_debugreg(regno);
++}
++
++static inline void xen_set_debugreg(int regno, unsigned long value)
++{
++ WARN_ON(HYPERVISOR_set_debugreg(regno, value));
++}
++
++/*
++ * Set IOPL bits in EFLAGS from given mask
++ */
++static inline void xen_set_iopl_mask(unsigned mask)
++{
++ struct physdev_set_iopl set_iopl;
++
++ /* Force the change at ring 0. */
++ set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3;
++ WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
++}
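xen_set_iopl_mask() receives the would-be EFLAGS.IOPL bits (bits 12-13) and converts them to the ring number the hypervisor expects; a zero mask is turned into iopl 1, so user space (ring 3) still cannot use I/O instructions. The extraction in isolation:

#include <stdio.h>

int main(void)
{
        unsigned mask = 3 << 12;    /* caller wants user-space I/O access */
        unsigned iopl = (mask == 0) ? 1 : (mask >> 12) & 3;
        printf("mask %#x -> iopl %u\n", mask, iopl);
        return 0;
}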
++
++#ifndef CONFIG_X86_NO_TSS
++static inline void native_load_sp0(struct tss_struct *tss,
++ struct thread_struct *thread)
++{
++ tss->x86_tss.sp0 = thread->sp0;
++#ifdef CONFIG_X86_32
++ /* Only happens when SEP is enabled, no need to test "SEP"arately */
++ if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
++ tss->x86_tss.ss1 = thread->sysenter_cs;
++ wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
++ }
++#endif
++}
++#else
++#define xen_load_sp0(tss, thread) do { \
++ if (HYPERVISOR_stack_switch(__KERNEL_DS, (thread)->sp0)) \
++ BUG(); \
++} while (0)
++#endif
++
++#define __cpuid xen_cpuid
++#define paravirt_enabled() 0
++
++/*
++ * These special macros can be used to get or set a debugging register
++ */
++#define get_debugreg(var, register) \
++ (var) = xen_get_debugreg(register)
++#define set_debugreg(value, register) \
++ xen_set_debugreg(register, value)
++
++#define load_sp0 xen_load_sp0
++
++#define set_iopl_mask xen_set_iopl_mask
++
++/*
++ * Save the cr4 feature set we're using (i.e.
++ * Pentium 4MB enable and PPro Global page
++ * enable), so that any CPUs that boot up
++ * after us can get the correct flags.
++ */
++extern unsigned long mmu_cr4_features;
++
++static inline void set_in_cr4(unsigned long mask)
++{
++ unsigned cr4;
++ mmu_cr4_features |= mask;
++ cr4 = read_cr4();
++ cr4 |= mask;
++ write_cr4(cr4);
++}
++
++static inline void clear_in_cr4(unsigned long mask)
++{
++ unsigned cr4;
++ mmu_cr4_features &= ~mask;
++ cr4 = read_cr4();
++ cr4 &= ~mask;
++ write_cr4(cr4);
++}
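set_in_cr4()/clear_in_cr4() mirror every change into mmu_cr4_features before touching the register, so CPUs that boot later can copy the accumulated mask. A hypothetical caller (CR4.PGE is bit 7; the constant normally comes from <asm/processor-flags.h>):

#define X86_CR4_PGE 0x00000080UL     /* assumed here for illustration */

static void enable_global_pages(void)
{
        /* recorded in mmu_cr4_features first, then OR-ed into CR4 */
        set_in_cr4(X86_CR4_PGE);
}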
++
++struct microcode_header {
++ unsigned int hdrver;
++ unsigned int rev;
++ unsigned int date;
++ unsigned int sig;
++ unsigned int cksum;
++ unsigned int ldrver;
++ unsigned int pf;
++ unsigned int datasize;
++ unsigned int totalsize;
++ unsigned int reserved[3];
++};
++
++struct microcode {
++ struct microcode_header hdr;
++ unsigned int bits[0];
++};
++
++typedef struct microcode microcode_t;
++typedef struct microcode_header microcode_header_t;
++
++/* microcode format is extended from Prescott processors */
++struct extended_signature {
++ unsigned int sig;
++ unsigned int pf;
++ unsigned int cksum;
++};
++
++struct extended_sigtable {
++ unsigned int count;
++ unsigned int cksum;
++ unsigned int reserved[3];
++ struct extended_signature sigs[0];
++};
++
++typedef struct {
++ unsigned long seg;
++} mm_segment_t;
++
++
++/*
++ * create a kernel thread without removing it from tasklists
++ */
++extern int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
++
++/* Free all resources held by a thread. */
++extern void release_thread(struct task_struct *);
++
++/* Prepare to copy thread state - unlazy all lazy status */
++extern void prepare_to_copy(struct task_struct *tsk);
++
++unsigned long get_wchan(struct task_struct *p);
++
++/*
++ * Generic CPUID function
++ * clear %ecx since some CPUs (Cyrix MII) do not set or clear %ecx,
++ * resulting in stale register contents being returned.
++ */
++static inline void cpuid(unsigned int op,
++ unsigned int *eax, unsigned int *ebx,
++ unsigned int *ecx, unsigned int *edx)
++{
++ *eax = op;
++ *ecx = 0;
++ __cpuid(eax, ebx, ecx, edx);
++}
++
++/* Some CPUID calls want 'count' to be placed in ecx */
++static inline void cpuid_count(unsigned int op, int count,
++ unsigned int *eax, unsigned int *ebx,
++ unsigned int *ecx, unsigned int *edx)
++{
++ *eax = op;
++ *ecx = count;
++ __cpuid(eax, ebx, ecx, edx);
++}
++
++/*
++ * CPUID functions returning a single datum
++ */
++static inline unsigned int cpuid_eax(unsigned int op)
++{
++ unsigned int eax, ebx, ecx, edx;
++
++ cpuid(op, &eax, &ebx, &ecx, &edx);
++ return eax;
++}
++static inline unsigned int cpuid_ebx(unsigned int op)
++{
++ unsigned int eax, ebx, ecx, edx;
++
++ cpuid(op, &eax, &ebx, &ecx, &edx);
++ return ebx;
++}
++static inline unsigned int cpuid_ecx(unsigned int op)
++{
++ unsigned int eax, ebx, ecx, edx;
++
++ cpuid(op, &eax, &ebx, &ecx, &edx);
++ return ecx;
++}
++static inline unsigned int cpuid_edx(unsigned int op)
++{
++ unsigned int eax, ebx, ecx, edx;
++
++ cpuid(op, &eax, &ebx, &ecx, &edx);
++ return edx;
++}
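The helpers above zero %ecx before every call because some CPUs leave stale data in it. A userspace analogue using the raw cpuid instruction (rather than XEN_CPUID): leaf 0 returns the vendor string split across ebx/edx/ecx:

#include <stdio.h>
#include <string.h>

static void cpuid_sim(unsigned op, unsigned *a, unsigned *b,
                      unsigned *c, unsigned *d)
{
        *a = op;
        *c = 0;          /* same %ecx hygiene as the helpers above */
        __asm__("cpuid" : "+a"(*a), "=b"(*b), "+c"(*c), "=d"(*d));
}

int main(void)
{
        unsigned a, b, c, d;
        char vendor[13];

        cpuid_sim(0, &a, &b, &c, &d);
        memcpy(vendor, &b, 4);
        memcpy(vendor + 4, &d, 4);
        memcpy(vendor + 8, &c, 4);
        vendor[12] = '\0';
        printf("max leaf %u, vendor %s\n", a, vendor);
        return 0;
}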
++
++/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
++static inline void rep_nop(void)
++{
++ __asm__ __volatile__("rep;nop": : :"memory");
++}
++
++/* Stop speculative execution */
++static inline void sync_core(void)
++{
++ int tmp;
++ asm volatile("cpuid" : "=a" (tmp) : "0" (1)
++ : "ebx", "ecx", "edx", "memory");
++}
++
++#define cpu_relax() rep_nop()
++
++static inline void __monitor(const void *eax, unsigned long ecx,
++ unsigned long edx)
++{
++ /* "monitor %eax,%ecx,%edx;" */
++ asm volatile(
++ ".byte 0x0f,0x01,0xc8;"
++ : :"a" (eax), "c" (ecx), "d"(edx));
++}
++
++static inline void __mwait(unsigned long eax, unsigned long ecx)
++{
++ /* "mwait %eax,%ecx;" */
++ asm volatile(
++ ".byte 0x0f,0x01,0xc9;"
++ : :"a" (eax), "c" (ecx));
++}
++
++static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
++{
++ /* "mwait %eax,%ecx;" */
++ asm volatile(
++ "sti; .byte 0x0f,0x01,0xc9;"
++ : :"a" (eax), "c" (ecx));
++}
++
++extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
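__monitor()/__mwait() pair up as an event-driven idle primitive: arm the monitor on a cache line, re-check the condition to close the race with a store that landed first, then mwait until a write to that line (or an interrupt) wakes the CPU. A kernel-context sketch of that pattern (hypothetical helper; these are privileged instructions, not runnable from userspace):

static void wait_for_flag(volatile unsigned long *flag)
{
        while (!*flag) {
                __monitor((const void *)flag, 0, 0);
                if (*flag)          /* the store may already have hit */
                        break;
                __mwait(0, 0);      /* sleep until the line is written */
        }
}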
++
++extern int force_mwait;
++
++extern void select_idle_routine(const struct cpuinfo_x86 *c);
++
++extern unsigned long boot_option_idle_override;
++
++extern void enable_sep_cpu(void);
++extern int sysenter_setup(void);
++
++/* Defined in head.S */
++extern struct desc_ptr early_gdt_descr;
++
++extern void cpu_set_gdt(int);
++extern void switch_to_new_gdt(void);
++extern void cpu_init(void);
++extern void init_gdt(int cpu);
++
++/* from system description table in BIOS. Mostly for MCA use, but
++ * others may find it useful. */
++extern unsigned int machine_id;
++extern unsigned int machine_submodel_id;
++extern unsigned int BIOS_revision;
++
++/* Boot loader type from the setup header */
++extern int bootloader_type;
++
++extern char ignore_fpu_irq;
++#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
++
++#define HAVE_ARCH_PICK_MMAP_LAYOUT 1
++#define ARCH_HAS_PREFETCHW
++#define ARCH_HAS_SPINLOCK_PREFETCH
++
++#ifdef CONFIG_X86_32
++#define BASE_PREFETCH ASM_NOP4
++#define ARCH_HAS_PREFETCH
++#else
++#define BASE_PREFETCH "prefetcht0 (%1)"
++#endif
++
++/* Prefetch instructions for Pentium III and AMD Athlon */
++/* It's not worth caring about 3dnow! prefetches for the K6
++   because they are microcoded there and very slow.
++   However, we don't currently do prefetches for pre-XP Athlons;
++   that should be fixed. */
++static inline void prefetch(const void *x)
++{
++ alternative_input(BASE_PREFETCH,
++ "prefetchnta (%1)",
++ X86_FEATURE_XMM,
++ "r" (x));
++}
++
++/* 3dnow! prefetch to get an exclusive cache line. Useful for
++ spinlocks to avoid one state transition in the cache coherency protocol. */
++static inline void prefetchw(const void *x)
++{
++ alternative_input(BASE_PREFETCH,
++ "prefetchw (%1)",
++ X86_FEATURE_3DNOW,
++ "r" (x));
++}
++
++#define spin_lock_prefetch(x) prefetchw(x)
+ #ifdef CONFIG_X86_32
+-# include "processor_32.h"
++/*
++ * User space process size: 3GB (default).
++ */
++#define TASK_SIZE (PAGE_OFFSET)
++#define STACK_TOP TASK_SIZE
++#define STACK_TOP_MAX STACK_TOP
++
++#define INIT_THREAD { \
++ .sp0 = sizeof(init_stack) + (long)&init_stack, \
++ .vm86_info = NULL, \
++ .sysenter_cs = __KERNEL_CS, \
++ .io_bitmap_ptr = NULL, \
++ .fs = __KERNEL_PERCPU, \
++}
++
++/*
++ * Note that the .io_bitmap member must be extra-big. This is because
++ * the CPU will access an additional byte beyond the end of the IO
++ * permission bitmap. The extra byte must be all 1 bits, and must
++ * be within the limit.
++ */
++#define INIT_TSS { \
++ .x86_tss = { \
++ .sp0 = sizeof(init_stack) + (long)&init_stack, \
++ .ss0 = __KERNEL_DS, \
++ .ss1 = __KERNEL_CS, \
++ .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \
++ }, \
++ .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, \
++}
++
++#define start_thread(regs, new_eip, new_esp) do { \
++ __asm__("movl %0,%%gs": :"r" (0)); \
++ regs->fs = 0; \
++ set_fs(USER_DS); \
++ regs->ds = __USER_DS; \
++ regs->es = __USER_DS; \
++ regs->ss = __USER_DS; \
++ regs->cs = __USER_CS; \
++ regs->ip = new_eip; \
++ regs->sp = new_esp; \
++} while (0)
++
++
++extern unsigned long thread_saved_pc(struct task_struct *tsk);
++
++#define THREAD_SIZE_LONGS (THREAD_SIZE/sizeof(unsigned long))
++#define KSTK_TOP(info) \
++({ \
++ unsigned long *__ptr = (unsigned long *)(info); \
++ (unsigned long)(&__ptr[THREAD_SIZE_LONGS]); \
++})
++
++/*
++ * The below -8 is to reserve 8 bytes on top of the ring0 stack.
++ * This is necessary to guarantee that the entire "struct pt_regs"
++ * is accessible even if the CPU hasn't stored the SS/ESP registers
++ * on the stack (an interrupt gate does not save these registers
++ * when switching to the same privilege ring).
++ * Therefore beware: accessing the ss/esp fields of
++ * "struct pt_regs" is possible, but they may contain
++ * completely wrong values.
++ */
++#define task_pt_regs(task) \
++({ \
++ struct pt_regs *__regs__; \
++ __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \
++ __regs__ - 1; \
++})
++
++#define KSTK_ESP(task) (task_pt_regs(task)->sp)
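KSTK_TOP() points at the end of the thread stack area, and task_pt_regs() steps back over the 8 reserved bytes plus one struct pt_regs. The arithmetic with assumed sizes, checkable in userspace:

#include <stdio.h>

#define THREAD_SIZE  8192UL           /* assumed: 8 KiB thread stacks */
#define PT_REGS_SIZE 60UL             /* stand-in for sizeof(struct pt_regs) */

int main(void)
{
        unsigned long stack = 0x100000UL;             /* task_stack_page() */
        unsigned long top   = stack + THREAD_SIZE;    /* KSTK_TOP() */
        unsigned long regs  = top - 8 - PT_REGS_SIZE; /* task_pt_regs() */
        printf("stack %#lx, top %#lx, pt_regs at %#lx\n", stack, top, regs);
        return 0;
}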
++
+ #else
+-# include "processor_64.h"
++/*
++ * User space process size: 47 bits minus one guard page.
++ */
++#define TASK_SIZE64 (0x800000000000UL - 4096)
++
++/* This decides where the kernel will search for a free chunk of vm
++ * space during mmap's.
++ */
++#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? \
++ 0xc0000000 : 0xFFFFe000)
++
++#define TASK_SIZE (test_thread_flag(TIF_IA32) ? \
++ IA32_PAGE_OFFSET : TASK_SIZE64)
++#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? \
++ IA32_PAGE_OFFSET : TASK_SIZE64)
++
++#define STACK_TOP TASK_SIZE
++#define STACK_TOP_MAX TASK_SIZE64
++
++#define INIT_THREAD { \
++ .sp0 = (unsigned long)&init_stack + sizeof(init_stack) \
++}
++
++#define INIT_TSS { \
++ .x86_tss.sp0 = (unsigned long)&init_stack + sizeof(init_stack) \
++}
++
++#define start_thread(regs, new_rip, new_rsp) do { \
++ asm volatile("movl %0,%%fs; movl %0,%%es; movl %0,%%ds": :"r" (0)); \
++ load_gs_index(0); \
++ (regs)->ip = (new_rip); \
++ (regs)->sp = (new_rsp); \
++ write_pda(oldrsp, (new_rsp)); \
++ (regs)->cs = __USER_CS; \
++ (regs)->ss = __USER_DS; \
++ (regs)->flags = 0x200; \
++ set_fs(USER_DS); \
++} while (0)
++
++/*
++ * Return saved PC of a blocked thread.
++ * What is this good for? It will always be the scheduler or ret_from_fork.
++ */
++#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.sp - 8))
++
++#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1)
++#define KSTK_ESP(tsk) -1 /* sorry. doesn't work for syscall. */
++#endif /* CONFIG_X86_64 */
++
++/* This decides where the kernel will search for a free chunk of vm
++ * space during mmap's.
++ */
++#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3))
++
++#define KSTK_EIP(task) (task_pt_regs(task)->ip)
++
+ #endif
+--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/processor_32.h 2009-02-16 16:18:36.000000000 +0100
++++ /dev/null 1970-01-01 00:00:00.000000000 +0000
+@@ -1,751 +0,0 @@
+-/*
+- * include/asm-i386/processor.h
+- *
+- * Copyright (C) 1994 Linus Torvalds
+- */
+-
+-#ifndef __ASM_I386_PROCESSOR_H
+-#define __ASM_I386_PROCESSOR_H
+-
+-#include <asm/vm86.h>
+-#include <asm/math_emu.h>
+-#include <asm/segment.h>
+-#include <asm/page.h>
+-#include <asm/types.h>
+-#include <asm/sigcontext.h>
+-#include <asm/cpufeature.h>
+-#include <asm/msr.h>
+-#include <asm/system.h>
+-#include <linux/cache.h>
+-#include <linux/threads.h>
+-#include <asm/percpu.h>
+-#include <linux/cpumask.h>
+-#include <linux/init.h>
+-#include <asm/processor-flags.h>
+-#include <xen/interface/physdev.h>
+-
+-/* flag for disabling the tsc */
+-#define tsc_disable 0
+-
+-struct desc_struct {
+- unsigned long a,b;
+-};
+-
+-#define desc_empty(desc) \
+- (!((desc)->a | (desc)->b))
+-
+-#define desc_equal(desc1, desc2) \
+- (((desc1)->a == (desc2)->a) && ((desc1)->b == (desc2)->b))
+-/*
+- * Default implementation of macro that returns current
+- * instruction pointer ("program counter").
+- */
+-#define current_text_addr() ({ void *pc; __asm__("movl $1f,%0\n1:":"=g" (pc)); pc; })
+-
+-/*
+- * CPU type and hardware bug flags. Kept separately for each CPU.
+- * Members of this structure are referenced in head.S, so think twice
+- * before touching them. [mj]
+- */
+-
+-struct cpuinfo_x86 {
+- __u8 x86; /* CPU family */
+- __u8 x86_vendor; /* CPU vendor */
+- __u8 x86_model;
+- __u8 x86_mask;
+- char wp_works_ok; /* It doesn't on 386's */
+- char hlt_works_ok; /* Problems on some 486Dx4's and old 386's */
+- char hard_math;
+- char rfu;
+- int cpuid_level; /* Maximum supported CPUID level, -1=no CPUID */
+- unsigned long x86_capability[NCAPINTS];
+- char x86_vendor_id[16];
+- char x86_model_id[64];
+- int x86_cache_size; /* in KB - valid for CPUS which support this
+- call */
+- int x86_cache_alignment; /* In bytes */
+- char fdiv_bug;
+- char f00f_bug;
+- char coma_bug;
+- char pad0;
+- int x86_power;
+- unsigned long loops_per_jiffy;
+-#ifdef CONFIG_SMP
+- cpumask_t llc_shared_map; /* cpus sharing the last level cache */
+-#endif
+- unsigned char x86_max_cores; /* cpuid returned max cores value */
+- unsigned char apicid;
+- unsigned short x86_clflush_size;
+-#ifdef CONFIG_SMP
+- unsigned char booted_cores; /* number of cores as seen by OS */
+- __u8 phys_proc_id; /* Physical processor id. */
+- __u8 cpu_core_id; /* Core id */
+- __u8 cpu_index; /* index into per_cpu list */
+-#endif
+-} __attribute__((__aligned__(SMP_CACHE_BYTES)));
+-
+-#define X86_VENDOR_INTEL 0
+-#define X86_VENDOR_CYRIX 1
+-#define X86_VENDOR_AMD 2
+-#define X86_VENDOR_UMC 3
+-#define X86_VENDOR_NEXGEN 4
+-#define X86_VENDOR_CENTAUR 5
+-#define X86_VENDOR_TRANSMETA 7
+-#define X86_VENDOR_NSC 8
+-#define X86_VENDOR_NUM 9
+-#define X86_VENDOR_UNKNOWN 0xff
+-
+-/*
+- * capabilities of CPUs
+- */
+-
+-extern struct cpuinfo_x86 boot_cpu_data;
+-extern struct cpuinfo_x86 new_cpu_data;
+-#ifndef CONFIG_X86_NO_TSS
+-extern struct tss_struct doublefault_tss;
+-DECLARE_PER_CPU(struct tss_struct, init_tss);
+-#endif
+-
+-#ifdef CONFIG_SMP
+-DECLARE_PER_CPU(struct cpuinfo_x86, cpu_info);
+-#define cpu_data(cpu) per_cpu(cpu_info, cpu)
+-#define current_cpu_data cpu_data(smp_processor_id())
+-#else
+-#define cpu_data(cpu) boot_cpu_data
+-#define current_cpu_data boot_cpu_data
+-#endif
+-
+-/*
+- * the following now lives in the per cpu area:
+- * extern int cpu_llc_id[NR_CPUS];
+- */
+-DECLARE_PER_CPU(u8, cpu_llc_id);
+-extern char ignore_fpu_irq;
+-
+-void __init cpu_detect(struct cpuinfo_x86 *c);
+-
+-extern void identify_boot_cpu(void);
+-extern void identify_secondary_cpu(struct cpuinfo_x86 *);
+-extern void print_cpu_info(struct cpuinfo_x86 *);
+-extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
+-extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
+-extern unsigned short num_cache_leaves;
+-
+-#ifdef CONFIG_X86_HT
+-extern void detect_ht(struct cpuinfo_x86 *c);
+-#else
+-static inline void detect_ht(struct cpuinfo_x86 *c) {}
+-#endif
+-
+-static inline void xen_cpuid(unsigned int *eax, unsigned int *ebx,
+- unsigned int *ecx, unsigned int *edx)
+-{
+- /* ecx is often an input as well as an output. */
+- __asm__(XEN_CPUID
+- : "=a" (*eax),
+- "=b" (*ebx),
+- "=c" (*ecx),
+- "=d" (*edx)
+- : "0" (*eax), "2" (*ecx));
+-}
+-
+-#define load_cr3(pgdir) write_cr3(__pa(pgdir))
+-
+-/*
+- * Save the cr4 feature set we're using (ie
+- * Pentium 4MB enable and PPro Global page
+- * enable), so that any CPU's that boot up
+- * after us can get the correct flags.
+- */
+-extern unsigned long mmu_cr4_features;
+-
+-static inline void set_in_cr4 (unsigned long mask)
+-{
+- unsigned cr4;
+- mmu_cr4_features |= mask;
+- cr4 = read_cr4();
+- cr4 |= mask;
+- write_cr4(cr4);
+-}
+-
+-static inline void clear_in_cr4 (unsigned long mask)
+-{
+- unsigned cr4;
+- mmu_cr4_features &= ~mask;
+- cr4 = read_cr4();
+- cr4 &= ~mask;
+- write_cr4(cr4);
+-}
+-
+-/* Stop speculative execution */
+-static inline void sync_core(void)
+-{
+- int tmp;
+- asm volatile("cpuid" : "=a" (tmp) : "0" (1) : "ebx","ecx","edx","memory");
+-}
+-
+-static inline void __monitor(const void *eax, unsigned long ecx,
+- unsigned long edx)
+-{
+- /* "monitor %eax,%ecx,%edx;" */
+- asm volatile(
+- ".byte 0x0f,0x01,0xc8;"
+- : :"a" (eax), "c" (ecx), "d"(edx));
+-}
+-
+-static inline void __mwait(unsigned long eax, unsigned long ecx)
+-{
+- /* "mwait %eax,%ecx;" */
+- asm volatile(
+- ".byte 0x0f,0x01,0xc9;"
+- : :"a" (eax), "c" (ecx));
+-}
+-
+-extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
+-
+-/* from system description table in BIOS. Mostly for MCA use, but
+-others may find it useful. */
+-extern unsigned int machine_id;
+-extern unsigned int machine_submodel_id;
+-extern unsigned int BIOS_revision;
+-extern unsigned int mca_pentium_flag;
+-
+-/* Boot loader type from the setup header */
+-extern int bootloader_type;
+-
+-/*
+- * User space process size: 3GB (default).
+- */
+-#define TASK_SIZE (PAGE_OFFSET)
+-
+-/* This decides where the kernel will search for a free chunk of vm
+- * space during mmap's.
+- */
+-#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3))
+-
+-#define HAVE_ARCH_PICK_MMAP_LAYOUT
+-
+-extern void hard_disable_TSC(void);
+-extern void disable_TSC(void);
+-extern void hard_enable_TSC(void);
+-
+-/*
+- * Size of io_bitmap.
+- */
+-#define IO_BITMAP_BITS 65536
+-#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
+-#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
+-#ifndef CONFIG_X86_NO_TSS
+-#define IO_BITMAP_OFFSET offsetof(struct tss_struct,io_bitmap)
+-#endif
+-#define INVALID_IO_BITMAP_OFFSET 0x8000
+-#define INVALID_IO_BITMAP_OFFSET_LAZY 0x9000
+-
+-struct i387_fsave_struct {
+- long cwd;
+- long swd;
+- long twd;
+- long fip;
+- long fcs;
+- long foo;
+- long fos;
+- long st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */
+- long status; /* software status information */
+-};
+-
+-struct i387_fxsave_struct {
+- unsigned short cwd;
+- unsigned short swd;
+- unsigned short twd;
+- unsigned short fop;
+- long fip;
+- long fcs;
+- long foo;
+- long fos;
+- long mxcsr;
+- long mxcsr_mask;
+- long st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
+- long xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */
+- long padding[56];
+-} __attribute__ ((aligned (16)));
+-
+-struct i387_soft_struct {
+- long cwd;
+- long swd;
+- long twd;
+- long fip;
+- long fcs;
+- long foo;
+- long fos;
+- long st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */
+- unsigned char ftop, changed, lookahead, no_update, rm, alimit;
+- struct info *info;
+- unsigned long entry_eip;
+-};
+-
+-union i387_union {
+- struct i387_fsave_struct fsave;
+- struct i387_fxsave_struct fxsave;
+- struct i387_soft_struct soft;
+-};
+-
+-typedef struct {
+- unsigned long seg;
+-} mm_segment_t;
+-
+-struct thread_struct;
+-
+-#ifndef CONFIG_X86_NO_TSS
+-/* This is the TSS defined by the hardware. */
+-struct i386_hw_tss {
+- unsigned short back_link,__blh;
+- unsigned long esp0;
+- unsigned short ss0,__ss0h;
+- unsigned long esp1;
+- unsigned short ss1,__ss1h; /* ss1 is used to cache MSR_IA32_SYSENTER_CS */
+- unsigned long esp2;
+- unsigned short ss2,__ss2h;
+- unsigned long __cr3;
+- unsigned long eip;
+- unsigned long eflags;
+- unsigned long eax,ecx,edx,ebx;
+- unsigned long esp;
+- unsigned long ebp;
+- unsigned long esi;
+- unsigned long edi;
+- unsigned short es, __esh;
+- unsigned short cs, __csh;
+- unsigned short ss, __ssh;
+- unsigned short ds, __dsh;
+- unsigned short fs, __fsh;
+- unsigned short gs, __gsh;
+- unsigned short ldt, __ldth;
+- unsigned short trace, io_bitmap_base;
+-} __attribute__((packed));
+-
+-struct tss_struct {
+- struct i386_hw_tss x86_tss;
+-
+- /*
+- * The extra 1 is there because the CPU will access an
+- * additional byte beyond the end of the IO permission
+- * bitmap. The extra byte must be all 1 bits, and must
+- * be within the limit.
+- */
+- unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
+- /*
+- * Cache the current maximum and the last task that used the bitmap:
+- */
+- unsigned long io_bitmap_max;
+- struct thread_struct *io_bitmap_owner;
+- /*
+- * pads the TSS to be cacheline-aligned (size is 0x100)
+- */
+- unsigned long __cacheline_filler[35];
+- /*
+- * .. and then another 0x100 bytes for emergency kernel stack
+- */
+- unsigned long stack[64];
+-} __attribute__((packed));
+-#endif
+-
+-#define ARCH_MIN_TASKALIGN 16
+-
+-struct thread_struct {
+-/* cached TLS descriptors. */
+- struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
+- unsigned long esp0;
+- unsigned long sysenter_cs;
+- unsigned long eip;
+- unsigned long esp;
+- unsigned long fs;
+- unsigned long gs;
+-/* Hardware debugging registers */
+- unsigned long debugreg[8]; /* %%db0-7 debug registers */
+-/* fault info */
+- unsigned long cr2, trap_no, error_code;
+-/* floating point info */
+- union i387_union i387;
+-/* virtual 86 mode info */
+- struct vm86_struct __user * vm86_info;
+- unsigned long screen_bitmap;
+- unsigned long v86flags, v86mask, saved_esp0;
+- unsigned int saved_fs, saved_gs;
+-/* IO permissions */
+- unsigned long *io_bitmap_ptr;
+- unsigned long iopl;
+-/* max allowed port in the bitmap, in bytes: */
+- unsigned long io_bitmap_max;
+-};
+-
+-#define INIT_THREAD { \
+- .esp0 = sizeof(init_stack) + (long)&init_stack, \
+- .vm86_info = NULL, \
+- .sysenter_cs = __KERNEL_CS, \
+- .io_bitmap_ptr = NULL, \
+- .fs = __KERNEL_PERCPU, \
+-}
+-
+-/*
+- * Note that the .io_bitmap member must be extra-big. This is because
+- * the CPU will access an additional byte beyond the end of the IO
+- * permission bitmap. The extra byte must be all 1 bits, and must
+- * be within the limit.
+- */
+-#define INIT_TSS { \
+- .x86_tss = { \
+- .esp0 = sizeof(init_stack) + (long)&init_stack, \
+- .ss0 = __KERNEL_DS, \
+- .ss1 = __KERNEL_CS, \
+- .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \
+- }, \
+- .io_bitmap = { [ 0 ... IO_BITMAP_LONGS] = ~0 }, \
+-}
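The arithmetic above gives 8192 bytes of bitmap (2048 longs on 32-bit) covering ports 0-65535, with a clear bit meaning access is granted. A minimal user-space sketch of the per-port test the CPU performs against this bitmap (the bitmap pointer here is a stand-in, not kernel state):

	#include <limits.h>
	#include <stdbool.h>

	#define BITS_PER_LONG (sizeof(long) * CHAR_BIT)

	/* A clear bit in the TSS I/O permission bitmap grants access to the port. */
	static bool io_port_allowed(const unsigned long *bitmap, unsigned int port)
	{
		return !(bitmap[port / BITS_PER_LONG] & (1UL << (port % BITS_PER_LONG)));
	}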
+-
+-#define start_thread(regs, new_eip, new_esp) do { \
+- __asm__("movl %0,%%gs": :"r" (0)); \
+- regs->xfs = 0; \
+- set_fs(USER_DS); \
+- regs->xds = __USER_DS; \
+- regs->xes = __USER_DS; \
+- regs->xss = __USER_DS; \
+- regs->xcs = __USER_CS; \
+- regs->eip = new_eip; \
+- regs->esp = new_esp; \
+-} while (0)
+-
+-/* Forward declaration, a strange C thing */
+-struct task_struct;
+-struct mm_struct;
+-
+-/* Free all resources held by a thread. */
+-extern void release_thread(struct task_struct *);
+-
+-/* Prepare to copy thread state - unlazy all lazy status */
+-extern void prepare_to_copy(struct task_struct *tsk);
+-
+-/*
+- * create a kernel thread without removing it from tasklists
+- */
+-extern int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags);
+-
+-extern unsigned long thread_saved_pc(struct task_struct *tsk);
+-void show_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack);
+-
+-unsigned long get_wchan(struct task_struct *p);
+-
+-#define THREAD_SIZE_LONGS (THREAD_SIZE/sizeof(unsigned long))
+-#define KSTK_TOP(info) \
+-({ \
+- unsigned long *__ptr = (unsigned long *)(info); \
+- (unsigned long)(&__ptr[THREAD_SIZE_LONGS]); \
+-})
+-
+-/*
+- * The below -8 is to reserve 8 bytes on top of the ring0 stack.
+- * This is necessary to guarantee that the entire "struct pt_regs"
+- * is accessible even if the CPU hasn't stored the SS/ESP registers
+- * on the stack (interrupt gate does not save these registers
+- * when switching to the same priv ring).
+- * Therefore beware: accessing the xss/esp fields of the
+- * "struct pt_regs" is possible, but they may contain the
+- * completely wrong values.
+- */
+-#define task_pt_regs(task) \
+-({ \
+- struct pt_regs *__regs__; \
+- __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \
+- __regs__ - 1; \
+-})
+-
+-#define KSTK_EIP(task) (task_pt_regs(task)->eip)
+-#define KSTK_ESP(task) (task_pt_regs(task)->esp)
+-
+-
+-struct microcode_header {
+- unsigned int hdrver;
+- unsigned int rev;
+- unsigned int date;
+- unsigned int sig;
+- unsigned int cksum;
+- unsigned int ldrver;
+- unsigned int pf;
+- unsigned int datasize;
+- unsigned int totalsize;
+- unsigned int reserved[3];
+-};
+-
+-struct microcode {
+- struct microcode_header hdr;
+- unsigned int bits[0];
+-};
+-
+-typedef struct microcode microcode_t;
+-typedef struct microcode_header microcode_header_t;
+-
+-/* microcode format is extended from prescott processors */
+-struct extended_signature {
+- unsigned int sig;
+- unsigned int pf;
+- unsigned int cksum;
+-};
+-
+-struct extended_sigtable {
+- unsigned int count;
+- unsigned int cksum;
+- unsigned int reserved[3];
+- struct extended_signature sigs[0];
+-};
+-
+-/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
+-static inline void rep_nop(void)
+-{
+- __asm__ __volatile__("rep;nop": : :"memory");
+-}
+-
+-#define cpu_relax() rep_nop()
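The "rep; nop" sequence encodes the PAUSE instruction, which tells the CPU the loop is a spin-wait so it can throttle and avoid memory-order mis-speculation. A hedged user-space sketch of the idiom (the flag variable is illustrative only):

	#include <stdatomic.h>

	static atomic_int flag;

	static inline void relax(void)
	{
		__asm__ __volatile__("rep; nop" ::: "memory");	/* PAUSE */
	}

	void wait_for_flag(void)
	{
		/* Spin politely until another thread publishes the flag. */
		while (!atomic_load_explicit(&flag, memory_order_acquire))
			relax();
	}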
+-
+-#ifndef CONFIG_X86_NO_TSS
+-static inline void native_load_esp0(struct tss_struct *tss, struct thread_struct *thread)
+-{
+- tss->x86_tss.esp0 = thread->esp0;
+- /* This can only happen when SEP is enabled, no need to test "SEP"arately */
+- if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
+- tss->x86_tss.ss1 = thread->sysenter_cs;
+- wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
+- }
+-}
+-#else
+-#define xen_load_esp0(tss, thread) do { \
+- if (HYPERVISOR_stack_switch(__KERNEL_DS, (thread)->esp0)) \
+- BUG(); \
+-} while (0)
+-#endif
+-
+-
+-static inline unsigned long xen_get_debugreg(int regno)
+-{
+- return HYPERVISOR_get_debugreg(regno);
+-}
+-
+-static inline void xen_set_debugreg(int regno, unsigned long value)
+-{
+- WARN_ON(HYPERVISOR_set_debugreg(regno, value));
+-}
+-
+-/*
+- * Set IOPL bits in EFLAGS from given mask
+- */
+-static inline void xen_set_iopl_mask(unsigned mask)
+-{
+- struct physdev_set_iopl set_iopl;
+-
+- /* Force the change at ring 0. */
+- set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3;
+- WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
+-}
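The EFLAGS IOPL field occupies bits 12-13, which is what the (mask >> 12) & 3 above extracts; a zero mask is forced to IOPL 1 so a Xen guest kernel, which runs in ring 1, keeps its port access. A quick sanity check of that mapping, as an assumption-level sketch:

	#include <assert.h>

	/* Mirrors the mask-to-iopl conversion in xen_set_iopl_mask() above. */
	static unsigned int eflags_mask_to_iopl(unsigned int mask)
	{
		return (mask == 0) ? 1 : (mask >> 12) & 3;
	}

	int main(void)
	{
		assert(eflags_mask_to_iopl(0) == 1);		/* default: guest kernel ring */
		assert(eflags_mask_to_iopl(0x3000) == 3);	/* iopl(3): user-space I/O */
		return 0;
	}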
+-
+-
+-#define paravirt_enabled() 0
+-#define __cpuid xen_cpuid
+-
+-#define load_esp0 xen_load_esp0
+-
+-/*
+- * These special macros can be used to get or set a debugging register
+- */
+-#define get_debugreg(var, register) \
+- (var) = xen_get_debugreg(register)
+-#define set_debugreg(value, register) \
+- xen_set_debugreg(register, value)
+-
+-#define set_iopl_mask xen_set_iopl_mask
+-
+-/*
+- * Generic CPUID function
+- * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
+- * resulting in stale register contents being returned.
+- */
+-static inline void cpuid(unsigned int op,
+- unsigned int *eax, unsigned int *ebx,
+- unsigned int *ecx, unsigned int *edx)
+-{
+- *eax = op;
+- *ecx = 0;
+- __cpuid(eax, ebx, ecx, edx);
+-}
+-
+-/* Some CPUID calls want 'count' to be placed in ecx */
+-static inline void cpuid_count(unsigned int op, int count,
+- unsigned int *eax, unsigned int *ebx,
+- unsigned int *ecx, unsigned int *edx)
+-{
+- *eax = op;
+- *ecx = count;
+- __cpuid(eax, ebx, ecx, edx);
+-}
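For reference, the same leaf-0 query can be made from user space with the compiler-provided <cpuid.h> helper rather than the kernel macro; a sketch (__get_cpuid is the GCC/Clang builtin wrapper):

	#include <stdio.h>
	#include <string.h>
	#include <cpuid.h>

	int main(void)
	{
		unsigned int eax, ebx, ecx, edx;
		char vendor[13];

		if (!__get_cpuid(0, &eax, &ebx, &ecx, &edx))
			return 1;

		/* Leaf 0 returns the vendor string in ebx, edx, ecx (in that order). */
		memcpy(vendor + 0, &ebx, 4);
		memcpy(vendor + 4, &edx, 4);
		memcpy(vendor + 8, &ecx, 4);
		vendor[12] = '\0';
		printf("max leaf %u, vendor %s\n", eax, vendor);
		return 0;
	}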
+-
+-/*
+- * CPUID functions returning a single datum
+- */
+-static inline unsigned int cpuid_eax(unsigned int op)
+-{
+- unsigned int eax, ebx, ecx, edx;
+-
+- cpuid(op, &eax, &ebx, &ecx, &edx);
+- return eax;
+-}
+-static inline unsigned int cpuid_ebx(unsigned int op)
+-{
+- unsigned int eax, ebx, ecx, edx;
+-
+- cpuid(op, &eax, &ebx, &ecx, &edx);
+- return ebx;
+-}
+-static inline unsigned int cpuid_ecx(unsigned int op)
+-{
+- unsigned int eax, ebx, ecx, edx;
+-
+- cpuid(op, &eax, &ebx, &ecx, &edx);
+- return ecx;
+-}
+-static inline unsigned int cpuid_edx(unsigned int op)
+-{
+- unsigned int eax, ebx, ecx, edx;
+-
+- cpuid(op, &eax, &ebx, &ecx, &edx);
+- return edx;
+-}
+-
+-/* generic versions from gas */
+-#define GENERIC_NOP1 ".byte 0x90\n"
+-#define GENERIC_NOP2 ".byte 0x89,0xf6\n"
+-#define GENERIC_NOP3 ".byte 0x8d,0x76,0x00\n"
+-#define GENERIC_NOP4 ".byte 0x8d,0x74,0x26,0x00\n"
+-#define GENERIC_NOP5 GENERIC_NOP1 GENERIC_NOP4
+-#define GENERIC_NOP6 ".byte 0x8d,0xb6,0x00,0x00,0x00,0x00\n"
+-#define GENERIC_NOP7 ".byte 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00\n"
+-#define GENERIC_NOP8 GENERIC_NOP1 GENERIC_NOP7
+-
+-/* Opteron nops */
+-#define K8_NOP1 GENERIC_NOP1
+-#define K8_NOP2 ".byte 0x66,0x90\n"
+-#define K8_NOP3 ".byte 0x66,0x66,0x90\n"
+-#define K8_NOP4 ".byte 0x66,0x66,0x66,0x90\n"
+-#define K8_NOP5 K8_NOP3 K8_NOP2
+-#define K8_NOP6 K8_NOP3 K8_NOP3
+-#define K8_NOP7 K8_NOP4 K8_NOP3
+-#define K8_NOP8 K8_NOP4 K8_NOP4
+-
+-/* K7 nops */
+-/* uses eax dependencies (arbitrary choice) */
+-#define K7_NOP1 GENERIC_NOP1
+-#define K7_NOP2 ".byte 0x8b,0xc0\n"
+-#define K7_NOP3 ".byte 0x8d,0x04,0x20\n"
+-#define K7_NOP4 ".byte 0x8d,0x44,0x20,0x00\n"
+-#define K7_NOP5 K7_NOP4 ASM_NOP1
+-#define K7_NOP6 ".byte 0x8d,0x80,0,0,0,0\n"
+-#define K7_NOP7 ".byte 0x8D,0x04,0x05,0,0,0,0\n"
+-#define K7_NOP8 K7_NOP7 ASM_NOP1
+-
+-/* P6 nops */
+-/* uses eax dependencies (Intel-recommended choice) */
+-#define P6_NOP1 GENERIC_NOP1
+-#define P6_NOP2 ".byte 0x66,0x90\n"
+-#define P6_NOP3 ".byte 0x0f,0x1f,0x00\n"
+-#define P6_NOP4 ".byte 0x0f,0x1f,0x40,0\n"
+-#define P6_NOP5 ".byte 0x0f,0x1f,0x44,0x00,0\n"
+-#define P6_NOP6 ".byte 0x66,0x0f,0x1f,0x44,0x00,0\n"
+-#define P6_NOP7 ".byte 0x0f,0x1f,0x80,0,0,0,0\n"
+-#define P6_NOP8 ".byte 0x0f,0x1f,0x84,0x00,0,0,0,0\n"
+-
+-#ifdef CONFIG_MK8
+-#define ASM_NOP1 K8_NOP1
+-#define ASM_NOP2 K8_NOP2
+-#define ASM_NOP3 K8_NOP3
+-#define ASM_NOP4 K8_NOP4
+-#define ASM_NOP5 K8_NOP5
+-#define ASM_NOP6 K8_NOP6
+-#define ASM_NOP7 K8_NOP7
+-#define ASM_NOP8 K8_NOP8
+-#elif defined(CONFIG_MK7)
+-#define ASM_NOP1 K7_NOP1
+-#define ASM_NOP2 K7_NOP2
+-#define ASM_NOP3 K7_NOP3
+-#define ASM_NOP4 K7_NOP4
+-#define ASM_NOP5 K7_NOP5
+-#define ASM_NOP6 K7_NOP6
+-#define ASM_NOP7 K7_NOP7
+-#define ASM_NOP8 K7_NOP8
+-#elif defined(CONFIG_M686) || defined(CONFIG_MPENTIUMII) || \
+- defined(CONFIG_MPENTIUMIII) || defined(CONFIG_MPENTIUMM) || \
+- defined(CONFIG_MCORE2) || defined(CONFIG_PENTIUM4)
+-#define ASM_NOP1 P6_NOP1
+-#define ASM_NOP2 P6_NOP2
+-#define ASM_NOP3 P6_NOP3
+-#define ASM_NOP4 P6_NOP4
+-#define ASM_NOP5 P6_NOP5
+-#define ASM_NOP6 P6_NOP6
+-#define ASM_NOP7 P6_NOP7
+-#define ASM_NOP8 P6_NOP8
+-#else
+-#define ASM_NOP1 GENERIC_NOP1
+-#define ASM_NOP2 GENERIC_NOP2
+-#define ASM_NOP3 GENERIC_NOP3
+-#define ASM_NOP4 GENERIC_NOP4
+-#define ASM_NOP5 GENERIC_NOP5
+-#define ASM_NOP6 GENERIC_NOP6
+-#define ASM_NOP7 GENERIC_NOP7
+-#define ASM_NOP8 GENERIC_NOP8
+-#endif
+-
+-#define ASM_NOP_MAX 8
+-
+-/* Prefetch instructions for Pentium III and AMD Athlon */
+-/* It's not worth caring about 3dnow! prefetches for the K6
+- because they are microcoded there and very slow.
+- However, we don't do prefetches for pre-XP Athlons currently.
+- That should be fixed. */
+-#define ARCH_HAS_PREFETCH
+-static inline void prefetch(const void *x)
+-{
+- alternative_input(ASM_NOP4,
+- "prefetchnta (%1)",
+- X86_FEATURE_XMM,
+- "r" (x));
+-}
+-
+-#define ARCH_HAS_PREFETCH
+-#define ARCH_HAS_PREFETCHW
+-#define ARCH_HAS_SPINLOCK_PREFETCH
+-
+-/* 3dnow! prefetch to get an exclusive cache line. Useful for
+- spinlocks to avoid one state transition in the cache coherency protocol. */
+-static inline void prefetchw(const void *x)
+-{
+- alternative_input(ASM_NOP4,
+- "prefetchw (%1)",
+- X86_FEATURE_3DNOW,
+- "r" (x));
+-}
+-#define spin_lock_prefetch(x) prefetchw(x)
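Typical use is to prefetch one node ahead while chasing pointers; a hedged user-space equivalent with GCC's __builtin_prefetch (the list type is hypothetical):

	struct node {
		struct node *next;
		long payload;
	};

	long sum_list(const struct node *n)
	{
		long sum = 0;

		for (; n; n = n->next) {
			if (n->next)
				__builtin_prefetch(n->next, 0, 1);	/* read, low temporal locality */
			sum += n->payload;
		}
		return sum;
	}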
+-
+-extern void select_idle_routine(const struct cpuinfo_x86 *c);
+-
+-#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
+-
+-extern unsigned long boot_option_idle_override;
+-extern void enable_sep_cpu(void);
+-extern int sysenter_setup(void);
+-
+-/* Defined in head.S */
+-extern struct Xgt_desc_struct early_gdt_descr;
+-
+-extern void cpu_set_gdt(int);
+-extern void switch_to_new_gdt(void);
+-extern void cpu_init(void);
+-extern void init_gdt(int cpu);
+-
+-extern int force_mwait;
+-
+-#endif /* __ASM_I386_PROCESSOR_H */
+--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/processor_64.h 2009-02-16 16:18:36.000000000 +0100
++++ /dev/null 1970-01-01 00:00:00.000000000 +0000
+@@ -1,461 +0,0 @@
+-/*
+- * include/asm-x86_64/processor.h
+- *
+- * Copyright (C) 1994 Linus Torvalds
+- */
+-
+-#ifndef __ASM_X86_64_PROCESSOR_H
+-#define __ASM_X86_64_PROCESSOR_H
+-
+-#include <asm/segment.h>
+-#include <asm/page.h>
+-#include <asm/types.h>
+-#include <asm/sigcontext.h>
+-#include <asm/cpufeature.h>
+-#include <linux/threads.h>
+-#include <asm/msr.h>
+-#include <asm/current.h>
+-#include <asm/system.h>
+-#include <asm/mmsegment.h>
+-#include <asm/percpu.h>
+-#include <linux/personality.h>
+-#include <linux/cpumask.h>
+-#include <asm/processor-flags.h>
+-
+-#define TF_MASK 0x00000100
+-#define IF_MASK 0x00000200
+-#define IOPL_MASK 0x00003000
+-#define NT_MASK 0x00004000
+-#define VM_MASK 0x00020000
+-#define AC_MASK 0x00040000
+-#define VIF_MASK 0x00080000 /* virtual interrupt flag */
+-#define VIP_MASK 0x00100000 /* virtual interrupt pending */
+-#define ID_MASK 0x00200000
+-
+-#define desc_empty(desc) \
+- (!((desc)->a | (desc)->b))
+-
+-#define desc_equal(desc1, desc2) \
+- (((desc1)->a == (desc2)->a) && ((desc1)->b == (desc2)->b))
+-
+-/*
+- * Default implementation of macro that returns current
+- * instruction pointer ("program counter").
+- */
+-#define current_text_addr() ({ void *pc; asm volatile("leaq 1f(%%rip),%0\n1:":"=r"(pc)); pc; })
+-
+-/*
+- * CPU type and hardware bug flags. Kept separately for each CPU.
+- */
+-
+-struct cpuinfo_x86 {
+- __u8 x86; /* CPU family */
+- __u8 x86_vendor; /* CPU vendor */
+- __u8 x86_model;
+- __u8 x86_mask;
+- int cpuid_level; /* Maximum supported CPUID level, -1=no CPUID */
+- __u32 x86_capability[NCAPINTS];
+- char x86_vendor_id[16];
+- char x86_model_id[64];
+- int x86_cache_size; /* in KB */
+- int x86_clflush_size;
+- int x86_cache_alignment;
+- int x86_tlbsize; /* number of 4K pages in DTLB/ITLB combined */
+- __u8 x86_virt_bits, x86_phys_bits;
+- __u8 x86_max_cores; /* cpuid returned max cores value */
+- __u32 x86_power;
+- __u32 extended_cpuid_level; /* Max extended CPUID function supported */
+- unsigned long loops_per_jiffy;
+-#ifdef CONFIG_SMP
+- cpumask_t llc_shared_map; /* cpus sharing the last level cache */
+-#endif
+- __u8 apicid;
+-#ifdef CONFIG_SMP
+- __u8 booted_cores; /* number of cores as seen by OS */
+- __u8 phys_proc_id; /* Physical Processor id. */
+- __u8 cpu_core_id; /* Core id. */
+- __u8 cpu_index; /* index into per_cpu list */
+-#endif
+-} ____cacheline_aligned;
+-
+-#define X86_VENDOR_INTEL 0
+-#define X86_VENDOR_CYRIX 1
+-#define X86_VENDOR_AMD 2
+-#define X86_VENDOR_UMC 3
+-#define X86_VENDOR_NEXGEN 4
+-#define X86_VENDOR_CENTAUR 5
+-#define X86_VENDOR_TRANSMETA 7
+-#define X86_VENDOR_NUM 8
+-#define X86_VENDOR_UNKNOWN 0xff
+-
+-#ifdef CONFIG_SMP
+-DECLARE_PER_CPU(struct cpuinfo_x86, cpu_info);
+-#define cpu_data(cpu) per_cpu(cpu_info, cpu)
+-#define current_cpu_data cpu_data(smp_processor_id())
+-#else
+-#define cpu_data(cpu) boot_cpu_data
+-#define current_cpu_data boot_cpu_data
+-#endif
+-
+-extern char ignore_irq13;
+-
+-extern void identify_cpu(struct cpuinfo_x86 *);
+-extern void print_cpu_info(struct cpuinfo_x86 *);
+-extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
+-extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
+-extern unsigned short num_cache_leaves;
+-
+-/*
+- * Save the cr4 feature set we're using (ie
+- * Pentium 4MB enable and PPro Global page
+- * enable), so that any CPUs that boot up
+- * after us can get the correct flags.
+- */
+-extern unsigned long mmu_cr4_features;
+-
+-static inline void set_in_cr4 (unsigned long mask)
+-{
+- mmu_cr4_features |= mask;
+- __asm__("movq %%cr4,%%rax\n\t"
+- "orq %0,%%rax\n\t"
+- "movq %%rax,%%cr4\n"
+- : : "irg" (mask)
+- :"ax");
+-}
+-
+-static inline void clear_in_cr4 (unsigned long mask)
+-{
+- mmu_cr4_features &= ~mask;
+- __asm__("movq %%cr4,%%rax\n\t"
+- "andq %0,%%rax\n\t"
+- "movq %%rax,%%cr4\n"
+- : : "irg" (~mask)
+- :"ax");
+-}
+-
+-
+-/*
+- * User space process size: 47 bits minus one guard page.
+- */
+-#define TASK_SIZE64 (0x800000000000UL - 4096)
+-
+-/* This decides where the kernel will search for a free chunk of vm
+- * space during mmap's.
+- */
+-#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? 0xc0000000 : 0xFFFFe000)
+-
+-#define TASK_SIZE (test_thread_flag(TIF_IA32) ? IA32_PAGE_OFFSET : TASK_SIZE64)
+-#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? IA32_PAGE_OFFSET : TASK_SIZE64)
+-
+-#define TASK_UNMAPPED_BASE PAGE_ALIGN(TASK_SIZE/3)
+-
+-/*
+- * Size of io_bitmap.
+- */
+-#define IO_BITMAP_BITS 65536
+-#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
+-#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
+-#ifndef CONFIG_X86_NO_TSS
+-#define IO_BITMAP_OFFSET offsetof(struct tss_struct,io_bitmap)
+-#endif
+-#define INVALID_IO_BITMAP_OFFSET 0x8000
+-
+-struct i387_fxsave_struct {
+- u16 cwd;
+- u16 swd;
+- u16 twd;
+- u16 fop;
+- u64 rip;
+- u64 rdp;
+- u32 mxcsr;
+- u32 mxcsr_mask;
+- u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
+- u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */
+- u32 padding[24];
+-} __attribute__ ((aligned (16)));
+-
+-union i387_union {
+- struct i387_fxsave_struct fxsave;
+-};
+-
+-#ifndef CONFIG_X86_NO_TSS
+-struct tss_struct {
+- u32 reserved1;
+- u64 rsp0;
+- u64 rsp1;
+- u64 rsp2;
+- u64 reserved2;
+- u64 ist[7];
+- u32 reserved3;
+- u32 reserved4;
+- u16 reserved5;
+- u16 io_bitmap_base;
+- /*
+- * The extra 1 is there because the CPU will access an
+- * additional byte beyond the end of the IO permission
+- * bitmap. The extra byte must be all 1 bits, and must
+- * be within the limit. Thus we have:
+- *
+- * 128 bytes, the bitmap itself, for ports 0..0x3ff
+- * 8 bytes, for an extra "long" of ~0UL
+- */
+- unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
+-} __attribute__((packed)) ____cacheline_aligned;
+-
+-DECLARE_PER_CPU(struct tss_struct,init_tss);
+-#endif
+-
+-
+-extern struct cpuinfo_x86 boot_cpu_data;
+-#ifndef CONFIG_X86_NO_TSS
+-/* Save the original ist values for checking stack pointers during debugging */
+-struct orig_ist {
+- unsigned long ist[7];
+-};
+-DECLARE_PER_CPU(struct orig_ist, orig_ist);
+-#endif
+-
+-#ifdef CONFIG_X86_VSMP
+-#define ARCH_MIN_TASKALIGN (1 << INTERNODE_CACHE_SHIFT)
+-#define ARCH_MIN_MMSTRUCT_ALIGN (1 << INTERNODE_CACHE_SHIFT)
+-#else
+-#define ARCH_MIN_TASKALIGN 16
+-#define ARCH_MIN_MMSTRUCT_ALIGN 0
+-#endif
+-
+-struct thread_struct {
+- unsigned long rsp0;
+- unsigned long rsp;
+- unsigned long userrsp; /* Copy from PDA */
+- unsigned long fs;
+- unsigned long gs;
+- unsigned short es, ds, fsindex, gsindex;
+-/* Hardware debugging registers */
+- unsigned long debugreg0;
+- unsigned long debugreg1;
+- unsigned long debugreg2;
+- unsigned long debugreg3;
+- unsigned long debugreg6;
+- unsigned long debugreg7;
+-/* fault info */
+- unsigned long cr2, trap_no, error_code;
+-/* floating point info */
+- union i387_union i387 __attribute__((aligned(16)));
+-/* IO permissions. the bitmap could be moved into the GDT, that would make
+- switch faster for a limited number of ioperm using tasks. -AK */
+- int ioperm;
+- unsigned long *io_bitmap_ptr;
+- unsigned io_bitmap_max;
+-/* cached TLS descriptors. */
+- u64 tls_array[GDT_ENTRY_TLS_ENTRIES];
+- unsigned int iopl;
+-} __attribute__((aligned(16)));
+-
+-#define INIT_THREAD { \
+- .rsp0 = (unsigned long)&init_stack + sizeof(init_stack) \
+-}
+-
+-#ifndef CONFIG_X86_NO_TSS
+-#define INIT_TSS { \
+- .rsp0 = (unsigned long)&init_stack + sizeof(init_stack) \
+-}
+-#endif
+-
+-#define INIT_MMAP \
+-{ &init_mm, 0, 0, NULL, PAGE_SHARED, VM_READ | VM_WRITE | VM_EXEC, 1, NULL, NULL }
+-
+-#define start_thread(regs,new_rip,new_rsp) do { \
+- asm volatile("movl %0,%%fs; movl %0,%%es; movl %0,%%ds": :"r" (0)); \
+- load_gs_index(0); \
+- (regs)->rip = (new_rip); \
+- (regs)->rsp = (new_rsp); \
+- write_pda(oldrsp, (new_rsp)); \
+- (regs)->cs = __USER_CS; \
+- (regs)->ss = __USER_DS; \
+- (regs)->eflags = 0x200; \
+- set_fs(USER_DS); \
+-} while(0)
+-
+-#define get_debugreg(var, register) \
+- var = HYPERVISOR_get_debugreg(register)
+-#define set_debugreg(value, register) do { \
+- if (HYPERVISOR_set_debugreg(register, value)) \
+- BUG(); \
+-} while (0)
+-
+-struct task_struct;
+-struct mm_struct;
+-
+-/* Free all resources held by a thread. */
+-extern void release_thread(struct task_struct *);
+-
+-/* Prepare to copy thread state - unlazy all lazy status */
+-extern void prepare_to_copy(struct task_struct *tsk);
+-
+-/*
+- * create a kernel thread without removing it from tasklists
+- */
+-extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags);
+-
+-/*
+- * Return saved PC of a blocked thread.
+- * What is this good for? It will always be the scheduler or ret_from_fork.
+- */
+-#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.rsp - 8))
+-
+-extern unsigned long get_wchan(struct task_struct *p);
+-#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.rsp0 - 1)
+-#define KSTK_EIP(tsk) (task_pt_regs(tsk)->rip)
+-#define KSTK_ESP(tsk) -1 /* sorry. doesn't work for syscall. */
+-
+-
+-struct microcode_header {
+- unsigned int hdrver;
+- unsigned int rev;
+- unsigned int date;
+- unsigned int sig;
+- unsigned int cksum;
+- unsigned int ldrver;
+- unsigned int pf;
+- unsigned int datasize;
+- unsigned int totalsize;
+- unsigned int reserved[3];
+-};
+-
+-struct microcode {
+- struct microcode_header hdr;
+- unsigned int bits[0];
+-};
+-
+-typedef struct microcode microcode_t;
+-typedef struct microcode_header microcode_header_t;
+-
+-/* microcode format is extended from prescott processors */
+-struct extended_signature {
+- unsigned int sig;
+- unsigned int pf;
+- unsigned int cksum;
+-};
+-
+-struct extended_sigtable {
+- unsigned int count;
+- unsigned int cksum;
+- unsigned int reserved[3];
+- struct extended_signature sigs[0];
+-};
+-
+-
+-#if defined(CONFIG_MPSC) || defined(CONFIG_MCORE2)
+-#define ASM_NOP1 P6_NOP1
+-#define ASM_NOP2 P6_NOP2
+-#define ASM_NOP3 P6_NOP3
+-#define ASM_NOP4 P6_NOP4
+-#define ASM_NOP5 P6_NOP5
+-#define ASM_NOP6 P6_NOP6
+-#define ASM_NOP7 P6_NOP7
+-#define ASM_NOP8 P6_NOP8
+-#else
+-#define ASM_NOP1 K8_NOP1
+-#define ASM_NOP2 K8_NOP2
+-#define ASM_NOP3 K8_NOP3
+-#define ASM_NOP4 K8_NOP4
+-#define ASM_NOP5 K8_NOP5
+-#define ASM_NOP6 K8_NOP6
+-#define ASM_NOP7 K8_NOP7
+-#define ASM_NOP8 K8_NOP8
+-#endif
+-
+-/* Opteron nops */
+-#define K8_NOP1 ".byte 0x90\n"
+-#define K8_NOP2 ".byte 0x66,0x90\n"
+-#define K8_NOP3 ".byte 0x66,0x66,0x90\n"
+-#define K8_NOP4 ".byte 0x66,0x66,0x66,0x90\n"
+-#define K8_NOP5 K8_NOP3 K8_NOP2
+-#define K8_NOP6 K8_NOP3 K8_NOP3
+-#define K8_NOP7 K8_NOP4 K8_NOP3
+-#define K8_NOP8 K8_NOP4 K8_NOP4
+-
+-/* P6 nops */
+-/* uses eax dependencies (Intel-recommended choice) */
+-#define P6_NOP1 ".byte 0x90\n"
+-#define P6_NOP2 ".byte 0x66,0x90\n"
+-#define P6_NOP3 ".byte 0x0f,0x1f,0x00\n"
+-#define P6_NOP4 ".byte 0x0f,0x1f,0x40,0\n"
+-#define P6_NOP5 ".byte 0x0f,0x1f,0x44,0x00,0\n"
+-#define P6_NOP6 ".byte 0x66,0x0f,0x1f,0x44,0x00,0\n"
+-#define P6_NOP7 ".byte 0x0f,0x1f,0x80,0,0,0,0\n"
+-#define P6_NOP8 ".byte 0x0f,0x1f,0x84,0x00,0,0,0,0\n"
+-
+-#define ASM_NOP_MAX 8
+-
+-/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
+-static inline void rep_nop(void)
+-{
+- __asm__ __volatile__("rep;nop": : :"memory");
+-}
+-
+-/* Stop speculative execution */
+-static inline void sync_core(void)
+-{
+- int tmp;
+- asm volatile("cpuid" : "=a" (tmp) : "0" (1) : "ebx","ecx","edx","memory");
+-}
+-
+-#define ARCH_HAS_PREFETCHW 1
+-static inline void prefetchw(void *x)
+-{
+- alternative_input("prefetcht0 (%1)",
+- "prefetchw (%1)",
+- X86_FEATURE_3DNOW,
+- "r" (x));
+-}
+-
+-#define ARCH_HAS_SPINLOCK_PREFETCH 1
+-
+-#define spin_lock_prefetch(x) prefetchw(x)
+-
+-#define cpu_relax() rep_nop()
+-
+-static inline void __monitor(const void *eax, unsigned long ecx,
+- unsigned long edx)
+-{
+- /* "monitor %eax,%ecx,%edx;" */
+- asm volatile(
+- ".byte 0x0f,0x01,0xc8;"
+- : :"a" (eax), "c" (ecx), "d"(edx));
+-}
+-
+-static inline void __mwait(unsigned long eax, unsigned long ecx)
+-{
+- /* "mwait %eax,%ecx;" */
+- asm volatile(
+- ".byte 0x0f,0x01,0xc9;"
+- : :"a" (eax), "c" (ecx));
+-}
+-
+-static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
+-{
+- /* "mwait %eax,%ecx;" */
+- asm volatile(
+- "sti; .byte 0x0f,0x01,0xc9;"
+- : :"a" (eax), "c" (ecx));
+-}
+-
+-extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
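The two wrappers are meant to be paired: monitor arms write-monitoring on an address range, and mwait then idles until a store hits that range or an interrupt arrives. A sketch of the usual pattern built on the wrappers above (the wake word and the re-check before idling are illustrative):

	static void mwait_wait_on(const unsigned long *wake_word)
	{
		while (!*(volatile const unsigned long *)wake_word) {
			__monitor(wake_word, 0, 0);	/* arm the monitor */
			if (!*(volatile const unsigned long *)wake_word)
				__mwait(0, 0);		/* hint 0: shallowest C-state */
		}
	}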
+-
+-#define stack_current() \
+-({ \
+- struct thread_info *ti; \
+- asm("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \
+- ti->task; \
+-})
+-
+-#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
+-
+-extern unsigned long boot_option_idle_override;
+-/* Boot loader type from the setup header */
+-extern int bootloader_type;
+-
+-#define HAVE_ARCH_PICK_MMAP_LAYOUT 1
+-
+-#endif /* __ASM_X86_64_PROCESSOR_H */
+--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/segment.h 2009-02-16 16:18:36.000000000 +0100
++++ sle11-2009-06-29/include/asm-x86/mach-xen/asm/segment.h 2009-03-16 16:33:40.000000000 +0100
+@@ -1,5 +1,204 @@
++#ifndef _ASM_X86_SEGMENT_H_
++#define _ASM_X86_SEGMENT_H_
++
++/* Simple and small GDT entries for booting only */
++
++#define GDT_ENTRY_BOOT_CS 2
++#define __BOOT_CS (GDT_ENTRY_BOOT_CS * 8)
++
++#define GDT_ENTRY_BOOT_DS (GDT_ENTRY_BOOT_CS + 1)
++#define __BOOT_DS (GDT_ENTRY_BOOT_DS * 8)
++
++#define GDT_ENTRY_BOOT_TSS (GDT_ENTRY_BOOT_CS + 2)
++#define __BOOT_TSS (GDT_ENTRY_BOOT_TSS * 8)
++
+ #ifdef CONFIG_X86_32
+-# include "segment_32.h"
++/*
++ * The layout of the per-CPU GDT under Linux:
++ *
++ * 0 - null
++ * 1 - reserved
++ * 2 - reserved
++ * 3 - reserved
++ *
++ * 4 - unused <==== new cacheline
++ * 5 - unused
++ *
++ * ------- start of TLS (Thread-Local Storage) segments:
++ *
++ * 6 - TLS segment #1 [ glibc's TLS segment ]
++ * 7 - TLS segment #2 [ Wine's %fs Win32 segment ]
++ * 8 - TLS segment #3
++ * 9 - reserved
++ * 10 - reserved
++ * 11 - reserved
++ *
++ * ------- start of kernel segments:
++ *
++ * 12 - kernel code segment <==== new cacheline
++ * 13 - kernel data segment
++ * 14 - default user CS
++ * 15 - default user DS
++ * 16 - TSS
++ * 17 - LDT
++ * 18 - PNPBIOS support (16->32 gate)
++ * 19 - PNPBIOS support
++ * 20 - PNPBIOS support
++ * 21 - PNPBIOS support
++ * 22 - PNPBIOS support
++ * 23 - APM BIOS support
++ * 24 - APM BIOS support
++ * 25 - APM BIOS support
++ *
++ * 26 - ESPFIX small SS
++ * 27 - per-cpu [ offset to per-cpu data area ]
++ * 28 - unused
++ * 29 - unused
++ * 30 - unused
++ * 31 - TSS for double fault handler
++ */
++#define GDT_ENTRY_TLS_MIN 6
++#define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
++
++#define GDT_ENTRY_DEFAULT_USER_CS 14
++#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS * 8 + 3)
++
++#define GDT_ENTRY_DEFAULT_USER_DS 15
++#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS * 8 + 3)
++
++#define GDT_ENTRY_KERNEL_BASE 12
++
++#define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE + 0)
++#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8)
++
++#define GDT_ENTRY_KERNEL_DS (GDT_ENTRY_KERNEL_BASE + 1)
++#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8)
++
++#define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE + 4)
++#define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE + 5)
++
++#define GDT_ENTRY_PNPBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 6)
++#define GDT_ENTRY_APMBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 11)
++
++#define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE + 14)
++#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8)
++
++#define GDT_ENTRY_PERCPU (GDT_ENTRY_KERNEL_BASE + 15)
++#ifdef CONFIG_SMP
++#define __KERNEL_PERCPU (GDT_ENTRY_PERCPU * 8)
+ #else
+-# include "../../segment_64.h"
++#define __KERNEL_PERCPU 0
++#endif
++
++#define GDT_ENTRY_DOUBLEFAULT_TSS 31
++
++/*
++ * The GDT has 32 entries
++ */
++#define GDT_ENTRIES 32
++
++/* The PnP BIOS entries in the GDT */
++#define GDT_ENTRY_PNPBIOS_CS32 (GDT_ENTRY_PNPBIOS_BASE + 0)
++#define GDT_ENTRY_PNPBIOS_CS16 (GDT_ENTRY_PNPBIOS_BASE + 1)
++#define GDT_ENTRY_PNPBIOS_DS (GDT_ENTRY_PNPBIOS_BASE + 2)
++#define GDT_ENTRY_PNPBIOS_TS1 (GDT_ENTRY_PNPBIOS_BASE + 3)
++#define GDT_ENTRY_PNPBIOS_TS2 (GDT_ENTRY_PNPBIOS_BASE + 4)
++
++/* The PnP BIOS selectors */
++#define PNP_CS32 (GDT_ENTRY_PNPBIOS_CS32 * 8) /* segment for calling fn */
++#define PNP_CS16 (GDT_ENTRY_PNPBIOS_CS16 * 8) /* code segment for BIOS */
++#define PNP_DS (GDT_ENTRY_PNPBIOS_DS * 8) /* data segment for BIOS */
++#define PNP_TS1 (GDT_ENTRY_PNPBIOS_TS1 * 8) /* transfer data segment */
++#define PNP_TS2 (GDT_ENTRY_PNPBIOS_TS2 * 8) /* another data segment */
++
++/* Bottom two bits of selector give the ring privilege level */
++#define SEGMENT_RPL_MASK 0x3
++/* Bit 2 is table indicator (LDT/GDT) */
++#define SEGMENT_TI_MASK 0x4
++
++/* User mode is privilege level 3 */
++#define USER_RPL 0x3
++/* LDT segment has TI set, GDT has it cleared */
++#define SEGMENT_LDT 0x4
++#define SEGMENT_GDT 0x0
++
++/*
++ * Matching rules for certain types of segments.
++ */
++
++/* Matches only __KERNEL_CS, ignoring PnP / USER / APM segments */
++#define SEGMENT_IS_KERNEL_CODE(x) (((x) & ~3) == GDT_ENTRY_KERNEL_CS * 8 \
++ || ((x) & ~3) == (FLAT_KERNEL_CS & ~3))
++
++/* Matches __KERNEL_CS and __USER_CS (they must be 2 entries apart) */
++#define SEGMENT_IS_FLAT_CODE(x) (((x) & ~0x13) == GDT_ENTRY_KERNEL_CS * 8 \
++ || ((x) & ~3) == (FLAT_KERNEL_CS & ~3) \
++ || ((x) & ~3) == (FLAT_USER_CS & ~3))
++
++/* Matches PNP_CS32 and PNP_CS16 (they must be consecutive) */
++#define SEGMENT_IS_PNP_CODE(x) (((x) & 0xf4) == GDT_ENTRY_PNPBIOS_BASE * 8)
++
++#define get_kernel_rpl() (xen_feature(XENFEAT_supervisor_mode_kernel)?0:1)
++
++#else
++#include <asm/cache.h>
++
++#define __KERNEL_CS 0x10
++#define __KERNEL_DS 0x18
++
++#define __KERNEL32_CS 0x08
++
++/*
++ * we cannot use the same code segment descriptor for user and kernel
++ * -- not even in the long flat mode, because of different DPL /kkeil
++ * The segment offset needs to contain a RPL. Grr. -AK
++ * GDT layout to get 64bit syscall right (sysret hardcodes gdt offsets)
++ */
++
++#define __USER32_CS 0x23 /* 4*8+3 */
++#define __USER_DS 0x2b /* 5*8+3 */
++#define __USER_CS 0x33 /* 6*8+3 */
++#define __USER32_DS __USER_DS
++
++#define GDT_ENTRY_TSS 8 /* needs two entries */
++#define GDT_ENTRY_LDT 10 /* needs two entries */
++#define GDT_ENTRY_TLS_MIN 12
++#define GDT_ENTRY_TLS_MAX 14
++
++#define GDT_ENTRY_PER_CPU 15 /* Abused to load per CPU data from limit */
++#define __PER_CPU_SEG (GDT_ENTRY_PER_CPU * 8 + 3)
++
++/* TLS indexes for 64bit - hardcoded in arch_prctl */
++#define FS_TLS 0
++#define GS_TLS 1
++
++#define GS_TLS_SEL ((GDT_ENTRY_TLS_MIN+GS_TLS)*8 + 3)
++#define FS_TLS_SEL ((GDT_ENTRY_TLS_MIN+FS_TLS)*8 + 3)
++
++#define GDT_ENTRIES 16
++
++#endif
++
++/* User mode is privilege level 3 */
++#define USER_RPL 0x3
++/* LDT segment has TI set, GDT has it cleared */
++#define SEGMENT_LDT 0x4
++#define SEGMENT_GDT 0x0
++
++/* Bottom two bits of selector give the ring privilege level */
++#define SEGMENT_RPL_MASK 0x3
++/* Bit 2 is table indicator (LDT/GDT) */
++#define SEGMENT_TI_MASK 0x4
++
++#define IDT_ENTRIES 256
++#define GDT_SIZE (GDT_ENTRIES * 8)
++#define GDT_ENTRY_TLS_ENTRIES 3
++#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8)
++
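A selector packs (index << 3) | TI | RPL, which is why __USER_CS above is 6*8 + 3 = 0x33. A small sketch decomposing a selector with the masks defined here:

	#include <stdio.h>

	#define SEGMENT_RPL_MASK 0x3	/* as defined above */
	#define SEGMENT_TI_MASK  0x4

	int main(void)
	{
		unsigned short sel = 0x33;	/* __USER_CS: entry 6, GDT, RPL 3 */

		printf("index %u, table %s, rpl %u\n",
		       sel >> 3,
		       (sel & SEGMENT_TI_MASK) ? "LDT" : "GDT",
		       sel & SEGMENT_RPL_MASK);
		return 0;
	}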
++#ifdef __KERNEL__
++#ifndef __ASSEMBLY__
++extern const char early_idt_handlers[IDT_ENTRIES][10];
++#endif
++#endif
++
+ #endif
+--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/segment_32.h 2008-12-15 11:27:22.000000000 +0100
++++ /dev/null 1970-01-01 00:00:00.000000000 +0000
+@@ -1,150 +0,0 @@
+-#ifndef _ASM_SEGMENT_H
+-#define _ASM_SEGMENT_H
+-
+-/*
+- * The layout of the per-CPU GDT under Linux:
+- *
+- * 0 - null
+- * 1 - reserved
+- * 2 - reserved
+- * 3 - reserved
+- *
+- * 4 - unused <==== new cacheline
+- * 5 - unused
+- *
+- * ------- start of TLS (Thread-Local Storage) segments:
+- *
+- * 6 - TLS segment #1 [ glibc's TLS segment ]
+- * 7 - TLS segment #2 [ Wine's %fs Win32 segment ]
+- * 8 - TLS segment #3
+- * 9 - reserved
+- * 10 - reserved
+- * 11 - reserved
+- *
+- * ------- start of kernel segments:
+- *
+- * 12 - kernel code segment <==== new cacheline
+- * 13 - kernel data segment
+- * 14 - default user CS
+- * 15 - default user DS
+- * 16 - TSS
+- * 17 - LDT
+- * 18 - PNPBIOS support (16->32 gate)
+- * 19 - PNPBIOS support
+- * 20 - PNPBIOS support
+- * 21 - PNPBIOS support
+- * 22 - PNPBIOS support
+- * 23 - APM BIOS support
+- * 24 - APM BIOS support
+- * 25 - APM BIOS support
+- *
+- * 26 - ESPFIX small SS
+- * 27 - per-cpu [ offset to per-cpu data area ]
+- * 28 - unused
+- * 29 - unused
+- * 30 - unused
+- * 31 - TSS for double fault handler
+- */
+-#define GDT_ENTRY_TLS_ENTRIES 3
+-#define GDT_ENTRY_TLS_MIN 6
+-#define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
+-
+-#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8)
+-
+-#define GDT_ENTRY_DEFAULT_USER_CS 14
+-#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS * 8 + 3)
+-
+-#define GDT_ENTRY_DEFAULT_USER_DS 15
+-#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS * 8 + 3)
+-
+-#define GDT_ENTRY_KERNEL_BASE 12
+-
+-#define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE + 0)
+-#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8)
+-
+-#define GDT_ENTRY_KERNEL_DS (GDT_ENTRY_KERNEL_BASE + 1)
+-#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8)
+-
+-#define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE + 4)
+-#define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE + 5)
+-
+-#define GDT_ENTRY_PNPBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 6)
+-#define GDT_ENTRY_APMBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 11)
+-
+-#define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE + 14)
+-#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8)
+-
+-#define GDT_ENTRY_PERCPU (GDT_ENTRY_KERNEL_BASE + 15)
+-#ifdef CONFIG_SMP
+-#define __KERNEL_PERCPU (GDT_ENTRY_PERCPU * 8)
+-#else
+-#define __KERNEL_PERCPU 0
+-#endif
+-
+-#define GDT_ENTRY_DOUBLEFAULT_TSS 31
+-
+-/*
+- * The GDT has 32 entries
+- */
+-#define GDT_ENTRIES 32
+-#define GDT_SIZE (GDT_ENTRIES * 8)
+-
+-/* Simple and small GDT entries for booting only */
+-
+-#define GDT_ENTRY_BOOT_CS 2
+-#define __BOOT_CS (GDT_ENTRY_BOOT_CS * 8)
+-
+-#define GDT_ENTRY_BOOT_DS (GDT_ENTRY_BOOT_CS + 1)
+-#define __BOOT_DS (GDT_ENTRY_BOOT_DS * 8)
+-
+-/* The PnP BIOS entries in the GDT */
+-#define GDT_ENTRY_PNPBIOS_CS32 (GDT_ENTRY_PNPBIOS_BASE + 0)
+-#define GDT_ENTRY_PNPBIOS_CS16 (GDT_ENTRY_PNPBIOS_BASE + 1)
+-#define GDT_ENTRY_PNPBIOS_DS (GDT_ENTRY_PNPBIOS_BASE + 2)
+-#define GDT_ENTRY_PNPBIOS_TS1 (GDT_ENTRY_PNPBIOS_BASE + 3)
+-#define GDT_ENTRY_PNPBIOS_TS2 (GDT_ENTRY_PNPBIOS_BASE + 4)
+-
+-/* The PnP BIOS selectors */
+-#define PNP_CS32 (GDT_ENTRY_PNPBIOS_CS32 * 8) /* segment for calling fn */
+-#define PNP_CS16 (GDT_ENTRY_PNPBIOS_CS16 * 8) /* code segment for BIOS */
+-#define PNP_DS (GDT_ENTRY_PNPBIOS_DS * 8) /* data segment for BIOS */
+-#define PNP_TS1 (GDT_ENTRY_PNPBIOS_TS1 * 8) /* transfer data segment */
+-#define PNP_TS2 (GDT_ENTRY_PNPBIOS_TS2 * 8) /* another data segment */
+-
+-/*
+- * The interrupt descriptor table has room for 256 entries;
+- * the size of the global descriptor table depends on the
+- * number of tasks we can have.
+- */
+-#define IDT_ENTRIES 256
+-
+-/* Bottom two bits of selector give the ring privilege level */
+-#define SEGMENT_RPL_MASK 0x3
+-/* Bit 2 is table indicator (LDT/GDT) */
+-#define SEGMENT_TI_MASK 0x4
+-
+-/* User mode is privilege level 3 */
+-#define USER_RPL 0x3
+-/* LDT segment has TI set, GDT has it cleared */
+-#define SEGMENT_LDT 0x4
+-#define SEGMENT_GDT 0x0
+-
+-#define get_kernel_rpl() (xen_feature(XENFEAT_supervisor_mode_kernel)?0:1)
+-
+-/*
+- * Matching rules for certain types of segments.
+- */
+-
+-/* Matches only __KERNEL_CS, ignoring PnP / USER / APM segments */
+-#define SEGMENT_IS_KERNEL_CODE(x) (((x) & ~3) == GDT_ENTRY_KERNEL_CS * 8 \
+- || ((x) & ~3) == (FLAT_KERNEL_CS & ~3))
+-
+-/* Matches __KERNEL_CS and __USER_CS (they must be 2 entries apart) */
+-#define SEGMENT_IS_FLAT_CODE(x) (((x) & ~0x13) == GDT_ENTRY_KERNEL_CS * 8 \
+- || ((x) & ~3) == (FLAT_KERNEL_CS & ~3) \
+- || ((x) & ~3) == (FLAT_USER_CS & ~3))
+-
+-/* Matches PNP_CS32 and PNP_CS16 (they must be consecutive) */
+-#define SEGMENT_IS_PNP_CODE(x) (((x) & ~0x0b) == GDT_ENTRY_PNPBIOS_BASE * 8)
+-
+-#endif
+--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/smp_32.h 2009-02-16 16:18:36.000000000 +0100
++++ sle11-2009-06-29/include/asm-x86/mach-xen/asm/smp_32.h 2009-03-16 16:33:40.000000000 +0100
+@@ -1,56 +1,51 @@
+ #ifndef __ASM_SMP_H
+ #define __ASM_SMP_H
+
++#ifndef __ASSEMBLY__
++#include <linux/cpumask.h>
++#include <linux/init.h>
++
+ /*
+ * We need the APIC definitions automatically as part of 'smp.h'
+ */
+-#ifndef __ASSEMBLY__
+-#include <linux/kernel.h>
+-#include <linux/threads.h>
+-#include <linux/cpumask.h>
++#ifdef CONFIG_X86_LOCAL_APIC
++# include <asm/mpspec.h>
++# include <asm/apic.h>
++# ifdef CONFIG_X86_IO_APIC
++# include <asm/io_apic.h>
++# endif
+ #endif
+
+-#if defined(CONFIG_X86_LOCAL_APIC) && !defined(__ASSEMBLY__)
+-#include <linux/bitops.h>
+-#include <asm/mpspec.h>
+-#include <asm/apic.h>
+-#ifdef CONFIG_X86_IO_APIC
+-#include <asm/io_apic.h>
+-#endif
+-#endif
++#define cpu_callout_map cpu_possible_map
++#define cpu_callin_map cpu_possible_map
+
+-#define BAD_APICID 0xFFu
+-#ifdef CONFIG_SMP
+-#ifndef __ASSEMBLY__
++extern int smp_num_siblings;
++extern unsigned int num_processors;
+
+-/*
+- * Private routines/data
+- */
+-
+ extern void smp_alloc_memory(void);
+-extern int pic_mode;
+-extern int smp_num_siblings;
+-DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
+-DECLARE_PER_CPU(cpumask_t, cpu_core_map);
++extern void lock_ipi_call_lock(void);
++extern void unlock_ipi_call_lock(void);
+
+ extern void (*mtrr_hook) (void);
+ extern void zap_low_mappings (void);
+-extern void lock_ipi_call_lock(void);
+-extern void unlock_ipi_call_lock(void);
+
+-#define MAX_APICID 256
+-extern u8 __initdata x86_cpu_to_apicid_init[];
+-extern void *x86_cpu_to_apicid_ptr;
++DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
++DECLARE_PER_CPU(cpumask_t, cpu_core_map);
++DECLARE_PER_CPU(u8, cpu_llc_id);
+ DECLARE_PER_CPU(u8, x86_cpu_to_apicid);
+
+-#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
+-
+ #ifdef CONFIG_HOTPLUG_CPU
+ extern void cpu_exit_clear(void);
+ extern void cpu_uninit(void);
+ #endif
+
++#ifdef CONFIG_SMP
++
+ #ifndef CONFIG_XEN
++
++/* Globals due to paravirt */
++extern void set_cpu_sibling_map(int cpu);
++
+ struct smp_ops
+ {
+ void (*smp_prepare_boot_cpu)(void);
+@@ -104,11 +99,11 @@ void native_smp_prepare_cpus(unsigned in
+ int native_cpu_up(unsigned int cpunum);
+ void native_smp_cpus_done(unsigned int max_cpus);
+
+-#define startup_ipi_hook(phys_apicid, start_eip, start_esp) \
+-do { } while (0)
+-
+-#else
++#ifndef CONFIG_PARAVIRT
++#define startup_ipi_hook(phys_apicid, start_eip, start_esp) do { } while (0)
++#endif
+
++#else /* CONFIG_XEN */
+
+ void xen_smp_send_stop(void);
+ void xen_smp_send_reschedule(int cpu);
+@@ -120,7 +115,12 @@ int xen_smp_call_function_mask(cpumask_t
+ #define smp_send_reschedule xen_smp_send_reschedule
+ #define smp_call_function_mask xen_smp_call_function_mask
+
+-#endif
++extern void prefill_possible_map(void);
++
++#endif /* CONFIG_XEN */
++
++extern int __cpu_disable(void);
++extern void __cpu_die(unsigned int cpu);
+
+ /*
+ * This function is needed by all SMP systems. It must _always_ be valid
+@@ -130,64 +130,49 @@ int xen_smp_call_function_mask(cpumask_t
+ DECLARE_PER_CPU(int, cpu_number);
+ #define raw_smp_processor_id() (x86_read_percpu(cpu_number))
+
+-extern cpumask_t cpu_possible_map;
+-#define cpu_callin_map cpu_possible_map
++#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
++
++#define safe_smp_processor_id() smp_processor_id()
+
+ /* We don't mark CPUs online until __cpu_up(), so we need another measure */
+ static inline int num_booting_cpus(void)
+ {
+- return cpus_weight(cpu_possible_map);
++ return cpus_weight(cpu_callout_map);
+ }
+
+-#define safe_smp_processor_id() smp_processor_id()
+-extern int __cpu_disable(void);
+-extern void __cpu_die(unsigned int cpu);
+-extern void prefill_possible_map(void);
+-extern unsigned int num_processors;
+-
+-#endif /* !__ASSEMBLY__ */
+-
+ #else /* CONFIG_SMP */
+
+ #define safe_smp_processor_id() 0
+ #define cpu_physical_id(cpu) boot_cpu_physical_apicid
+
+-#define NO_PROC_ID 0xFF /* No processor magic marker */
+-
+-#endif /* CONFIG_SMP */
+-
+-#ifndef __ASSEMBLY__
++#endif /* !CONFIG_SMP */
+
+ #ifdef CONFIG_X86_LOCAL_APIC
+
+-#ifdef APIC_DEFINITION
++static __inline int logical_smp_processor_id(void)
++{
++ /* we don't want to mark this access volatile - bad code generation */
++ return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR));
++}
++
++# ifdef APIC_DEFINITION
+ extern int hard_smp_processor_id(void);
+-#else
+-#include <mach_apicdef.h>
++# else
++# include <mach_apicdef.h>
+ static inline int hard_smp_processor_id(void)
+ {
+ /* we don't want to mark this access volatile - bad code generation */
+- return GET_APIC_ID(*(unsigned long *)(APIC_BASE+APIC_ID));
++ return GET_APIC_ID(*(u32 *)(APIC_BASE + APIC_ID));
+ }
+-#endif /* APIC_DEFINITION */
++# endif /* APIC_DEFINITION */
+
+ #else /* CONFIG_X86_LOCAL_APIC */
+
+-#ifndef CONFIG_SMP
+-#define hard_smp_processor_id() 0
+-#endif
++# ifndef CONFIG_SMP
++# define hard_smp_processor_id() 0
++# endif
+
+ #endif /* CONFIG_X86_LOCAL_APIC */
+
+-extern u8 apicid_2_node[];
+-
+-#ifdef CONFIG_X86_LOCAL_APIC
+-static __inline int logical_smp_processor_id(void)
+-{
+- /* we don't want to mark this access volatile - bad code generation */
+- return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR));
+-}
+-#endif
+-#endif
+-
++#endif /* !ASSEMBLY */
+ #endif
+--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/smp_64.h 2009-02-16 16:18:36.000000000 +0100
++++ sle11-2009-06-29/include/asm-x86/mach-xen/asm/smp_64.h 2009-03-16 16:33:40.000000000 +0100
+@@ -1,139 +1,103 @@
+ #ifndef __ASM_SMP_H
+ #define __ASM_SMP_H
+
+-/*
+- * We need the APIC definitions automatically as part of 'smp.h'
+- */
+-#include <linux/threads.h>
+ #include <linux/cpumask.h>
+-#include <linux/bitops.h>
+ #include <linux/init.h>
+-extern int disable_apic;
+
+ #ifdef CONFIG_X86_LOCAL_APIC
+-#include <asm/mpspec.h>
++/*
++ * We need the APIC definitions automatically as part of 'smp.h'
++ */
+ #include <asm/apic.h>
+ #ifdef CONFIG_X86_IO_APIC
+ #include <asm/io_apic.h>
+ #endif
+-#include <asm/thread_info.h>
++#include <asm/mpspec.h>
+ #endif
+-
+-#ifdef CONFIG_SMP
+-
+ #include <asm/pda.h>
++#include <asm/thread_info.h>
+
+-struct pt_regs;
+-
+-extern cpumask_t cpu_present_mask;
+-extern cpumask_t cpu_possible_map;
+-extern cpumask_t cpu_online_map;
+ extern cpumask_t cpu_initialized;
+
+-/*
+- * Private routines/data
+- */
+-
++extern int smp_num_siblings;
++extern unsigned int num_processors;
++
+ extern void smp_alloc_memory(void);
+-extern volatile unsigned long smp_invalidate_needed;
+ extern void lock_ipi_call_lock(void);
+ extern void unlock_ipi_call_lock(void);
+-extern int smp_num_siblings;
+-extern void smp_send_reschedule(int cpu);
++
+ extern int smp_call_function_mask(cpumask_t mask, void (*func)(void *),
+ void *info, int wait);
+
+-/*
+- * cpu_sibling_map and cpu_core_map now live
+- * in the per cpu area
+- *
+- * extern cpumask_t cpu_sibling_map[NR_CPUS];
+- * extern cpumask_t cpu_core_map[NR_CPUS];
+- */
+ DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
+ DECLARE_PER_CPU(cpumask_t, cpu_core_map);
+-DECLARE_PER_CPU(u8, cpu_llc_id);
+-
+-#define SMP_TRAMPOLINE_BASE 0x6000
++DECLARE_PER_CPU(u16, cpu_llc_id);
++DECLARE_PER_CPU(u16, x86_cpu_to_apicid);
++DECLARE_PER_CPU(u16, x86_bios_cpu_apicid);
+
+-/*
+- * On x86 all CPUs are mapped 1:1 to the APIC space.
+- * This simplifies scheduling and IPI sending and
+- * compresses data structures.
+- */
+-
+-static inline int num_booting_cpus(void)
++#ifdef CONFIG_X86_LOCAL_APIC
++static inline int cpu_present_to_apicid(int mps_cpu)
+ {
+- return cpus_weight(cpu_possible_map);
++ if (cpu_present(mps_cpu))
++ return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu);
++ else
++ return BAD_APICID;
+ }
++#endif
+
+-#define raw_smp_processor_id() read_pda(cpunumber)
++#ifdef CONFIG_SMP
++
++#define SMP_TRAMPOLINE_BASE 0x6000
+
+ extern int __cpu_disable(void);
+ extern void __cpu_die(unsigned int cpu);
+ extern void prefill_possible_map(void);
+-extern unsigned num_processors;
+ extern unsigned __cpuinitdata disabled_cpus;
+
+-#define NO_PROC_ID 0xFF /* No processor magic marker */
+-
+-#endif /* CONFIG_SMP */
++#define raw_smp_processor_id() read_pda(cpunumber)
++#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
+
+-#define safe_smp_processor_id() smp_processor_id()
+-
+-#ifdef CONFIG_X86_LOCAL_APIC
+-static inline int hard_smp_processor_id(void)
+-{
+- /* we don't want to mark this access volatile - bad code generation */
+- return GET_APIC_ID(*(unsigned int *)(APIC_BASE+APIC_ID));
+-}
+-#endif
++#define stack_smp_processor_id() \
++ ({ \
++ struct thread_info *ti; \
++ __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \
++ ti->cpu; \
++})
+
+ /*
+- * Some lowlevel functions might want to know about
+- * the real APIC ID <-> CPU # mapping.
++ * On x86 all CPUs are mapped 1:1 to the APIC space. This simplifies
++ * scheduling and IPI sending and compresses data structures.
+ */
+-extern u8 __initdata x86_cpu_to_apicid_init[];
+-extern void *x86_cpu_to_apicid_ptr;
+-DECLARE_PER_CPU(u8, x86_cpu_to_apicid); /* physical ID */
+-extern u8 bios_cpu_apicid[];
+-
+-#ifdef CONFIG_X86_LOCAL_APIC
+-static inline int cpu_present_to_apicid(int mps_cpu)
++static inline int num_booting_cpus(void)
+ {
+- if (mps_cpu < NR_CPUS)
+- return (int)bios_cpu_apicid[mps_cpu];
+- else
+- return BAD_APICID;
++ return cpus_weight(cpu_possible_map);
+ }
+-#endif
+
+-#ifndef CONFIG_SMP
++extern void smp_send_reschedule(int cpu);
++
++#else /* CONFIG_SMP */
++
++extern unsigned int boot_cpu_id;
++#define cpu_physical_id(cpu) boot_cpu_id
+ #define stack_smp_processor_id() 0
+-#define cpu_logical_map(x) (x)
+-#else
+-#include <asm/thread_info.h>
+-#define stack_smp_processor_id() \
+-({ \
+- struct thread_info *ti; \
+- __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \
+- ti->cpu; \
+-})
+-#endif
++
++#endif /* !CONFIG_SMP */
++
++#define safe_smp_processor_id() smp_processor_id()
+
+ #ifdef CONFIG_X86_LOCAL_APIC
+ static __inline int logical_smp_processor_id(void)
+ {
+ /* we don't want to mark this access volatile - bad code generation */
+- return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR));
++ return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR));
++}
++
++static inline int hard_smp_processor_id(void)
++{
++ /* we don't want to mark this access volatile - bad code generation */
++ return GET_APIC_ID(*(u32 *)(APIC_BASE + APIC_ID));
+ }
+ #endif
+
+-#ifdef CONFIG_SMP
+-#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
+-#else
+-extern unsigned int boot_cpu_id;
+-#define cpu_physical_id(cpu) boot_cpu_id
+-#endif /* !CONFIG_SMP */
+ #endif
+
+--- /dev/null 1970-01-01 00:00:00.000000000 +0000
++++ sle11-2009-06-29/include/asm-x86/mach-xen/asm/spinlock.h 2009-03-16 16:33:40.000000000 +0100
+@@ -0,0 +1,333 @@
++#ifndef _X86_SPINLOCK_H_
++#define _X86_SPINLOCK_H_
++
++#include <asm/atomic.h>
++#include <asm/rwlock.h>
++#include <asm/page.h>
++#include <asm/processor.h>
++#include <linux/compiler.h>
++
++/*
++ * Your basic SMP spinlocks, allowing only a single CPU anywhere
++ *
++ * Simple spin lock operations. There are two variants, one clears IRQ's
++ * on the local processor, one does not.
++ *
++ * These are fair FIFO ticket locks, which are currently limited to 256
++ * CPUs.
++ *
++ * (the type definitions are in asm/spinlock_types.h)
++ */
++
++#ifdef CONFIG_X86_32
++# define LOCK_PTR_REG "a"
++# define REG_PTR_MODE "k"
++#else
++# define LOCK_PTR_REG "D"
++# define REG_PTR_MODE "q"
++#endif
++
++#if defined(CONFIG_X86_32) && \
++ (defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE))
++/*
++ * On PPro SMP or if we are using OOSTORE, we use a locked operation to unlock
++ * (PPro errata 66, 92)
++ */
++# define UNLOCK_LOCK_PREFIX LOCK_PREFIX
++#else
++# define UNLOCK_LOCK_PREFIX
++#endif
++
++int xen_spinlock_init(unsigned int cpu);
++void xen_spinlock_cleanup(unsigned int cpu);
++extern int xen_spin_wait(raw_spinlock_t *, unsigned int token);
++extern int xen_spin_wait_flags(raw_spinlock_t *, unsigned int *token,
++ unsigned int flags);
++extern unsigned int xen_spin_adjust(raw_spinlock_t *, unsigned int token);
++extern void xen_spin_kick(raw_spinlock_t *, unsigned int token);
++
++/*
++ * Ticket locks are conceptually two parts, one indicating the current head of
++ * the queue, and the other indicating the current tail. The lock is acquired
++ * by atomically noting the tail and incrementing it by one (thus adding
++ * ourselves to the queue and noting our position), then waiting until the
++ * head becomes equal to the initial value of the tail.
++ *
++ * We use an xadd covering *both* parts of the lock, to increment the tail and
++ * also load the position of the head, which takes care of memory ordering
++ * issues and should be optimal for the uncontended case. Note the tail must be
++ * in the high part, because a wide xadd increment of the low part would carry
++ * up and contaminate the high part.
++ *
++ * With fewer than 2^8 possible CPUs, we can use x86's partial registers to
++ * save some instructions and make the code more elegant. There really isn't
++ * much between them in performance though, especially as locks are out of line.
++ */
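A user-space model of the scheme just described, minus the Xen poll/kick path, using C11 atomics; next and owner play the roles of the tail and head halves of slock (a sketch, not the kernel implementation):

	#include <stdatomic.h>

	struct ticket_lock {
		atomic_uint next;	/* tail: next ticket handed out */
		atomic_uint owner;	/* head: ticket currently being served */
	};

	static void ticket_lock(struct ticket_lock *l)
	{
		/* The kernel's xadd covers both halves in one instruction;
		 * two separate words keep this model simple. */
		unsigned int me = atomic_fetch_add(&l->next, 1);

		while (atomic_load_explicit(&l->owner, memory_order_acquire) != me)
			;	/* the kernel spins with rep;nop and the Xen poll here */
	}

	static void ticket_unlock(struct ticket_lock *l)
	{
		atomic_fetch_add_explicit(&l->owner, 1, memory_order_release);
	}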
++#if (NR_CPUS < 256)
++#define TICKET_SHIFT 8
++#define __raw_spin_lock_preamble \
++ asm(LOCK_PREFIX "xaddw %w0, %2\n\t" \
++ "cmpb %h0, %b0\n\t" \
++ "sete %1" \
++ : "=&Q" (token), "=qm" (free), "+m" (lock->slock) \
++ : "0" (0x0100) \
++ : "memory", "cc")
++#define __raw_spin_lock_body \
++ asm("1:\t" \
++ "cmpb %h0, %b0\n\t" \
++ "je 2f\n\t" \
++ "decl %1\n\t" \
++ "jz 2f\n\t" \
++ "rep ; nop\n\t" \
++ "movb %2, %b0\n\t" \
++ /* don't need lfence here, because loads are in-order */ \
++ "jmp 1b\n" \
++ "2:" \
++ : "+Q" (token), "+g" (count) \
++ : "m" (lock->slock) \
++ : "memory", "cc")
++
++
++static inline int __raw_spin_trylock(raw_spinlock_t *lock)
++{
++ int tmp, new;
++
++ asm("movzwl %2, %0\n\t"
++ "cmpb %h0, %b0\n\t"
++ "leal 0x100(%" REG_PTR_MODE "0), %1\n\t"
++ "jne 1f\n\t"
++ LOCK_PREFIX "cmpxchgw %w1, %2\n\t"
++ "1:\t"
++ "sete %b1\n\t"
++ "movzbl %b1, %0\n\t"
++ : "=&a" (tmp), "=&q" (new), "+m" (lock->slock)
++ :
++ : "memory", "cc");
++
++ return tmp;
++}
++
++static inline void __raw_spin_unlock(raw_spinlock_t *lock)
++{
++ unsigned int token;
++ unsigned char kick;
++
++ asm(UNLOCK_LOCK_PREFIX "incb %2\n\t"
++ "movzwl %2, %0\n\t"
++ "cmpb %h0, %b0\n\t"
++ "setne %1"
++ : "=&Q" (token), "=qm" (kick), "+m" (lock->slock)
++ :
++ : "memory", "cc");
++ if (kick)
++ xen_spin_kick(lock, token);
++}
++#else
++#define TICKET_SHIFT 16
++#define __raw_spin_lock_preamble \
++ do { \
++ unsigned int tmp; \
++ asm(LOCK_PREFIX "xaddl %0, %2\n\t" \
++ "shldl $16, %0, %3\n\t" \
++ "cmpw %w3, %w0\n\t" \
++ "sete %1"
++ : "=&r" (token), "=qm" (free), "+m" (lock->slock), \
++ "=&g" (tmp) \
++ : "0" (0x00010000) \
++ : "memory", "cc"); \
++ } while (0)
++#define __raw_spin_lock_body \
++ do { \
++ unsigned int tmp; \
++ asm("shldl $16, %0, %2\n" \
++ "1:\t" \
++ "cmpw %w2, %w0\n\t" \
++ "je 2f\n\t" \
++ "decl %1\n\t" \
++ "jz 2f\n\t" \
++ "rep ; nop\n\t" \
++ "movw %3, %w0\n\t" \
++ /* don't need lfence here, because loads are in-order */ \
++ "jmp 1b\n" \
++ "2:" \
++ : "+r" (token), "+g" (count), "=&g" (tmp) \
++ : "m" (lock->slock) \
++ : "memory", "cc"); \
++ } while (0)
++
++static inline int __raw_spin_trylock(raw_spinlock_t *lock)
++{
++ int tmp;
++ int new;
++
++ asm("movl %2, %0\n\t"
++ "movl %0, %1\n\t"
++ "roll $16, %0\n\t"
++ "cmpl %0, %1\n\t"
++ "leal 0x00010000(%" REG_PTR_MODE "0), %1\n\t"
++ "jne 1f\n\t"
++ LOCK_PREFIX "cmpxchgl %1, %2\n"
++ "1:\t"
++ "sete %b1\n\t"
++ "movzbl %b1, %0\n\t"
++ : "=&a" (tmp), "=&q" (new), "+m" (lock->slock)
++ :
++ : "memory", "cc");
++
++ return tmp;
++}
++
++static inline void __raw_spin_unlock(raw_spinlock_t *lock)
++{
++ unsigned int token, tmp;
++ bool kick;
++
++ asm(UNLOCK_LOCK_PREFIX "incw %2\n\t"
++ "movl %2, %0\n\t"
++ "shldl $16, %0, %3\n\t"
++ "cmpw %w3, %w0\n\t"
++ "setne %1"
++ : "=&r" (token), "=qm" (kick), "+m" (lock->slock), "=&r" (tmp)
++ :
++ : "memory", "cc");
++ if (kick)
++ xen_spin_kick(lock, token);
++}
++#endif
++
++static inline int __raw_spin_is_locked(raw_spinlock_t *lock)
++{
++ int tmp = *(volatile signed int *)(&(lock)->slock);
++
++ return !!(((tmp >> TICKET_SHIFT) ^ tmp) & ((1 << TICKET_SHIFT) - 1));
++}
++
++static inline int __raw_spin_is_contended(raw_spinlock_t *lock)
++{
++ int tmp = *(volatile signed int *)(&(lock)->slock);
++
++ return (((tmp >> TICKET_SHIFT) - tmp) & ((1 << TICKET_SHIFT) - 1)) > 1;
++}
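With TICKET_SHIFT 8, the head is the low byte and the tail the high byte, so the lock is held exactly when they differ and contended when they differ by more than one. A quick check of the head/tail arithmetic (the values are illustrative):

	#include <assert.h>

	#define TICKET_SHIFT 8

	static int is_locked(int slock)	/* mirrors __raw_spin_is_locked() */
	{
		return !!(((slock >> TICKET_SHIFT) ^ slock) & ((1 << TICKET_SHIFT) - 1));
	}

	int main(void)
	{
		assert(!is_locked(0x0303));	/* head == tail: free */
		assert(is_locked(0x0402));	/* tail 4, head 2: held, one waiter */
		return 0;
	}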
++
++static inline void __raw_spin_lock(raw_spinlock_t *lock)
++{
++ unsigned int token, count;
++ bool free;
++
++ __raw_spin_lock_preamble;
++ if (unlikely(!free))
++ token = xen_spin_adjust(lock, token);
++ do {
++ count = 1 << 10;
++ __raw_spin_lock_body;
++ } while (unlikely(!count) && !xen_spin_wait(lock, token));
++}
++
++static inline void __raw_spin_lock_flags(raw_spinlock_t *lock,
++ unsigned long flags)
++{
++ unsigned int token, count;
++ bool free;
++
++ __raw_spin_lock_preamble;
++ if (unlikely(!free))
++ token = xen_spin_adjust(lock, token);
++ do {
++ count = 1 << 10;
++ __raw_spin_lock_body;
++ } while (unlikely(!count) && !xen_spin_wait_flags(lock, &token, flags));
++}
++
++static inline void __raw_spin_unlock_wait(raw_spinlock_t *lock)
++{
++ while (__raw_spin_is_locked(lock))
++ cpu_relax();
++}
++
++/*
++ * Read-write spinlocks, allowing multiple readers
++ * but only one writer.
++ *
++ * NOTE! it is quite common to have readers in interrupts
++ * but no interrupt writers. For those circumstances we
++ * can "mix" irq-safe locks - any writer needs to get a
++ * irq-safe write-lock, but readers can get non-irqsafe
++ * read-locks.
++ *
++ * On x86, we implement read-write locks as a 32-bit counter
++ * with the high bit (sign) being the "contended" bit.
++ */
++
++/**
++ * read_can_lock - would read_trylock() succeed?
++ * @lock: the rwlock in question.
++ */
++static inline int __raw_read_can_lock(raw_rwlock_t *lock)
++{
++ return (int)(lock)->lock > 0;
++}
++
++/**
++ * write_can_lock - would write_trylock() succeed?
++ * @lock: the rwlock in question.
++ */
++static inline int __raw_write_can_lock(raw_rwlock_t *lock)
++{
++ return (lock)->lock == RW_LOCK_BIAS;
++}
++
++static inline void __raw_read_lock(raw_rwlock_t *rw)
++{
++ asm volatile(LOCK_PREFIX " subl $1,(%0)\n\t"
++ "jns 1f\n"
++ "call __read_lock_failed\n\t"
++ "1:\n"
++ ::LOCK_PTR_REG (rw) : "memory");
++}
++
++static inline void __raw_write_lock(raw_rwlock_t *rw)
++{
++ asm volatile(LOCK_PREFIX " subl %1,(%0)\n\t"
++ "jz 1f\n"
++ "call __write_lock_failed\n\t"
++ "1:\n"
++ ::LOCK_PTR_REG (rw), "i" (RW_LOCK_BIAS) : "memory");
++}
++
++static inline int __raw_read_trylock(raw_rwlock_t *lock)
++{
++ atomic_t *count = (atomic_t *)lock;
++
++ atomic_dec(count);
++ if (atomic_read(count) >= 0)
++ return 1;
++ atomic_inc(count);
++ return 0;
++}
++
++static inline int __raw_write_trylock(raw_rwlock_t *lock)
++{
++ atomic_t *count = (atomic_t *)lock;
++
++ if (atomic_sub_and_test(RW_LOCK_BIAS, count))
++ return 1;
++ atomic_add(RW_LOCK_BIAS, count);
++ return 0;
++}
++
++static inline void __raw_read_unlock(raw_rwlock_t *rw)
++{
++ asm volatile(LOCK_PREFIX "incl %0" :"+m" (rw->lock) : : "memory");
++}
++
++static inline void __raw_write_unlock(raw_rwlock_t *rw)
++{
++ asm volatile(LOCK_PREFIX "addl %1, %0"
++ : "+m" (rw->lock) : "i" (RW_LOCK_BIAS) : "memory");
++}
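The counter starts at RW_LOCK_BIAS (0x01000000 in kernels of this era, taken here as an assumption): each reader subtracts one and a writer subtracts the whole bias, so the count goes negative only when a writer is in or waiting. A C11 model of the two trylock paths above:

	#include <stdatomic.h>
	#include <stdbool.h>

	#define RW_LOCK_BIAS 0x01000000

	static atomic_int rw_count = RW_LOCK_BIAS;

	static bool read_trylock(void)
	{
		if (atomic_fetch_sub(&rw_count, 1) - 1 >= 0)
			return true;			/* no writer present */
		atomic_fetch_add(&rw_count, 1);		/* undo: a writer holds it */
		return false;
	}

	static bool write_trylock(void)
	{
		if (atomic_fetch_sub(&rw_count, RW_LOCK_BIAS) == RW_LOCK_BIAS)
			return true;			/* was exactly BIAS: no readers, no writer */
		atomic_fetch_add(&rw_count, RW_LOCK_BIAS);
		return false;
	}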
++
++#define _raw_spin_relax(lock) cpu_relax()
++#define _raw_read_relax(lock) cpu_relax()
++#define _raw_write_relax(lock) cpu_relax()
++
++#endif
+--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/system.h 2009-02-16 16:18:36.000000000 +0100
++++ sle11-2009-06-29/include/asm-x86/mach-xen/asm/system.h 2009-03-16 16:33:40.000000000 +0100
+@@ -1,5 +1,393 @@
++#ifndef _ASM_X86_SYSTEM_H_
++#define _ASM_X86_SYSTEM_H_
++
++#include <asm/asm.h>
++#include <asm/segment.h>
++#include <asm/cpufeature.h>
++#include <asm/cmpxchg.h>
++#include <asm/nops.h>
++#include <asm/hypervisor.h>
++
++#include <linux/kernel.h>
++#include <linux/irqflags.h>
++
++/* entries in ARCH_DLINFO: */
++#ifdef CONFIG_IA32_EMULATION
++# define AT_VECTOR_SIZE_ARCH 2
++#else
++# define AT_VECTOR_SIZE_ARCH 1
++#endif
++
++#ifdef CONFIG_X86_32
++
++struct task_struct; /* one of the stranger aspects of C forward declarations */
++struct task_struct *__switch_to(struct task_struct *prev,
++ struct task_struct *next);
++
++/*
++ * Saving eflags is important. It switches not only IOPL between tasks,
++ * it also protects other tasks from NT leaking through sysenter etc.
++ */
++#define switch_to(prev, next, last) do { \
++ unsigned long esi, edi; \
++ asm volatile("pushfl\n\t" /* Save flags */ \
++ "pushl %%ebp\n\t" \
++ "movl %%esp,%0\n\t" /* save ESP */ \
++ "movl %5,%%esp\n\t" /* restore ESP */ \
++ "movl $1f,%1\n\t" /* save EIP */ \
++ "pushl %6\n\t" /* restore EIP */ \
++ "jmp __switch_to\n" \
++ "1:\t" \
++ "popl %%ebp\n\t" \
++ "popfl" \
++ :"=m" (prev->thread.sp), "=m" (prev->thread.ip), \
++ "=a" (last), "=S" (esi), "=D" (edi) \
++ :"m" (next->thread.sp), "m" (next->thread.ip), \
++ "2" (prev), "d" (next)); \
++} while (0)
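++
++/*
++ * Note the "2" (prev) input constraint above: it pre-loads %eax with
++ * prev so that __switch_to() can return the previous task in %eax,
++ * which the "=a" (last) output then picks up after the stack switch.
++ */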
++
++/*
++ * disable hlt during certain critical i/o operations
++ */
++#define HAVE_DISABLE_HLT
++#else
++#define __SAVE(reg, offset) "movq %%" #reg ",(14-" #offset ")*8(%%rsp)\n\t"
++#define __RESTORE(reg, offset) "movq (14-" #offset ")*8(%%rsp),%%" #reg "\n\t"
++
++/* frame pointer must be last for get_wchan */
++#define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t"
++#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\t"
++
++#define __EXTRA_CLOBBER \
++ , "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11", \
++ "r12", "r13", "r14", "r15"
++
++/* Save and restore flags to clear NT and keep it from leaking between tasks */
++#define switch_to(prev, next, last) \
++ asm volatile(SAVE_CONTEXT \
++ "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \
++ "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \
++ "call __switch_to\n\t" \
++ ".globl thread_return\n" \
++ "thread_return:\n\t" \
++ "movq %%gs:%P[pda_pcurrent],%%rsi\n\t" \
++ "movq %P[thread_info](%%rsi),%%r8\n\t" \
++ LOCK_PREFIX "btr %[tif_fork],%P[ti_flags](%%r8)\n\t" \
++ "movq %%rax,%%rdi\n\t" \
++ "jc ret_from_fork\n\t" \
++ RESTORE_CONTEXT \
++ : "=a" (last) \
++ : [next] "S" (next), [prev] "D" (prev), \
++ [threadrsp] "i" (offsetof(struct task_struct, thread.sp)), \
++ [ti_flags] "i" (offsetof(struct thread_info, flags)), \
++ [tif_fork] "i" (TIF_FORK), \
++ [thread_info] "i" (offsetof(struct task_struct, stack)), \
++ [pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent)) \
++ : "memory", "cc" __EXTRA_CLOBBER)
++#endif
++
++#ifdef __KERNEL__
++#define _set_base(addr, base) do { unsigned long __pr; \
++__asm__ __volatile__ ("movw %%dx,%1\n\t" \
++ "rorl $16,%%edx\n\t" \
++ "movb %%dl,%2\n\t" \
++ "movb %%dh,%3" \
++ :"=&d" (__pr) \
++ :"m" (*((addr)+2)), \
++ "m" (*((addr)+4)), \
++ "m" (*((addr)+7)), \
++ "0" (base) \
++ ); } while (0)
++
++#define _set_limit(addr, limit) do { unsigned long __lr; \
++__asm__ __volatile__ ("movw %%dx,%1\n\t" \
++ "rorl $16,%%edx\n\t" \
++ "movb %2,%%dh\n\t" \
++ "andb $0xf0,%%dh\n\t" \
++ "orb %%dh,%%dl\n\t" \
++ "movb %%dl,%2" \
++ :"=&d" (__lr) \
++ :"m" (*(addr)), \
++ "m" (*((addr)+6)), \
++ "0" (limit) \
++ ); } while (0)
++
++#define set_base(ldt, base) _set_base(((char *)&(ldt)) , (base))
++#define set_limit(ldt, limit) _set_limit(((char *)&(ldt)) , ((limit)-1))
++
++extern void load_gs_index(unsigned);
++
++/*
++ * Load a segment. Fall back on loading the zero
++ * segment if something goes wrong..
++ */
++#define loadsegment(seg, value) \
++ asm volatile("\n" \
++ "1:\t" \
++ "movl %k0,%%" #seg "\n" \
++ "2:\n" \
++ ".section .fixup,\"ax\"\n" \
++ "3:\t" \
++ "movl %k1, %%" #seg "\n\t" \
++ "jmp 2b\n" \
++ ".previous\n" \
++ _ASM_EXTABLE(1b,3b) \
++ : :"r" (value), "r" (0))
++
++
++/*
++ * Save a segment register away
++ */
++#define savesegment(seg, value) \
++ asm volatile("mov %%" #seg ",%0":"=rm" (value))
++
++static inline unsigned long get_limit(unsigned long segment)
++{
++ unsigned long __limit;
++ __asm__("lsll %1,%0"
++ :"=r" (__limit):"r" (segment));
++ return __limit+1;
++}
++
++static inline void xen_clts(void)
++{
++ HYPERVISOR_fpu_taskswitch(0);
++}
++
++static inline void xen_stts(void)
++{
++ HYPERVISOR_fpu_taskswitch(1);
++}
++
++/*
++ * Volatile isn't enough to prevent the compiler from reordering the
++ * read/write functions for the control registers and messing everything up.
++ * A memory clobber would solve the problem, but would prevent reordering of
++ * all loads stores around it, which can hurt performance. Solution is to
++ * use a variable and mimic reads and writes to it to enforce serialization
++ */
++static unsigned long __force_order;
++
++static inline unsigned long xen_read_cr0(void)
++{
++ unsigned long val;
++ asm volatile("mov %%cr0,%0\n\t" :"=r" (val), "=m" (__force_order));
++ return val;
++}
++
++static inline void xen_write_cr0(unsigned long val)
++{
++ asm volatile("mov %0,%%cr0": :"r" (val), "m" (__force_order));
++}
++
++#define xen_read_cr2() (current_vcpu_info()->arch.cr2)
++#define xen_write_cr2(val) ((void)(current_vcpu_info()->arch.cr2 = (val)))
++
++static inline unsigned long xen_read_cr3(void)
++{
++ unsigned long val;
++ asm volatile("mov %%cr3,%0\n\t" :"=r" (val), "=m" (__force_order));
++#ifdef CONFIG_X86_32
++ return mfn_to_pfn(xen_cr3_to_pfn(val)) << PAGE_SHIFT;
++#else
++ return machine_to_phys(val);
++#endif
++}
++
++static inline void xen_write_cr3(unsigned long val)
++{
++#ifdef CONFIG_X86_32
++ val = xen_pfn_to_cr3(pfn_to_mfn(val >> PAGE_SHIFT));
++#else
++ val = phys_to_machine(val);
++#endif
++ asm volatile("mov %0,%%cr3": :"r" (val), "m" (__force_order));
++}
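++
++/*
++ * The cr3 accessors above translate between the pseudo-physical frame
++ * numbers a paravirtualized guest works with and the machine frames
++ * the hypervisor keeps in %cr3 (mfn_to_pfn()/machine_to_phys() and
++ * their inverses), so callers see ordinary physical addresses.
++ */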
++
++static inline unsigned long xen_read_cr4(void)
++{
++ unsigned long val;
++ asm volatile("mov %%cr4,%0\n\t" :"=r" (val), "=m" (__force_order));
++ return val;
++}
++
++#define xen_read_cr4_safe() xen_read_cr4()
++
++static inline void xen_write_cr4(unsigned long val)
++{
++ asm volatile("mov %0,%%cr4": :"r" (val), "m" (__force_order));
++}
++
++#ifdef CONFIG_X86_64
++static inline unsigned long xen_read_cr8(void)
++{
++ return 0;
++}
++
++static inline void xen_write_cr8(unsigned long val)
++{
++ BUG_ON(val);
++}
++#endif
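++
++/*
++ * %cr8 (the x86-64 task-priority register) is not virtualized in this
++ * setup: reads report 0 and writing any non-zero value is treated as
++ * a bug.
++ */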
++
++static inline void xen_wbinvd(void)
++{
++ asm volatile("wbinvd": : :"memory");
++}
++#define read_cr0() (xen_read_cr0())
++#define write_cr0(x) (xen_write_cr0(x))
++#define read_cr2() (xen_read_cr2())
++#define write_cr2(x) (xen_write_cr2(x))
++#define read_cr3() (xen_read_cr3())
++#define write_cr3(x) (xen_write_cr3(x))
++#define read_cr4() (xen_read_cr4())
++#define read_cr4_safe() (xen_read_cr4_safe())
++#define write_cr4(x) (xen_write_cr4(x))
++#define wbinvd() (xen_wbinvd())
++#ifdef CONFIG_X86_64
++#define read_cr8() (xen_read_cr8())
++#define write_cr8(x) (xen_write_cr8(x))
++#endif
++
++/* Clear the 'TS' bit */
++#define clts() (xen_clts())
++#define stts() (xen_stts())
++
++#endif /* __KERNEL__ */
++
++static inline void clflush(volatile void *__p)
++{
++ asm volatile("clflush %0" : "+m" (*(volatile char __force *)__p));
++}
++
++#define nop() __asm__ __volatile__ ("nop")
++
++void disable_hlt(void);
++void enable_hlt(void);
++
++extern int es7000_plat;
++void cpu_idle_wait(void);
++
++extern unsigned long arch_align_stack(unsigned long sp);
++extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
++
++void default_idle(void);
++
++/*
++ * Force strict CPU ordering.
++ * And yes, this is required on UP too when we're talking
++ * to devices.
++ */
+ #ifdef CONFIG_X86_32
+-# include "system_32.h"
++/*
++ * For now, "wmb()" doesn't actually do anything, as all
++ * Intel CPUs follow what Intel calls a *Processor Order*,
++ * in which all writes are seen in the program order even
++ * outside the CPU.
++ *
++ * I expect future Intel CPUs to have a weaker ordering,
++ * but I'd also expect them to finally get their act together
++ * and add some real memory barriers if so.
++ *
++ * Some non-Intel clones support out-of-order stores. wmb() ceases to be a
++ * nop for these.
++ */
++#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2)
++#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2)
++#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM)
++#else
++#define mb() asm volatile("mfence":::"memory")
++#define rmb() asm volatile("lfence":::"memory")
++#define wmb() asm volatile("sfence" ::: "memory")
++#endif
++
++/**
++ * read_barrier_depends - Flush all pending reads that subsequent reads
++ * depend on.
++ *
++ * No data-dependent reads from memory-like regions are ever reordered
++ * over this barrier. All reads preceding this primitive are guaranteed
++ * to access memory (but not necessarily other CPUs' caches) before any
++ * reads following this primitive that depend on the data returned by
++ * any of the preceding reads. This primitive is much lighter weight than
++ * rmb() on most CPUs, and is never heavier weight than
++ * rmb().
++ *
++ * These ordering constraints are respected by both the local CPU
++ * and the compiler.
++ *
++ * Ordering is not guaranteed by anything other than these primitives,
++ * not even by data dependencies. See the documentation for
++ * memory_barrier() for examples and URLs to more information.
++ *
++ * For example, the following code would force ordering (the initial
++ * value of "a" is zero, "b" is one, and "p" is "&a"):
++ *
++ * <programlisting>
++ * CPU 0 CPU 1
++ *
++ * b = 2;
++ * memory_barrier();
++ * p = &b; q = p;
++ * read_barrier_depends();
++ * d = *q;
++ * </programlisting>
++ *
++ * because the read of "*q" depends on the read of "p" and these
++ * two reads are separated by a read_barrier_depends(). However,
++ * the following code, with the same initial values for "a" and "b":
++ *
++ * <programlisting>
++ * CPU 0 CPU 1
++ *
++ * a = 2;
++ * memory_barrier();
++ * b = 3; y = b;
++ * read_barrier_depends();
++ * x = a;
++ * </programlisting>
++ *
++ * does not enforce ordering, since there is no data dependency between
++ * the read of "a" and the read of "b". Therefore, on some CPUs, such
++ * as Alpha, "y" could be set to 3 and "x" to 0. Use rmb()
++ * in cases like this where there are no data dependencies.
++ **/
++
++#define read_barrier_depends() do { } while (0)
++
++#ifdef CONFIG_SMP
++#define smp_mb() mb()
++#ifdef CONFIG_X86_PPRO_FENCE
++# define smp_rmb() rmb()
+ #else
+-# include "system_64.h"
++# define smp_rmb() barrier()
++#endif
++#ifdef CONFIG_X86_OOSTORE
++# define smp_wmb() wmb()
++#else
++# define smp_wmb() barrier()
++#endif
++#define smp_read_barrier_depends() read_barrier_depends()
++#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
++#else
++#define smp_mb() barrier()
++#define smp_rmb() barrier()
++#define smp_wmb() barrier()
++#define smp_read_barrier_depends() do { } while (0)
++#define set_mb(var, value) do { var = value; barrier(); } while (0)
++#endif
++
++/*
++ * Stop RDTSC speculation. This is needed when you need to use RDTSC
++ * (or get_cycles or vread that possibly accesses the TSC) in a defined
++ * code region.
++ *
++ * (Could use a three-way alternative() for this if there were one.)
++ */
++static inline void rdtsc_barrier(void)
++{
++ alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC);
++ alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC);
++}
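++
++/*
++ * The two synthetic feature bits are mutually exclusive (AMD parts set
++ * MFENCE_RDTSC, Intel parts LFENCE_RDTSC), so at most one of the NOP
++ * sequences above is patched into a fence at boot.
++ */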
++
+ #endif
+--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/system_32.h 2009-02-16 16:18:36.000000000 +0100
++++ /dev/null 1970-01-01 00:00:00.000000000 +0000
+@@ -1,312 +0,0 @@
+-#ifndef __ASM_SYSTEM_H
+-#define __ASM_SYSTEM_H
+-
+-#include <linux/kernel.h>
+-#include <asm/segment.h>
+-#include <asm/cpufeature.h>
+-#include <asm/cmpxchg.h>
+-#include <asm/synch_bitops.h>
+-#include <asm/hypervisor.h>
+-
+-#ifdef __KERNEL__
+-#define AT_VECTOR_SIZE_ARCH 2 /* entries in ARCH_DLINFO */
+-
+-struct task_struct; /* one of the stranger aspects of C forward declarations.. */
+-extern struct task_struct * FASTCALL(__switch_to(struct task_struct *prev, struct task_struct *next));
+-
+-/*
+- * Saving eflags is important. It switches not only IOPL between tasks,
+- * it also protects other tasks from NT leaking through sysenter etc.
+- */
+-#define switch_to(prev,next,last) do { \
+- unsigned long esi,edi; \
+- asm volatile("pushfl\n\t" /* Save flags */ \
+- "pushl %%ebp\n\t" \
+- "movl %%esp,%0\n\t" /* save ESP */ \
+- "movl %5,%%esp\n\t" /* restore ESP */ \
+- "movl $1f,%1\n\t" /* save EIP */ \
+- "pushl %6\n\t" /* restore EIP */ \
+- "jmp __switch_to\n" \
+- "1:\t" \
+- "popl %%ebp\n\t" \
+- "popfl" \
+- :"=m" (prev->thread.esp),"=m" (prev->thread.eip), \
+- "=a" (last),"=S" (esi),"=D" (edi) \
+- :"m" (next->thread.esp),"m" (next->thread.eip), \
+- "2" (prev), "d" (next)); \
+-} while (0)
+-
+-#define _set_base(addr,base) do { unsigned long __pr; \
+-__asm__ __volatile__ ("movw %%dx,%1\n\t" \
+- "rorl $16,%%edx\n\t" \
+- "movb %%dl,%2\n\t" \
+- "movb %%dh,%3" \
+- :"=&d" (__pr) \
+- :"m" (*((addr)+2)), \
+- "m" (*((addr)+4)), \
+- "m" (*((addr)+7)), \
+- "0" (base) \
+- ); } while(0)
+-
+-#define _set_limit(addr,limit) do { unsigned long __lr; \
+-__asm__ __volatile__ ("movw %%dx,%1\n\t" \
+- "rorl $16,%%edx\n\t" \
+- "movb %2,%%dh\n\t" \
+- "andb $0xf0,%%dh\n\t" \
+- "orb %%dh,%%dl\n\t" \
+- "movb %%dl,%2" \
+- :"=&d" (__lr) \
+- :"m" (*(addr)), \
+- "m" (*((addr)+6)), \
+- "0" (limit) \
+- ); } while(0)
+-
+-#define set_base(ldt,base) _set_base( ((char *)&(ldt)) , (base) )
+-#define set_limit(ldt,limit) _set_limit( ((char *)&(ldt)) , ((limit)-1) )
+-
+-/*
+- * Load a segment. Fall back on loading the zero
+- * segment if something goes wrong..
+- */
+-#define loadsegment(seg,value) \
+- asm volatile("\n" \
+- "1:\t" \
+- "mov %0,%%" #seg "\n" \
+- "2:\n" \
+- ".section .fixup,\"ax\"\n" \
+- "3:\t" \
+- "pushl $0\n\t" \
+- "popl %%" #seg "\n\t" \
+- "jmp 2b\n" \
+- ".previous\n" \
+- ".section __ex_table,\"a\"\n\t" \
+- ".align 4\n\t" \
+- ".long 1b,3b\n" \
+- ".previous" \
+- : :"rm" (value))
+-
+-/*
+- * Save a segment register away
+- */
+-#define savesegment(seg, value) \
+- asm volatile("mov %%" #seg ",%0":"=rm" (value))
+-
+-static inline void xen_clts(void)
+-{
+- HYPERVISOR_fpu_taskswitch(0);
+-}
+-
+-static inline unsigned long xen_read_cr0(void)
+-{
+- unsigned long val;
+- asm volatile("movl %%cr0,%0\n\t" :"=r" (val));
+- return val;
+-}
+-
+-static inline void xen_write_cr0(unsigned long val)
+-{
+- asm volatile("movl %0,%%cr0": :"r" (val));
+-}
+-
+-#define xen_read_cr2() (current_vcpu_info()->arch.cr2)
+-
+-static inline void xen_write_cr2(unsigned long val)
+-{
+- asm volatile("movl %0,%%cr2": :"r" (val));
+-}
+-
+-static inline unsigned long xen_read_cr3(void)
+-{
+- unsigned long val;
+- asm volatile("movl %%cr3,%0\n\t" :"=r" (val));
+- return mfn_to_pfn(xen_cr3_to_pfn(val)) << PAGE_SHIFT;
+-}
+-
+-static inline void xen_write_cr3(unsigned long val)
+-{
+- val = xen_pfn_to_cr3(pfn_to_mfn(val >> PAGE_SHIFT));
+- asm volatile("movl %0,%%cr3": :"r" (val));
+-}
+-
+-static inline unsigned long xen_read_cr4(void)
+-{
+- unsigned long val;
+- asm volatile("movl %%cr4,%0\n\t" :"=r" (val));
+- return val;
+-}
+-
+-static inline unsigned long xen_read_cr4_safe(void)
+-{
+- unsigned long val;
+- /* This could fault if %cr4 does not exist */
+- asm volatile("1: movl %%cr4, %0 \n"
+- "2: \n"
+- ".section __ex_table,\"a\" \n"
+- ".long 1b,2b \n"
+- ".previous \n"
+- : "=r" (val): "0" (0));
+- return val;
+-}
+-
+-static inline void xen_write_cr4(unsigned long val)
+-{
+- asm volatile("movl %0,%%cr4": :"r" (val));
+-}
+-
+-static inline void xen_wbinvd(void)
+-{
+- asm volatile("wbinvd": : :"memory");
+-}
+-
+-static inline void clflush(volatile void *__p)
+-{
+- asm volatile("clflush %0" : "+m" (*(char __force *)__p));
+-}
+-
+-#define read_cr0() (xen_read_cr0())
+-#define write_cr0(x) (xen_write_cr0(x))
+-#define read_cr2() (xen_read_cr2())
+-#define write_cr2(x) (xen_write_cr2(x))
+-#define read_cr3() (xen_read_cr3())
+-#define write_cr3(x) (xen_write_cr3(x))
+-#define read_cr4() (xen_read_cr4())
+-#define read_cr4_safe() (xen_read_cr4_safe())
+-#define write_cr4(x) (xen_write_cr4(x))
+-#define wbinvd() (xen_wbinvd())
+-
+-/* Clear the 'TS' bit */
+-#define clts() (xen_clts())
+-
+-/* Set the 'TS' bit */
+-#define stts() (HYPERVISOR_fpu_taskswitch(1))
+-
+-#endif /* __KERNEL__ */
+-
+-static inline unsigned long get_limit(unsigned long segment)
+-{
+- unsigned long __limit;
+- __asm__("lsll %1,%0"
+- :"=r" (__limit):"r" (segment));
+- return __limit+1;
+-}
+-
+-#define nop() __asm__ __volatile__ ("nop")
+-
+-/*
+- * Force strict CPU ordering.
+- * And yes, this is required on UP too when we're talking
+- * to devices.
+- *
+- * For now, "wmb()" doesn't actually do anything, as all
+- * Intel CPU's follow what Intel calls a *Processor Order*,
+- * in which all writes are seen in the program order even
+- * outside the CPU.
+- *
+- * I expect future Intel CPU's to have a weaker ordering,
+- * but I'd also expect them to finally get their act together
+- * and add some real memory barriers if so.
+- *
+- * Some non intel clones support out of order store. wmb() ceases to be a
+- * nop for these.
+- */
+-
+-
+-#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2)
+-#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2)
+-#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM)
+-
+-/**
+- * read_barrier_depends - Flush all pending reads that subsequents reads
+- * depend on.
+- *
+- * No data-dependent reads from memory-like regions are ever reordered
+- * over this barrier. All reads preceding this primitive are guaranteed
+- * to access memory (but not necessarily other CPUs' caches) before any
+- * reads following this primitive that depend on the data return by
+- * any of the preceding reads. This primitive is much lighter weight than
+- * rmb() on most CPUs, and is never heavier weight than is
+- * rmb().
+- *
+- * These ordering constraints are respected by both the local CPU
+- * and the compiler.
+- *
+- * Ordering is not guaranteed by anything other than these primitives,
+- * not even by data dependencies. See the documentation for
+- * memory_barrier() for examples and URLs to more information.
+- *
+- * For example, the following code would force ordering (the initial
+- * value of "a" is zero, "b" is one, and "p" is "&a"):
+- *
+- * <programlisting>
+- * CPU 0 CPU 1
+- *
+- * b = 2;
+- * memory_barrier();
+- * p = &b; q = p;
+- * read_barrier_depends();
+- * d = *q;
+- * </programlisting>
+- *
+- * because the read of "*q" depends on the read of "p" and these
+- * two reads are separated by a read_barrier_depends(). However,
+- * the following code, with the same initial values for "a" and "b":
+- *
+- * <programlisting>
+- * CPU 0 CPU 1
+- *
+- * a = 2;
+- * memory_barrier();
+- * b = 3; y = b;
+- * read_barrier_depends();
+- * x = a;
+- * </programlisting>
+- *
+- * does not enforce ordering, since there is no data dependency between
+- * the read of "a" and the read of "b". Therefore, on some CPUs, such
+- * as Alpha, "y" could be set to 3 and "x" to 0. Use rmb()
+- * in cases like this where there are no data dependencies.
+- **/
+-
+-#define read_barrier_depends() do { } while(0)
+-
+-#ifdef CONFIG_SMP
+-#define smp_mb() mb()
+-#ifdef CONFIG_X86_PPRO_FENCE
+-# define smp_rmb() rmb()
+-#else
+-# define smp_rmb() barrier()
+-#endif
+-#ifdef CONFIG_X86_OOSTORE
+-# define smp_wmb() wmb()
+-#else
+-# define smp_wmb() barrier()
+-#endif
+-#define smp_read_barrier_depends() read_barrier_depends()
+-#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
+-#else
+-#define smp_mb() barrier()
+-#define smp_rmb() barrier()
+-#define smp_wmb() barrier()
+-#define smp_read_barrier_depends() do { } while(0)
+-#define set_mb(var, value) do { var = value; barrier(); } while (0)
+-#endif
+-
+-#include <linux/irqflags.h>
+-
+-/*
+- * disable hlt during certain critical i/o operations
+- */
+-#define HAVE_DISABLE_HLT
+-void disable_hlt(void);
+-void enable_hlt(void);
+-
+-extern int es7000_plat;
+-void cpu_idle_wait(void);
+-
+-extern unsigned long arch_align_stack(unsigned long sp);
+-extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
+-
+-void default_idle(void);
+-void __show_registers(struct pt_regs *, int all);
+-
+-#endif
+--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/system_64.h 2009-02-16 16:18:36.000000000 +0100
++++ sle11-2009-06-29/include/asm-x86/mach-xen/asm/system_64.h 2009-03-16 16:33:40.000000000 +0100
+@@ -1,122 +1,9 @@
+ #ifndef __ASM_SYSTEM_H
+ #define __ASM_SYSTEM_H
+
+-#include <linux/kernel.h>
+ #include <asm/segment.h>
+ #include <asm/cmpxchg.h>
+
+-#include <asm/synch_bitops.h>
+-#include <asm/hypervisor.h>
+-#include <xen/interface/arch-x86_64.h>
+-
+-#ifdef __KERNEL__
+-
+-/* entries in ARCH_DLINFO: */
+-#ifdef CONFIG_IA32_EMULATION
+-# define AT_VECTOR_SIZE_ARCH 2
+-#else
+-# define AT_VECTOR_SIZE_ARCH 1
+-#endif
+-
+-#define __SAVE(reg,offset) "movq %%" #reg ",(14-" #offset ")*8(%%rsp)\n\t"
+-#define __RESTORE(reg,offset) "movq (14-" #offset ")*8(%%rsp),%%" #reg "\n\t"
+-
+-/* frame pointer must be last for get_wchan */
+-#define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t"
+-#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\n\t"
+-
+-#define __EXTRA_CLOBBER \
+- ,"rcx","rbx","rdx","r8","r9","r10","r11","r12","r13","r14","r15"
+-
+-/* Save restore flags to clear handle leaking NT */
+-#define switch_to(prev,next,last) \
+- asm volatile(SAVE_CONTEXT \
+- "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \
+- "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \
+- "call __switch_to\n\t" \
+- ".globl thread_return\n" \
+- "thread_return:\n\t" \
+- "movq %%gs:%P[pda_pcurrent],%%rsi\n\t" \
+- "movq %P[thread_info](%%rsi),%%r8\n\t" \
+- LOCK_PREFIX "btr %[tif_fork],%P[ti_flags](%%r8)\n\t" \
+- "movq %%rax,%%rdi\n\t" \
+- "jc ret_from_fork\n\t" \
+- RESTORE_CONTEXT \
+- : "=a" (last) \
+- : [next] "S" (next), [prev] "D" (prev), \
+- [threadrsp] "i" (offsetof(struct task_struct, thread.rsp)), \
+- [ti_flags] "i" (offsetof(struct thread_info, flags)),\
+- [tif_fork] "i" (TIF_FORK), \
+- [thread_info] "i" (offsetof(struct task_struct, stack)), \
+- [pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent)) \
+- : "memory", "cc" __EXTRA_CLOBBER)
+-
+-extern void load_gs_index(unsigned);
+-
+-/*
+- * Load a segment. Fall back on loading the zero
+- * segment if something goes wrong..
+- */
+-#define loadsegment(seg,value) \
+- asm volatile("\n" \
+- "1:\t" \
+- "movl %k0,%%" #seg "\n" \
+- "2:\n" \
+- ".section .fixup,\"ax\"\n" \
+- "3:\t" \
+- "movl %1,%%" #seg "\n\t" \
+- "jmp 2b\n" \
+- ".previous\n" \
+- ".section __ex_table,\"a\"\n\t" \
+- ".align 8\n\t" \
+- ".quad 1b,3b\n" \
+- ".previous" \
+- : :"r" (value), "r" (0))
+-
+-/*
+- * Clear and set 'TS' bit respectively
+- */
+-#define clts() (HYPERVISOR_fpu_taskswitch(0))
+-
+-static inline unsigned long read_cr0(void)
+-{
+- unsigned long cr0;
+- asm volatile("movq %%cr0,%0" : "=r" (cr0));
+- return cr0;
+-}
+-
+-static inline void write_cr0(unsigned long val)
+-{
+- asm volatile("movq %0,%%cr0" :: "r" (val));
+-}
+-
+-#define read_cr2() current_vcpu_info()->arch.cr2
+-
+-#define write_cr2(val) ((void)(current_vcpu_info()->arch.cr2 = (val)))
+-
+-#define read_cr3() ({ \
+- unsigned long __dummy; \
+- asm volatile("movq %%cr3,%0" : "=r" (__dummy)); \
+- machine_to_phys(__dummy); \
+-})
+-
+-static inline void write_cr3(unsigned long val)
+-{
+- val = phys_to_machine(val);
+- asm volatile("movq %0,%%cr3" :: "r" (val) : "memory");
+-}
+-
+-static inline unsigned long read_cr4(void)
+-{
+- unsigned long cr4;
+- asm volatile("movq %%cr4,%0" : "=r" (cr4));
+- return cr4;
+-}
+-
+-static inline void write_cr4(unsigned long val)
+-{
+- asm volatile("movq %0,%%cr4" :: "r" (val) : "memory");
+-}
+
+ static inline unsigned long read_cr8(void)
+ {
+@@ -128,52 +15,6 @@ static inline void write_cr8(unsigned lo
+ BUG_ON(val);
+ }
+
+-#define stts() (HYPERVISOR_fpu_taskswitch(1))
+-
+-#define wbinvd() \
+- __asm__ __volatile__ ("wbinvd": : :"memory")
+-
+-#endif /* __KERNEL__ */
+-
+-static inline void clflush(volatile void *__p)
+-{
+- asm volatile("clflush %0" : "+m" (*(char __force *)__p));
+-}
+-
+-#define nop() __asm__ __volatile__ ("nop")
+-
+-#ifdef CONFIG_SMP
+-#define smp_mb() mb()
+-#define smp_rmb() barrier()
+-#define smp_wmb() barrier()
+-#define smp_read_barrier_depends() do {} while(0)
+-#else
+-#define smp_mb() barrier()
+-#define smp_rmb() barrier()
+-#define smp_wmb() barrier()
+-#define smp_read_barrier_depends() do {} while(0)
+-#endif
+-
+-
+-/*
+- * Force strict CPU ordering.
+- * And yes, this is required on UP too when we're talking
+- * to devices.
+- */
+-#define mb() asm volatile("mfence":::"memory")
+-#define rmb() asm volatile("lfence":::"memory")
+-#define wmb() asm volatile("sfence" ::: "memory")
+-
+-#define read_barrier_depends() do {} while(0)
+-#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
+-
+-#define warn_if_not_ulong(x) do { unsigned long foo; (void) (&(x) == &foo); } while (0)
+-
+ #include <linux/irqflags.h>
+
+-void cpu_idle_wait(void);
+-
+-extern unsigned long arch_align_stack(unsigned long sp);
+-extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
+-
+ #endif
+--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/tlbflush.h 2009-02-16 16:18:36.000000000 +0100
++++ sle11-2009-06-29/include/asm-x86/mach-xen/asm/tlbflush.h 2009-03-16 16:33:40.000000000 +0100
+@@ -1,5 +1,106 @@
++#ifndef _ASM_X86_TLBFLUSH_H
++#define _ASM_X86_TLBFLUSH_H
++
++#include <linux/mm.h>
++#include <linux/sched.h>
++
++#include <asm/processor.h>
++#include <asm/system.h>
++
++#define __flush_tlb() xen_tlb_flush()
++#define __flush_tlb_global() xen_tlb_flush()
++#define __flush_tlb_single(addr) xen_invlpg(addr)
++#define __flush_tlb_all() xen_tlb_flush()
++#define __flush_tlb_one(addr) xen_invlpg(addr)
++
+ #ifdef CONFIG_X86_32
+-# include "tlbflush_32.h"
++# define TLB_FLUSH_ALL 0xffffffff
+ #else
+-# include "tlbflush_64.h"
++# define TLB_FLUSH_ALL -1ULL
+ #endif
++
++/*
++ * TLB flushing:
++ *
++ * - flush_tlb() flushes the current mm struct TLBs
++ * - flush_tlb_all() flushes all processes' TLBs
++ * - flush_tlb_mm(mm) flushes the specified mm context TLBs
++ * - flush_tlb_page(vma, vmaddr) flushes one page
++ * - flush_tlb_range(vma, start, end) flushes a range of pages
++ * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
++ *
++ * ..but the i386 has somewhat limited tlb flushing capabilities,
++ * and page-granular flushes are available only on i486 and up.
++ *
++ * x86-64 can only flush individual pages or full VMs. For a range flush
++ * we always do the full VM. It might be worth testing whether, for a
++ * small range, a few INVLPGs in a row are a win.
++ */
++
++#ifndef CONFIG_SMP
++
++#define flush_tlb() __flush_tlb()
++#define flush_tlb_all() __flush_tlb_all()
++#define local_flush_tlb() __flush_tlb()
++
++static inline void flush_tlb_mm(struct mm_struct *mm)
++{
++ if (mm == current->active_mm)
++ __flush_tlb();
++}
++
++static inline void flush_tlb_page(struct vm_area_struct *vma,
++ unsigned long addr)
++{
++ if (vma->vm_mm == current->active_mm)
++ __flush_tlb_one(addr);
++}
++
++static inline void flush_tlb_range(struct vm_area_struct *vma,
++ unsigned long start, unsigned long end)
++{
++ if (vma->vm_mm == current->active_mm)
++ __flush_tlb();
++}
++
++#else /* SMP */
++
++#include <asm/smp.h>
++
++#define local_flush_tlb() __flush_tlb()
++
++#define flush_tlb_all xen_tlb_flush_all
++#define flush_tlb_current_task() xen_tlb_flush_mask(&current->mm->cpu_vm_mask)
++#define flush_tlb_mm(mm) xen_tlb_flush_mask(&(mm)->cpu_vm_mask)
++#define flush_tlb_page(vma, va) xen_invlpg_mask(&(vma)->vm_mm->cpu_vm_mask, va)
++
++#define flush_tlb() flush_tlb_current_task()
++
++static inline void flush_tlb_range(struct vm_area_struct *vma,
++ unsigned long start, unsigned long end)
++{
++ flush_tlb_mm(vma->vm_mm);
++}
++
++#define TLBSTATE_OK 1
++#define TLBSTATE_LAZY 2
++
++#ifdef CONFIG_X86_32
++struct tlb_state
++{
++ struct mm_struct *active_mm;
++ int state;
++ char __cacheline_padding[L1_CACHE_BYTES-8];
++};
++DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate);
++#endif
++
++#endif /* SMP */
++
++static inline void flush_tlb_kernel_range(unsigned long start,
++ unsigned long end)
++{
++ flush_tlb_all();
++}
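++
++/*
++ * Note that every primitive above ultimately resolves to a
++ * xen_tlb_flush*()/xen_invlpg*() hypercall wrapper rather than a raw
++ * %cr3 reload or INVLPG; kernel-range flushes simply do a full flush.
++ */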
++
++#endif /* _ASM_X86_TLBFLUSH_H */
+--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/tlbflush_32.h 2009-02-16 16:18:36.000000000 +0100
++++ /dev/null 1970-01-01 00:00:00.000000000 +0000
+@@ -1,99 +0,0 @@
+-#ifndef _I386_TLBFLUSH_H
+-#define _I386_TLBFLUSH_H
+-
+-#include <linux/mm.h>
+-#include <asm/processor.h>
+-
+-#define __flush_tlb() xen_tlb_flush()
+-#define __flush_tlb_global() xen_tlb_flush()
+-#define __flush_tlb_all() xen_tlb_flush()
+-
+-#define cpu_has_invlpg (boot_cpu_data.x86 > 3)
+-
+-#define __flush_tlb_single(addr) xen_invlpg(addr)
+-
+-#define __flush_tlb_one(addr) __flush_tlb_single(addr)
+-
+-/*
+- * TLB flushing:
+- *
+- * - flush_tlb() flushes the current mm struct TLBs
+- * - flush_tlb_all() flushes all processes TLBs
+- * - flush_tlb_mm(mm) flushes the specified mm context TLB's
+- * - flush_tlb_page(vma, vmaddr) flushes one page
+- * - flush_tlb_range(vma, start, end) flushes a range of pages
+- * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
+- *
+- * ..but the i386 has somewhat limited tlb flushing capabilities,
+- * and page-granular flushes are available only on i486 and up.
+- */
+-
+-#define TLB_FLUSH_ALL 0xffffffff
+-
+-
+-#ifndef CONFIG_SMP
+-
+-#include <linux/sched.h>
+-
+-#define flush_tlb() __flush_tlb()
+-#define flush_tlb_all() __flush_tlb_all()
+-#define local_flush_tlb() __flush_tlb()
+-
+-static inline void flush_tlb_mm(struct mm_struct *mm)
+-{
+- if (mm == current->active_mm)
+- __flush_tlb();
+-}
+-
+-static inline void flush_tlb_page(struct vm_area_struct *vma,
+- unsigned long addr)
+-{
+- if (vma->vm_mm == current->active_mm)
+- __flush_tlb_one(addr);
+-}
+-
+-static inline void flush_tlb_range(struct vm_area_struct *vma,
+- unsigned long start, unsigned long end)
+-{
+- if (vma->vm_mm == current->active_mm)
+- __flush_tlb();
+-}
+-
+-#else /* SMP */
+-
+-#include <asm/smp.h>
+-
+-#define local_flush_tlb() \
+- __flush_tlb()
+-
+-#define flush_tlb_all xen_tlb_flush_all
+-#define flush_tlb_current_task() xen_tlb_flush_mask(&current->mm->cpu_vm_mask)
+-#define flush_tlb_mm(mm) xen_tlb_flush_mask(&(mm)->cpu_vm_mask)
+-#define flush_tlb_page(vma, va) xen_invlpg_mask(&(vma)->vm_mm->cpu_vm_mask, va)
+-
+-#define flush_tlb() flush_tlb_current_task()
+-
+-static inline void flush_tlb_range(struct vm_area_struct * vma, unsigned long start, unsigned long end)
+-{
+- flush_tlb_mm(vma->vm_mm);
+-}
+-
+-#define TLBSTATE_OK 1
+-#define TLBSTATE_LAZY 2
+-
+-struct tlb_state
+-{
+- struct mm_struct *active_mm;
+- int state;
+- char __cacheline_padding[L1_CACHE_BYTES-8];
+-};
+-DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate);
+-#endif /* SMP */
+-
+-static inline void flush_tlb_kernel_range(unsigned long start,
+- unsigned long end)
+-{
+- flush_tlb_all();
+-}
+-
+-#endif /* _I386_TLBFLUSH_H */
+--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/tlbflush_64.h 2009-02-16 16:18:36.000000000 +0100
++++ /dev/null 1970-01-01 00:00:00.000000000 +0000
+@@ -1,97 +0,0 @@
+-#ifndef _X8664_TLBFLUSH_H
+-#define _X8664_TLBFLUSH_H
+-
+-#include <linux/mm.h>
+-#include <linux/sched.h>
+-#include <asm/processor.h>
+-#include <asm/system.h>
+-
+-#define __flush_tlb() xen_tlb_flush()
+-
+-/*
+- * Global pages have to be flushed a bit differently. Not a real
+- * performance problem because this does not happen often.
+- */
+-#define __flush_tlb_global() xen_tlb_flush()
+-
+-#define __flush_tlb_all() __flush_tlb_global()
+-
+-#define __flush_tlb_one(addr) xen_invlpg((unsigned long)addr)
+-
+-
+-/*
+- * TLB flushing:
+- *
+- * - flush_tlb() flushes the current mm struct TLBs
+- * - flush_tlb_all() flushes all processes TLBs
+- * - flush_tlb_mm(mm) flushes the specified mm context TLB's
+- * - flush_tlb_page(vma, vmaddr) flushes one page
+- * - flush_tlb_range(vma, start, end) flushes a range of pages
+- * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
+- *
+- * x86-64 can only flush individual pages or full VMs. For a range flush
+- * we always do the full VM. Might be worth trying if for a small
+- * range a few INVLPGs in a row are a win.
+- */
+-
+-#ifndef CONFIG_SMP
+-
+-#define flush_tlb() __flush_tlb()
+-#define flush_tlb_all() __flush_tlb_all()
+-#define local_flush_tlb() __flush_tlb()
+-
+-static inline void flush_tlb_mm(struct mm_struct *mm)
+-{
+- if (mm == current->active_mm)
+- __flush_tlb();
+-}
+-
+-static inline void flush_tlb_page(struct vm_area_struct *vma,
+- unsigned long addr)
+-{
+- if (vma->vm_mm == current->active_mm)
+- __flush_tlb_one(addr);
+-}
+-
+-static inline void flush_tlb_range(struct vm_area_struct *vma,
+- unsigned long start, unsigned long end)
+-{
+- if (vma->vm_mm == current->active_mm)
+- __flush_tlb();
+-}
+-
+-#else
+-
+-#include <asm/smp.h>
+-
+-#define local_flush_tlb() \
+- __flush_tlb()
+-
+-#define flush_tlb_all xen_tlb_flush_all
+-#define flush_tlb_current_task() xen_tlb_flush_mask(&current->mm->cpu_vm_mask)
+-#define flush_tlb_mm(mm) xen_tlb_flush_mask(&(mm)->cpu_vm_mask)
+-#define flush_tlb_page(vma, va) xen_invlpg_mask(&(vma)->vm_mm->cpu_vm_mask, va)
+-
+-#define flush_tlb() flush_tlb_current_task()
+-
+-static inline void flush_tlb_range(struct vm_area_struct * vma, unsigned long start, unsigned long end)
+-{
+- flush_tlb_mm(vma->vm_mm);
+-}
+-
+-#define TLBSTATE_OK 1
+-#define TLBSTATE_LAZY 2
+-
+-/* Roughly an IPI every 20MB with 4k pages for freeing page table
+- ranges. Cost is about 42k of memory for each CPU. */
+-#define ARCH_FREE_PTE_NR 5350
+-
+-#endif
+-
+-static inline void flush_tlb_kernel_range(unsigned long start,
+- unsigned long end)
+-{
+- flush_tlb_all();
+-}
+-
+-#endif /* _X8664_TLBFLUSH_H */
+--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/irq_vectors.h 2009-06-29 15:14:52.000000000 +0200
++++ sle11-2009-06-29/include/asm-x86/mach-xen/irq_vectors.h 2009-03-16 16:33:40.000000000 +0100
+@@ -82,7 +82,8 @@
+
+ #define RESCHEDULE_VECTOR 0
+ #define CALL_FUNCTION_VECTOR 1
+-#define NR_IPIS 2
++#define SPIN_UNLOCK_VECTOR 2
++#define NR_IPIS 3
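++/* SPIN_UNLOCK_VECTOR: IPI presumably used to wake a VCPU blocked in
++ * xen_spin_wait() (see the spinlock changes earlier in this patch). */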
+
+ /*
+ * The maximum number of vectors supported by i386 processors
+--- sle11-2009-06-29.orig/include/asm-x86/mmu.h 2009-02-16 16:18:36.000000000 +0100
++++ sle11-2009-06-29/include/asm-x86/mmu.h 2009-03-16 16:33:40.000000000 +0100
+@@ -23,7 +23,7 @@ typedef struct {
+ void *vdso;
+ } mm_context_t;
+
+-#ifdef CONFIG_SMP
++#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
+ void leave_mm(int cpu);
+ #else
+ static inline void leave_mm(int cpu)
+--- sle11-2009-06-29.orig/include/asm-x86/ptrace.h 2009-06-29 15:14:52.000000000 +0200
++++ sle11-2009-06-29/include/asm-x86/ptrace.h 2009-03-16 16:33:40.000000000 +0100
+@@ -249,7 +249,9 @@ extern void user_enable_single_step(stru
+ extern void user_disable_single_step(struct task_struct *);
+
+ extern void user_enable_block_step(struct task_struct *);
+-#ifdef CONFIG_X86_DEBUGCTLMSR
++#if defined(CONFIG_XEN)
++#define arch_has_block_step() (0)
++#elif defined(CONFIG_X86_DEBUGCTLMSR)
+ #define arch_has_block_step() (1)
+ #else
+ #define arch_has_block_step() (boot_cpu_data.x86 >= 6)
+--- sle11-2009-06-29.orig/include/asm-x86/thread_info.h 2009-02-16 16:17:21.000000000 +0100
++++ sle11-2009-06-29/include/asm-x86/thread_info.h 2009-03-16 16:33:40.000000000 +0100
+@@ -94,6 +94,9 @@ struct thread_info {
+ #define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */
+ #define TIF_BTS_TRACE_TS 27 /* record scheduling event timestamps */
+ #define TIF_PERFMON_CTXSW 28 /* perfmon needs ctxsw calls */
++#ifdef CONFIG_X86_XEN
++#define TIF_CSTAR 31 /* cstar-based syscall (special handling) */
++#endif
+
+ #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
+ #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
+@@ -118,6 +121,7 @@ struct thread_info {
+ #define _TIF_BTS_TRACE_TS (1 << TIF_BTS_TRACE_TS)
+ #define _TIF_PERFMON_WORK (1 << TIF_PERFMON_WORK)
+ #define _TIF_PERFMON_CTXSW (1 << TIF_PERFMON_CTXSW)
++#define _TIF_CSTAR (1 << TIF_CSTAR)
+
+ /* work to do in syscall_trace_enter() */
+ #define _TIF_WORK_SYSCALL_ENTRY \
+@@ -147,12 +151,12 @@ struct thread_info {
+ (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_BTS_TRACE_TS| \
+ _TIF_NOTSC|_TIF_PERFMON_CTXSW)
+
+-#define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW
+-#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
+ #else
+-#define _TIF_WORK_CTXSW_NEXT (_TIF_NOTSC | _TIF_DEBUG)
+-#define _TIF_WORK_CTXSW_PREV (_TIF_NOTSC)
++#define _TIF_WORK_CTXSW (_TIF_NOTSC \
++ /*todo | _TIF_DEBUGCTLMSR | _TIF_DS_AREA_MSR | _TIF_BTS_TRACE_TS*/)
+ #endif
++#define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW
++#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
+
+ #define PREEMPT_ACTIVE 0x10000000
+
+--- sle11-2009-06-29.orig/include/asm-x86/time.h 2009-06-29 15:14:52.000000000 +0200
++++ sle11-2009-06-29/include/asm-x86/time.h 2009-03-16 16:33:40.000000000 +0100
+@@ -58,4 +58,10 @@ static inline int native_set_wallclock(u
+
+ extern unsigned long __init calibrate_cpu(void);
+
++#ifdef CONFIG_XEN
++extern int xen_independent_wallclock(void);
++extern unsigned long xen_read_persistent_clock(void);
++extern int xen_update_persistent_clock(void);
++#endif
++
+ #endif
+--- sle11-2009-06-29.orig/include/linux/page-flags.h 2009-02-16 16:17:21.000000000 +0100
++++ sle11-2009-06-29/include/linux/page-flags.h 2009-03-16 16:33:40.000000000 +0100
+@@ -102,8 +102,8 @@ enum pageflags {
+ PG_foreign, /* Page is owned by foreign allocator. */
+ PG_pinned, /* Cannot alias with PG_owner_priv_1 since
+ * bad_page() checks include this bit.
+- * Also cannot use PG_arch_1 since that now
+- * has a different purpose on x86. */
++ * Should not use PG_arch_1 as that may have
++ * a different purpose elsewhere. */
+ #endif
+ __NR_PAGEFLAGS,
+
+--- sle11-2009-06-29.orig/include/linux/pci.h 2008-12-15 11:27:22.000000000 +0100
++++ sle11-2009-06-29/include/linux/pci.h 2009-03-16 16:33:40.000000000 +0100
+@@ -644,6 +644,9 @@ int pcie_set_readrq(struct pci_dev *dev,
+ void pci_update_resource(struct pci_dev *dev, struct resource *res, int resno);
+ int __must_check pci_assign_resource(struct pci_dev *dev, int i);
+ int pci_select_bars(struct pci_dev *dev, unsigned long flags);
++#ifdef CONFIG_XEN
++void pci_restore_bars(struct pci_dev *);
++#endif
+
+ /* ROM control related routines */
+ void __iomem __must_check *pci_map_rom(struct pci_dev *pdev, size_t *size);
+--- sle11-2009-06-29.orig/include/xen/evtchn.h 2009-03-04 11:28:34.000000000 +0100
++++ sle11-2009-06-29/include/xen/evtchn.h 2009-03-16 16:33:40.000000000 +0100
+@@ -130,12 +130,37 @@ static inline void clear_evtchn(int port
+ synch_clear_bit(port, s->evtchn_pending);
+ }
+
++static inline void set_evtchn(int port)
++{
++ shared_info_t *s = HYPERVISOR_shared_info;
++ synch_set_bit(port, s->evtchn_pending);
++}
++
++static inline int test_evtchn(int port)
++{
++ shared_info_t *s = HYPERVISOR_shared_info;
++ return synch_test_bit(port, s->evtchn_pending);
++}
++
+ static inline void notify_remote_via_evtchn(int port)
+ {
+ struct evtchn_send send = { .port = port };
+ VOID(HYPERVISOR_event_channel_op(EVTCHNOP_send, &send));
+ }
+
++/* Clear an irq's pending state, in preparation for polling on it. */
++void xen_clear_irq_pending(int irq);
++
++/* Set an irq's pending state, to avoid blocking on it. */
++void xen_set_irq_pending(int irq);
++
++/* Test an irq's pending state. */
++int xen_test_irq_pending(int irq);
++
++/* Poll waiting for an irq to become pending. In the usual case, the
++ irq will be disabled so it won't deliver an interrupt. */
++void xen_poll_irq(int irq);
++
+ /*
+ * Use these to access the event channel underlying the IRQ handle returned
+ * by bind_*_to_irqhandler().
+--- sle11-2009-06-29.orig/kernel/sysctl_check.c 2009-02-16 16:18:36.000000000 +0100
++++ sle11-2009-06-29/kernel/sysctl_check.c 2009-03-16 16:33:40.000000000 +0100
+@@ -899,7 +899,7 @@ static const struct trans_ctl_table tran
+ };
+
+ #ifdef CONFIG_XEN
+-static struct trans_ctl_table trans_xen_table[] = {
++static const struct trans_ctl_table trans_xen_table[] = {
+ { CTL_XEN_INDEPENDENT_WALLCLOCK, "independent_wallclock" },
+ { CTL_XEN_PERMITTED_CLOCK_JITTER, "permitted_clock_jitter" },
+ {}
+--- sle11-2009-06-29.orig/lib/swiotlb-xen.c 2009-02-16 16:18:36.000000000 +0100
++++ sle11-2009-06-29/lib/swiotlb-xen.c 2009-03-16 16:33:40.000000000 +0100
+@@ -30,7 +30,6 @@
+ #include <asm/gnttab_dma.h>
+
+ int swiotlb;
+-EXPORT_SYMBOL(swiotlb);
+
+ #define OFFSET(val,align) ((unsigned long)((val) & ( (align) - 1)))
+
+@@ -289,6 +288,15 @@ __sync_single(struct phys_addr buffer, c
+ }
+ }
+
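++/*
++ * Nonzero when an nslots-long span starting at this candidate slot
++ * index would cross the device's DMA segment boundary (a power of two,
++ * expressed in slots as max_slots; cf. dma_get_seg_boundary() in
++ * map_single() below).
++ */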
++static inline unsigned int is_span_boundary(unsigned int index,
++ unsigned int nslots,
++ unsigned long offset_slots,
++ unsigned long max_slots)
++{
++ unsigned long offset = (offset_slots + index) & (max_slots - 1);
++ return offset + nslots > max_slots;
++}
++
+ /*
+ * Allocates bounce buffer and returns its kernel virtual address.
+ */
+@@ -300,6 +308,15 @@ map_single(struct device *hwdev, struct
+ unsigned int nslots, stride, index, wrap;
+ struct phys_addr slot_buf;
+ int i;
++ unsigned long mask;
++ unsigned long offset_slots;
++ unsigned long max_slots;
++
++ mask = dma_get_seg_boundary(hwdev);
++ offset_slots = -IO_TLB_SEGSIZE;
++ max_slots = mask + 1
++ ? ALIGN(mask + 1, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT
++ : 1UL << (BITS_PER_LONG - IO_TLB_SHIFT);
+
+ /*
+ * For mappings greater than a page, we limit the stride (and
+@@ -319,12 +336,21 @@ map_single(struct device *hwdev, struct
+ */
+ spin_lock_irqsave(&io_tlb_lock, flags);
+ {
+- wrap = index = ALIGN(io_tlb_index, stride);
+-
++ index = ALIGN(io_tlb_index, stride);
+ if (index >= iotlb_nslabs)
+- wrap = index = 0;
++ index = 0;
++ wrap = index;
+
+ do {
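++			/*
++			 * Skip candidate start slots whose nslots-long span
++			 * would cross a segment boundary; if the search wraps
++			 * all the way around to where it began, no suitable
++			 * slot exists.
++			 */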
++ while (is_span_boundary(index, nslots, offset_slots,
++ max_slots)) {
++ index += stride;
++ if (index >= iotlb_nslabs)
++ index = 0;
++ if (index == wrap)
++ goto not_found;
++ }
++
+ /*
+ * If we find a slot that indicates we have 'nslots'
+ * number of contiguous buffers, we allocate the
+@@ -359,6 +385,7 @@ map_single(struct device *hwdev, struct
+ index = 0;
+ } while (index != wrap);
+
++ not_found:
+ spin_unlock_irqrestore(&io_tlb_lock, flags);
+ return NULL;
+ }